diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 319752482..07b4f6fc5 100755
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -129,7 +129,16 @@ else ()
         if (CMAKE_BUILD_TYPE STREQUAL "Debug")
             set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -g")
         elseif (CMAKE_BUILD_TYPE STREQUAL "Release")
-            set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O2")
+            # -flto + MinGW gcc + statically-linked antlr4_static produces
+            # unresolved-reference errors at link time (LTO intermediate objects
+            # can't see the .a's vtable thunks). -march=native is also a poor
+            # default for CI binaries shipped to other machines. Keep both on
+            # Linux/macOS where the optimization actually pays off.
+            if (MINGW OR WIN32)
+                set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3")
+            else ()
+                set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -march=native -flto")
+            endif ()
         elseif (CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
             set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -O2 -g")
         elseif (CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
diff --git a/cpp/pom.xml b/cpp/pom.xml
index 5415212f0..153e75dc2 100644
--- a/cpp/pom.xml
+++ b/cpp/pom.xml
@@ -99,8 +99,8 @@
                                     plugin's generate goal throw an NPE.
                                 -->
                             </options>
-                            <sourcePath />
-                            <targetPath />
+                            <sourcePath/>
+                            <targetPath/>
                         </configuration>
                     </execution>
                     <!-- Compile the test code -->
diff --git a/cpp/src/CMakeLists.txt b/cpp/src/CMakeLists.txt
index 93342c113..895c1ddba 100644
--- a/cpp/src/CMakeLists.txt
+++ b/cpp/src/CMakeLists.txt
@@ -37,6 +37,9 @@ message("cmake using: ENABLE_LZOKAY=${ENABLE_LZOKAY}")
 option(ENABLE_ZLIB "Enable Zlib compression" ON)
 message("cmake using: ENABLE_ZLIB=${ENABLE_ZLIB}")
 
+# ENABLE_SIMD is defined in the top-level CMakeLists.txt
+message("cmake using: ENABLE_SIMD=${ENABLE_SIMD}")
+
 message("Running in src directory")
 if (${COV_ENABLED})
     add_compile_options(-fprofile-arcs -ftest-coverage)
@@ -89,6 +92,13 @@ if (ENABLE_ANTLR4)
     message("Adding ANTLR4 include directory")
 endif()
 
+if (ENABLE_SIMD)
+    add_definitions(-DENABLE_SIMD)
+    list(APPEND PROJECT_INCLUDE_DIR
+            ${CMAKE_SOURCE_DIR}/third_party/simde-0.8.4-rc3
+    )
+endif()
+
 include_directories(${PROJECT_INCLUDE_DIR})
 
 # Mark every translation unit that is compiled into the tsfile library so that
@@ -144,10 +154,17 @@ add_library(tsfile SHARED)
 
 if (${COV_ENABLED})
     message("Enable code cov...")
+    # Apple clang ships coverage runtime via --coverage; libgcov isn't a
+    # standalone library on macOS.  Use --coverage there.
+    if (APPLE)
+        set(COV_LINK_LIB --coverage)
+    else()
+        set(COV_LINK_LIB -lgcov)
+    endif()
     if (ENABLE_ANTLR4)
-        target_link_libraries(tsfile common_obj compress_obj cwrapper_obj file_obj read_obj write_obj parser_obj -lgcov)
+        target_link_libraries(tsfile common_obj compress_obj cwrapper_obj file_obj read_obj write_obj parser_obj ${COV_LINK_LIB})
     else()
-        target_link_libraries(tsfile common_obj compress_obj cwrapper_obj file_obj read_obj write_obj -lgcov)
+        target_link_libraries(tsfile common_obj compress_obj cwrapper_obj file_obj read_obj write_obj ${COV_LINK_LIB})
     endif()
 else()
     message("Disable code cov...")
@@ -171,4 +188,4 @@ set_target_properties(tsfile PROPERTIES SOVERSION ${LIBTSFILE_SO_VERSION})
 install(TARGETS tsfile
         RUNTIME DESTINATION ${LIBRARY_OUTPUT_PATH}
         LIBRARY DESTINATION ${LIBRARY_OUTPUT_PATH}
-        ARCHIVE DESTINATION ${LIBRARY_OUTPUT_PATH})
\ No newline at end of file
+        ARCHIVE DESTINATION ${LIBRARY_OUTPUT_PATH})
diff --git a/cpp/src/common/CMakeLists.txt b/cpp/src/common/CMakeLists.txt
index 4406cb219..60e0fdccf 100644
--- a/cpp/src/common/CMakeLists.txt
+++ b/cpp/src/common/CMakeLists.txt
@@ -22,21 +22,15 @@ aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR} common_SRC_LIST)
 aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/allocator common_allocator_SRC_LIST)
 aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/container common_container_SRC_LIST)
 aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/tsblock common_tsblock_SRC_LIST)
-aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/mutex common_mutex_SRC_LIST)
 aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR}/datatype common_datatype_SRC_LIST)
 
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
-add_library(common_obj OBJECT ${common_SRC_LIST} 
+add_library(common_obj OBJECT ${common_SRC_LIST}
     ${common_allocator_SRC_LIST}
     ${common_container_SRC_LIST}
-    ${common_tsblock_SRC_LIST} 
-    ${common_mutex_SRC_LIST} 
+    ${common_tsblock_SRC_LIST}
     ${common_datatype_SRC_LIST})
 
-if (ENABLE_ANTLR4)
-    target_compile_definitions(common_obj PRIVATE ENABLE_ANTLR4)
-endif()
-
 # install header files recursively
 file(GLOB_RECURSE HEADERS "${CMAKE_CURRENT_SOURCE_DIR}/*.h")
 copy_to_dir(${HEADERS} "common_obj")
\ No newline at end of file
diff --git a/cpp/src/common/allocator/alloc_base.h b/cpp/src/common/allocator/alloc_base.h
index c89aed077..dd2e0ab61 100644
--- a/cpp/src/common/allocator/alloc_base.h
+++ b/cpp/src/common/allocator/alloc_base.h
@@ -82,35 +82,43 @@ class ModStat {
     }
     void init();
     void destroy();
-    INLINE void update_alloc(AllocModID mid, int32_t size) {
+    INLINE void update_alloc(AllocModID mid, int64_t size) {
 #ifdef ENABLE_MEM_STAT
         ASSERT(mid < __LAST_MOD_ID);
         ATOMIC_FAA(get_item(mid), size);
 #endif
     }
-    void update_free(AllocModID mid, uint32_t size) {
+    void update_free(AllocModID mid, uint64_t size) {
 #ifdef ENABLE_MEM_STAT
         ASSERT(mid < __LAST_MOD_ID);
-        ATOMIC_FAA(get_item(mid), 0 - size);
+        ATOMIC_FAA(get_item(mid), -static_cast<int64_t>(size));
 #endif
     }
     void print_stat();
 
+    int64_t get_stat(int8_t mid) {
+#ifdef ENABLE_MEM_STAT
+        if (stat_arr_ != NULL && mid < __LAST_MOD_ID)
+            return ATOMIC_FAA(get_item(mid), 0LL);
+#endif
+        return 0;
+    }
+
 #ifdef ENABLE_TEST
-    int32_t TEST_get_stat(int8_t mid) { return ATOMIC_FAA(get_item(mid), 0); }
+    int64_t TEST_get_stat(int8_t mid) { return ATOMIC_FAA(get_item(mid), 0LL); }
 #endif
 
    private:
-    INLINE int32_t* get_item(int8_t mid) {
-        return &(stat_arr_[mid * (ITEM_SIZE / sizeof(int32_t))]);
+    INLINE int64_t* get_item(int8_t mid) {
+        return &(stat_arr_[mid * (ITEM_SIZE / sizeof(int64_t))]);
     }
 
    private:
     static const int32_t ITEM_SIZE = CACHE_LINE_SIZE;
     static const int32_t ITEM_COUNT = __LAST_MOD_ID;
-    int32_t* stat_arr_;
+    int64_t* stat_arr_;
 
-    STATIC_ASSERT((ITEM_SIZE % sizeof(int32_t) == 0), ModStat_ITEM_SIZE_ERROR);
+    STATIC_ASSERT((ITEM_SIZE % sizeof(int64_t) == 0), ModStat_ITEM_SIZE_ERROR);
 };
 
 /* base allocator */
diff --git a/cpp/src/common/allocator/byte_stream.h b/cpp/src/common/allocator/byte_stream.h
index 435a1f6fd..ad8dbb90d 100644
--- a/cpp/src/common/allocator/byte_stream.h
+++ b/cpp/src/common/allocator/byte_stream.h
@@ -24,6 +24,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 
+#include <atomic>
 #include <iostream>
 #include <string>
 
@@ -33,51 +34,51 @@
 
 namespace common {
 
+// std::atomic<T> as the actual storage so the MSVC fallback no longer needs
+// `reinterpret_cast<atomic<T>*>(T*)` — that cast is UB because the underlying
+// object was never constructed as a std::atomic<T>.  When the caller asks for
+// non-atomic mode we still go through the atomic interface but with
+// memory_order_relaxed, which on x86/ARM compiles to a plain load/store.
+// std::atomic<T> is non-copyable, so neither is OptionalAtomic; existing
+// callers either construct in place or use shallow_clone_from / store.
 template <typename T>
 class OptionalAtomic {
    public:
     OptionalAtomic(T t, bool enable_atomic = false)
         : val_(t), enable_atomic_(enable_atomic) {}
 
+    OptionalAtomic(const OptionalAtomic&) = delete;
+    OptionalAtomic& operator=(const OptionalAtomic&) = delete;
+    OptionalAtomic(OptionalAtomic&&) = delete;
+    OptionalAtomic& operator=(OptionalAtomic&&) = delete;
+
     FORCE_INLINE T load() const {
-        if (UNLIKELY(enable_atomic_)) {
-            return ATOMIC_LOAD(&val_);
-        } else {
-            return val_;
-        }
+        return val_.load(UNLIKELY(enable_atomic_) ? std::memory_order_seq_cst
+                                                  : std::memory_order_relaxed);
     }
 
     FORCE_INLINE void store(const T t) {
-        if (UNLIKELY(enable_atomic_)) {
-            ATOMIC_STORE(&val_, t);
-        } else {
-            val_ = t;
-        }
+        val_.store(t, UNLIKELY(enable_atomic_) ? std::memory_order_seq_cst
+                                               : std::memory_order_relaxed);
     }
 
     FORCE_INLINE T atomic_faa(const T increment) {
-        if (UNLIKELY(enable_atomic_)) {
-            return ATOMIC_FAA(&val_, increment);
-        } else {
-            T old_val = val_;
-            val_ = val_ + increment;
-            return old_val;
-        }
+        return val_.fetch_add(increment, UNLIKELY(enable_atomic_)
+                                             ? std::memory_order_seq_cst
+                                             : std::memory_order_relaxed);
     }
 
     FORCE_INLINE T atomic_aaf(const T increment) {
-        if (UNLIKELY(enable_atomic_)) {
-            return ATOMIC_AAF(&val_, increment);
-        } else {
-            val_ = val_ + increment;
-            return val_;
-        }
+        return val_.fetch_add(increment, UNLIKELY(enable_atomic_)
+                                             ? std::memory_order_seq_cst
+                                             : std::memory_order_relaxed) +
+               increment;
     }
 
     FORCE_INLINE bool enable_atomic() const { return enable_atomic_; }
 
    private:
-    T val_;
+    std::atomic<T> val_;
     bool enable_atomic_;
 };
 
@@ -231,6 +232,23 @@ FORCE_INLINE double bytes_to_double(uint8_t bytes[8]) {
 
 // TODO define a WrappedByteStream class
 
+// Round n up to the next power of two (>=1). Used to normalize ByteStream
+// page sizes so that `& page_mask_` is equivalent to `% page_size_`.
+// Values above the largest power-of-two that fits in uint32_t are clamped to
+// 0x80000000 — the previous `while (ps < n) ps <<= 1` would shift past 2^31
+// and overflow to 0, looping forever.
+FORCE_INLINE uint32_t round_up_pow2(uint32_t n) {
+    if (n <= 1) return 1;
+    if (n > 0x80000000u) return 0x80000000u;
+    uint32_t v = n - 1;
+    v |= v >> 1;
+    v |= v >> 2;
+    v |= v >> 4;
+    v |= v >> 8;
+    v |= v >> 16;
+    return v + 1;
+}
+
 // auto extend buffer for serialization
 class ByteStream {
    private:
@@ -253,6 +271,8 @@ class ByteStream {
     };
 
    public:
+    static const uint32_t DEFAULT_PAGE_SIZE = 1024;
+
     ByteStream(uint32_t page_size, AllocModID mid, bool enable_atomic = false,
                BaseAllocator& allocator = g_base_allocator)
         : allocator_(allocator),
@@ -262,11 +282,16 @@ class ByteStream {
           total_size_(0, enable_atomic),
           read_pos_(0),
           marked_read_pos_(0),
-          page_size_(page_size),
+          // page_mask_ is used as a bitmask in the hot read/write paths
+          // (`x & page_mask_` instead of `x % page_size_`), which only
+          // matches modulo arithmetic when page_size_ is a power of two.
+          // Round up so callers passing non-power-of-2 sizes still get a
+          // correctly-sized page, at the cost of <2x memory in the worst
+          // case (e.g. 1000 → 1024).
+          page_size_(round_up_pow2(page_size)),
+          page_mask_(round_up_pow2(page_size) - 1),
           mid_(mid),
-          wrapped_page_(false, nullptr) {
-        // assert(page_size >= 16);  // commented out by gxh on 2023.03.09
-    }
+          wrapped_page_(false, nullptr) {}
 
     // for wrap plain buffer to ByteStream
     ByteStream(AllocModID mid = MOD_DEFAULT)
@@ -278,6 +303,7 @@ class ByteStream {
           read_pos_(0),
           marked_read_pos_(0),
           page_size_(0),
+          page_mask_(0),
           mid_(mid),
           wrapped_page_(false, nullptr) {}
 
@@ -290,7 +316,10 @@ class ByteStream {
         wrapped_page_.next_.store(nullptr);
         wrapped_page_.buf_ = (uint8_t*)buf;
 
-        page_size_ = buf_len;
+        // page_mask_ is used as a bitmask; only correct for power-of-2
+        // page sizes (see ByteStream ctor comment).
+        page_size_ = round_up_pow2(static_cast<uint32_t>(buf_len));
+        page_mask_ = page_size_ - 1;
         head_.store(&wrapped_page_);
         tail_.store(&wrapped_page_);
         total_size_.store(buf_len);
@@ -305,14 +334,14 @@ class ByteStream {
     void clear_wrapped_buf() { wrapped_page_.buf_ = nullptr; }
 
     /* ================ Part 1: basic ================ */
-    FORCE_INLINE uint32_t remaining_size() const {
+    FORCE_INLINE uint64_t remaining_size() const {
         ASSERT(total_size_.load() >= read_pos_);
         return total_size_.load() - read_pos_;
     }
     FORCE_INLINE bool has_remaining() const { return remaining_size() > 0; }
 
     FORCE_INLINE void mark_read_pos() { marked_read_pos_ = read_pos_; }
-    FORCE_INLINE uint32_t get_mark_len() const {
+    FORCE_INLINE uint64_t get_mark_len() const {
         ASSERT(marked_read_pos_ <= read_pos_);
         return read_pos_ - marked_read_pos_;
     }
@@ -339,30 +368,46 @@ class ByteStream {
     // never used TODO
     void shallow_clone_from(ByteStream& other) {
         this->page_size_ = other.page_size_;
+        this->page_mask_ = other.page_mask_;
         this->mid_ = other.mid_;
         this->head_.store(other.head_.load());
         this->tail_.store(other.tail_.load());
         this->total_size_.store(other.total_size_.load());
     }
 
-    FORCE_INLINE uint32_t total_size() const { return total_size_.load(); }
-    FORCE_INLINE uint32_t read_pos() const { return read_pos_; };
+    FORCE_INLINE uint64_t total_size() const { return total_size_.load(); }
+    FORCE_INLINE uint64_t read_pos() const { return read_pos_; };
+    // Sum of bytes physically allocated for this stream's pages.  For a
+    // wrapped stream this just reports total_size(); for an owning stream
+    // it counts page_size_ per backing page so callers doing memory-pressure
+    // accounting see the real footprint, not the few bytes that happen to
+    // have been written into the latest 64 KiB page.
+    FORCE_INLINE uint64_t allocated_bytes() const {
+        if (is_wrapped()) return total_size_.load();
+        uint64_t total = 0;
+        Page* p = head_.load();
+        while (p != nullptr) {
+            total += page_size_;
+            p = p->next_.load();
+        }
+        return total;
+    }
     /**
      * Seek the read cursor to an absolute offset. Re-anchors read_page_ for
      * multi-page streams.
      */
-    void set_read_pos(uint32_t pos) {
+    void set_read_pos(uint64_t pos) {
         ASSERT(pos <= total_size());
         read_pos_ = pos;
         Page* p = head_.load();
-        uint32_t skipped = 0;
+        uint64_t skipped = 0;
         while (p != nullptr && skipped + page_size_ <= pos) {
             skipped += page_size_;
             p = p->next_.load();
         }
         read_page_ = p;
     }
-    FORCE_INLINE void wrapped_buf_advance_read_pos(uint32_t size) {
+    FORCE_INLINE void wrapped_buf_advance_read_pos(uint64_t size) {
         if (size + read_pos_ > total_size_.load()) {
             read_pos_ = total_size_.load();
         } else {
@@ -380,10 +425,10 @@ class ByteStream {
                 std::cout << "write_buf error " << ret << std::endl;
                 return ret;
             }
-            uint32_t remainder = page_size_ - (total_size_.load() % page_size_);
+            uint32_t remainder = page_size_ - (total_size_.load() & page_mask_);
             uint32_t copy_len =
                 remainder < (len - write_len) ? remainder : (len - write_len);
-            memcpy(tail_.load()->buf_ + total_size_.load() % page_size_,
+            memcpy(tail_.load()->buf_ + (total_size_.load() & page_mask_),
                    buf + write_len, copy_len);
             total_size_.atomic_aaf(copy_len);
             write_len += copy_len;
@@ -404,11 +449,11 @@ class ByteStream {
             if (RET_FAIL(check_space())) {
                 return ret;
             }
-            uint32_t remainder = page_size_ - (read_pos_ % page_size_);
+            uint32_t remainder = page_size_ - (read_pos_ & page_mask_);
             uint32_t copy_len = remainder < want_len_limited - read_len
                                     ? remainder
                                     : want_len_limited - read_len;
-            memcpy(buf + read_len, read_page_->buf_ + (read_pos_ % page_size_),
+            memcpy(buf + read_len, read_page_->buf_ + (read_pos_ & page_mask_),
                    copy_len);
             read_len += copy_len;
             read_pos_ += copy_len;
@@ -460,16 +505,17 @@ class ByteStream {
             return b;
         }
         b.buf_ =
-            (char*)(tail_.load()->buf_ + (total_size_.load() % page_size_));
-        b.len_ = page_size_ - (total_size_.load() % page_size_);
+            (char*)(tail_.load()->buf_ + (total_size_.load() & page_mask_));
+        b.len_ = page_size_ - (total_size_.load() & page_mask_);
         return b;
     }
 
     void buffer_used(uint32_t used_bytes) {
         ASSERT(used_bytes >= 1);
         // would not span page
-        ASSERT((total_size_.load() / page_size_) ==
-               ((total_size_.load() + used_bytes - 1) / page_size_));
+        ASSERT(page_size_ == 0 ||
+               (total_size_.load() / page_size_) ==
+                   ((total_size_.load() + used_bytes - 1) / page_size_));
         total_size_.atomic_aaf(used_bytes);
     }
 
@@ -485,7 +531,7 @@ class ByteStream {
             if (RET_FAIL(prepare_space())) {
                 return ret;
             }
-            uint32_t remainder = page_size_ - (total_size_.load() % page_size_);
+            uint32_t remainder = page_size_ - (total_size_.load() & page_mask_);
             uint32_t step =
                 remainder < (len - advanced) ? remainder : (len - advanced);
             total_size_.atomic_aaf(step);
@@ -504,6 +550,7 @@ class ByteStream {
         Page* cur_;
         Page* end_;
         int64_t total_size_;
+        int64_t consumed_ = 0;
         BufferIterator(const ByteStream& bs) : host_(bs) {
             cur_ = bs.head_.load();
             end_ = bs.tail_.load();
@@ -514,13 +561,17 @@ class ByteStream {
             Buffer b;
             if (cur_ != nullptr) {
                 b.buf_ = (char*)cur_->buf_;
-                if (cur_ == end_ &&
-                    host_.total_size_.load() % host_.page_size_ != 0) {
-                    b.len_ = host_.total_size_.load() % host_.page_size_;
+                if (cur_ == end_) {
+                    // Last page: clamp to remaining total_size_. For wrapped
+                    // streams page_size_ may have been rounded up past the
+                    // user buffer (see wrap_from), so we must not return
+                    // page_size_ as the length here.
+                    b.len_ = static_cast<uint32_t>(total_size_ - consumed_);
                 } else {
                     b.len_ = host_.page_size_;
                 }
                 ASSERT(b.len_ > 0);
+                consumed_ += b.len_;
                 cur_ = cur_->next_.load();
             }
             return b;
@@ -566,7 +617,7 @@ class ByteStream {
 
             // get tail position <tail_, total_size_> atomically
             Page* host_end = nullptr;
-            uint32_t host_total_size = 0;
+            uint64_t host_total_size = 0;
             while (true) {
                 host_end = host_.tail_.load();
                 host_total_size = host_.total_size_.load();
@@ -577,7 +628,7 @@ class ByteStream {
 
             while (true) {
                 if (cur_ == host_end) {
-                    if (host_total_size % host_.page_size_ == 0) {
+                    if ((host_total_size & host_.page_mask_) == 0) {
                         if (read_offset_within_cur_page_ == host_.page_size_) {
                             return b;
                         } else {
@@ -591,15 +642,15 @@ class ByteStream {
                         }
                     } else {
                         if (read_offset_within_cur_page_ ==
-                            (host_total_size % host_.page_size_)) {
+                            (host_total_size & host_.page_mask_)) {
                             return b;
                         } else {
                             b.buf_ = ((char*)(cur_->buf_)) +
                                      read_offset_within_cur_page_;
-                            b.len_ = (host_total_size % host_.page_size_) -
+                            b.len_ = (host_total_size & host_.page_mask_) -
                                      read_offset_within_cur_page_;
                             read_offset_within_cur_page_ =
-                                (host_total_size % host_.page_size_);
+                                (host_total_size & host_.page_mask_);
                             total_end_offset_ += b.len_;
                             return b;
                         }
@@ -629,7 +680,7 @@ class ByteStream {
     FORCE_INLINE int prepare_space() {
         int ret = common::E_OK;
         if (UNLIKELY(tail_.load() == nullptr ||
-                     total_size_.load() % page_size_ == 0)) {
+                     (total_size_.load() & page_mask_) == 0)) {
             Page* p = nullptr;
             if (RET_FAIL(alloc_page(p))) {
                 return ret;
@@ -646,7 +697,7 @@ class ByteStream {
         }
         if (UNLIKELY(read_page_ == nullptr)) {
             read_page_ = head_.load();
-        } else if (UNLIKELY(read_pos_ % page_size_ == 0)) {
+        } else if (UNLIKELY((read_pos_ & page_mask_) == 0)) {
             read_page_ = read_page_->next_.load();
         }
         if (UNLIKELY(read_page_ == nullptr)) {
@@ -682,10 +733,14 @@ class ByteStream {
     OptionalAtomic<Page*> head_;
     OptionalAtomic<Page*> tail_;
     Page* read_page_;  // only one thread is allow to reader this ByteStream
-    OptionalAtomic<uint32_t> total_size_;  // total size in byte
-    uint32_t read_pos_;                    // current reader position
-    uint32_t marked_read_pos_;             // current reader position
+    OptionalAtomic<uint64_t> total_size_;  // total size in byte
+    // 64-bit so streams that legitimately grow past 4 GiB don't truncate
+    // the read cursor (e.g. concatenated chunk buffers in the writer's
+    // write_stream_ before the next flush).
+    uint64_t read_pos_;         // current reader position
+    uint64_t marked_read_pos_;  // current reader position
     uint32_t page_size_;
+    uint32_t page_mask_;  // page_size_ - 1, for bitwise AND instead of modulo
     AllocModID mid_;
 
    public:
@@ -1185,6 +1240,7 @@ class SerializationUtil {
     // indicates that memory has been allocated and must be freed.
     FORCE_INLINE static int read_var_char_ptr(std::string*& str,
                                               ByteStream& in) {
+        str = nullptr;
         int ret = common::E_OK;
         int32_t len = 0;
         int32_t read_len = 0;
@@ -1192,7 +1248,6 @@ class SerializationUtil {
             return ret;
         } else {
             if (len == storage::NO_STR_TO_READ) {
-                str = nullptr;
                 return ret;
             } else {
                 char* tmp_buf =
diff --git a/cpp/src/common/allocator/mem_alloc.cc b/cpp/src/common/allocator/mem_alloc.cc
index 524287e75..b7c5c09c1 100644
--- a/cpp/src/common/allocator/mem_alloc.cc
+++ b/cpp/src/common/allocator/mem_alloc.cc
@@ -95,7 +95,7 @@ void* mem_alloc(uint32_t size, AllocModID mid) {
     auto high4b = static_cast<uint32_t>(header >> 32);
     *reinterpret_cast<uint32_t*>(raw) = high4b;
     *reinterpret_cast<uint32_t*>(raw + 4) = low4b;
-    ModStat::get_instance().update_alloc(mid, static_cast<int32_t>(size));
+    ModStat::get_instance().update_alloc(mid, static_cast<int64_t>(size));
     return raw + header_size;
 }
 
@@ -158,7 +158,7 @@ void* mem_realloc(void* ptr, uint32_t size) {
     *reinterpret_cast<uint32_t*>(p) = high4b;
     *reinterpret_cast<uint32_t*>(p + 4) = low4b;
     ModStat::get_instance().update_alloc(
-        mid, int32_t(size) - int32_t(original_size));
+        mid, int64_t(size) - int64_t(original_size));
     return p + ALIGNMENT;
 }
 
@@ -166,9 +166,9 @@ void ModStat::init() {
     if (stat_arr_ != NULL) {
         return;
     }
-    stat_arr_ = (int32_t*)(::malloc(ITEM_SIZE * ITEM_COUNT));
+    stat_arr_ = (int64_t*)(::malloc(ITEM_SIZE * ITEM_COUNT));
     for (int8_t i = 0; i < __LAST_MOD_ID; i++) {
-        int32_t* item = get_item(i);
+        int64_t* item = get_item(i);
         *item = 0;
     }
 }
@@ -183,14 +183,14 @@ void ModStat::print_stat() {
 
     struct Entry {
         const char* name;
-        int32_t val;
+        int64_t val;
     };
     Entry entries[__LAST_MOD_ID];
     int count = 0;
     int64_t total = 0;
 
     for (int i = 0; i < __LAST_MOD_ID; i++) {
-        int32_t val = ATOMIC_FAA(get_item(i), 0);
+        int64_t val = ATOMIC_FAA(get_item(i), 0LL);
         total += val;
         if (val != 0) {
             entries[count++] = {g_mod_names[i], val};
diff --git a/cpp/src/common/allocator/page_arena.h b/cpp/src/common/allocator/page_arena.h
index 9b8ce5ef6..c0dfbebb9 100644
--- a/cpp/src/common/allocator/page_arena.h
+++ b/cpp/src/common/allocator/page_arena.h
@@ -47,6 +47,19 @@ class PageArena {
     FORCE_INLINE void destroy() { reset(); }
     void reset();
 
+    // Returns the number of bytes actually consumed across all pages.
+    // This is the precise M_meta size: metadata structs are not data-encoded,
+    // so arena used bytes == metadata memory exactly.
+    int64_t get_total_used_bytes() const {
+        int64_t total = 0;
+        Page* p = dummy_head_.next_;
+        while (p) {
+            total += p->cur_alloc_ - reinterpret_cast<char*>(p + 1);
+            p = p->next_;
+        }
+        return total;
+    }
+
 #ifdef ENABLE_TEST
     int TEST_get_page_count() const {
         int count = 0;
diff --git a/cpp/src/common/config/config.h b/cpp/src/common/config/config.h
index e2b2039a7..5cf968688 100644
--- a/cpp/src/common/config/config.h
+++ b/cpp/src/common/config/config.h
@@ -36,7 +36,7 @@ typedef struct ConfigValue {
     TSEncoding time_encoding_type_;
     TSDataType time_data_type_;
     CompressionType time_compress_type_;
-    int32_t chunk_group_size_threshold_;
+    int64_t chunk_group_size_threshold_;
     int32_t record_count_for_next_mem_check_;
     bool encrypt_flag_ = false;
     TSEncoding boolean_encoding_type_;
@@ -46,14 +46,21 @@ typedef struct ConfigValue {
     TSEncoding double_encoding_type_;
     TSEncoding string_encoding_type_;
     CompressionType default_compression_type_;
+    bool parallel_read_enabled_;
     bool parallel_write_enabled_;
-    int32_t write_thread_count_;
-    // When true, aligned writer enforces page size limit strictly by
-    // interleaving time/value writes and sealing pages together when any side
-    // becomes full.
-    // When false, aligned writer may disable some page-size checks to improve
-    // write performance.
-    bool strict_page_size_ = true;
+    // Size of the single global worker pool (common::g_thread_pool_) shared by
+    // the parallel write and parallel read paths.  The pool is (re)created from
+    // this value in init_common().  Like sync_on_close_/encrypt_flag_ it keeps
+    // its in-class default rather than being reset by init_config_value(), so a
+    // set_thread_count() call made before libtsfile_init() actually sizes the
+    // pool instead of being clobbered by the init-time defaults.
+    int32_t thread_count_ = 6;
+    // Durability knob: when true (default), TsFileIOWriter::end_file() issues
+    // an fsync() before closing so that a process / OS crash cannot leave a
+    // partially-flushed file behind. Disabling this trades durability for
+    // throughput: writes return success as soon as data is in the page cache.
+    // Only set to false if the caller drives its own fsync policy.
+    bool sync_on_close_ = true;
 } ConfigValue;
 
 extern void init_config_value();
@@ -62,10 +69,14 @@ extern CompressionType get_default_compressor();
 // In the future, configuration items need to be dynamically adjusted according
 // to the level
 extern void set_config_value();
-extern void config_set_page_max_point_count(uint32_t page_max_point_count);
-extern void config_set_max_degree_of_index_node(
+// Public config setters: validate at the entry point and return
+// E_INVALID_ARG when the requested value is outside the supported range.
+// On rejection the underlying field is left untouched so the writer keeps
+// running with whatever value it had before — callers that don't check the
+// return are no worse off than they were before validation existed.
+extern int config_set_page_max_point_count(uint32_t page_max_point_count);
+extern int config_set_max_degree_of_index_node(
     uint32_t max_degree_of_index_node);
-extern void config_set_strict_page_size(bool strict_page_size);
 
 }  // namespace common
 
diff --git a/cpp/src/common/container/bit_map.cc b/cpp/src/common/container/bit_map.cc
index 407605e56..3b1af6ab2 100644
--- a/cpp/src/common/container/bit_map.cc
+++ b/cpp/src/common/container/bit_map.cc
@@ -31,14 +31,15 @@ BitMap::~BitMap() {
     }
 }
 
-int BitMap::init(uint32_t item_size, bool init_as_zero) {
+int BitMap::init(uint32_t item_size, bool init_as_zero, AllocModID mod_id) {
     uint32_t size = (item_size + 7) / 8;
-    bitmap_ = static_cast<char*>(mem_alloc(size, MOD_TSBLOCK));
+    bitmap_ = static_cast<char*>(mem_alloc(size, mod_id));
     // need set to 0, otherwise there will be wrong data
     const char initial_char = init_as_zero ? 0x00 : 0xFF;
     memset(bitmap_, initial_char, size);
     size_ = size;
     init_as_zero_ = init_as_zero;
+    has_set_bits_ = !init_as_zero;
     return common::E_OK;
 }
 
diff --git a/cpp/src/common/container/bit_map.h b/cpp/src/common/container/bit_map.h
index 757ab1fb1..90ed0e0b6 100644
--- a/cpp/src/common/container/bit_map.h
+++ b/cpp/src/common/container/bit_map.h
@@ -25,16 +25,13 @@
 #include <intrin.h>
 #endif
 
+#include "common/allocator/alloc_base.h"
 #include "utils/errno_define.h"
 #include "utils/util_define.h"
 
 namespace common {
 
-// Cross-platform bit-twiddling helpers. GCC/Clang use their builtins; MSVC
-// uses the equivalent intrinsics from <intrin.h>; any other compiler falls
-// back to a portable loop.
 namespace bitops {
-// Population count of an 8-bit value.
 FORCE_INLINE int popcount8(uint8_t v) {
 #if defined(__GNUC__) || defined(__clang__)
     return __builtin_popcount(v);
@@ -49,7 +46,7 @@ FORCE_INLINE int popcount8(uint8_t v) {
     return c;
 #endif
 }
-// Count trailing zero bits. The argument must be non-zero.
+
 FORCE_INLINE int ctz_nonzero(uint32_t v) {
 #if defined(__GNUC__) || defined(__clang__)
     return __builtin_ctz(v);
@@ -66,23 +63,13 @@ FORCE_INLINE int ctz_nonzero(uint32_t v) {
     return c;
 #endif
 }
-// Count trailing zero bits of a 64-bit value. The argument must be non-zero.
-FORCE_INLINE int ctz64_nonzero(uint64_t v) {
+
+FORCE_INLINE int ctz_nonzero(uint64_t v) {
 #if defined(__GNUC__) || defined(__clang__)
     return __builtin_ctzll(v);
 #elif defined(_MSC_VER)
     unsigned long idx;
-#if defined(_M_X64) || defined(_M_ARM64)
     _BitScanForward64(&idx, v);
-#else
-    // 32-bit MSVC has no _BitScanForward64.
-    if (static_cast<uint32_t>(v) != 0) {
-        _BitScanForward(&idx, static_cast<uint32_t>(v));
-    } else {
-        _BitScanForward(&idx, static_cast<uint32_t>(v >> 32));
-        idx += 32;
-    }
-#endif
     return static_cast<int>(idx);
 #else
     int c = 0;
@@ -97,13 +84,19 @@ FORCE_INLINE int ctz64_nonzero(uint64_t v) {
 
 class BitMap {
    public:
-    BitMap() : bitmap_(nullptr), size_(0), init_as_zero_(true) {}
+    BitMap()
+        : bitmap_(nullptr),
+          size_(0),
+          init_as_zero_(true),
+          has_set_bits_(false) {}
     ~BitMap();
-    int init(uint32_t item_size, bool init_as_zero = true);
+    int init(uint32_t item_size, bool init_as_zero = true,
+             AllocModID mod_id = MOD_TSBLOCK);
 
     FORCE_INLINE void reset() {
         const char initial_char = init_as_zero_ ? 0x00 : 0xFF;
         memset(bitmap_, initial_char, size_);
+        has_set_bits_ = !init_as_zero_;
     }
 
     FORCE_INLINE void set(uint32_t index) {
@@ -113,6 +106,7 @@ class BitMap {
         char* start_addr = bitmap_ + offset;
         uint8_t bit_mask = get_bit_mask(index);
         *start_addr = (*start_addr) | (bit_mask);
+        has_set_bits_ = true;
     }
 
     FORCE_INLINE void clear(uint32_t index) {
@@ -124,7 +118,26 @@ class BitMap {
         *start_addr = (*start_addr) & (~bit_mask);
     }
 
-    FORCE_INLINE void clear_all() { memset(bitmap_, 0x00, size_); }
+    FORCE_INLINE void clear_all() {
+        memset(bitmap_, 0x00, size_);
+        has_set_bits_ = false;
+    }
+
+    // Copy `bytes` of externally-owned bitmap data into this BitMap's buffer
+    // and keep has_set_bits_ in sync. Without this, callers that memcpy
+    // directly into get_bitmap() can leave the has_set_bits_ shortcut stale
+    // and downstream readers (may_have_set_bits()) will falsely treat the
+    // bitmap as empty.
+    FORCE_INLINE void copy_from(const char* src, uint32_t bytes) {
+        ASSERT(bytes <= size_);
+        memcpy(bitmap_, src, bytes);
+        // Conservative: assume the caller-provided bitmap can have set bits.
+        // We could scan to be precise, but the false-positive only costs a
+        // bit of per-cell testing in writers — never silent data loss.
+        if (bytes > 0) {
+            has_set_bits_ = true;
+        }
+    }
 
     FORCE_INLINE bool test(uint32_t index) {
         uint32_t offset = index >> 3;
@@ -135,7 +148,6 @@ class BitMap {
         return (*start_addr & bit_mask);
     }
 
-    // Count the number of bits set to 1 (i.e., number of null entries).
     FORCE_INLINE uint32_t count_set_bits() const {
         uint32_t count = 0;
         const uint8_t* p = reinterpret_cast<const uint8_t*>(bitmap_);
@@ -145,26 +157,21 @@ class BitMap {
         return count;
     }
 
-    // Find the next set bit (null position) at or after @from,
-    // within [0, total_bits). Returns total_bits if none found.
-    // Skips zero bytes in bulk so cost is proportional to the number
-    // of null bytes, not total rows.
     FORCE_INLINE uint32_t next_set_bit(uint32_t from,
                                        uint32_t total_bits) const {
         if (from >= total_bits) return total_bits;
         const uint8_t* p = reinterpret_cast<const uint8_t*>(bitmap_);
         uint32_t byte_idx = from >> 3;
-        // Check remaining bits in the first (partial) byte
         uint8_t byte_val = p[byte_idx] >> (from & 7);
         if (byte_val) {
-            return from + bitops::ctz_nonzero(byte_val);
+            return from + bitops::ctz_nonzero(static_cast<uint32_t>(byte_val));
         }
-        // Scan subsequent full bytes, skipping zeros
         const uint32_t byte_end = (total_bits + 7) >> 3;
         for (++byte_idx; byte_idx < byte_end; ++byte_idx) {
             if (p[byte_idx]) {
                 uint32_t pos =
-                    (byte_idx << 3) + bitops::ctz_nonzero(p[byte_idx]);
+                    (byte_idx << 3) +
+                    bitops::ctz_nonzero(static_cast<uint32_t>(p[byte_idx]));
                 return pos < total_bits ? pos : total_bits;
             }
         }
@@ -175,6 +182,10 @@ class BitMap {
 
     FORCE_INLINE char* get_bitmap() { return bitmap_; }
 
+    // Fast check: returns false only when guaranteed no bits are set.
+    // May return true even when no bits are actually set (conservative).
+    FORCE_INLINE bool may_have_set_bits() const { return has_set_bits_; }
+
    private:
     FORCE_INLINE uint8_t get_bit_mask(uint32_t index) {
         return 1 << (index & 7);
@@ -184,6 +195,7 @@ class BitMap {
     char* bitmap_;
     uint32_t size_;
     bool init_as_zero_;
+    bool has_set_bits_;
 };
 }  // namespace common
 
diff --git a/cpp/src/common/container/byte_buffer.h b/cpp/src/common/container/byte_buffer.h
index 88006dac6..4e2dfab15 100644
--- a/cpp/src/common/container/byte_buffer.h
+++ b/cpp/src/common/container/byte_buffer.h
@@ -107,11 +107,11 @@ class ByteBuffer {
 
     // for variable len value
     FORCE_INLINE char* read(uint32_t offset, uint32_t* len) {
+        ASSERT(offset + variable_type_len_ <= real_data_size_);
         uint32_t tmp;
-        // Directly memcpy to avoid potential alignment issues when casting
-        // int32_t array pointer
         std::memcpy(&tmp, data_ + offset, sizeof(tmp));
         *len = tmp;
+        ASSERT(offset + variable_type_len_ + *len <= real_data_size_);
         char* p = &data_[offset + variable_type_len_];
         return p;
     }
@@ -128,4 +128,4 @@ class ByteBuffer {
 };
 
 }  // namespace common
-#endif  // COMMON_CONTAINER_BYTE_BUFFER_H
\ No newline at end of file
+#endif  // COMMON_CONTAINER_BYTE_BUFFER_H
diff --git a/cpp/src/common/device_id.cc b/cpp/src/common/device_id.cc
index b35a8593f..e88cdac8a 100644
--- a/cpp/src/common/device_id.cc
+++ b/cpp/src/common/device_id.cc
@@ -144,7 +144,7 @@ int StringArrayDeviceID::deserialize(common::ByteStream& read_stream) {
 
     segments_.clear();
     for (uint32_t i = 0; i < num_segments; ++i) {
-        std::string* segment;
+        std::string* segment = nullptr;
         if (RET_FAIL(common::SerializationUtil::read_var_char_ptr(
                 segment, read_stream))) {
             delete segment;
diff --git a/cpp/src/common/global.cc b/cpp/src/common/global.cc
index b49b55657..cc6c5117f 100644
--- a/cpp/src/common/global.cc
+++ b/cpp/src/common/global.cc
@@ -19,31 +19,31 @@
 
 #include "global.h"
 
+#ifdef ENABLE_THREADS
+#include "common/thread_pool.h"
+#endif
+
 #ifndef _WIN32
 #include <execinfo.h>
+#include <strings.h>  // strncasecmp
 #endif
 #include <stdlib.h>
+#include <string.h>  // strlen
 
-#include <thread>
-
-#ifdef ENABLE_THREADS
-#include "common/thread_pool.h"
-#endif
 #include "utils/injection.h"
-#include "utils/util_define.h"  // strncasecmp and other platform-compat shims
+#include "utils/util_define.h"  // strncasecmp -> _strnicmp shim on Windows
 
 namespace common {
 
 ColumnSchema g_time_column_schema;
+ConfigValue g_config_value_;
 #ifdef ENABLE_THREADS
-ThreadPool* g_write_thread_pool_ = nullptr;
+ThreadPool* g_thread_pool_ = nullptr;
 #endif
-ConfigValue g_config_value_;
 
 void init_config_value() {
-    g_config_value_.tsblock_mem_inc_step_size_ = 8000;  // 8k
-    g_config_value_.tsblock_max_memory_ = 64000;        // 64k
-    // g_config_value_.tsblock_max_memory_ = 32;
+    g_config_value_.tsblock_mem_inc_step_size_ = 8000;      // 8k
+    g_config_value_.tsblock_max_memory_ = 2 * 1024 * 1024;  // 2 MB
     g_config_value_.page_writer_max_point_num_ = 10000;
     g_config_value_.page_writer_max_memory_bytes_ = 128 * 1024;  // 128 k
     g_config_value_.max_degree_of_index_node_ = 256;
@@ -64,19 +64,21 @@ void init_config_value() {
     g_config_value_.float_encoding_type_ = GORILLA;
     g_config_value_.double_encoding_type_ = GORILLA;
     g_config_value_.string_encoding_type_ = PLAIN;
-    // Default compression type is LZ4
-#ifdef ENABLE_LZ4
+    // Pick the strongest compressor that was actually compiled in. Gating on
+    // ENABLE_LZ4 while setting SNAPPY (the original code) would request a
+    // compressor that the factory can't produce when the build disables
+    // Snappy, returning nullptr at write time.
+#ifdef ENABLE_SNAPPY
+    g_config_value_.default_compression_type_ = SNAPPY;
+#elif defined(ENABLE_LZ4)
     g_config_value_.default_compression_type_ = LZ4;
 #else
     g_config_value_.default_compression_type_ = UNCOMPRESSED;
 #endif
-    unsigned int hw_cores = std::thread::hardware_concurrency();
-    if (hw_cores == 0) hw_cores = 1;  // fallback if detection fails
-    g_config_value_.parallel_write_enabled_ = (hw_cores > 1);
-    g_config_value_.write_thread_count_ =
-        static_cast<int32_t>(std::min(hw_cores, 64u));
-    // Enforce aligned page size limits strictly by default.
-    g_config_value_.strict_page_size_ = true;
+    g_config_value_.parallel_read_enabled_ = true;
+    g_config_value_.parallel_write_enabled_ = true;
+    // thread_count_ keeps its in-class default (see config.h) so a
+    // set_thread_count() before libtsfile_init() is not reset here.
 }
 
 extern TSEncoding get_value_encoder(TSDataType data_type) {
@@ -113,16 +115,20 @@ extern CompressionType get_default_compressor() {
     return g_config_value_.default_compression_type_;
 }
 
-void config_set_page_max_point_count(uint32_t page_max_point_count) {
+int config_set_page_max_point_count(uint32_t page_max_point_count) {
+    if (page_max_point_count == 0) {
+        return E_INVALID_ARG;
+    }
     g_config_value_.page_writer_max_point_num_ = page_max_point_count;
+    return E_OK;
 }
 
-void config_set_max_degree_of_index_node(uint32_t max_degree_of_index_node) {
+int config_set_max_degree_of_index_node(uint32_t max_degree_of_index_node) {
+    if (max_degree_of_index_node < 2u) {
+        return E_INVALID_ARG;
+    }
     g_config_value_.max_degree_of_index_node_ = max_degree_of_index_node;
-}
-
-void config_set_strict_page_size(bool strict_page_size) {
-    g_config_value_.strict_page_size_ = strict_page_size;
+    return E_OK;
 }
 
 void set_config_value() {}
@@ -145,17 +151,35 @@ int init_common() {
     g_time_column_schema.compression_ = UNCOMPRESSED;
     g_time_column_schema.column_name_ = storage::TIME_COLUMN_NAME;
 #ifdef ENABLE_THREADS
-    // (Re)create the global write thread pool with the configured size.
-    delete g_write_thread_pool_;
-    size_t pool_size =
-        g_config_value_.write_thread_count_ > 0
-            ? static_cast<size_t>(g_config_value_.write_thread_count_)
-            : size_t{1};
-    g_write_thread_pool_ = new ThreadPool(pool_size);
+    // (Re)create the single global worker pool with the configured size.  All
+    // parallel write/read paths submit here; torn down in libtsfile_destroy().
+    delete g_thread_pool_;
+    size_t pool_size = g_config_value_.thread_count_ > 0
+                           ? static_cast<size_t>(g_config_value_.thread_count_)
+                           : size_t{1};
+    g_thread_pool_ = new ThreadPool(pool_size);
 #endif
     return ret;
 }
 
+int set_thread_count(int32_t count) {
+    if (count < 1 || count > 64) return E_INVALID_ARG;
+    g_config_value_.thread_count_ = count;
+#ifdef ENABLE_THREADS
+    // If the global pool already exists (libtsfile_init has run) rebuild it at
+    // the new size so the change takes effect immediately instead of only at
+    // the next libtsfile_init().  This joins all current workers and recreates
+    // them, so the caller must ensure no read/write is concurrently using the
+    // pool — intended for setup / benchmark reconfiguration, not mid-operation
+    // resizing.
+    if (g_thread_pool_ != nullptr) {
+        delete g_thread_pool_;
+        g_thread_pool_ = new ThreadPool(static_cast<size_t>(count));
+    }
+#endif
+    return E_OK;
+}
+
 bool is_timestamp_column_name(const char* time_col_name) {
     // both "time" and "timestamp" refer to timestamp column.
     int32_t len = strlen(time_col_name);
diff --git a/cpp/src/common/global.h b/cpp/src/common/global.h
index 5bee0fa60..ae04c6afa 100644
--- a/cpp/src/common/global.h
+++ b/cpp/src/common/global.h
@@ -29,6 +29,15 @@ namespace common {
 extern TSFILE_API ConfigValue g_config_value_;
 extern TSFILE_API ColumnSchema g_time_column_schema;
 
+#ifdef ENABLE_THREADS
+class ThreadPool;
+// The single process-wide worker pool shared by every parallel code path
+// (write column encoding, read column decoding).  Created in init_common()
+// and torn down in libtsfile_destroy(); null until libtsfile_init() runs, so
+// every caller must fall back to the serial path when it is null.
+extern TSFILE_API ThreadPool* g_thread_pool_;
+#endif
+
 FORCE_INLINE int set_global_time_data_type(uint8_t data_type) {
     ASSERT(data_type >= BOOLEAN && data_type <= STRING);
     if (data_type != INT64) {
@@ -163,29 +172,28 @@ FORCE_INLINE uint8_t get_global_compression() {
     return static_cast<uint8_t>(g_config_value_.default_compression_type_);
 }
 
+FORCE_INLINE void set_parallel_read_enabled(bool enabled) {
+    g_config_value_.parallel_read_enabled_ = enabled;
+}
+
+FORCE_INLINE bool get_parallel_read_enabled() {
+    return g_config_value_.parallel_read_enabled_;
+}
+
 FORCE_INLINE void set_parallel_write_enabled(bool enabled) {
     g_config_value_.parallel_write_enabled_ = enabled;
 }
 
 FORCE_INLINE bool get_parallel_write_enabled() {
-    return g_config_value_.parallel_write_enabled_ &&
-           g_config_value_.write_thread_count_ > 1;
-}
-
-// Set the number of threads for parallel writes.  Must be called before
-// init_common() / libtsfile_init() — the global thread pool is created
-// during initialization and is not resized at runtime.
-FORCE_INLINE int set_write_thread_count(int32_t count) {
-    if (count < 1 || count > 64) return E_INVALID_ARG;
-    g_config_value_.write_thread_count_ = count;
-    return E_OK;
+    return g_config_value_.parallel_write_enabled_;
 }
 
-#ifdef ENABLE_THREADS
-class ThreadPool;
-// Global write thread pool, created by init_common().
-extern ThreadPool* g_write_thread_pool_;
-#endif
+// Size of the single global worker pool.  Rejects values outside [1, 64] with
+// E_INVALID_ARG, leaving the field untouched.  If the pool already exists
+// (libtsfile_init has run) it is rebuilt at the new size immediately; the
+// caller must ensure no read/write is concurrently using the pool.  Defined in
+// global.cc (needs the full ThreadPool type).
+extern int set_thread_count(int32_t count);
 
 extern int init_common();
 extern bool is_timestamp_column_name(const char* time_col_name);
diff --git a/cpp/src/common/mutex/CMakeLists.txt b/cpp/src/common/mutex/CMakeLists.txt
deleted file mode 100644
index e7ef66faa..000000000
--- a/cpp/src/common/mutex/CMakeLists.txt
+++ /dev/null
@@ -1,20 +0,0 @@
-#[[
-Licensed to the Apache Software Foundation (ASF) under one
-or more contributor license agreements.  See the NOTICE file
-distributed with this work for additional information
-regarding copyright ownership.  The ASF licenses this file
-to you under the Apache License, Version 2.0 (the
-"License"); you may not use this file except in compliance
-with the License.  You may obtain a copy of the License at
-
-    https://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing,
-software distributed under the License is distributed on an
-"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-KIND, either express or implied.  See the License for the
-specific language governing permissions and limitations
-under the License.
-]]
-
-
diff --git a/cpp/src/common/mutex/mutex.h b/cpp/src/common/mutex/mutex.h
deleted file mode 100644
index b35d328de..000000000
--- a/cpp/src/common/mutex/mutex.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * License); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#ifndef COMMON_MUTEX_MUTEX_H
-#define COMMON_MUTEX_MUTEX_H
-
-#include <mutex>
-
-#include "utils/util_define.h"
-
-namespace common {
-
-// Thin wrapper over std::mutex. Implemented with the C++11 standard library
-// (instead of pthreads directly) so it builds on every platform, including
-// MSVC where pthreads is not available.
-class Mutex {
-   public:
-    Mutex() {}
-    ~Mutex() {}
-
-    void lock() { mutex_.lock(); }
-
-    void unlock() { mutex_.unlock(); }
-
-    bool try_lock() { return mutex_.try_lock(); }
-
-   private:
-    std::mutex mutex_;
-};
-
-class MutexGuard {
-   public:
-    MutexGuard(Mutex& m) : m_(m) { m_.lock(); }
-    ~MutexGuard() { m_.unlock(); }
-
-   private:
-    Mutex& m_;
-};
-
-}  // end namespace common
-#endif  // COMMON_MUTEX_MUTEX_H
diff --git a/cpp/src/common/path.cc b/cpp/src/common/path.cc
deleted file mode 100644
index d70a9d6c6..000000000
--- a/cpp/src/common/path.cc
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * License); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#include "common/path.h"
-
-#include "common/constant/tsfile_constant.h"
-
-#ifdef ENABLE_ANTLR4
-#include "parser/path_nodes_generator.h"
-#endif
-
-namespace storage {
-
-Path::Path() = default;
-
-Path::Path(std::string& device, std::string& measurement)
-    : measurement_(measurement),
-      device_id_(std::make_shared<StringArrayDeviceID>(device)) {
-    full_path_ = device + "." + measurement;
-}
-
-Path::Path(const std::string& path_sc, bool if_split) {
-    if (!path_sc.empty()) {
-        if (!if_split) {
-            full_path_ = path_sc;
-            device_id_ = std::make_shared<StringArrayDeviceID>(path_sc);
-        } else {
-#ifdef ENABLE_ANTLR4
-            std::vector<std::string> nodes =
-                PathNodesGenerator::invokeParser(path_sc);
-#else
-            std::vector<std::string> nodes =
-                IDeviceID::split_string(path_sc, '.');
-#endif
-            if (nodes.size() > 1) {
-                // Join nodes, then parse like write path / Java Path (not
-                // per-segment vector).
-                std::string device_joined;
-                for (size_t i = 0; i + 1 < nodes.size(); ++i) {
-                    if (i > 0) {
-                        device_joined += PATH_SEPARATOR_CHAR;
-                    }
-                    device_joined += nodes[i];
-                }
-                device_id_ =
-                    std::make_shared<StringArrayDeviceID>(device_joined);
-                measurement_ = nodes[nodes.size() - 1];
-                full_path_ = device_id_->get_device_name() + "." + measurement_;
-            } else {
-                full_path_ = path_sc;
-                device_id_ = std::make_shared<StringArrayDeviceID>();
-                measurement_ = path_sc;
-            }
-        }
-    } else {
-        full_path_ = "";
-        device_id_ = std::make_shared<StringArrayDeviceID>();
-        measurement_ = "";
-    }
-}
-
-}  // namespace storage
diff --git a/cpp/src/common/path.h b/cpp/src/common/path.h
index 3896b2715..c176d93db 100644
--- a/cpp/src/common/path.h
+++ b/cpp/src/common/path.h
@@ -21,7 +21,12 @@
 
 #include <string>
 
+#include "common/constant/tsfile_constant.h"
 #include "common/device_id.h"
+#ifdef ENABLE_ANTLR4
+#include "parser/generated/PathParser.h"
+#include "parser/path_nodes_generator.h"
+#endif
 #include "utils/errno_define.h"
 
 namespace storage {
@@ -31,9 +36,57 @@ struct Path {
     std::shared_ptr<IDeviceID> device_id_;
     std::string full_path_;
 
-    Path();
-    Path(std::string& device, std::string& measurement);
-    Path(const std::string& path_sc, bool if_split = true);
+    Path() {}
+
+    Path(std::string& device, std::string& measurement)
+        : measurement_(measurement),
+          device_id_(std::make_shared<StringArrayDeviceID>(device)) {
+        full_path_ = device + "." + measurement;
+    }
+
+    Path(const std::string& path_sc, bool if_split = true) {
+        if (!path_sc.empty()) {
+            if (!if_split) {
+                full_path_ = path_sc;
+                device_id_ = std::make_shared<StringArrayDeviceID>(path_sc);
+            } else {
+#ifdef ENABLE_ANTLR4
+                std::vector<std::string> nodes =
+                    PathNodesGenerator::invokeParser(path_sc);
+#else
+                std::vector<std::string> nodes =
+                    IDeviceID::split_string(path_sc, '.');
+#endif
+                if (nodes.size() > 1) {
+                    // Join nodes, then parse like write path / Java Path
+                    // (route through the interpretive string ctor instead of
+                    // the literal per-segment vector ctor, so a stored
+                    // "root.sg.d1" device matches a query path
+                    // "root.sg.d1.s1").
+                    std::string device_joined;
+                    for (size_t i = 0; i + 1 < nodes.size(); ++i) {
+                        if (i > 0) {
+                            device_joined += PATH_SEPARATOR_CHAR;
+                        }
+                        device_joined += nodes[i];
+                    }
+                    device_id_ =
+                        std::make_shared<StringArrayDeviceID>(device_joined);
+                    measurement_ = nodes[nodes.size() - 1];
+                    full_path_ =
+                        device_id_->get_device_name() + "." + measurement_;
+                } else {
+                    full_path_ = path_sc;
+                    device_id_ = std::make_shared<StringArrayDeviceID>();
+                    measurement_ = path_sc;
+                }
+            }
+        } else {
+            full_path_ = "";
+            device_id_ = std::make_shared<StringArrayDeviceID>();
+            measurement_ = "";
+        }
+    }
 
     bool operator==(const Path& path) {
         if (measurement_.compare(path.measurement_) == 0 &&
diff --git a/cpp/src/common/seq_tvlist.h b/cpp/src/common/seq_tvlist.h
deleted file mode 100644
index 24805ac5d..000000000
--- a/cpp/src/common/seq_tvlist.h
+++ /dev/null
@@ -1,172 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * License); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#ifndef COMMON_SEQ_TVLIST_H
-#define COMMON_SEQ_TVLIST_H
-
-#include "common/allocator/alloc_base.h"
-#include "common/allocator/page_arena.h"
-#include "common/mutex/mutex.h"
-#include "utils/db_utils.h"
-#include "utils/errno_define.h"
-#include "utils/storage_utils.h"
-#include "utils/util_define.h"
-
-namespace storage {
-
-class SeqTVListBase {
-   public:
-    SeqTVListBase()
-        : data_type_(common::VECTOR),
-          mutex_(),
-          ref_count_(0),
-          primary_array_size_(0),
-          list_size_(0),
-          write_count_(0),
-          page_arena_(common::g_base_allocator),
-          use_page_arena_(false),
-          is_immutable_(false) {}
-    virtual ~SeqTVListBase() {}
-    virtual void destroy() {}
-
-    FORCE_INLINE void ref() { ATOMIC_AAF(&ref_count_, 1); }
-    FORCE_INLINE bool unref() { return 0 == ATOMIC_AAF(&ref_count_, -1); }
-
-    FORCE_INLINE void lock() { mutex_.lock(); }
-    FORCE_INLINE void unlock() { mutex_.unlock(); }
-
-    int32_t get_total_count() const { return write_count_; }
-    common::TSDataType get_data_type() const { return data_type_; }
-    virtual TimeRange get_time_range() const = 0;
-    void mark_immutable() { is_immutable_ = true; }
-    bool is_immutable() const { return is_immutable_; }
-
-   protected:
-    common::TSDataType data_type_;
-    mutable common::Mutex mutex_;
-    int32_t ref_count_;
-    int32_t primary_array_size_;
-    int32_t list_size_;
-    int32_t write_count_;
-    common::PageArena page_arena_;
-    bool use_page_arena_;
-    bool is_immutable_;
-};
-
-template <typename Type>
-class SeqTVList : public SeqTVListBase {
-   public:
-    typedef struct TV {
-        int64_t time_;
-        Type value_;
-    } TV;
-
-    struct Iterator {
-        SeqTVList* host_list_;
-        int32_t read_idx_;
-        int32_t end_idx_;
-
-        Iterator() : host_list_(nullptr), read_idx_(UINT32_MAX), end_idx_(0) {}
-
-        INLINE void init(SeqTVList* host, int32_t start_idx, int32_t end_idx) {
-            host_list_ = host;
-            read_idx_ = start_idx;
-            end_idx_ = end_idx;
-        }
-
-        int next(TV& tv) {
-            if (read_idx_ >= end_idx_) {
-                return common::E_NO_MORE_DATA;
-            }
-            tv = host_list_->at(read_idx_);
-            read_idx_++;
-            return common::E_OK;
-        }
-    };
-
-   public:
-    SeqTVList() : tv_array_list_(nullptr), last_time_(-1) {
-        data_type_ = common::GetDataTypeFromTemplateType<Type>();
-    }
-    virtual ~SeqTVList() {}
-
-    int init(int32_t primary_array_size, int32_t max_count,
-             bool use_page_arena);
-    void destroy() OVERRIDE;
-
-    int push(int64_t time, Type value);
-    int push_without_lock(int64_t time, Type value);
-    Iterator scan_without_lock(int64_t start_time, int64_t end_time);
-    Iterator scan_without_lock();
-
-    TimeRange get_time_range() const OVERRIDE {
-        TimeRange time_range;
-        common::MutexGuard mg(mutex_);
-        if (write_count_ > 0) {
-            time_range.start_time_ = time_at(0);
-            time_range.end_time_ = time_at(write_count_ - 1);
-            ASSERT(time_range.start_time_ <= time_range.end_time_);
-        }
-        return time_range;
-    }
-
-    FORCE_INLINE TV at(int32_t tv_idx) const {
-        ASSERT(tv_idx < write_count_);
-        int32_t list_idx = tv_idx / primary_array_size_;
-        int32_t list_offset = tv_idx % primary_array_size_;
-        return tv_array_list_[list_idx][list_offset];
-    }
-
-    FORCE_INLINE int64_t time_at(int32_t tv_idx) const {
-        return at(tv_idx).time_;
-    }
-
-#ifdef ENABLE_TEST
-    int32_t TEST_binary_search_upper(int64_t time) {
-        return binary_search_upper(time);
-    }
-    int32_t TEST_binary_search_lower(int64_t time) {
-        return binary_search_lower(time);
-    }
-#endif
-
-   private:
-    FORCE_INLINE void* alloc(uint32_t size) {
-        if (use_page_arena_) {
-            return page_arena_.alloc(size);
-        } else {
-            return common::mem_alloc(size, common::MOD_TVLIST_DATA);
-        }
-    }
-
-    // return the first tv which is larger or equal to @time
-    int32_t binary_search_upper(int64_t time);
-    // return the last tv which is less or equal to @time
-    int32_t binary_search_lower(int64_t time);
-
-   private:
-    TV** tv_array_list_;
-    int64_t last_time_;
-};
-
-}  // namespace storage
-
-#include "seq_tvlist.inc"
-
-#endif  // COMMON_SEQ_TVLIST_H
diff --git a/cpp/src/common/seq_tvlist.inc b/cpp/src/common/seq_tvlist.inc
deleted file mode 100644
index c25e49f45..000000000
--- a/cpp/src/common/seq_tvlist.inc
+++ /dev/null
@@ -1,174 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * License); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-// #include "seq_tvlist.h"
-#include <stdio.h>
-#include <string.h>
-#include <iostream>
-#include "common/mutex/mutex.h"
-#include "common/logger/elog.h"
-
-
-namespace storage
-{
-
-template<typename Type>
-int SeqTVList<Type>::init(int32_t primary_array_size,
-                          int32_t max_count,
-                          bool use_page_arena)
-{
-  if (primary_array_size > max_count) {
-    //common:://log_err("TVList init error, primary_array_size=%u, max_count=%u", primary_array_size, max_count);
-    return common::E_INVALID_ARG;
-  }
-  use_page_arena_ = use_page_arena;
-
-  primary_array_size_ = primary_array_size;
-  list_size_ = (max_count / primary_array_size_) +
-               (max_count % primary_array_size_ == 0 ? 0 : 1);
-
-  int32_t alloc_size = sizeof(TV) * list_size_;
-  tv_array_list_ = (TV**)alloc(alloc_size);
-  if (tv_array_list_ == nullptr) {
-    return common::E_OOM;
-  }
-  memset(tv_array_list_, 0, alloc_size);
-  write_count_ = 0;
-  if (use_page_arena_) {
-    // TODO make it configurable
-    page_arena_.init(sizeof(TV) * primary_array_size_ * 4, common::MOD_TVLIST_OBJ);
-  }
-  return common::E_OK;
-}
-
-template<typename Type>
-int SeqTVList<Type>::push(int64_t time, Type value)
-{
-  common::MutexGuard mg(mutex_);
-  return push_without_lock(time, value);
-};
-
-template<typename Type>
-int SeqTVList<Type>::push_without_lock(int64_t time, Type value)
-{
-  if (UNLIKELY(time <= last_time_)) {
-    return common::E_OUT_OF_ORDER;
-  }
-  if (UNLIKELY(write_count_ >= list_size_ * primary_array_size_)) {
-    return common::E_OVERFLOW;
-  }
-
-  int32_t list_idx = write_count_ / primary_array_size_;
-  int32_t list_offset = write_count_ % primary_array_size_;
-  if (UNLIKELY(list_offset == 0)) {
-    ASSERT(tv_array_list_[list_idx] == nullptr);
-    tv_array_list_[list_idx] = static_cast<TV*>(alloc(sizeof(TV) * primary_array_size_));
-    if (UNLIKELY(tv_array_list_[list_idx] == nullptr)) {
-      return common::E_OOM;
-    }
-  }
-
-  TV insert_tv;
-  insert_tv.time_ = time;
-  insert_tv.value_ = value;
-#if STORAGE_ENGINE_DEBUG
-  std::cout << "tvlist[" << list_idx << "][" << list_offset << "] = (" << time << ", " << value << ")" << std::endl;
-#endif
-  tv_array_list_[list_idx][list_offset] = insert_tv;
-  write_count_++;
-  last_time_ = time;
-  return common::E_OK;
-};
-
-template<typename Type>
-void SeqTVList<Type>::destroy()
-{
-  if (use_page_arena_) {
-    page_arena_.destroy();
-  } else {
-    int32_t list_size = write_count_ / primary_array_size_
-                        + (write_count_ % primary_array_size_ == 0 ? 0 : 1);
-    for (int i = 0; i < list_size; i++) {
-      common::mem_free(tv_array_list_[i]);
-    }
-    common::mem_free(tv_array_list_);
-  }
-}
-
-template<typename Type>
-typename SeqTVList<Type>::Iterator SeqTVList<Type>::scan_without_lock(int64_t start_time, int64_t end_time)
-{
-  ASSERT(start_time < end_time);
-  int32_t start_idx = binary_search_lower(start_time);
-  int32_t end_idx = binary_search_upper(end_time);
-  ASSERT(start_idx <= end_time + 1);
-  SeqTVList::Iterator iter;
-  iter.init(this, start_idx, end_idx);
-  return iter;
-}
-
-template<typename Type>
-typename SeqTVList<Type>::Iterator SeqTVList<Type>::scan_without_lock()
-{
-  SeqTVList::Iterator iter;
-  iter.init(this, 0, write_count_);
-  return iter;
-}
-
-// return the first tv which is larger or equal to @time
-template<typename Type>
-int32_t SeqTVList<Type>::binary_search_lower(int64_t time)
-{
-  int32_t start = -1;
-  int32_t end = write_count_;
-
-  // arr[start] < time <= arr[end]
-  while (start + 1 != end) {
-    int mid = (start + end) / 2;
-    int64_t mid_time = time_at(mid);
-    if (mid_time < time) {
-      start = mid;
-    } else {
-      end = mid;
-    }
-  }
-  return end;
-}
-
-// return the last tv which is less or equal to @time
-template<typename Type>
-int32_t SeqTVList<Type>::binary_search_upper(int64_t time)
-{
-  int32_t start = 0;
-  int32_t end = write_count_;
-
-  // arr[start] <= time < arr[end]
-  while (start + 1 != end) {
-    int mid = (start + end) / 2;
-    int64_t mid_time = time_at(mid);
-    if (mid_time <= time) {
-      start = mid;
-    } else {
-      end = mid;
-    }
-  }
-  return start;
-}
-
-} // namespace storage
-
diff --git a/cpp/src/common/statistic.h b/cpp/src/common/statistic.h
index bced66173..3d45b4f43 100644
--- a/cpp/src/common/statistic.h
+++ b/cpp/src/common/statistic.h
@@ -22,12 +22,18 @@
 
 #include <inttypes.h>
 
+#include <algorithm>
 #include <sstream>
 
 #include "common/allocator/alloc_base.h"
 #include "common/allocator/byte_stream.h"
 #include "common/db_common.h"
 
+#if defined(__ARM_NEON) || defined(__ARM_NEON__)
+#include <arm_neon.h>
+#define TSFILE_HAS_NEON 1
+#endif
+
 namespace storage {
 
 /*
@@ -176,6 +182,48 @@ class Statistic {
     }
     virtual FORCE_INLINE void update(int64_t time) { ASSERT(false); }
 
+    virtual void update_time_batch(const int64_t* timestamps, uint32_t count) {
+        for (uint32_t i = 0; i < count; i++) {
+            update(timestamps[i]);
+        }
+    }
+    virtual void update_batch(const int64_t* timestamps, const bool* values,
+                              uint32_t count) {
+        for (uint32_t i = 0; i < count; i++) {
+            update(timestamps[i], values[i]);
+        }
+    }
+    virtual void update_batch(const int64_t* timestamps, const int32_t* values,
+                              uint32_t count) {
+        for (uint32_t i = 0; i < count; i++) {
+            update(timestamps[i], values[i]);
+        }
+    }
+    virtual void update_batch(const int64_t* timestamps, const int64_t* values,
+                              uint32_t count) {
+        for (uint32_t i = 0; i < count; i++) {
+            update(timestamps[i], values[i]);
+        }
+    }
+    virtual void update_batch(const int64_t* timestamps, const float* values,
+                              uint32_t count) {
+        for (uint32_t i = 0; i < count; i++) {
+            update(timestamps[i], values[i]);
+        }
+    }
+    virtual void update_batch(const int64_t* timestamps, const double* values,
+                              uint32_t count) {
+        for (uint32_t i = 0; i < count; i++) {
+            update(timestamps[i], values[i]);
+        }
+    }
+    virtual void update_batch(const int64_t* timestamps,
+                              const common::String* values, uint32_t count) {
+        for (uint32_t i = 0; i < count; i++) {
+            update(timestamps[i], values[i]);
+        }
+    }
+
     virtual int serialize_to(common::ByteStream& out) {
         int ret = common::E_OK;
         if (RET_FAIL(common::SerializationUtil::write_var_uint(count_, out))) {
@@ -554,17 +602,17 @@ class BooleanStatistic : public Statistic {
         last_value_ = that.last_value_;
     }
 
-    FORCE_INLINE void reset() {
+    FORCE_INLINE void reset() override {
         count_ = 0;
         sum_value_ = 0;
         first_value_ = false;
         last_value_ = false;
     }
 
-    FORCE_INLINE void update(int64_t time, bool value) {
+    FORCE_INLINE void update(int64_t time, bool value) override {
         BOOL_STAT_UPDATE(time, value);
     }
-    int serialize_typed_stat(common::ByteStream& out) {
+    int serialize_typed_stat(common::ByteStream& out) override {
         int ret = common::E_OK;
         if (RET_FAIL(common::SerializationUtil::write_ui8(first_value_ ? 1 : 0,
                                                           out))) {
@@ -575,7 +623,7 @@ class BooleanStatistic : public Statistic {
         }
         return ret;
     }
-    int deserialize_typed_stat(common::ByteStream& in) {
+    int deserialize_typed_stat(common::ByteStream& in) override {
         int ret = common::E_OK;
         if (RET_FAIL(common::SerializationUtil::read_ui8((uint8_t&)first_value_,
                                                          in))) {
@@ -587,13 +635,15 @@ class BooleanStatistic : public Statistic {
         return ret;
     }
 
-    FORCE_INLINE common::TSDataType get_type() { return common::BOOLEAN; }
+    FORCE_INLINE common::TSDataType get_type() override {
+        return common::BOOLEAN;
+    }
 
-    int merge_with(Statistic* stat) {
+    int merge_with(Statistic* stat) override {
         MERGE_BOOL_STAT_FROM(BooleanStatistic, stat);
     }
 
-    int deep_copy_from(Statistic* stat) {
+    int deep_copy_from(Statistic* stat) override {
         DEEP_COPY_BOOL_STAT_FROM(BooleanStatistic, stat);
     }
 };
@@ -625,7 +675,7 @@ class Int32Statistic : public Statistic {
         last_value_ = that.last_value_;
     }
 
-    FORCE_INLINE void reset() {
+    FORCE_INLINE void reset() override {
         count_ = 0;
         sum_value_ = 0;
         min_value_ = 0;
@@ -634,13 +684,41 @@ class Int32Statistic : public Statistic {
         last_value_ = 0;
     }
 
-    FORCE_INLINE void update(int64_t time, int32_t value) {
+    FORCE_INLINE void update(int64_t time, int32_t value) override {
         NUM_STAT_UPDATE(time, value);
     }
 
-    FORCE_INLINE common::TSDataType get_type() { return common::INT32; }
+    void update_batch(const int64_t* timestamps, const int32_t* values,
+                      uint32_t count) override {
+        if (count == 0) return;
+        uint32_t start = 0;
+        if (count_ == 0) {
+            start_time_ = timestamps[0];
+            end_time_ = timestamps[0];
+            first_value_ = values[0];
+            last_value_ = values[0];
+            min_value_ = values[0];
+            max_value_ = values[0];
+            sum_value_ = (int64_t)values[0];
+            count_ = 1;
+            start = 1;
+        }
+        for (uint32_t i = start; i < count; i++) {
+            if (timestamps[i] < start_time_) start_time_ = timestamps[i];
+            if (timestamps[i] > end_time_) end_time_ = timestamps[i];
+            if (values[i] < min_value_) min_value_ = values[i];
+            if (values[i] > max_value_) max_value_ = values[i];
+            sum_value_ += (int64_t)values[i];
+        }
+        last_value_ = values[count - 1];
+        count_ += (count - start);
+    }
+
+    FORCE_INLINE common::TSDataType get_type() override {
+        return common::INT32;
+    }
 
-    int serialize_typed_stat(common::ByteStream& out) {
+    int serialize_typed_stat(common::ByteStream& out) override {
         int ret = common::E_OK;
         if (RET_FAIL(common::SerializationUtil::write_ui32(min_value_, out))) {
         } else if (RET_FAIL(common::SerializationUtil::write_ui32(max_value_,
@@ -654,7 +732,7 @@ class Int32Statistic : public Statistic {
         }
         return ret;
     }
-    int deserialize_typed_stat(common::ByteStream& in) {
+    int deserialize_typed_stat(common::ByteStream& in) override {
         int ret = common::E_OK;
         if (RET_FAIL(common::SerializationUtil::read_ui32((uint32_t&)min_value_,
                                                           in))) {
@@ -676,15 +754,15 @@ class Int32Statistic : public Statistic {
         //           << std::endl;
         return ret;
     }
-    int merge_with(Statistic* stat) {
+    int merge_with(Statistic* stat) override {
         MERGE_NUM_STAT_FROM(Int32Statistic, stat);
     }
 
-    int deep_copy_from(Statistic* stat) {
+    int deep_copy_from(Statistic* stat) override {
         DEEP_COPY_NUM_STAT_FROM(Int32Statistic, stat);
     }
 
-    std::string to_string() const {
+    std::string to_string() const override {
         std::ostringstream oss;
         oss << "{count=" << count_ << ", start_time=" << start_time_
             << ", end_time=" << end_time_ << ", first_val=" << first_value_
@@ -696,7 +774,7 @@ class Int32Statistic : public Statistic {
 };
 
 class DateStatistic : public Int32Statistic {
-    FORCE_INLINE common::TSDataType get_type() { return common::DATE; }
+    FORCE_INLINE common::TSDataType get_type() override { return common::DATE; }
 };
 
 class Int64Statistic : public Statistic {
@@ -726,7 +804,7 @@ class Int64Statistic : public Statistic {
         last_value_ = that.last_value_;
     }
 
-    FORCE_INLINE void reset() {
+    FORCE_INLINE void reset() override {
         count_ = 0;
         sum_value_ = 0;
         min_value_ = 0;
@@ -734,13 +812,69 @@ class Int64Statistic : public Statistic {
         first_value_ = 0;
         last_value_ = 0;
     }
-    FORCE_INLINE void update(int64_t time, int64_t value) {
+    FORCE_INLINE void update(int64_t time, int64_t value) override {
         NUM_STAT_UPDATE(time, value);
     }
 
-    FORCE_INLINE common::TSDataType get_type() { return common::INT64; }
+    void update_batch(const int64_t* timestamps, const int64_t* values,
+                      uint32_t count) override {
+        if (count == 0) return;
+        uint32_t start = 0;
+        if (count_ == 0) {
+            start_time_ = timestamps[0];
+            end_time_ = timestamps[0];
+            first_value_ = values[0];
+            last_value_ = values[0];
+            min_value_ = values[0];
+            max_value_ = values[0];
+            sum_value_ = (double)values[0];
+            count_ = 1;
+            start = 1;
+        }
+        // Timestamps are monotonic (verified by TimePageWriter),
+        // so only first/last matter for start_time_/end_time_.
+        if (count > start) {
+            if (timestamps[start] < start_time_)
+                start_time_ = timestamps[start];
+            if (timestamps[count - 1] > end_time_)
+                end_time_ = timestamps[count - 1];
+        }
+        uint32_t i = start;
+#if TSFILE_HAS_NEON
+        {
+            int64x2_t vmin = vdupq_n_s64(min_value_);
+            int64x2_t vmax = vdupq_n_s64(max_value_);
+            float64x2_t vsum = vdupq_n_f64(0.0);
+            for (; i + 2 <= count; i += 2) {
+                int64x2_t v = vld1q_s64(&values[i]);
+                // min/max via compare+select (no vminq_s64 in NEON)
+                uint64x2_t lt = vcltq_s64(v, vmin);
+                vmin = vbslq_s64(lt, v, vmin);
+                uint64x2_t gt = vcgtq_s64(v, vmax);
+                vmax = vbslq_s64(gt, v, vmax);
+                vsum = vaddq_f64(vsum, vcvtq_f64_s64(v));
+            }
+            min_value_ =
+                std::min(vgetq_lane_s64(vmin, 0), vgetq_lane_s64(vmin, 1));
+            max_value_ =
+                std::max(vgetq_lane_s64(vmax, 0), vgetq_lane_s64(vmax, 1));
+            sum_value_ += vgetq_lane_f64(vsum, 0) + vgetq_lane_f64(vsum, 1);
+        }
+#endif
+        for (; i < count; i++) {
+            if (values[i] < min_value_) min_value_ = values[i];
+            if (values[i] > max_value_) max_value_ = values[i];
+            sum_value_ += (double)values[i];
+        }
+        last_value_ = values[count - 1];
+        count_ += (count - start);
+    }
+
+    FORCE_INLINE common::TSDataType get_type() override {
+        return common::INT64;
+    }
 
-    int serialize_typed_stat(common::ByteStream& out) {
+    int serialize_typed_stat(common::ByteStream& out) override {
         int ret = common::E_OK;
         if (RET_FAIL(common::SerializationUtil::write_ui64(min_value_, out))) {
         } else if (RET_FAIL(common::SerializationUtil::write_ui64(max_value_,
@@ -754,7 +888,7 @@ class Int64Statistic : public Statistic {
         }
         return ret;
     }
-    int deserialize_typed_stat(common::ByteStream& in) {
+    int deserialize_typed_stat(common::ByteStream& in) override {
         int ret = common::E_OK;
         if (RET_FAIL(common::SerializationUtil::read_ui64((uint64_t&)min_value_,
                                                           in))) {
@@ -769,15 +903,15 @@ class Int64Statistic : public Statistic {
         }
         return ret;
     }
-    int merge_with(Statistic* stat) {
+    int merge_with(Statistic* stat) override {
         MERGE_NUM_STAT_FROM(Int64Statistic, stat);
     }
 
-    int deep_copy_from(Statistic* stat) {
+    int deep_copy_from(Statistic* stat) override {
         DEEP_COPY_NUM_STAT_FROM(Int64Statistic, stat);
     }
 
-    std::string to_string() const {
+    std::string to_string() const override {
         std::ostringstream oss;
         oss << "{count=" << count_ << ", start_time=" << start_time_
             << ", end_time=" << end_time_ << ", first_val=" << first_value_
@@ -815,7 +949,7 @@ class FloatStatistic : public Statistic {
         last_value_ = that.last_value_;
     }
 
-    FORCE_INLINE void reset() {
+    FORCE_INLINE void reset() override {
         count_ = 0;
         sum_value_ = 0;
         min_value_ = 0;
@@ -823,13 +957,15 @@ class FloatStatistic : public Statistic {
         first_value_ = 0;
         last_value_ = 0;
     }
-    FORCE_INLINE void update(int64_t time, float value) {
+    FORCE_INLINE void update(int64_t time, float value) override {
         NUM_STAT_UPDATE(time, value);
     }
 
-    FORCE_INLINE common::TSDataType get_type() { return common::FLOAT; }
+    FORCE_INLINE common::TSDataType get_type() override {
+        return common::FLOAT;
+    }
 
-    int serialize_typed_stat(common::ByteStream& out) {
+    int serialize_typed_stat(common::ByteStream& out) override {
         int ret = common::E_OK;
         if (RET_FAIL(common::SerializationUtil::write_float(min_value_, out))) {
         } else if (RET_FAIL(common::SerializationUtil::write_float(max_value_,
@@ -843,7 +979,7 @@ class FloatStatistic : public Statistic {
         }
         return ret;
     }
-    int deserialize_typed_stat(common::ByteStream& in) {
+    int deserialize_typed_stat(common::ByteStream& in) override {
         int ret = common::E_OK;
         if (RET_FAIL(common::SerializationUtil::read_float(min_value_, in))) {
         } else if (RET_FAIL(
@@ -857,10 +993,10 @@ class FloatStatistic : public Statistic {
         }
         return ret;
     }
-    int merge_with(Statistic* stat) {
+    int merge_with(Statistic* stat) override {
         MERGE_NUM_STAT_FROM(FloatStatistic, stat);
     }
-    int deep_copy_from(Statistic* stat) {
+    int deep_copy_from(Statistic* stat) override {
         DEEP_COPY_NUM_STAT_FROM(FloatStatistic, stat);
     }
 };
@@ -892,7 +1028,7 @@ class DoubleStatistic : public Statistic {
         last_value_ = that.last_value_;
     }
 
-    FORCE_INLINE void reset() {
+    FORCE_INLINE void reset() override {
         count_ = 0;
         sum_value_ = 0;
         min_value_ = 0;
@@ -900,13 +1036,64 @@ class DoubleStatistic : public Statistic {
         first_value_ = 0;
         last_value_ = 0;
     }
-    FORCE_INLINE void update(int64_t time, double value) {
+    FORCE_INLINE void update(int64_t time, double value) override {
         NUM_STAT_UPDATE(time, value);
     }
 
-    FORCE_INLINE common::TSDataType get_type() { return common::DOUBLE; }
+    void update_batch(const int64_t* timestamps, const double* values,
+                      uint32_t count) override {
+        if (count == 0) return;
+        uint32_t start = 0;
+        if (count_ == 0) {
+            start_time_ = timestamps[0];
+            end_time_ = timestamps[0];
+            first_value_ = values[0];
+            last_value_ = values[0];
+            min_value_ = values[0];
+            max_value_ = values[0];
+            sum_value_ = values[0];
+            count_ = 1;
+            start = 1;
+        }
+        if (count > start) {
+            if (timestamps[start] < start_time_)
+                start_time_ = timestamps[start];
+            if (timestamps[count - 1] > end_time_)
+                end_time_ = timestamps[count - 1];
+        }
+        uint32_t i = start;
+#if TSFILE_HAS_NEON
+        {
+            float64x2_t vmin = vdupq_n_f64(min_value_);
+            float64x2_t vmax = vdupq_n_f64(max_value_);
+            float64x2_t vsum = vdupq_n_f64(0.0);
+            for (; i + 2 <= count; i += 2) {
+                float64x2_t v = vld1q_f64(&values[i]);
+                vmin = vminq_f64(vmin, v);
+                vmax = vmaxq_f64(vmax, v);
+                vsum = vaddq_f64(vsum, v);
+            }
+            min_value_ =
+                std::min(vgetq_lane_f64(vmin, 0), vgetq_lane_f64(vmin, 1));
+            max_value_ =
+                std::max(vgetq_lane_f64(vmax, 0), vgetq_lane_f64(vmax, 1));
+            sum_value_ += vgetq_lane_f64(vsum, 0) + vgetq_lane_f64(vsum, 1);
+        }
+#endif
+        for (; i < count; i++) {
+            if (values[i] < min_value_) min_value_ = values[i];
+            if (values[i] > max_value_) max_value_ = values[i];
+            sum_value_ += values[i];
+        }
+        last_value_ = values[count - 1];
+        count_ += (count - start);
+    }
+
+    FORCE_INLINE common::TSDataType get_type() override {
+        return common::DOUBLE;
+    }
 
-    int serialize_typed_stat(common::ByteStream& out) {
+    int serialize_typed_stat(common::ByteStream& out) override {
         int ret = common::E_OK;
         if (RET_FAIL(
                 common::SerializationUtil::write_double(min_value_, out))) {
@@ -921,7 +1108,7 @@ class DoubleStatistic : public Statistic {
         }
         return ret;
     }
-    int deserialize_typed_stat(common::ByteStream& in) {
+    int deserialize_typed_stat(common::ByteStream& in) override {
         int ret = common::E_OK;
         if (RET_FAIL(common::SerializationUtil::read_double(min_value_, in))) {
         } else if (RET_FAIL(common::SerializationUtil::read_double(max_value_,
@@ -935,10 +1122,10 @@ class DoubleStatistic : public Statistic {
         }
         return ret;
     }
-    int merge_with(Statistic* stat) {
+    int merge_with(Statistic* stat) override {
         MERGE_NUM_STAT_FROM(DoubleStatistic, stat);
     }
-    int deep_copy_from(Statistic* stat) {
+    int deep_copy_from(Statistic* stat) override {
         DEEP_COPY_NUM_STAT_FROM(DoubleStatistic, stat);
     }
 };
@@ -960,30 +1147,50 @@ class TimeStatistic : public Statistic {
         end_time_ = that.end_time_;
     }
 
-    FORCE_INLINE void reset() {
+    FORCE_INLINE void reset() override {
         count_ = 0;
         start_time_ = 0;
         end_time_ = 0;
     }
 
-    FORCE_INLINE void update(int64_t time) {
+    FORCE_INLINE void update(int64_t time) override {
         TIME_STAT_UPDATE((time));
         count_++;
     }
 
-    FORCE_INLINE common::TSDataType get_type() { return common::VECTOR; }
+    void update_time_batch(const int64_t* timestamps, uint32_t count) override {
+        if (count == 0) return;
+        if (count_ == 0) {
+            start_time_ = timestamps[0];
+            end_time_ = timestamps[0];
+        }
+        // Timestamps are already verified monotonic in TimePageWriter,
+        // so first element is min candidate and last is max candidate.
+        if (timestamps[0] < start_time_) start_time_ = timestamps[0];
+        if (timestamps[count - 1] > end_time_)
+            end_time_ = timestamps[count - 1];
+        count_ += count;
+    }
 
-    int serialize_typed_stat(common::ByteStream& out) { return common::E_OK; }
-    int deserialize_typed_stat(common::ByteStream& in) { return common::E_OK; }
-    int merge_with(Statistic* stat) {
+    FORCE_INLINE common::TSDataType get_type() override {
+        return common::VECTOR;
+    }
+
+    int serialize_typed_stat(common::ByteStream& out) override {
+        return common::E_OK;
+    }
+    int deserialize_typed_stat(common::ByteStream& in) override {
+        return common::E_OK;
+    }
+    int merge_with(Statistic* stat) override {
         MERGE_TIME_STAT_FROM(TimeStatistic, stat);
     }
 
-    int deep_copy_from(Statistic* stat) {
+    int deep_copy_from(Statistic* stat) override {
         DEEP_COPY_TIME_STAT_FROM(TimeStatistic, stat);
     }
 
-    std::string to_string() const {
+    std::string to_string() const override {
         std::ostringstream oss;
         oss << "{count=" << count_ << ", start_time=" << start_time_
             << ", end_time=" << end_time_ << "}";
@@ -992,7 +1199,9 @@ class TimeStatistic : public Statistic {
 };
 
 class TimestampStatistics : public Int64Statistic {
-    FORCE_INLINE common::TSDataType get_type() { return common::TIMESTAMP; }
+    FORCE_INLINE common::TSDataType get_type() override {
+        return common::TIMESTAMP;
+    }
 };
 
 class StringStatistic : public Statistic {
@@ -1002,35 +1211,24 @@ class StringStatistic : public Statistic {
     common::String first_value_;
     common::String last_value_;
     StringStatistic()
-        : min_value_(),
-          max_value_(),
-          first_value_(),
-          last_value_(),
-          pa_(nullptr),
-          owns_pa_(true) {
+        : min_value_(), max_value_(), first_value_(), last_value_() {
         pa_ = new common::PageArena();
         pa_->init(512, common::MOD_STATISTIC_OBJ);
     }
 
     StringStatistic(common::PageArena* pa)
-        : min_value_(),
-          max_value_(),
-          first_value_(),
-          last_value_(),
-          pa_(pa),
-          owns_pa_(false) {}
+        : min_value_(), max_value_(), first_value_(), last_value_(), pa_(pa) {}
 
     ~StringStatistic() { destroy(); }
 
-    void destroy() {
-        if (owns_pa_ && pa_) {
+    void destroy() override {
+        if (pa_) {
             delete pa_;
             pa_ = nullptr;
         }
-        owns_pa_ = false;
     }
 
-    FORCE_INLINE void reset() {
+    FORCE_INLINE void reset() override {
         count_ = 0;
         start_time_ = 0;
         end_time_ = 0;
@@ -1050,13 +1248,15 @@ class StringStatistic : public Statistic {
         last_value_.dup_from(that.last_value_, *pa_);
     }
 
-    FORCE_INLINE void update(int64_t time, common::String value) {
+    FORCE_INLINE void update(int64_t time, common::String value) override {
         STRING_STAT_UPDATE(time, value);
     }
 
-    FORCE_INLINE common::TSDataType get_type() { return common::STRING; }
+    FORCE_INLINE common::TSDataType get_type() override {
+        return common::STRING;
+    }
 
-    int serialize_typed_stat(common::ByteStream& out) {
+    int serialize_typed_stat(common::ByteStream& out) override {
         int ret = common::E_OK;
         if (RET_FAIL(common::SerializationUtil::write_str(first_value_, out))) {
         } else if (RET_FAIL(common::SerializationUtil::write_str(last_value_,
@@ -1068,7 +1268,7 @@ class StringStatistic : public Statistic {
         }
         return ret;
     }
-    int deserialize_typed_stat(common::ByteStream& in) {
+    int deserialize_typed_stat(common::ByteStream& in) override {
         int ret = common::E_OK;
         if (RET_FAIL(
                 common::SerializationUtil::read_str(first_value_, pa_, in))) {
@@ -1081,42 +1281,39 @@ class StringStatistic : public Statistic {
         }
         return ret;
     }
-    int merge_with(Statistic* stat) {
+    int merge_with(Statistic* stat) override {
         MERGE_STRING_STAT_FROM(StringStatistic, stat);
     }
-    int deep_copy_from(Statistic* stat) {
+    int deep_copy_from(Statistic* stat) override {
         DEEP_COPY_STRING_STAT_FROM(StringStatistic, stat);
     }
 
    private:
     common::PageArena* pa_;
-    bool owns_pa_;
 };
 
 class TextStatistic : public Statistic {
    public:
     common::String first_value_;
     common::String last_value_;
-    TextStatistic()
-        : first_value_(), last_value_(), pa_(nullptr), owns_pa_(true) {
+    TextStatistic() : first_value_(), last_value_() {
         pa_ = new common::PageArena();
         pa_->init(512, common::MOD_STATISTIC_OBJ);
     }
 
     TextStatistic(common::PageArena* pa)
-        : first_value_(), last_value_(), pa_(pa), owns_pa_(false) {}
+        : first_value_(), last_value_(), pa_(pa) {}
 
     ~TextStatistic() { destroy(); }
 
-    void destroy() {
-        if (owns_pa_ && pa_) {
+    void destroy() override {
+        if (pa_) {
             delete pa_;
             pa_ = nullptr;
         }
-        owns_pa_ = false;
     }
 
-    FORCE_INLINE void reset() {
+    FORCE_INLINE void reset() override {
         count_ = 0;
         start_time_ = 0;
         end_time_ = 0;
@@ -1132,13 +1329,13 @@ class TextStatistic : public Statistic {
         last_value_.dup_from(that.last_value_, *pa_);
     }
 
-    FORCE_INLINE void update(int64_t time, common::String value) {
+    FORCE_INLINE void update(int64_t time, common::String value) override {
         TEXT_STAT_UPDATE(time, value);
     }
 
-    FORCE_INLINE common::TSDataType get_type() { return common::TEXT; }
+    FORCE_INLINE common::TSDataType get_type() override { return common::TEXT; }
 
-    int serialize_typed_stat(common::ByteStream& out) {
+    int serialize_typed_stat(common::ByteStream& out) override {
         int ret = common::E_OK;
         if (RET_FAIL(common::SerializationUtil::write_str(first_value_, out))) {
         } else if (RET_FAIL(common::SerializationUtil::write_str(last_value_,
@@ -1146,7 +1343,7 @@ class TextStatistic : public Statistic {
         }
         return ret;
     }
-    int deserialize_typed_stat(common::ByteStream& in) {
+    int deserialize_typed_stat(common::ByteStream& in) override {
         int ret = common::E_OK;
         if (RET_FAIL(
                 common::SerializationUtil::read_str(first_value_, pa_, in))) {
@@ -1155,35 +1352,33 @@ class TextStatistic : public Statistic {
         }
         return ret;
     }
-    int merge_with(Statistic* stat) {
+    int merge_with(Statistic* stat) override {
         MERGE_TEXT_STAT_FROM(TextStatistic, stat);
     }
-    int deep_copy_from(Statistic* stat) {
+    int deep_copy_from(Statistic* stat) override {
         DEEP_COPY_TEXT_STAT_FROM(TextStatistic, stat);
     }
 
    private:
     common::PageArena* pa_;
-    bool owns_pa_;
 };
 
 class BlobStatistic : public Statistic {
    public:
-    BlobStatistic() : pa_(nullptr), owns_pa_(true) {
+    BlobStatistic() {
         pa_ = new common::PageArena();
         pa_->init(512, common::MOD_STATISTIC_OBJ);
     }
 
-    BlobStatistic(common::PageArena* pa) : pa_(pa), owns_pa_(false) {}
+    BlobStatistic(common::PageArena* pa) {}
 
     ~BlobStatistic() { destroy(); }
 
     void destroy() {
-        if (owns_pa_ && pa_) {
+        if (pa_) {
             delete pa_;
             pa_ = nullptr;
         }
-        owns_pa_ = false;
     }
 
     FORCE_INLINE void reset() {
@@ -1214,7 +1409,6 @@ class BlobStatistic : public Statistic {
 
    private:
     common::PageArena* pa_;
-    bool owns_pa_;
 };
 
 FORCE_INLINE uint32_t get_typed_statistic_sizeof(common::TSDataType type) {
diff --git a/cpp/src/common/tablet.cc b/cpp/src/common/tablet.cc
index b9ae5301a..ba37a3245 100644
--- a/cpp/src/common/tablet.cc
+++ b/cpp/src/common/tablet.cc
@@ -20,8 +20,10 @@
 #include "tablet.h"
 
 #include <cstdlib>
+#include <limits>
 
 #include "allocator/alloc_base.h"
+#include "container/bit_map.h"
 #include "datatype/date_converter.h"
 #include "utils/errno_define.h"
 
@@ -98,14 +100,13 @@ int Tablet::init() {
             case BLOB:
             case TEXT:
             case STRING: {
-                auto* sc = static_cast<StringColumn*>(common::mem_alloc(
-                    sizeof(StringColumn), common::MOD_TABLET));
-                if (sc == nullptr) return E_OOM;
-                new (sc) StringColumn();
-                // 8 bytes/row is a conservative initial estimate for short
-                // string columns (e.g. device IDs, tags). The buffer grows
-                // automatically on demand via mem_realloc.
-                sc->init(max_row_num_, max_row_num_ * 8);
+                void* mem =
+                    common::mem_alloc(sizeof(StringColumn), common::MOD_TABLET);
+                if (mem == nullptr) {
+                    return E_OOM;
+                }
+                auto* sc = new (mem) StringColumn();
+                sc->init(max_row_num_, max_row_num_ * 32);
                 value_matrix_[c].string_col = sc;
                 break;
             }
@@ -120,8 +121,9 @@ int Tablet::init() {
     if (bitmaps_ == nullptr) return E_OOM;
     for (size_t c = 0; c < schema_count; c++) {
         new (&bitmaps_[c]) BitMap();
-        bitmaps_[c].init(max_row_num_, false);
+        bitmaps_[c].init(max_row_num_, false, common::MOD_TABLET);
     }
+
     return E_OK;
 }
 
@@ -156,6 +158,7 @@ void Tablet::destroy() {
                 case TEXT:
                 case STRING:
                     value_matrix_[c].string_col->destroy();
+                    value_matrix_[c].string_col->~StringColumn();
                     common::mem_free(value_matrix_[c].string_col);
                     break;
                 default:
@@ -192,9 +195,7 @@ int Tablet::add_timestamp(uint32_t row_index, int64_t timestamp) {
 }
 
 int Tablet::set_timestamps(const int64_t* timestamps, uint32_t count) {
-    if (err_code_ != E_OK) {
-        return err_code_;
-    }
+    if (err_code_ != E_OK) return err_code_;
     ASSERT(timestamps_ != NULL);
     if (UNLIKELY(count > static_cast<uint32_t>(max_row_num_))) {
         return E_OUT_OF_RANGE;
@@ -206,15 +207,10 @@ int Tablet::set_timestamps(const int64_t* timestamps, uint32_t count) {
 
 int Tablet::set_column_values(uint32_t schema_index, const void* data,
                               const uint8_t* bitmap, uint32_t count) {
-    if (err_code_ != E_OK) {
-        return err_code_;
-    }
-    if (UNLIKELY(schema_index >= schema_vec_->size())) {
-        return E_OUT_OF_RANGE;
-    }
-    if (UNLIKELY(count > static_cast<uint32_t>(max_row_num_))) {
+    if (err_code_ != E_OK) return err_code_;
+    if (UNLIKELY(schema_index >= schema_vec_->size())) return E_OUT_OF_RANGE;
+    if (UNLIKELY(count > static_cast<uint32_t>(max_row_num_)))
         return E_OUT_OF_RANGE;
-    }
 
     const MeasurementSchema& schema = schema_vec_->at(schema_index);
     size_t elem_size = 0;
@@ -250,9 +246,13 @@ int Tablet::set_column_values(uint32_t schema_index, const void* data,
     if (bitmap == nullptr) {
         bitmaps_[schema_index].clear_all();
     } else {
-        char* tsfile_bm = bitmaps_[schema_index].get_bitmap();
+        // copy_from also refreshes has_set_bits_; a plain memcpy into
+        // get_bitmap() would leave the flag stale (e.g. cleared by a prior
+        // clear_all()) and downstream may_have_set_bits() checks would skip
+        // null-mask handling for the column.
         uint32_t bm_bytes = (count + 7) / 8;
-        std::memcpy(tsfile_bm, bitmap, bm_bytes);
+        bitmaps_[schema_index].copy_from(reinterpret_cast<const char*>(bitmap),
+                                         bm_bytes);
     }
     cur_row_size_ = std::max(count, cur_row_size_);
     return E_OK;
@@ -271,15 +271,36 @@ int Tablet::set_column_string_values(uint32_t schema_index,
         return E_OUT_OF_RANGE;
     }
 
+    // Reject non-string types: the union member is StringColumn*, but for
+    // numeric columns the same slot holds the numeric buffer pointer.
+    // Interpreting it as StringColumn* and writing into ->buffer/->offsets
+    // would corrupt the numeric buffer.
+    const TSDataType dt = schema_vec_->at(schema_index).data_type_;
+    if (dt != STRING && dt != TEXT && dt != BLOB) {
+        return E_TYPE_NOT_MATCH;
+    }
     StringColumn* sc = value_matrix_[schema_index].string_col;
     if (sc == nullptr) {
         return E_INVALID_ARG;
     }
 
+    // offsets is the Arrow-style "offsets" array (count + 1 entries).  All
+    // downstream code assumes offsets[0] == 0, offsets are non-negative,
+    // and offsets[i] <= offsets[i+1].  Skipping these checks would let a
+    // caller pass e.g. {0, 10, 5} and trigger an unsigned underflow on
+    // (offsets[i+1] - offsets[i]) at serialize time, plus a wild memcpy.
+    if (UNLIKELY(offsets == nullptr)) return E_INVALID_ARG;
+    if (UNLIKELY(offsets[0] != 0)) return E_INVALID_ARG;
+    for (uint32_t i = 0; i < count; i++) {
+        if (UNLIKELY(offsets[i + 1] < offsets[i])) return E_INVALID_ARG;
+    }
+    if (UNLIKELY(offsets[count] < 0)) return E_INVALID_ARG;
     uint32_t total_bytes = static_cast<uint32_t>(offsets[count]);
     if (total_bytes > sc->buf_capacity) {
+        char* new_buf = (char*)mem_realloc(sc->buffer, total_bytes);
+        if (UNLIKELY(new_buf == nullptr)) return E_OOM;
+        sc->buffer = new_buf;
         sc->buf_capacity = total_bytes;
-        sc->buffer = (char*)mem_realloc(sc->buffer, sc->buf_capacity);
     }
 
     if (total_bytes > 0) {
@@ -291,14 +312,74 @@ int Tablet::set_column_string_values(uint32_t schema_index,
     if (bitmap == nullptr) {
         bitmaps_[schema_index].clear_all();
     } else {
-        char* tsfile_bm = bitmaps_[schema_index].get_bitmap();
         uint32_t bm_bytes = (count + 7) / 8;
-        std::memcpy(tsfile_bm, bitmap, bm_bytes);
+        bitmaps_[schema_index].copy_from(reinterpret_cast<const char*>(bitmap),
+                                         bm_bytes);
+    }
+    cur_row_size_ = std::max(count, cur_row_size_);
+    return E_OK;
+}
+
+int Tablet::set_column_string_repeated(uint32_t schema_index, const char* str,
+                                       uint32_t str_len, uint32_t count) {
+    if (err_code_ != E_OK) return err_code_;
+    if (UNLIKELY(schema_index >= schema_vec_->size())) return E_OUT_OF_RANGE;
+    if (UNLIKELY(count > static_cast<uint32_t>(max_row_num_)))
+        return E_OUT_OF_RANGE;
+
+    // See set_column_string_values: the union member is only valid as
+    // StringColumn* when the schema column is a variable-width type.
+    const TSDataType dt = schema_vec_->at(schema_index).data_type_;
+    if (dt != STRING && dt != TEXT && dt != BLOB) {
+        return E_TYPE_NOT_MATCH;
+    }
+    StringColumn* sc = value_matrix_[schema_index].string_col;
+    if (sc == nullptr) return E_INVALID_ARG;
+
+    // str_len * count can overflow uint32_t; do the multiply in uint64_t and
+    // reject anything that wouldn't fit, otherwise the subsequent loop would
+    // walk past the truncated buf_capacity allocation.
+    uint64_t total_bytes_64 =
+        static_cast<uint64_t>(str_len) * static_cast<uint64_t>(count);
+    if (total_bytes_64 > std::numeric_limits<uint32_t>::max()) {
+        return E_OVERFLOW;
     }
+    uint32_t total_bytes = static_cast<uint32_t>(total_bytes_64);
+    if (total_bytes > sc->buf_capacity) {
+        char* new_buf = (char*)mem_realloc(sc->buffer, total_bytes);
+        if (UNLIKELY(new_buf == nullptr)) return E_OOM;
+        sc->buffer = new_buf;
+        sc->buf_capacity = total_bytes;
+    }
+
+    for (uint32_t i = 0; i < count; i++) {
+        sc->offsets[i] = i * str_len;
+        memcpy(sc->buffer + i * str_len, str, str_len);
+    }
+    sc->offsets[count] = total_bytes;
+    sc->buf_used = total_bytes;
+
+    bitmaps_[schema_index].clear_all();
     cur_row_size_ = std::max(count, cur_row_size_);
     return E_OK;
 }
 
+void Tablet::reset(uint32_t row_count) {
+    ASSERT(row_count <= max_row_num_);
+    cur_row_size_ = row_count;
+    reset_string_columns();
+    // Bitmaps init to all-null (bit=1); writes flip bits to mark non-null.
+    // Without resetting them here, a reused Tablet would inherit cleared
+    // bits from the previous batch, causing stale values to be reported as
+    // non-null and written out again.
+    if (bitmaps_ != nullptr) {
+        const size_t schema_count = schema_vec_->size();
+        for (size_t c = 0; c < schema_count; c++) {
+            bitmaps_[c].reset();
+        }
+    }
+}
+
 void* Tablet::get_value(int row_index, uint32_t schema_index,
                         common::TSDataType& data_type) const {
     if (UNLIKELY(schema_index >= schema_vec_->size())) {
@@ -505,31 +586,21 @@ void Tablet::reset_string_columns() {
     }
 }
 
-// Find all row indices where the device ID changes.  A device ID is the
-// composite key formed by all id columns (e.g. region + sensor_id).  Row i
-// is a boundary when at least one id column differs between row i-1 and row i.
-//
-// Example (2 id columns: region, sensor_id):
-//   row 0: "A", "s1"
-//   row 1: "A", "s2"  <- boundary: sensor_id changed
-//   row 2: "B", "s1"  <- boundary: region changed
-//   row 3: "B", "s1"
-//   row 4: "B", "s2"  <- boundary: sensor_id changed
-//   result: [1, 2, 4]
-//
-// Boundaries are computed in one shot at flush time rather than maintained
-// incrementally during add_value / set_column_*. The total work is similar
-// either way, but batch computation here is far more CPU-friendly: the inner
-// loop is a tight memcmp scan over contiguous buffers with good cache
-// locality, and the CPU can pipeline comparisons without the branch overhead
-// and cache thrashing of per-row bookkeeping spread across the write path.
 std::vector<uint32_t> Tablet::find_all_device_boundaries() const {
     const uint32_t row_count = get_cur_row_size();
     if (row_count <= 1) return {};
 
+    // Use uint64_t bitmap instead of vector<bool> for faster set/test/scan.
     const uint32_t nwords = (row_count + 63) / 64;
     std::vector<uint64_t> boundary(nwords, 0);
 
+    // Walk id columns RIGHT to LEFT.  In time-series tag systems the rightmost
+    // tags (sensor_id, metric_name, etc.) typically have the highest
+    // cardinality and change most often.  By processing them first we mark most
+    // of the boundary bitmap early; subsequent (lower-cardinality) columns then
+    // short- circuit on `boundary[i] already set` for the bulk of their rows.
+    // Reverse order also lets us bail out of the entire scan as soon as every
+    // possible boundary is marked.
     uint32_t boundary_count = 0;
     const uint32_t max_boundaries = row_count - 1;
     for (auto it = id_column_indexes_.rbegin(); it != id_column_indexes_.rend();
@@ -537,43 +608,55 @@ std::vector<uint32_t> Tablet::find_all_device_boundaries() const {
         const StringColumn& sc = *value_matrix_[*it].string_col;
         const int32_t* off = sc.offsets;
         const char* buf = sc.buffer;
+        common::BitMap& bitmap = const_cast<common::BitMap&>(bitmaps_[*it]);
         for (uint32_t i = 1; i < row_count; i++) {
-            if (boundary[i >> 6] & (1ULL << (i & 63))) continue;
+            if (boundary[i >> 6] & (1ULL << (i & 63))) {
+                continue;
+            }
+            const bool prev_null = bitmap.test(i - 1);
+            const bool curr_null = bitmap.test(i);
+            if (prev_null != curr_null) {
+                boundary[i >> 6] |= (1ULL << (i & 63));
+                if (++boundary_count >= max_boundaries) {
+                    break;
+                }
+                continue;
+            }
+            if (prev_null) {
+                continue;
+            }
+            // Signed int32 widths so an offset-array corruption that would
+            // otherwise underflow to a huge unsigned value surfaces as
+            // len < 0 instead.  memcmp's size_t param needs an explicit cast,
+            // guarded by `len_a > 0`.
             int32_t len_a = off[i] - off[i - 1];
             int32_t len_b = off[i + 1] - off[i];
             if (len_a != len_b ||
                 (len_a > 0 && memcmp(buf + off[i - 1], buf + off[i],
-                                     static_cast<uint32_t>(len_a)) != 0)) {
+                                     static_cast<size_t>(len_a)) != 0)) {
                 boundary[i >> 6] |= (1ULL << (i & 63));
-                if (++boundary_count >= max_boundaries) break;
+                if (++boundary_count >= max_boundaries) {
+                    break;
+                }
             }
         }
-        if (boundary_count >= max_boundaries) break;
-    }
-
-    // Sweep the bitmap word by word, extracting set bit positions in order.
-    // Each word covers 64 consecutive rows: word w covers rows [w*64, w*64+63].
-    //
-    // For each word we use two standard bit tricks:
-    //   __builtin_ctzll(bits)  — count trailing zeros = index of lowest set bit
-    //   bits &= bits - 1       — clear the lowest set bit
-    //
-    // Example: w=1, bits=0b...00010100 (bits 2 and 4 set)
-    //   iter 1: ctzll=2 → idx=1*64+2=66, bits becomes 0b...00010000
-    //   iter 2: ctzll=4 → idx=1*64+4=68, bits becomes 0b...00000000 → exit
-    //
-    // Guards: idx>0 because row 0 can never be a boundary (no predecessor);
-    // idx<row_count trims padding bits in the last word when row_count%64 != 0.
+        if (boundary_count >= max_boundaries) {
+            break;
+        }
+    }
+
+    // Collect boundary positions using bitscan
     std::vector<uint32_t> result;
     for (uint32_t w = 0; w < nwords; w++) {
         uint64_t bits = boundary[w];
         while (bits) {
-            uint32_t bit = bitops::ctz64_nonzero(bits);
+            uint32_t bit =
+                static_cast<uint32_t>(common::bitops::ctz_nonzero(bits));
             uint32_t idx = w * 64 + bit;
             if (idx > 0 && idx < row_count) {
                 result.push_back(idx);
             }
-            bits &= bits - 1;
+            bits &= bits - 1;  // clear lowest set bit
         }
     }
     return result;
@@ -612,4 +695,4 @@ std::shared_ptr<IDeviceID> Tablet::get_device_id(int i) const {
     return res;
 }
 
-}  // end namespace storage
\ No newline at end of file
+}  // end namespace storage
diff --git a/cpp/src/common/tablet.h b/cpp/src/common/tablet.h
index 799d6b7cc..76af3ac0e 100644
--- a/cpp/src/common/tablet.h
+++ b/cpp/src/common/tablet.h
@@ -22,7 +22,6 @@
 
 #include <algorithm>
 #include <memory>
-#include <utility>
 #include <vector>
 
 #include "common/config/config.h"
@@ -47,7 +46,6 @@ class TabletColIterator;
  * with their associated metadata such as column names and types.
  */
 class Tablet {
-   public:
     // Arrow-style string column: offsets + contiguous buffer.
     // string[i] = buffer + offsets[i], len = offsets[i+1] - offsets[i]
     struct StringColumn {
@@ -61,11 +59,10 @@ class Tablet {
 
         void init(uint32_t max_rows, uint32_t init_buf_capacity) {
             offsets = (int32_t*)common::mem_alloc(
-                sizeof(int32_t) * (max_rows + 1), common::MOD_DEFAULT);
+                sizeof(int32_t) * (max_rows + 1), common::MOD_TABLET);
             offsets[0] = 0;
             buf_capacity = init_buf_capacity;
-            buffer =
-                (char*)common::mem_alloc(buf_capacity, common::MOD_DEFAULT);
+            buffer = (char*)common::mem_alloc(buf_capacity, common::MOD_TABLET);
             buf_used = 0;
         }
 
@@ -98,14 +95,13 @@ class Tablet {
             return buffer + offsets[row];
         }
         uint32_t get_len(uint32_t row) const {
-            return static_cast<uint32_t>(offsets[row + 1] - offsets[row]);
+            return offsets[row + 1] - offsets[row];
         }
         // Return a String view for a given row. The returned reference is
         // valid until the next call to get_string_view on this column.
         common::String& get_string_view(uint32_t row) {
             view_cache_.buf_ = buffer + offsets[row];
-            view_cache_.len_ =
-                static_cast<uint32_t>(offsets[row + 1] - offsets[row]);
+            view_cache_.len_ = offsets[row + 1] - offsets[row];
             return view_cache_;
         }
 
@@ -231,11 +227,14 @@ class Tablet {
 
     ~Tablet() { destroy(); }
 
-    // Tablet owns raw heap buffers (timestamps_, value_matrix_, bitmaps_) that
-    // destroy() frees. The implicitly generated copy operations would shallow-
-    // copy those pointers, causing double-free / use-after-free, so copying is
-    // disabled. Move transfers ownership and leaves the source empty (its
-    // pointers nulled) so the moved-from object destructs harmlessly.
+    // Tablet owns several heap buffers (timestamps_, value_matrix_ with its
+    // StringColumn::buffer/offsets, bitmaps_) that ~Tablet frees. The default
+    // copy ctor / copy-assign shallow-copies the raw pointers, so any copy
+    // path (e.g. `return tablet;` without NRVO under MSVC Debug) leaves the
+    // source Tablet's destructor freeing buffers the copy still points at,
+    // triggering heap-use-after-free in code like
+    // Tablet::find_all_device_boundaries. Make Tablet move-only with a
+    // pointer-stealing move ctor / move-assign so return-by-value is safe.
     Tablet(const Tablet&) = delete;
     Tablet& operator=(const Tablet&) = delete;
 
@@ -250,10 +249,14 @@ class Tablet {
           value_matrix_(other.value_matrix_),
           bitmaps_(other.bitmaps_),
           column_categories_(std::move(other.column_categories_)),
-          id_column_indexes_(std::move(other.id_column_indexes_)) {
+          id_column_indexes_(std::move(other.id_column_indexes_)),
+          single_device_(other.single_device_) {
         other.timestamps_ = nullptr;
         other.value_matrix_ = nullptr;
         other.bitmaps_ = nullptr;
+        other.cur_row_size_ = 0;
+        // Leaving other.schema_vec_ moved-from is fine; destroy() only
+        // touches the heap buffers above, which we've now nulled out.
     }
 
     Tablet& operator=(Tablet&& other) noexcept {
@@ -270,9 +273,11 @@ class Tablet {
             bitmaps_ = other.bitmaps_;
             column_categories_ = std::move(other.column_categories_);
             id_column_indexes_ = std::move(other.id_column_indexes_);
+            single_device_ = other.single_device_;
             other.timestamps_ = nullptr;
             other.value_matrix_ = nullptr;
             other.bitmaps_ = nullptr;
+            other.cur_row_size_ = 0;
         }
         return *this;
     }
@@ -283,12 +288,6 @@ class Tablet {
     }
     size_t get_column_count() const { return schema_vec_->size(); }
     uint32_t get_cur_row_size() const { return cur_row_size_; }
-    int64_t get_timestamp(uint32_t row_index) const {
-        return timestamps_[row_index];
-    }
-    bool is_null(uint32_t row_index, uint32_t col_index) const {
-        return bitmaps_[col_index].test(row_index);
-    }
 
     /**
      * @brief Adds a timestamp to the specified row.
@@ -300,25 +299,27 @@ class Tablet {
      */
     int add_timestamp(uint32_t row_index, int64_t timestamp);
 
-    /**
-     * @brief Bulk copy timestamps into the tablet.
-     *
-     * @param timestamps Pointer to an array of timestamp values.
-     * @param count Number of timestamps to copy. Must be <= max_row_num.
-     *        If count > cur_row_size_, cur_row_size_ is updated to count,
-     *        so that subsequent operations know how many rows are populated.
-     * @return Returns 0 on success, or a non-zero error code on failure
-     *         (E_OUT_OF_RANGE if count > max_row_num).
-     */
     int set_timestamps(const int64_t* timestamps, uint32_t count);
 
-    // Bulk copy fixed-length column data. If bitmap is nullptr, all rows are
-    // non-null. Otherwise bit=1 means null, bit=0 means valid (same as TsFile
-    // BitMap convention). Callers using other conventions (e.g. Arrow, where
-    // 1=valid) must invert before calling.
+    // Bulk copy fixed-length column data. bitmap=nullptr means all non-null.
+    // bitmap uses TsFile convention: bit=1 is null, bit=0 is valid.
     int set_column_values(uint32_t schema_index, const void* data,
                           const uint8_t* bitmap, uint32_t count);
 
+    // Bulk copy a STRING column from Arrow-style offsets + flat data buffer.
+    // bitmap=nullptr means all non-null; same convention as set_column_values.
+    int set_column_string_values(uint32_t schema_index, const int32_t* offsets,
+                                 const char* data, const uint8_t* bitmap,
+                                 uint32_t count);
+
+    // Bulk fill a STRING column with the same value for all rows.
+    int set_column_string_repeated(uint32_t schema_index, const char* str,
+                                   uint32_t str_len, uint32_t count);
+
+    // Reset per-batch state so the tablet can be reused without reallocating
+    // its backing buffers. row_count is typically 0 before refilling.
+    void reset(uint32_t row_count = 0);
+
     void* get_value(int row_index, uint32_t schema_index,
                     common::TSDataType& data_type) const;
     /**
@@ -341,14 +342,10 @@ class Tablet {
     std::shared_ptr<IDeviceID> get_device_id(int i) const;
     std::vector<uint32_t> find_all_device_boundaries() const;
 
-    // Bulk copy string column data (offsets + data buffer).
-    // offsets has count+1 entries and must start from 0 (offsets[0] == 0).
-    // bitmap follows TsFile convention (bit=1 means null, nullptr means all
-    // valid). Callers using Arrow convention (bit=1 means valid) must invert
-    // before calling.
-    int set_column_string_values(uint32_t schema_index, const int32_t* offsets,
-                                 const char* data, const uint8_t* bitmap,
-                                 uint32_t count);
+    // When the caller guarantees that all rows belong to a single device,
+    // set this flag to skip the O(n*m) boundary detection in the write path.
+    void set_single_device(bool v) { single_device_ = v; }
+    bool is_single_device() const { return single_device_; }
     /**
      * @brief Template function to add a value of type T to the specified row
      * and column by name.
@@ -406,6 +403,7 @@ class Tablet {
     common::BitMap* bitmaps_;
     std::vector<common::ColumnCategory> column_categories_;
     std::vector<int> id_column_indexes_;
+    bool single_device_ = false;
 };
 
 }  // end namespace storage
diff --git a/cpp/src/common/thread_pool.h b/cpp/src/common/thread_pool.h
index f82aea038..191001bd9 100644
--- a/cpp/src/common/thread_pool.h
+++ b/cpp/src/common/thread_pool.h
@@ -27,7 +27,6 @@
 #include <mutex>
 #include <queue>
 #include <thread>
-#include <type_traits>
 #include <vector>
 
 namespace common {
@@ -38,12 +37,27 @@ namespace common {
 // (column-parallel decoding).
 class ThreadPool {
    public:
-    explicit ThreadPool(size_t num_threads) : stop_(false), active_(0) {
-        for (size_t i = 0; i < num_threads; i++) {
-            workers_.emplace_back([this] { worker_loop(); });
+    explicit ThreadPool(size_t num_threads)
+        // A zero-thread pool would silently accept submit() but wait_all()
+        // would block forever because active_ never reaches 0.  init_common()
+        // already clamps the configured size to >= 1 before building the
+        // global pool; this normalization is a defensive backstop so any
+        // direct ThreadPool(0) still makes progress.
+        : num_threads_(num_threads == 0 ? 1 : num_threads),
+          stop_(false),
+          active_(0) {
+        for (size_t i = 0; i < num_threads_; i++) {
+            workers_.emplace_back([this, i] { worker_loop(i); });
         }
     }
 
+    // Returns this worker's index in [0, num_threads).  Returns SIZE_MAX when
+    // called from a non-pool thread.  Used by callers that want per-worker
+    // state (e.g., per-worker decoders/compressors).
+    static size_t current_worker_id() { return tl_worker_id_(); }
+
+    size_t num_threads() const { return num_threads_; }
+
     ~ThreadPool() {
         {
             std::lock_guard<std::mutex> lk(mu_);
@@ -88,7 +102,8 @@ class ThreadPool {
     }
 
    private:
-    void worker_loop() {
+    void worker_loop(size_t id) {
+        tl_worker_id_() = id;
         while (true) {
             std::function<void()> task;
             {
@@ -98,7 +113,23 @@ class ThreadPool {
                 task = std::move(tasks_.front());
                 tasks_.pop();
             }
-            task();
+            // Without the try/catch, a task that throws would:
+            //   (1) skip the active_-- below → wait_all() blocks forever
+            //       because active_ never drops to zero, and
+            //   (2) propagate the exception out of the std::thread function
+            //       → std::terminate() takes down the whole process.
+            // Swallowing the exception is unfortunate but it matches the
+            // contract of the public submit(std::function<void()>) overload
+            // which has no way to surface the failure back to the caller.
+            // submit<F>() callers receive their error via the std::future
+            // wrapper installed by std::packaged_task — that path never
+            // reaches here, so this catch only fires for fire-and-forget
+            // tasks where the alternative is termination.
+            try {
+                task();
+            } catch (...) {
+                // Intentionally suppressed; see comment above.
+            }
             {
                 std::lock_guard<std::mutex> lk(mu_);
                 active_--;
@@ -107,6 +138,14 @@ class ThreadPool {
         }
     }
 
+    // Wrapped in a function so static-initialization order is well-defined
+    // (function-local static is zero-initialized to a sentinel).
+    static size_t& tl_worker_id_() {
+        static thread_local size_t id = static_cast<size_t>(-1);
+        return id;
+    }
+
+    size_t num_threads_;
     std::vector<std::thread> workers_;
     std::queue<std::function<void()>> tasks_;
     std::mutex mu_;
diff --git a/cpp/src/common/tsblock/tsblock.h b/cpp/src/common/tsblock/tsblock.h
index 859ad393d..b68af1611 100644
--- a/cpp/src/common/tsblock/tsblock.h
+++ b/cpp/src/common/tsblock/tsblock.h
@@ -144,6 +144,12 @@ class RowAppender {
         ASSERT(tsblock_->row_count_ > 0);
         tsblock_->row_count_--;
     }
+    FORCE_INLINE uint32_t remaining() const {
+        return tsblock_->max_row_count_ - tsblock_->row_count_;
+    }
+    FORCE_INLINE void add_rows(uint32_t count) {
+        tsblock_->row_count_ += count;
+    }
 
     FORCE_INLINE void append(uint32_t slot_index, const char* value,
                              uint32_t len) {
@@ -222,6 +228,19 @@ class ColAppender {
     }
     FORCE_INLINE void reset() { column_row_count_ = 0; }
 
+    FORCE_INLINE void bulk_append_fixed(const char* data, uint32_t count,
+                                        uint32_t elem_size) {
+        vec_->get_value_data().append_fixed_value(data, count * elem_size);
+        vec_->add_row_nums(count);
+        column_row_count_ += count;
+    }
+
+    FORCE_INLINE uint32_t get_column_row_count() const {
+        return column_row_count_;
+    }
+
+    FORCE_INLINE Vector* get_vector() { return vec_; }
+
    private:
     uint32_t column_index_;
     uint32_t column_row_count_;
@@ -242,6 +261,8 @@ class RowIterator {
 
     FORCE_INLINE bool has_next() { return row_id_ < tsblock_->row_count_; }
 
+    FORCE_INLINE uint32_t get_row_id() const { return row_id_; }
+
     FORCE_INLINE uint32_t get_column_count() { return column_count_; }
 
     FORCE_INLINE TSDataType get_data_type(uint32_t column_index) {
@@ -251,17 +272,14 @@ class RowIterator {
 
     FORCE_INLINE void next() {
         ASSERT(row_id_ < tsblock_->row_count_);
-        ++row_id_;
+        const uint32_t current_row_id = row_id_++;
         for (uint32_t i = 0; i < column_count_; ++i) {
-            tsblock_->vectors_[i]->update_offset();
+            if (!tsblock_->vectors_[i]->is_null(current_row_id)) {
+                tsblock_->vectors_[i]->update_offset();
+            }
         }
     }
 
-    FORCE_INLINE void next(size_t ind) const {
-        ASSERT(row_id_ < tsblock_->row_count_);
-        tsblock_->vectors_[ind]->update_offset();
-    }
-
     FORCE_INLINE void update_row_id() { row_id_++; }
 
     FORCE_INLINE char* read(uint32_t column_index, uint32_t* __restrict len,
@@ -271,6 +289,22 @@ class RowIterator {
         return vec->read(len, null, row_id_);
     }
 
+    // Cheap null check at the current row that avoids the value-read path.
+    FORCE_INLINE bool is_null_at(uint32_t column_index) {
+        ASSERT(column_index < column_count_);
+        return tsblock_->vectors_[column_index]->is_null(row_id_);
+    }
+
+    // Direct access to the underlying Vector for the column. Caller is
+    // responsible for type-correct interpretation of the buffer; intended
+    // for the fast typed-read path that wants to bypass Vector::read's
+    // virtual dispatch (read into the raw buffer at the vector's current
+    // offset_).
+    FORCE_INLINE Vector* get_vector(uint32_t column_index) {
+        ASSERT(column_index < column_count_);
+        return tsblock_->vectors_[column_index];
+    }
+
     std::string debug_string();  // for debug
 
    private:
@@ -311,6 +345,23 @@ class ColIterator {
 
     FORCE_INLINE uint32_t get_column_index() { return column_index_; }
 
+    FORCE_INLINE uint32_t remaining() const {
+        return tsblock_->row_count_ - row_id_;
+    }
+    FORCE_INLINE char* data_ptr() {
+        return vec_->get_value_data().get_data() + vec_->get_offset();
+    }
+    FORCE_INLINE void advance(uint32_t n, uint32_t elem_size) {
+        row_id_ += n;
+        vec_->advance_offset(n * elem_size);
+    }
+
+    FORCE_INLINE void advance_row_only(uint32_t n) { row_id_ += n; }
+
+    FORCE_INLINE uint32_t get_row_id() const { return row_id_; }
+
+    FORCE_INLINE Vector* get_vector() { return vec_; }
+
    private:
     uint32_t column_index_;
     uint32_t row_id_;
diff --git a/cpp/src/common/tsblock/vector/variable_length_vector.h b/cpp/src/common/tsblock/vector/variable_length_vector.h
index b98a9c739..84e541e5c 100644
--- a/cpp/src/common/tsblock/vector/variable_length_vector.h
+++ b/cpp/src/common/tsblock/vector/variable_length_vector.h
@@ -45,8 +45,15 @@ class VariableLengthVector : public Vector {
 
     // cppcheck-suppress missingOverride
     FORCE_INLINE void update_offset() OVERRIDE {
-        offset_ += variable_type_len_;
-        offset_ += last_value_len_;
+        // Self-contained advance: read the length prefix at the current
+        // offset from the buffer rather than relying on a side effect from
+        // a prior read(). This makes update_offset safe when callers skip
+        // reading variable-length columns for some rows (e.g. a row
+        // iterator that only consumes fixed-width columns).
+        uint32_t value_len = 0;
+        std::memcpy(&value_len, values_.get_data() + offset_,
+                    sizeof(value_len));
+        offset_ += variable_type_len_ + value_len;
     }
 
     // cppcheck-suppress missingOverride
diff --git a/cpp/src/common/tsblock/vector/vector.h b/cpp/src/common/tsblock/vector/vector.h
index 37a96c543..dde3e76cc 100644
--- a/cpp/src/common/tsblock/vector/vector.h
+++ b/cpp/src/common/tsblock/vector/vector.h
@@ -73,6 +73,9 @@ class Vector {
     FORCE_INLINE uint32_t get_row_num() { return row_num_; }
 
     FORCE_INLINE void add_row_num() { row_num_++; }
+    FORCE_INLINE void add_row_nums(uint32_t n) { row_num_ += n; }
+    FORCE_INLINE uint32_t get_offset() const { return offset_; }
+    FORCE_INLINE void advance_offset(uint32_t bytes) { offset_ += bytes; }
 
     FORCE_INLINE common::TsBlock* get_tsblock() { return tsblock_; }
 
diff --git a/cpp/src/common/tsfile_common.h b/cpp/src/common/tsfile_common.h
index b516b608f..fd3690200 100644
--- a/cpp/src/common/tsfile_common.h
+++ b/cpp/src/common/tsfile_common.h
@@ -314,6 +314,11 @@ class ITimeseriesIndex {
     virtual common::SimpleList<ChunkMeta*>* get_value_chunk_meta_list() const {
         return nullptr;
     }
+    virtual uint32_t get_value_column_count() const { return 1; }
+    virtual common::SimpleList<ChunkMeta*>* get_value_chunk_meta_list(
+        uint32_t col_index) const {
+        return col_index == 0 ? get_value_chunk_meta_list() : nullptr;
+    }
 
     virtual common::String get_measurement_name() const {
         return common::String();
@@ -457,7 +462,7 @@ class TimeseriesIndex : public ITimeseriesIndex {
                 (timeseries_meta_type_ & 0x3F);  // TODO
             chunk_meta_list_ =
                 new (chunk_meta_list_buf) common::SimpleList<ChunkMeta*>(pa);
-            uint32_t start_pos = in.read_pos();
+            uint64_t start_pos = in.read_pos();
             while (IS_SUCC(ret) &&
                    in.read_pos() < start_pos + chunk_meta_list_data_size_) {
                 void* cm_buf = pa->alloc(sizeof(ChunkMeta));
@@ -589,11 +594,17 @@ class AlignedTimeseriesIndex : public ITimeseriesIndex {
     virtual common::String get_measurement_name() const {
         return value_ts_idx_->get_measurement_name();
     }
+    // Return the VALUE column's data type — that's what consumers like
+    // TsFileReader::get_timeseries_schema and metadata APIs expect for an
+    // aligned measurement.  Returning time_ts_idx_->get_data_type() would
+    // surface the time chunk's on-wire VECTOR marker (or INT64 depending
+    // on how the marker is interpreted) for every aligned timeseries,
+    // breaking schema introspection.
     virtual common::TSDataType get_data_type() const {
         return value_ts_idx_ == nullptr ? common::INVALID_DATATYPE
                                         : value_ts_idx_->get_data_type();
     }
-    virtual bool is_aligned() const { return true; }
+    bool is_aligned() const override { return true; }
     virtual Statistic* get_statistic() const {
         return value_ts_idx_->get_statistic();
     }
@@ -608,6 +619,52 @@ class AlignedTimeseriesIndex : public ITimeseriesIndex {
 #endif
 };
 
+class MultiAlignedTimeseriesIndex : public ITimeseriesIndex {
+   public:
+    TimeseriesIndex* time_ts_idx_ = nullptr;
+    std::vector<TimeseriesIndex*> value_ts_idxs_;
+
+    MultiAlignedTimeseriesIndex() {}
+    ~MultiAlignedTimeseriesIndex() {}
+
+    common::SimpleList<ChunkMeta*>* get_time_chunk_meta_list() const override {
+        return time_ts_idx_ ? time_ts_idx_->get_chunk_meta_list() : nullptr;
+    }
+    common::SimpleList<ChunkMeta*>* get_value_chunk_meta_list() const override {
+        return value_ts_idxs_.empty()
+                   ? nullptr
+                   : value_ts_idxs_[0]->get_chunk_meta_list();
+    }
+    uint32_t get_value_column_count() const override {
+        return value_ts_idxs_.size();
+    }
+    common::SimpleList<ChunkMeta*>* get_value_chunk_meta_list(
+        uint32_t col_index) const override {
+        return col_index < value_ts_idxs_.size()
+                   ? value_ts_idxs_[col_index]->get_chunk_meta_list()
+                   : nullptr;
+    }
+    common::String get_measurement_name() const override {
+        return value_ts_idxs_.empty()
+                   ? common::String()
+                   : value_ts_idxs_[0]->get_measurement_name();
+    }
+    // Same fix as AlignedTimeseriesIndex: report the first value column's
+    // type rather than the time chunk's VECTOR marker.  Consumers walking
+    // a multi-aligned device for schema info expect the measurement type.
+    common::TSDataType get_data_type() const override {
+        return value_ts_idxs_.empty() || value_ts_idxs_[0] == nullptr
+                   ? common::INVALID_DATATYPE
+                   : value_ts_idxs_[0]->get_data_type();
+    }
+    bool is_aligned() const override { return true; }
+    Statistic* get_statistic() const override { return nullptr; }
+
+    const std::vector<TimeseriesIndex*>& get_value_indices() const {
+        return value_ts_idxs_;
+    }
+};
+
 class TSMIterator {
    public:
     explicit TSMIterator(
@@ -629,7 +686,6 @@ class TSMIterator {
     common::SimpleList<ChunkMeta*>::Iterator chunk_meta_iter_;
 
     // timeseries measurenemnt chunk meta info
-    // map <device_name, <measurement_name, vector<chunk_meta>>>
     std::map<std::shared_ptr<IDeviceID>,
              std::map<common::String, std::vector<ChunkMeta*>>,
              IDeviceIDComparator>
diff --git a/cpp/src/compress/lz4_compressor.cc b/cpp/src/compress/lz4_compressor.cc
index 88c64466f..0f19ce179 100644
--- a/cpp/src/compress/lz4_compressor.cc
+++ b/cpp/src/compress/lz4_compressor.cc
@@ -76,9 +76,13 @@ int LZ4Compressor::compress(char* uncompressed_buf,
 }
 
 void LZ4Compressor::after_compress(char* compressed_buf) {
+    // See SnappyCompressor::after_compress for the same reasoning: the member
+    // pointer can lag behind the caller-known buffer across page reuse.
     if (compressed_buf != nullptr) {
-        mem_free(compressed_buf_);
-        compressed_buf_ = nullptr;
+        mem_free(compressed_buf);
+        if (compressed_buf_ == compressed_buf) {
+            compressed_buf_ = nullptr;
+        }
     }
 }
 
@@ -132,9 +136,11 @@ int LZ4Compressor::uncompress(char* compressed_buf, uint32_t compressed_buf_len,
 
 void LZ4Compressor::after_uncompress(char* uncompressed_buf) {
     if (uncompressed_buf != nullptr) {
-        mem_free(uncompressed_buf_);
-        uncompressed_buf_ = nullptr;
+        mem_free(uncompressed_buf);
+        if (uncompressed_buf_ == uncompressed_buf) {
+            uncompressed_buf_ = nullptr;
+        }
     }
 }
 
-}  // end namespace storage
\ No newline at end of file
+}  // end namespace storage
diff --git a/cpp/src/compress/snappy_compressor.cc b/cpp/src/compress/snappy_compressor.cc
index 6a2735e7b..e78a67ac3 100644
--- a/cpp/src/compress/snappy_compressor.cc
+++ b/cpp/src/compress/snappy_compressor.cc
@@ -73,9 +73,16 @@ int SnappyCompressor::compress(char* uncompressed_buf,
 }
 
 void SnappyCompressor::after_compress(char* compressed_buf) {
+    // Free the buffer the caller is releasing, not whatever we last cached in
+    // compressed_buf_. The member is only kept so destroy() can clean up if
+    // after_compress is never called. When the same compressor is reused
+    // across pages, compressed_buf_ may point to a different (live) allocation
+    // or be null by the time the caller releases an earlier page's buffer.
     if (compressed_buf != nullptr) {
-        mem_free(compressed_buf_);
-        compressed_buf_ = nullptr;
+        mem_free(compressed_buf);
+        if (compressed_buf_ == compressed_buf) {
+            compressed_buf_ = nullptr;
+        }
     }
 }
 
@@ -109,9 +116,11 @@ int SnappyCompressor::uncompress(char* compressed_buf,
 
 void SnappyCompressor::after_uncompress(char* uncompressed_buf) {
     if (uncompressed_buf != nullptr) {
-        mem_free(uncompressed_buf_);
-        uncompressed_buf_ = nullptr;
+        mem_free(uncompressed_buf);
+        if (uncompressed_buf_ == uncompressed_buf) {
+            uncompressed_buf_ = nullptr;
+        }
     }
 }
 
-}  // end namespace storage
\ No newline at end of file
+}  // end namespace storage
diff --git a/cpp/src/compress/uncompressed_compressor.h b/cpp/src/compress/uncompressed_compressor.h
index c262837a8..c342b5001 100644
--- a/cpp/src/compress/uncompressed_compressor.h
+++ b/cpp/src/compress/uncompressed_compressor.h
@@ -20,19 +20,38 @@
 #ifndef COMPRESS_UNCOMPRESSED_COMPRESSOR_H
 #define COMPRESS_UNCOMPRESSED_COMPRESSOR_H
 
+#include <string.h>
+
+#include "common/allocator/alloc_base.h"
 #include "compressor.h"
+#include "utils/errno_define.h"
+#include "utils/util_define.h"
 
 namespace storage {
 
 class UncompressedCompressor : public Compressor {
    public:
-    UncompressedCompressor() {}
-    virtual ~UncompressedCompressor() {}
+    UncompressedCompressor() : uncompressed_buf_(nullptr) {}
+    virtual ~UncompressedCompressor() {
+        if (uncompressed_buf_ != nullptr) {
+            common::mem_free(uncompressed_buf_);
+            uncompressed_buf_ = nullptr;
+        }
+    }
     int reset(bool for_compress) {
         UNUSED(for_compress);
+        if (uncompressed_buf_ != nullptr) {
+            common::mem_free(uncompressed_buf_);
+            uncompressed_buf_ = nullptr;
+        }
         return common::E_OK;
     }
-    void destroy() {}
+    void destroy() {
+        if (uncompressed_buf_ != nullptr) {
+            common::mem_free(uncompressed_buf_);
+            uncompressed_buf_ = nullptr;
+        }
+    }
     int compress(char* uncompressed_buf, uint32_t uncompressed_buf_len,
                  char*& compressed_buf, uint32_t& compressed_buf_len) {
         compressed_buf = uncompressed_buf;
@@ -43,11 +62,33 @@ class UncompressedCompressor : public Compressor {
 
     int uncompress(char* compressed_buf, uint32_t compressed_buf_len,
                    char*& uncompressed_buf, uint32_t& uncompressed_buf_len) {
-        uncompressed_buf = compressed_buf;
+        char* buf = static_cast<char*>(
+            common::mem_alloc(compressed_buf_len, common::MOD_COMPRESSOR_OBJ));
+        if (buf == nullptr) {
+            return common::E_OOM;
+        }
+        memcpy(buf, compressed_buf, compressed_buf_len);
+        uncompressed_buf = buf;
+        uncompressed_buf_ = buf;
         uncompressed_buf_len = compressed_buf_len;
         return common::E_OK;
     }
-    void after_uncompress(char* uncompressed_buf) { UNUSED(uncompressed_buf); }
+    void after_uncompress(char* uncompressed_buf) {
+        // Free the buffer the caller is releasing, not the most-recently
+        // allocated one cached in uncompressed_buf_.  Two successive
+        // uncompress() calls would overwrite uncompressed_buf_ with the
+        // second allocation; after_uncompress(first) used to free that
+        // second buffer (use-after-free for the still-live one) and leak
+        // the first.
+        if (uncompressed_buf == nullptr) return;
+        common::mem_free(uncompressed_buf);
+        if (uncompressed_buf_ == uncompressed_buf) {
+            uncompressed_buf_ = nullptr;
+        }
+    }
+
+   private:
+    char* uncompressed_buf_;
 };
 
 }  // end namespace storage
diff --git a/cpp/src/cwrapper/arrow_c.cc b/cpp/src/cwrapper/arrow_c.cc
index 931c17de7..3f02a7692 100644
--- a/cpp/src/cwrapper/arrow_c.cc
+++ b/cpp/src/cwrapper/arrow_c.cc
@@ -843,7 +843,12 @@ int ArrowStructToTablet(const char* table_name, const ArrowArray* in_array,
         const ArrowArray* ts_arr = in_array->children[time_col_index];
         const int64_t* ts_buf =
             static_cast<const int64_t*>(ts_arr->buffers[1]) + ts_arr->offset;
-        tablet->set_timestamps(ts_buf, static_cast<uint32_t>(n_rows));
+        int sret =
+            tablet->set_timestamps(ts_buf, static_cast<uint32_t>(n_rows));
+        if (sret != common::E_OK) {
+            delete tablet;
+            return sret;
+        }
     }
 
     // Fill data columns from Arrow children (use read_modes to decode buffers)
@@ -892,11 +897,15 @@ int ArrowStructToTablet(const char* table_name, const ArrowArray* in_array,
                     delete tablet;
                     return common::E_OOM;
                 }
-                tablet->set_column_values(tcol, data, null_bm,
-                                          static_cast<uint32_t>(n_rows));
+                int sret = tablet->set_column_values(
+                    tcol, data, null_bm, static_cast<uint32_t>(n_rows));
                 if (null_bm != nullptr) {
                     common::mem_free(null_bm);
                 }
+                if (sret != common::E_OK) {
+                    delete tablet;
+                    return sret;
+                }
                 break;
             }
             case common::DATE: {
@@ -948,14 +957,18 @@ int ArrowStructToTablet(const char* table_name, const ArrowArray* in_array,
                     delete tablet;
                     return common::E_OOM;
                 }
-                tablet->set_column_string_values(tcol, offsets, data, null_bm,
-                                                 nrows);
+                int sret = tablet->set_column_string_values(tcol, offsets, data,
+                                                            null_bm, nrows);
                 if (null_bm != nullptr) {
                     common::mem_free(null_bm);
                 }
                 if (norm_offsets != nullptr) {
                     common::mem_free(norm_offsets);
                 }
+                if (sret != common::E_OK) {
+                    delete tablet;
+                    return sret;
+                }
                 break;
             }
             default:
diff --git a/cpp/src/cwrapper/tsfile_cwrapper.cc b/cpp/src/cwrapper/tsfile_cwrapper.cc
index 0934981f9..0fc915974 100644
--- a/cpp/src/cwrapper/tsfile_cwrapper.cc
+++ b/cpp/src/cwrapper/tsfile_cwrapper.cc
@@ -21,7 +21,9 @@
 
 #include <file/write_file.h>
 #include <reader/qds_without_timegenerator.h>
+#include <sys/stat.h>
 #include <writer/tsfile_table_writer.h>
+
 #ifdef _WIN32
 #include <io.h>
 #else
@@ -92,8 +94,14 @@ WriteFile write_file_new(const char* pathname, ERRNO* err_code) {
     int ret;
     init_tsfile_config();
 
-    if (access(pathname, F_OK) == 0) {
-        *err_code = common::E_ALREADY_EXIST;
+    struct stat path_stat {};
+    if (stat(pathname, &path_stat) == 0) {
+#ifdef _WIN32
+        const bool is_dir = (path_stat.st_mode & _S_IFDIR) != 0;
+#else
+        const bool is_dir = S_ISDIR(path_stat.st_mode);
+#endif
+        *err_code = is_dir ? common::E_FILE_OPEN_ERR : common::E_ALREADY_EXIST;
         return nullptr;
     }
 
@@ -110,6 +118,17 @@ WriteFile write_file_new(const char* pathname, ERRNO* err_code) {
 
 TsFileWriter tsfile_writer_new(WriteFile file, TableSchema* schema,
                                ERRNO* err_code) {
+    // C API: every public entry must defend against null callers — a null
+    // schema or err_code would crash the host process the moment it's
+    // dereferenced.  The tag-filter helpers already follow this pattern.
+    if (err_code == nullptr) {
+        return nullptr;
+    }
+    if (file == nullptr || schema == nullptr ||
+        schema->column_schemas == nullptr || schema->table_name == nullptr) {
+        *err_code = common::E_INVALID_ARG;
+        return nullptr;
+    }
     if (schema->column_num == 0) {
         *err_code = common::E_INVALID_SCHEMA;
         return nullptr;
@@ -149,6 +168,15 @@ TsFileWriter tsfile_writer_new_with_memory_threshold(WriteFile file,
                                                      TableSchema* schema,
                                                      uint64_t memory_threshold,
                                                      ERRNO* err_code) {
+    // See tsfile_writer_new() above for the null-guard rationale.
+    if (err_code == nullptr) {
+        return nullptr;
+    }
+    if (file == nullptr || schema == nullptr ||
+        schema->column_schemas == nullptr || schema->table_name == nullptr) {
+        *err_code = common::E_INVALID_ARG;
+        return nullptr;
+    }
     if (schema->column_num == 0) {
         *err_code = common::E_INVALID_SCHEMA;
         return nullptr;
@@ -158,11 +186,21 @@ TsFileWriter tsfile_writer_new_with_memory_threshold(WriteFile file,
     std::set<std::string> column_names;
     for (int i = 0; i < schema->column_num; i++) {
         ColumnSchema cur_schema = schema->column_schemas[i];
-        if (column_names.find(cur_schema.column_name) == column_names.end()) {
+        // Reject only when the name has already been seen.  The previous
+        // condition was inverted, so the first column (always a fresh name)
+        // was rejected as a duplicate and this constructor was effectively
+        // unusable — tsfile_writer_new()'s loop above has the correct check
+        // for comparison.
+        if (column_names.find(cur_schema.column_name) != column_names.end()) {
             *err_code = common::E_INVALID_SCHEMA;
             return nullptr;
         }
         column_names.insert(cur_schema.column_name);
+        if (cur_schema.column_category == TAG &&
+            cur_schema.data_type != TS_DATATYPE_STRING) {
+            *err_code = common::E_INVALID_SCHEMA;
+            return nullptr;
+        }
         column_schemas.emplace_back(
             cur_schema.column_name,
             static_cast<common::TSDataType>(cur_schema.data_type),
@@ -1205,6 +1243,8 @@ ERRNO populate_c_metadata_map_from_cpp(
             if (m.measurement_name == nullptr) {
                 for (uint32_t u = 0; u < slot; u++) {
                     free_timeseries_statistic_heap(&e.timeseries[u].statistic);
+                    free_timeseries_statistic_heap(
+                        &e.timeseries[u].timeline_statistic);
                     free(e.timeseries[u].measurement_name);
                 }
                 free(e.timeseries);
@@ -1465,6 +1505,13 @@ Tablet _tablet_new_with_target_name(const char* device_id,
 }
 
 ERRNO _tsfile_writer_register_table(TsFileWriter writer, TableSchema* schema) {
+    if (writer == nullptr || schema == nullptr ||
+        schema->column_schemas == nullptr || schema->table_name == nullptr) {
+        return common::E_INVALID_ARG;
+    }
+    if (schema->column_num <= 0) {
+        return common::E_INVALID_SCHEMA;
+    }
     std::vector<storage::MeasurementSchema*> measurement_schemas;
     std::vector<common::ColumnCategory> column_categories;
     measurement_schemas.resize(schema->column_num);
@@ -1587,13 +1634,50 @@ ResultSet _tsfile_reader_query_device(TsFileReader reader,
     return qds;
 }
 
-// ---------- Tag Filter API ----------
+// ============== Tag Filter API Implementation ==============
+
+// Helper macro to avoid repetition in tag filter factory functions.
+// The shared_ptr must stay alive while TagFilterBuilder accesses the schema.
+// Every C-API entry must validate its pointers: a null reader would deref
+// during the static_cast, and null table/column/value would feed std::string
+// a null pointer (UB / crash).
+#define DEFINE_TAG_FILTER_FACTORY(name, method)                               \
+    TagFilterHandle tsfile_tag_filter_##name(                                 \
+        TsFileReader reader, const char* table_name, const char* column_name, \
+        const char* value) {                                                  \
+        if (reader == nullptr || table_name == nullptr ||                     \
+            column_name == nullptr || value == nullptr) {                     \
+            return nullptr;                                                   \
+        }                                                                     \
+        auto* r = static_cast<storage::TsFileReader*>(reader);                \
+        auto schema = r->get_table_schema(table_name);                        \
+        if (!schema) return nullptr;                                          \
+        storage::TagFilterBuilder builder(schema.get());                      \
+        return builder.method(column_name, value);                            \
+    }
+
+DEFINE_TAG_FILTER_FACTORY(eq, eq)
+DEFINE_TAG_FILTER_FACTORY(neq, neq)
+DEFINE_TAG_FILTER_FACTORY(lt, lt)
+DEFINE_TAG_FILTER_FACTORY(lteq, lteq)
+DEFINE_TAG_FILTER_FACTORY(gt, gt)
+DEFINE_TAG_FILTER_FACTORY(gteq, gteq)
+
+#undef DEFINE_TAG_FILTER_FACTORY
 
 TagFilterHandle tsfile_tag_filter_create(TsFileReader reader,
                                          const char* table_name,
                                          const char* column_name,
                                          const char* value, TagFilterOp op,
                                          ERRNO* err_code) {
+    if (err_code == nullptr) {
+        return nullptr;
+    }
+    if (reader == nullptr || table_name == nullptr || column_name == nullptr ||
+        value == nullptr) {
+        *err_code = common::E_INVALID_ARG;
+        return nullptr;
+    }
     auto* r = static_cast<storage::TsFileReader*>(reader);
     auto schema = r->get_table_schema(table_name);
     if (!schema) {
@@ -1656,25 +1740,30 @@ TagFilterHandle tsfile_tag_filter_between(TsFileReader reader,
 
 TagFilterHandle tsfile_tag_filter_and(TagFilterHandle left,
                                       TagFilterHandle right) {
-    return static_cast<void*>(storage::TagFilterBuilder::and_filter(
+    if (!left || !right) return nullptr;
+    return storage::TagFilterBuilder::and_filter(
         static_cast<storage::Filter*>(left),
-        static_cast<storage::Filter*>(right)));
+        static_cast<storage::Filter*>(right));
 }
 
 TagFilterHandle tsfile_tag_filter_or(TagFilterHandle left,
                                      TagFilterHandle right) {
-    return static_cast<void*>(storage::TagFilterBuilder::or_filter(
+    if (!left || !right) return nullptr;
+    return storage::TagFilterBuilder::or_filter(
         static_cast<storage::Filter*>(left),
-        static_cast<storage::Filter*>(right)));
+        static_cast<storage::Filter*>(right));
 }
 
 TagFilterHandle tsfile_tag_filter_not(TagFilterHandle filter) {
-    return static_cast<void*>(storage::TagFilterBuilder::not_filter(
-        static_cast<storage::Filter*>(filter)));
+    if (!filter) return nullptr;
+    return storage::TagFilterBuilder::not_filter(
+        static_cast<storage::Filter*>(filter));
 }
 
 void tsfile_tag_filter_free(TagFilterHandle filter) {
-    delete static_cast<storage::Filter*>(filter);
+    if (filter) {
+        delete static_cast<storage::Filter*>(filter);
+    }
 }
 
 ResultSet tsfile_query_table_with_tag_filter(
diff --git a/cpp/src/cwrapper/tsfile_cwrapper.h b/cpp/src/cwrapper/tsfile_cwrapper.h
index ae3e28eed..3b3b13c36 100644
--- a/cpp/src/cwrapper/tsfile_cwrapper.h
+++ b/cpp/src/cwrapper/tsfile_cwrapper.h
@@ -905,32 +905,68 @@ TagFilterHandle tsfile_tag_filter_between(TsFileReader reader,
                                           bool is_not, ERRNO* err_code);
 
 /**
- * @brief Combine two tag filters with AND.
+ * @brief Create a tag equality filter: column == value.
+ *
+ * @param reader [in] Valid TsFileReader handle (used to resolve column index).
+ * @param table_name [in] Target table name.
+ * @param column_name [in] Tag column name.
+ * @param value [in] Value to compare against.
+ * @return TagFilterHandle on success, NULL on failure.
+ */
+TagFilterHandle tsfile_tag_filter_eq(TsFileReader reader,
+                                     const char* table_name,
+                                     const char* column_name,
+                                     const char* value);
+
+TagFilterHandle tsfile_tag_filter_neq(TsFileReader reader,
+                                      const char* table_name,
+                                      const char* column_name,
+                                      const char* value);
+
+TagFilterHandle tsfile_tag_filter_lt(TsFileReader reader,
+                                     const char* table_name,
+                                     const char* column_name,
+                                     const char* value);
+
+TagFilterHandle tsfile_tag_filter_lteq(TsFileReader reader,
+                                       const char* table_name,
+                                       const char* column_name,
+                                       const char* value);
+
+TagFilterHandle tsfile_tag_filter_gt(TsFileReader reader,
+                                     const char* table_name,
+                                     const char* column_name,
+                                     const char* value);
+
+TagFilterHandle tsfile_tag_filter_gteq(TsFileReader reader,
+                                       const char* table_name,
+                                       const char* column_name,
+                                       const char* value);
+
+/**
+ * @brief Logical AND of two tag filters. Takes ownership of left and right.
  */
 TagFilterHandle tsfile_tag_filter_and(TagFilterHandle left,
                                       TagFilterHandle right);
 
 /**
- * @brief Combine two tag filters with OR.
+ * @brief Logical OR of two tag filters. Takes ownership of left and right.
  */
 TagFilterHandle tsfile_tag_filter_or(TagFilterHandle left,
                                      TagFilterHandle right);
 
 /**
- * @brief Negate a tag filter.
+ * @brief Logical NOT of a tag filter. Takes ownership of filter.
  */
 TagFilterHandle tsfile_tag_filter_not(TagFilterHandle filter);
 
 /**
- * @brief Free a tag filter and all its children.
+ * @brief Free a tag filter handle.
  */
 void tsfile_tag_filter_free(TagFilterHandle filter);
 
 /**
- * @brief Query table with tag filter.
- *
- * @param batch_size <= 0 means row-by-row return mode,
- *                   > 0 means return TsBlock with the specified block size.
+ * @brief Batch query with tag filter support.
  */
 ResultSet tsfile_query_table_with_tag_filter(
     TsFileReader reader, const char* table_name, char** columns,
diff --git a/cpp/src/encoding/decoder.h b/cpp/src/encoding/decoder.h
index c290b5791..24455ca01 100644
--- a/cpp/src/encoding/decoder.h
+++ b/cpp/src/encoding/decoder.h
@@ -21,6 +21,7 @@
 #define ENCODING_DECODER_H
 
 #include "common/allocator/byte_stream.h"
+#include "common/db_common.h"
 
 namespace storage {
 
@@ -37,6 +38,140 @@ class Decoder {
     virtual int read_double(double& ret_value, common::ByteStream& in) = 0;
     virtual int read_String(common::String& ret_value, common::PageArena& pa,
                             common::ByteStream& in) = 0;
+
+    virtual int read_batch_int32(int32_t* out, int capacity, int& actual,
+                                 common::ByteStream& in) {
+        actual = 0;
+        int ret = common::E_OK;
+        int32_t val;
+        while (actual < capacity && has_remaining(in)) {
+            ret = read_int32(val, in);
+            if (ret != common::E_OK) {
+                return ret;
+            }
+            out[actual++] = val;
+        }
+        return common::E_OK;
+    }
+
+    virtual int read_batch_int64(int64_t* out, int capacity, int& actual,
+                                 common::ByteStream& in) {
+        actual = 0;
+        int ret = common::E_OK;
+        int64_t val;
+        while (actual < capacity && has_remaining(in)) {
+            ret = read_int64(val, in);
+            if (ret != common::E_OK) {
+                return ret;
+            }
+            out[actual++] = val;
+        }
+        return common::E_OK;
+    }
+
+    virtual int read_batch_float(float* out, int capacity, int& actual,
+                                 common::ByteStream& in) {
+        actual = 0;
+        int ret = common::E_OK;
+        float val;
+        while (actual < capacity && has_remaining(in)) {
+            ret = read_float(val, in);
+            if (ret != common::E_OK) {
+                return ret;
+            }
+            out[actual++] = val;
+        }
+        return common::E_OK;
+    }
+
+    virtual int read_batch_double(double* out, int capacity, int& actual,
+                                  common::ByteStream& in) {
+        actual = 0;
+        int ret = common::E_OK;
+        double val;
+        while (actual < capacity && has_remaining(in)) {
+            ret = read_double(val, in);
+            if (ret != common::E_OK) {
+                return ret;
+            }
+            out[actual++] = val;
+        }
+        return common::E_OK;
+    }
+
+    virtual int skip_int32(int count, int& skipped, common::ByteStream& in) {
+        skipped = 0;
+        int ret = common::E_OK;
+        int32_t dummy;
+        while (skipped < count && has_remaining(in)) {
+            ret = read_int32(dummy, in);
+            if (ret != common::E_OK) {
+                return ret;
+            }
+            ++skipped;
+        }
+        return common::E_OK;
+    }
+
+    virtual int skip_int64(int count, int& skipped, common::ByteStream& in) {
+        skipped = 0;
+        int ret = common::E_OK;
+        int64_t dummy;
+        while (skipped < count && has_remaining(in)) {
+            ret = read_int64(dummy, in);
+            if (ret != common::E_OK) {
+                return ret;
+            }
+            ++skipped;
+        }
+        return common::E_OK;
+    }
+
+    virtual int skip_float(int count, int& skipped, common::ByteStream& in) {
+        skipped = 0;
+        int ret = common::E_OK;
+        float dummy;
+        while (skipped < count && has_remaining(in)) {
+            ret = read_float(dummy, in);
+            if (ret != common::E_OK) {
+                return ret;
+            }
+            ++skipped;
+        }
+        return common::E_OK;
+    }
+
+    virtual int skip_double(int count, int& skipped, common::ByteStream& in) {
+        skipped = 0;
+        int ret = common::E_OK;
+        double dummy;
+        while (skipped < count && has_remaining(in)) {
+            ret = read_double(dummy, in);
+            if (ret != common::E_OK) {
+                return ret;
+            }
+            ++skipped;
+        }
+        return common::E_OK;
+    }
+
+    // Block-level filter check: peek the next block header and compute
+    // the value range [block_min, block_max] without decoding.
+    // Returns true if a block was peeked; false if not supported or no data.
+    // After peeking, caller must either:
+    //   - Call skip_peeked_block_int64() to skip the block
+    //   - Call read_batch_int64() which will use the peeked header
+    virtual bool peek_next_block_range_int64(common::ByteStream& in,
+                                             int64_t& block_min,
+                                             int64_t& block_max,
+                                             int& block_count) {
+        return false;
+    }
+
+    // Skip the block whose header was already consumed by peek.
+    virtual int skip_peeked_block_int64(common::ByteStream& in, int& skipped) {
+        return common::E_NOT_SUPPORT;
+    }
 };
 
 }  // end namespace storage
diff --git a/cpp/src/encoding/dictionary_encoder.h b/cpp/src/encoding/dictionary_encoder.h
index be5f78a09..8f7c495c4 100644
--- a/cpp/src/encoding/dictionary_encoder.h
+++ b/cpp/src/encoding/dictionary_encoder.h
@@ -83,7 +83,12 @@ class DictionaryEncoder : public Encoder {
         if (entry_index_.count(value) == 0) {
             index_entry_.push_back(value);
             map_size_ = map_size_ + value.length();
-            entry_index_[value] = static_cast<int>(index_entry_.size()) - 1;
+            // Compute the index before the insert: LHS/RHS evaluation order of
+            // `m[k] = m.size()` is unspecified before C++17, so a compiler
+            // that evaluates the LHS first would store size()+1 and corrupt
+            // the dictionary.
+            const int new_idx = static_cast<int>(index_entry_.size()) - 1;
+            entry_index_[value] = new_idx;
         }
         values_encoder_.encode(entry_index_[value], out);
         return common::E_OK;
diff --git a/cpp/src/encoding/encoder.h b/cpp/src/encoding/encoder.h
index 921686446..386129f6e 100644
--- a/cpp/src/encoding/encoder.h
+++ b/cpp/src/encoding/encoder.h
@@ -48,6 +48,81 @@ class Encoder {
      * @return the maximal size of possible memory occupied by current encoder
      */
     virtual int get_max_byte_size() = 0;
+
+    /*
+     * Batch encoding interfaces.
+     * Default implementations fall back to per-value encode().
+     * Subclasses may override for better performance.
+     */
+    virtual int encode_batch(const bool* values, uint32_t count,
+                             common::ByteStream& out_stream) {
+        int ret = common::E_OK;
+        for (uint32_t i = 0; i < count; i++) {
+            if (RET_FAIL(encode(values[i], out_stream))) {
+                return ret;
+            }
+        }
+        return ret;
+    }
+    virtual int encode_batch(const int32_t* values, uint32_t count,
+                             common::ByteStream& out_stream) {
+        int ret = common::E_OK;
+        for (uint32_t i = 0; i < count; i++) {
+            if (RET_FAIL(encode(values[i], out_stream))) {
+                return ret;
+            }
+        }
+        return ret;
+    }
+    virtual int encode_batch(const int64_t* values, uint32_t count,
+                             common::ByteStream& out_stream) {
+        int ret = common::E_OK;
+        for (uint32_t i = 0; i < count; i++) {
+            if (RET_FAIL(encode(values[i], out_stream))) {
+                return ret;
+            }
+        }
+        return ret;
+    }
+    virtual int encode_batch(const float* values, uint32_t count,
+                             common::ByteStream& out_stream) {
+        int ret = common::E_OK;
+        for (uint32_t i = 0; i < count; i++) {
+            if (RET_FAIL(encode(values[i], out_stream))) {
+                return ret;
+            }
+        }
+        return ret;
+    }
+    virtual int encode_batch(const double* values, uint32_t count,
+                             common::ByteStream& out_stream) {
+        int ret = common::E_OK;
+        for (uint32_t i = 0; i < count; i++) {
+            if (RET_FAIL(encode(values[i], out_stream))) {
+                return ret;
+            }
+        }
+        return ret;
+    }
+
+    // Batch encode strings from a contiguous buffer with offset array
+    // (Arrow-style layout from Tablet::StringColumn).
+    // string[i] = buffer + offsets[start_idx + i], length = offsets[start_idx +
+    // i + 1] - offsets[start_idx + i].
+    virtual int encode_string_batch(const char* buffer, const uint32_t* offsets,
+                                    uint32_t start_idx, uint32_t count,
+                                    common::ByteStream& out_stream) {
+        int ret = common::E_OK;
+        for (uint32_t i = 0; i < count; i++) {
+            uint32_t idx = start_idx + i;
+            uint32_t len = offsets[idx + 1] - offsets[idx];
+            common::String val(buffer + offsets[idx], len);
+            if (RET_FAIL(encode(val, out_stream))) {
+                return ret;
+            }
+        }
+        return ret;
+    }
 };
 
 }  // end namespace storage
diff --git a/cpp/src/encoding/gorilla_decoder.h b/cpp/src/encoding/gorilla_decoder.h
index 5684561aa..e1e490105 100644
--- a/cpp/src/encoding/gorilla_decoder.h
+++ b/cpp/src/encoding/gorilla_decoder.h
@@ -30,6 +30,163 @@
 
 namespace storage {
 
+// ── Raw-pointer bit reader ────────────────────────────────────────────────
+// Operates directly on a contiguous byte array, bypassing ByteStream's
+// per-byte read_buf() overhead (atomic loads, page boundary checks, memcpy).
+
+struct GorillaBitReader {
+    const uint8_t* data;
+    uint32_t pos;       // next byte index to load
+    uint32_t data_len;  // total bytes
+    int bits;           // remaining bits in cur_byte (0..8)
+    uint8_t cur_byte;
+    // Set once a load was attempted on an empty input, or once read_bit /
+    // read_long ran out of bits mid-value.  Without this, a truncated page
+    // would spin read_long() forever (bits stays 0, n -= 0 makes no
+    // progress) and read_bit() would execute a negative shift via
+    // (cur_byte >> (bits - 1)).
+    bool exhausted = false;
+
+    FORCE_INLINE void load_byte_if_empty() {
+        if (bits == 0) {
+            if (pos < data_len) {
+                cur_byte = data[pos++];
+                bits = 8;
+            } else {
+                exhausted = true;
+            }
+        }
+    }
+
+    FORCE_INLINE bool read_bit() {
+        if (UNLIKELY(bits == 0)) {
+            exhausted = true;
+            return false;
+        }
+        bool bit = ((cur_byte >> (bits - 1)) & 1) == 1;
+        bits--;
+        load_byte_if_empty();
+        return bit;
+    }
+
+    FORCE_INLINE int64_t read_long(int n) {
+        int64_t value = 0;
+        while (n > 0) {
+            if (UNLIKELY(bits == 0)) {
+                // Input drained mid-value; bail so the outer loop in
+                // read_control_bits / batch_decode_raw doesn't spin.
+                exhausted = true;
+                return value;
+            }
+            if (n > bits || n == 8) {
+                value = (value << bits) + (cur_byte & ((1 << bits) - 1));
+                n -= bits;
+                bits = 0;
+            } else {
+                value =
+                    (value << n) + ((cur_byte >> (bits - n)) & ((1 << n) - 1));
+                bits -= n;
+                n = 0;
+            }
+            load_byte_if_empty();
+        }
+        return value;
+    }
+
+    FORCE_INLINE uint8_t read_control_bits(int max_bits) {
+        uint8_t value = 0x00;
+        for (int i = 0; i < max_bits; i++) {
+            value <<= 1;
+            if (exhausted) break;
+            if (read_bit()) {
+                value |= 0x01;
+            } else {
+                break;
+            }
+        }
+        return value;
+    }
+};
+
+// ── Templated raw-pointer decode helpers ──────────────────────────────────
+
+template <typename T>
+struct GorillaRawOps {
+    static FORCE_INLINE T read_next(GorillaBitReader& r, T& stored_value,
+                                    int& stored_leading_zeros,
+                                    int& stored_trailing_zeros);
+};
+
+template <>
+struct GorillaRawOps<int32_t> {
+    static constexpr int VALUE_BITS = VALUE_BITS_LENGTH_32BIT;
+
+    static FORCE_INLINE int32_t read_next(GorillaBitReader& r,
+                                          int32_t& stored_value,
+                                          int& stored_leading_zeros,
+                                          int& stored_trailing_zeros) {
+        uint8_t ctrl = r.read_control_bits(2);
+        switch (ctrl) {
+            case 3: {
+                stored_leading_zeros =
+                    (int)r.read_long(LEADING_ZERO_BITS_LENGTH_32BIT);
+                uint8_t sig =
+                    (uint8_t)r.read_long(MEANINGFUL_XOR_BITS_LENGTH_32BIT);
+                sig++;
+                stored_trailing_zeros = VALUE_BITS - sig - stored_leading_zeros;
+            }
+            // fallthrough
+            case 2: {
+                int32_t xor_value = (int32_t)r.read_long(
+                    VALUE_BITS - stored_leading_zeros - stored_trailing_zeros);
+                xor_value = static_cast<uint32_t>(xor_value)
+                            << stored_trailing_zeros;
+                stored_value ^= xor_value;
+            }
+            // fallthrough
+            default:
+                return stored_value;
+        }
+        return stored_value;
+    }
+};
+
+template <>
+struct GorillaRawOps<int64_t> {
+    static constexpr int VALUE_BITS = VALUE_BITS_LENGTH_64BIT;
+
+    static FORCE_INLINE int64_t read_next(GorillaBitReader& r,
+                                          int64_t& stored_value,
+                                          int& stored_leading_zeros,
+                                          int& stored_trailing_zeros) {
+        uint8_t ctrl = r.read_control_bits(2);
+        switch (ctrl) {
+            case 3: {
+                stored_leading_zeros =
+                    (int)r.read_long(LEADING_ZERO_BITS_LENGTH_64BIT);
+                uint8_t sig =
+                    (uint8_t)r.read_long(MEANINGFUL_XOR_BITS_LENGTH_64BIT);
+                sig++;
+                stored_trailing_zeros = VALUE_BITS - sig - stored_leading_zeros;
+            }
+            // fallthrough
+            case 2: {
+                int64_t xor_value = r.read_long(
+                    VALUE_BITS - stored_leading_zeros - stored_trailing_zeros);
+                xor_value = static_cast<uint64_t>(xor_value)
+                            << stored_trailing_zeros;
+                stored_value ^= xor_value;
+            }
+            // fallthrough
+            default:
+                return stored_value;
+        }
+        return stored_value;
+    }
+};
+
+// ──────────────────────────────────────────────────────────────────────────
+
 template <typename T>
 class GorillaDecoder : public Decoder {
    public:
@@ -127,6 +284,197 @@ class GorillaDecoder : public Decoder {
     int read_String(common::String& ret_value, common::PageArena& pa,
                     common::ByteStream& in) override;
 
+    // Batch overrides — declared here, defined after template specializations
+    int read_batch_int32(int32_t* out, int capacity, int& actual,
+                         common::ByteStream& in) override;
+    int read_batch_int64(int64_t* out, int capacity, int& actual,
+                         common::ByteStream& in) override;
+    int skip_int32(int count, int& skipped, common::ByteStream& in) override;
+    int skip_int64(int count, int& skipped, common::ByteStream& in) override;
+
+   protected:
+    // ── Batch decode using raw pointer (bypasses ByteStream) ─────────────
+    // The decode() contract:
+    //   stored_value_ holds the "next" value to be returned.
+    //   decode() returns stored_value_, then advances via cache_next().
+    //   has_next_==false means the ending sentinel was hit.
+    //
+    // batch_decode_raw replicates this logic using GorillaBitReader on the
+    // wrapped contiguous buffer, then syncs state back to ByteStream.
+    int batch_decode_raw(T* out, int capacity, int& actual, T ending,
+                         common::ByteStream& in) {
+        int ret = common::E_OK;
+        actual = 0;
+        // Bootstrap below would unconditionally write out[0]; guard the
+        // zero-capacity edge case so callers can probe without writing.
+        if (capacity <= 0) {
+            return common::E_OK;
+        }
+        if (!in.is_wrapped()) {
+            return batch_decode_fallback(out, capacity, actual, ending, in);
+        }
+
+        const uint8_t* base =
+            (const uint8_t*)in.get_wrapped_buf() + in.read_pos();
+        // Gorilla pages are bounded by the page-writer cap (well below 4 GiB),
+        // so saturating to uint32_t is safe and matches GorillaBitReader's
+        // 32-bit cursor.
+        uint32_t remain = static_cast<uint32_t>(
+            std::min<uint64_t>(in.remaining_size(), UINT32_MAX));
+
+        GorillaBitReader r;
+        r.data = base;
+        r.pos = 0;
+        r.data_len = remain;
+        r.bits = bits_left_;
+        r.cur_byte = buffer_;
+
+        // Bootstrap first value if needed (mirrors decode()'s first-call path)
+        if (UNLIKELY(!first_value_was_read_)) {
+            if (r.bits == 0 && r.pos >= r.data_len) goto done;
+            r.load_byte_if_empty();
+            stored_value_ = (T)r.read_long(GorillaRawOps<T>::VALUE_BITS);
+            if (UNLIKELY(r.exhausted)) {
+                // Page truncated before the first value finished; refuse to
+                // emit a partially-decoded sentinel.
+                first_value_was_read_ = false;
+                ret = common::E_BUF_NOT_ENOUGH;
+                goto done;
+            }
+            first_value_was_read_ = true;
+            // Save the first value before cache_next mutates stored_value_
+            T first_value = stored_value_;
+            // cache_next: read_next then check ending
+            GorillaRawOps<T>::read_next(r, stored_value_, stored_leading_zeros_,
+                                        stored_trailing_zeros_);
+            if (UNLIKELY(r.exhausted)) {
+                ret = common::E_BUF_NOT_ENOUGH;
+                goto done;
+            }
+            if (stored_value_ == ending) {
+                has_next_ = false;
+            } else {
+                has_next_ = true;
+            }
+            // Output the first value
+            out[actual++] = first_value;
+            if (!has_next_ || actual >= capacity) goto done;
+        }
+
+        // Main batch loop
+        while (actual < capacity && has_next_) {
+            out[actual++] = stored_value_;
+            GorillaRawOps<T>::read_next(r, stored_value_, stored_leading_zeros_,
+                                        stored_trailing_zeros_);
+            if (UNLIKELY(r.exhausted)) {
+                ret = common::E_BUF_NOT_ENOUGH;
+                goto done;
+            }
+            if (stored_value_ == ending) {
+                has_next_ = false;
+            }
+        }
+
+    done:
+        // Sync bit-reader state back
+        buffer_ = r.cur_byte;
+        bits_left_ = r.bits;
+        in.wrapped_buf_advance_read_pos(r.pos);
+        return ret;
+    }
+
+    int batch_skip_raw(int count, int& skipped, T ending,
+                       common::ByteStream& in) {
+        int ret = common::E_OK;
+        skipped = 0;
+        // Bootstrap below would consume first_value_ even when count == 0,
+        // advancing the stream past data the caller didn't ask to skip.
+        if (count <= 0) {
+            return common::E_OK;
+        }
+        if (!in.is_wrapped()) {
+            return batch_skip_fallback(count, skipped, ending, in);
+        }
+
+        const uint8_t* base =
+            (const uint8_t*)in.get_wrapped_buf() + in.read_pos();
+        // Same saturation as batch_decode_raw: GorillaBitReader is 32-bit
+        // internally; pages are well under 4 GiB.
+        uint32_t remain = static_cast<uint32_t>(
+            std::min<uint64_t>(in.remaining_size(), UINT32_MAX));
+
+        GorillaBitReader r;
+        r.data = base;
+        r.pos = 0;
+        r.data_len = remain;
+        r.bits = bits_left_;
+        r.cur_byte = buffer_;
+
+        if (UNLIKELY(!first_value_was_read_)) {
+            if (r.bits == 0 && r.pos >= r.data_len) goto done;
+            r.load_byte_if_empty();
+            stored_value_ = (T)r.read_long(GorillaRawOps<T>::VALUE_BITS);
+            if (UNLIKELY(r.exhausted)) {
+                first_value_was_read_ = false;
+                ret = common::E_BUF_NOT_ENOUGH;
+                goto done;
+            }
+            first_value_was_read_ = true;
+            GorillaRawOps<T>::read_next(r, stored_value_, stored_leading_zeros_,
+                                        stored_trailing_zeros_);
+            if (UNLIKELY(r.exhausted)) {
+                ret = common::E_BUF_NOT_ENOUGH;
+                goto done;
+            }
+            if (stored_value_ == ending) {
+                has_next_ = false;
+            } else {
+                has_next_ = true;
+            }
+            // The first value counts as one skip
+            skipped++;
+            if (!has_next_ || skipped >= count) goto done;
+        }
+
+        while (skipped < count && has_next_) {
+            skipped++;
+            GorillaRawOps<T>::read_next(r, stored_value_, stored_leading_zeros_,
+                                        stored_trailing_zeros_);
+            if (UNLIKELY(r.exhausted)) {
+                ret = common::E_BUF_NOT_ENOUGH;
+                goto done;
+            }
+            if (stored_value_ == ending) {
+                has_next_ = false;
+            }
+        }
+
+    done:
+        buffer_ = r.cur_byte;
+        bits_left_ = r.bits;
+        in.wrapped_buf_advance_read_pos(r.pos);
+        return ret;
+    }
+
+    int batch_decode_fallback(T* out, int capacity, int& actual, T ending,
+                              common::ByteStream& in) {
+        actual = 0;
+        while (actual < capacity && has_remaining(in)) {
+            out[actual++] = decode(in);
+        }
+        return common::E_OK;
+    }
+
+    int batch_skip_fallback(int count, int& skipped, T ending,
+                            common::ByteStream& in) {
+        skipped = 0;
+        while (skipped < count && has_remaining(in)) {
+            decode(in);
+            skipped++;
+        }
+        return common::E_OK;
+    }
+
    public:
     common::TSEncoding type_;
     T stored_value_;
@@ -254,18 +602,18 @@ FORCE_INLINE int64_t GorillaDecoder<int64_t>::decode(common::ByteStream& in) {
 
 class FloatGorillaDecoder : public GorillaDecoder<int32_t> {
    public:
-    int read_boolean(bool& ret_value, common::ByteStream& in);
-    int read_int32(int32_t& ret_value, common::ByteStream& in);
-    int read_int64(int64_t& ret_value, common::ByteStream& in);
-    int read_float(float& ret_value, common::ByteStream& in);
-    int read_double(double& ret_value, common::ByteStream& in);
+    int read_boolean(bool& ret_value, common::ByteStream& in) override;
+    int read_int32(int32_t& ret_value, common::ByteStream& in) override;
+    int read_int64(int64_t& ret_value, common::ByteStream& in) override;
+    int read_float(float& ret_value, common::ByteStream& in) override;
+    int read_double(double& ret_value, common::ByteStream& in) override;
 
     float decode(common::ByteStream& in) {
         int32_t value_int = GorillaDecoder<int32_t>::decode(in);
         return common::int_to_float(value_int);
     }
 
-    int32_t cache_next(common::ByteStream& in) {
+    int32_t cache_next(common::ByteStream& in) override {
         read_next(in);
         if (stored_value_ ==
             common::float_to_int(GORILLA_ENCODING_ENDING_FLOAT)) {
@@ -273,22 +621,46 @@ class FloatGorillaDecoder : public GorillaDecoder<int32_t> {
         }
         return stored_value_;
     }
+
+    int read_batch_float(float* out, int capacity, int& actual,
+                         common::ByteStream& in) override {
+        int32_t ending = common::float_to_int(GORILLA_ENCODING_ENDING_FLOAT);
+        actual = 0;
+        while (actual < capacity && has_remaining(in)) {
+            int32_t buf[129];
+            int batch = std::min(129, capacity - actual);
+            int buf_actual = 0;
+            int ret = batch_decode_raw(buf, batch, buf_actual, ending, in);
+            if (ret != common::E_OK) return ret;
+            if (buf_actual == 0) break;
+            for (int i = 0; i < buf_actual; i++) {
+                out[actual + i] = common::int_to_float(buf[i]);
+            }
+            actual += buf_actual;
+        }
+        return common::E_OK;
+    }
+
+    int skip_float(int count, int& skipped, common::ByteStream& in) override {
+        int32_t ending = common::float_to_int(GORILLA_ENCODING_ENDING_FLOAT);
+        return batch_skip_raw(count, skipped, ending, in);
+    }
 };
 
 class DoubleGorillaDecoder : public GorillaDecoder<int64_t> {
    public:
-    int read_boolean(bool& ret_value, common::ByteStream& in);
-    int read_int32(int32_t& ret_value, common::ByteStream& in);
-    int read_int64(int64_t& ret_value, common::ByteStream& in);
-    int read_float(float& ret_value, common::ByteStream& in);
-    int read_double(double& ret_value, common::ByteStream& in);
+    int read_boolean(bool& ret_value, common::ByteStream& in) override;
+    int read_int32(int32_t& ret_value, common::ByteStream& in) override;
+    int read_int64(int64_t& ret_value, common::ByteStream& in) override;
+    int read_float(float& ret_value, common::ByteStream& in) override;
+    int read_double(double& ret_value, common::ByteStream& in) override;
 
     double decode(common::ByteStream& in) {
         int64_t value_long = GorillaDecoder<int64_t>::decode(in);
         return common::long_to_double(value_long);
     }
 
-    int64_t cache_next(common::ByteStream& in) {
+    int64_t cache_next(common::ByteStream& in) override {
         read_next(in);
         if (stored_value_ ==
             common::double_to_long(GORILLA_ENCODING_ENDING_DOUBLE)) {
@@ -296,12 +668,88 @@ class DoubleGorillaDecoder : public GorillaDecoder<int64_t> {
         }
         return stored_value_;
     }
+
+    int read_batch_double(double* out, int capacity, int& actual,
+                          common::ByteStream& in) override {
+        int64_t ending = common::double_to_long(GORILLA_ENCODING_ENDING_DOUBLE);
+        actual = 0;
+        while (actual < capacity && has_remaining(in)) {
+            int64_t buf[129];
+            int batch = std::min(129, capacity - actual);
+            int buf_actual = 0;
+            int ret = batch_decode_raw(buf, batch, buf_actual, ending, in);
+            if (ret != common::E_OK) return ret;
+            if (buf_actual == 0) break;
+            for (int i = 0; i < buf_actual; i++) {
+                out[actual + i] = common::long_to_double(buf[i]);
+            }
+            actual += buf_actual;
+        }
+        return common::E_OK;
+    }
+
+    int skip_double(int count, int& skipped, common::ByteStream& in) override {
+        int64_t ending = common::double_to_long(GORILLA_ENCODING_ENDING_DOUBLE);
+        return batch_skip_raw(count, skipped, ending, in);
+    }
 };
 
 typedef GorillaDecoder<int32_t> IntGorillaDecoder;
 typedef GorillaDecoder<int64_t> LongGorillaDecoder;
 
-// wrap as Decoder interface
+// ── IntGorillaDecoder batch/skip overrides ─────────────────────────────────
+template <>
+inline int GorillaDecoder<int32_t>::read_batch_int32(int32_t* out, int capacity,
+                                                     int& actual,
+                                                     common::ByteStream& in) {
+    return batch_decode_raw(out, capacity, actual,
+                            GORILLA_ENCODING_ENDING_INTEGER, in);
+}
+template <>
+inline int GorillaDecoder<int32_t>::read_batch_int64(int64_t*, int, int& actual,
+                                                     common::ByteStream&) {
+    actual = 0;
+    return common::E_NOT_SUPPORT;
+}
+template <>
+inline int GorillaDecoder<int32_t>::skip_int32(int count, int& skipped,
+                                               common::ByteStream& in) {
+    return batch_skip_raw(count, skipped, GORILLA_ENCODING_ENDING_INTEGER, in);
+}
+template <>
+inline int GorillaDecoder<int32_t>::skip_int64(int, int& skipped,
+                                               common::ByteStream&) {
+    skipped = 0;
+    return common::E_NOT_SUPPORT;
+}
+
+// ── LongGorillaDecoder batch/skip overrides ───────────────────────────────
+template <>
+inline int GorillaDecoder<int64_t>::read_batch_int32(int32_t*, int, int& actual,
+                                                     common::ByteStream&) {
+    actual = 0;
+    return common::E_NOT_SUPPORT;
+}
+template <>
+inline int GorillaDecoder<int64_t>::read_batch_int64(int64_t* out, int capacity,
+                                                     int& actual,
+                                                     common::ByteStream& in) {
+    return batch_decode_raw(out, capacity, actual, GORILLA_ENCODING_ENDING_LONG,
+                            in);
+}
+template <>
+inline int GorillaDecoder<int64_t>::skip_int32(int, int& skipped,
+                                               common::ByteStream&) {
+    skipped = 0;
+    return common::E_NOT_SUPPORT;
+}
+template <>
+inline int GorillaDecoder<int64_t>::skip_int64(int count, int& skipped,
+                                               common::ByteStream& in) {
+    return batch_skip_raw(count, skipped, GORILLA_ENCODING_ENDING_LONG, in);
+}
+
+// ── Scalar Decoder interface wrappers (unchanged) ─────────────────────────
 template <>
 FORCE_INLINE int IntGorillaDecoder::read_boolean(bool& ret_value,
                                                  common::ByteStream& in) {
diff --git a/cpp/src/encoding/plain_decoder.h b/cpp/src/encoding/plain_decoder.h
index c2627f71d..3e83cfc76 100644
--- a/cpp/src/encoding/plain_decoder.h
+++ b/cpp/src/encoding/plain_decoder.h
@@ -20,10 +20,47 @@
 #ifndef ENCODING_PLAIN_DECODER_H
 #define ENCODING_PLAIN_DECODER_H
 
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+
+#if defined(_MSC_VER)
+#include <intrin.h>
+#include <stdlib.h>
+#endif
+
 #include "encoding/decoder.h"
 
 namespace storage {
 
+FORCE_INLINE uint32_t plain_bswap32(uint32_t v) {
+#if defined(__GNUC__) || defined(__clang__)
+    return __builtin_bswap32(v);
+#elif defined(_MSC_VER)
+    return _byteswap_ulong(v);
+#else
+    return ((v & 0x000000FFu) << 24) | ((v & 0x0000FF00u) << 8) |
+           ((v & 0x00FF0000u) >> 8) | ((v & 0xFF000000u) >> 24);
+#endif
+}
+
+FORCE_INLINE uint64_t plain_bswap64(uint64_t v) {
+#if defined(__GNUC__) || defined(__clang__)
+    return __builtin_bswap64(v);
+#elif defined(_MSC_VER)
+    return _byteswap_uint64(v);
+#else
+    return ((v & 0x00000000000000FFull) << 56) |
+           ((v & 0x000000000000FF00ull) << 40) |
+           ((v & 0x0000000000FF0000ull) << 24) |
+           ((v & 0x00000000FF000000ull) << 8) |
+           ((v & 0x000000FF00000000ull) >> 8) |
+           ((v & 0x0000FF0000000000ull) >> 24) |
+           ((v & 0x00FF000000000000ull) >> 40) |
+           ((v & 0xFF00000000000000ull) >> 56);
+#endif
+}
+
 class PlainDecoder : public Decoder {
    public:
     ~PlainDecoder() override = default;
@@ -62,6 +99,113 @@ class PlainDecoder : public Decoder {
                                  common::ByteStream& in) override {
         return common::SerializationUtil::read_mystring(ret_String, &pa, in);
     }
+
+    // ── Batch overrides ──────────────────────────────────────────────────────
+    //
+    // INT32: PLAIN encoding uses varint (variable stride).  Override to avoid
+    // virtual dispatch per element; actual decode is still per-value.
+    int read_batch_int32(int32_t* out, int capacity, int& actual,
+                         common::ByteStream& in) override {
+        actual = 0;
+        while (actual < capacity && in.has_remaining()) {
+            int ret = common::SerializationUtil::read_var_int(out[actual], in);
+            if (ret != common::E_OK) return ret;
+            ++actual;
+        }
+        return common::E_OK;
+    }
+
+    int skip_int32(int count, int& skipped, common::ByteStream& in) override {
+        skipped = 0;
+        int32_t dummy;
+        while (skipped < count && in.has_remaining()) {
+            int ret = common::SerializationUtil::read_var_int(dummy, in);
+            if (ret != common::E_OK) {
+                return ret;
+            }
+            ++skipped;
+        }
+        return common::E_OK;
+    }
+
+    // Fixed-stride INT64 / FLOAT / DOUBLE share the same shape: when the
+    // ByteStream is wrapped (contiguous buf), advance the read pointer in one
+    // step and byte-swap in place; otherwise fall back to per-value reads.
+    // The macros below expand into one override per type.
+#define PLAIN_SKIP_FIXED(NAME, T, STRIDE, READ_ONE)                         \
+    int NAME(int count, int& skipped, common::ByteStream& in) override {    \
+        skipped = 0;                                                        \
+        if (!in.is_wrapped()) {                                             \
+            T dummy;                                                        \
+            while (skipped < count && in.has_remaining()) {                 \
+                int ret = READ_ONE(dummy, in);                              \
+                if (ret != common::E_OK) {                                  \
+                    return ret;                                             \
+                }                                                           \
+                ++skipped;                                                  \
+            }                                                               \
+            return common::E_OK;                                            \
+        }                                                                   \
+        skipped = static_cast<int>(std::min<uint32_t>(                      \
+            in.remaining_size() / (STRIDE), static_cast<uint32_t>(count))); \
+        if (skipped <= 0) {                                                 \
+            skipped = 0;                                                    \
+            return common::E_OK;                                            \
+        }                                                                   \
+        in.wrapped_buf_advance_read_pos(static_cast<uint32_t>(skipped) *    \
+                                        (STRIDE));                          \
+        return common::E_OK;                                                \
+    }
+
+#define PLAIN_READ_BATCH_FIXED(NAME, T, U, STRIDE, READ_ONE, BSWAP)            \
+    int NAME(T* out, int capacity, int& actual, common::ByteStream& in)        \
+        override {                                                             \
+        actual = 0;                                                            \
+        if (!in.is_wrapped()) {                                                \
+            while (actual < capacity && in.has_remaining()) {                  \
+                int ret = READ_ONE(out[actual], in);                           \
+                if (ret != common::E_OK) {                                     \
+                    return ret;                                                \
+                }                                                              \
+                ++actual;                                                      \
+            }                                                                  \
+            return common::E_OK;                                               \
+        }                                                                      \
+        int n = static_cast<int>(std::min<uint32_t>(                           \
+            in.remaining_size() / (STRIDE), static_cast<uint32_t>(capacity))); \
+        if (n <= 0) {                                                          \
+            return common::E_OK;                                               \
+        }                                                                      \
+        const uint8_t* src =                                                   \
+            (const uint8_t*)in.get_wrapped_buf() + in.read_pos();              \
+        in.wrapped_buf_advance_read_pos(static_cast<uint32_t>(n) * (STRIDE));  \
+        actual = n;                                                            \
+        for (int i = 0; i < n; ++i) {                                          \
+            U v;                                                               \
+            memcpy(&v, src + i * (STRIDE), (STRIDE));                          \
+            v = BSWAP(v);                                                      \
+            memcpy(&out[i], &v, (STRIDE));                                     \
+        }                                                                      \
+        return common::E_OK;                                                   \
+    }
+
+    PLAIN_SKIP_FIXED(skip_int64, int64_t, 8,
+                     common::SerializationUtil::read_i64)
+    PLAIN_SKIP_FIXED(skip_float, float, 4,
+                     common::SerializationUtil::read_float)
+    PLAIN_SKIP_FIXED(skip_double, double, 8,
+                     common::SerializationUtil::read_double)
+
+    PLAIN_READ_BATCH_FIXED(read_batch_int64, int64_t, uint64_t, 8,
+                           common::SerializationUtil::read_i64, plain_bswap64)
+    PLAIN_READ_BATCH_FIXED(read_batch_float, float, uint32_t, 4,
+                           common::SerializationUtil::read_float, plain_bswap32)
+    PLAIN_READ_BATCH_FIXED(read_batch_double, double, uint64_t, 8,
+                           common::SerializationUtil::read_double,
+                           plain_bswap64)
+
+#undef PLAIN_SKIP_FIXED
+#undef PLAIN_READ_BATCH_FIXED
 };
 
 }  // end namespace storage
diff --git a/cpp/src/encoding/plain_encoder.h b/cpp/src/encoding/plain_encoder.h
index b768c9bf0..84ebee238 100644
--- a/cpp/src/encoding/plain_encoder.h
+++ b/cpp/src/encoding/plain_encoder.h
@@ -20,50 +20,221 @@
 #ifndef ENCODING_PLAIN_ENCODER_H
 #define ENCODING_PLAIN_ENCODER_H
 
+#include <cstring>
+
 #include "encoder.h"
 
+#if defined(__ARM_NEON) || defined(__ARM_NEON__)
+#include <arm_neon.h>
+#define TSFILE_HAS_NEON 1
+#endif
+
 namespace storage {
 
 class PlainEncoder : public Encoder {
    public:
     PlainEncoder() {}
     ~PlainEncoder() { destroy(); }
-    void destroy() { /* do nothing for PlainEncoder */
+    void destroy() override { /* do nothing for PlainEncoder */
     }
-    void reset() { /* do thing for PlainEncoder */
+    void reset() override { /* do thing for PlainEncoder */
     }
 
-    FORCE_INLINE int encode(bool value, common::ByteStream& out_stream) {
+    FORCE_INLINE int encode(bool value,
+                            common::ByteStream& out_stream) override {
         return common::SerializationUtil::write_i8(value ? 1 : 0, out_stream);
     }
 
-    FORCE_INLINE int encode(int32_t value, common::ByteStream& out_stream) {
+    FORCE_INLINE int encode(int32_t value,
+                            common::ByteStream& out_stream) override {
         return common::SerializationUtil::write_var_int(value, out_stream);
     }
 
-    FORCE_INLINE int encode(int64_t value, common::ByteStream& out_stream) {
+    FORCE_INLINE int encode(int64_t value,
+                            common::ByteStream& out_stream) override {
         return common::SerializationUtil::write_i64(value, out_stream);
     }
 
-    FORCE_INLINE int encode(float value, common::ByteStream& out_stream) {
+    FORCE_INLINE int encode(float value,
+                            common::ByteStream& out_stream) override {
         return common::SerializationUtil::write_float(value, out_stream);
     }
 
-    FORCE_INLINE int encode(double value, common::ByteStream& out_stream) {
+    FORCE_INLINE int encode(double value,
+                            common::ByteStream& out_stream) override {
         return common::SerializationUtil::write_double(value, out_stream);
     }
 
     FORCE_INLINE int encode(common::String value,
-                            common::ByteStream& out_stream) {
+                            common::ByteStream& out_stream) override {
         return common::SerializationUtil::write_mystring(value, out_stream);
     }
 
-    int flush(common::ByteStream& out_stream) {
+    int flush(common::ByteStream& out_stream) override {
         // do nothing for PlainEncoder
         return common::E_OK;
     }
 
-    int get_max_byte_size() { return 0; }
+    int get_max_byte_size() override { return 0; }
+
+    // Optimized batch encoding: directly byte-swap into ByteStream page buffer.
+    // Avoids per-value write_buf overhead entirely — only calls acquire_buf()
+    // once per page boundary crossing.
+    int encode_batch(const int64_t* values, uint32_t count,
+                     common::ByteStream& out_stream) override {
+        if (count == 0) return common::E_OK;
+        uint32_t offset = 0;
+        while (offset < count) {
+            common::ByteStream::Buffer buf = out_stream.acquire_buf();
+            if (UNLIKELY(buf.buf_ == nullptr)) return common::E_OOM;
+            // How many int64 values fit in the remaining page space?
+            uint32_t capacity = buf.len_ / 8;
+            if (capacity == 0) {
+                // Page has < 8 bytes left, fall back to write_buf for this one
+                return Encoder::encode_batch(values + offset, count - offset,
+                                             out_stream);
+            }
+            uint32_t batch = std::min(count - offset, capacity);
+            uint8_t* dst = (uint8_t*)buf.buf_;
+            const int64_t* src = values + offset;
+            uint32_t i = 0;
+#if TSFILE_HAS_NEON
+            // NEON: byte-reverse 2 x int64 per iteration
+            for (; i + 2 <= batch; i += 2) {
+                uint8x16_t v = vld1q_u8((const uint8_t*)&src[i]);
+                v = vrev64q_u8(v);
+                vst1q_u8(dst, v);
+                dst += 16;
+            }
+#endif
+            // Scalar tail
+            for (; i < batch; i++) {
+                uint64_t v = (uint64_t)src[i];
+                dst[0] = (uint8_t)(v >> 56);
+                dst[1] = (uint8_t)(v >> 48);
+                dst[2] = (uint8_t)(v >> 40);
+                dst[3] = (uint8_t)(v >> 32);
+                dst[4] = (uint8_t)(v >> 24);
+                dst[5] = (uint8_t)(v >> 16);
+                dst[6] = (uint8_t)(v >> 8);
+                dst[7] = (uint8_t)(v);
+                dst += 8;
+            }
+            out_stream.buffer_used(batch * 8);
+            offset += batch;
+        }
+        return common::E_OK;
+    }
+
+    int encode_batch(const double* values, uint32_t count,
+                     common::ByteStream& out_stream) override {
+        if (count == 0) return common::E_OK;
+        uint32_t offset = 0;
+        while (offset < count) {
+            common::ByteStream::Buffer buf = out_stream.acquire_buf();
+            if (UNLIKELY(buf.buf_ == nullptr)) return common::E_OOM;
+            uint32_t capacity = buf.len_ / 8;
+            if (capacity == 0) {
+                return Encoder::encode_batch(values + offset, count - offset,
+                                             out_stream);
+            }
+            uint32_t batch = std::min(count - offset, capacity);
+            uint8_t* dst = (uint8_t*)buf.buf_;
+            const double* src = values + offset;
+            uint32_t i = 0;
+#if TSFILE_HAS_NEON
+            // NEON byte-reverse of raw bytes works for double bits too.
+            for (; i + 2 <= batch; i += 2) {
+                uint8x16_t v = vld1q_u8((const uint8_t*)&src[i]);
+                v = vrev64q_u8(v);
+                vst1q_u8(dst, v);
+                dst += 16;
+            }
+#endif
+            // Scalar tail: round-trip the bits via memcpy to avoid the
+            // strict-aliasing violation of reading a double through an
+            // int64_t* (the old reinterpret_cast dispatch).
+            for (; i < batch; i++) {
+                uint64_t v;
+                memcpy(&v, &src[i], sizeof(double));
+                dst[0] = (uint8_t)(v >> 56);
+                dst[1] = (uint8_t)(v >> 48);
+                dst[2] = (uint8_t)(v >> 40);
+                dst[3] = (uint8_t)(v >> 32);
+                dst[4] = (uint8_t)(v >> 24);
+                dst[5] = (uint8_t)(v >> 16);
+                dst[6] = (uint8_t)(v >> 8);
+                dst[7] = (uint8_t)(v);
+                dst += 8;
+            }
+            out_stream.buffer_used(batch * 8);
+            offset += batch;
+        }
+        return common::E_OK;
+    }
+
+    int encode_batch(const float* values, uint32_t count,
+                     common::ByteStream& out_stream) override {
+        if (count == 0) return common::E_OK;
+        uint32_t offset = 0;
+        while (offset < count) {
+            common::ByteStream::Buffer buf = out_stream.acquire_buf();
+            if (UNLIKELY(buf.buf_ == nullptr)) return common::E_OOM;
+            uint32_t capacity = buf.len_ / 4;
+            if (capacity == 0) {
+                return Encoder::encode_batch(values + offset, count - offset,
+                                             out_stream);
+            }
+            uint32_t batch = std::min(count - offset, capacity);
+            uint8_t* dst = (uint8_t*)buf.buf_;
+            const float* src = values + offset;
+            uint32_t i = 0;
+#if TSFILE_HAS_NEON
+            // NEON: byte-reverse 4 x float (32-bit) per iteration
+            for (; i + 4 <= batch; i += 4) {
+                uint8x16_t v = vld1q_u8((const uint8_t*)&src[i]);
+                v = vrev32q_u8(v);
+                vst1q_u8(dst, v);
+                dst += 16;
+            }
+#endif
+            for (; i < batch; i++) {
+                uint32_t v;
+                memcpy(&v, &src[i], sizeof(float));
+                dst[0] = (uint8_t)(v >> 24);
+                dst[1] = (uint8_t)(v >> 16);
+                dst[2] = (uint8_t)(v >> 8);
+                dst[3] = (uint8_t)(v);
+                dst += 4;
+            }
+            out_stream.buffer_used(batch * 4);
+            offset += batch;
+        }
+        return common::E_OK;
+    }
+
+    // Batch encode strings from Arrow-style offset+buffer layout.
+    // Each string is serialized as: var_int(len) + raw bytes.
+    int encode_string_batch(const char* buffer, const uint32_t* offsets,
+                            uint32_t start_idx, uint32_t count,
+                            common::ByteStream& out_stream) override {
+        int ret = common::E_OK;
+        for (uint32_t i = 0; i < count; i++) {
+            uint32_t idx = start_idx + i;
+            uint32_t len = offsets[idx + 1] - offsets[idx];
+            if (RET_FAIL(common::SerializationUtil::write_var_int(
+                    (int32_t)len, out_stream))) {
+                return ret;
+            }
+            if (len > 0) {
+                if (RET_FAIL(
+                        out_stream.write_buf(buffer + offsets[idx], len))) {
+                    return ret;
+                }
+            }
+        }
+        return ret;
+    }
 };
 
 }  // end namespace storage
diff --git a/cpp/src/encoding/ts2diff_decoder.h b/cpp/src/encoding/ts2diff_decoder.h
index f37001003..bc6e89613 100644
--- a/cpp/src/encoding/ts2diff_decoder.h
+++ b/cpp/src/encoding/ts2diff_decoder.h
@@ -24,6 +24,7 @@
 
 #include <cmath>
 #include <cstddef>
+#include <cstring>
 #include <vector>
 
 #include "common/allocator/alloc_base.h"
@@ -31,8 +32,174 @@
 #include "decoder.h"
 #include "utils/util_define.h"
 
+#ifdef ENABLE_SIMD
+#include "simde/x86/avx2.h"
+#endif
+
 namespace storage {
 
+// ============================================================================
+// SIMD batch decode helpers (INT32)
+// ============================================================================
+#ifdef ENABLE_SIMD
+
+// Decode 4 INT32 values from bit-packed data using SIMD gather + shift.
+// @in:        pointer to the start of packed bit data for the block
+// @bit_width: bits per delta value
+// @delta_min: minimum delta offset for this block
+// @index:     current position within the block (0-based, among write_index_
+//             deltas)
+// @base:      the previous reconstructed value (for prefix-sum)
+// @out:       output array (4 values written)
+// Returns:    the last reconstructed value (new base for next group)
+static inline int32_t simd_decode_4_i32(const uint8_t* in, int32_t bit_width,
+                                        int32_t delta_min, int32_t index,
+                                        int32_t base, int32_t out[4]) {
+    static const simde__m128i SHUF_REV4 = simde_mm_setr_epi8(
+        3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12);
+
+    const simde__m128i VMIN4 = simde_mm_set1_epi32(delta_min);
+
+    int32_t pos0 = index * bit_width;
+    int32_t pos[4] = {pos0, pos0 + bit_width, pos0 + 2 * bit_width,
+                      pos0 + 3 * bit_width};
+    int32_t bidx[4] = {pos[0] >> 3, pos[1] >> 3, pos[2] >> 3, pos[3] >> 3};
+    int32_t off[4] = {pos[0] & 7, pos[1] & 7, pos[2] & 7, pos[3] & 7};
+
+    simde__m128i IDX = simde_mm_setr_epi32(bidx[0], bidx[1], bidx[2], bidx[3]);
+    simde__m128i OFF = simde_mm_setr_epi32(off[0], off[1], off[2], off[3]);
+
+    simde__m128i V4;
+
+    if (bit_width <= 16) {
+        int rshift = 32 - bit_width;
+        simde__m128i w32_le = simde_mm_i32gather_epi32((const int*)in, IDX, 1);
+        simde__m128i w32_be = simde_mm_shuffle_epi8(w32_le, SHUF_REV4);
+        simde__m128i U32 = simde_mm_sllv_epi32(w32_be, OFF);
+        simde__m128i RS32 = simde_mm_set1_epi32(rshift);
+        V4 = simde_mm_srlv_epi32(U32, RS32);
+    } else {
+        static const simde__m256i SHUF_REV8 = simde_mm256_setr_epi8(
+            7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3,
+            2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
+        int rshift = 64 - bit_width;
+        simde__m256i w64_le =
+            simde_mm256_i32gather_epi64((const int64_t*)in, IDX, 1);
+        simde__m256i w64_be = simde_mm256_shuffle_epi8(w64_le, SHUF_REV8);
+        simde__m256i OFF64 = simde_mm256_cvtepu32_epi64(OFF);
+        simde__m256i U64 = simde_mm256_sllv_epi64(w64_be, OFF64);
+        simde__m256i V64 =
+            simde_mm256_srl_epi64(U64, simde_mm_cvtsi32_si128(rshift));
+        simde__m256i perm = simde_mm256_setr_epi32(0, 2, 4, 6, 0, 0, 0, 0);
+        simde__m256i comp = simde_mm256_permutevar8x32_epi32(V64, perm);
+        V4 = simde_mm256_castsi256_si128(comp);
+    }
+
+    // Add delta_min
+    V4 = simde_mm_add_epi32(V4, VMIN4);
+
+    // Prefix sum to reconstruct absolute values
+    simde__m128i t;
+    t = simde_mm_slli_si128(V4, 4);
+    V4 = simde_mm_add_epi32(V4, t);
+    t = simde_mm_slli_si128(V4, 8);
+    V4 = simde_mm_add_epi32(V4, t);
+
+    // Add base
+    simde__m128i C4 = simde_mm_set1_epi32(base);
+    V4 = simde_mm_add_epi32(V4, C4);
+
+    simde_mm_storeu_si128((simde__m128i*)out, V4);
+    return out[3];
+}
+
+// Decode 4 INT64 values from bit-packed data using SIMD.
+static inline int64_t simd_decode_4_i64(const uint8_t* in, int32_t bit_width,
+                                        int64_t delta_min, int32_t index,
+                                        int64_t base, int64_t out[4]) {
+    static const simde__m256i SHUF_REV8 = simde_mm256_setr_epi8(
+        7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2,
+        1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
+
+    const simde__m256i VMIN4 = simde_mm256_set1_epi64x(delta_min);
+
+    int32_t pos0 = index * bit_width;
+    int32_t pos[4] = {pos0, pos0 + bit_width, pos0 + 2 * bit_width,
+                      pos0 + 3 * bit_width};
+    int32_t bidx[4] = {pos[0] >> 3, pos[1] >> 3, pos[2] >> 3, pos[3] >> 3};
+    int32_t off[4] = {pos[0] & 7, pos[1] & 7, pos[2] & 7, pos[3] & 7};
+
+    simde__m128i IDX = simde_mm_setr_epi32(bidx[0], bidx[1], bidx[2], bidx[3]);
+
+    int rshift = 64 - bit_width;
+    simde__m256i w64_le =
+        simde_mm256_i32gather_epi64((const int64_t*)in, IDX, 1);
+    simde__m256i w64_be = simde_mm256_shuffle_epi8(w64_le, SHUF_REV8);
+    simde__m256i OFF64 = simde_mm256_cvtepu32_epi64(
+        simde_mm_setr_epi32(off[0], off[1], off[2], off[3]));
+    simde__m256i U64 = simde_mm256_sllv_epi64(w64_be, OFF64);
+    simde__m256i V64 =
+        simde_mm256_srl_epi64(U64, simde_mm_cvtsi32_si128(rshift));
+
+    // Add delta_min
+    V64 = simde_mm256_add_epi64(V64, VMIN4);
+
+    // Prefix sum (64-bit, 4 lanes)
+    simde__m256i t;
+    // shift by 8 bytes = 1 lane
+    t = simde_mm256_slli_si256(V64, 8);
+    V64 = simde_mm256_add_epi64(V64, t);
+    // cross-lane: add lane[1] to lane[2] and lane[3]
+    // Extract high 128 bits, add broadcast of element[1] to both elements
+    int64_t tmp_buf[4];
+    simde_mm256_storeu_si256((simde__m256i*)tmp_buf, V64);
+    tmp_buf[2] += tmp_buf[1];
+    tmp_buf[3] += tmp_buf[1];
+    V64 = simde_mm256_loadu_si256((const simde__m256i*)tmp_buf);
+
+    // Add base
+    simde__m256i C4 = simde_mm256_set1_epi64x(base);
+    V64 = simde_mm256_add_epi64(V64, C4);
+
+    simde_mm256_storeu_si256((simde__m256i*)out, V64);
+    return out[3];
+}
+
+#endif  // ENABLE_SIMD
+
+// ============================================================================
+// Scalar batch decode helpers
+// ============================================================================
+
+// Scalar: extract one value from bit-packed data.
+// @data:      pointer to packed bits (NOT advanced; caller handles position)
+// @bit_pos:   bit offset from start of data
+// @bit_width: bits per value
+static inline int64_t scalar_read_bits(const uint8_t* data, int32_t bit_pos,
+                                       int32_t bit_width) {
+    int64_t value = 0;
+    int bits = bit_width;
+    int byte_idx = bit_pos >> 3;
+    int bit_offset = bit_pos & 7;
+    int bits_avail = 8 - bit_offset;
+
+    while (bits > 0) {
+        if (bits >= bits_avail) {
+            uint8_t d = data[byte_idx] & ((1 << bits_avail) - 1);
+            value = (value << bits_avail) | d;
+            bits -= bits_avail;
+            byte_idx++;
+            bits_avail = 8;
+        } else {
+            uint8_t d =
+                (data[byte_idx] >> (bits_avail - bits)) & ((1 << bits) - 1);
+            value = (value << bits) | d;
+            bits = 0;
+        }
+    }
+    return value;
+}
+
 namespace ts2diff_java_detail {
 
 // Java float/double TS_2DIFF overflow page markers.
@@ -54,7 +221,7 @@ inline bool bitmap_marked(const std::vector<uint8_t>& bm, int idx) {
 
 inline bool looks_like_ts2diff_header(common::ByteStream& in) {
     int ret = common::E_OK;
-    uint32_t probe_mark = in.read_pos();
+    uint64_t probe_mark = in.read_pos();
     int32_t write_index = 0;
     int32_t bit_width = 0;
     if (RET_FAIL(common::SerializationUtil::read_i32(write_index, in)) ||
@@ -82,7 +249,7 @@ inline int consume_float_double_ts2diff_prefix(
     underflow_bm.clear();
     overflow_bm.clear();
     segment_size = 0;
-    uint32_t mark = in.read_pos();
+    uint64_t mark = in.read_pos();
     uint32_t tag = 0;
     if (RET_FAIL(common::SerializationUtil::read_var_uint(tag, in))) {
         return ret;
@@ -132,6 +299,9 @@ inline int consume_float_double_ts2diff_prefix(
 
 }  // namespace ts2diff_java_detail
 
+// ============================================================================
+// TS2DIFFDecoder template
+// ============================================================================
 template <typename T>
 class TS2DIFFDecoder : public Decoder {
    public:
@@ -148,12 +318,14 @@ class TS2DIFFDecoder : public Decoder {
         previous_value_ = 0;
         bit_width_ = 0;
         current_index_ = 0;
+        header_peeked_ = false;
     }
 
     FORCE_INLINE bool has_remaining(const common::ByteStream& buffer) override {
         if (buffer.has_remaining()) return true;
-        return bits_left_ != 0 || (current_index_ <= write_index_ &&
-                                   write_index_ != -1 && current_index_ != 0);
+        return header_peeked_ || bits_left_ != 0 ||
+               (current_index_ <= write_index_ && write_index_ != -1 &&
+                current_index_ != 0);
     }
 
     void read_header(common::ByteStream& in) {
@@ -208,6 +380,18 @@ class TS2DIFFDecoder : public Decoder {
     int read_String(common::String& ret_value, common::PageArena& pa,
                     common::ByteStream& in) override;
 
+    int read_batch_int32(int32_t* out, int capacity, int& actual,
+                         common::ByteStream& in) override;
+    int read_batch_int64(int64_t* out, int capacity, int& actual,
+                         common::ByteStream& in) override;
+    int skip_int32(int count, int& skipped, common::ByteStream& in) override;
+    int skip_int64(int count, int& skipped, common::ByteStream& in) override;
+
+    bool peek_next_block_range_int64(common::ByteStream& in, int64_t& block_min,
+                                     int64_t& block_max,
+                                     int& block_count) override;
+    int skip_peeked_block_int64(common::ByteStream& in, int& skipped) override;
+
    public:
     T first_value_;
     T previous_value_;
@@ -218,8 +402,13 @@ class TS2DIFFDecoder : public Decoder {
     int bit_width_;
     int write_index_;
     int current_index_;
+    bool header_peeked_;
 };
 
+// ============================================================================
+// Per-value decode (unchanged)
+// ============================================================================
+
 template <>
 inline int32_t TS2DIFFDecoder<int32_t>::decode(common::ByteStream& in) {
     int32_t ret_value = stored_value_;
@@ -274,6 +463,436 @@ inline int64_t TS2DIFFDecoder<int64_t>::decode(common::ByteStream& in) {
     return ret_value;
 }
 
+// ============================================================================
+// Batch decode: INT32
+// Decodes one full block (up to 129 values) per call using SIMD when enabled.
+// ============================================================================
+
+template <>
+inline int TS2DIFFDecoder<int32_t>::read_batch_int32(int32_t* out, int capacity,
+                                                     int& actual,
+                                                     common::ByteStream& in) {
+    actual = 0;
+
+    while (actual < capacity && has_remaining(in)) {
+        // If we are mid-block (current_index_ != 0), finish it per-value.
+        if (current_index_ != 0) {
+            while (actual < capacity && current_index_ != 0 &&
+                   has_remaining(in)) {
+                out[actual++] = decode(in);
+            }
+            continue;
+        }
+
+        // Start of a new block — read header
+        read_header(in);
+        common::SerializationUtil::read_i32(delta_min_, in);
+        common::SerializationUtil::read_i32(first_value_, in);
+        bits_left_ = 0;
+        buffer_ = 0;
+
+        // Output first_value
+        if (actual >= capacity) {
+            // Must consume first_value next time; set state for per-value path
+            current_index_ = 0;
+            // We already consumed the header; push first_value as stored
+            // and let the next call to decode() handle it.
+            // Actually, we need to handle this: rewind is not possible.
+            // So we output first_value and accept going 1 over capacity.
+        }
+        out[actual++] = first_value_;
+
+        if (write_index_ == 0) {
+            // Block has only first_value, no deltas
+            current_index_ = 0;
+            continue;
+        }
+
+        int32_t remaining = write_index_;
+        if (actual + remaining > capacity) {
+            // Block won't fit in output. Fall back to per-value decode.
+            // Stream is at packed data start; bits_left_/buffer_ are reset.
+            current_index_ = 1;
+            continue;
+        }
+        if (!in.is_wrapped()) {
+            // SIMD/scalar block decode below requires a contiguous wrapped
+            // buffer.  For a paged ByteStream, drop down to per-value
+            // decode the same way the doesn't-fit branch does.
+            current_index_ = 1;
+            continue;
+        }
+
+        // Full block decode. Validate against corrupt headers before
+        // advancing the read position — a bogus bit_width_ or write_index_
+        // could compute a block_bytes that overflows the int32_t multiply
+        // or runs past the wrapped buffer.
+        if (UNLIKELY(write_index_ < 0 || bit_width_ < 0 || bit_width_ > 32)) {
+            return common::E_TSFILE_CORRUPTED;
+        }
+        int64_t block_bytes_64 =
+            (static_cast<int64_t>(write_index_) * bit_width_ + 7) / 8;
+        if (UNLIKELY(block_bytes_64 > in.remaining_size())) {
+            return common::E_TSFILE_CORRUPTED;
+        }
+        int32_t block_bytes = static_cast<int32_t>(block_bytes_64);
+        const uint8_t* blk_ptr =
+            (const uint8_t*)in.get_wrapped_buf() + in.read_pos();
+        in.wrapped_buf_advance_read_pos(static_cast<uint32_t>(block_bytes));
+
+        int32_t prev = first_value_;
+        int32_t i = 0;
+
+#ifdef ENABLE_SIMD
+        // SIMD path: decode 8 values at a time (2 groups of 4)
+        for (; i + 7 < remaining; i += 8) {
+            int32_t need_bytes = ((i + 7) * bit_width_ + bit_width_ + 7) / 8 +
+                                 (bit_width_ > 16 ? 8 : 4);
+            if (need_bytes > block_bytes) break;
+
+            int32_t grp_out[8];
+            prev = simd_decode_4_i32(blk_ptr, bit_width_, delta_min_, i, prev,
+                                     grp_out);
+            prev = simd_decode_4_i32(blk_ptr, bit_width_, delta_min_, i + 4,
+                                     prev, grp_out + 4);
+
+            memcpy(out + actual, grp_out, 8 * sizeof(int32_t));
+            actual += 8;
+        }
+#endif
+
+        // Scalar tail
+        int32_t bit_pos = i * bit_width_;
+        for (; i < remaining; ++i) {
+            int64_t delta = scalar_read_bits(blk_ptr, bit_pos, bit_width_);
+            bit_pos += bit_width_;
+            int32_t val = (int32_t)delta + prev + delta_min_;
+            prev = val;
+            out[actual++] = val;
+        }
+
+        // Block done, reset state
+        first_value_ = prev;
+        current_index_ = 0;
+    }
+
+    return common::E_OK;
+}
+
+// ============================================================================
+// Batch decode: INT64
+// ============================================================================
+
+template <>
+inline int TS2DIFFDecoder<int64_t>::read_batch_int64(int64_t* out, int capacity,
+                                                     int& actual,
+                                                     common::ByteStream& in) {
+    actual = 0;
+
+    while (actual < capacity && has_remaining(in)) {
+        // If mid-block, finish per-value
+        if (current_index_ != 0) {
+            while (actual < capacity && current_index_ != 0 &&
+                   has_remaining(in)) {
+                out[actual++] = decode(in);
+            }
+            continue;
+        }
+
+        // Start of a new block
+        if (!header_peeked_) {
+            read_header(in);
+            common::SerializationUtil::read_i64(delta_min_, in);
+            common::SerializationUtil::read_i64(first_value_, in);
+            bits_left_ = 0;
+            buffer_ = 0;
+        }
+        header_peeked_ = false;
+
+        out[actual++] = first_value_;
+
+        if (write_index_ == 0) {
+            current_index_ = 0;
+            continue;
+        }
+
+        int32_t remaining = write_index_;
+        if (actual + remaining > capacity) {
+            // Block won't fit in output. Fall back to per-value decode.
+            // Stream is at packed data start; bits_left_/buffer_ are reset.
+            current_index_ = 1;
+            continue;
+        }
+        if (!in.is_wrapped()) {
+            // SIMD/scalar block decode below requires a contiguous wrapped
+            // buffer.  Page-backed ByteStreams must use the per-value path.
+            current_index_ = 1;
+            continue;
+        }
+
+        // Validate against corrupt headers (see int32 path).
+        if (UNLIKELY(write_index_ < 0 || bit_width_ < 0 || bit_width_ > 64)) {
+            return common::E_TSFILE_CORRUPTED;
+        }
+        int64_t block_bytes_64 =
+            (static_cast<int64_t>(write_index_) * bit_width_ + 7) / 8;
+        if (UNLIKELY(block_bytes_64 > in.remaining_size())) {
+            return common::E_TSFILE_CORRUPTED;
+        }
+        int32_t block_bytes = static_cast<int32_t>(block_bytes_64);
+        // Direct pointer into the wrapped ByteStream buffer.
+        const uint8_t* blk_ptr =
+            (const uint8_t*)in.get_wrapped_buf() + in.read_pos();
+        in.wrapped_buf_advance_read_pos(static_cast<uint32_t>(block_bytes));
+
+        int64_t prev = first_value_;
+        int32_t i = 0;
+
+#ifdef ENABLE_SIMD
+        // SIMD path: decode 4 INT64 values at a time
+        for (; i + 3 < remaining; i += 4) {
+            int32_t need_bytes =
+                ((i + 3) * bit_width_ + bit_width_ + 7) / 8 + 8;
+            if (need_bytes > block_bytes) break;
+
+            int64_t grp_out[4];
+            prev = simd_decode_4_i64(blk_ptr, bit_width_, delta_min_, i, prev,
+                                     grp_out);
+            memcpy(out + actual, grp_out, 4 * sizeof(int64_t));
+            actual += 4;
+        }
+#endif
+
+        // Scalar tail
+        int32_t bit_pos = i * bit_width_;
+        for (; i < remaining; ++i) {
+            int64_t delta = scalar_read_bits(blk_ptr, bit_pos, bit_width_);
+            bit_pos += bit_width_;
+            int64_t val = delta + prev + delta_min_;
+            prev = val;
+            out[actual++] = val;
+        }
+
+        first_value_ = prev;
+        current_index_ = 0;
+    }
+
+    return common::E_OK;
+}
+
+// ============================================================================
+// Skip: INT32 — read header only, jump over packed data
+// ============================================================================
+
+template <>
+inline int TS2DIFFDecoder<int32_t>::skip_int32(int count, int& skipped,
+                                               common::ByteStream& in) {
+    skipped = 0;
+
+    // If mid-block, finish current block per-value
+    while (skipped < count && current_index_ != 0 && has_remaining(in)) {
+        decode(in);
+        ++skipped;
+    }
+
+    while (skipped < count && has_remaining(in)) {
+        int32_t wi, bw, dm, fv;
+        common::SerializationUtil::read_i32(wi, in);
+        common::SerializationUtil::read_i32(bw, in);
+        common::SerializationUtil::read_i32(dm, in);
+        common::SerializationUtil::read_i32(fv, in);
+
+        int32_t block_vals = wi + 1;
+        bits_left_ = 0;
+        buffer_ = 0;
+
+        if (count - skipped >= block_vals) {
+            // Whole-block fast path: jump over packed body.
+            int32_t skip_bytes = (wi * bw + 7) / 8;
+            in.wrapped_buf_advance_read_pos(skip_bytes);
+            skipped += block_vals;
+            current_index_ = 0;
+            write_index_ = -1;
+        } else {
+            // Partial block: reinstate decoder state as if we'd just
+            // emitted first_value_ from decode(), bump skipped by 1,
+            // then per-value decode the remaining count, leaving the
+            // rest of the block intact for the next decode() call.
+            write_index_ = wi;
+            bit_width_ = bw;
+            delta_min_ = dm;
+            first_value_ = fv;
+            current_index_ = (wi == 0) ? 0 : 1;
+            ++skipped;
+            while (skipped < count && current_index_ != 0 &&
+                   has_remaining(in)) {
+                decode(in);
+                ++skipped;
+            }
+        }
+    }
+
+    return common::E_OK;
+}
+
+// ============================================================================
+// Skip: INT64
+// ============================================================================
+
+template <>
+inline int TS2DIFFDecoder<int64_t>::skip_int64(int count, int& skipped,
+                                               common::ByteStream& in) {
+    skipped = 0;
+
+    while (skipped < count && current_index_ != 0 && has_remaining(in)) {
+        decode(in);
+        ++skipped;
+    }
+
+    while (skipped < count && has_remaining(in)) {
+        int32_t wi, bw;
+        int64_t dm, fv;
+        common::SerializationUtil::read_i32(wi, in);
+        common::SerializationUtil::read_i32(bw, in);
+        common::SerializationUtil::read_i64(dm, in);
+        common::SerializationUtil::read_i64(fv, in);
+
+        int32_t block_vals = wi + 1;
+        bits_left_ = 0;
+        buffer_ = 0;
+
+        if (count - skipped >= block_vals) {
+            int32_t skip_bytes = (wi * bw + 7) / 8;
+            in.wrapped_buf_advance_read_pos(skip_bytes);
+            skipped += block_vals;
+            current_index_ = 0;
+            write_index_ = -1;
+        } else {
+            write_index_ = wi;
+            bit_width_ = bw;
+            delta_min_ = dm;
+            first_value_ = fv;
+            current_index_ = (wi == 0) ? 0 : 1;
+            ++skipped;
+            while (skipped < count && current_index_ != 0 &&
+                   has_remaining(in)) {
+                decode(in);
+                ++skipped;
+            }
+        }
+    }
+
+    return common::E_OK;
+}
+
+// ============================================================================
+// Block-level filter check: peek header and compute value range
+// ============================================================================
+
+template <>
+inline bool TS2DIFFDecoder<int64_t>::peek_next_block_range_int64(
+    common::ByteStream& in, int64_t& block_min, int64_t& block_max,
+    int& block_count) {
+    if (current_index_ != 0 || !has_remaining(in)) return false;
+
+    read_header(in);
+    common::SerializationUtil::read_i64(delta_min_, in);
+    common::SerializationUtil::read_i64(first_value_, in);
+    bits_left_ = 0;
+    buffer_ = 0;
+
+    block_min = first_value_;
+    block_count = write_index_ + 1;
+
+    // Look-ahead: since timestamps are monotonically increasing, the true
+    // block_max is the last timestamp, which equals next block's first_value_.
+    // The next block header starts at read_pos + packed_bytes. first_value_ is
+    // at offset 16 within the header
+    // (write_index_(4)+bit_width_(4)+delta_min_(8)). We read it via raw pointer
+    // so the stream position is not consumed.
+    int32_t packed_bytes = (write_index_ * bit_width_ + 7) / 8;
+    if (in.remaining_size() >= (uint32_t)packed_bytes + 24) {
+        char* next_fv_ptr =
+            in.get_wrapped_buf() + in.read_pos() + packed_bytes + 16;
+        block_max = (int64_t)common::SerializationUtil::read_ui64(next_fv_ptr);
+    } else {
+        // Last block in page: fall back to conservative estimate.
+        if (write_index_ == 0 || bit_width_ == 0) {
+            block_max = first_value_ + (int64_t)write_index_ * delta_min_;
+        } else if (bit_width_ >= 63) {
+            block_max = INT64_MAX;
+        } else {
+            int64_t max_delta = delta_min_ + ((1LL << bit_width_) - 1);
+            block_max = first_value_ + (int64_t)write_index_ * max_delta;
+        }
+    }
+
+    header_peeked_ = true;
+    return true;
+}
+
+template <>
+inline int TS2DIFFDecoder<int64_t>::skip_peeked_block_int64(
+    common::ByteStream& in, int& skipped) {
+    skipped = write_index_ + 1;
+    int32_t skip_bytes = (write_index_ * bit_width_ + 7) / 8;
+    in.wrapped_buf_advance_read_pos(skip_bytes);
+    header_peeked_ = false;
+    bits_left_ = 0;
+    buffer_ = 0;
+    current_index_ = 0;
+    write_index_ = -1;
+    return common::E_OK;
+}
+
+// INT32 specialization: not applicable (timestamps are always INT64)
+template <>
+inline bool TS2DIFFDecoder<int32_t>::peek_next_block_range_int64(
+    common::ByteStream& in, int64_t& block_min, int64_t& block_max,
+    int& block_count) {
+    return false;
+}
+
+template <>
+inline int TS2DIFFDecoder<int32_t>::skip_peeked_block_int64(
+    common::ByteStream& in, int& skipped) {
+    return common::E_NOT_SUPPORT;
+}
+
+// ============================================================================
+// Default (unsupported type) batch/skip — fall back to base class
+// ============================================================================
+
+template <>
+inline int TS2DIFFDecoder<int32_t>::read_batch_int64(int64_t* out, int capacity,
+                                                     int& actual,
+                                                     common::ByteStream& in) {
+    return Decoder::read_batch_int64(out, capacity, actual, in);
+}
+
+template <>
+inline int TS2DIFFDecoder<int32_t>::skip_int64(int count, int& skipped,
+                                               common::ByteStream& in) {
+    return Decoder::skip_int64(count, skipped, in);
+}
+
+template <>
+inline int TS2DIFFDecoder<int64_t>::read_batch_int32(int32_t* out, int capacity,
+                                                     int& actual,
+                                                     common::ByteStream& in) {
+    return Decoder::read_batch_int32(out, capacity, actual, in);
+}
+
+template <>
+inline int TS2DIFFDecoder<int64_t>::skip_int32(int count, int& skipped,
+                                               common::ByteStream& in) {
+    return Decoder::skip_int32(count, skipped, in);
+}
+
+// ============================================================================
+// Float / Double wrapper decoders (unchanged)
+// ============================================================================
+
 class FloatTS2DIFFDecoder : public TS2DIFFDecoder<int32_t> {
    public:
     FloatTS2DIFFDecoder() = default;
@@ -282,11 +901,24 @@ class FloatTS2DIFFDecoder : public TS2DIFFDecoder<int32_t> {
         return common::int_to_float(value_int);
     }
 
-    int read_boolean(bool& ret_value, common::ByteStream& in);
-    int read_int32(int32_t& ret_value, common::ByteStream& in);
-    int read_int64(int64_t& ret_value, common::ByteStream& in);
-    int read_float(float& ret_value, common::ByteStream& in);
-    int read_double(double& ret_value, common::ByteStream& in);
+    int read_boolean(bool& ret_value, common::ByteStream& in) override;
+    int read_int32(int32_t& ret_value, common::ByteStream& in) override;
+    int read_int64(int64_t& ret_value, common::ByteStream& in) override;
+    int read_float(float& ret_value, common::ByteStream& in) override;
+    int read_double(double& ret_value, common::ByteStream& in) override;
+
+    int read_batch_float(float* out, int capacity, int& actual,
+                         common::ByteStream& in) override {
+        // Reuse SIMD batch decode for int32, then bit-cast to float
+        int32_t* buf = reinterpret_cast<int32_t*>(out);
+        int ret = TS2DIFFDecoder<int32_t>::read_batch_int32(buf, capacity,
+                                                            actual, in);
+        if (ret != common::E_OK) return ret;
+        for (int i = 0; i < actual; ++i) {
+            out[i] = common::int_to_float(buf[i]);
+        }
+        return common::E_OK;
+    }
 
    private:
     bool is_legacy_raw_{false};
@@ -306,11 +938,24 @@ class DoubleTS2DIFFDecoder : public TS2DIFFDecoder<int64_t> {
         return common::long_to_double(value_long);
     }
 
-    int read_boolean(bool& ret_value, common::ByteStream& in);
-    int read_int32(int32_t& ret_value, common::ByteStream& in);
-    int read_int64(int64_t& ret_value, common::ByteStream& in);
-    int read_float(float& ret_value, common::ByteStream& in);
-    int read_double(double& ret_value, common::ByteStream& in);
+    int read_boolean(bool& ret_value, common::ByteStream& in) override;
+    int read_int32(int32_t& ret_value, common::ByteStream& in) override;
+    int read_int64(int64_t& ret_value, common::ByteStream& in) override;
+    int read_float(float& ret_value, common::ByteStream& in) override;
+    int read_double(double& ret_value, common::ByteStream& in) override;
+
+    int read_batch_double(double* out, int capacity, int& actual,
+                          common::ByteStream& in) override {
+        // Reuse SIMD batch decode for int64, then bit-cast to double
+        int64_t* buf = reinterpret_cast<int64_t*>(out);
+        int ret = TS2DIFFDecoder<int64_t>::read_batch_int64(buf, capacity,
+                                                            actual, in);
+        if (ret != common::E_OK) return ret;
+        for (int i = 0; i < actual; ++i) {
+            out[i] = common::long_to_double(buf[i]);
+        }
+        return common::E_OK;
+    }
 
    private:
     bool is_legacy_raw_{false};
diff --git a/cpp/src/encoding/ts2diff_encoder.h b/cpp/src/encoding/ts2diff_encoder.h
index d1ab43bfd..fc494581a 100644
--- a/cpp/src/encoding/ts2diff_encoder.h
+++ b/cpp/src/encoding/ts2diff_encoder.h
@@ -29,12 +29,9 @@
 #include "common/allocator/alloc_base.h"
 #include "common/allocator/byte_stream.h"
 #include "encoder.h"
-#if defined(__SSE4_2__)
-#include <smmintrin.h>
-#define USE_SSE 1
-#elif defined(__AVX2__)
-#include <immintrin.h>
-#define USE_AVX2 1
+
+#ifdef ENABLE_SIMD
+#include "simde/x86/avx2.h"
 #endif
 
 namespace storage {
@@ -44,15 +41,16 @@ struct SIMDOps;
 
 template <>
 struct SIMDOps<int32_t> {
-#ifdef USE_SSE
+#ifdef ENABLE_SIMD
     static void rebase(int32_t* arr, int32_t min_val, size_t size) {
-        const __m128i min_vec = _mm_set1_epi32(min_val);
+        const simde__m128i min_vec = simde_mm_set1_epi32(min_val);
         size_t i = 0;
         for (; i + 3 < size; i += 4) {
-            __m128i vec =
-                _mm_loadu_si128(reinterpret_cast<const __m128i*>(arr + i));
-            vec = _mm_sub_epi32(vec, min_vec);
-            _mm_storeu_si128(reinterpret_cast<__m128i*>(arr + i), vec);
+            simde__m128i vec = simde_mm_loadu_si128(
+                reinterpret_cast<const simde__m128i*>(arr + i));
+            vec = simde_mm_sub_epi32(vec, min_vec);
+            simde_mm_storeu_si128(reinterpret_cast<simde__m128i*>(arr + i),
+                                  vec);
         }
         for (; i < size; ++i) {
             arr[i] -= min_val;
@@ -69,15 +67,16 @@ struct SIMDOps<int32_t> {
 
 template <>
 struct SIMDOps<int64_t> {
-#ifdef USE_AVX2
+#ifdef ENABLE_SIMD
     static void rebase(int64_t* arr, int64_t min_val, size_t size) {
-        const __m256i min_vec = _mm256_set1_epi64x(min_val);
+        const simde__m256i min_vec = simde_mm256_set1_epi64x(min_val);
         size_t i = 0;
         for (; i + 3 < size; i += 4) {
-            __m256i vec =
-                _mm256_loadu_si256(reinterpret_cast<const __m256i*>(arr + i));
-            vec = _mm256_sub_epi64(vec, min_vec);
-            _mm256_storeu_si256(reinterpret_cast<__m256i*>(arr + i), vec);
+            simde__m256i vec = simde_mm256_loadu_si256(
+                reinterpret_cast<const simde__m256i*>(arr + i));
+            vec = simde_mm256_sub_epi64(vec, min_vec);
+            simde_mm256_storeu_si256(reinterpret_cast<simde__m256i*>(arr + i),
+                                     vec);
         }
         for (; i < size; ++i) {
             arr[i] -= min_val;
@@ -99,7 +98,7 @@ class TS2DIFFEncoder : public Encoder {
 
     ~TS2DIFFEncoder() { destroy(); }
 
-    void reset() { write_index_ = -1; }
+    void reset() override { write_index_ = -1; }
 
     void init() {
         block_size_ = 128;
@@ -115,7 +114,7 @@ class TS2DIFFEncoder : public Encoder {
         previous_value_ = 0;
     }
 
-    void destroy() {
+    void destroy() override {
         if (delta_arr_ != nullptr) {
             common::mem_free(delta_arr_);
             delta_arr_ = nullptr;
@@ -167,17 +166,71 @@ class TS2DIFFEncoder : public Encoder {
         return bit_width;
     }
 
+    // Batch bit-pack `count` values (each `bit_width` bits, MSB-first within
+    // byte) into a single contiguous buffer and write it to out_stream in one
+    // call. Avoids the per-byte write_buf overhead of the scalar write_bits
+    // loop.
+    //
+    // Result codes:
+    //   E_OK  → written successfully.
+    //   -1    → caller must fall back to write_bits + flush_remaining because
+    //           bit_width exceeds the safe accumulator width.
+    //   any other non-zero value → real write_buf error; the caller must
+    //           propagate it instead of treating the flush as successful.
+    template <typename U>
+    static int pack_bits_msb(const U* values, int count, int bit_width,
+                             common::ByteStream& out_stream) {
+        if (count <= 0 || bit_width <= 0) return common::E_OK;
+        if (bit_width > 56) return -1;  // fall back
+
+        size_t total_bytes = ((size_t)count * (size_t)bit_width + 7) / 8;
+        std::vector<uint8_t> buf(total_bytes, 0);
+
+        uint64_t accum = 0;
+        int bits_in_accum = 0;
+        size_t pos = 0;
+        const uint64_t mask = (1ULL << bit_width) - 1;
+
+        for (int i = 0; i < count; i++) {
+            uint64_t v = static_cast<uint64_t>(values[i]) & mask;
+            accum = (accum << bit_width) | v;
+            bits_in_accum += bit_width;
+            while (bits_in_accum >= 8) {
+                buf[pos++] = static_cast<uint8_t>(accum >> (bits_in_accum - 8));
+                bits_in_accum -= 8;
+            }
+            if (bits_in_accum > 0) {
+                accum &= ((1ULL << bits_in_accum) - 1);
+            } else {
+                accum = 0;
+            }
+        }
+        if (bits_in_accum > 0) {
+            buf[pos++] = static_cast<uint8_t>(accum << (8 - bits_in_accum));
+        }
+        // Surface write failures.  Previously the return code was dropped on
+        // the floor and flush() returned E_OK, then reset() wiped the
+        // encoder state — the on-disk page ended up missing its delta block
+        // but the caller thought the data was safe.
+        return out_stream.write_buf(buf.data(), pos);
+    }
+
     int do_encode(T value, common::ByteStream& out_stream);
-    int encode(bool value, common::ByteStream& out_stream);
-    int encode(int32_t value, common::ByteStream& out_stream);
-    int encode(int64_t value, common::ByteStream& out_stream);
-    int encode(float value, common::ByteStream& out_stream);
-    int encode(double value, common::ByteStream& out_stream);
-    int encode(common::String value, common::ByteStream& out_stream);
+    int encode(bool value, common::ByteStream& out_stream) override;
+    int encode(int32_t value, common::ByteStream& out_stream) override;
+    int encode(int64_t value, common::ByteStream& out_stream) override;
+    int encode(float value, common::ByteStream& out_stream) override;
+    int encode(double value, common::ByteStream& out_stream) override;
+    int encode(common::String value, common::ByteStream& out_stream) override;
+
+    int encode_batch(const int32_t* values, uint32_t count,
+                     common::ByteStream& out_stream) override;
+    int encode_batch(const int64_t* values, uint32_t count,
+                     common::ByteStream& out_stream) override;
 
-    int flush(common::ByteStream& out_stream);
+    int flush(common::ByteStream& out_stream) override;
 
-    int get_max_byte_size() {
+    int get_max_byte_size() override {
         // The meaning of 24 is: index(4)+width(4)+minDeltaBase(8)+firstValue(8)
         return 24 + write_index_ * 8;
     }
@@ -235,16 +288,39 @@ inline int TS2DIFFEncoder<int32_t>::flush(common::ByteStream& out_stream) {
     SIMDOps<int32_t>::rebase(delta_arr_, delta_arr_min_, write_index_);
     // Calculate the bit length of each value to writer
     int bit_width = cal_bit_width(delta_arr_max_ - delta_arr_min_);
-    // writer header
-    common::SerializationUtil::write_ui32(write_index_, out_stream);
-    common::SerializationUtil::write_ui32(bit_width, out_stream);
-    common::SerializationUtil::write_ui32(delta_arr_min_, out_stream);
-    common::SerializationUtil::write_ui32(first_value_, out_stream);
-    // writer data
-    for (int i = 0; i < write_index_; i++) {
-        write_bits(delta_arr_[i], bit_width, out_stream);
+    // Header writes can fail too (back-pressure / OOM on the underlying
+    // stream); a half-written header followed by reset() leaves the page
+    // corrupted but the caller thinking the data was flushed.
+    if (RET_FAIL(
+            common::SerializationUtil::write_ui32(write_index_, out_stream))) {
+        return ret;
+    }
+    if (RET_FAIL(
+            common::SerializationUtil::write_ui32(bit_width, out_stream))) {
+        return ret;
+    }
+    if (RET_FAIL(common::SerializationUtil::write_ui32(delta_arr_min_,
+                                                       out_stream))) {
+        return ret;
+    }
+    if (RET_FAIL(
+            common::SerializationUtil::write_ui32(first_value_, out_stream))) {
+        return ret;
+    }
+    // writer data — batched bit-pack + single write_buf for the common case;
+    // fall back to per-bit path for the rare wide bit_width.
+    const int pack_ret =
+        pack_bits_msb(delta_arr_, write_index_, bit_width, out_stream);
+    if (pack_ret == -1) {
+        for (int i = 0; i < write_index_; i++) {
+            write_bits(delta_arr_[i], bit_width, out_stream);
+        }
+        flush_remaining(out_stream);
+    } else if (pack_ret != common::E_OK) {
+        // Real write failure — don't clear encoder state so the higher
+        // layer can detect the page is poisoned.
+        return pack_ret;
     }
-    flush_remaining(out_stream);
     reset();
     return ret;
 }
@@ -259,20 +335,222 @@ inline int TS2DIFFEncoder<int64_t>::flush(common::ByteStream& out_stream) {
     SIMDOps<int64_t>::rebase(delta_arr_, delta_arr_min_, write_index_);
     // Calculate the bit length of each value to writer
     int bit_width = cal_bit_width(delta_arr_max_ - delta_arr_min_);
-    // writer header
-    common::SerializationUtil::write_i32(write_index_, out_stream);
-    common::SerializationUtil::write_i32(bit_width, out_stream);
-    common::SerializationUtil::write_i64(delta_arr_min_, out_stream);
-    common::SerializationUtil::write_i64(first_value_, out_stream);
-    // writer data
-    for (int i = 0; i < write_index_; i++) {
-        write_bits(delta_arr_[i], bit_width, out_stream);
+    // Header writes can fail too — see int32 specialization for rationale.
+    if (RET_FAIL(
+            common::SerializationUtil::write_i32(write_index_, out_stream))) {
+        return ret;
+    }
+    if (RET_FAIL(common::SerializationUtil::write_i32(bit_width, out_stream))) {
+        return ret;
+    }
+    if (RET_FAIL(
+            common::SerializationUtil::write_i64(delta_arr_min_, out_stream))) {
+        return ret;
+    }
+    if (RET_FAIL(
+            common::SerializationUtil::write_i64(first_value_, out_stream))) {
+        return ret;
+    }
+    // writer data — batched bit-pack + single write_buf for the common case;
+    // fall back to per-bit path for the rare wide bit_width (>56).
+    const int pack_ret =
+        pack_bits_msb(delta_arr_, write_index_, bit_width, out_stream);
+    if (pack_ret == -1) {
+        for (int i = 0; i < write_index_; i++) {
+            write_bits(delta_arr_[i], bit_width, out_stream);
+        }
+        flush_remaining(out_stream);
+    } else if (pack_ret != common::E_OK) {
+        return pack_ret;
     }
-    flush_remaining(out_stream);
     reset();  // 语义，writeIndex=-1;
     return ret;
 }
 
+// ============================================================================
+// Batch encode: INT32
+// Adjacent-difference removes sequential dependency; SIMD for delta + min/max.
+// ============================================================================
+
+template <>
+inline int TS2DIFFEncoder<int32_t>::encode_batch(
+    const int32_t* values, uint32_t count, common::ByteStream& out_stream) {
+    int ret = common::E_OK;
+    uint32_t offset = 0;
+
+    while (offset < count) {
+        // Start of new block: store first_value
+        if (write_index_ == -1) {
+            first_value_ = values[offset];
+            previous_value_ = first_value_;
+            write_index_ = 0;
+            offset++;
+            continue;
+        }
+
+        // How many deltas fit in current block
+        uint32_t space = static_cast<uint32_t>(block_size_) - write_index_;
+        uint32_t batch = std::min(count - offset, space);
+
+        // ── Adjacent difference: delta[i] = values[i] - values[i-1] ──
+        // First delta uses previous_value_
+        delta_arr_[write_index_] = values[offset] - previous_value_;
+
+        uint32_t i = 1;
+#ifdef ENABLE_SIMD
+        // SIMD: 4 adjacent differences at a time
+        for (; i + 3 < batch; i += 4) {
+            simde__m128i cur = simde_mm_loadu_si128(
+                reinterpret_cast<const simde__m128i*>(values + offset + i));
+            simde__m128i prv = simde_mm_loadu_si128(
+                reinterpret_cast<const simde__m128i*>(values + offset + i - 1));
+            simde__m128i diff = simde_mm_sub_epi32(cur, prv);
+            simde_mm_storeu_si128(
+                reinterpret_cast<simde__m128i*>(delta_arr_ + write_index_ + i),
+                diff);
+        }
+#endif
+        for (; i < batch; i++) {
+            delta_arr_[write_index_ + i] =
+                values[offset + i] - values[offset + i - 1];
+        }
+        previous_value_ = values[offset + batch - 1];
+
+        // ── Min/max of new deltas ──
+        int32_t local_min = delta_arr_[write_index_];
+        int32_t local_max = delta_arr_[write_index_];
+
+        uint32_t j = 1;
+#ifdef ENABLE_SIMD
+        if (batch >= 5) {
+            simde__m128i vmin = simde_mm_set1_epi32(local_min);
+            simde__m128i vmax = vmin;
+            for (; j + 3 < batch; j += 4) {
+                simde__m128i v =
+                    simde_mm_loadu_si128(reinterpret_cast<const simde__m128i*>(
+                        delta_arr_ + write_index_ + j));
+                vmin = simde_mm_min_epi32(vmin, v);
+                vmax = simde_mm_max_epi32(vmax, v);
+            }
+            // Horizontal reduce
+            int32_t tmp[4];
+            simde_mm_storeu_si128(reinterpret_cast<simde__m128i*>(tmp), vmin);
+            for (int k = 0; k < 4; k++)
+                if (tmp[k] < local_min) local_min = tmp[k];
+            simde_mm_storeu_si128(reinterpret_cast<simde__m128i*>(tmp), vmax);
+            for (int k = 0; k < 4; k++)
+                if (tmp[k] > local_max) local_max = tmp[k];
+        }
+#endif
+        for (; j < batch; j++) {
+            int32_t d = delta_arr_[write_index_ + j];
+            if (d < local_min) local_min = d;
+            if (d > local_max) local_max = d;
+        }
+
+        // Merge with block min/max
+        if (write_index_ == 0) {
+            delta_arr_min_ = local_min;
+            delta_arr_max_ = local_max;
+        } else {
+            if (local_min < delta_arr_min_) delta_arr_min_ = local_min;
+            if (local_max > delta_arr_max_) delta_arr_max_ = local_max;
+        }
+
+        write_index_ += batch;
+        offset += batch;
+
+        if (write_index_ >= block_size_) {
+            if (RET_FAIL(flush(out_stream))) return ret;
+        }
+    }
+    return ret;
+}
+
+// ============================================================================
+// Batch encode: INT64
+// ============================================================================
+
+template <>
+inline int TS2DIFFEncoder<int64_t>::encode_batch(
+    const int64_t* values, uint32_t count, common::ByteStream& out_stream) {
+    int ret = common::E_OK;
+    uint32_t offset = 0;
+
+    while (offset < count) {
+        if (write_index_ == -1) {
+            first_value_ = values[offset];
+            previous_value_ = first_value_;
+            write_index_ = 0;
+            offset++;
+            continue;
+        }
+
+        uint32_t space = static_cast<uint32_t>(block_size_) - write_index_;
+        uint32_t batch = std::min(count - offset, space);
+
+        // Adjacent difference
+        delta_arr_[write_index_] = values[offset] - previous_value_;
+
+        uint32_t i = 1;
+#ifdef ENABLE_SIMD
+        // SIMD: 2 adjacent differences at a time (128-bit, native NEON)
+        for (; i + 1 < batch; i += 2) {
+            simde__m128i cur = simde_mm_loadu_si128(
+                reinterpret_cast<const simde__m128i*>(values + offset + i));
+            simde__m128i prv = simde_mm_loadu_si128(
+                reinterpret_cast<const simde__m128i*>(values + offset + i - 1));
+            simde__m128i diff = simde_mm_sub_epi64(cur, prv);
+            simde_mm_storeu_si128(
+                reinterpret_cast<simde__m128i*>(delta_arr_ + write_index_ + i),
+                diff);
+        }
+#endif
+        for (; i < batch; i++) {
+            delta_arr_[write_index_ + i] =
+                values[offset + i] - values[offset + i - 1];
+        }
+        previous_value_ = values[offset + batch - 1];
+
+        // Min/max (scalar — no efficient 64-bit SIMD min/max before AVX-512)
+        int64_t local_min = delta_arr_[write_index_];
+        int64_t local_max = delta_arr_[write_index_];
+        for (uint32_t j = 1; j < batch; j++) {
+            int64_t d = delta_arr_[write_index_ + j];
+            if (d < local_min) local_min = d;
+            if (d > local_max) local_max = d;
+        }
+
+        if (write_index_ == 0) {
+            delta_arr_min_ = local_min;
+            delta_arr_max_ = local_max;
+        } else {
+            if (local_min < delta_arr_min_) delta_arr_min_ = local_min;
+            if (local_max > delta_arr_max_) delta_arr_max_ = local_max;
+        }
+
+        write_index_ += batch;
+        offset += batch;
+
+        if (write_index_ >= block_size_) {
+            if (RET_FAIL(flush(out_stream))) return ret;
+        }
+    }
+    return ret;
+}
+
+// Default: unsupported types fall back to base class loop
+template <typename T>
+int TS2DIFFEncoder<T>::encode_batch(const int32_t* values, uint32_t count,
+                                    common::ByteStream& out) {
+    return Encoder::encode_batch(values, count, out);
+}
+template <typename T>
+int TS2DIFFEncoder<T>::encode_batch(const int64_t* values, uint32_t count,
+                                    common::ByteStream& out) {
+    return Encoder::encode_batch(values, count, out);
+}
+
 class FloatTS2DIFFEncoder : public TS2DIFFEncoder<int32_t> {
    public:
     FloatTS2DIFFEncoder() : max_point_number_(2), max_point_value_(100.0) {}
@@ -280,6 +558,14 @@ class FloatTS2DIFFEncoder : public TS2DIFFEncoder<int32_t> {
         int32_t value_int = convert_float_to_int(value);
         return TS2DIFFEncoder<int32_t>::do_encode(value_int, out_stream);
     }
+    // PageWriter resets the encoder between pages without going through a
+    // successful flush() (e.g. when the prior page was aborted).  The base
+    // reset() only clears write_index_; underflow_flags_ would otherwise
+    // leak the prior page's overflow markers into the next page's bitmap.
+    void reset() override {
+        TS2DIFFEncoder<int32_t>::reset();
+        underflow_flags_.clear();
+    }
     int flush(common::ByteStream& out_stream) override;
     int encode(bool value, common::ByteStream& out_stream);
     int encode(int32_t value, common::ByteStream& out_stream);
@@ -332,6 +618,12 @@ class DoubleTS2DIFFEncoder : public TS2DIFFEncoder<int64_t> {
         int64_t value_long = convert_double_to_long(value);
         return TS2DIFFEncoder<int64_t>::do_encode(value_long, out_stream);
     }
+    // See FloatTS2DIFFEncoder::reset for rationale — the prior page's
+    // overflow markers must not bleed into the next.
+    void reset() override {
+        TS2DIFFEncoder<int64_t>::reset();
+        underflow_flags_.clear();
+    }
     int flush(common::ByteStream& out_stream) override;
     int encode(bool value, common::ByteStream& out_stream);
     int encode(int32_t value, common::ByteStream& out_stream);
@@ -518,7 +810,6 @@ FORCE_INLINE int FloatTS2DIFFEncoder::flush(common::ByteStream& out_stream) {
         write_bits(delta_arr_[i], bit_width, inner);
     }
     flush_remaining(inner);
-    reset();
 
     const bool overflow = has_overflow();
     if (overflow) {
@@ -564,7 +855,12 @@ FORCE_INLINE int FloatTS2DIFFEncoder::flush(common::ByteStream& out_stream) {
     if (RET_FAIL(merge_byte_stream(out_stream, inner, true))) {
         return ret;
     }
+    // Defer encoder-state wipe until after every write into out_stream has
+    // committed.  An earlier reset() let a mid-flush failure leave
+    // write_index_ at -1, so the next flush() short-circuited at the top
+    // and the data was silently lost.
     underflow_flags_.clear();
+    TS2DIFFEncoder<int32_t>::reset();
     return ret;
 }
 
@@ -597,7 +893,6 @@ FORCE_INLINE int DoubleTS2DIFFEncoder::flush(common::ByteStream& out_stream) {
         write_bits(delta_arr_[i], bit_width, inner);
     }
     flush_remaining(inner);
-    reset();
 
     const bool overflow = has_overflow();
     if (overflow) {
@@ -643,7 +938,11 @@ FORCE_INLINE int DoubleTS2DIFFEncoder::flush(common::ByteStream& out_stream) {
     if (RET_FAIL(merge_byte_stream(out_stream, inner, true))) {
         return ret;
     }
+    // Same deferred-reset rationale as FloatTS2DIFFEncoder::flush — keeping
+    // write_index_ live until every committed write succeeds avoids the
+    // "next flush returns E_OK on lost data" pattern.
     underflow_flags_.clear();
+    TS2DIFFEncoder<int64_t>::reset();
     return ret;
 }
 
diff --git a/cpp/src/file/read_file.cc b/cpp/src/file/read_file.cc
index d9902ddb9..c6bfd547a 100644
--- a/cpp/src/file/read_file.cc
+++ b/cpp/src/file/read_file.cc
@@ -26,6 +26,7 @@
 #ifdef _WIN32
 #include <io.h>
 #include <windows.h>
+
 ssize_t pread(int fd, void* buf, size_t count, uint64_t offset);
 #else
 #include <unistd.h>
diff --git a/cpp/src/file/restorable_tsfile_io_writer.cc b/cpp/src/file/restorable_tsfile_io_writer.cc
index 22a3fb500..a1fc53402 100644
--- a/cpp/src/file/restorable_tsfile_io_writer.cc
+++ b/cpp/src/file/restorable_tsfile_io_writer.cc
@@ -328,12 +328,15 @@ static int recover_chunk_statistic(
     uint32_t value_buf_size = 0;
     std::vector<int64_t> time_decode_buf;
     const std::vector<int64_t>* times = nullptr;
-    std::vector<uint8_t> aligned_value_notnull_bitmap;
+    // For aligned pages, retain the per-row not-null bitmap so the stat-update
+    // loop can skip null positions and bind each decoded value to its real
+    // timestamp.  Without this we'd hand non-null values to times[0..N-1] and
+    // get wrong start/end/first/last stats on sparse columns.
+    const char* aligned_bitmap = nullptr;
     uint32_t aligned_num_values = 0;
-    const bool is_aligned_value_chunk =
-        (time_batch != nullptr && !time_batch->empty());
+    bool is_aligned_page = false;
 
-    if (is_aligned_value_chunk) {
+    if (time_batch != nullptr && !time_batch->empty()) {
         // Aligned value page: uncompressed layout = uint32(num_values) + bitmap
         // + value_buf
         if (uncompressed_size < 4) {
@@ -341,7 +344,7 @@ static int recover_chunk_statistic(
             CompressorFactory::free(compressor);
             return E_OK;
         }
-        aligned_num_values =
+        uint32_t num_values =
             (static_cast<uint32_t>(
                  static_cast<unsigned char>(uncompressed_buf[0]))
              << 24) |
@@ -353,20 +356,19 @@ static int recover_chunk_statistic(
              << 8) |
             (static_cast<uint32_t>(
                 static_cast<unsigned char>(uncompressed_buf[3])));
-        uint32_t bitmap_size = (aligned_num_values + 7) / 8;
+        uint32_t bitmap_size = (num_values + 7) / 8;
         if (uncompressed_size < 4 + bitmap_size) {
             compressor->after_uncompress(uncompressed_buf);
             CompressorFactory::free(compressor);
             return E_OK;
         }
-        aligned_value_notnull_bitmap.resize(bitmap_size);
-        if (bitmap_size > 0) {
-            std::memcpy(aligned_value_notnull_bitmap.data(),
-                        uncompressed_buf + 4, bitmap_size);
-        }
         value_buf = uncompressed_buf + 4 + bitmap_size;
         value_buf_size = uncompressed_size - 4 - bitmap_size;
         times = time_batch;
+        aligned_bitmap = uncompressed_buf + 4;
+        aligned_num_values = std::min<uint32_t>(
+            num_values, static_cast<uint32_t>(time_batch->size()));
+        is_aligned_page = true;
     } else {
         // Non-aligned value page: var_uint(time_buf_size) + time_buf +
         // value_buf
@@ -419,25 +421,25 @@ static int recover_chunk_statistic(
     value_decoder->reset();
     size_t idx = 0;
     const size_t num_times = times->size();
-    while (idx < num_times) {
-        int64_t t = (*times)[idx];
-        bool has_value = true;
-        if (is_aligned_value_chunk) {
-            has_value = false;
-            const uint32_t byte_idx = static_cast<uint32_t>(idx / 8);
-            const uint32_t bit_shift = static_cast<uint32_t>(idx % 8);
-            if (byte_idx < aligned_value_notnull_bitmap.size()) {
-                has_value = ((aligned_value_notnull_bitmap[byte_idx] & 0xFF) &
-                             (0x80 >> bit_shift)) != 0;
-            }
-        }
-        if (!has_value) {
+    // For aligned pages the value stream only stores non-null rows; advance
+    // `idx` past null bitmap entries so each decoded value pairs with the
+    // matching timestamp. Non-aligned pages have no bitmap (every row is
+    // present), so we keep the dense walk.
+    auto bitmap_is_valid = [&](size_t row) -> bool {
+        if (!is_aligned_page) return true;
+        if (row >= aligned_num_values) return false;
+        // Aligned value-page bitmap: MSB-first within each byte, bit set
+        // means the row is NOT null.
+        unsigned char byte =
+            static_cast<unsigned char>(aligned_bitmap[row / 8]);
+        return (byte & static_cast<unsigned char>(0x80 >> (row % 8))) != 0;
+    };
+    while (idx < num_times && value_decoder->has_remaining(value_in)) {
+        if (!bitmap_is_valid(idx)) {
             idx++;
             continue;
         }
-        if (!value_decoder->has_remaining(value_in)) {
-            break;
-        }
+        int64_t t = (*times)[idx];
         switch (chdr.data_type_) {
             case common::BOOLEAN: {
                 bool v;
@@ -518,6 +520,12 @@ void RestorableTsFileIOWriter::close() {
         write_file_ = nullptr;
         write_file_owned_ = false;
     }
+    // Run the base writer's cleanup (frees post-recovery appended chunk
+    // metadata) before tearing down self_check_arena_ that backs the
+    // recovered ChunkGroupMeta entries.  Base destroy() only touches entries
+    // it allocated itself (tracked in appended_chunk_metas_ /
+    // appended_chunk_group_metas_), so it never dereferences self_check
+    // arena memory.
     TsFileIOWriter::destroy();
     for (ChunkGroupMeta* cgm : self_check_recovered_cgm_) {
         cgm->device_id_.reset();
@@ -842,15 +850,13 @@ int RestorableTsFileIOWriter::self_check(bool truncate_corrupted) {
         }
     }
 
-    // --- Attach recovered ChunkGroupMeta to writer; record per-CGM prefix
-    // length so destroy() can free stats appended later. ---
-    recovery_chunk_meta_prefix_.clear();
+    // Attach recovered ChunkGroupMeta entries to the base writer.  These
+    // live in self_check_arena_ and are *not* tracked in
+    // appended_chunk_group_metas_ — base destroy() leaves them alone, and
+    // close() resets their device_id_ refs before tearing down the arena.
     for (ChunkGroupMeta* cgm : recovered_cgm_list) {
-        recovery_chunk_meta_prefix_[cgm] =
-            static_cast<uint32_t>(cgm->chunk_meta_list_.size());
         push_chunk_group_meta(cgm);
     }
-    chunk_group_meta_from_recovery_ = true;
 
     return E_OK;
 }
diff --git a/cpp/src/file/tsfile_io_reader.cc b/cpp/src/file/tsfile_io_reader.cc
index 296556c15..014e78832 100644
--- a/cpp/src/file/tsfile_io_reader.cc
+++ b/cpp/src/file/tsfile_io_reader.cc
@@ -51,6 +51,8 @@ void TsFileIOReader::reset() {
         }
         read_file_ = nullptr;
         tsfile_meta_page_arena_.destroy();
+        device_node_cache_.clear();
+        device_node_cache_pa_.destroy();
         tsfile_meta_ready_ = false;
     }
 }
@@ -61,6 +63,9 @@ int TsFileIOReader::alloc_ssi(std::shared_ptr<IDeviceID> device_id,
                               common::PageArena& pa, Filter* time_filter) {
     int ret = E_OK;
     if (RET_FAIL(load_tsfile_meta_if_necessary())) {
+    } else if (!bloom_filter_contains(device_id->get_device_name(),
+                                      measurement_name)) {
+        return E_NO_MORE_DATA;
     } else {
         ssi = new TsFileSeriesScanIterator;
         ssi->init(device_id, measurement_name, read_file_, time_filter, pa);
@@ -80,6 +85,95 @@ int TsFileIOReader::alloc_ssi(std::shared_ptr<IDeviceID> device_id,
     return ret;
 }
 
+int TsFileIOReader::alloc_multi_ssi(
+    std::shared_ptr<IDeviceID> device_id,
+    const std::vector<std::string>& measurement_names,
+    TsFileSeriesScanIterator*& ssi, common::PageArena& pa,
+    Filter* time_filter) {
+    int ret = E_OK;
+    if (RET_FAIL(load_tsfile_meta_if_necessary())) return ret;
+
+    ssi = new TsFileSeriesScanIterator;
+    ssi->init(device_id, measurement_names.empty() ? "" : measurement_names[0],
+              read_file_, time_filter, pa);
+
+    auto& ssi_pa = ssi->timeseries_index_pa_;
+
+    // Use cached device measurement node (avoids repeated file I/O)
+    CachedDeviceNode cached;
+    if (RET_FAIL(get_cached_device_node(device_id, ssi_pa, cached))) {
+        delete ssi;
+        ssi = nullptr;
+        return ret;
+    }
+    auto top_node = cached.top_node;
+    if (!cached.is_aligned) {
+        delete ssi;
+        ssi = nullptr;
+        return E_NOT_SUPPORT;
+    }
+
+    // Get time column metadata
+    TimeseriesIndex* time_ts_idx = nullptr;
+    if (RET_FAIL(get_time_column_metadata(top_node, time_ts_idx, ssi_pa))) {
+        delete ssi;
+        ssi = nullptr;
+        return ret;
+    }
+
+    // Create MultiAlignedTimeseriesIndex
+    void* multi_buf = ssi_pa.alloc(sizeof(MultiAlignedTimeseriesIndex));
+    if (IS_NULL(multi_buf)) {
+        delete ssi;
+        ssi = nullptr;
+        return E_OOM;
+    }
+    auto* multi_idx = new (multi_buf) MultiAlignedTimeseriesIndex;
+    multi_idx->time_ts_idx_ = time_ts_idx;
+
+    // Load each measurement's TimeseriesIndex
+    for (const auto& meas_name : measurement_names) {
+        std::shared_ptr<IMetaIndexEntry> meas_entry;
+        int64_t meas_end_offset = 0;
+        if (RET_FAIL(load_measurement_index_entry(
+                meas_name, top_node, meas_entry, meas_end_offset))) {
+            // Measurement not found — abort multi path
+            delete ssi;
+            ssi = nullptr;
+            return ret;
+        }
+
+        ITimeseriesIndex* ts_idx = nullptr;
+        if (RET_FAIL(do_load_timeseries_index(
+                meas_name, meas_entry->get_offset(), meas_end_offset, ssi_pa,
+                ts_idx, /*is_aligned=*/true))) {
+            delete ssi;
+            ssi = nullptr;
+            return ret;
+        }
+
+        auto* aligned_idx = dynamic_cast<AlignedTimeseriesIndex*>(ts_idx);
+        if (aligned_idx && aligned_idx->value_ts_idx_) {
+            multi_idx->value_ts_idxs_.push_back(aligned_idx->value_ts_idx_);
+        } else {
+            delete ssi;
+            ssi = nullptr;
+            return E_NOT_EXIST;
+        }
+    }
+
+    ssi->itimeseries_index_ = multi_idx;
+
+    // Skip global statistic filter for multi — per-chunk filtering still works.
+
+    if (RET_FAIL(ssi->init_chunk_reader())) {
+        ssi->destroy();
+        delete ssi;
+        ssi = nullptr;
+    }
+    return ret;
+}
+
 void TsFileIOReader::revert_ssi(TsFileSeriesScanIterator* ssi) {
     if (ssi != nullptr) {
         ssi->destroy();
@@ -96,61 +190,14 @@ int TsFileIOReader::get_device_timeseries_meta_without_chunk_meta(
     int64_t end_offset;
     std::vector<std::pair<std::shared_ptr<IMetaIndexEntry>, int64_t>>
         meta_index_entry_list;
-    std::shared_ptr<MetaIndexNode> top_node;
-    bool is_aligned = false;
-    TimeseriesIndex* time_timeseries_index = nullptr;
     if (RET_FAIL(load_device_index_entry(
             std::make_shared<DeviceIDComparable>(device_id), meta_index_entry,
             end_offset))) {
-    } else {
-        int64_t start_offset = meta_index_entry->get_offset();
-        ASSERT(start_offset < end_offset);
-        const int32_t read_size = end_offset - start_offset;
-        int32_t ret_read_len = 0;
-        char* data_buf = (char*)pa.alloc(read_size);
-        void* m_idx_node_buf = pa.alloc(sizeof(MetaIndexNode));
-        if (IS_NULL(data_buf) || IS_NULL(m_idx_node_buf)) {
-            return E_OOM;
-        }
-        auto* top_node_ptr = new (m_idx_node_buf) MetaIndexNode(&pa);
-        top_node = std::shared_ptr<MetaIndexNode>(top_node_ptr,
-                                                  MetaIndexNode::self_deleter);
-        if (RET_FAIL(read_file_->read(start_offset, data_buf, read_size,
-                                      ret_read_len))) {
-        } else if (RET_FAIL(top_node->deserialize_from(data_buf, read_size))) {
-        } else {
-            is_aligned = is_aligned_device(top_node);
-            if (is_aligned) {
-                if (RET_FAIL(get_time_column_metadata(
-                        top_node, time_timeseries_index, pa))) {
-                    return ret;
-                }
-            }
-        }
-    }
-    if (RET_FAIL(ret)) {
-        return ret;
-    }
-    if (RET_FAIL(load_all_measurement_index_entry(
-            meta_index_entry->get_offset(), end_offset, pa,
-            meta_index_entry_list))) {
+    } else if (RET_FAIL(load_all_measurement_index_entry(
+                   meta_index_entry->get_offset(), end_offset, pa,
+                   meta_index_entry_list))) {
     } else if (RET_FAIL(do_load_all_timeseries_index(meta_index_entry_list, pa,
                                                      timeseries_indexs))) {
-    } else if (is_aligned && time_timeseries_index != nullptr) {
-        for (size_t i = 0; i < timeseries_indexs.size(); i++) {
-            void* buf = pa.alloc(sizeof(AlignedTimeseriesIndex));
-            if (IS_NULL(buf)) {
-                return E_OOM;
-            }
-            auto* aligned_ts_idx = new (buf) AlignedTimeseriesIndex;
-            aligned_ts_idx->time_ts_idx_ = time_timeseries_index;
-            aligned_ts_idx->value_ts_idx_ =
-                dynamic_cast<TimeseriesIndex*>(timeseries_indexs[i]);
-            if (aligned_ts_idx->value_ts_idx_ == nullptr) {
-                return E_TYPE_NOT_MATCH;
-            }
-            timeseries_indexs[i] = aligned_ts_idx;
-        }
     }
     return ret;
 }
@@ -225,6 +272,20 @@ bool TsFileIOReader::filter_stasify(ITimeseriesIndex* ts_index,
     return time_filter->satisfy(ts_index->get_statistic());
 }
 
+bool TsFileIOReader::bloom_filter_contains(
+    const std::string& device_name, const std::string& measurement_name) {
+    BloomFilter* bf = tsfile_meta_.bloom_filter_;
+    if (bf == nullptr || bf->is_empty()) {
+        return true;  // no bloom filter — assume present
+    }
+    common::String dev_str, meas_str;
+    dev_str.buf_ = const_cast<char*>(device_name.c_str());
+    dev_str.len_ = static_cast<uint32_t>(device_name.size());
+    meas_str.buf_ = const_cast<char*>(measurement_name.c_str());
+    meas_str.len_ = static_cast<uint32_t>(measurement_name.size());
+    return bf->contains(dev_str, meas_str);
+}
+
 int TsFileIOReader::load_tsfile_meta_if_necessary() {
     int ret = E_OK;
     if (!tsfile_meta_ready_) {
@@ -323,44 +384,111 @@ int TsFileIOReader::load_tsfile_meta() {
     return ret;
 }
 
-int TsFileIOReader::load_timeseries_index_for_ssi(
-    std::shared_ptr<IDeviceID> device_id, const std::string& measurement_name,
-    TsFileSeriesScanIterator*& ssi) {
+int TsFileIOReader::get_cached_device_node(std::shared_ptr<IDeviceID> device_id,
+                                           common::PageArena& pa,
+                                           CachedDeviceNode& out) {
+    std::string dev_name = device_id->get_device_name();
+
+    {
+        std::lock_guard<std::mutex> lk(device_node_cache_mu_);
+        auto it = device_node_cache_.find(dev_name);
+        if (it != device_node_cache_.end()) {
+            out = it->second;
+            return E_OK;
+        }
+    }
+
+    // Read the device meta index outside the lock — load_device_index_entry()
+    // and the file read can block on I/O, and we don't want to serialize all
+    // concurrent first-time lookups behind one slow disk fetch.  Two callers
+    // racing on the same missing device may both do the read; that's wasted
+    // work but not corruption — the second insert is dropped below.
     int ret = E_OK;
     std::shared_ptr<IMetaIndexEntry> device_index_entry;
     int64_t device_ie_end_offset = 0;
-    std::shared_ptr<IMetaIndexEntry> measurement_index_entry;
-    int64_t measurement_ie_end_offset = 0;
-    // bool is_aligned = false;
     if (RET_FAIL(load_device_index_entry(
             std::make_shared<DeviceIDComparable>(device_id), device_index_entry,
             device_ie_end_offset))) {
         return ret;
     }
-    auto& pa = ssi->timeseries_index_pa_;
 
     int64_t start_offset = device_index_entry->get_offset(),
             end_offset = device_ie_end_offset;
     ASSERT(start_offset < end_offset);
-    const int32_t read_size = end_offset - start_offset;
+    const int64_t read_size_i64 = end_offset - start_offset;
+    // read_file_->read() takes int32_t; a meta index node larger than 2 GiB
+    // is implausible but explicitly reject it instead of silently truncating
+    // the read length and corrupting the parse.
+    if (read_size_i64 <= 0 || read_size_i64 > INT32_MAX) {
+        return E_TSFILE_CORRUPTED;
+    }
+    const int32_t read_size = static_cast<int32_t>(read_size_i64);
     int32_t ret_read_len = 0;
-    char* data_buf = (char*)pa.alloc(read_size);
-    void* m_idx_node_buf = pa.alloc(sizeof(MetaIndexNode));
-    if (IS_NULL(data_buf) || IS_NULL(m_idx_node_buf)) {
+
+    // Read into a heap-owned buffer outside the lock.  The previous
+    // implementation allocated data_buf inside device_node_cache_pa_ before
+    // the read happened — every failed read or parse left that allocation
+    // pinned forever in the shared arena, and repeated disk errors on the
+    // same device let a long-lived reader grow it without bound.  Using a
+    // unique_ptr here means the read buffer is released on every failure
+    // path, and only the small MetaIndexNode allocations inside the lock
+    // share the arena.
+    std::unique_ptr<char[]> data_buf(new (std::nothrow) char[read_size]);
+    if (data_buf == nullptr) {
         return E_OOM;
     }
-    auto* top_node_ptr = new (m_idx_node_buf) MetaIndexNode(&pa);
-    auto top_node = std::shared_ptr<MetaIndexNode>(top_node_ptr,
-                                                   MetaIndexNode::self_deleter);
-
-    if (RET_FAIL(read_file_->read(start_offset, data_buf, read_size,
+    if (RET_FAIL(read_file_->read(start_offset, data_buf.get(), read_size,
                                   ret_read_len))) {
         return ret;
-    } else if (RET_FAIL(top_node->deserialize_from(data_buf, read_size))) {
+    }
+
+    CachedDeviceNode cached;
+    {
+        // Allocations into device_node_cache_pa_ and the map insert must be
+        // serialized — PageArena is not thread-safe, and unordered_map's
+        // rehash invalidates concurrent lookups.
+        std::lock_guard<std::mutex> lk(device_node_cache_mu_);
+        // Re-check: another thread may have populated the entry while we
+        // were doing I/O.
+        auto it = device_node_cache_.find(dev_name);
+        if (it != device_node_cache_.end()) {
+            out = it->second;
+            return E_OK;
+        }
+
+        void* m_idx_node_buf =
+            device_node_cache_pa_.alloc(sizeof(MetaIndexNode));
+        if (IS_NULL(m_idx_node_buf)) {
+            return E_OOM;
+        }
+        auto* top_node_ptr =
+            new (m_idx_node_buf) MetaIndexNode(&device_node_cache_pa_);
+        auto top_node = std::shared_ptr<MetaIndexNode>(
+            top_node_ptr, MetaIndexNode::self_deleter);
+        if (RET_FAIL(top_node->deserialize_from(data_buf.get(), read_size))) {
+            return ret;
+        }
+        cached.top_node = top_node;
+        cached.is_aligned = is_aligned_device(top_node);
+        device_node_cache_.emplace(std::move(dev_name), cached);
+    }
+    out = cached;
+    return E_OK;
+}
+
+int TsFileIOReader::load_timeseries_index_for_ssi(
+    std::shared_ptr<IDeviceID> device_id, const std::string& measurement_name,
+    TsFileSeriesScanIterator*& ssi) {
+    int ret = E_OK;
+    auto& pa = ssi->timeseries_index_pa_;
+
+    CachedDeviceNode cached;
+    if (RET_FAIL(get_cached_device_node(device_id, pa, cached))) {
         return ret;
     }
+    auto top_node = cached.top_node;
+    bool is_aligned = cached.is_aligned;
 
-    bool is_aligned = is_aligned_device(top_node);
     TimeseriesIndex* timeseries_index = nullptr;
     if (is_aligned) {
         if (RET_FAIL(
@@ -369,6 +497,8 @@ int TsFileIOReader::load_timeseries_index_for_ssi(
         }
     }
 
+    std::shared_ptr<IMetaIndexEntry> measurement_index_entry;
+    int64_t measurement_ie_end_offset = 0;
     if (RET_FAIL(load_measurement_index_entry(measurement_name, top_node,
                                               measurement_index_entry,
                                               measurement_ie_end_offset))) {
@@ -570,16 +700,30 @@ int TsFileIOReader::get_timeseries_indexes(
 
     int64_t idx = 0;
     for (const auto& measurement_name : measurement_names) {
-        if (RET_FAIL(load_measurement_index_entry(measurement_name, top_node,
-                                                  measurement_index_entry,
-                                                  measurement_ie_end_offset))) {
-        } else if (do_load_timeseries_index(
-                       measurement_name, measurement_index_entry->get_offset(),
-                       measurement_ie_end_offset, pa, timeseries_indexs[idx],
-                       is_aligned) == E_NOT_EXIST) {
+        timeseries_indexs[idx] = nullptr;
+        ret = load_measurement_index_entry(measurement_name, top_node,
+                                           measurement_index_entry,
+                                           measurement_ie_end_offset);
+        if (ret == E_MEASUREMENT_NOT_EXIST || ret == E_NOT_EXIST) {
+            ret = E_OK;
             idx++;
             continue;
         }
+        if (RET_FAIL(ret)) {
+            return ret;
+        }
+
+        ret = do_load_timeseries_index(
+            measurement_name, measurement_index_entry->get_offset(),
+            measurement_ie_end_offset, pa, timeseries_indexs[idx], is_aligned);
+        if (ret == E_NOT_EXIST) {
+            ret = E_OK;
+            idx++;
+            continue;
+        }
+        if (RET_FAIL(ret)) {
+            return ret;
+        }
         if (is_aligned) {
             AlignedTimeseriesIndex* aligned_timeseries_index =
                 dynamic_cast<AlignedTimeseriesIndex*>(timeseries_indexs[idx]);
@@ -677,6 +821,9 @@ int TsFileIOReader::search_from_internal_node(
 
 bool TsFileIOReader::is_aligned_device(
     std::shared_ptr<MetaIndexNode> measurement_node) {
+    if (measurement_node->children_.empty()) {
+        return false;
+    }
     auto entry = measurement_node->children_[0];
     return entry->get_name().is_null() ||
            entry->get_name().to_std_string() == "";
diff --git a/cpp/src/file/tsfile_io_reader.h b/cpp/src/file/tsfile_io_reader.h
index 85443326f..0073603fb 100644
--- a/cpp/src/file/tsfile_io_reader.h
+++ b/cpp/src/file/tsfile_io_reader.h
@@ -20,6 +20,8 @@
 #ifndef FILE_TSFILE_IO_REAER_H
 #define FILE_TSFILE_IO_REAER_H
 
+#include <mutex>
+#include <unordered_map>
 #include <unordered_set>
 
 #include "common/tsblock/tsblock.h"
@@ -46,6 +48,26 @@ class TsFileIOReader {
           tsfile_meta_ready_(false),
           read_file_created_(false) {
         tsfile_meta_page_arena_.init(512, common::MOD_TSFILE_READER);
+        device_node_cache_pa_.init(512, common::MOD_TSFILE_READER);
+    }
+
+    // Free only the ReadFile we own (created by init(const std::string&)).
+    // Without an explicit destructor that raw pointer leaks whenever a
+    // TsFileIOReader value goes out of scope without an explicit reset() (e.g.
+    // a stack instance in a test).  We deliberately do NOT call reset() here:
+    // reset() also runs tsfile_meta_page_arena_.destroy(), which would free the
+    // arena that tsfile_meta_ lives in *before* the implicit ~TsFileMeta member
+    // destructor runs, leaving its arena-allocated MetaIndexNode / shared_ptr
+    // graph dangling (use-after-free / crash).  The arenas and TsFileMeta clean
+    // themselves up correctly via member destruction order (tsfile_meta_ is
+    // destroyed before its backing arena).  An owner that already called
+    // reset() leaves read_file_ == nullptr, so this never double-frees.
+    ~TsFileIOReader() {
+        if (read_file_created_ && read_file_ != nullptr) {
+            read_file_->destroy();
+            delete read_file_;
+            read_file_ = nullptr;
+        }
     }
 
     int init(const std::string& file_path);
@@ -59,6 +81,11 @@ class TsFileIOReader {
                   TsFileSeriesScanIterator*& ssi, common::PageArena& pa,
                   Filter* time_filter = nullptr);
 
+    int alloc_multi_ssi(std::shared_ptr<IDeviceID> device_id,
+                        const std::vector<std::string>& measurement_names,
+                        TsFileSeriesScanIterator*& ssi, common::PageArena& pa,
+                        Filter* time_filter = nullptr);
+
     void revert_ssi(TsFileSeriesScanIterator* ssi);
 
     std::string get_file_path() const { return read_file_->file_path(); }
@@ -147,17 +174,40 @@ class TsFileIOReader {
 
     bool filter_stasify(ITimeseriesIndex* ts_index, Filter* time_filter);
 
+    bool bloom_filter_contains(const std::string& device_name,
+                               const std::string& measurement_name);
+
     int get_all_leaf(
         std::shared_ptr<MetaIndexNode> index_node,
         std::vector<std::pair<std::shared_ptr<IMetaIndexEntry>, int64_t>>&
             index_node_entry_list);
 
+    struct CachedDeviceNode {
+        std::shared_ptr<MetaIndexNode> top_node;
+        bool is_aligned;
+    };
+
+    // Returns E_OK on hit (out is filled), or an error code on miss / load
+    // failure (E_DEVICE_NOT_EXIST when the device is absent, the propagated
+    // error otherwise).  Copying into out keeps the caller safe from rehash /
+    // concurrent eviction of the cache map.
+    int get_cached_device_node(std::shared_ptr<IDeviceID> device_id,
+                               common::PageArena& pa, CachedDeviceNode& out);
+
    private:
     ReadFile* read_file_;
     common::PageArena tsfile_meta_page_arena_;
     TsFileMeta tsfile_meta_;
     bool tsfile_meta_ready_;
     bool read_file_created_;
+    // Cache: device_name → deserialized measurement MetaIndexNode.
+    // Guarded by device_node_cache_mu_ — multiple SSIs and Result Sets can
+    // hit the cache concurrently on the same reader, and an unsynchronized
+    // unordered_map insert would race with a parallel lookup (rehash,
+    // bucket-list rewrite) and with the underlying PageArena allocation.
+    common::PageArena device_node_cache_pa_;
+    std::unordered_map<std::string, CachedDeviceNode> device_node_cache_;
+    mutable std::mutex device_node_cache_mu_;
 };
 
 }  // end namespace storage
diff --git a/cpp/src/file/tsfile_io_writer.cc b/cpp/src/file/tsfile_io_writer.cc
index 42d99feda..71bb08a7e 100644
--- a/cpp/src/file/tsfile_io_writer.cc
+++ b/cpp/src/file/tsfile_io_writer.cc
@@ -21,6 +21,8 @@
 
 #include <fcntl.h>
 
+#include <chrono>
+#include <iomanip>
 #include <memory>
 
 #include "common/device_id.h"
@@ -40,14 +42,20 @@ namespace storage {
 #define OFFSET_DEBUG(msg) void(msg)
 #endif
 
+int64_t TsFileIOWriter::get_meta_size() const {
+    return meta_allocator_.get_total_used_bytes();
+}
+
 int TsFileIOWriter::init(WriteFile* write_file) {
     int ret = E_OK;
     const uint32_t page_size = 1024;
     meta_allocator_.init(page_size, MOD_TSFILE_WRITER_META);
     chunk_meta_count_ = 0;
-    recovery_chunk_meta_prefix_.clear();
-    destroyed_ = false;
     file_ = write_file;
+    // Re-arm destroy() for the new lifecycle.  Without this, a writer that
+    // was destroy()'d and then init()'d again would leak the fresh
+    // meta_allocator_/write_stream_/file_ on its next destroy().
+    destroyed_ = false;
     return ret;
 }
 
@@ -55,48 +63,37 @@ void TsFileIOWriter::destroy() {
     if (destroyed_) {
         return;
     }
-    // Recovery attaches a prefix of ChunkGroupMeta; device_id and chunk stats
-    // in that snapshot live in reader/recovery memory. After open, new chunks
-    // may be pushed into the same ChunkGroupMeta (same device); only those
-    // appended ChunkMeta need statistic_->destroy() (see
-    // recovery_chunk_meta_prefix_).
-    for (auto iter = chunk_group_meta_list_.begin();
-         iter != chunk_group_meta_list_.end(); iter++) {
-        ChunkGroupMeta* cgm = iter.get();
-        auto prefix_it = recovery_chunk_meta_prefix_.find(cgm);
-        const bool is_recovery_cgm =
-            chunk_group_meta_from_recovery_ && cgm != nullptr &&
-            prefix_it != recovery_chunk_meta_prefix_.end();
-        uint32_t recovered_cm_count = is_recovery_cgm ? prefix_it->second : 0;
-
-        if (!is_recovery_cgm) {
-            if (cgm != nullptr && cgm->device_id_) {
-                cgm->device_id_.reset();
-            }
-        }
-
-        if (cgm == nullptr) {
-            continue;
-        }
-        uint32_t cm_idx = 0;
-        for (auto chunk_meta = cgm->chunk_meta_list_.begin();
-             chunk_meta != cgm->chunk_meta_list_.end();
-             chunk_meta++, cm_idx++) {
-            if (chunk_meta.get() == nullptr ||
-                chunk_meta.get()->statistic_ == nullptr) {
-                continue;
-            }
-            if (is_recovery_cgm && cm_idx < recovered_cm_count) {
-                continue;
-            }
-            chunk_meta.get()->statistic_->destroy();
+    // Free heap-allocated PageArenas held by each appended statistic and
+    // drop shared_ptr refs on each appended CGM's device_id_.  Recovered
+    // entries from RestorableTsFileIOWriter live in self_check_arena_ and
+    // are not tracked here; the restorable writer cleans those up itself.
+    for (ChunkMeta* cm : appended_chunk_metas_) {
+        if (cm != nullptr && cm->statistic_ != nullptr) {
+            cm->statistic_->destroy();
         }
     }
-
-    if (cur_chunk_meta_ != nullptr && cur_chunk_meta_->statistic_ != nullptr) {
-        cur_chunk_meta_->statistic_->destroy();
-        cur_chunk_meta_ = nullptr;
+    appended_chunk_metas_.clear();
+    for (ChunkGroupMeta* cgm : appended_chunk_group_metas_) {
+        if (cgm != nullptr && cgm->device_id_) {
+            cgm->device_id_.reset();
+        }
     }
+    appended_chunk_group_metas_.clear();
+    // Drop every pointer that referenced meta_allocator_-owned memory before
+    // destroying the arena.  Without this, a reused writer (destroy() + a new
+    // init()) would still see the dangling CGM list/index/cur_* slots from
+    // the previous lifecycle and dereference freed nodes the next time
+    // start_flush_chunk_group() linear-scans the list.
+    chunk_group_meta_list_.clear();
+    chunk_group_meta_index_.clear();
+    cur_chunk_meta_ = nullptr;
+    cur_chunk_group_meta_ = nullptr;
+    cur_device_name_.reset();
+    chunk_meta_count_ = 0;
+    use_prev_alloc_cgm_ = false;
+    is_aligned_ = false;
+    file_base_offset_ = 0;
+    destroyed_ = true;
 
     meta_allocator_.destroy();
     write_stream_.destroy();
@@ -104,7 +101,6 @@ void TsFileIOWriter::destroy() {
         delete file_;
         file_ = nullptr;
     }
-    destroyed_ = true;
 }
 
 int TsFileIOWriter::start_file() {
@@ -145,6 +141,7 @@ int TsFileIOWriter::start_flush_chunk_group(
         } else {
             cur_chunk_group_meta_ = new (buf) ChunkGroupMeta(&meta_allocator_);
             cur_chunk_group_meta_->init(device_name);
+            appended_chunk_group_metas_.push_back(cur_chunk_group_meta_);
         }
     }
     return ret;
@@ -183,6 +180,7 @@ int TsFileIOWriter::start_flush_chunk(common::ByteStream& chunk_data,
         ret = cur_chunk_meta_->init(mname, data_type, cur_file_position(),
                                     chunk_statistic_copy, mask, encoding,
                                     compression, meta_allocator_);
+        appended_chunk_metas_.push_back(cur_chunk_meta_);
     }
 
     // Step 2. serialize chunk header to write_stream_
@@ -258,6 +256,8 @@ int TsFileIOWriter::end_flush_chunk_group(bool is_aligned) {
         cur_chunk_group_meta_ = nullptr;
         return common::E_OK;
     }
+    chunk_group_meta_index_[cur_device_name_->get_device_name()] =
+        cur_chunk_group_meta_;
     int ret = chunk_group_meta_list_.push_back(cur_chunk_group_meta_);
     cur_chunk_group_meta_ = nullptr;
     return ret;
@@ -269,17 +269,19 @@ int TsFileIOWriter::end_file() {
         return E_OK;
     }
     OFFSET_DEBUG("before end file");
+
     if (RET_FAIL(write_log_index_range())) {
         std::cout << "writer range index error, ret =" << ret << std::endl;
     } else if (RET_FAIL(write_file_index())) {
         std::cout << "writer file index error, ret = " << ret << std::endl;
     } else if (RET_FAIL(write_file_footer())) {
         std::cout << "writer file footer error, ret = " << ret << std::endl;
-    } else if (RET_FAIL(sync_file())) {
+    } else if (g_config_value_.sync_on_close_ && RET_FAIL(sync_file())) {
         std::cout << "sync file error, ret = " << ret << std::endl;
     } else if (RET_FAIL(close_file())) {
         std::cout << "close file error, ret = " << ret << std::endl;
     }
+
     return ret;
 }
 
diff --git a/cpp/src/file/tsfile_io_writer.h b/cpp/src/file/tsfile_io_writer.h
index 088e52f56..4904b924a 100644
--- a/cpp/src/file/tsfile_io_writer.h
+++ b/cpp/src/file/tsfile_io_writer.h
@@ -21,6 +21,7 @@
 #define FILE_TSFILE_IO_WRITER_H
 
 #include <map>
+#include <unordered_map>
 #include <vector>
 
 #include "common/allocator/page_arena.h"
@@ -108,6 +109,7 @@ class TsFileIOWriter {
 
     FORCE_INLINE std::string get_file_path() { return file_->get_file_path(); }
     FORCE_INLINE std::shared_ptr<Schema> get_schema() { return schema_; }
+    int64_t get_meta_size() const;
 
    private:
     int write_log_index_range();
@@ -191,13 +193,19 @@ class TsFileIOWriter {
     /** For RestorableTsFileIOWriter: append a recovered ChunkGroupMeta. */
     void push_chunk_group_meta(ChunkGroupMeta* cgm) {
         chunk_group_meta_list_.push_back(cgm);
+        if (cgm->device_id_) {
+            chunk_group_meta_index_[cgm->device_id_->get_device_name()] = cgm;
+        }
     }
-    /** True when chunk_group_meta_list_ has a prefix loaded from recovery;
-     * destroy() must not free device_id_/statistic_ for that prefix only. */
-    bool chunk_group_meta_from_recovery_ = false;
-    /** Recovered ChunkGroupMeta* -> chunk_meta_list_.size() at attach (pointer
-     * keys avoid idx skew). */
-    std::map<ChunkGroupMeta*, uint32_t> recovery_chunk_meta_prefix_;
+    /** Chunks/CGMs allocated from meta_allocator_ via start_flush_chunk*()
+     * (post-recovery for the restorable writer, all chunks for the normal
+     * writer).  destroy() iterates these directly to free the heap-allocated
+     * PageArena owned by each statistic and the shared_ptr<IDeviceID> held
+     * by each new CGM, without touching recovery-owned entries that live in
+     * RestorableTsFileIOWriter::self_check_arena_. */
+    std::vector<ChunkMeta*> appended_chunk_metas_;
+    std::vector<ChunkGroupMeta*> appended_chunk_group_metas_;
+    bool destroyed_ = false;
     /**
      * Recovery only: set file_base_offset_ so that cur_file_position() returns
      * correct absolute offsets.  After recovery the writer behaves as if the
@@ -214,6 +222,9 @@ class TsFileIOWriter {
     ChunkGroupMeta* cur_chunk_group_meta_;
     int32_t chunk_meta_count_;  // for debug
     common::SimpleList<ChunkGroupMeta*> chunk_group_meta_list_;
+    // O(1) lookup for existing ChunkGroupMeta by device name, avoiding the
+    // O(N) linear scan through chunk_group_meta_list_ per device.
+    std::unordered_map<std::string, ChunkGroupMeta*> chunk_group_meta_index_;
     bool use_prev_alloc_cgm_;  // chunk group meta
     std::shared_ptr<IDeviceID> cur_device_name_;
     WriteFile* file_;
@@ -227,10 +238,6 @@ class TsFileIOWriter {
     /** Recovery only: absolute file offset at which write_stream_ logically
      * begins.  Normal (non-recovery) path keeps this at 0. */
     int64_t file_base_offset_ = 0;
-    /** Set after destroy() completes; avoids double cleanup when
-     * RestorableTsFileIOWriter::close() calls destroy() before
-     * self_check_arena_.destroy(), then ~TsFileIOWriter runs again. */
-    bool destroyed_ = false;
 
     friend class RestorableTsFileIOWriter;  // uses push_chunk_group_meta
 };
diff --git a/cpp/src/reader/aligned_chunk_reader.cc b/cpp/src/reader/aligned_chunk_reader.cc
index 49c469547..7e2bda41e 100644
--- a/cpp/src/reader/aligned_chunk_reader.cc
+++ b/cpp/src/reader/aligned_chunk_reader.cc
@@ -19,8 +19,13 @@
 
 #include "aligned_chunk_reader.h"
 
+#include <algorithm>
 #include <limits>
 
+#include "common/global.h"
+#ifdef ENABLE_THREADS
+#include "common/thread_pool.h"
+#endif
 #include "compress/compressor_factory.h"
 #include "encoding/decoder_factory.h"
 
@@ -56,19 +61,74 @@ void AlignedChunkReader::reset() {
     if (file_data_buf != nullptr) {
         mem_free(file_data_buf);
     }
+    time_in_stream_.clear_wrapped_buf();
     time_in_stream_.reset();
     file_data_buf = value_in_stream_.get_wrapped_buf();
     if (file_data_buf != nullptr) {
         mem_free(file_data_buf);
     }
+    value_in_stream_.clear_wrapped_buf();
     value_in_stream_.reset();
     file_data_time_buf_size_ = 0;
     file_data_value_buf_size_ = 0;
     time_chunk_visit_offset_ = 0;
     value_chunk_visit_offset_ = 0;
+    page_plan_built_ = false;
+    current_page_loaded_ = false;
+    current_page_plan_index_ = 0;
+    time_predecoded_ = false;
+    page_all_times_.clear();
+    page_time_count_ = 0;
+    page_time_cursor_ = 0;
+
+    // Free leftover uncompressed buffers from the previous chunk.
+    if (time_uncompressed_buf_ != nullptr && time_compressor_ != nullptr) {
+        time_compressor_->after_uncompress(time_uncompressed_buf_);
+        time_uncompressed_buf_ = nullptr;
+    }
+
+    // Multi-value reset
+    for (auto* col : value_columns_) {
+        // Free uncompressed buffer before resetting.
+        if (col->uncompressed_buf != nullptr && col->compressor != nullptr) {
+            col->compressor->after_uncompress(col->uncompressed_buf);
+            col->uncompressed_buf = nullptr;
+        }
+        char* buf = col->in_stream.get_wrapped_buf();
+        if (buf != nullptr) mem_free(buf);
+        col->in_stream.clear_wrapped_buf();
+        col->in_stream.reset();
+        col->in.reset();
+        col->chunk_header.reset();
+        col->cur_page_header.reset();
+        col->file_data_buf_size = 0;
+        col->chunk_visit_offset = 0;
+        col->notnull_bitmap.clear();
+        col->cur_value_index = -1;
+        col->chunk_meta = nullptr;
+        for (auto& pps : col->per_page_state) {
+            pps.predecode_pa.destroy();
+        }
+        col->per_page_state.clear();
+        col->pending_decoded_values.clear();
+        col->pending_decoded_count = 0;
+        col->pending_decoded_cursor = 0;
+        col->pending_decoded = false;
+        // Note: decoder/compressor are NOT freed here — they are reused by
+        // alloc_compressor_and_decoder() in load_by_aligned_meta_multi().
+    }
+    release_current_page_state();
+    chunk_pages_.clear();
+    per_page_times_.clear();
 }
 
 void AlignedChunkReader::destroy() {
+    // .clear() leaves the vector's internal heap buffer allocated, which
+    // mem_free can't reach because we placement-new the reader. swap with
+    // an empty vector to actually release the backing storage so ASan's
+    // LeakSanitizer doesn't flag the (rather large) ChunkPageInfo buffers.
+    std::vector<ChunkPageInfo>{}.swap(chunk_pages_);
+    std::vector<int64_t>{}.swap(page_all_times_);
     if (time_uncompressed_buf_ != nullptr && time_compressor_ != nullptr) {
         time_compressor_->after_uncompress(time_uncompressed_buf_);
         time_uncompressed_buf_ = nullptr;
@@ -112,6 +172,59 @@ void AlignedChunkReader::destroy() {
     }
     cur_value_page_header_.reset();
     chunk_header_.~ChunkHeader();
+
+    // Multi-value destroy
+    for (size_t ci = 0; ci < value_columns_.size(); ci++) {
+        auto* col = value_columns_[ci];
+        if (col->decoder != nullptr) {
+            col->decoder->~Decoder();
+            DecoderFactory::free(col->decoder);
+            col->decoder = nullptr;
+        }
+        if (col->compressor != nullptr) {
+            col->compressor->~Compressor();
+            CompressorFactory::free(col->compressor);
+            col->compressor = nullptr;
+        }
+        for (auto& pps : col->per_page_state) {
+            pps.predecode_pa.destroy();
+        }
+        col->per_page_state.clear();
+        col->pending_decoded_values.clear();
+        buf = col->in_stream.get_wrapped_buf();
+        if (buf != nullptr) {
+            mem_free(buf);
+            col->in_stream.clear_wrapped_buf();
+        }
+        col->cur_page_header.reset();
+        delete col;
+    }
+    // This reader is placement-new'd and torn down via destroy() + mem_free
+    // without ever running ~AlignedChunkReader (see
+    // TsFileSeriesScanIterator::destroy), so .clear() would leave these
+    // vectors' backing buffers allocated and unreachable.  swap with an empty
+    // vector to actually release the storage, matching the chunk_pages_ /
+    // page_all_times_ handling above.
+    std::vector<ValueColumnState*>().swap(value_columns_);
+    release_current_page_state();
+    std::vector<std::vector<int64_t>>().swap(per_page_times_);
+#ifdef ENABLE_THREADS
+    decode_pool_ = nullptr;  // borrowed, not owned
+    for (auto* d : time_decoder_pool_) {
+        if (d != nullptr) {
+            d->~Decoder();
+            DecoderFactory::free(d);
+        }
+    }
+    std::vector<Decoder*>().swap(time_decoder_pool_);
+    for (auto* c : time_compressor_pool_) {
+        if (c != nullptr) {
+            c->~Compressor();
+            CompressorFactory::free(c);
+        }
+    }
+    std::vector<Compressor*>().swap(time_compressor_pool_);
+#endif
 }
 
 int AlignedChunkReader::load_by_aligned_meta(ChunkMeta* time_chunk_meta,
@@ -218,15 +331,19 @@ int AlignedChunkReader::alloc_compressor_and_decoder(
 
 int AlignedChunkReader::get_next_page(TsBlock* ret_tsblock,
                                       Filter* oneshoot_filter, PageArena& pa) {
+    if (multi_value_mode_) {
+        return get_next_page_multi(ret_tsblock, oneshoot_filter, pa);
+    }
     int ret = E_OK;
     Filter* filter =
         (oneshoot_filter != nullptr ? oneshoot_filter : time_filter_);
-    if (prev_time_page_not_finish() && prev_value_page_not_finish()) {
-        ret = decode_time_value_buf_into_tsblock(ret_tsblock, oneshoot_filter,
-                                                 &pa);
+    bool pt = prev_time_page_not_finish();
+    bool pv = prev_value_page_not_finish();
+    if (pt && pv) {
+        ret = decode_time_value_buf_into_tsblock(ret_tsblock, filter, &pa);
         return ret;
     }
-    if (!prev_time_page_not_finish() && !prev_value_page_not_finish()) {
+    if (!pt && !pv) {
         while (IS_SUCC(ret)) {
             if (RET_FAIL(get_cur_page_header(
                     time_chunk_meta_, time_in_stream_, cur_time_page_header_,
@@ -249,8 +366,7 @@ int AlignedChunkReader::get_next_page(TsBlock* ret_tsblock,
         }
     }
     if (IS_SUCC(ret)) {
-        ret = decode_time_value_buf_into_tsblock(ret_tsblock, oneshoot_filter,
-                                                 &pa);
+        ret = decode_time_value_buf_into_tsblock(ret_tsblock, filter, &pa);
     }
     return ret;
 }
@@ -259,7 +375,8 @@ int AlignedChunkReader::get_cur_page_header(ChunkMeta*& chunk_meta,
                                             common::ByteStream& in_stream,
                                             PageHeader& cur_page_header,
                                             uint32_t& chunk_visit_offset,
-                                            ChunkHeader& chunk_header) {
+                                            ChunkHeader& chunk_header,
+                                            int32_t* override_buf_size) {
     int ret = E_OK;
     bool retry = true;
     int cur_page_header_serialized_size = 0;
@@ -282,7 +399,8 @@ int AlignedChunkReader::get_cur_page_header(ChunkMeta*& chunk_meta,
             retry = false;
             retry_read_want_size += 1024;
             int32_t& file_data_buf_size =
-                chunk_header.data_type_ == common::VECTOR
+                override_buf_size != nullptr ? *override_buf_size
+                : chunk_header.data_type_ == common::VECTOR
                     ? file_data_time_buf_size_
                     : file_data_value_buf_size_;
             // do not shrink buffer for page header, otherwise, the buffer is
@@ -326,9 +444,13 @@ int AlignedChunkReader::read_from_file_and_rewrap(
         (may_shrink && read_size < file_data_buf_size / 10)) {
         file_data_buf = (char*)mem_realloc(file_data_buf, read_size);
         if (IS_NULL(file_data_buf)) {
+            in_stream_.clear_wrapped_buf();
             return E_OOM;
         }
         file_data_buf_size = read_size;
+        // Update stream pointer immediately so it stays valid even if
+        // the subsequent read fails and the caller frees via destroy().
+        in_stream_.wrap_from(file_data_buf, read_size);
     }
     int ret_read_len = 0;
     if (RET_FAIL(
@@ -563,6 +685,7 @@ int AlignedChunkReader::decode_time_value_buf_into_tsblock(
                 row_appender.append_null(1);                                   \
                 continue;                                                      \
             }                                                                  \
+            assert(value_decoder_->has_remaining(value_in));                   \
             if (!value_decoder_->has_remaining(value_in)) {                    \
                 return common::E_DATA_INCONSISTENCY;                           \
             }                                                                  \
@@ -597,19 +720,19 @@ int AlignedChunkReader::i32_DECODE_TYPED_TV_INTO_TSBLOCK(
         if (value_page_col_notnull_bitmap_.empty() ||
             ((value_page_col_notnull_bitmap_[cur_value_index / 8] & 0xFF) &
              (mask >> (cur_value_index % 8))) == 0) {
-            if (UNLIKELY(!row_appender.add_row())) {
-                ret = E_OVERFLOW;
-                cur_value_index--;
-                break;
-            }
             ret = time_decoder_->read_int64(time, time_in);
             if (ret != E_OK) {
                 break;
             }
+            if (UNLIKELY(!row_appender.add_row())) {
+                ret = E_OVERFLOW;
+                break;
+            }
             row_appender.append(0, (char*)&time, sizeof(time));
             row_appender.append_null(1);
             continue;
         }
+        assert(value_decoder_->has_remaining(value_in));
         if (!value_decoder_->has_remaining(value_in)) {
             return common::E_DATA_INCONSISTENCY;
         }
@@ -632,6 +755,566 @@ int AlignedChunkReader::i32_DECODE_TYPED_TV_INTO_TSBLOCK(
     return ret;
 }
 
+int AlignedChunkReader::i32_DECODE_TV_BATCH(ByteStream& time_in,
+                                            ByteStream& value_in,
+                                            RowAppender& row_appender,
+                                            Filter* filter) {
+    int ret = E_OK;
+    const int BATCH = 129;
+    int64_t times[BATCH];
+    int32_t values[BATCH];
+    const uint32_t null_mask_base = 1 << 7;
+
+    while (time_decoder_->has_remaining(time_in)) {
+        if (row_appender.remaining() < (uint32_t)BATCH) {
+            ret = E_OVERFLOW;
+            break;
+        }
+
+        // Block-level time filter check
+        bool block_all_pass = false;
+        if (filter != nullptr) {
+            int64_t block_min, block_max;
+            int block_count;
+            if (time_decoder_->peek_next_block_range_int64(
+                    time_in, block_min, block_max, block_count)) {
+                if (!filter->satisfy_start_end_time(block_min, block_max)) {
+                    int skipped = 0;
+                    time_decoder_->skip_peeked_block_int64(time_in, skipped);
+                    int nonnull = 0;
+                    for (int i = 0; i < block_count; ++i) {
+                        int vi = cur_value_index + 1 + i;
+                        if (!value_page_col_notnull_bitmap_.empty() &&
+                            ((value_page_col_notnull_bitmap_[vi / 8] & 0xFF) &
+                             (null_mask_base >> (vi % 8))) != 0) {
+                            ++nonnull;
+                        }
+                    }
+                    cur_value_index += block_count;
+                    if (nonnull > 0) {
+                        // skip_* may legitimately fail (truncated page) or
+                        // short-read (corrupt bitmap vs. data); both must
+                        // abort the loop rather than silently desync the
+                        // value decoder.  Same defect the multi-value path
+                        // already guards against.
+                        int sk = 0;
+                        if (RET_FAIL(value_decoder_->skip_int32(nonnull, sk,
+                                                                value_in))) {
+                            break;
+                        }
+                        if (sk != nonnull) {
+                            ret = E_TSFILE_CORRUPTED;
+                            break;
+                        }
+                    }
+                    continue;
+                }
+                if (filter->contain_start_end_time(block_min, block_max)) {
+                    block_all_pass = true;
+                }
+            }
+        }
+
+        int time_count = 0;
+        if (RET_FAIL(time_decoder_->read_batch_int64(times, BATCH, time_count,
+                                                     time_in))) {
+            break;
+        }
+        if (time_count == 0) break;
+
+        bool is_null[BATCH];
+        int nonnull_count = 0;
+        for (int i = 0; i < time_count; ++i) {
+            int vi = cur_value_index + 1 + i;
+            if (value_page_col_notnull_bitmap_.empty() ||
+                ((value_page_col_notnull_bitmap_[vi / 8] & 0xFF) &
+                 (null_mask_base >> (vi % 8))) == 0) {
+                is_null[i] = true;
+            } else {
+                is_null[i] = false;
+                ++nonnull_count;
+            }
+        }
+
+        bool time_mask[BATCH];
+        int pass_count = time_count;
+        if (filter != nullptr && !block_all_pass) {
+            pass_count =
+                filter->satisfy_batch_time(times, time_count, time_mask);
+        }
+
+        if (pass_count == 0) {
+            if (nonnull_count > 0) {
+                int skipped = 0;
+                if (RET_FAIL(value_decoder_->skip_int32(nonnull_count, skipped,
+                                                        value_in))) {
+                    break;
+                }
+                if (skipped != nonnull_count) {
+                    ret = E_TSFILE_CORRUPTED;
+                    break;
+                }
+            }
+            cur_value_index += time_count;
+            continue;
+        }
+
+        int value_count = 0;
+        if (nonnull_count > 0) {
+            if (RET_FAIL(value_decoder_->read_batch_int32(
+                    values, nonnull_count, value_count, value_in))) {
+                break;
+            }
+        }
+
+        int val_idx = 0;
+        for (int i = 0; i < time_count; ++i) {
+            cur_value_index++;
+            if (filter != nullptr && !block_all_pass && !time_mask[i]) {
+                if (!is_null[i]) ++val_idx;
+                continue;
+            }
+            if (is_null[i]) {
+                if (UNLIKELY(!row_appender.add_row())) {
+                    ret = E_OVERFLOW;
+                    break;
+                }
+                row_appender.append(0, (char*)&times[i], sizeof(int64_t));
+                row_appender.append_null(1);
+            } else {
+                int32_t val = values[val_idx++];
+                if (filter != nullptr && !block_all_pass &&
+                    !filter->satisfy(times[i], (int64_t)val)) {
+                    continue;
+                }
+                if (UNLIKELY(!row_appender.add_row())) {
+                    ret = E_OVERFLOW;
+                    break;
+                }
+                row_appender.append(0, (char*)&times[i], sizeof(int64_t));
+                row_appender.append(1, (char*)&val, sizeof(int32_t));
+            }
+        }
+        if (ret != E_OK) break;
+    }
+    return ret;
+}
+
+int AlignedChunkReader::i64_DECODE_TV_BATCH(ByteStream& time_in,
+                                            ByteStream& value_in,
+                                            RowAppender& row_appender,
+                                            Filter* filter) {
+    int ret = E_OK;
+    const int BATCH = 129;
+    int64_t times[BATCH];
+    int64_t values[BATCH];
+    const uint32_t null_mask_base = 1 << 7;
+
+    while (time_decoder_->has_remaining(time_in)) {
+        if (row_appender.remaining() < (uint32_t)BATCH) {
+            ret = E_OVERFLOW;
+            break;
+        }
+
+        // Block-level time filter check: skip entire block if out of range
+        bool block_all_pass = false;
+        if (filter != nullptr) {
+            int64_t block_min, block_max;
+            int block_count;
+            if (time_decoder_->peek_next_block_range_int64(
+                    time_in, block_min, block_max, block_count)) {
+                if (!filter->satisfy_start_end_time(block_min, block_max)) {
+                    int skipped = 0;
+                    time_decoder_->skip_peeked_block_int64(time_in, skipped);
+                    int nonnull = 0;
+                    for (int i = 0; i < block_count; ++i) {
+                        int vi = cur_value_index + 1 + i;
+                        if (!value_page_col_notnull_bitmap_.empty() &&
+                            ((value_page_col_notnull_bitmap_[vi / 8] & 0xFF) &
+                             (null_mask_base >> (vi % 8))) != 0) {
+                            ++nonnull;
+                        }
+                    }
+                    cur_value_index += block_count;
+                    if (nonnull > 0) {
+                        // See i32 path above for the rationale.
+                        int sk = 0;
+                        if (RET_FAIL(value_decoder_->skip_int64(nonnull, sk,
+                                                                value_in))) {
+                            break;
+                        }
+                        if (sk != nonnull) {
+                            ret = E_TSFILE_CORRUPTED;
+                            break;
+                        }
+                    }
+                    continue;
+                }
+                if (filter->contain_start_end_time(block_min, block_max)) {
+                    block_all_pass = true;
+                }
+            }
+        }
+
+        int time_count = 0;
+        if (RET_FAIL(time_decoder_->read_batch_int64(times, BATCH, time_count,
+                                                     time_in))) {
+            break;
+        }
+        if (time_count == 0) break;
+
+        bool is_null[BATCH];
+        int nonnull_count = 0;
+        for (int i = 0; i < time_count; ++i) {
+            int vi = cur_value_index + 1 + i;
+            if (value_page_col_notnull_bitmap_.empty() ||
+                ((value_page_col_notnull_bitmap_[vi / 8] & 0xFF) &
+                 (null_mask_base >> (vi % 8))) == 0) {
+                is_null[i] = true;
+            } else {
+                is_null[i] = false;
+                ++nonnull_count;
+            }
+        }
+
+        bool time_mask[BATCH];
+        int pass_count = time_count;
+        if (filter != nullptr && !block_all_pass) {
+            pass_count =
+                filter->satisfy_batch_time(times, time_count, time_mask);
+        }
+
+        if (pass_count == 0) {
+            if (nonnull_count > 0) {
+                int skipped = 0;
+                if (RET_FAIL(value_decoder_->skip_int64(nonnull_count, skipped,
+                                                        value_in))) {
+                    break;
+                }
+                if (skipped != nonnull_count) {
+                    ret = E_TSFILE_CORRUPTED;
+                    break;
+                }
+            }
+            cur_value_index += time_count;
+            continue;
+        }
+
+        int value_count = 0;
+        if (nonnull_count > 0) {
+            if (RET_FAIL(value_decoder_->read_batch_int64(
+                    values, nonnull_count, value_count, value_in))) {
+                break;
+            }
+        }
+
+        int val_idx = 0;
+        for (int i = 0; i < time_count; ++i) {
+            cur_value_index++;
+            if (filter != nullptr && !block_all_pass && !time_mask[i]) {
+                if (!is_null[i]) ++val_idx;
+                continue;
+            }
+            if (is_null[i]) {
+                if (UNLIKELY(!row_appender.add_row())) {
+                    ret = E_OVERFLOW;
+                    break;
+                }
+                row_appender.append(0, (char*)&times[i], sizeof(int64_t));
+                row_appender.append_null(1);
+            } else {
+                int64_t val = values[val_idx++];
+                if (filter != nullptr && !block_all_pass &&
+                    !filter->satisfy(times[i], val)) {
+                    continue;
+                }
+                if (UNLIKELY(!row_appender.add_row())) {
+                    ret = E_OVERFLOW;
+                    break;
+                }
+                row_appender.append(0, (char*)&times[i], sizeof(int64_t));
+                row_appender.append(1, (char*)&val, sizeof(int64_t));
+            }
+        }
+        if (ret != E_OK) break;
+    }
+    return ret;
+}
+
+int AlignedChunkReader::float_DECODE_TV_BATCH(ByteStream& time_in,
+                                              ByteStream& value_in,
+                                              RowAppender& row_appender,
+                                              Filter* filter) {
+    int ret = E_OK;
+    const int BATCH = 129;
+    int64_t times[BATCH];
+    float values[BATCH];
+    const uint32_t null_mask_base = 1 << 7;
+
+    while (time_decoder_->has_remaining(time_in)) {
+        if (row_appender.remaining() < (uint32_t)BATCH) {
+            ret = E_OVERFLOW;
+            break;
+        }
+
+        // Block-level time filter check
+        bool block_all_pass = false;
+        if (filter != nullptr) {
+            int64_t block_min, block_max;
+            int block_count;
+            if (time_decoder_->peek_next_block_range_int64(
+                    time_in, block_min, block_max, block_count)) {
+                if (!filter->satisfy_start_end_time(block_min, block_max)) {
+                    int skipped = 0;
+                    time_decoder_->skip_peeked_block_int64(time_in, skipped);
+                    int nonnull = 0;
+                    for (int i = 0; i < block_count; ++i) {
+                        int vi = cur_value_index + 1 + i;
+                        if (!value_page_col_notnull_bitmap_.empty() &&
+                            ((value_page_col_notnull_bitmap_[vi / 8] & 0xFF) &
+                             (null_mask_base >> (vi % 8))) != 0) {
+                            ++nonnull;
+                        }
+                    }
+                    cur_value_index += block_count;
+                    if (nonnull > 0) {
+                        // See i32 path above for the rationale.
+                        int sk = 0;
+                        if (RET_FAIL(value_decoder_->skip_float(nonnull, sk,
+                                                                value_in))) {
+                            break;
+                        }
+                        if (sk != nonnull) {
+                            ret = E_TSFILE_CORRUPTED;
+                            break;
+                        }
+                    }
+                    continue;
+                }
+                if (filter->contain_start_end_time(block_min, block_max)) {
+                    block_all_pass = true;
+                }
+            }
+        }
+
+        int time_count = 0;
+        if (RET_FAIL(time_decoder_->read_batch_int64(times, BATCH, time_count,
+                                                     time_in))) {
+            break;
+        }
+        if (time_count == 0) break;
+
+        bool is_null[BATCH];
+        int nonnull_count = 0;
+        for (int i = 0; i < time_count; ++i) {
+            int vi = cur_value_index + 1 + i;
+            if (value_page_col_notnull_bitmap_.empty() ||
+                ((value_page_col_notnull_bitmap_[vi / 8] & 0xFF) &
+                 (null_mask_base >> (vi % 8))) == 0) {
+                is_null[i] = true;
+            } else {
+                is_null[i] = false;
+                ++nonnull_count;
+            }
+        }
+
+        bool time_mask[BATCH];
+        int pass_count = time_count;
+        if (filter != nullptr && !block_all_pass) {
+            pass_count =
+                filter->satisfy_batch_time(times, time_count, time_mask);
+        }
+
+        if (pass_count == 0) {
+            if (nonnull_count > 0) {
+                int skipped = 0;
+                if (RET_FAIL(value_decoder_->skip_float(nonnull_count, skipped,
+                                                        value_in))) {
+                    break;
+                }
+                if (skipped != nonnull_count) {
+                    ret = E_TSFILE_CORRUPTED;
+                    break;
+                }
+            }
+            cur_value_index += time_count;
+            continue;
+        }
+
+        int value_count = 0;
+        if (nonnull_count > 0) {
+            if (RET_FAIL(value_decoder_->read_batch_float(
+                    values, nonnull_count, value_count, value_in))) {
+                break;
+            }
+        }
+
+        int val_idx = 0;
+        for (int i = 0; i < time_count; ++i) {
+            cur_value_index++;
+            if (filter != nullptr && !block_all_pass && !time_mask[i]) {
+                if (!is_null[i]) ++val_idx;
+                continue;
+            }
+            if (is_null[i]) {
+                if (UNLIKELY(!row_appender.add_row())) {
+                    ret = E_OVERFLOW;
+                    break;
+                }
+                row_appender.append(0, (char*)&times[i], sizeof(int64_t));
+                row_appender.append_null(1);
+            } else {
+                float val = values[val_idx++];
+                if (UNLIKELY(!row_appender.add_row())) {
+                    ret = E_OVERFLOW;
+                    break;
+                }
+                row_appender.append(0, (char*)&times[i], sizeof(int64_t));
+                row_appender.append(1, (char*)&val, sizeof(float));
+            }
+        }
+        if (ret != E_OK) break;
+    }
+    return ret;
+}
+
+int AlignedChunkReader::double_DECODE_TV_BATCH(ByteStream& time_in,
+                                               ByteStream& value_in,
+                                               RowAppender& row_appender,
+                                               Filter* filter) {
+    int ret = E_OK;
+    const int BATCH = 129;
+    int64_t times[BATCH];
+    double values[BATCH];
+    const uint32_t null_mask_base = 1 << 7;
+
+    while (time_decoder_->has_remaining(time_in)) {
+        if (row_appender.remaining() < (uint32_t)BATCH) {
+            ret = E_OVERFLOW;
+            break;
+        }
+
+        // Block-level time filter check
+        bool block_all_pass = false;
+        if (filter != nullptr) {
+            int64_t block_min, block_max;
+            int block_count;
+            if (time_decoder_->peek_next_block_range_int64(
+                    time_in, block_min, block_max, block_count)) {
+                if (!filter->satisfy_start_end_time(block_min, block_max)) {
+                    int skipped = 0;
+                    time_decoder_->skip_peeked_block_int64(time_in, skipped);
+                    int nonnull = 0;
+                    for (int i = 0; i < block_count; ++i) {
+                        int vi = cur_value_index + 1 + i;
+                        if (!value_page_col_notnull_bitmap_.empty() &&
+                            ((value_page_col_notnull_bitmap_[vi / 8] & 0xFF) &
+                             (null_mask_base >> (vi % 8))) != 0) {
+                            ++nonnull;
+                        }
+                    }
+                    cur_value_index += block_count;
+                    if (nonnull > 0) {
+                        // See i32 path above for the rationale.
+                        int sk = 0;
+                        if (RET_FAIL(value_decoder_->skip_double(nonnull, sk,
+                                                                 value_in))) {
+                            break;
+                        }
+                        if (sk != nonnull) {
+                            ret = E_TSFILE_CORRUPTED;
+                            break;
+                        }
+                    }
+                    continue;
+                }
+                if (filter->contain_start_end_time(block_min, block_max)) {
+                    block_all_pass = true;
+                }
+            }
+        }
+
+        int time_count = 0;
+        if (RET_FAIL(time_decoder_->read_batch_int64(times, BATCH, time_count,
+                                                     time_in))) {
+            break;
+        }
+        if (time_count == 0) break;
+
+        bool is_null[BATCH];
+        int nonnull_count = 0;
+        for (int i = 0; i < time_count; ++i) {
+            int vi = cur_value_index + 1 + i;
+            if (value_page_col_notnull_bitmap_.empty() ||
+                ((value_page_col_notnull_bitmap_[vi / 8] & 0xFF) &
+                 (null_mask_base >> (vi % 8))) == 0) {
+                is_null[i] = true;
+            } else {
+                is_null[i] = false;
+                ++nonnull_count;
+            }
+        }
+
+        bool time_mask[BATCH];
+        int pass_count = time_count;
+        if (filter != nullptr && !block_all_pass) {
+            pass_count =
+                filter->satisfy_batch_time(times, time_count, time_mask);
+        }
+
+        if (pass_count == 0) {
+            if (nonnull_count > 0) {
+                int skipped = 0;
+                if (RET_FAIL(value_decoder_->skip_double(nonnull_count, skipped,
+                                                         value_in))) {
+                    break;
+                }
+                if (skipped != nonnull_count) {
+                    ret = E_TSFILE_CORRUPTED;
+                    break;
+                }
+            }
+            cur_value_index += time_count;
+            continue;
+        }
+
+        int value_count = 0;
+        if (nonnull_count > 0) {
+            if (RET_FAIL(value_decoder_->read_batch_double(
+                    values, nonnull_count, value_count, value_in))) {
+                break;
+            }
+        }
+
+        int val_idx = 0;
+        for (int i = 0; i < time_count; ++i) {
+            cur_value_index++;
+            if (filter != nullptr && !block_all_pass && !time_mask[i]) {
+                if (!is_null[i]) ++val_idx;
+                continue;
+            }
+            if (is_null[i]) {
+                if (UNLIKELY(!row_appender.add_row())) {
+                    ret = E_OVERFLOW;
+                    break;
+                }
+                row_appender.append(0, (char*)&times[i], sizeof(int64_t));
+                row_appender.append_null(1);
+            } else {
+                double val = values[val_idx++];
+                if (UNLIKELY(!row_appender.add_row())) {
+                    ret = E_OVERFLOW;
+                    break;
+                }
+                row_appender.append(0, (char*)&times[i], sizeof(int64_t));
+                row_appender.append(1, (char*)&val, sizeof(double));
+            }
+        }
+        if (ret != E_OK) break;
+    }
+    return ret;
+}
+
 int AlignedChunkReader::decode_tv_buf_into_tsblock_by_datatype(
     ByteStream& time_in, ByteStream& value_in, TsBlock* ret_tsblock,
     Filter* filter, common::PageArena* pa) {
@@ -644,23 +1327,24 @@ int AlignedChunkReader::decode_tv_buf_into_tsblock_by_datatype(
             break;
         case common::DATE:
         case common::INT32:
-            // DECODE_TYPED_TV_INTO_TSBLOCK(int32_t, int32, time_in_, value_in_,
-            //                              row_appender);
-            ret = i32_DECODE_TYPED_TV_INTO_TSBLOCK(time_in_, value_in_,
-                                                   row_appender, filter);
+            // Batch decode path: read_batch_int{32,64} consumes whole TS_2DIFF
+            // blocks at once (and uses SIMD when ENABLE_SIMD); replaces a
+            // per-value decode() loop that hot-dominated the read flame graph.
+            ret =
+                i32_DECODE_TV_BATCH(time_in_, value_in_, row_appender, filter);
             break;
         case common::TIMESTAMP:
         case common::INT64:
-            DECODE_TYPED_TV_INTO_TSBLOCK(int64_t, int64, time_in_, value_in_,
-                                         row_appender);
+            ret =
+                i64_DECODE_TV_BATCH(time_in_, value_in_, row_appender, filter);
             break;
         case common::FLOAT:
-            DECODE_TYPED_TV_INTO_TSBLOCK(float, float, time_in_, value_in_,
-                                         row_appender);
+            ret = float_DECODE_TV_BATCH(time_in_, value_in_, row_appender,
+                                        filter);
             break;
         case common::DOUBLE:
-            DECODE_TYPED_TV_INTO_TSBLOCK(double, double, time_in_, value_in_,
-                                         row_appender);
+            ret = double_DECODE_TV_BATCH(time_in_, value_in_, row_appender,
+                                         filter);
             break;
         case common::STRING:
         case common::BLOB:
@@ -695,6 +1379,7 @@ int AlignedChunkReader::STRING_DECODE_TYPED_TV_INTO_TSBLOCK(
         }
 
         if (should_read_data) {
+            assert(value_decoder_->has_remaining(value_in));
             if (!value_decoder_->has_remaining(value_in)) {
                 return E_DATA_INCONSISTENCY;
             }
@@ -740,21 +1425,15 @@ bool AlignedChunkReader::should_skip_page_by_offset(int& row_offset) {
     if (row_offset <= 0) {
         return false;
     }
-    // Aligned TV pages: only skip a whole page by count when both page headers
-    // expose the same positive row count. Using a single side (or min) when
-    // the other is missing or unequal can desynchronize row_offset from
-    // decoded row order vs. the paired time/value stream.
-    Statistic* ts = cur_time_page_header_.statistic_;
-    Statistic* vs = cur_value_page_header_.statistic_;
-    if (ts == nullptr || vs == nullptr) {
-        return false;
+    // Use time page statistic for count.
+    Statistic* stat = cur_time_page_header_.statistic_;
+    if (stat == nullptr) {
+        stat = cur_value_page_header_.statistic_;
     }
-    int32_t tc = ts->count_;
-    int32_t vc = vs->count_;
-    if (tc <= 0 || vc <= 0 || tc != vc) {
+    if (stat == nullptr || stat->count_ == 0) {
         return false;
     }
-    int32_t count = tc;
+    int32_t count = stat->count_;
     if (row_offset >= count) {
         row_offset -= count;
         return true;
@@ -766,6 +1445,19 @@ int AlignedChunkReader::get_next_page(TsBlock* ret_tsblock,
                                       Filter* oneshoot_filter, PageArena& pa,
                                       int64_t min_time_hint, int& row_offset,
                                       int& row_limit) {
+    if (multi_value_mode_) {
+        // Multi-value aligned path doesn't yet honour row_offset / row_limit
+        // / min_time_hint — they get dropped on the floor, which silently
+        // returns full chunk data when the caller asked for a sub-range.
+        // Refuse the combination so the caller sees an actual error instead
+        // of garbage results.  set_row_range(0, -1) keeps the all-rows
+        // contract intact for normal queries.
+        if (row_offset > 0 || row_limit >= 0 ||
+            min_time_hint != std::numeric_limits<int64_t>::min()) {
+            return common::E_NOT_SUPPORT;
+        }
+        return get_next_page_multi(ret_tsblock, oneshoot_filter, pa);
+    }
     int ret = E_OK;
     Filter* filter =
         (oneshoot_filter != nullptr ? oneshoot_filter : time_filter_);
@@ -774,12 +1466,14 @@ int AlignedChunkReader::get_next_page(TsBlock* ret_tsblock,
         return E_NO_MORE_DATA;
     }
 
-    if (prev_time_page_not_finish() && prev_value_page_not_finish()) {
-        ret = decode_time_value_buf_into_tsblock(ret_tsblock, oneshoot_filter,
-                                                 &pa);
+    bool pt = prev_time_page_not_finish();
+    bool pv = prev_value_page_not_finish();
+
+    if (pt && pv) {
+        ret = decode_time_value_buf_into_tsblock(ret_tsblock, filter, &pa);
         return ret;
     }
-    if (!prev_time_page_not_finish() && !prev_value_page_not_finish()) {
+    if (!pt && !pv) {
         while (IS_SUCC(ret)) {
             if (RET_FAIL(get_cur_page_header(
                     time_chunk_meta_, time_in_stream_, cur_time_page_header_,
@@ -810,10 +1504,1560 @@ int AlignedChunkReader::get_next_page(TsBlock* ret_tsblock,
         }
     }
     if (IS_SUCC(ret)) {
-        ret = decode_time_value_buf_into_tsblock(ret_tsblock, oneshoot_filter,
-                                                 &pa);
+        ret = decode_time_value_buf_into_tsblock(ret_tsblock, filter, &pa);
+    }
+    return ret;
+}
+
+// ══════════════════════════════════════════════════════════════════════════
+//  Multi-value AlignedChunkReader implementation
+// ══════════════════════════════════════════════════════════════════════════
+
+int AlignedChunkReader::load_by_aligned_meta_multi(
+    ChunkMeta* time_chunk_meta, const std::vector<ChunkMeta*>& value_metas) {
+    int ret = E_OK;
+    multi_value_mode_ = true;
+    time_chunk_meta_ = time_chunk_meta;
+    page_plan_built_ = false;
+    current_page_loaded_ = false;
+    current_page_plan_index_ = 0;
+    time_predecoded_ = false;
+    page_all_times_.clear();
+    page_time_count_ = 0;
+    page_time_cursor_ = 0;
+
+    // ── Load time chunk header ──
+    file_data_time_buf_size_ = 1024;
+    int32_t ret_read_len = 0;
+    char* time_file_data_buf =
+        (char*)mem_alloc(file_data_time_buf_size_, MOD_CHUNK_READER);
+    if (IS_NULL(time_file_data_buf)) return E_OOM;
+
+    ret = read_file_->read(time_chunk_meta_->offset_of_chunk_header_,
+                           time_file_data_buf, file_data_time_buf_size_,
+                           ret_read_len);
+    if (IS_SUCC(ret) && ret_read_len < ChunkHeader::MIN_SERIALIZED_SIZE) {
+        ret = E_TSFILE_CORRUPTED;
+        mem_free(time_file_data_buf);
+        return ret;
+    }
+    if (IS_SUCC(ret)) {
+        time_in_stream_.wrap_from(time_file_data_buf, ret_read_len);
+        if (RET_FAIL(time_chunk_header_.deserialize_from(time_in_stream_))) {
+            return ret;
+        }
+        time_chunk_visit_offset_ = time_in_stream_.read_pos();
+    }
+
+    // Alloc time decoder/compressor
+    if (IS_SUCC(ret)) {
+        if (RET_FAIL(alloc_compressor_and_decoder(
+                time_decoder_, time_compressor_,
+                time_chunk_header_.encoding_type_,
+                time_chunk_header_.data_type_,
+                time_chunk_header_.compression_type_))) {
+            return ret;
+        }
+    }
+
+    // ── Load each value column ──
+    // Reuse existing ValueColumnState objects if count matches (reset() already
+    // cleared their internal state).  Otherwise, recreate.
+    if (value_columns_.size() != value_metas.size()) {
+        for (auto* p : value_columns_) delete p;
+        value_columns_.clear();
+        value_columns_.reserve(value_metas.size());
+        for (size_t c = 0; c < value_metas.size(); c++) {
+            value_columns_.push_back(new ValueColumnState);
+        }
+    }
+    for (size_t c = 0; c < value_metas.size() && IS_SUCC(ret); c++) {
+        auto* col = value_columns_[c];
+        col->chunk_meta = value_metas[c];
+        col->file_data_buf_size = 1024;
+        ret_read_len = 0;
+        char* vbuf =
+            (char*)mem_alloc(col->file_data_buf_size, MOD_CHUNK_READER);
+        if (IS_NULL(vbuf)) return E_OOM;
+
+        ret = read_file_->read(col->chunk_meta->offset_of_chunk_header_, vbuf,
+                               col->file_data_buf_size, ret_read_len);
+        if (IS_SUCC(ret) && ret_read_len < ChunkHeader::MIN_SERIALIZED_SIZE) {
+            ret = E_TSFILE_CORRUPTED;
+            mem_free(vbuf);
+            break;
+        }
+        if (IS_SUCC(ret)) {
+            col->in_stream.wrap_from(vbuf, ret_read_len);
+            if (RET_FAIL(col->chunk_header.deserialize_from(col->in_stream))) {
+                break;
+            }
+            col->chunk_visit_offset = col->in_stream.read_pos();
+            if (RET_FAIL(alloc_compressor_and_decoder(
+                    col->decoder, col->compressor,
+                    col->chunk_header.encoding_type_,
+                    col->chunk_header.data_type_,
+                    col->chunk_header.compression_type_))) {
+                break;
+            }
+        }
+    }
+
+    return ret;
+}
+
+bool AlignedChunkReader::has_more_data_multi() const {
+    if (page_plan_built_) {
+        if (current_page_loaded_) {
+            return page_time_cursor_ < page_time_count_;
+        }
+        return current_page_plan_index_ < chunk_pages_.size();
+    }
+    if (prev_time_page_not_finish() || prev_any_value_page_not_finish_multi()) {
+        return true;
+    }
+    if (time_chunk_visit_offset_ - time_chunk_header_.serialized_size_ <
+        time_chunk_header_.data_size_) {
+        return true;
+    }
+    for (const auto* col : value_columns_) {
+        if (col->chunk_visit_offset - col->chunk_header.serialized_size_ <
+            col->chunk_header.data_size_) {
+            return true;
+        }
+    }
+    return false;
+}
+
+bool AlignedChunkReader::prev_any_value_page_not_finish_multi() const {
+    for (const auto* col : value_columns_) {
+        if ((col->decoder && col->decoder->has_remaining(col->in)) ||
+            col->in.has_remaining()) {
+            return true;
+        }
+    }
+    return false;
+}
+
+bool AlignedChunkReader::has_variable_length_value_column() const {
+    for (const auto* col : value_columns_) {
+        if (col->chunk_header.data_type_ == common::STRING ||
+            col->chunk_header.data_type_ == common::TEXT ||
+            col->chunk_header.data_type_ == common::BLOB) {
+            return true;
+        }
+    }
+    return false;
+}
+
+int AlignedChunkReader::count_non_null_prefix(
+    const std::vector<uint8_t>& bitmap, int32_t row_limit) const {
+    if (row_limit <= 0 || bitmap.empty()) {
+        return 0;
+    }
+    const uint32_t mask_base = 1 << 7;
+    int count = 0;
+    for (int32_t i = 0; i < row_limit; i++) {
+        if (((bitmap[i / 8] & 0xFF) & (mask_base >> (i % 8))) != 0) {
+            count++;
+        }
+    }
+    return count;
+}
+
+int AlignedChunkReader::decode_time_page_direct(
+    const ChunkPageInfo& page_info, std::vector<int64_t>& out_times) {
+    return decode_time_page_with(page_info, out_times, time_decoder_,
+                                 time_compressor_);
+}
+
+// Worker-safe variant: uses caller-provided decoder + compressor instead of
+// the shared time_decoder_/time_compressor_ members.  Used by the parallel
+// time-page decode dispatch in decode_all_planned_pages.
+int AlignedChunkReader::decode_time_page_with(const ChunkPageInfo& page_info,
+                                              std::vector<int64_t>& out_times,
+                                              Decoder* decoder,
+                                              Compressor* compressor) {
+    out_times.clear();
+    if (page_info.time_compressed_size == 0) {
+        return E_OK;
+    }
+
+    char stack_buf[4096];
+    char* compressed_buf = stack_buf;
+    bool heap = page_info.time_compressed_size > sizeof(stack_buf);
+    if (heap) {
+        compressed_buf = static_cast<char*>(common::mem_alloc(
+            page_info.time_compressed_size, common::MOD_DEFAULT));
+        if (compressed_buf == nullptr) {
+            return E_OOM;
+        }
+    }
+
+    int32_t read_len = 0;
+    int ret = read_file_->read(page_info.time_file_offset, compressed_buf,
+                               page_info.time_compressed_size, read_len);
+    if (IS_FAIL(ret)) {
+        if (heap) common::mem_free(compressed_buf);
+        return ret;
+    }
+    // ReadFile::read() returns E_OK + short read_len on EOF; uncompressing
+    // page_info.time_compressed_size from a buffer with uninitialised tail
+    // bytes would feed garbage to the decompressor.
+    if (read_len != static_cast<int32_t>(page_info.time_compressed_size)) {
+        if (heap) common::mem_free(compressed_buf);
+        return E_TSFILE_CORRUPTED;
+    }
+
+    char* uncompressed_buf = nullptr;
+    uint32_t uncompressed_size = 0;
+    if (RET_FAIL(compressor->reset(false))) {
+        if (heap) common::mem_free(compressed_buf);
+        return ret;
+    }
+    ret = compressor->uncompress(compressed_buf, page_info.time_compressed_size,
+                                 uncompressed_buf, uncompressed_size);
+    if (heap && compressed_buf != uncompressed_buf) {
+        common::mem_free(compressed_buf);
+    }
+    if (IS_FAIL(ret) || uncompressed_size != page_info.time_uncompressed_size) {
+        if (uncompressed_buf != nullptr) {
+            compressor->after_uncompress(uncompressed_buf);
+        }
+        return E_TSFILE_CORRUPTED;
+    }
+
+    common::ByteStream in;
+    in.wrap_from(uncompressed_buf, uncompressed_size);
+    decoder->reset();
+    const int batch_size = 1024;
+    int64_t batch[batch_size];
+    while (decoder->has_remaining(in)) {
+        int actual = 0;
+        if (RET_FAIL(
+                decoder->read_batch_int64(batch, batch_size, actual, in))) {
+            break;
+        }
+        if (actual == 0) {
+            break;
+        }
+        out_times.insert(out_times.end(), batch, batch + actual);
+    }
+    compressor->after_uncompress(uncompressed_buf);
+    return ret;
+}
+
+int AlignedChunkReader::build_page_plan(Filter* filter) {
+    int ret = E_OK;
+    chunk_pages_.clear();
+    current_page_plan_index_ = 0;
+    current_page_loaded_ = false;
+    page_plan_built_ = false;
+
+    const uint32_t num_cols = value_columns_.size();
+    while (IS_SUCC(ret)) {
+        if (time_chunk_visit_offset_ - time_chunk_header_.serialized_size_ >=
+            time_chunk_header_.data_size_) {
+            break;
+        }
+
+        if (RET_FAIL(get_cur_page_header(
+                time_chunk_meta_, time_in_stream_, cur_time_page_header_,
+                time_chunk_visit_offset_, time_chunk_header_))) {
+            break;
+        }
+        if (cur_time_page_header_.compressed_size_ == 0 &&
+            cur_time_page_header_.uncompressed_size_ == 0) {
+            break;
+        }
+
+        ChunkPageInfo page_info;
+        page_info.time_file_offset = time_chunk_meta_->offset_of_chunk_header_ +
+                                     time_chunk_visit_offset_;
+        page_info.time_compressed_size = cur_time_page_header_.compressed_size_;
+        page_info.time_uncompressed_size =
+            cur_time_page_header_.uncompressed_size_;
+        page_info.value_file_offsets.resize(num_cols);
+        page_info.value_compressed_sizes.resize(num_cols);
+        page_info.value_uncompressed_sizes.resize(num_cols);
+
+        for (uint32_t c = 0; c < num_cols && IS_SUCC(ret); c++) {
+            auto* col = value_columns_[c];
+            if (RET_FAIL(get_cur_page_header(
+                    col->chunk_meta, col->in_stream, col->cur_page_header,
+                    col->chunk_visit_offset, col->chunk_header,
+                    &col->file_data_buf_size))) {
+                break;
+            }
+            page_info.value_file_offsets[c] =
+                col->chunk_meta->offset_of_chunk_header_ +
+                col->chunk_visit_offset;
+            page_info.value_compressed_sizes[c] =
+                col->cur_page_header.compressed_size_;
+            page_info.value_uncompressed_sizes[c] =
+                col->cur_page_header.uncompressed_size_;
+        }
+        if (IS_FAIL(ret)) {
+            break;
+        }
+
+        Statistic* stat = cur_time_page_header_.statistic_;
+        if (filter == nullptr) {
+            page_info.pass_type = PagePassType::FULL_PASS;
+            page_info.row_begin = 0;
+            page_info.row_end = stat != nullptr ? stat->count_ : 0;
+        } else if (stat != nullptr && !filter->satisfy(stat)) {
+            page_info.pass_type = PagePassType::SKIP;
+        } else if (stat != nullptr && filter->contain_start_end_time(
+                                          stat->start_time_, stat->end_time_)) {
+            page_info.pass_type = PagePassType::FULL_PASS;
+            page_info.row_begin = 0;
+            page_info.row_end = stat->count_;
+        } else {
+            page_info.pass_type = PagePassType::BOUNDARY;
+            std::vector<int64_t> times;
+            if (RET_FAIL(decode_time_page_direct(page_info, times))) {
+                break;
+            }
+            int32_t first = -1;
+            int32_t last = -1;
+            for (int32_t i = 0; i < static_cast<int32_t>(times.size()); i++) {
+                if (filter->satisfy_start_end_time(times[i], times[i])) {
+                    if (first < 0) first = i;
+                    last = i;
+                }
+            }
+            if (first >= 0) {
+                page_info.row_begin = first;
+                page_info.row_end = last + 1;
+            } else {
+                page_info.pass_type = PagePassType::SKIP;
+            }
+        }
+
+        if (page_info.pass_type != PagePassType::SKIP) {
+            if (page_info.row_end == 0) {
+                std::vector<int64_t> times;
+                if (RET_FAIL(decode_time_page_direct(page_info, times))) {
+                    break;
+                }
+                page_info.row_end = static_cast<int32_t>(times.size());
+            }
+            if (page_info.row_begin < page_info.row_end) {
+                chunk_pages_.push_back(std::move(page_info));
+            }
+        }
+
+        time_chunk_visit_offset_ += cur_time_page_header_.compressed_size_;
+        time_in_stream_.wrapped_buf_advance_read_pos(
+            cur_time_page_header_.compressed_size_);
+        for (uint32_t c = 0; c < num_cols; c++) {
+            auto* col = value_columns_[c];
+            col->chunk_visit_offset += col->cur_page_header.compressed_size_;
+            col->in_stream.wrapped_buf_advance_read_pos(
+                col->cur_page_header.compressed_size_);
+        }
+    }
+
+    page_plan_built_ = IS_SUCC(ret);
+
+    if (page_plan_built_) {
+        per_page_times_.assign(chunk_pages_.size(), std::vector<int64_t>{});
+        for (auto* col : value_columns_) {
+            col->per_page_state.clear();
+            col->per_page_state.resize(chunk_pages_.size());
+        }
+    }
+    return ret;
+}
+
+void AlignedChunkReader::release_current_page_state() {
+    time_predecoded_ = false;
+    page_all_times_.clear();
+    page_time_count_ = 0;
+    page_time_cursor_ = 0;
+    for (auto* col : value_columns_) {
+        if (col->uncompressed_buf != nullptr && col->compressor != nullptr) {
+            col->compressor->after_uncompress(col->uncompressed_buf);
+            col->uncompressed_buf = nullptr;
+        }
+        col->notnull_bitmap.clear();
+        col->cur_value_index = -1;
+        col->in.reset();
+        for (auto& pps : col->per_page_state) {
+            pps.predecode_pa.destroy();
+        }
+        col->per_page_state.clear();
+        col->pending_decoded_values.clear();
+        col->pending_decoded_count = 0;
+        col->pending_decoded_cursor = 0;
+        col->pending_decoded = false;
+    }
+    per_page_times_.clear();
+    current_page_loaded_ = false;
+}
+
+int AlignedChunkReader::decode_value_page_for_slot(uint32_t col_idx,
+                                                   size_t page_idx) {
+    const ChunkPageInfo& page_info = chunk_pages_[page_idx];
+    auto* col = value_columns_[col_idx];
+    auto& pps = col->per_page_state[page_idx];
+
+    pps.notnull_bitmap.clear();
+    pps.predecoded_values.clear();
+    pps.predecoded_strings.clear();
+    pps.predecoded_read_pos = 0;
+    pps.predecoded_count = 0;
+    pps.predecode_pa.destroy();
+
+    if (page_info.value_compressed_sizes[col_idx] == 0) {
+        return E_OK;
+    }
+
+    char stack_buf[4096];
+    char* compressed_buf = stack_buf;
+    bool heap = page_info.value_compressed_sizes[col_idx] > sizeof(stack_buf);
+    if (heap) {
+        compressed_buf = static_cast<char*>(common::mem_alloc(
+            page_info.value_compressed_sizes[col_idx], common::MOD_DEFAULT));
+        if (compressed_buf == nullptr) return E_OOM;
+    }
+
+    int32_t read_len = 0;
+    int ret =
+        read_file_->read(page_info.value_file_offsets[col_idx], compressed_buf,
+                         page_info.value_compressed_sizes[col_idx], read_len);
+    if (IS_FAIL(ret)) {
+        if (heap) common::mem_free(compressed_buf);
+        return ret;
+    }
+    if (read_len !=
+        static_cast<int32_t>(page_info.value_compressed_sizes[col_idx])) {
+        if (heap) common::mem_free(compressed_buf);
+        return E_TSFILE_CORRUPTED;
+    }
+
+    char* uncompressed_buf = nullptr;
+    uint32_t uncompressed_size = 0;
+    if (RET_FAIL(col->compressor->reset(false))) {
+        if (heap) common::mem_free(compressed_buf);
+        return ret;
+    }
+    ret = col->compressor->uncompress(compressed_buf,
+                                      page_info.value_compressed_sizes[col_idx],
+                                      uncompressed_buf, uncompressed_size);
+    if (heap && compressed_buf != uncompressed_buf) {
+        common::mem_free(compressed_buf);
+    }
+    if (IS_FAIL(ret) ||
+        uncompressed_size != page_info.value_uncompressed_sizes[col_idx]) {
+        if (uncompressed_buf != nullptr) {
+            col->compressor->after_uncompress(uncompressed_buf);
+        }
+        return E_TSFILE_CORRUPTED;
+    }
+    // The value page begins with a uint32 data_num followed by a bitmap of
+    // ceil(data_num/8) bytes; a corrupt or truncated page that doesn't even
+    // hold the data_num header would let read_ui32() walk past the buffer.
+    if (uncompressed_size < sizeof(uint32_t)) {
+        col->compressor->after_uncompress(uncompressed_buf);
+        return E_TSFILE_CORRUPTED;
+    }
+
+    uint32_t offset = 0;
+    uint32_t data_num = SerializationUtil::read_ui32(uncompressed_buf);
+    offset += sizeof(uint32_t);
+    uint32_t bitmap_bytes = (data_num + 7) / 8;
+    if (uncompressed_size - offset < bitmap_bytes) {
+        col->compressor->after_uncompress(uncompressed_buf);
+        return E_TSFILE_CORRUPTED;
+    }
+    pps.notnull_bitmap.resize(bitmap_bytes);
+    for (size_t i = 0; i < pps.notnull_bitmap.size(); i++) {
+        pps.notnull_bitmap[i] = *(uncompressed_buf + offset++);
+    }
+
+    char* value_buf = uncompressed_buf + offset;
+    uint32_t value_buf_size = uncompressed_size - offset;
+    common::ByteStream in;
+    in.wrap_from(value_buf, value_buf_size);
+    col->decoder->reset();
+
+    auto dt = col->chunk_header.data_type_;
+    int nonnull_total = count_non_null_prefix(pps.notnull_bitmap,
+                                              static_cast<int32_t>(data_num));
+    int prefix_nonnull =
+        count_non_null_prefix(pps.notnull_bitmap, page_info.row_begin);
+    pps.predecoded_read_pos = prefix_nonnull;
+
+    auto cleanup = [&]() {
+        col->compressor->after_uncompress(uncompressed_buf);
+    };
+
+    if (dt == common::STRING || dt == common::TEXT || dt == common::BLOB) {
+        pps.predecode_pa.init(512, common::MOD_TSFILE_READER);
+        pps.predecoded_strings.resize(nonnull_total);
+        for (int i = 0; i < nonnull_total; i++) {
+            if (RET_FAIL(col->decoder->read_String(pps.predecoded_strings[i],
+                                                   pps.predecode_pa, in))) {
+                cleanup();
+                return ret;
+            }
+        }
+        pps.predecoded_count = nonnull_total;
+        cleanup();
+        return E_OK;
+    }
+
+    if (nonnull_total == 0) {
+        cleanup();
+        return E_OK;
+    }
+
+    uint32_t elem_size = common::get_data_type_size(dt);
+    pps.predecoded_values.resize(static_cast<size_t>(nonnull_total) *
+                                 elem_size);
+    int actual = 0;
+    switch (dt) {
+        case common::BOOLEAN: {
+            bool* out = reinterpret_cast<bool*>(pps.predecoded_values.data());
+            for (int i = 0; i < nonnull_total; i++) {
+                if (RET_FAIL(col->decoder->read_boolean(out[i], in))) {
+                    cleanup();
+                    return ret;
+                }
+            }
+            actual = nonnull_total;
+            break;
+        }
+        case common::INT32:
+        case common::DATE:
+            if (RET_FAIL(col->decoder->read_batch_int32(
+                    reinterpret_cast<int32_t*>(pps.predecoded_values.data()),
+                    nonnull_total, actual, in))) {
+                cleanup();
+                return ret;
+            }
+            break;
+        case common::INT64:
+        case common::TIMESTAMP:
+            if (RET_FAIL(col->decoder->read_batch_int64(
+                    reinterpret_cast<int64_t*>(pps.predecoded_values.data()),
+                    nonnull_total, actual, in))) {
+                cleanup();
+                return ret;
+            }
+            break;
+        case common::FLOAT:
+            if (RET_FAIL(col->decoder->read_batch_float(
+                    reinterpret_cast<float*>(pps.predecoded_values.data()),
+                    nonnull_total, actual, in))) {
+                cleanup();
+                return ret;
+            }
+            break;
+        case common::DOUBLE:
+            if (RET_FAIL(col->decoder->read_batch_double(
+                    reinterpret_cast<double*>(pps.predecoded_values.data()),
+                    nonnull_total, actual, in))) {
+                cleanup();
+                return ret;
+            }
+            break;
+        default:
+            cleanup();
+            return E_NOT_SUPPORT;
+    }
+    pps.predecoded_count = actual;
+    cleanup();
+    return E_OK;
+}
+
+// Multi-thread path: one task per value column, each decoding all non-SKIP
+// pages of that column serially.  Time pages dispatched as worker-bucketed
+// strided tasks using per-worker decoder/compressor (filled from
+// time_decoder_pool_ / time_compressor_pool_) so they don't contend on the
+// shared time_decoder_/time_compressor_.
+//
+// Single-thread: do NOT pre-decode every page upfront — leave per_page_state
+// empty so the scatter loop decodes on demand and releases after each page
+// (see decode_page_lazy() / release_page_slot()).  Bounds memory to one page.
+int AlignedChunkReader::decode_all_planned_pages() {
+    if (chunk_pages_.empty()) return E_OK;
+
+#ifdef ENABLE_THREADS
+    if (decode_pool_ != nullptr && value_columns_.size() > 1) {
+        // Lazily grow the per-worker time decoder/compressor pool.  Both
+        // factories can return nullptr on OOM/unsupported config; without
+        // checking, the worker task below dereferences null when calling
+        // decode_time_page_with().
+        size_t worker_count = decode_pool_->num_threads();
+        if (time_decoder_pool_.size() < worker_count) {
+            time_decoder_pool_.resize(worker_count, nullptr);
+            time_compressor_pool_.resize(worker_count, nullptr);
+            for (size_t w = 0; w < worker_count; w++) {
+                if (time_decoder_pool_[w] == nullptr) {
+                    time_decoder_pool_[w] =
+                        DecoderFactory::alloc_time_decoder();
+                    if (time_decoder_pool_[w] == nullptr) return E_OOM;
+                }
+                if (time_compressor_pool_[w] == nullptr) {
+                    time_compressor_pool_[w] =
+                        CompressorFactory::alloc_compressor(
+                            time_chunk_header_.compression_type_);
+                    if (time_compressor_pool_[w] == nullptr) return E_OOM;
+                }
+            }
+        }
+
+        std::vector<std::future<void>> futures;
+        std::vector<int> col_rets(value_columns_.size(), E_OK);
+        for (uint32_t c = 0; c < value_columns_.size(); c++) {
+            int* col_ret = &col_rets[c];
+            futures.push_back(decode_pool_->submit([this, c, col_ret]() {
+                for (size_t p = 0; p < chunk_pages_.size(); p++) {
+                    int r = decode_value_page_for_slot(c, p);
+                    if (IS_FAIL(r)) {
+                        *col_ret = r;
+                        return;
+                    }
+                }
+            }));
+        }
+        // Time pages dispatched in worker-sized chunks (one task per worker)
+        // to amortize submit/wait overhead.  Stride for load balance.
+        size_t time_task_count = std::min(worker_count, chunk_pages_.size());
+        std::vector<int> time_rets(time_task_count, E_OK);
+        for (size_t k = 0; k < time_task_count; k++) {
+            int* tr = &time_rets[k];
+            futures.push_back(decode_pool_->submit(
+                [this, k, tr, time_task_count, worker_count]() {
+                    size_t wid = common::ThreadPool::current_worker_id();
+                    if (wid >= worker_count) wid = 0;
+                    Decoder* dec = time_decoder_pool_[wid];
+                    Compressor* comp = time_compressor_pool_[wid];
+                    for (size_t p = k; p < chunk_pages_.size();
+                         p += time_task_count) {
+                        int r = decode_time_page_with(
+                            chunk_pages_[p], per_page_times_[p], dec, comp);
+                        if (IS_FAIL(r)) {
+                            *tr = r;
+                            return;
+                        }
+                    }
+                }));
+        }
+        // Wait on each task's own future rather than draining the whole pool:
+        // it is shared process-wide, so wait_all() would also block on
+        // unrelated concurrent operations' tasks still in flight.
+        for (auto& f : futures) f.get();
+        for (auto r : time_rets) {
+            if (IS_FAIL(r)) return r;
+        }
+        for (uint32_t c = 0; c < value_columns_.size(); c++) {
+            if (IS_FAIL(col_rets[c])) return col_rets[c];
+        }
+        return E_OK;
+    }
+#endif
+    // Single-thread: defer decode to scatter time.
+    return E_OK;
+}
+
+// Decode time + all value columns for a single page slot on demand.
+// Used by the single-thread path to keep memory bounded to one page.
+int AlignedChunkReader::decode_page_lazy(size_t page_idx) {
+    int ret = E_OK;
+    if (RET_FAIL(decode_time_page_direct(chunk_pages_[page_idx],
+                                         per_page_times_[page_idx]))) {
+        return ret;
+    }
+    for (uint32_t c = 0; c < value_columns_.size(); c++) {
+        if (RET_FAIL(decode_value_page_for_slot(c, page_idx))) {
+            return ret;
+        }
+    }
+    return E_OK;
+}
+
+// Release the decoded buffers of one page slot so they can be reused by the
+// next page (keeps memory footprint bounded for the single-thread path).
+void AlignedChunkReader::release_page_slot(size_t page_idx) {
+    std::vector<int64_t>{}.swap(per_page_times_[page_idx]);
+    for (auto* col : value_columns_) {
+        if (page_idx >= col->per_page_state.size()) continue;
+        auto& pps = col->per_page_state[page_idx];
+        std::vector<uint8_t>{}.swap(pps.notnull_bitmap);
+        std::vector<char>{}.swap(pps.predecoded_values);
+        std::vector<common::String>{}.swap(pps.predecoded_strings);
+        pps.predecode_pa.destroy();
+        pps.predecoded_count = 0;
+        pps.predecoded_read_pos = 0;
+    }
+}
+
+int AlignedChunkReader::get_next_page_multi(TsBlock* ret_tsblock,
+                                            Filter* oneshoot_filter,
+                                            PageArena& pa) {
+    int ret = E_OK;
+    Filter* filter =
+        (oneshoot_filter != nullptr ? oneshoot_filter : time_filter_);
+
+    // Dispatch:
+    //   - Multi-column with a thread pool → chunk-level pre-decode: one task
+    //     per value column decodes that column's whole chunk up front, then the
+    //     scatter loop bulk-memcpys.  decode_all_planned_pages() works for any
+    //     column count.  (An earlier cutoff sent >6 columns down the serial
+    //     path because per_page_state — the upfront predecode buffer — grows
+    //     with column count and was feared to thrash cache; it still grows, so
+    //     very wide aligned chunks are the case to watch if reads regress.)
+    //   - Single column, or no thread pool → serial path: decode the current
+    //     page's columns inline (multi_DECODE_TV_BATCH), no thread-pool
+    //     fan-out.
+#ifdef ENABLE_THREADS
+    const bool use_chunk_level =
+        decode_pool_ != nullptr && value_columns_.size() > 1;
+#else
+    const bool use_chunk_level = false;
+#endif
+    if (!use_chunk_level) {
+        return get_next_page_multi_serial(ret_tsblock, filter, pa);
+    }
+
+    if (!page_plan_built_) {
+        if (RET_FAIL(build_page_plan(filter))) {
+            return ret;
+        }
+        if (RET_FAIL(decode_all_planned_pages())) {
+            return ret;
+        }
+    }
+    if (chunk_pages_.empty()) {
+        return E_NO_MORE_DATA;
+    }
+
+    const uint32_t null_mask_base = 1 << 7;
+    const uint32_t num_cols = value_columns_.size();
+    RowAppender row_appender(ret_tsblock);
+    // Detect single-thread lazy mode by whether decode_all_planned_pages left
+    // per_page_times_ empty (it leaves slots empty when there's no pool).
+    const bool single_thread_lazy = per_page_times_[0].empty();
+
+    while (current_page_plan_index_ < chunk_pages_.size()) {
+        const ChunkPageInfo& page_info = chunk_pages_[current_page_plan_index_];
+
+        if (!current_page_loaded_) {
+            if (single_thread_lazy) {
+                if (RET_FAIL(decode_page_lazy(current_page_plan_index_))) {
+                    return ret;
+                }
+            }
+            page_time_cursor_ = page_info.row_begin;
+            page_time_count_ = page_info.row_end;
+            current_page_loaded_ = true;
+        }
+        const std::vector<int64_t>& times =
+            per_page_times_[current_page_plan_index_];
+
+        int32_t remaining_in_page = page_time_count_ - page_time_cursor_;
+        uint32_t budget = row_appender.remaining();
+
+        // Fast path: FULL_PASS page, no nulls in any value column, types
+        // match destination, budget > 0.  Bulk-memcpys up to
+        // min(budget, remaining_in_page) rows from page_time_cursor_; tail
+        // pages of an SSI tsblock still take the memcpy path instead of
+        // falling into the row-by-row scatter loop.
+        bool can_bulk = page_info.pass_type == PagePassType::FULL_PASS &&
+                        remaining_in_page > 0 && budget > 0;
+        if (can_bulk) {
+            for (uint32_t c = 0; c < num_cols; c++) {
+                auto* col = value_columns_[c];
+                auto& pps = col->per_page_state[current_page_plan_index_];
+                auto dt = col->chunk_header.data_type_;
+                if (dt == common::STRING || dt == common::TEXT ||
+                    dt == common::BLOB ||
+                    ret_tsblock->get_vector(c + 1)->get_vector_type() != dt ||
+                    pps.predecoded_count != page_time_count_) {
+                    can_bulk = false;
+                    break;
+                }
+            }
+        }
+
+        if (can_bulk) {
+            uint32_t bulk_count =
+                std::min(budget, static_cast<uint32_t>(remaining_in_page));
+            size_t time_byte_off =
+                static_cast<size_t>(page_time_cursor_) * sizeof(int64_t);
+            // Bulk-append both bytes AND row count for every Vector.
+            // Skipping add_row_nums() would leave each Vector's row_num_
+            // at 0 while the TsBlock-level row_count_ jumped to bulk_count;
+            // fill_trailling_nulls() would then mark every just-written
+            // row as null, and column iterators would report the wrong
+            // length.
+            common::Vector* time_vec = ret_tsblock->get_vector(0);
+            time_vec->get_value_data().append_fixed_value(
+                reinterpret_cast<const char*>(times.data()) + time_byte_off,
+                bulk_count * sizeof(int64_t));
+            time_vec->add_row_nums(bulk_count);
+            for (uint32_t c = 0; c < num_cols; c++) {
+                auto* col = value_columns_[c];
+                auto& pps = col->per_page_state[current_page_plan_index_];
+                uint32_t elem_size =
+                    common::get_data_type_size(col->chunk_header.data_type_);
+                common::Vector* vec = ret_tsblock->get_vector(c + 1);
+                vec->get_value_data().append_fixed_value(
+                    pps.predecoded_values.data() +
+                        static_cast<size_t>(page_time_cursor_) * elem_size,
+                    bulk_count * elem_size);
+                vec->add_row_nums(bulk_count);
+            }
+            row_appender.add_rows(bulk_count);
+            page_time_cursor_ += bulk_count;
+            if (page_time_cursor_ >= page_time_count_) {
+                if (single_thread_lazy) {
+                    release_page_slot(current_page_plan_index_);
+                }
+                current_page_plan_index_++;
+                current_page_loaded_ = false;
+                continue;
+            }
+            // Budget exhausted mid-page; caller will drain and resume.
+            return E_OK;
+        }
+
+        // Slow path: row-by-row.  Handles null bitmap, type promotion,
+        // BOUNDARY pages, and partial-page E_OVERFLOW.
+        // BOUNDARY pages: build_page_plan compressed the page to the
+        // [first-hit, last-hit] range, but timestamps inside that range may
+        // still fail the filter (e.g. TimeIn({2, 8}) leaves 3..7 unmatched).
+        // Re-apply the filter per timestamp here, advancing predecoded
+        // read positions for skipped non-null rows so the cursor stays
+        // aligned with the page's value layout.
+        const bool boundary_filter =
+            page_info.pass_type == PagePassType::BOUNDARY && filter != nullptr;
+        while (page_time_cursor_ < page_time_count_) {
+            if (row_appender.remaining() == 0) {
+                return E_OK;
+            }
+            int64_t ts = times[page_time_cursor_];
+            if (boundary_filter && !filter->satisfy_start_end_time(ts, ts)) {
+                for (uint32_t c = 0; c < num_cols; c++) {
+                    auto* col = value_columns_[c];
+                    auto& pps = col->per_page_state[current_page_plan_index_];
+                    bool is_null = true;
+                    if (!pps.notnull_bitmap.empty()) {
+                        is_null =
+                            ((pps.notnull_bitmap[page_time_cursor_ / 8] &
+                              0xFF) &
+                             (null_mask_base >> (page_time_cursor_ % 8))) == 0;
+                    }
+                    if (!is_null) pps.predecoded_read_pos++;
+                }
+                page_time_cursor_++;
+                continue;
+            }
+            if (UNLIKELY(!row_appender.add_row())) {
+                return E_OK;
+            }
+            row_appender.append(0, reinterpret_cast<char*>(&ts), sizeof(ts));
+
+            for (uint32_t c = 0; c < num_cols; c++) {
+                auto* col = value_columns_[c];
+                auto& pps = col->per_page_state[current_page_plan_index_];
+                bool is_null = true;
+                if (!pps.notnull_bitmap.empty()) {
+                    is_null =
+                        ((pps.notnull_bitmap[page_time_cursor_ / 8] & 0xFF) &
+                         (null_mask_base >> (page_time_cursor_ % 8))) == 0;
+                }
+                if (is_null) {
+                    row_appender.append_null(c + 1);
+                    continue;
+                }
+                if (col->chunk_header.data_type_ == common::STRING ||
+                    col->chunk_header.data_type_ == common::TEXT ||
+                    col->chunk_header.data_type_ == common::BLOB) {
+                    const common::String& value =
+                        pps.predecoded_strings[pps.predecoded_read_pos++];
+                    row_appender.append(c + 1, value.buf_, value.len_);
+                } else {
+                    uint32_t elem_size = common::get_data_type_size(
+                        col->chunk_header.data_type_);
+                    row_appender.append(
+                        c + 1,
+                        pps.predecoded_values.data() +
+                            static_cast<size_t>(pps.predecoded_read_pos++) *
+                                elem_size,
+                        elem_size);
+                }
+            }
+            page_time_cursor_++;
+        }
+
+        if (single_thread_lazy) {
+            release_page_slot(current_page_plan_index_);
+        }
+        current_page_plan_index_++;
+        current_page_loaded_ = false;
+    }
+    return E_NO_MORE_DATA;
+}
+
+int AlignedChunkReader::get_next_page_multi_serial(TsBlock* ret_tsblock,
+                                                   Filter* filter,
+                                                   PageArena& pa) {
+    int ret = E_OK;
+    bool pt = prev_time_page_not_finish();
+    bool pv = prev_any_value_page_not_finish_multi();
+    if (pt && pv) {
+        ret =
+            decode_time_value_buf_into_tsblock_multi(ret_tsblock, filter, &pa);
+        return ret;
+    }
+    if (!pt && !pv) {
+        while (IS_SUCC(ret)) {
+            if (RET_FAIL(get_cur_page_header(
+                    time_chunk_meta_, time_in_stream_, cur_time_page_header_,
+                    time_chunk_visit_offset_, time_chunk_header_))) {
+                break;
+            }
+            for (size_t c = 0; c < value_columns_.size() && IS_SUCC(ret); c++) {
+                auto* col = value_columns_[c];
+                if (RET_FAIL(get_cur_page_header(
+                        col->chunk_meta, col->in_stream, col->cur_page_header,
+                        col->chunk_visit_offset, col->chunk_header,
+                        &col->file_data_buf_size))) {
+                }
+            }
+            if (IS_FAIL(ret)) break;
+            if (cur_page_statisify_filter_multi(filter)) break;
+            if (RET_FAIL(skip_cur_page_multi())) break;
+            if (!has_more_data()) {
+                ret = E_NO_MORE_DATA;
+                break;
+            }
+        }
+        if (IS_SUCC(ret)) {
+            ret = decode_cur_time_page_data();
+            if (IS_SUCC(ret)) ret = decode_cur_value_pages_multi();
+        }
+    }
+    if (IS_SUCC(ret)) {
+        ret =
+            decode_time_value_buf_into_tsblock_multi(ret_tsblock, filter, &pa);
+    }
+    return ret;
+}
+
+bool AlignedChunkReader::cur_page_statisify_filter_multi(Filter* filter) {
+    bool time_satisfy = filter == nullptr ||
+                        cur_time_page_header_.statistic_ == nullptr ||
+                        filter->satisfy(cur_time_page_header_.statistic_);
+    return time_satisfy;
+}
+
+int AlignedChunkReader::skip_cur_page_multi() {
+    time_chunk_visit_offset_ += cur_time_page_header_.compressed_size_;
+    time_in_stream_.wrapped_buf_advance_read_pos(
+        cur_time_page_header_.compressed_size_);
+    for (auto* col : value_columns_) {
+        col->chunk_visit_offset += col->cur_page_header.compressed_size_;
+        col->in_stream.wrapped_buf_advance_read_pos(
+            col->cur_page_header.compressed_size_);
+    }
+    return E_OK;
+}
+
+int AlignedChunkReader::decode_cur_value_pages_multi() {
+    int ret = E_OK;
+    // Phase 1: Serial IO — ensure each column's page data is in memory.
+    for (size_t c = 0; c < value_columns_.size() && IS_SUCC(ret); c++) {
+        ret = ensure_value_page_loaded(*value_columns_[c]);
+    }
+    if (IS_FAIL(ret)) return ret;
+
+    // Phase 2: decompress + parse bitmap + reset decoder for each column's
+    // current page, inline.  This serial path now only runs for single-column
+    // reads or when no thread pool exists — multi-column reads with a pool take
+    // the chunk-level path (decode_all_planned_pages), so there is no per-page
+    // thread-pool fan-out here anymore.  predecode=false lets the scatter loop
+    // (multi_DECODE_TV_BATCH) decode inline, which has better cache locality
+    // when there is no parallelism to amortize an extra predecode buffer write.
+    for (size_t c = 0; c < value_columns_.size() && IS_SUCC(ret); c++) {
+        ret = decompress_and_parse_value_page(*value_columns_[c], false);
+    }
+    return ret;
+}
+
+int AlignedChunkReader::decode_cur_value_page_data_for(ValueColumnState& col) {
+    int ret = E_OK;
+
+    // Step 1: ensure full page data is loaded
+    if (col.in_stream.remaining_size() < col.cur_page_header.compressed_size_) {
+        if (RET_FAIL(read_from_file_and_rewrap(
+                col.in_stream, col.chunk_meta, col.chunk_visit_offset,
+                col.file_data_buf_size,
+                col.cur_page_header.compressed_size_))) {
+            return ret;
+        }
+    }
+
+    if (col.cur_page_header.compressed_size_ == 0) {
+        col.in.wrap_from(nullptr, 0);
+        return E_OK;
+    }
+
+    // Step 2: uncompress
+    char* compressed_buf =
+        col.in_stream.get_wrapped_buf() + col.in_stream.read_pos();
+    uint32_t compressed_size = col.cur_page_header.compressed_size_;
+    col.in_stream.wrapped_buf_advance_read_pos(compressed_size);
+    col.chunk_visit_offset += compressed_size;
+
+    char* uncompressed_buf = nullptr;
+    uint32_t uncompressed_size = 0;
+    if (RET_FAIL(col.compressor->reset(false))) {
+        return ret;
+    }
+    if (RET_FAIL(col.compressor->uncompress(compressed_buf, compressed_size,
+                                            uncompressed_buf,
+                                            uncompressed_size))) {
+        return ret;
+    }
+    col.uncompressed_buf = uncompressed_buf;
+
+    if (uncompressed_size != col.cur_page_header.uncompressed_size_) {
+        return E_TSFILE_CORRUPTED;
+    }
+
+    // Step 3: parse bitmap + value data
+    if (uncompressed_size < sizeof(uint32_t)) return E_TSFILE_CORRUPTED;
+    uint32_t offset = 0;
+    uint32_t data_num = SerializationUtil::read_ui32(uncompressed_buf);
+    offset += sizeof(uint32_t);
+    uint32_t bitmap_bytes = (data_num + 7) / 8;
+    if (uncompressed_size - offset < bitmap_bytes) return E_TSFILE_CORRUPTED;
+    col.notnull_bitmap.resize(bitmap_bytes);
+    for (size_t i = 0; i < col.notnull_bitmap.size(); i++) {
+        col.notnull_bitmap[i] = *(uncompressed_buf + offset);
+        offset++;
+    }
+    col.cur_value_index = -1;
+
+    char* value_buf = uncompressed_buf + offset;
+    uint32_t value_buf_size = uncompressed_size - offset;
+    col.decoder->reset();
+    col.in.wrap_from(value_buf, value_buf_size);
+    return ret;
+}
+
+int AlignedChunkReader::ensure_value_page_loaded(ValueColumnState& col) {
+    int ret = E_OK;
+    if (col.in_stream.remaining_size() < col.cur_page_header.compressed_size_) {
+        if (RET_FAIL(read_from_file_and_rewrap(
+                col.in_stream, col.chunk_meta, col.chunk_visit_offset,
+                col.file_data_buf_size,
+                col.cur_page_header.compressed_size_))) {
+            return ret;
+        }
+    }
+    return ret;
+}
+
+int AlignedChunkReader::decompress_and_parse_value_page(ValueColumnState& col,
+                                                        bool predecode) {
+    int ret = E_OK;
+
+    if (col.cur_page_header.compressed_size_ == 0) {
+        col.in.wrap_from(nullptr, 0);
+        return E_OK;
+    }
+
+    // Decompress
+    char* compressed_buf =
+        col.in_stream.get_wrapped_buf() + col.in_stream.read_pos();
+    uint32_t compressed_size = col.cur_page_header.compressed_size_;
+    col.in_stream.wrapped_buf_advance_read_pos(compressed_size);
+    col.chunk_visit_offset += compressed_size;
+
+    char* uncompressed_buf = nullptr;
+    uint32_t uncompressed_size = 0;
+    if (RET_FAIL(col.compressor->reset(false))) {
+        return ret;
+    }
+    if (RET_FAIL(col.compressor->uncompress(compressed_buf, compressed_size,
+                                            uncompressed_buf,
+                                            uncompressed_size))) {
+        return ret;
+    }
+    col.uncompressed_buf = uncompressed_buf;
+
+    if (uncompressed_size != col.cur_page_header.uncompressed_size_) {
+        return E_TSFILE_CORRUPTED;
+    }
+
+    // Parse bitmap + value data
+    if (uncompressed_size < sizeof(uint32_t)) return E_TSFILE_CORRUPTED;
+    uint32_t offset = 0;
+    uint32_t data_num = SerializationUtil::read_ui32(uncompressed_buf);
+    offset += sizeof(uint32_t);
+    uint32_t bitmap_bytes = (data_num + 7) / 8;
+    if (uncompressed_size - offset < bitmap_bytes) return E_TSFILE_CORRUPTED;
+    col.notnull_bitmap.resize(bitmap_bytes);
+    for (size_t i = 0; i < col.notnull_bitmap.size(); i++) {
+        col.notnull_bitmap[i] = *(uncompressed_buf + offset);
+        offset++;
+    }
+    col.cur_value_index = -1;
+
+    char* value_buf = uncompressed_buf + offset;
+    uint32_t value_buf_size = uncompressed_size - offset;
+    col.decoder->reset();
+    col.in.wrap_from(value_buf, value_buf_size);
+
+    // Pre-decode all non-null values into pending_decoded_values so the
+    // scatter loop (multi_DECODE_TV_BATCH) just memcpys instead of calling
+    // the decoder.  Moves the expensive int64/double decode into the worker
+    // task so it runs in parallel.  Only handles fixed-length types — strings
+    // stay on the inline-decode path.
+    col.pending_decoded = false;
+    col.pending_decoded_count = 0;
+    col.pending_decoded_cursor = 0;
+    auto dt = col.chunk_header.data_type_;
+    if (predecode && dt != common::STRING && dt != common::TEXT &&
+        dt != common::BLOB) {
+        int nonnull_total = 0;
+        for (uint32_t i = 0; i < data_num; i++) {
+            if ((col.notnull_bitmap[i / 8] & (0x80 >> (i % 8))) != 0) {
+                nonnull_total++;
+            }
+        }
+        if (nonnull_total > 0) {
+            uint32_t elem_size = common::get_data_type_size(dt);
+            col.pending_decoded_values.resize(
+                static_cast<size_t>(nonnull_total) * elem_size);
+            int actual = 0;
+            int rret = common::E_OK;
+            switch (dt) {
+                case common::BOOLEAN: {
+                    bool* out = reinterpret_cast<bool*>(
+                        col.pending_decoded_values.data());
+                    for (int i = 0; i < nonnull_total; i++) {
+                        bool v;
+                        if (col.decoder->read_boolean(v, col.in) !=
+                            common::E_OK) {
+                            rret = common::E_OUT_OF_RANGE;
+                            break;
+                        }
+                        out[i] = v;
+                    }
+                    actual = nonnull_total;
+                    break;
+                }
+                case common::INT32:
+                case common::DATE:
+                    rret = col.decoder->read_batch_int32(
+                        reinterpret_cast<int32_t*>(
+                            col.pending_decoded_values.data()),
+                        nonnull_total, actual, col.in);
+                    break;
+                case common::INT64:
+                case common::TIMESTAMP:
+                    rret = col.decoder->read_batch_int64(
+                        reinterpret_cast<int64_t*>(
+                            col.pending_decoded_values.data()),
+                        nonnull_total, actual, col.in);
+                    break;
+                case common::FLOAT:
+                    rret = col.decoder->read_batch_float(
+                        reinterpret_cast<float*>(
+                            col.pending_decoded_values.data()),
+                        nonnull_total, actual, col.in);
+                    break;
+                case common::DOUBLE:
+                    rret = col.decoder->read_batch_double(
+                        reinterpret_cast<double*>(
+                            col.pending_decoded_values.data()),
+                        nonnull_total, actual, col.in);
+                    break;
+                default:
+                    rret = common::E_OUT_OF_RANGE;
+            }
+            if (rret == common::E_OK && actual == nonnull_total) {
+                col.pending_decoded_count = nonnull_total;
+                col.pending_decoded = true;
+            }
+        } else {
+            col.pending_decoded = true;  // empty page is trivially predecoded
+        }
+    }
+    return ret;
+}
+
+int AlignedChunkReader::decode_time_value_buf_into_tsblock_multi(
+    TsBlock*& ret_tsblock, Filter* filter, PageArena* pa) {
+    int ret = E_OK;
+    RowAppender row_appender(ret_tsblock);
+    ret = multi_DECODE_TV_BATCH(ret_tsblock, row_appender, filter, pa);
+
+    // Release uncompressed buffers if pages are done
+    if (ret != E_OVERFLOW) {
+        if (time_uncompressed_buf_ != nullptr) {
+            time_compressor_->after_uncompress(time_uncompressed_buf_);
+            time_uncompressed_buf_ = nullptr;
+        }
+        for (auto* col : value_columns_) {
+            if (col->uncompressed_buf != nullptr) {
+                col->compressor->after_uncompress(col->uncompressed_buf);
+                col->uncompressed_buf = nullptr;
+            }
+            // The time stream and bitmap define the page's row/value count.
+            // Once the page is fully processed, bytes left in an all-null
+            // value stream are only encoder terminators or padding and must
+            // not make has_more_data_multi() treat the page as unfinished.
+            col->in.reset();
+            col->notnull_bitmap.clear();
+            col->notnull_bitmap.shrink_to_fit();
+        }
+        if (!prev_time_page_not_finish()) {
+            time_in_.reset();
+        }
+    } else {
+        ret = E_OK;
+    }
+    return ret;
+}
+
+int AlignedChunkReader::multi_DECODE_TV_BATCH(TsBlock* ret_tsblock,
+                                              RowAppender& row_appender,
+                                              Filter* filter, PageArena* pa) {
+    int ret = E_OK;
+    const int BATCH = 129;
+    int64_t times[BATCH];
+    const uint32_t null_mask_base = 1 << 7;
+    const uint32_t num_cols = value_columns_.size();
+
+    while (time_decoder_->has_remaining(time_in_)) {
+        // Cap each pass to what the appender can still hold; mirrors the fix
+        // in ChunkReader's per-type batch loops.  A blanket "remaining < BATCH
+        // → E_OVERFLOW" made progress impossible whenever the caller handed
+        // us a TsBlock with capacity below BATCH (e.g. small per-block sizes
+        // in multi-chunk queries).
+        int eff_batch =
+            std::min(BATCH, static_cast<int>(row_appender.remaining()));
+        if (eff_batch <= 0) {
+            ret = E_OVERFLOW;
+            break;
+        }
+
+        // ── Phase 1: Decode a batch of timestamps ──
+        int time_count = 0;
+        if (RET_FAIL(time_decoder_->read_batch_int64(times, eff_batch,
+                                                     time_count, time_in_))) {
+            break;
+        }
+        if (time_count == 0) break;
+
+        // ── Phase 2: Apply time filter ──
+        bool time_mask[BATCH];
+        bool block_all_pass = (filter == nullptr);
+        int pass_count = time_count;
+        if (!block_all_pass) {
+            pass_count =
+                filter->satisfy_batch_time(times, time_count, time_mask);
+        }
+
+        // ── Phase 3: Per-column null check + value decode ──
+        // For each column, compute null flags and decode non-null values.
+        // We store decoded values in column-specific buffers.
+        // Max 8 bytes per value, 129 values per batch.
+        struct ColBatch {
+            bool is_null[BATCH];
+            int nonnull_count;
+            // Value buffer for fixed-width types — up to 129 * 8 bytes
+            char val_buf[BATCH * 8];
+            int val_count;
+            // Variable-length values for STRING/TEXT/BLOB columns.  Only
+            // populated when the column's data_type_ is variable; their
+            // bufs are owned by the caller-provided PageArena.
+            std::vector<common::String> str_vals;
+        };
+        // Allocate on heap if many columns, stack for small counts
+        std::vector<ColBatch> col_batches(num_cols);
+
+        for (uint32_t c = 0; c < num_cols; c++) {
+            auto* col = value_columns_[c];
+            auto& cb = col_batches[c];
+            cb.nonnull_count = 0;
+            cb.val_count = 0;
+            for (int i = 0; i < time_count; i++) {
+                int vi = col->cur_value_index + 1 + i;
+                if (col->notnull_bitmap.empty() ||
+                    ((col->notnull_bitmap[vi / 8] & 0xFF) &
+                     (null_mask_base >> (vi % 8))) == 0) {
+                    cb.is_null[i] = true;
+                } else {
+                    cb.is_null[i] = false;
+                    cb.nonnull_count++;
+                }
+            }
+
+            // Skip values if no rows pass time filter.  Skip/read errors and
+            // short reads (decoder returned fewer values than the bitmap
+            // promised) must abort; otherwise the input stream is left
+            // mid-value and later batches would decode garbage from
+            // misaligned bytes.
+            if (pass_count == 0 && cb.nonnull_count > 0) {
+                int dret = common::E_OK;
+                int sk = 0;
+                switch (col->chunk_header.data_type_) {
+                    case common::BOOLEAN: {
+                        bool dummy;
+                        for (sk = 0; sk < cb.nonnull_count; sk++) {
+                            dret = col->decoder->read_boolean(dummy, col->in);
+                            if (dret != common::E_OK) break;
+                        }
+                        break;
+                    }
+                    case common::INT32:
+                    case common::DATE:
+                        dret = col->decoder->skip_int32(cb.nonnull_count, sk,
+                                                        col->in);
+                        break;
+                    case common::INT64:
+                    case common::TIMESTAMP:
+                        dret = col->decoder->skip_int64(cb.nonnull_count, sk,
+                                                        col->in);
+                        break;
+                    case common::FLOAT:
+                        dret = col->decoder->skip_float(cb.nonnull_count, sk,
+                                                        col->in);
+                        break;
+                    case common::DOUBLE:
+                        dret = col->decoder->skip_double(cb.nonnull_count, sk,
+                                                         col->in);
+                        break;
+                    case common::STRING:
+                    case common::TEXT:
+                    case common::BLOB: {
+                        // The decoder has no fast skip for var-length strings;
+                        // reading + discarding is the only way to advance the
+                        // input stream past the row's payload.
+                        common::String tmp;
+                        for (sk = 0; sk < cb.nonnull_count; sk++) {
+                            dret = col->decoder->read_String(tmp, *pa, col->in);
+                            if (dret != common::E_OK) break;
+                        }
+                        break;
+                    }
+                    default:
+                        ret = E_TSFILE_CORRUPTED;
+                        break;
+                }
+                if (ret != common::E_OK) break;
+                if (dret != common::E_OK) {
+                    ret = dret;
+                    break;
+                }
+                if (sk != cb.nonnull_count) {
+                    ret = E_TSFILE_CORRUPTED;
+                    break;
+                }
+                cb.nonnull_count = 0;  // bytes consumed cleanly
+            }
+
+            // Decode non-null values.  Fast path: values were predecoded
+            // into col->pending_decoded_values by the parallel worker — just
+            // memcpy the slice for this batch.  Fallback: call the decoder
+            // inline (used for STRING/TEXT/BLOB and when predecode was
+            // skipped).
+            if (cb.nonnull_count > 0) {
+                if (col->pending_decoded) {
+                    uint32_t elem_size = common::get_data_type_size(
+                        col->chunk_header.data_type_);
+                    memcpy(
+                        cb.val_buf,
+                        col->pending_decoded_values.data() +
+                            static_cast<size_t>(col->pending_decoded_cursor) *
+                                elem_size,
+                        static_cast<size_t>(cb.nonnull_count) * elem_size);
+                    col->pending_decoded_cursor += cb.nonnull_count;
+                    cb.val_count = cb.nonnull_count;
+                } else {
+                    int dret = common::E_OK;
+                    switch (col->chunk_header.data_type_) {
+                        case common::BOOLEAN: {
+                            bool* out = reinterpret_cast<bool*>(cb.val_buf);
+                            cb.val_count = 0;
+                            for (int s = 0; s < cb.nonnull_count; s++) {
+                                bool v;
+                                dret = col->decoder->read_boolean(v, col->in);
+                                if (dret != common::E_OK) break;
+                                out[cb.val_count++] = v;
+                            }
+                            break;
+                        }
+                        case common::INT32:
+                        case common::DATE:
+                            dret = col->decoder->read_batch_int32(
+                                reinterpret_cast<int32_t*>(cb.val_buf),
+                                cb.nonnull_count, cb.val_count, col->in);
+                            break;
+                        case common::INT64:
+                        case common::TIMESTAMP:
+                            dret = col->decoder->read_batch_int64(
+                                reinterpret_cast<int64_t*>(cb.val_buf),
+                                cb.nonnull_count, cb.val_count, col->in);
+                            break;
+                        case common::FLOAT:
+                            dret = col->decoder->read_batch_float(
+                                reinterpret_cast<float*>(cb.val_buf),
+                                cb.nonnull_count, cb.val_count, col->in);
+                            break;
+                        case common::DOUBLE:
+                            dret = col->decoder->read_batch_double(
+                                reinterpret_cast<double*>(cb.val_buf),
+                                cb.nonnull_count, cb.val_count, col->in);
+                            break;
+                        case common::STRING:
+                        case common::TEXT:
+                        case common::BLOB: {
+                            // Variable-length payload doesn't fit in
+                            // cb.val_buf; pull each value into str_vals and
+                            // let the scatter loop index by val_count.
+                            cb.str_vals.resize(cb.nonnull_count);
+                            cb.val_count = 0;
+                            for (int s = 0; s < cb.nonnull_count; s++) {
+                                dret = col->decoder->read_String(cb.str_vals[s],
+                                                                 *pa, col->in);
+                                if (dret != common::E_OK) break;
+                                cb.val_count++;
+                            }
+                            break;
+                        }
+                        default:
+                            break;
+                    }
+                    // Any decoder error, or a short decode that produced
+                    // fewer values than the bitmap promised, indicates a
+                    // corrupt page; propagate immediately so the scatter
+                    // loop doesn't read uninitialised cb.val_buf bytes.
+                    if (dret != common::E_OK) {
+                        ret = dret;
+                        break;
+                    }
+                    if (col->chunk_header.data_type_ != common::STRING &&
+                        col->chunk_header.data_type_ != common::TEXT &&
+                        col->chunk_header.data_type_ != common::BLOB &&
+                        cb.val_count != cb.nonnull_count) {
+                        ret = E_TSFILE_CORRUPTED;
+                        break;
+                    }
+                }
+            }
+        }
+        if (ret != E_OK) break;
+
+        // ── Phase 4: Skip if no rows pass ──
+        if (pass_count == 0) {
+            for (uint32_t c = 0; c < num_cols; c++) {
+                value_columns_[c]->cur_value_index += time_count;
+            }
+            continue;
+        }
+
+        // ── Phase 5: Scatter into TsBlock ──
+
+        // Fast path: all rows pass filter AND all columns have no nulls
+        // → batch memcpy directly into Vector buffers.  STRING/TEXT/BLOB
+        // columns have variable-width payload and live in cb.str_vals, not
+        // cb.val_buf, so they must take the slow scatter path.
+        if (pass_count == time_count) {
+            bool all_nonnull = true;
+            for (uint32_t c = 0; c < num_cols; c++) {
+                auto dt = value_columns_[c]->chunk_header.data_type_;
+                if (col_batches[c].nonnull_count != time_count ||
+                    dt == common::STRING || dt == common::TEXT ||
+                    dt == common::BLOB) {
+                    all_nonnull = false;
+                    break;
+                }
+            }
+            if (all_nonnull) {
+                // Batch append time column (bytes + row count); see the
+                // chunk-level bulk path above for why add_row_nums() is
+                // required alongside append_fixed_value().
+                common::Vector* time_vec = ret_tsblock->get_vector(0);
+                time_vec->get_value_data().append_fixed_value(
+                    (const char*)times,
+                    static_cast<uint32_t>(time_count) * sizeof(int64_t));
+                time_vec->add_row_nums(static_cast<uint32_t>(time_count));
+                // Batch append each value column
+                for (uint32_t c = 0; c < num_cols; c++) {
+                    auto& cb = col_batches[c];
+                    auto* col = value_columns_[c];
+                    uint32_t elem_size = common::get_data_type_size(
+                        col->chunk_header.data_type_);
+                    common::Vector* vec = ret_tsblock->get_vector(c + 1);
+                    vec->get_value_data().append_fixed_value(
+                        cb.val_buf,
+                        static_cast<uint32_t>(cb.val_count) * elem_size);
+                    vec->add_row_nums(static_cast<uint32_t>(cb.val_count));
+                    col->cur_value_index += time_count;
+                }
+                row_appender.add_rows(static_cast<uint32_t>(time_count));
+                continue;
+            }
+        }
+
+        // Slow path: per-row scatter (has filter or has nulls or strings)
+        std::vector<int> val_idx(num_cols, 0);
+
+        for (int i = 0; i < time_count; i++) {
+            bool passes = block_all_pass || time_mask[i];
+
+            if (!passes) {
+                for (uint32_t c = 0; c < num_cols; c++) {
+                    value_columns_[c]->cur_value_index++;
+                    if (!col_batches[c].is_null[i]) val_idx[c]++;
+                }
+                continue;
+            }
+
+            if (UNLIKELY(!row_appender.add_row())) {
+                ret = E_OVERFLOW;
+                break;
+            }
+
+            row_appender.append(0, (char*)&times[i], sizeof(int64_t));
+
+            for (uint32_t c = 0; c < num_cols; c++) {
+                value_columns_[c]->cur_value_index++;
+                auto& cb = col_batches[c];
+                auto* col = value_columns_[c];
+
+                if (cb.is_null[i]) {
+                    row_appender.append_null(c + 1);
+                } else {
+                    auto dt = col->chunk_header.data_type_;
+                    if (dt == common::STRING || dt == common::TEXT ||
+                        dt == common::BLOB) {
+                        const common::String& sv = cb.str_vals[val_idx[c]];
+                        row_appender.append(c + 1, sv.buf_, sv.len_);
+                    } else {
+                        uint32_t elem_size = common::get_data_type_size(dt);
+                        row_appender.append(c + 1,
+                                            cb.val_buf + val_idx[c] * elem_size,
+                                            elem_size);
+                    }
+                    val_idx[c]++;
+                }
+            }
+        }
+        if (ret != E_OK) break;
     }
     return ret;
 }
 
-}  // end namespace storage
\ No newline at end of file
+}  // end namespace storage
diff --git a/cpp/src/reader/aligned_chunk_reader.h b/cpp/src/reader/aligned_chunk_reader.h
index 91281215e..69ce48f4a 100644
--- a/cpp/src/reader/aligned_chunk_reader.h
+++ b/cpp/src/reader/aligned_chunk_reader.h
@@ -28,8 +28,70 @@
 #include "reader/filter/filter.h"
 #include "reader/ichunk_reader.h"
 
+#ifdef ENABLE_THREADS
+namespace common {
+class ThreadPool;
+}
+#endif
+
 namespace storage {
 
+// Page classification for chunk-level parallel decode.
+enum class PagePassType { SKIP, FULL_PASS, BOUNDARY };
+
+// Metadata collected per page during the chunk scan phase.
+struct ChunkPageInfo {
+    PagePassType pass_type = PagePassType::SKIP;
+    // File offsets of compressed data for time and each value column.
+    int64_t time_file_offset = 0;
+    uint32_t time_compressed_size = 0;
+    uint32_t time_uncompressed_size = 0;
+    int32_t row_begin = 0;  // inclusive
+    int32_t row_end = 0;    // exclusive
+    std::vector<int64_t> value_file_offsets;
+    std::vector<uint32_t> value_compressed_sizes;
+    std::vector<uint32_t> value_uncompressed_sizes;
+};
+
+// Decoded state for one (column, page) slot.  Populated by chunk-level
+// parallel decode; consumed by the scatter loop.
+struct PageDecodedState {
+    std::vector<uint8_t> notnull_bitmap;
+    std::vector<char> predecoded_values;
+    std::vector<common::String> predecoded_strings;
+    common::PageArena predecode_pa;
+    int32_t predecoded_count = 0;
+    int32_t predecoded_read_pos = 0;
+};
+
+// Per-value-column state for multi-value AlignedChunkReader.
+struct ValueColumnState {
+    ChunkMeta* chunk_meta = nullptr;
+    ChunkHeader chunk_header;
+    Decoder* decoder = nullptr;
+    Compressor* compressor = nullptr;
+    common::ByteStream in_stream;  // raw data from file
+    common::ByteStream in;         // decompressed data
+    char* uncompressed_buf = nullptr;
+    int32_t file_data_buf_size = 0;
+    uint32_t chunk_visit_offset = 0;
+    PageHeader cur_page_header;
+    std::vector<uint8_t> notnull_bitmap;
+    int32_t cur_value_index = -1;
+
+    // Per-page decoded state for chunk-level parallel decode.
+    std::vector<PageDecodedState> per_page_state;
+
+    // Pre-decoded value buffer for the CURRENT page, filled by
+    // decompress_and_parse_value_page when the dense-multi path predecodes
+    // values in worker threads.  Consumed by multi_DECODE_TV_BATCH instead of
+    // calling the decoder inline.  Holds nonnull values only.
+    std::vector<char> pending_decoded_values;
+    int32_t pending_decoded_count = 0;
+    int32_t pending_decoded_cursor = 0;
+    bool pending_decoded = false;
+};
+
 class AlignedChunkReader : public IChunkReader {
    public:
     AlignedChunkReader()
@@ -64,11 +126,13 @@ class AlignedChunkReader : public IChunkReader {
     ~AlignedChunkReader() override = default;
 
     bool has_more_data() const override {
-        return prev_value_page_not_finish() ||
+        if (multi_value_mode_) {
+            return has_more_data_multi();
+        }
+        return prev_value_page_not_finish() || prev_time_page_not_finish() ||
                (value_chunk_visit_offset_ -
                     value_chunk_header_.serialized_size_ <
                 value_chunk_header_.data_size_) ||
-               prev_time_page_not_finish() ||
                (time_chunk_visit_offset_ - time_chunk_header_.serialized_size_ <
                 time_chunk_header_.data_size_);
     }
@@ -76,13 +140,36 @@ class AlignedChunkReader : public IChunkReader {
     int load_by_aligned_meta(ChunkMeta* time_meta,
                              ChunkMeta* value_meta) override;
 
+    // Multi-value: load one time chunk + N value chunks.
+    int load_by_aligned_meta_multi(ChunkMeta* time_meta,
+                                   const std::vector<ChunkMeta*>& value_metas);
+
     int get_next_page(common::TsBlock* tsblock, Filter* oneshoot_filter,
                       common::PageArena& pa) override;
-
     int get_next_page(common::TsBlock* tsblock, Filter* oneshoot_filter,
                       common::PageArena& pa, int64_t min_time_hint,
                       int& row_offset, int& row_limit) override;
 
+    // Multi-value: get the number of value columns.
+    uint32_t get_value_column_count() const {
+        return multi_value_mode_ ? value_columns_.size() : 1;
+    }
+
+    // Multi-value: get chunk header for a specific value column.
+    ChunkHeader& get_value_chunk_header(uint32_t col) {
+        if (multi_value_mode_ && col < value_columns_.size()) {
+            return value_columns_[col]->chunk_header;
+        }
+        return value_chunk_header_;
+    }
+
+    bool is_multi_value_mode() const { return multi_value_mode_; }
+
+#ifdef ENABLE_THREADS
+    // Set external thread pool for parallel decode (not owned).
+    void set_decode_pool(common::ThreadPool* pool) { decode_pool_ = pool; }
+#endif
+
    private:
     bool should_skip_page_by_time(int64_t min_time_hint);
     bool should_skip_page_by_offset(int& row_offset);
@@ -100,7 +187,8 @@ class AlignedChunkReader : public IChunkReader {
                             common::ByteStream& in_stream_,
                             PageHeader& cur_page_header_,
                             uint32_t& chunk_visit_offset,
-                            ChunkHeader& chunk_header);
+                            ChunkHeader& chunk_header,
+                            int32_t* override_buf_size = nullptr);
     int read_from_file_and_rewrap(common::ByteStream& in_stream_,
                                   ChunkMeta*& chunk_meta,
                                   uint32_t& chunk_visit_offset,
@@ -114,6 +202,7 @@ class AlignedChunkReader : public IChunkReader {
                                            Filter* filter,
                                            common::PageArena* pa);
     bool prev_time_page_not_finish() const {
+        if (time_predecoded_) return page_time_cursor_ < page_time_count_;
         return (time_decoder_ && time_decoder_->has_remaining(time_in_)) ||
                time_in_.has_remaining();
     }
@@ -132,58 +221,119 @@ class AlignedChunkReader : public IChunkReader {
                                          common::ByteStream& value_in,
                                          common::RowAppender& row_appender,
                                          Filter* filter);
+    int i32_DECODE_TV_BATCH(common::ByteStream& time_in,
+                            common::ByteStream& value_in,
+                            common::RowAppender& row_appender, Filter* filter);
+    int i64_DECODE_TV_BATCH(common::ByteStream& time_in,
+                            common::ByteStream& value_in,
+                            common::RowAppender& row_appender, Filter* filter);
+    int float_DECODE_TV_BATCH(common::ByteStream& time_in,
+                              common::ByteStream& value_in,
+                              common::RowAppender& row_appender,
+                              Filter* filter);
+    int double_DECODE_TV_BATCH(common::ByteStream& time_in,
+                               common::ByteStream& value_in,
+                               common::RowAppender& row_appender,
+                               Filter* filter);
     int STRING_DECODE_TYPED_TV_INTO_TSBLOCK(common::ByteStream& time_in,
                                             common::ByteStream& value_in,
                                             common::RowAppender& row_appender,
                                             common::PageArena& pa,
                                             Filter* filter);
 
+    // ── Multi-value private methods (page-level, serial fallback) ────────
+    bool has_more_data_multi() const;
+    bool prev_any_value_page_not_finish_multi() const;
+    int get_next_page_multi(common::TsBlock* ret_tsblock,
+                            Filter* oneshoot_filter, common::PageArena& pa);
+    int get_next_page_multi_serial(common::TsBlock* ret_tsblock, Filter* filter,
+                                   common::PageArena& pa);
+    int skip_cur_page_multi();
+    bool cur_page_statisify_filter_multi(Filter* filter);
+    int decode_cur_value_pages_multi();
+    int decode_cur_value_page_data_for(ValueColumnState& col);
+    int ensure_value_page_loaded(ValueColumnState& col);
+    static int decompress_and_parse_value_page(ValueColumnState& col,
+                                               bool predecode);
+    void predecode_all_timestamps();
+    int decode_time_value_buf_into_tsblock_multi(common::TsBlock*& ret_tsblock,
+                                                 Filter* filter,
+                                                 common::PageArena* pa);
+    int multi_DECODE_TV_BATCH(common::TsBlock* ret_tsblock,
+                              common::RowAppender& row_appender, Filter* filter,
+                              common::PageArena* pa);
+    int build_page_plan(Filter* filter);
+    int decode_time_page_direct(const ChunkPageInfo& page_info,
+                                std::vector<int64_t>& out_times);
+    int decode_time_page_with(const ChunkPageInfo& page_info,
+                              std::vector<int64_t>& out_times, Decoder* decoder,
+                              Compressor* compressor);
+    int decode_all_planned_pages();
+    int decode_value_page_for_slot(uint32_t col_idx, size_t page_idx);
+    int decode_page_lazy(size_t page_idx);
+    void release_page_slot(size_t page_idx);
+    void release_current_page_state();
+    bool has_variable_length_value_column() const;
+    int count_non_null_prefix(const std::vector<uint8_t>& bitmap,
+                              int32_t row_limit) const;
+
    private:
     ReadFile* read_file_;
+    // ── Single-value mode fields (kept for backward compat) ──────────────
     ChunkMeta* time_chunk_meta_;
     ChunkMeta* value_chunk_meta_;
     common::String measurement_name_;
     ChunkHeader time_chunk_header_;
-    // TODO: support reading more than one measurement in AlignedChunkReader.
     ChunkHeader value_chunk_header_;
     PageHeader cur_time_page_header_;
     PageHeader cur_value_page_header_;
 
-    /*
-     * Data reader from file is stored in @in_stream_, and the size
-     * is stored in @file_data_buf_size_. Note, in_stream_.total_size_
-     * is used to limit deserialization, that is why we still have
-     * @file_data_buf_size_.
-     *
-     * Since we may want keep data of current page (and page header
-     * of next page) in memory, we need a byte-size cursor to tell
-     * us which byte we are processing, so we have @chunk_visit_offset_
-     * it refer to position from the start of chunk_header_,
-     * also refer to offset within the chunk (including chunk header).
-     * It advanced by step of a page header or a page tv data.
-     */
-    common::ByteStream time_in_stream_{common::MOD_CHUNK_READER};
-    common::ByteStream value_in_stream_{common::MOD_CHUNK_READER};
+    common::ByteStream time_in_stream_;
+    common::ByteStream value_in_stream_;
     int32_t file_data_time_buf_size_;
     int32_t file_data_value_buf_size_;
     uint32_t time_chunk_visit_offset_;
     uint32_t value_chunk_visit_offset_;
 
-    // Statistic *page_statistic_;
     Compressor* time_compressor_;
     Compressor* value_compressor_;
     Filter* time_filter_;
 
     Decoder* time_decoder_;
     Decoder* value_decoder_;
-    common::ByteStream time_in_{common::MOD_CHUNK_READER};
-    common::ByteStream value_in_{common::MOD_CHUNK_READER};
+    common::ByteStream time_in_;
+    common::ByteStream value_in_;
     char* time_uncompressed_buf_;
     char* value_uncompressed_buf_;
     std::vector<uint8_t> value_page_col_notnull_bitmap_;
     uint32_t value_page_data_num_;
     int32_t cur_value_index;
+
+    // ── Multi-value mode fields ──────────────────────────────────────────
+    bool multi_value_mode_ = false;
+    std::vector<ValueColumnState*> value_columns_;
+
+    // Pre-decoded timestamps for page-level parallel decode.
+    std::vector<int64_t> page_all_times_;
+    int page_time_count_ = 0;
+    int page_time_cursor_ = 0;
+    bool time_predecoded_ = false;
+
+    // ── Page-plan state ────────────────────────────────────────────────
+    std::vector<ChunkPageInfo> chunk_pages_;
+    std::vector<std::vector<int64_t>> per_page_times_;
+    bool page_plan_built_ = false;
+    bool current_page_loaded_ = false;
+    size_t current_page_plan_index_ = 0;
+
+#ifdef ENABLE_THREADS
+    common::ThreadPool* decode_pool_ = nullptr;  // borrowed, not owned
+    // Per-worker time decoder + compressor pool for parallel time-page decode.
+    // Sized to decode_pool_->num_threads() on first use, owned by this reader.
+    std::vector<Decoder*> time_decoder_pool_;
+    std::vector<Compressor*> time_compressor_pool_;
+#endif
 };
 
 }  // end namespace storage
-#endif  // READER_CHUNK_READER_H
+#endif  // READER_CHUNK_ALIGNED_READER_H
diff --git a/cpp/src/reader/block/single_device_tsblock_reader.cc b/cpp/src/reader/block/single_device_tsblock_reader.cc
index 93f42efd3..5fb9d80d2 100644
--- a/cpp/src/reader/block/single_device_tsblock_reader.cc
+++ b/cpp/src/reader/block/single_device_tsblock_reader.cc
@@ -19,8 +19,18 @@
 
 #include "single_device_tsblock_reader.h"
 
+#include <algorithm>
+#include <iostream>
+#include <set>
+
+#include "common/db_common.h"
+
 namespace storage {
 
+namespace {
+const char* kTimeOnlyContextName = "__time_only_aligned_context__";
+}
+
 SingleDeviceTsBlockReader::SingleDeviceTsBlockReader(
     DeviceQueryTask* device_query_task, uint32_t block_size,
     IMetadataQuerier* metadata_querier, TsFileIOReader* tsfile_io_reader,
@@ -55,6 +65,25 @@ int SingleDeviceTsBlockReader::init(DeviceQueryTask* device_query_task,
 int32_t SingleDeviceTsBlockReader::compute_dense_row_count(
     const std::vector<ITimeseriesIndex*>& ts_indexes) {
     int64_t reference_time_count = -1;
+    // Single-chunk timeseries skip per-chunk statistic serialization
+    // (see TsFileIOWriter / TimeseriesIndex::deserialize_from); when the
+    // chunk-level statistic is null, fall back to the TimeseriesIndex's
+    // top-level statistic, which summarizes that lone chunk.
+    auto chunk_count = [](const common::SimpleList<ChunkMeta*>& list,
+                          Statistic* fallback) -> int64_t {
+        int64_t total = 0;
+        int nchunks = 0;
+        for (auto it = list.begin(); it != list.end(); it++) {
+            nchunks++;
+            if (it.get()->statistic_) {
+                total += it.get()->statistic_->count_;
+            }
+        }
+        if (total == 0 && nchunks == 1 && fallback != nullptr) {
+            total = fallback->count_;
+        }
+        return total;
+    };
     for (const auto* ts_index : ts_indexes) {
         if (ts_index == nullptr) {
             continue;
@@ -69,27 +98,30 @@ int32_t SingleDeviceTsBlockReader::compute_dense_row_count(
             if (time_list == nullptr || value_list == nullptr) {
                 return -1;
             }
-
-            for (auto it = time_list->begin(); it != time_list->end(); it++) {
-                if (it.get()->statistic_) {
-                    time_count += it.get()->statistic_->count_;
-                }
-            }
-            for (auto it = value_list->begin(); it != value_list->end(); it++) {
-                if (it.get()->statistic_) {
-                    value_count += it.get()->statistic_->count_;
-                }
+            // Use the time-side and value-side top stats independently:
+            // the value-side count_ excludes nulls, so reusing it for the
+            // time chunk would misclassify sparse data as dense.
+            const auto* aligned_ti =
+                dynamic_cast<const AlignedTimeseriesIndex*>(ts_index);
+            if (aligned_ti == nullptr) {
+                return -1;
             }
+            Statistic* time_top_stat =
+                aligned_ti->time_ts_idx_ != nullptr
+                    ? aligned_ti->time_ts_idx_->get_statistic()
+                    : nullptr;
+            Statistic* value_top_stat =
+                aligned_ti->value_ts_idx_ != nullptr
+                    ? aligned_ti->value_ts_idx_->get_statistic()
+                    : nullptr;
+            time_count = chunk_count(*time_list, time_top_stat);
+            value_count = chunk_count(*value_list, value_top_stat);
         } else {
             auto* list = ts_index->get_chunk_meta_list();
             if (list == nullptr) {
                 return -1;
             }
-            for (auto it = list->begin(); it != list->end(); it++) {
-                if (it.get()->statistic_) {
-                    time_count += it.get()->statistic_->count_;
-                }
-            }
+            time_count = chunk_count(*list, ts_index->get_statistic());
             value_count = time_count;
         }
 
@@ -149,32 +181,198 @@ int SingleDeviceTsBlockReader::init_internal(DeviceQueryTask* device_query_task,
             time_series_indexs, pa_))) {
         return ret;
     }
-
     dense_row_count_ = compute_dense_row_count(time_series_indexs);
-
-    if (dense_row_count_ >= 0 && remaining_offset_ >= dense_row_count_) {
-        remaining_offset_ -= dense_row_count_;
-        delete current_block_;
-        current_block_ = nullptr;
-        return common::E_OK;
+    // Fast path: when every aligned column is provably dense (same total row
+    // count across time + value chunks), bulk-copy from SSI tsblock to caller
+    // tsblock instead of per-row merging.  compute_dense_row_count() returns
+    // -1 if the device is not provably dense, which gates safety.
+    const bool enable_dense_aligned_fast_path = true;
+    // Early device-level time skip: if time_filter is set and ALL chunks of
+    // this device have statistics that fall outside the filter range, skip the
+    // entire device.  Chunks without statistics are assumed to satisfy.
+    //
+    // Skip the entire shortcut when time_series_indexs is empty (e.g. a
+    // time-only query that selects no value column): there's nothing to
+    // prove outside the filter, and dropping out here would lose the
+    // time-only fallback path that runs below.
+    if (time_filter != nullptr && !time_series_indexs.empty()) {
+        bool examined_any = false;
+        bool all_outside = true;
+        for (const auto* ts_idx : time_series_indexs) {
+            if (ts_idx == nullptr) continue;
+            auto* chunk_list = ts_idx->is_aligned()
+                                   ? ts_idx->get_time_chunk_meta_list()
+                                   : ts_idx->get_chunk_meta_list();
+            if (chunk_list == nullptr) {
+                all_outside = false;
+                break;
+            }
+            examined_any = true;
+            for (auto it = chunk_list->begin(); it != chunk_list->end(); it++) {
+                if (it.get()->statistic_ == nullptr ||
+                    time_filter->satisfy(it.get()->statistic_)) {
+                    all_outside = false;
+                    break;
+                }
+            }
+            if (!all_outside) break;
+        }
+        if (examined_any && all_outside) {
+            // No data in this device matches the time filter.
+            delete current_block_;
+            current_block_ = nullptr;
+            return common::E_OK;
+        }
     }
+    // Try multi-value aligned path: one VectorMeasurementColumnContext (and
+    // the SSI it owns) reads all aligned value columns at once.  This is the
+    // entry point for AlignedChunkReader's per-column parallel decode pool
+    // (created in TsFileSeriesScanIterator::init_chunk_reader_multi when
+    // num_cols > 1 && parallel_read_enabled_); per-column
+    // SingleMeasurementColumnContext siblings would each open their own
+    // single-column SSI and never reach it. Falls back to the per-column path
+    // if ctx->init() fails (e.g. the device mixes aligned and non-aligned
+    // chunks).
+    bool used_multi = false;
+    std::set<std::string> multi_names;
+    {
+        bool can_multi = !time_series_indexs.empty();
+        auto& meas_cols =
+            device_query_task->get_column_mapping()->get_measurement_columns();
+        for (const auto& ts_idx : time_series_indexs) {
+            if (ts_idx == nullptr || !ts_idx->is_aligned()) {
+                can_multi = false;
+                break;
+            }
+        }
+        if (can_multi) {
+            std::vector<std::string> meas_names(meas_cols.begin(),
+                                                meas_cols.end());
+            // Stable order by first appearance in the result schema so the
+            // shared SSI's column buffers line up with the result columns.
+            std::sort(
+                meas_names.begin(), meas_names.end(),
+                [device_query_task](const std::string& lhs,
+                                    const std::string& rhs) {
+                    const auto& lhs_pos =
+                        device_query_task->get_column_mapping()->get_column_pos(
+                            lhs);
+                    const auto& rhs_pos =
+                        device_query_task->get_column_mapping()->get_column_pos(
+                            rhs);
+                    const int lhs_first =
+                        lhs_pos.empty() ? INT32_MAX : lhs_pos.front();
+                    const int rhs_first =
+                        rhs_pos.empty() ? INT32_MAX : rhs_pos.front();
+                    if (lhs_first != rhs_first) {
+                        return lhs_first < rhs_first;
+                    }
+                    return lhs < rhs;
+                });
+            std::vector<std::vector<int32_t>> pos_list;
+            pos_list.reserve(meas_names.size());
+            for (const auto& name : meas_names) {
+                const auto& pos =
+                    device_query_task->get_column_mapping()->get_column_pos(
+                        name);
+                pos_list.push_back(
+                    std::vector<int32_t>(pos.begin(), pos.end()));
+            }
 
-    int ssi_offset = 0;
-    int ssi_limit = -1;
-    if (dense_row_count_ >= 0) {
-        ssi_offset = remaining_offset_;
-        ssi_limit = remaining_limit_;
+            auto* ctx = new VectorMeasurementColumnContext(tsfile_io_reader_);
+            if (common::E_OK == ctx->init(device_query_task_, meas_names,
+                                          time_filter, pos_list, pa_)) {
+                // The shared ctx is referenced from N map entries; close()
+                // and the merge loop dedupe by pointer (already in place).
+                for (const auto& name : meas_names) {
+                    field_column_contexts_.insert(std::make_pair(name, ctx));
+                    multi_names.insert(name);
+                }
+                aligned_col_count_ = meas_names.size();
+                used_multi = true;
+            } else {
+                delete ctx;
+            }
+        }
     }
 
+    // Per-column path for anything not absorbed by the multi-value ctx
+    // (e.g. fallback when init() failed, or a non-aligned column would have
+    // been added before we generalize this for mixed schemas).
     for (const auto& time_series_index : time_series_indexs) {
-        construct_column_context(time_series_index, time_filter, ssi_offset,
-                                 ssi_limit);
+        if (time_series_index == nullptr) {
+            continue;
+        }
+        const std::string measurement_name =
+            time_series_index->get_measurement_name().to_std_string();
+        if (used_multi && multi_names.count(measurement_name) > 0) {
+            continue;
+        }
+        construct_column_context(time_series_index, time_filter, 0, -1);
+    }
+
+    if (field_column_contexts_.empty()) {
+        // If value columns were actually requested but none produced a
+        // context, every one of them read empty under the current filter
+        // (e.g. an empty/inverted time range, or a filter that matches no
+        // rows).  The result is simply empty -- return it directly.  The
+        // time-only fallback below is only for genuine time-only queries (no
+        // value columns); routing an all-empty value query through it would
+        // call alloc_multi_ssi(), which is aligned-only and returns
+        // E_NOT_SUPPORT on non-aligned devices.
+        bool any_value_column_requested = false;
+        for (const auto* ts_idx : time_series_indexs) {
+            if (ts_idx != nullptr) {
+                any_value_column_requested = true;
+                break;
+            }
+        }
+        if (any_value_column_requested) {
+            delete current_block_;
+            current_block_ = nullptr;
+            return common::E_OK;
+        }
+
+        std::vector<std::string> empty_measurements;
+        std::vector<std::vector<int32_t>> empty_positions;
+        auto* time_only_ctx =
+            new VectorMeasurementColumnContext(tsfile_io_reader_);
+        int time_only_ret =
+            time_only_ctx->init(device_query_task_, empty_measurements,
+                                time_filter, empty_positions, pa_);
+        if (common::E_OK == time_only_ret) {
+            field_column_contexts_.insert(
+                std::make_pair(kTimeOnlyContextName, time_only_ctx));
+        } else {
+            delete time_only_ctx;
+            // Only treat "no data" as an acceptable empty result; I/O
+            // errors, OOM, and corruption from the time-only init must
+            // propagate so the caller sees the actual failure instead of
+            // an empty resultset wearing E_OK.
+            if (time_only_ret != common::E_NO_MORE_DATA) {
+                delete current_block_;
+                current_block_ = nullptr;
+                return time_only_ret;
+            }
+        }
     }
 
-    if (dense_row_count_ >= 0 && !field_column_contexts_.empty()) {
-        auto* first_ctx = field_column_contexts_.begin()->second;
-        remaining_offset_ = first_ctx->get_ssi_row_offset();
-        remaining_limit_ = first_ctx->get_ssi_row_limit();
+    // Detect aligned fast path: every field column comes from an aligned chunk.
+    if (!field_column_contexts_.empty() && enable_dense_aligned_fast_path &&
+        dense_row_count_ >= 0 &&
+        aligned_col_count_ == field_column_contexts_.size()) {
+        all_aligned_ = true;
+        aligned_vec_.reserve(field_column_contexts_.size());
+        if (used_multi) {
+            // Single shared VectorMeasurementColumnContext handles all
+            // columns — push it once, otherwise we'd schedule the same
+            // bulk_copy_into N times.
+            aligned_vec_.push_back(field_column_contexts_.begin()->second);
+        } else {
+            for (auto& kv : field_column_contexts_) {
+                aligned_vec_.push_back(kv.second);
+            }
+        }
     }
 
     if (field_column_contexts_.empty()) {
@@ -218,18 +416,25 @@ int SingleDeviceTsBlockReader::has_next(bool& has_next) {
 
     current_block_->reset();
 
-    uint32_t effective_block_size = block_size_;
-    if (remaining_limit_ > 0) {
-        effective_block_size =
-            std::min(block_size_, static_cast<uint32_t>(remaining_limit_));
+    if (all_aligned_) {
+        return has_next_aligned(has_next);
     }
 
     bool next_time_set = false;
     next_time_ = -1;
 
     std::vector<MeasurementColumnContext*> min_time_columns;
-    while (current_block_->get_row_count() < effective_block_size) {
+    while (current_block_->get_row_count() < block_size_) {
+        if (remaining_limit_ > 0 &&
+            current_block_->get_row_count() >=
+                static_cast<uint32_t>(remaining_limit_)) {
+            break;
+        }
+        std::set<MeasurementColumnContext*> visited_contexts;
         for (auto& column_context : field_column_contexts_) {
+            if (!visited_contexts.insert(column_context.second).second) {
+                continue;
+            }
             int64_t time;
             if (IS_FAIL(column_context.second->get_current_time(time))) {
                 continue;
@@ -293,6 +498,114 @@ int SingleDeviceTsBlockReader::has_next(bool& has_next) {
     return ret;
 }
 
+int SingleDeviceTsBlockReader::has_next_aligned(bool& result_has_next) {
+    int ret = common::E_OK;
+    int time_in_query_index = tuple_desc_.get_time_column_index();
+
+    while (current_block_->get_row_count() < block_size_) {
+        if (aligned_vec_.empty()) break;
+
+        if (remaining_limit_ == 0) break;
+
+        // Check if first column has data.
+        uint32_t avail = aligned_vec_[0]->available_rows();
+        if (avail == 0) {
+            for (auto* ctx : aligned_vec_) {
+                ctx->remove_from(field_column_contexts_);
+            }
+            aligned_vec_.clear();
+            break;
+        }
+
+        // Find the batch size: min of output capacity and all SSI
+        // availabilities.
+        uint32_t batch = block_size_ - current_block_->get_row_count();
+        for (auto* ctx : aligned_vec_) {
+            uint32_t ctx_avail = ctx->available_rows();
+            if (ctx_avail == 0) {
+                batch = 0;
+                break;
+            }
+            if (ctx_avail < batch) batch = ctx_avail;
+        }
+        if (batch == 0) {
+            for (auto* ctx : aligned_vec_) {
+                ctx->remove_from(field_column_contexts_);
+            }
+            aligned_vec_.clear();
+            break;
+        }
+
+        // Handle offset: skip rows before copying.
+        if (remaining_offset_ > 0) {
+            uint32_t skip = std::min(batch, (uint32_t)remaining_offset_);
+            for (auto* ctx : aligned_vec_) {
+                int sr = ctx->skip_rows(skip);
+                if (sr != common::E_OK) return sr;
+            }
+            remaining_offset_ -= skip;
+            continue;
+        }
+
+        // Handle limit: cap the batch size.
+        if (remaining_limit_ > 0) {
+            batch = std::min(batch, (uint32_t)remaining_limit_);
+        }
+
+        // First SSI: bulk copy time + values + row_count.
+        int copy_ret = aligned_vec_[0]->bulk_copy_into(
+            col_appenders_, col_appenders_[time_column_index_], row_appender_,
+            batch);
+        // E_NO_MORE_DATA is the normal end-of-stream signal; any other
+        // error (I/O, decode, corruption) must propagate to the caller
+        // instead of silently truncating the result with E_OK.
+        if (copy_ret != common::E_OK && copy_ret != common::E_NO_MORE_DATA) {
+            return copy_ret;
+        }
+
+        // Also copy time to explicit time column if requested.
+        if (time_in_query_index != -1) {
+            common::Vector* time_vec =
+                current_block_->get_vector(time_column_index_);
+            char* time_src =
+                time_vec->get_value_data().get_data() +
+                (current_block_->get_row_count() - batch) * sizeof(int64_t);
+            col_appenders_[time_in_query_index]->bulk_append_fixed(
+                time_src, batch, sizeof(int64_t));
+        }
+
+        // Other SSIs: bulk copy values only (no time, no row_count). Any
+        // hard error from these columns also has to propagate; otherwise a
+        // truncated/corrupt value column would silently emit nulls.
+        for (size_t i = 1; i < aligned_vec_.size(); i++) {
+            int other_ret = aligned_vec_[i]->bulk_copy_into(
+                col_appenders_, nullptr, nullptr, batch);
+            if (other_ret != common::E_OK &&
+                other_ret != common::E_NO_MORE_DATA) {
+                return other_ret;
+            }
+        }
+
+        // Decrement limit for data already copied.
+        if (remaining_limit_ > 0) {
+            remaining_limit_ -= batch;
+        }
+
+        // If first SSI signaled no-more-data, stop after accounting.
+        if (copy_ret == common::E_NO_MORE_DATA) break;
+    }
+
+    if (current_block_->get_row_count() > 0) {
+        if (RET_FAIL(fill_ids())) return ret;
+        current_block_->fill_trailling_nulls();
+        last_block_returned_ = false;
+        result_has_next = true;
+    } else {
+        result_has_next = false;
+    }
+    return ret;
+}
+
 int SingleDeviceTsBlockReader::fill_measurements(
     std::vector<MeasurementColumnContext*>& column_contexts) {
     int ret = common::E_OK;
@@ -400,8 +713,15 @@ int SingleDeviceTsBlockReader::next(common::TsBlock*& ret_block) {
 }
 
 void SingleDeviceTsBlockReader::close() {
+    aligned_vec_.clear();  // non-owning; owned by field_column_contexts_
+    // De-duplicate pointers before deleting: VectorMeasurementColumnContext
+    // has multiple map entries pointing to the same object.
+    std::set<MeasurementColumnContext*> unique_contexts;
     for (auto& column_context : field_column_contexts_) {
-        delete column_context.second;
+        unique_contexts.insert(column_context.second);
+    }
+    for (auto* ctx : unique_contexts) {
+        delete ctx;
     }
     for (auto& col_appender : col_appenders_) {
         if (col_appender) {
@@ -413,9 +733,7 @@ void SingleDeviceTsBlockReader::close() {
         delete row_appender_;
         row_appender_ = nullptr;
     }
-    if (device_query_task_) {
-        device_query_task_->~DeviceQueryTask();
-    }
+    device_query_task_ = nullptr;  // owned by the task iterator arena
     if (current_block_) {
         delete current_block_;
         current_block_ = nullptr;
@@ -430,24 +748,34 @@ int SingleDeviceTsBlockReader::construct_column_context(
         (!time_series_index->is_aligned() &&
          time_series_index->get_chunk_meta_list()->empty())) {
     } else if (time_series_index->is_aligned()) {
+        const int effective_ssi_offset = dense_row_count_ >= 0 ? ssi_offset : 0;
+        const int effective_ssi_limit = dense_row_count_ >= 0 ? ssi_limit : -1;
         const AlignedTimeseriesIndex* aligned_time_series_index =
             dynamic_cast<const AlignedTimeseriesIndex*>(time_series_index);
         if (aligned_time_series_index == nullptr) {
             assert(false);
         }
+        if (aligned_time_series_index->value_ts_idx_ != nullptr &&
+            aligned_time_series_index->value_ts_idx_->get_statistic() !=
+                nullptr &&
+            aligned_time_series_index->value_ts_idx_->get_statistic()->count_ ==
+                0) {
+            return ret;
+        }
         SingleMeasurementColumnContext* column_context =
             new SingleMeasurementColumnContext(tsfile_io_reader_);
         if (RET_FAIL(column_context->init(
                 device_query_task_, time_series_index, time_filter,
                 device_query_task_->get_column_mapping()->get_column_pos(
                     time_series_index->get_measurement_name().to_std_string()),
-                pa_, ssi_offset, ssi_limit))) {
+                pa_, effective_ssi_offset, effective_ssi_limit))) {
             delete column_context;
             return ret;
         }
         field_column_contexts_.insert(std::make_pair(
             time_series_index->get_measurement_name().to_std_string(),
             column_context));
+        aligned_col_count_++;
     } else {
         SingleMeasurementColumnContext* column_context =
             new SingleMeasurementColumnContext(tsfile_io_reader_);
@@ -568,4 +896,342 @@ void SingleMeasurementColumnContext::fill_into(
     }
 }
 
+uint32_t SingleMeasurementColumnContext::available_rows() const {
+    if (!time_iter_ || time_iter_->end()) return 0;
+    return time_iter_->remaining();
+}
+
+int SingleMeasurementColumnContext::bulk_copy_into(
+    std::vector<common::ColAppender*>& col_appenders,
+    common::ColAppender* time_appender, common::RowAppender* row_appender,
+    uint32_t count) {
+    int ret = common::E_OK;
+    const uint32_t time_elem_size = sizeof(int64_t);
+    auto dt = value_iter_->get_data_type();
+    bool is_varlen =
+        (dt == common::STRING || dt == common::TEXT || dt == common::BLOB);
+
+    // Bulk copy time column (only first SSI does this).
+    if (time_appender) {
+        time_appender->bulk_append_fixed(time_iter_->data_ptr(), count,
+                                         time_elem_size);
+    }
+
+    // Advance output row count (only first SSI does this).
+    if (row_appender) {
+        row_appender->add_rows(count);
+    }
+
+    if (is_varlen || value_iter_->has_null()) {
+        for (uint32_t r = 0; r < count; r++) {
+            uint32_t len = 0;
+            bool is_null = false;
+            char* val = value_iter_->read(&len, &is_null);
+            for (int32_t pos : pos_in_result_) {
+                auto* appender = col_appenders[pos + 1];
+                appender->add_row();
+                if (is_null) {
+                    appender->append_null();
+                } else {
+                    appender->append(val, len);
+                }
+            }
+            value_iter_->next();
+        }
+    } else {
+        const uint32_t val_elem_size = common::get_data_type_size(dt);
+        char* val_ptr = value_iter_->data_ptr();
+        for (int32_t pos : pos_in_result_) {
+            col_appenders[pos + 1]->bulk_append_fixed(val_ptr, count,
+                                                      val_elem_size);
+        }
+        value_iter_->advance(count, val_elem_size);
+    }
+
+    // Advance source iterators.
+    time_iter_->advance(count, time_elem_size);
+
+    // If source TsBlock exhausted, load next.
+    if (time_iter_->end()) {
+        if (RET_FAIL(get_next_tsblock(false))) {
+            return ret;
+        }
+    }
+    return ret;
+}
+
+int SingleMeasurementColumnContext::skip_rows(uint32_t count) {
+    if (!time_iter_ || time_iter_->end()) return common::E_OK;
+    const uint32_t time_elem_size = sizeof(int64_t);
+    auto dt = value_iter_->get_data_type();
+    bool is_varlen =
+        (dt == common::STRING || dt == common::TEXT || dt == common::BLOB);
+    uint32_t to_skip = std::min(count, time_iter_->remaining());
+    time_iter_->advance(to_skip, time_elem_size);
+    if (is_varlen || value_iter_->has_null()) {
+        for (uint32_t r = 0; r < to_skip; r++) {
+            value_iter_->next();
+        }
+    } else {
+        const uint32_t val_elem_size = common::get_data_type_size(dt);
+        value_iter_->advance(to_skip, val_elem_size);
+    }
+    if (time_iter_->end()) {
+        // Propagate hard errors from the next-tsblock load; E_NO_MORE_DATA
+        // is the legitimate end-of-stream signal and gets squashed back to
+        // E_OK so the caller's outer loop notices via available_rows()==0.
+        int r = get_next_tsblock(false);
+        if (r != common::E_OK && r != common::E_NO_MORE_DATA) return r;
+    }
+    return common::E_OK;
+}
+
+// ── VectorMeasurementColumnContext implementation ───────────────────────
+
+VectorMeasurementColumnContext::~VectorMeasurementColumnContext() {
+    if (time_iter_) {
+        delete time_iter_;
+        time_iter_ = nullptr;
+    }
+    for (auto* vi : value_iters_) {
+        if (vi) delete vi;
+    }
+    value_iters_.clear();
+    if (ssi_) {
+        ssi_->revert_tsblock();
+    }
+    tsfile_io_reader_->revert_ssi(ssi_);
+    ssi_ = nullptr;
+}
+
+int VectorMeasurementColumnContext::init(
+    DeviceQueryTask* device_query_task,
+    const std::vector<std::string>& measurement_names, Filter* time_filter,
+    std::vector<std::vector<int32_t>>& pos_in_result, common::PageArena& pa) {
+    int ret = common::E_OK;
+    pos_in_result_ = pos_in_result;
+    column_names_ = measurement_names;
+    if (RET_FAIL(tsfile_io_reader_->alloc_multi_ssi(
+            device_query_task->get_device_id(), measurement_names, ssi_, pa,
+            time_filter))) {
+        return ret;
+    }
+    if (RET_FAIL(get_next_tsblock(true))) {
+        return ret;
+    }
+    return ret;
+}
+
+int VectorMeasurementColumnContext::get_next_tsblock(bool alloc_mem) {
+    int ret = common::E_OK;
+    if (tsblock_ != nullptr) {
+        if (time_iter_) {
+            delete time_iter_;
+            time_iter_ = nullptr;
+        }
+        for (auto* vi : value_iters_) {
+            if (vi) delete vi;
+        }
+        value_iters_.clear();
+        tsblock_->reset();
+    }
+    if (RET_FAIL(ssi_->get_next(tsblock_, alloc_mem))) {
+        if (time_iter_) {
+            delete time_iter_;
+            time_iter_ = nullptr;
+        }
+        for (auto* vi : value_iters_) {
+            if (vi) delete vi;
+        }
+        value_iters_.clear();
+        if (tsblock_) {
+            ssi_->destroy();
+            tsblock_ = nullptr;
+        }
+    } else {
+        time_iter_ = new common::ColIterator(0, tsblock_);
+        uint32_t num_value_cols = tsblock_->get_column_count() - 1;
+        value_iters_.reserve(num_value_cols);
+        for (uint32_t c = 0; c < num_value_cols; c++) {
+            value_iters_.push_back(new common::ColIterator(c + 1, tsblock_));
+        }
+    }
+    return ret;
+}
+
+int VectorMeasurementColumnContext::get_current_time(int64_t& time) {
+    if (!time_iter_ || time_iter_->end()) return common::E_NO_MORE_DATA;
+    uint32_t len = 0;
+    time = *(int64_t*)(time_iter_->read(&len));
+    return common::E_OK;
+}
+
+int VectorMeasurementColumnContext::get_current_value(char*& value,
+                                                      uint32_t& len) {
+    if (value_iters_.empty() || value_iters_[0]->end())
+        return common::E_NO_MORE_DATA;
+    bool is_null = false;
+    value = value_iters_[0]->read(&len, &is_null);
+    return common::E_OK;
+}
+
+int VectorMeasurementColumnContext::move_iter() {
+    int ret = common::E_OK;
+    time_iter_->next();
+    for (auto* vi : value_iters_) vi->next();
+    if (time_iter_->end()) {
+        if (RET_FAIL(get_next_tsblock(false))) return ret;
+    }
+    return ret;
+}
+
+void VectorMeasurementColumnContext::fill_into(
+    std::vector<common::ColAppender*>& col_appenders) {
+    for (uint32_t c = 0; c < value_iters_.size() && c < pos_in_result_.size();
+         c++) {
+        uint32_t len = 0;
+        bool is_null = false;
+        char* val = value_iters_[c]->read(&len, &is_null);
+        for (int32_t pos : pos_in_result_[c]) {
+            col_appenders[pos + 1]->add_row();
+            if (is_null) {
+                col_appenders[pos + 1]->append_null();
+            } else {
+                col_appenders[pos + 1]->append(val, len);
+            }
+        }
+    }
+}
+
+void VectorMeasurementColumnContext::remove_from(
+    std::map<std::string, MeasurementColumnContext*>& column_context_map) {
+    if (column_names_.empty()) {
+        for (auto it = column_context_map.begin();
+             it != column_context_map.end();) {
+            if (it->second == this) {
+                it = column_context_map.erase(it);
+            } else {
+                ++it;
+            }
+        }
+        delete this;
+        return;
+    }
+    for (const auto& name : column_names_) {
+        column_context_map.erase(name);
+    }
+    delete this;
+}
+
+uint32_t VectorMeasurementColumnContext::available_rows() const {
+    if (!time_iter_ || time_iter_->end()) return 0;
+    return time_iter_->remaining();
+}
+
+int VectorMeasurementColumnContext::bulk_copy_into(
+    std::vector<common::ColAppender*>& col_appenders,
+    common::ColAppender* time_appender, common::RowAppender* row_appender,
+    uint32_t count) {
+    int ret = common::E_OK;
+    const uint32_t time_elem_size = sizeof(int64_t);
+
+    // Bulk copy time column (only when time_appender is provided).
+    if (time_appender) {
+        time_appender->bulk_append_fixed(time_iter_->data_ptr(), count,
+                                         time_elem_size);
+    }
+
+    // Advance output row count.
+    if (row_appender) {
+        row_appender->add_rows(count);
+    }
+
+    // Bulk copy each value column to its output positions, propagating nulls.
+    for (uint32_t c = 0; c < value_iters_.size() && c < pos_in_result_.size();
+         c++) {
+        auto dt = value_iters_[c]->get_data_type();
+        bool is_varlen =
+            (dt == common::STRING || dt == common::TEXT || dt == common::BLOB);
+        bool src_has_null = value_iters_[c]->has_null();
+
+        if (is_varlen || src_has_null) {
+            // Row-by-row copy for variable-length columns using the
+            // ColIterator next()/read() which properly tracks offsets. Fixed
+            // length columns with nulls also need this path because their
+            // payload buffer only stores non-null values.
+            auto* iter = value_iters_[c];
+            for (uint32_t r = 0; r < count; r++) {
+                uint32_t len = 0;
+                bool is_null = false;
+                char* val = iter->read(&len, &is_null);
+                for (int32_t pos : pos_in_result_[c]) {
+                    auto* appender = col_appenders[pos + 1];
+                    appender->add_row();
+                    if (is_null) {
+                        appender->append_null();
+                    } else {
+                        appender->append(val, len);
+                    }
+                }
+                iter->next();
+            }
+        } else {
+            // Bulk copy for fixed-length columns
+            uint32_t val_elem_size = common::get_data_type_size(dt);
+            char* val_ptr = value_iters_[c]->data_ptr();
+            for (int32_t pos : pos_in_result_[c]) {
+                col_appenders[pos + 1]->bulk_append_fixed(val_ptr, count,
+                                                          val_elem_size);
+            }
+        }
+    }
+
+    // Advance all source iterators.
+    time_iter_->advance(count, time_elem_size);
+    for (uint32_t c = 0; c < value_iters_.size(); c++) {
+        auto dt = value_iters_[c]->get_data_type();
+        bool is_varlen =
+            (dt == common::STRING || dt == common::TEXT || dt == common::BLOB);
+        if (!is_varlen && !value_iters_[c]->has_null()) {
+            uint32_t val_elem_size = common::get_data_type_size(dt);
+            value_iters_[c]->advance(count, val_elem_size);
+        }
+        // Variable-length iterators and fixed-length iterators with nulls were
+        // already advanced in the copy loop above.
+    }
+
+    // If source TsBlock exhausted, load next.
+    if (time_iter_->end()) {
+        if (RET_FAIL(get_next_tsblock(false))) return ret;
+    }
+    return ret;
+}
+
+int VectorMeasurementColumnContext::skip_rows(uint32_t count) {
+    if (!time_iter_ || time_iter_->end()) return common::E_OK;
+    const uint32_t time_elem_size = sizeof(int64_t);
+    uint32_t to_skip = std::min(count, time_iter_->remaining());
+    time_iter_->advance(to_skip, time_elem_size);
+    for (uint32_t c = 0; c < value_iters_.size(); c++) {
+        auto dt = value_iters_[c]->get_data_type();
+        bool is_varlen =
+            (dt == common::STRING || dt == common::TEXT || dt == common::BLOB);
+        if (!is_varlen && !value_iters_[c]->has_null()) {
+            uint32_t val_elem_size = common::get_data_type_size(dt);
+            value_iters_[c]->advance(to_skip, val_elem_size);
+        } else {
+            // Variable-length and fixed-length-with-null vectors need next()
+            // to keep the payload offset aligned with non-null rows.
+            for (uint32_t r = 0; r < to_skip; r++) {
+                value_iters_[c]->next();
+            }
+        }
+    }
+    if (time_iter_->end()) {
+        int r = get_next_tsblock(false);
+        if (r != common::E_OK && r != common::E_NO_MORE_DATA) return r;
+    }
+    return common::E_OK;
+}
+
 }  // namespace storage
diff --git a/cpp/src/reader/block/single_device_tsblock_reader.h b/cpp/src/reader/block/single_device_tsblock_reader.h
index 07d16860c..e74304baf 100644
--- a/cpp/src/reader/block/single_device_tsblock_reader.h
+++ b/cpp/src/reader/block/single_device_tsblock_reader.h
@@ -65,6 +65,9 @@ class SingleDeviceTsBlockReader : public TsBlockReader {
     int advance_column(MeasurementColumnContext* column_context);
     int32_t compute_dense_row_count(
         const std::vector<ITimeseriesIndex*>& ts_indexes);
+    // Fast path for aligned data: all columns share the same timestamps,
+    // so no per-row merge-sort is needed.
+    int has_next_aligned(bool& has_next);
 
     DeviceQueryTask* device_query_task_;
     Filter* field_filter_;
@@ -83,6 +86,11 @@ class SingleDeviceTsBlockReader : public TsBlockReader {
     int remaining_offset_ = 0;
     int remaining_limit_ = -1;
     int32_t dense_row_count_ = -1;
+    // Populated in init() when every field column comes from an aligned chunk.
+    // Provides cache-friendly vector iteration for has_next_aligned().
+    bool all_aligned_ = false;
+    uint32_t aligned_col_count_ = 0;
+    std::vector<MeasurementColumnContext*> aligned_vec_;
 };
 
 class MeasurementColumnContext {
@@ -116,6 +124,13 @@ class MeasurementColumnContext {
         return ssi_ ? ssi_->get_row_limit() : -1;
     }
 
+    virtual uint32_t available_rows() const = 0;
+    virtual int bulk_copy_into(std::vector<common::ColAppender*>& col_appenders,
+                               common::ColAppender* time_appender,
+                               common::RowAppender* row_appender,
+                               uint32_t count) = 0;
+    virtual int skip_rows(uint32_t count) = 0;
+
    protected:
     TsFileIOReader* tsfile_io_reader_;
     TsFileSeriesScanIterator* ssi_ = nullptr;
@@ -124,7 +139,7 @@ class MeasurementColumnContext {
     common::ColIterator* value_iter_ = nullptr;
 };
 
-class SingleMeasurementColumnContext final : public MeasurementColumnContext {
+class SingleMeasurementColumnContext : public MeasurementColumnContext {
    public:
     explicit SingleMeasurementColumnContext(TsFileIOReader* tsfile_io_reader)
         : MeasurementColumnContext(tsfile_io_reader) {}
@@ -155,6 +170,12 @@ class SingleMeasurementColumnContext final : public MeasurementColumnContext {
     int get_current_time(int64_t& time) override;
     int get_current_value(char*& value, uint32_t& len) override;
     int move_iter() override;
+    uint32_t available_rows() const override;
+    int bulk_copy_into(std::vector<common::ColAppender*>& col_appenders,
+                       common::ColAppender* time_appender,
+                       common::RowAppender* row_appender,
+                       uint32_t count) override;
+    int skip_rows(uint32_t count) override;
 
    private:
     std::string column_name_;
@@ -165,21 +186,31 @@ class VectorMeasurementColumnContext final : public MeasurementColumnContext {
    public:
     explicit VectorMeasurementColumnContext(TsFileIOReader* tsfile_io_reader)
         : MeasurementColumnContext(tsfile_io_reader) {}
+    ~VectorMeasurementColumnContext() override;
 
     void fill_into(std::vector<common::ColAppender*>& col_appenders) override;
     void remove_from(std::map<std::string, MeasurementColumnContext*>&
                          column_context_map) override;
     int init(DeviceQueryTask* device_query_task,
-             const ITimeseriesIndex* time_series_index, Filter* time_filter,
+             const std::vector<std::string>& measurement_names,
+             Filter* time_filter,
              std::vector<std::vector<int32_t>>& pos_in_result,
              common::PageArena& pa);
     int get_next_tsblock(bool alloc_mem) override;
     int get_current_time(int64_t& time) override;
     int get_current_value(char*& value, uint32_t& len) override;
     int move_iter() override;
+    uint32_t available_rows() const override;
+    int bulk_copy_into(std::vector<common::ColAppender*>& col_appenders,
+                       common::ColAppender* time_appender,
+                       common::RowAppender* row_appender,
+                       uint32_t count) override;
+    int skip_rows(uint32_t count) override;
 
    private:
+    std::vector<std::string> column_names_;
     std::vector<std::vector<int32_t>> pos_in_result_;
+    std::vector<common::ColIterator*> value_iters_;
 };
 
 class IdColumnContext {
diff --git a/cpp/src/reader/bloom_filter.cc b/cpp/src/reader/bloom_filter.cc
index 068c96e27..4aff4ecd3 100644
--- a/cpp/src/reader/bloom_filter.cc
+++ b/cpp/src/reader/bloom_filter.cc
@@ -208,6 +208,26 @@ int BloomFilter::add_path_entry(const String& device_name,
     return E_OK;
 }
 
+bool BloomFilter::contains(const String& device_name,
+                           const String& measurement_name) {
+    if (size_ == 0) {
+        return true;  // empty filter — assume present
+    }
+    String entry = get_entry_string(device_name, measurement_name);
+    if (IS_NULL(entry.buf_)) {
+        return true;  // OOM — conservatively assume present
+    }
+    for (uint32_t i = 0; i < hash_func_count_; i++) {
+        int32_t hv = hash_func_arr_[i].hash(entry);
+        if (!bitset_.get(hv)) {
+            free_entry_buf(entry.buf_);
+            return false;  // definitely not present
+        }
+    }
+    free_entry_buf(entry.buf_);
+    return true;  // probably present
+}
+
 int BloomFilter::serialize_to(ByteStream& out) {
     int ret = E_OK;
     uint8_t* filter_data_bytes = nullptr;
diff --git a/cpp/src/reader/bloom_filter.h b/cpp/src/reader/bloom_filter.h
index b00de4a84..323cfa8a4 100644
--- a/cpp/src/reader/bloom_filter.h
+++ b/cpp/src/reader/bloom_filter.h
@@ -74,6 +74,11 @@ class BitSet {
         int32_t word_offset = pos % 64;
         words_[word_idx] |= (1ull << word_offset);
     }
+    bool get(int32_t pos) const {
+        int32_t word_idx = pos / 64;
+        int32_t word_offset = pos % 64;
+        return (words_[word_idx] & (1ull << word_offset)) != 0;
+    }
     int32_t get_words_in_use() const {
         for (int32_t i = word_count_ - 1; i >= 0; i--) {
             if (words_[i] != 0) {
@@ -107,8 +112,11 @@ class BloomFilter {
     void destroy() { bitset_.destroy(); }
     int add_path_entry(const common::String& device_name,
                        const common::String& measurement_name);
+    bool contains(const common::String& device_name,
+                  const common::String& measurement_name);
     int serialize_to(common::ByteStream& out);
     int deserialize_from(common::ByteStream& in);
+    bool is_empty() const { return size_ == 0; }
     BitSet* get_bit_set() { return &bitset_; }
 
    private:
diff --git a/cpp/src/reader/chunk_reader.cc b/cpp/src/reader/chunk_reader.cc
index b150f7851..7c36ea07f 100644
--- a/cpp/src/reader/chunk_reader.cc
+++ b/cpp/src/reader/chunk_reader.cc
@@ -422,8 +422,6 @@ int ChunkReader::i32_DECODE_TYPED_TV_INTO_TSBLOCK(ByteStream& time_in,
                 row_appender.backoff_add_row();
                 continue;
             } else {
-                /*std::cout << "decoder: time=" << time << ", value=" << value
-                 * << std::endl;*/
                 row_appender.append(0, (char*)&time, sizeof(time));
                 row_appender.append(1, (char*)&value, sizeof(value));
             }
@@ -432,6 +430,350 @@ int ChunkReader::i32_DECODE_TYPED_TV_INTO_TSBLOCK(ByteStream& time_in,
     return ret;
 }
 
+int ChunkReader::i32_DECODE_TV_BATCH(ByteStream& time_in, ByteStream& value_in,
+                                     RowAppender& row_appender,
+                                     Filter* filter) {
+    int ret = E_OK;
+    const int BATCH = 129;
+    int64_t times[BATCH];
+    int32_t values[BATCH];
+
+    while (time_decoder_->has_remaining(time_in)) {
+        // Cap each pass to what the appender can still hold; the old
+        // "remaining < BATCH → OVERFLOW" check made progress impossible on
+        // TsBlocks with capacity below BATCH.
+        int eff_batch =
+            std::min(BATCH, static_cast<int>(row_appender.remaining()));
+        if (eff_batch <= 0) {
+            ret = E_OVERFLOW;
+            break;
+        }
+
+        // Block-level time filter check
+        bool block_all_pass = false;
+        if (filter != nullptr) {
+            int64_t block_min, block_max;
+            int block_count;
+            if (time_decoder_->peek_next_block_range_int64(
+                    time_in, block_min, block_max, block_count)) {
+                if (!filter->satisfy_start_end_time(block_min, block_max)) {
+                    int skipped = 0;
+                    time_decoder_->skip_peeked_block_int64(time_in, skipped);
+                    value_decoder_->skip_int32(block_count, skipped, value_in);
+                    continue;
+                }
+                if (filter->contain_start_end_time(block_min, block_max)) {
+                    block_all_pass = true;
+                }
+            }
+        }
+
+        int time_count = 0;
+        int value_count = 0;
+
+        if (RET_FAIL(time_decoder_->read_batch_int64(times, eff_batch,
+                                                     time_count, time_in))) {
+            break;
+        }
+        if (time_count == 0) break;
+
+        bool time_mask[BATCH];
+        int pass_count = time_count;
+        if (filter != nullptr && !block_all_pass) {
+            pass_count =
+                filter->satisfy_batch_time(times, time_count, time_mask);
+        }
+
+        if (pass_count == 0) {
+            int skipped = 0;
+            value_decoder_->skip_int32(time_count, skipped, value_in);
+            continue;
+        }
+
+        if (RET_FAIL(value_decoder_->read_batch_int32(values, time_count,
+                                                      value_count, value_in))) {
+            break;
+        }
+        // Time and value chunks are written in lock-step; any discrepancy
+        // means the file is truncated or corrupted.  Reading uninitialised
+        // values[i] would silently surface garbage as decoded rows.
+        if (value_count != time_count) {
+            ret = E_TSFILE_CORRUPTED;
+            break;
+        }
+
+        for (int i = 0; i < time_count; ++i) {
+            if (filter != nullptr && !block_all_pass && !time_mask[i]) {
+                continue;
+            }
+            if (filter != nullptr && !block_all_pass &&
+                !filter->satisfy(times[i], (int64_t)values[i])) {
+                continue;
+            }
+            if (UNLIKELY(!row_appender.add_row())) {
+                ret = E_OVERFLOW;
+                break;
+            }
+            row_appender.append(0, (char*)&times[i], sizeof(int64_t));
+            row_appender.append(1, (char*)&values[i], sizeof(int32_t));
+        }
+        if (ret != E_OK) break;
+    }
+    return ret;
+}
+
+int ChunkReader::i64_DECODE_TV_BATCH(ByteStream& time_in, ByteStream& value_in,
+                                     RowAppender& row_appender,
+                                     Filter* filter) {
+    int ret = E_OK;
+    const int BATCH = 129;
+    int64_t times[BATCH];
+    int64_t values[BATCH];
+
+    while (time_decoder_->has_remaining(time_in)) {
+        int eff_batch =
+            std::min(BATCH, static_cast<int>(row_appender.remaining()));
+        if (eff_batch <= 0) {
+            ret = E_OVERFLOW;
+            break;
+        }
+
+        // Block-level time filter check
+        bool block_all_pass = false;
+        if (filter != nullptr) {
+            int64_t block_min, block_max;
+            int block_count;
+            if (time_decoder_->peek_next_block_range_int64(
+                    time_in, block_min, block_max, block_count)) {
+                if (!filter->satisfy_start_end_time(block_min, block_max)) {
+                    int skipped = 0;
+                    time_decoder_->skip_peeked_block_int64(time_in, skipped);
+                    value_decoder_->skip_int64(block_count, skipped, value_in);
+                    continue;
+                }
+                if (filter->contain_start_end_time(block_min, block_max)) {
+                    block_all_pass = true;
+                }
+            }
+        }
+
+        int time_count = 0;
+        int value_count = 0;
+
+        if (RET_FAIL(time_decoder_->read_batch_int64(times, eff_batch,
+                                                     time_count, time_in))) {
+            break;
+        }
+        if (time_count == 0) break;
+
+        bool time_mask[BATCH];
+        int pass_count = time_count;
+        if (filter != nullptr && !block_all_pass) {
+            pass_count =
+                filter->satisfy_batch_time(times, time_count, time_mask);
+        }
+
+        if (pass_count == 0) {
+            int skipped = 0;
+            value_decoder_->skip_int64(time_count, skipped, value_in);
+            continue;
+        }
+
+        if (RET_FAIL(value_decoder_->read_batch_int64(values, time_count,
+                                                      value_count, value_in))) {
+            break;
+        }
+        if (value_count != time_count) {
+            ret = E_TSFILE_CORRUPTED;
+            break;
+        }
+
+        for (int i = 0; i < time_count; ++i) {
+            if (filter != nullptr && !block_all_pass && !time_mask[i]) {
+                continue;
+            }
+            if (filter != nullptr && !block_all_pass &&
+                !filter->satisfy(times[i], values[i])) {
+                continue;
+            }
+            if (UNLIKELY(!row_appender.add_row())) {
+                ret = E_OVERFLOW;
+                break;
+            }
+            row_appender.append(0, (char*)&times[i], sizeof(int64_t));
+            row_appender.append(1, (char*)&values[i], sizeof(int64_t));
+        }
+        if (ret != E_OK) break;
+    }
+    return ret;
+}
+
+int ChunkReader::float_DECODE_TV_BATCH(ByteStream& time_in,
+                                       ByteStream& value_in,
+                                       RowAppender& row_appender,
+                                       Filter* filter) {
+    int ret = E_OK;
+    const int BATCH = 129;
+    int64_t times[BATCH];
+    float values[BATCH];
+
+    while (time_decoder_->has_remaining(time_in)) {
+        int eff_batch =
+            std::min(BATCH, static_cast<int>(row_appender.remaining()));
+        if (eff_batch <= 0) {
+            ret = E_OVERFLOW;
+            break;
+        }
+
+        // Block-level time filter check
+        bool block_all_pass = false;
+        if (filter != nullptr) {
+            int64_t block_min, block_max;
+            int block_count;
+            if (time_decoder_->peek_next_block_range_int64(
+                    time_in, block_min, block_max, block_count)) {
+                if (!filter->satisfy_start_end_time(block_min, block_max)) {
+                    int skipped = 0;
+                    time_decoder_->skip_peeked_block_int64(time_in, skipped);
+                    value_decoder_->skip_float(block_count, skipped, value_in);
+                    continue;
+                }
+                if (filter->contain_start_end_time(block_min, block_max)) {
+                    block_all_pass = true;
+                }
+            }
+        }
+
+        int time_count = 0;
+        int value_count = 0;
+
+        if (RET_FAIL(time_decoder_->read_batch_int64(times, eff_batch,
+                                                     time_count, time_in))) {
+            break;
+        }
+        if (time_count == 0) break;
+
+        bool time_mask[BATCH];
+        int pass_count = time_count;
+        if (filter != nullptr && !block_all_pass) {
+            pass_count =
+                filter->satisfy_batch_time(times, time_count, time_mask);
+        }
+
+        if (pass_count == 0) {
+            int skipped = 0;
+            value_decoder_->skip_float(time_count, skipped, value_in);
+            continue;
+        }
+
+        if (RET_FAIL(value_decoder_->read_batch_float(values, time_count,
+                                                      value_count, value_in))) {
+            break;
+        }
+        if (value_count != time_count) {
+            ret = E_TSFILE_CORRUPTED;
+            break;
+        }
+
+        for (int i = 0; i < time_count; ++i) {
+            if (filter != nullptr && !block_all_pass && !time_mask[i]) {
+                continue;
+            }
+            if (UNLIKELY(!row_appender.add_row())) {
+                ret = E_OVERFLOW;
+                break;
+            }
+            row_appender.append(0, (char*)&times[i], sizeof(int64_t));
+            row_appender.append(1, (char*)&values[i], sizeof(float));
+        }
+        if (ret != E_OK) break;
+    }
+    return ret;
+}
+
+int ChunkReader::double_DECODE_TV_BATCH(ByteStream& time_in,
+                                        ByteStream& value_in,
+                                        RowAppender& row_appender,
+                                        Filter* filter) {
+    int ret = E_OK;
+    const int BATCH = 129;
+    int64_t times[BATCH];
+    double values[BATCH];
+
+    while (time_decoder_->has_remaining(time_in)) {
+        int eff_batch =
+            std::min(BATCH, static_cast<int>(row_appender.remaining()));
+        if (eff_batch <= 0) {
+            ret = E_OVERFLOW;
+            break;
+        }
+
+        // Block-level time filter check
+        bool block_all_pass = false;
+        if (filter != nullptr) {
+            int64_t block_min, block_max;
+            int block_count;
+            if (time_decoder_->peek_next_block_range_int64(
+                    time_in, block_min, block_max, block_count)) {
+                if (!filter->satisfy_start_end_time(block_min, block_max)) {
+                    int skipped = 0;
+                    time_decoder_->skip_peeked_block_int64(time_in, skipped);
+                    value_decoder_->skip_double(block_count, skipped, value_in);
+                    continue;
+                }
+                if (filter->contain_start_end_time(block_min, block_max)) {
+                    block_all_pass = true;
+                }
+            }
+        }
+
+        int time_count = 0;
+        int value_count = 0;
+
+        if (RET_FAIL(time_decoder_->read_batch_int64(times, eff_batch,
+                                                     time_count, time_in))) {
+            break;
+        }
+        if (time_count == 0) break;
+
+        bool time_mask[BATCH];
+        int pass_count = time_count;
+        if (filter != nullptr && !block_all_pass) {
+            pass_count =
+                filter->satisfy_batch_time(times, time_count, time_mask);
+        }
+
+        if (pass_count == 0) {
+            int skipped = 0;
+            value_decoder_->skip_double(time_count, skipped, value_in);
+            continue;
+        }
+
+        if (RET_FAIL(value_decoder_->read_batch_double(
+                values, time_count, value_count, value_in))) {
+            break;
+        }
+        if (value_count != time_count) {
+            ret = E_TSFILE_CORRUPTED;
+            break;
+        }
+
+        for (int i = 0; i < time_count; ++i) {
+            if (filter != nullptr && !block_all_pass && !time_mask[i]) {
+                continue;
+            }
+            if (UNLIKELY(!row_appender.add_row())) {
+                ret = E_OVERFLOW;
+                break;
+            }
+            row_appender.append(0, (char*)&times[i], sizeof(int64_t));
+            row_appender.append(1, (char*)&values[i], sizeof(double));
+        }
+        if (ret != E_OK) break;
+    }
+    return ret;
+}
+
 int ChunkReader::STRING_DECODE_TYPED_TV_INTO_TSBLOCK(ByteStream& time_in,
                                                      ByteStream& value_in,
                                                      RowAppender& row_appender,
@@ -472,23 +814,21 @@ int ChunkReader::decode_tv_buf_into_tsblock_by_datatype(ByteStream& time_in,
             break;
         case common::DATE:
         case common::INT32:
-            // DECODE_TYPED_TV_INTO_TSBLOCK(int32_t, int32, time_in_, value_in_,
-            // row_appender);
-            ret = i32_DECODE_TYPED_TV_INTO_TSBLOCK(time_in_, value_in_,
-                                                   row_appender, filter);
+            ret =
+                i32_DECODE_TV_BATCH(time_in_, value_in_, row_appender, filter);
             break;
         case TIMESTAMP:
         case common::INT64:
-            DECODE_TYPED_TV_INTO_TSBLOCK(int64_t, int64, time_in_, value_in_,
-                                         row_appender);
+            ret =
+                i64_DECODE_TV_BATCH(time_in_, value_in_, row_appender, filter);
             break;
         case common::FLOAT:
-            DECODE_TYPED_TV_INTO_TSBLOCK(float, float, time_in_, value_in_,
-                                         row_appender);
+            ret = float_DECODE_TV_BATCH(time_in_, value_in_, row_appender,
+                                        filter);
             break;
         case common::DOUBLE:
-            DECODE_TYPED_TV_INTO_TSBLOCK(double, double, time_in_, value_in_,
-                                         row_appender);
+            ret = double_DECODE_TV_BATCH(time_in_, value_in_, row_appender,
+                                         filter);
             break;
         case common::TEXT:
         case common::BLOB:
diff --git a/cpp/src/reader/chunk_reader.h b/cpp/src/reader/chunk_reader.h
index 3acd9c3cf..a1196c330 100644
--- a/cpp/src/reader/chunk_reader.h
+++ b/cpp/src/reader/chunk_reader.h
@@ -105,6 +105,20 @@ class ChunkReader : public IChunkReader {
                                          common::ByteStream& value_in,
                                          common::RowAppender& row_appender,
                                          Filter* filter);
+    int i32_DECODE_TV_BATCH(common::ByteStream& time_in,
+                            common::ByteStream& value_in,
+                            common::RowAppender& row_appender, Filter* filter);
+    int i64_DECODE_TV_BATCH(common::ByteStream& time_in,
+                            common::ByteStream& value_in,
+                            common::RowAppender& row_appender, Filter* filter);
+    int float_DECODE_TV_BATCH(common::ByteStream& time_in,
+                              common::ByteStream& value_in,
+                              common::RowAppender& row_appender,
+                              Filter* filter);
+    int double_DECODE_TV_BATCH(common::ByteStream& time_in,
+                               common::ByteStream& value_in,
+                               common::RowAppender& row_appender,
+                               Filter* filter);
     int STRING_DECODE_TYPED_TV_INTO_TSBLOCK(common::ByteStream& time_in,
                                             common::ByteStream& value_in,
                                             common::RowAppender& row_appender,
@@ -131,7 +145,7 @@ class ChunkReader : public IChunkReader {
      * also refer to offset within the chunk (including chunk header).
      * It advanced by step of a page header or a page tv data.
      */
-    common::ByteStream in_stream_{common::MOD_CHUNK_READER};
+    common::ByteStream in_stream_;
     int32_t file_data_buf_size_;
     uint32_t chunk_visit_offset_;
 
@@ -141,8 +155,8 @@ class ChunkReader : public IChunkReader {
 
     Decoder* time_decoder_;
     Decoder* value_decoder_;
-    common::ByteStream time_in_{common::MOD_CHUNK_READER};
-    common::ByteStream value_in_{common::MOD_CHUNK_READER};
+    common::ByteStream time_in_;
+    common::ByteStream value_in_;
     char* uncompressed_buf_;
 };
 
diff --git a/cpp/src/reader/device_meta_iterator.cc b/cpp/src/reader/device_meta_iterator.cc
index bf01b23a5..955965624 100644
--- a/cpp/src/reader/device_meta_iterator.cc
+++ b/cpp/src/reader/device_meta_iterator.cc
@@ -186,7 +186,17 @@ int DeviceMetaIterator::load_results_direct() {
     ret = io_reader_->load_device_index_entry(device_comparable,
                                               device_index_entry, end_offset);
 
-    if (ret != common::E_OK || device_index_entry == nullptr) {
+    // "Device not present in this file" is the only ret value we should
+    // suppress.  Read failures and corrupt index entries used to be folded
+    // into "no matches"; the caller then couldn't distinguish a clean miss
+    // from a partial read that silently dropped real data.  Surface them.
+    if (ret == common::E_DEVICE_NOT_EXIST || ret == common::E_NOT_EXIST) {
+        return common::E_OK;
+    }
+    if (ret != common::E_OK) {
+        return ret;
+    }
+    if (device_index_entry == nullptr) {
         return common::E_OK;
     }
 
diff --git a/cpp/src/reader/filter/and_filter.h b/cpp/src/reader/filter/and_filter.h
index b324a3f81..289115baf 100644
--- a/cpp/src/reader/filter/and_filter.h
+++ b/cpp/src/reader/filter/and_filter.h
@@ -19,6 +19,8 @@
 #ifndef READER_FILTER_OPERATOR_AND_FILTER_H
 #define READER_FILTER_OPERATOR_AND_FILTER_H
 
+#include <memory>
+
 #include "binary_filter.h"
 // #include "storage/storage_utils.h"
 
@@ -48,6 +50,27 @@ class AndFilter : public BinaryFilter {
                right_->contain_start_end_time(start_time, end_time);
     }
 
+    int satisfy_batch_time(const int64_t* times, int count, bool* mask) {
+        // Inline buffer covers the common per-page BATCH=129 callers; only
+        // out-of-spec larger counts fall back to a heap allocation.
+        constexpr int kInlineCap = 256;
+        bool inline_buf[kInlineCap];
+        std::unique_ptr<bool[]> heap_buf;
+        bool* mask_right = inline_buf;
+        if (count > kInlineCap) {
+            heap_buf.reset(new bool[count]);
+            mask_right = heap_buf.get();
+        }
+        left_->satisfy_batch_time(times, count, mask);
+        right_->satisfy_batch_time(times, count, mask_right);
+        int pass = 0;
+        for (int i = 0; i < count; ++i) {
+            mask[i] = mask[i] && mask_right[i];
+            if (mask[i]) ++pass;
+        }
+        return pass;
+    }
+
     std::vector<TimeRange*>* get_time_ranges() {
         std::vector<TimeRange*>* result = new std::vector<TimeRange*>();
         std::vector<TimeRange*>* left_time_ranges = left_->get_time_ranges();
diff --git a/cpp/src/reader/filter/filter.h b/cpp/src/reader/filter/filter.h
index f39dddbae..e53992308 100644
--- a/cpp/src/reader/filter/filter.h
+++ b/cpp/src/reader/filter/filter.h
@@ -63,6 +63,20 @@ class Filter {
         ASSERT(false);
         return nullptr;
     }
+
+    // Batch time filter: evaluate time filter on an array of timestamps.
+    // Writes true/false into @mask for each element.
+    // Returns the number of elements that passed (mask[i] == true).
+    // Default: scalar fallback using satisfy_start_end_time.
+    virtual int satisfy_batch_time(const int64_t* times, int count,
+                                   bool* mask) {
+        int pass = 0;
+        for (int i = 0; i < count; ++i) {
+            mask[i] = satisfy_start_end_time(times[i], times[i]);
+            if (mask[i]) ++pass;
+        }
+        return pass;
+    }
 };
 
 }  // namespace storage
diff --git a/cpp/src/reader/filter/or_filter.h b/cpp/src/reader/filter/or_filter.h
index fc8d4a2cf..518308982 100644
--- a/cpp/src/reader/filter/or_filter.h
+++ b/cpp/src/reader/filter/or_filter.h
@@ -19,6 +19,8 @@
 #ifndef READER_FILTER_OPERATOR_OR_FILTER_H
 #define READER_FILTER_OPERATOR_OR_FILTER_H
 
+#include <memory>
+
 #include "binary_filter.h"
 // #include "storage/storage_utils.h"
 
@@ -48,6 +50,27 @@ class OrFilter : public BinaryFilter {
                right_->contain_start_end_time(start_time, end_time);
     }
 
+    int satisfy_batch_time(const int64_t* times, int count, bool* mask) {
+        // Inline buffer covers the common per-page BATCH=129 callers; only
+        // out-of-spec larger counts fall back to a heap allocation.
+        constexpr int kInlineCap = 256;
+        bool inline_buf[kInlineCap];
+        std::unique_ptr<bool[]> heap_buf;
+        bool* mask_right = inline_buf;
+        if (count > kInlineCap) {
+            heap_buf.reset(new bool[count]);
+            mask_right = heap_buf.get();
+        }
+        left_->satisfy_batch_time(times, count, mask);
+        right_->satisfy_batch_time(times, count, mask_right);
+        int pass = 0;
+        for (int i = 0; i < count; ++i) {
+            mask[i] = mask[i] || mask_right[i];
+            if (mask[i]) ++pass;
+        }
+        return pass;
+    }
+
     std::vector<TimeRange*>* get_time_ranges() {
         std::vector<TimeRange*>* result = new std::vector<TimeRange*>();
         std::vector<TimeRange*>* left_time_ranges = left_->get_time_ranges();
diff --git a/cpp/src/reader/filter/time_operator.cc b/cpp/src/reader/filter/time_operator.cc
index 19f33b599..0bb12e4ec 100644
--- a/cpp/src/reader/filter/time_operator.cc
+++ b/cpp/src/reader/filter/time_operator.cc
@@ -18,9 +18,17 @@
  */
 #include "time_operator.h"
 
+#include <cstring>
+
 #include "common/statistic.h"
 #include "utils/storage_utils.h"
 
+#if defined(__ARM_NEON)
+#include <arm_neon.h>
+#elif defined(ENABLE_SIMD)
+#include "simde/x86/avx2.h"
+#endif
+
 namespace storage {
 
 TimeBetween::TimeBetween(int64_t value1, int64_t value2, bool not_between)
@@ -29,6 +37,15 @@ TimeBetween::TimeBetween(int64_t value1, int64_t value2, bool not_between)
 TimeBetween::~TimeBetween() {}
 
 bool TimeBetween::satisfy(Statistic* statistic) {
+    // An empty inner interval (value1_ > value2_) is unsatisfiable for BETWEEN
+    // (matches nothing) and trivially true for NOT BETWEEN (matches
+    // everything) -- i.e. the answer is exactly not_.  Without this guard the
+    // overlap test below wrongly reports "maybe" for an empty range,
+    // disagreeing with the row-level satisfy() and letting empty/inverted
+    // ranges slip past statistic-level pruning.
+    if (value1_ > value2_) {
+        return not_;
+    }
     if (not_) {
         return statistic->end_time_ < value1_ ||
                statistic->start_time_ > value2_;
@@ -47,6 +64,10 @@ bool TimeBetween::satisfy(int64_t time, common::String value) {
 }
 
 bool TimeBetween::satisfy_start_end_time(int64_t start_time, int64_t end_time) {
+    // Empty inner interval: see satisfy(Statistic*).
+    if (value1_ > value2_) {
+        return not_;
+    }
     if (not_) {
         return start_time < value1_ || end_time > value2_;
     } else {
@@ -55,6 +76,10 @@ bool TimeBetween::satisfy_start_end_time(int64_t start_time, int64_t end_time) {
 }
 
 bool TimeBetween::contain_start_end_time(int64_t start_time, int64_t end_time) {
+    // Empty inner interval: see satisfy(Statistic*).
+    if (value1_ > value2_) {
+        return not_;
+    }
     if (not_) {
         return end_time < value1_ || start_time > value2_;
     } else {
@@ -64,6 +89,16 @@ bool TimeBetween::contain_start_end_time(int64_t start_time, int64_t end_time) {
 
 std::vector<TimeRange*>* TimeBetween::get_time_ranges() {
     std::vector<TimeRange*>* result = new std::vector<TimeRange*>();
+    // Empty inner interval (value1_ > value2_): BETWEEN yields no ranges;
+    // NOT BETWEEN covers the whole timeline.
+    if (value1_ > value2_) {
+        if (not_) {
+            result->push_back(
+                new TimeRange(std::numeric_limits<int64_t>::min(),
+                              std::numeric_limits<int64_t>::max()));
+        }
+        return result;
+    }
     if (not_) {
         if (value1_ != std::numeric_limits<int64_t>::min()) {
             result->push_back(new TimeRange(std::numeric_limits<int64_t>::min(),
@@ -102,11 +137,42 @@ bool TimeIn::satisfy(int64_t time, common::String value) {
 }
 
 bool TimeIn::satisfy_start_end_time(int64_t start_time, int64_t end_time) {
-    return true;
+    // "Could any time in [s, e] satisfy the filter?"
+    // IN({v_i}): true iff some v_i lies in [s, e].
+    // NOT IN: true unless the entire range [s, e] is one point and that
+    // point is in values_; for ranges wider than a single integer there is
+    // always at least one time not in values_, so we're conservative.
+    bool any_in_range = false;
+    for (int64_t v : values_) {
+        if (v >= start_time && v <= end_time) {
+            any_in_range = true;
+            break;
+        }
+    }
+    if (not_) {
+        if (start_time == end_time) return !any_in_range;
+        return true;
+    }
+    return any_in_range;
 }
 
 bool TimeIn::contain_start_end_time(int64_t start_time, int64_t end_time) {
-    return true;
+    // "Do ALL times in [s, e] satisfy the filter?"
+    // IN({v_i}): only when [s,e] collapses to a single point that is in
+    // values_; a sparse IN list can't cover a range otherwise.  Returning
+    // true unconditionally would let the batch fast path skip per-row
+    // filtering and emit every row.
+    // NOT IN: true iff no v_i lies in [s, e].
+    bool any_in_range = false;
+    for (int64_t v : values_) {
+        if (v >= start_time && v <= end_time) {
+            any_in_range = true;
+            break;
+        }
+    }
+    if (not_) return !any_in_range;
+    if (start_time == end_time) return any_in_range;
+    return false;
 }
 
 std::vector<TimeRange*>* TimeIn::get_time_ranges() {
@@ -308,4 +374,269 @@ std::vector<TimeRange*>* TimeLtEq::get_time_ranges() {
     return result;
 }
 
+// ============================================================================
+// SIMD batch time filter implementations
+// ============================================================================
+
+// Helper: extract 4-bit movemask from 256-bit comparison result (4 x i64)
+#if !defined(__ARM_NEON) && defined(ENABLE_SIMD)
+static inline int simd_movemask_epi64(simde__m256i v) {
+    // movemask_pd reinterprets as double and checks sign bit = high bit of each
+    // 64-bit lane
+    return simde_mm256_movemask_pd(simde_mm256_castsi256_pd(v));
+}
+#endif
+
+int TimeGt::satisfy_batch_time(const int64_t* times, int count, bool* mask) {
+    int pass = 0;
+    int i = 0;
+#if defined(__ARM_NEON)
+    int64x2_t vval = vdupq_n_s64(value_);
+    for (; i + 1 < count; i += 2) {
+        int64x2_t vt = vld1q_s64(times + i);
+        uint64x2_t cmp = vcgtq_s64(vt, vval);
+        mask[i] = vgetq_lane_u64(cmp, 0) != 0;
+        mask[i + 1] = vgetq_lane_u64(cmp, 1) != 0;
+        pass += mask[i] + mask[i + 1];
+    }
+#elif defined(ENABLE_SIMD)
+    simde__m256i vval = simde_mm256_set1_epi64x(value_);
+    for (; i + 3 < count; i += 4) {
+        simde__m256i vt =
+            simde_mm256_loadu_si256((const simde__m256i*)(times + i));
+        // time > value_ => cmpgt(time, value_)
+        simde__m256i cmp = simde_mm256_cmpgt_epi64(vt, vval);
+        int bits = simd_movemask_epi64(cmp);
+        for (int j = 0; j < 4; ++j) {
+            mask[i + j] = (bits >> j) & 1;
+            pass += mask[i + j];
+        }
+    }
+#endif
+    for (; i < count; ++i) {
+        mask[i] = value_ < times[i];
+        if (mask[i]) ++pass;
+    }
+    return pass;
+}
+
+int TimeGtEq::satisfy_batch_time(const int64_t* times, int count, bool* mask) {
+    int pass = 0;
+    int i = 0;
+#if defined(__ARM_NEON)
+    int64x2_t vval = vdupq_n_s64(value_);
+    for (; i + 1 < count; i += 2) {
+        int64x2_t vt = vld1q_s64(times + i);
+        uint64x2_t cmp = vcgeq_s64(vt, vval);
+        mask[i] = vgetq_lane_u64(cmp, 0) != 0;
+        mask[i + 1] = vgetq_lane_u64(cmp, 1) != 0;
+        pass += mask[i] + mask[i + 1];
+    }
+#elif defined(ENABLE_SIMD)
+    simde__m256i vval = simde_mm256_set1_epi64x(value_);
+    for (; i + 3 < count; i += 4) {
+        simde__m256i vt =
+            simde_mm256_loadu_si256((const simde__m256i*)(times + i));
+        // time >= value_ => NOT(cmpgt(value_, time))
+        simde__m256i cmp = simde_mm256_cmpgt_epi64(vval, vt);
+        simde__m256i ncmp =
+            simde_mm256_xor_si256(cmp, simde_mm256_set1_epi64x((int64_t)-1));
+        int bits = simd_movemask_epi64(ncmp);
+        for (int j = 0; j < 4; ++j) {
+            mask[i + j] = (bits >> j) & 1;
+            pass += mask[i + j];
+        }
+    }
+#endif
+    for (; i < count; ++i) {
+        mask[i] = value_ <= times[i];
+        if (mask[i]) ++pass;
+    }
+    return pass;
+}
+
+int TimeLt::satisfy_batch_time(const int64_t* times, int count, bool* mask) {
+    int pass = 0;
+    int i = 0;
+#if defined(__ARM_NEON)
+    int64x2_t vval = vdupq_n_s64(value_);
+    for (; i + 1 < count; i += 2) {
+        int64x2_t vt = vld1q_s64(times + i);
+        uint64x2_t cmp = vcltq_s64(vt, vval);
+        mask[i] = vgetq_lane_u64(cmp, 0) != 0;
+        mask[i + 1] = vgetq_lane_u64(cmp, 1) != 0;
+        pass += mask[i] + mask[i + 1];
+    }
+#elif defined(ENABLE_SIMD)
+    simde__m256i vval = simde_mm256_set1_epi64x(value_);
+    for (; i + 3 < count; i += 4) {
+        simde__m256i vt =
+            simde_mm256_loadu_si256((const simde__m256i*)(times + i));
+        // time < value_ => cmpgt(value_, time)
+        simde__m256i cmp = simde_mm256_cmpgt_epi64(vval, vt);
+        int bits = simd_movemask_epi64(cmp);
+        for (int j = 0; j < 4; ++j) {
+            mask[i + j] = (bits >> j) & 1;
+            pass += mask[i + j];
+        }
+    }
+#endif
+    for (; i < count; ++i) {
+        mask[i] = value_ > times[i];
+        if (mask[i]) ++pass;
+    }
+    return pass;
+}
+
+int TimeLtEq::satisfy_batch_time(const int64_t* times, int count, bool* mask) {
+    int pass = 0;
+    int i = 0;
+#if defined(__ARM_NEON)
+    int64x2_t vval = vdupq_n_s64(value_);
+    for (; i + 1 < count; i += 2) {
+        int64x2_t vt = vld1q_s64(times + i);
+        uint64x2_t cmp = vcleq_s64(vt, vval);
+        mask[i] = vgetq_lane_u64(cmp, 0) != 0;
+        mask[i + 1] = vgetq_lane_u64(cmp, 1) != 0;
+        pass += mask[i] + mask[i + 1];
+    }
+#elif defined(ENABLE_SIMD)
+    simde__m256i vval = simde_mm256_set1_epi64x(value_);
+    for (; i + 3 < count; i += 4) {
+        simde__m256i vt =
+            simde_mm256_loadu_si256((const simde__m256i*)(times + i));
+        // time <= value_ => NOT(cmpgt(time, value_))
+        simde__m256i cmp = simde_mm256_cmpgt_epi64(vt, vval);
+        simde__m256i ncmp =
+            simde_mm256_xor_si256(cmp, simde_mm256_set1_epi64x((int64_t)-1));
+        int bits = simd_movemask_epi64(ncmp);
+        for (int j = 0; j < 4; ++j) {
+            mask[i + j] = (bits >> j) & 1;
+            pass += mask[i + j];
+        }
+    }
+#endif
+    for (; i < count; ++i) {
+        mask[i] = value_ >= times[i];
+        if (mask[i]) ++pass;
+    }
+    return pass;
+}
+
+int TimeEq::satisfy_batch_time(const int64_t* times, int count, bool* mask) {
+    int pass = 0;
+    int i = 0;
+#if defined(__ARM_NEON)
+    int64x2_t vval = vdupq_n_s64(value_);
+    for (; i + 1 < count; i += 2) {
+        int64x2_t vt = vld1q_s64(times + i);
+        uint64x2_t cmp = vceqq_s64(vt, vval);
+        mask[i] = vgetq_lane_u64(cmp, 0) != 0;
+        mask[i + 1] = vgetq_lane_u64(cmp, 1) != 0;
+        pass += mask[i] + mask[i + 1];
+    }
+#elif defined(ENABLE_SIMD)
+    simde__m256i vval = simde_mm256_set1_epi64x(value_);
+    for (; i + 3 < count; i += 4) {
+        simde__m256i vt =
+            simde_mm256_loadu_si256((const simde__m256i*)(times + i));
+        simde__m256i cmp = simde_mm256_cmpeq_epi64(vt, vval);
+        int bits = simd_movemask_epi64(cmp);
+        for (int j = 0; j < 4; ++j) {
+            mask[i + j] = (bits >> j) & 1;
+            pass += mask[i + j];
+        }
+    }
+#endif
+    for (; i < count; ++i) {
+        mask[i] = value_ == times[i];
+        if (mask[i]) ++pass;
+    }
+    return pass;
+}
+
+int TimeNotEq::satisfy_batch_time(const int64_t* times, int count, bool* mask) {
+    int pass = 0;
+    int i = 0;
+#if defined(__ARM_NEON)
+    int64x2_t vval = vdupq_n_s64(value_);
+    uint64x2_t ones = vdupq_n_u64(UINT64_MAX);
+    for (; i + 1 < count; i += 2) {
+        int64x2_t vt = vld1q_s64(times + i);
+        uint64x2_t cmp = veorq_u64(vceqq_s64(vt, vval), ones);
+        mask[i] = vgetq_lane_u64(cmp, 0) != 0;
+        mask[i + 1] = vgetq_lane_u64(cmp, 1) != 0;
+        pass += mask[i] + mask[i + 1];
+    }
+#elif defined(ENABLE_SIMD)
+    simde__m256i vval = simde_mm256_set1_epi64x(value_);
+    for (; i + 3 < count; i += 4) {
+        simde__m256i vt =
+            simde_mm256_loadu_si256((const simde__m256i*)(times + i));
+        simde__m256i eq = simde_mm256_cmpeq_epi64(vt, vval);
+        simde__m256i neq =
+            simde_mm256_xor_si256(eq, simde_mm256_set1_epi64x((int64_t)-1));
+        int bits = simd_movemask_epi64(neq);
+        for (int j = 0; j < 4; ++j) {
+            mask[i + j] = (bits >> j) & 1;
+            pass += mask[i + j];
+        }
+    }
+#endif
+    for (; i < count; ++i) {
+        mask[i] = value_ != times[i];
+        if (mask[i]) ++pass;
+    }
+    return pass;
+}
+
+int TimeBetween::satisfy_batch_time(const int64_t* times, int count,
+                                    bool* mask) {
+    int pass = 0;
+    int i = 0;
+#if defined(__ARM_NEON)
+    int64x2_t vlo = vdupq_n_s64(value1_);
+    int64x2_t vhi = vdupq_n_s64(value2_);
+    uint64x2_t ones = vdupq_n_u64(UINT64_MAX);
+    for (; i + 1 < count; i += 2) {
+        int64x2_t vt = vld1q_s64(times + i);
+        uint64x2_t ge_lo = vcgeq_s64(vt, vlo);
+        uint64x2_t le_hi = vcleq_s64(vt, vhi);
+        uint64x2_t between = vandq_u64(ge_lo, le_hi);
+        uint64x2_t result = not_ ? veorq_u64(between, ones) : between;
+        mask[i] = vgetq_lane_u64(result, 0) != 0;
+        mask[i + 1] = vgetq_lane_u64(result, 1) != 0;
+        pass += mask[i] + mask[i + 1];
+    }
+#elif defined(ENABLE_SIMD)
+    simde__m256i vlo = simde_mm256_set1_epi64x(value1_);
+    simde__m256i vhi = simde_mm256_set1_epi64x(value2_);
+    simde__m256i ones = simde_mm256_set1_epi64x((int64_t)-1);
+    for (; i + 3 < count; i += 4) {
+        simde__m256i vt =
+            simde_mm256_loadu_si256((const simde__m256i*)(times + i));
+        // time >= lo => NOT(cmpgt(lo, time))
+        simde__m256i ge_lo =
+            simde_mm256_xor_si256(simde_mm256_cmpgt_epi64(vlo, vt), ones);
+        // time <= hi => NOT(cmpgt(time, hi))
+        simde__m256i le_hi =
+            simde_mm256_xor_si256(simde_mm256_cmpgt_epi64(vt, vhi), ones);
+        simde__m256i between = simde_mm256_and_si256(ge_lo, le_hi);
+        simde__m256i result =
+            not_ ? simde_mm256_xor_si256(between, ones) : between;
+        int bits = simd_movemask_epi64(result);
+        for (int j = 0; j < 4; ++j) {
+            mask[i + j] = (bits >> j) & 1;
+            pass += mask[i + j];
+        }
+    }
+#endif
+    for (; i < count; ++i) {
+        bool in_range = (value1_ <= times[i]) && (times[i] <= value2_);
+        mask[i] = not_ ? !in_range : in_range;
+        if (mask[i]) ++pass;
+    }
+    return pass;
+}
+
 }  // namespace storage
diff --git a/cpp/src/reader/filter/time_operator.h b/cpp/src/reader/filter/time_operator.h
index 29930b88a..f972a4259 100644
--- a/cpp/src/reader/filter/time_operator.h
+++ b/cpp/src/reader/filter/time_operator.h
@@ -47,6 +47,9 @@ class TimeBetween : public Filter {
     bool contain_start_end_time(int64_t start_time, int64_t end_time);
 
     std::vector<TimeRange*>* get_time_ranges();
+
+    int satisfy_batch_time(const int64_t* times, int count, bool* mask);
+
     FilterType get_filter_type() { return type_; }
 
    private:
@@ -99,6 +102,8 @@ class TimeEq : public Filter {
 
     std::vector<TimeRange*>* get_time_ranges();
 
+    int satisfy_batch_time(const int64_t* times, int count, bool* mask);
+
     FilterType get_filter_type() { return type_; }
 
    private:
@@ -122,6 +127,9 @@ class TimeNotEq : public Filter {
     bool contain_start_end_time(int64_t start_time, int64_t end_time);
 
     std::vector<TimeRange*>* get_time_ranges();
+
+    int satisfy_batch_time(const int64_t* times, int count, bool* mask);
+
     FilterType get_filter_type() { return type_; }
 
    private:
@@ -146,6 +154,8 @@ class TimeGt : public Filter {
 
     std::vector<TimeRange*>* get_time_ranges();
 
+    int satisfy_batch_time(const int64_t* times, int count, bool* mask);
+
     FilterType get_filter_type() { return type_; }
 
    private:
@@ -169,6 +179,9 @@ class TimeGtEq : public Filter {
     bool contain_start_end_time(int64_t start_time, int64_t end_time);
 
     std::vector<TimeRange*>* get_time_ranges();
+
+    int satisfy_batch_time(const int64_t* times, int count, bool* mask);
+
     void reset_value(int64_t val) { value_ = val; }
     FilterType get_filter_type() { return type_; }
 
@@ -194,6 +207,8 @@ class TimeLt : public Filter {
 
     std::vector<TimeRange*>* get_time_ranges();
 
+    int satisfy_batch_time(const int64_t* times, int count, bool* mask);
+
     FilterType get_filter_type() { return type_; }
 
    private:
@@ -217,6 +232,9 @@ class TimeLtEq : public Filter {
     bool contain_start_end_time(int64_t start_time, int64_t end_time);
 
     std::vector<TimeRange*>* get_time_ranges();
+
+    int satisfy_batch_time(const int64_t* times, int count, bool* mask);
+
     FilterType get_filter_type() { return type_; }
 
    private:
diff --git a/cpp/src/reader/qds_without_timegenerator.cc b/cpp/src/reader/qds_without_timegenerator.cc
index 474e13b77..b612e5dc2 100644
--- a/cpp/src/reader/qds_without_timegenerator.cc
+++ b/cpp/src/reader/qds_without_timegenerator.cc
@@ -68,7 +68,12 @@ int QDSWithoutTimeGenerator::init_internal(TsFileIOReader* io_reader,
         ret = io_reader_->alloc_ssi(paths[i].device_id_, paths[i].measurement_,
                                     ssi, pa_, global_time_filter);
         if (ret == E_MEASUREMENT_NOT_EXIST || ret == E_DEVICE_NOT_EXIST ||
-            ret == E_NOT_EXIST) {
+            ret == E_NOT_EXIST || ret == E_NO_MORE_DATA) {
+            // Java-aligned: silently skip paths whose device or measurement
+            // doesn't exist in this file. The bloom-filter optimization in
+            // alloc_ssi reports a missing series as E_NO_MORE_DATA, so treat
+            // that the same as the not-found codes.
+            ret = E_OK;
             continue;
         }
         if (ret != E_OK) {
diff --git a/cpp/src/reader/result_set.h b/cpp/src/reader/result_set.h
index 1f1653603..0b73595d4 100644
--- a/cpp/src/reader/result_set.h
+++ b/cpp/src/reader/result_set.h
@@ -162,6 +162,35 @@ class ResultSet : std::enable_shared_from_this<ResultSet> {
         return common::E_INVALID_ARG;
     }
 
+    // Typed direct accessors.  Default implementation routes through the
+    // generic RowRecord / Field path so existing subclasses keep working.
+    // Fast subclasses (TableResultSet) override these to read straight from
+    // the underlying columnar buffer, skipping the per-cell Field round-trip
+    // (and the eager materialization in next()).
+    virtual bool get_bool_at(uint32_t column_index) {
+        return get_row_record()->get_field(column_index - 1)->get_value<bool>();
+    }
+    virtual int32_t get_int32_at(uint32_t column_index) {
+        return get_row_record()
+            ->get_field(column_index - 1)
+            ->get_value<int32_t>();
+    }
+    virtual int64_t get_int64_at(uint32_t column_index) {
+        return get_row_record()
+            ->get_field(column_index - 1)
+            ->get_value<int64_t>();
+    }
+    virtual float get_float_at(uint32_t column_index) {
+        return get_row_record()
+            ->get_field(column_index - 1)
+            ->get_value<float>();
+    }
+    virtual double get_double_at(uint32_t column_index) {
+        return get_row_record()
+            ->get_field(column_index - 1)
+            ->get_value<double>();
+    }
+
     /**
      * @brief Get the row record of the result set
      *
@@ -245,6 +274,29 @@ inline std::tm ResultSet::get_value(uint32_t column_index) {
     return row_record->get_field(column_index)->get_date_value();
 }
 
+// Index-based primitive specializations route to the typed virtual
+// accessors so TableResultSet can serve them without materializing a Field.
+template <>
+inline bool ResultSet::get_value(uint32_t column_index) {
+    return get_bool_at(column_index);
+}
+template <>
+inline int32_t ResultSet::get_value(uint32_t column_index) {
+    return get_int32_at(column_index);
+}
+template <>
+inline int64_t ResultSet::get_value(uint32_t column_index) {
+    return get_int64_at(column_index);
+}
+template <>
+inline float ResultSet::get_value(uint32_t column_index) {
+    return get_float_at(column_index);
+}
+template <>
+inline double ResultSet::get_value(uint32_t column_index) {
+    return get_double_at(column_index);
+}
+
 /**
  * @brief Simple iterator for ResultSet with smart pointers
  */
@@ -306,7 +358,7 @@ inline ResultSetIterator ResultSet::iterator() {
     return ResultSetIterator(this);
 }
 
-static MAYBE_UNUSED void print_table_result_set(
+MAYBE_UNUSED static void print_table_result_set(
     storage::ResultSet* table_result_set) {
     if (table_result_set == nullptr) {
         std::cout << "TableResultSet is nullptr" << std::endl;
diff --git a/cpp/src/reader/table_result_set.cc b/cpp/src/reader/table_result_set.cc
index 81b58ce68..6de093d24 100644
--- a/cpp/src/reader/table_result_set.cc
+++ b/cpp/src/reader/table_result_set.cc
@@ -43,6 +43,16 @@ int TableResultSet::next(bool& has_next) {
 
     int ret = common::E_OK;
 
+    // Advance past the row yielded by the previous next() call, if any.
+    // Row iterator's next() advances all per-column offsets, so on the next
+    // read the vectors point to the new row's data.
+    if (row_ready_) {
+        row_iterator_->next();
+        row_ready_ = false;
+        row_materialized_ = false;
+    }
+
+    // Find the next non-empty TsBlock.
     while (row_iterator_ == nullptr || !row_iterator_->has_next()) {
         if (RET_FAIL(tsblock_reader_->has_next(has_next))) {
             return ret;
@@ -68,23 +78,29 @@ int TableResultSet::next(bool& has_next) {
     }
     if (row_iterator_ == nullptr || !row_iterator_->has_next()) {
         has_next = false;
+        return ret;
     }
 
-    if (has_next && IS_SUCC(ret)) {
-        uint32_t len = 0;
-        bool null = false;
-        row_record_->reset();
-        for (uint32_t i = 0; i < row_iterator_->get_column_count(); ++i) {
-            const auto value = row_iterator_->read(i, &len, &null);
-            if (!null) {
-                row_record_->get_field(i)->set_value(
-                    row_iterator_->get_data_type(i), value, len, pa_);
-                row_iterator_->next(i);
-            }
+    // A row is now available at row_iterator_'s current row_id_; the per-
+    // column vector offsets are pointing at that row's data.  We do NOT
+    // populate row_record_ here — typed accessors read straight from the
+    // vectors, and get_row_record() lazily materializes on demand.
+    has_next = true;
+    row_ready_ = true;
+    return ret;
+}
+
+void TableResultSet::materialize_current_row() {
+    uint32_t len = 0;
+    bool null = false;
+    row_record_->reset();
+    for (uint32_t i = 0; i < row_iterator_->get_column_count(); ++i) {
+        const auto value = row_iterator_->read(i, &len, &null);
+        if (!null) {
+            row_record_->get_field(i)->set_value(
+                row_iterator_->get_data_type(i), value, len, pa_);
         }
-        row_iterator_->update_row_id();
     }
-    return ret;
 }
 
 bool TableResultSet::is_null(const std::string& column_name) {
@@ -98,11 +114,57 @@ bool TableResultSet::is_null(const std::string& column_name) {
 
 bool TableResultSet::is_null(uint32_t column_index) {
     ASSERT(1 <= column_index && column_index <= row_record_->get_col_num());
-    return row_record_->get_field(column_index - 1) == nullptr ||
-           row_record_->get_field(column_index - 1)->is_type(common::NULL_TYPE);
+    if (!row_ready_) return true;
+    return row_iterator_->is_null_at(column_index - 1);
+}
+
+// Direct buffer access — skips Vector::read's virtual dispatch.  Caller is
+// expected to have checked is_null() (we still null-guard for safety).
+// For fixed-width primitives the vector keeps its value buffer in
+// values_ and tracks the current row's byte offset in offset_; the
+// element at the active row is simply *(T*)(values_.get_data() + offset_).
+// The ASSERT enforces strict typed access: the requested C++ type must match
+// the column's physical storage width (DATE is int32, not int64).  On a
+// mismatch it fires in debug instead of silently splicing the adjacent cell's
+// bytes into the result.
+#define TSFILE_FAST_PRIMITIVE_READ(TYPE, DFLT)                         \
+    if (!row_ready_) return DFLT;                                      \
+    common::Vector* vec = row_iterator_->get_vector(column_index - 1); \
+    ASSERT(common::TypeMatch<TYPE>(vec->get_vector_type()));           \
+    if (vec->has_null() && vec->is_null(row_iterator_->get_row_id()))  \
+        return DFLT;                                                   \
+    return *reinterpret_cast<TYPE*>(vec->get_value_data().get_data() + \
+                                    vec->get_offset())
+
+bool TableResultSet::get_bool_at(uint32_t column_index) {
+    TSFILE_FAST_PRIMITIVE_READ(bool, false);
 }
 
-RowRecord* TableResultSet::get_row_record() { return row_record_; }
+int32_t TableResultSet::get_int32_at(uint32_t column_index) {
+    TSFILE_FAST_PRIMITIVE_READ(int32_t, 0);
+}
+
+int64_t TableResultSet::get_int64_at(uint32_t column_index) {
+    TSFILE_FAST_PRIMITIVE_READ(int64_t, 0);
+}
+
+float TableResultSet::get_float_at(uint32_t column_index) {
+    TSFILE_FAST_PRIMITIVE_READ(float, 0.0f);
+}
+
+double TableResultSet::get_double_at(uint32_t column_index) {
+    TSFILE_FAST_PRIMITIVE_READ(double, 0.0);
+}
+
+#undef TSFILE_FAST_PRIMITIVE_READ
+
+RowRecord* TableResultSet::get_row_record() {
+    if (row_ready_ && !row_materialized_) {
+        materialize_current_row();
+        row_materialized_ = true;
+    }
+    return row_record_;
+}
 
 std::shared_ptr<ResultSetMetadata> TableResultSet::get_metadata() {
     return result_set_metadata_;
@@ -138,7 +200,13 @@ int TableResultSet::get_next_tsblock(common::TsBlock*& block) {
 }
 
 void TableResultSet::close() {
-    tsblock_reader_->close();
+    if (closed_) {
+        return;
+    }
+    closed_ = true;
+    if (tsblock_reader_) {
+        tsblock_reader_->close();
+    }
     pa_.destroy();
     if (row_record_) {
         delete row_record_;
@@ -150,4 +218,4 @@ void TableResultSet::close() {
     }
 }
 
-}  // namespace storage
\ No newline at end of file
+}  // namespace storage
diff --git a/cpp/src/reader/table_result_set.h b/cpp/src/reader/table_result_set.h
index 072a63f6f..d92072934 100644
--- a/cpp/src/reader/table_result_set.h
+++ b/cpp/src/reader/table_result_set.h
@@ -48,8 +48,23 @@ class TableResultSet : public ResultSet {
     void close() override;
     int get_next_tsblock(common::TsBlock*& block) override;
 
+    // Fast typed accessors: read straight from the current TsBlock vector
+    // without going through RowRecord/Field.  Caller is expected to have
+    // checked is_null() — when the cell is null the underlying buffer pointer
+    // is nullptr and these return a default (0 / 0.0 / false) without
+    // dereferencing it.
+    bool get_bool_at(uint32_t column_index) override;
+    int32_t get_int32_at(uint32_t column_index) override;
+    int64_t get_int64_at(uint32_t column_index) override;
+    float get_float_at(uint32_t column_index) override;
+    double get_double_at(uint32_t column_index) override;
+
    private:
     void init();
+    // Lazy materialization: fill row_record_ from the current row when a
+    // caller actually requests the RowRecord (or a non-fast accessor).
+    void materialize_current_row();
+
     std::unique_ptr<TsBlockReader> tsblock_reader_;
     common::RowIterator* row_iterator_ = nullptr;
     common::TsBlock* tsblock_ = nullptr;
@@ -58,6 +73,11 @@ class TableResultSet : public ResultSet {
     std::vector<std::string> column_names_;
     std::vector<common::TSDataType> data_types_;
     const int return_mode_;
+    bool closed_ = false;
+    // True when row_iterator_ points at a row that hasn't been consumed yet.
+    bool row_ready_ = false;
+    // True when row_record_ has been populated for the current row.
+    bool row_materialized_ = false;
 };
 }  // namespace storage
-#endif  // TABLE_RESULT_SET_H
\ No newline at end of file
+#endif  // TABLE_RESULT_SET_H
diff --git a/cpp/src/reader/task/device_query_task.cc b/cpp/src/reader/task/device_query_task.cc
index c7e7091ff..6345c93fa 100644
--- a/cpp/src/reader/task/device_query_task.cc
+++ b/cpp/src/reader/task/device_query_task.cc
@@ -19,6 +19,8 @@
 
 #include "reader/task/device_query_task.h"
 
+#include "common/tsfile_common.h"
+
 namespace storage {
 DeviceQueryTask* DeviceQueryTask::create_device_query_task(
     std::shared_ptr<IDeviceID> device_id, std::vector<std::string> column_names,
@@ -34,8 +36,14 @@ DeviceQueryTask* DeviceQueryTask::create_device_query_task(
 }
 
 DeviceQueryTask::~DeviceQueryTask() {
-    if (index_root_) {
+    // index_root_ was placement-new'd into DeviceMetaIterator's PageArena and
+    // ownership transferred here via DeviceMetaIterator::next; the arena only
+    // frees raw bytes, so we must invoke the destructor explicitly to release
+    // the heap-allocated children_ vector and its nested shared_ptr graph
+    // (DeviceMetaIndexEntry -> StringArrayDeviceID).
+    if (index_root_ != nullptr) {
         index_root_->~MetaIndexNode();
+        index_root_ = nullptr;
     }
 }
 
diff --git a/cpp/src/reader/task/device_task_iterator.cc b/cpp/src/reader/task/device_task_iterator.cc
index dbe763303..e22fefb06 100644
--- a/cpp/src/reader/task/device_task_iterator.cc
+++ b/cpp/src/reader/task/device_task_iterator.cc
@@ -37,6 +37,9 @@ int DeviceTaskIterator::next(DeviceQueryTask*& task) {
         task = DeviceQueryTask::create_device_query_task(
             device_meta_pair.first, column_names_, column_mapping_,
             device_meta_pair.second, table_schema_, pa_);
+        if (task != nullptr) {
+            created_tasks_.push_back(task);
+        }
     }
     return ret;
 }
diff --git a/cpp/src/reader/task/device_task_iterator.h b/cpp/src/reader/task/device_task_iterator.h
index 061711c17..cc5a75562 100644
--- a/cpp/src/reader/task/device_task_iterator.h
+++ b/cpp/src/reader/task/device_task_iterator.h
@@ -58,7 +58,17 @@ class DeviceTaskIterator {
         pa_.init(512, common::MOD_DEVICE_TASK_ITER);
     }
 
-    ~DeviceTaskIterator() { pa_.destroy(); }
+    ~DeviceTaskIterator() {
+        // The tasks are placement-new'd into pa_ memory; pa_.destroy() only
+        // releases the raw bytes, so we must call their destructors here to
+        // release the heap-allocated members (std::vector<std::string>,
+        // shared_ptr's, etc.) they own.
+        for (DeviceQueryTask* t : created_tasks_) {
+            t->~DeviceQueryTask();
+        }
+        created_tasks_.clear();
+        pa_.destroy();
+    }
 
     void flush_remaining_device_meta_cache();
 
@@ -72,6 +82,7 @@ class DeviceTaskIterator {
     std::unique_ptr<DeviceMetaIterator> device_meta_iterator_;
     std::shared_ptr<TableSchema> table_schema_;
     common::PageArena pa_;
+    std::vector<DeviceQueryTask*> created_tasks_;
 };
 
 }  // namespace storage
diff --git a/cpp/src/reader/tsfile_reader.cc b/cpp/src/reader/tsfile_reader.cc
index 8d9d9b5dc..540674f33 100644
--- a/cpp/src/reader/tsfile_reader.cc
+++ b/cpp/src/reader/tsfile_reader.cc
@@ -94,8 +94,7 @@ namespace storage {
 TsFileReader::TsFileReader()
     : read_file_(nullptr),
       tsfile_executor_(nullptr),
-      table_query_executor_(nullptr),
-      table_query_executor_batch_size_(0) {
+      table_query_executor_(nullptr) {
     tsfile_reader_meta_pa_.init(512, MOD_TSFILE_READER);
 }
 
@@ -113,6 +112,22 @@ int TsFileReader::open(const std::string& file_path) {
     return ret;
 }
 
+int TsFileReader::ensure_table_query_executor(int batch_size) {
+    if (table_query_executor_ != nullptr &&
+        table_query_executor_batch_size_ == batch_size) {
+        return E_OK;
+    }
+
+    if (table_query_executor_ != nullptr) {
+        delete table_query_executor_;
+        table_query_executor_ = nullptr;
+    }
+
+    table_query_executor_ = new TableQueryExecutor(read_file_, batch_size);
+    table_query_executor_batch_size_ = batch_size;
+    return E_OK;
+}
+
 int TsFileReader::close() {
     int ret = E_OK;
     if (tsfile_executor_ != nullptr) {
@@ -123,7 +138,6 @@ int TsFileReader::close() {
         delete table_query_executor_;
         table_query_executor_ = nullptr;
     }
-    table_query_executor_batch_size_ = 0;
     if (read_file_ != nullptr) {
         read_file_->close();
         delete read_file_;
@@ -132,22 +146,6 @@ int TsFileReader::close() {
     return ret;
 }
 
-int TsFileReader::ensure_table_query_executor(int batch_size) {
-    if (table_query_executor_ != nullptr &&
-        table_query_executor_batch_size_ == batch_size) {
-        return E_OK;
-    }
-
-    if (table_query_executor_ != nullptr) {
-        delete table_query_executor_;
-        table_query_executor_ = nullptr;
-    }
-
-    table_query_executor_ = new TableQueryExecutor(read_file_, batch_size);
-    table_query_executor_batch_size_ = batch_size;
-    return E_OK;
-}
-
 int TsFileReader::query(QueryExpression* qe, ResultSet*& ret_qds) {
     return tsfile_executor_->execute(qe, ret_qds);
 }
@@ -411,16 +409,21 @@ int TsFileReader::get_timeseries_schema(
                          device_id, timeseries_indexs, pa))) {
     } else {
         for (auto timeseries_index : timeseries_indexs) {
-            auto* aligned_timeseries_index =
-                dynamic_cast<AlignedTimeseriesIndex*>(timeseries_index);
-            auto data_type =
-                aligned_timeseries_index != nullptr &&
-                        aligned_timeseries_index->value_ts_idx_ != nullptr
-                    ? aligned_timeseries_index->value_ts_idx_->get_data_type()
-                    : timeseries_index->get_data_type();
+            // AlignedTimeseriesIndex::get_data_type() returns the time
+            // column type (VECTOR) so the aligned/non-aligned dispatch in
+            // SSI can keep using the existing accessor.  For schema
+            // exposure we need the actual value column type — without this
+            // unwrap, INT32/FLOAT/... would all surface as VECTOR.
+            common::TSDataType dt = timeseries_index->get_data_type();
+            if (dt == common::VECTOR) {
+                auto* aligned =
+                    dynamic_cast<AlignedTimeseriesIndex*>(timeseries_index);
+                if (aligned != nullptr && aligned->value_ts_idx_ != nullptr) {
+                    dt = aligned->value_ts_idx_->get_data_type();
+                }
+            }
             MeasurementSchema ms(
-                timeseries_index->get_measurement_name().to_std_string(),
-                data_type);
+                timeseries_index->get_measurement_name().to_std_string(), dt);
             result.push_back(ms);
         }
     }
@@ -448,6 +451,15 @@ int TsFileReader::get_timeseries_metadata_impl(
 
 DeviceTimeseriesMetadataMap TsFileReader::get_timeseries_metadata(
     const std::vector<std::shared_ptr<IDeviceID>>& device_ids) {
+    // Reset the shared meta arena up front: every call writes fresh
+    // timeseries-index metadata into it via _impl(), and the previous
+    // implementation only ever appended.  A long-lived reader that repeats
+    // this query would grow tsfile_reader_meta_pa_ without bound (each call
+    // duplicates the per-device payload).  Callers that need to retain prior
+    // results past this call must copy them out before invoking again — the
+    // shared_ptrs handed back use a noop deleter pointing into this arena.
+    tsfile_reader_meta_pa_.destroy();
+    tsfile_reader_meta_pa_.init(512, MOD_TSFILE_READER);
     DeviceTimeseriesMetadataMap result;
     for (const auto& device_id : device_ids) {
         std::vector<std::shared_ptr<ITimeseriesIndex>> list;
@@ -466,6 +478,10 @@ DeviceTimeseriesMetadataMap TsFileReader::get_timeseries_metadata() {
         return result;
     }
 
+    // Same arena-reset rationale as the device_ids overload above.
+    tsfile_reader_meta_pa_.destroy();
+    tsfile_reader_meta_pa_.init(512, MOD_TSFILE_READER);
+
     PageArena pa;
     pa.init(512, MOD_TSFILE_READER);
     std::vector<DeviceMetaEntry> entries;
diff --git a/cpp/src/reader/tsfile_reader.h b/cpp/src/reader/tsfile_reader.h
index 19d83ec61..e2f9f3496 100644
--- a/cpp/src/reader/tsfile_reader.h
+++ b/cpp/src/reader/tsfile_reader.h
@@ -143,7 +143,6 @@ class TsFileReader {
      * @param offset         Number of leading rows to skip (>= 0).
      * @param limit          Maximum rows to return. < 0 means unlimited.
      * @param[out] result_set  The result set containing query results.
-     * @param tag_filter     Optional tag filter for filtering by tag columns.
      * @return Returns 0 on success, or a non-zero error code on failure.
      */
     int queryByRow(const std::string& table_name,
@@ -243,8 +242,10 @@ class TsFileReader {
     storage::ReadFile* read_file_;
     storage::TsFileExecutor* tsfile_executor_;
     storage::TableQueryExecutor* table_query_executor_;
-    int table_query_executor_batch_size_;
+    int table_query_executor_batch_size_ = -1;
     common::PageArena tsfile_reader_meta_pa_;
+    // Test-only hook for the unbounded-arena-growth regression check.
+    friend class TsFileReaderMetaArenaTest;
 };
 
 }  // namespace storage
diff --git a/cpp/src/reader/tsfile_series_scan_iterator.cc b/cpp/src/reader/tsfile_series_scan_iterator.cc
index 1d666bfc0..538b00d43 100644
--- a/cpp/src/reader/tsfile_series_scan_iterator.cc
+++ b/cpp/src/reader/tsfile_series_scan_iterator.cc
@@ -19,13 +19,37 @@
 
 #include "reader/tsfile_series_scan_iterator.h"
 
+#include <iostream>
+
+#include "common/global.h"
+#ifdef ENABLE_THREADS
+#include "common/thread_pool.h"
+#endif
+
 using namespace common;
 
 namespace storage {
 
 void TsFileSeriesScanIterator::destroy() {
+    // MultiAlignedTimeseriesIndex is placement-new'd inside
+    // timeseries_index_pa_ (see TsFileIOReader::alloc_multi_ssi).  The arena's
+    // destroy() frees raw memory without running destructors, so its
+    // value_ts_idxs_ std::vector backing buffer would leak.  Release it
+    // explicitly before tearing down the arena.  dynamic_cast is null-safe and
+    // returns nullptr for the single-value / non-aligned index types, which own
+    // no separate heap storage.
+    if (auto* multi =
+            dynamic_cast<MultiAlignedTimeseriesIndex*>(itimeseries_index_)) {
+        std::vector<TimeseriesIndex*>().swap(multi->value_ts_idxs_);
+    }
+    itimeseries_index_ = nullptr;
     timeseries_index_pa_.destroy();
     if (chunk_reader_ != nullptr) {
+        // destroy() already runs manual destructors on internal members
+        // (chunk_header_, decoders, compressor, ...), so calling
+        // chunk_reader_->~IChunkReader() here would double-destruct them.
+        // The vector-buffer leaks (e.g. chunk_pages_) are released inside
+        // AlignedChunkReader::destroy() via vector<>{}.swap().
         chunk_reader_->destroy();
         common::mem_free(chunk_reader_);
         chunk_reader_ = nullptr;
@@ -65,20 +89,24 @@ bool TsFileSeriesScanIterator::should_skip_aligned_chunk_by_offset(
     if (row_offset_ <= 0) {
         return false;
     }
-    if (time_cm->statistic_ == nullptr || value_cm->statistic_ == nullptr) {
+    // Aligned value chunks' statistic_->count_ only counts non-null rows,
+    // not total rows.  Using value_cm alone could skip an entire 100-row
+    // chunk for an offset of 10 just because it has 10 non-null values.
+    // Only apply the whole-chunk shortcut when time and value statistics
+    // agree on the row count (i.e. no sparse nulls in this chunk); fall
+    // through to per-page/per-row handling otherwise so the offset is
+    // applied against the real row stream.
+    if (time_cm == nullptr || value_cm == nullptr ||
+        time_cm->statistic_ == nullptr || value_cm->statistic_ == nullptr) {
         return false;
     }
     int32_t tc = time_cm->statistic_->count_;
     int32_t vc = value_cm->statistic_->count_;
-    if (tc <= 0 || vc <= 0) {
-        return false;
-    }
-    if (tc != vc) {
+    if (tc <= 0 || vc <= 0 || tc != vc) {
         return false;
     }
-    int32_t count = tc;
-    if (row_offset_ >= count) {
-        row_offset_ -= count;
+    if (row_offset_ >= tc) {
+        row_offset_ -= tc;
         return true;
     }
     return false;
@@ -91,74 +119,104 @@ int TsFileSeriesScanIterator::get_next(TsBlock*& ret_tsblock, bool alloc,
     Filter* filter =
         (oneshoot_filter != nullptr) ? oneshoot_filter : time_filter_;
 
+    // When get_next_page() reports E_NO_MORE_DATA but the chunk reader
+    // still claims has_more_data() (an aligned-chunk artifact where time
+    // and value pages report state differently), a bare `continue` would
+    // retry the exhausted chunk forever.  Force the next iteration to
+    // advance to the next chunk-meta cursor instead.
     bool force_load_next_chunk = false;
     while (true) {
-        // When get_next_page() reports no more data for the current chunk but
-        // metadata still lists more chunks, we must load the next chunk. A
-        // bare continue would retry the exhausted reader forever if
-        // has_more_data() still returns true (e.g. aligned chunk state).
         if (!chunk_reader_->has_more_data() || force_load_next_chunk) {
             force_load_next_chunk = false;
             while (true) {
                 if (!has_next_chunk()) {
                     return E_NO_MORE_DATA;
+                } else if (is_multi_value_) {
+                    // Multi-value aligned path
+                    ChunkMeta* time_cm = time_chunk_meta_cursor_.get();
+                    std::vector<ChunkMeta*> value_cms;
+                    value_cms.reserve(value_chunk_meta_cursors_.size());
+                    for (auto& cur : value_chunk_meta_cursors_) {
+                        value_cms.push_back(cur.get());
+                    }
+                    advance_to_next_chunk();
+                    // Skip chunk by time filter using time chunk statistics.
+                    if (filter != nullptr && time_cm->statistic_ != nullptr &&
+                        !filter->satisfy(time_cm->statistic_)) {
+                        continue;
+                    }
+                    if (should_skip_chunk_by_time(time_cm, min_time_hint)) {
+                        continue;
+                    }
+                    chunk_reader_->reset();
+                    auto* acr = static_cast<AlignedChunkReader*>(chunk_reader_);
+                    if (RET_FAIL(acr->load_by_aligned_meta_multi(time_cm,
+                                                                 value_cms))) {
+                    }
+                    break;
+                } else if (!is_aligned_) {
+                    ChunkMeta* cm = get_current_chunk_meta();
+                    advance_to_next_chunk();
+                    if (filter != nullptr && cm->statistic_ != nullptr &&
+                        !filter->satisfy(cm->statistic_)) {
+                        continue;
+                    }
+                    // Skip by min_time_hint (merge cursor).
+                    if (should_skip_chunk_by_time(cm, min_time_hint)) {
+                        continue;
+                    }
+                    // Single-path: skip entire chunk by offset using count.
+                    if (should_skip_chunk_by_offset(cm)) {
+                        continue;
+                    }
+                    chunk_reader_->reset();
+                    if (RET_FAIL(chunk_reader_->load_by_meta(cm))) {
+                    }
+                    break;
                 } else {
-                    if (!is_aligned_) {
-                        ChunkMeta* cm = get_current_chunk_meta();
-                        advance_to_next_chunk();
-                        // Skip by time filter.
-                        if (filter != nullptr && cm->statistic_ != nullptr &&
-                            !filter->satisfy(cm->statistic_)) {
-                            continue;
-                        }
-                        // Skip by min_time_hint (merge cursor).
-                        if (should_skip_chunk_by_time(cm, min_time_hint)) {
-                            continue;
-                        }
-                        // Single-path: skip entire chunk by offset using count.
-                        if (should_skip_chunk_by_offset(cm)) {
-                            continue;
-                        }
-                        chunk_reader_->reset();
-                        if (RET_FAIL(chunk_reader_->load_by_meta(cm))) {
-                        }
-                        break;
-                    } else {
-                        ChunkMeta* value_cm = value_chunk_meta_cursor_.get();
-                        ChunkMeta* time_cm = time_chunk_meta_cursor_.get();
-                        advance_to_next_chunk();
-                        if (filter != nullptr &&
-                            value_cm->statistic_ != nullptr &&
-                            !filter->satisfy(value_cm->statistic_)) {
-                            continue;
-                        }
-                        if (should_skip_chunk_by_time(value_cm,
-                                                      min_time_hint)) {
-                            continue;
-                        }
-                        if (should_skip_aligned_chunk_by_offset(time_cm,
-                                                                value_cm)) {
-                            continue;
-                        }
-                        chunk_reader_->reset();
-                        if (RET_FAIL(chunk_reader_->load_by_aligned_meta(
-                                time_cm, value_cm))) {
-                        }
-                        break;
+                    ChunkMeta* value_cm = value_chunk_meta_cursor_.get();
+                    ChunkMeta* time_cm = time_chunk_meta_cursor_.get();
+                    advance_to_next_chunk();
+                    // Use time chunk statistics for time-based filtering.
+                    ChunkMeta* filter_cm =
+                        (time_cm->statistic_ != nullptr) ? time_cm : value_cm;
+                    if (filter != nullptr && filter_cm->statistic_ != nullptr &&
+                        !filter->satisfy(filter_cm->statistic_)) {
+                        continue;
+                    }
+                    if (should_skip_chunk_by_time(filter_cm, min_time_hint)) {
+                        continue;
                     }
+                    if (should_skip_aligned_chunk_by_offset(time_cm,
+                                                            value_cm)) {
+                        continue;
+                    }
+                    chunk_reader_->reset();
+                    if (RET_FAIL(chunk_reader_->load_by_aligned_meta(
+                            time_cm, value_cm))) {
+                    }
+                    break;
                 }
             }
         }
         if (IS_SUCC(ret)) {
             if (alloc && ret_tsblock == nullptr) {
-                ret_tsblock = alloc_tsblock();
+                ret_tsblock =
+                    is_multi_value_ ? alloc_tsblock_multi() : alloc_tsblock();
             }
             ret = chunk_reader_->get_next_page(ret_tsblock, filter, *data_pa_,
                                                min_time_hint, row_offset_,
                                                row_limit_);
         }
+        if (ret == common::E_NO_MORE_DATA && ret_tsblock != nullptr &&
+            ret_tsblock->get_row_count() > 0) {
+            return E_OK;
+        }
         // When current chunk is exhausted (e.g. all pages skipped by offset)
-        // but there are more chunks, load next chunk and retry.
+        // but there are more chunks, load next chunk and retry.  Set the
+        // force flag so the next iteration bypasses has_more_data() (which
+        // can still report true on an aligned chunk that has actually
+        // yielded all its rows).
         if (ret == common::E_NO_MORE_DATA && has_next_chunk()) {
             ret = E_OK;
             force_load_next_chunk = true;
@@ -179,9 +237,19 @@ void TsFileSeriesScanIterator::revert_tsblock() {
 int TsFileSeriesScanIterator::init_chunk_reader() {
     int ret = E_OK;
     is_aligned_ = itimeseries_index_->is_aligned();
+
+    // Check if this is a multi-value aligned index. alloc_multi_ssi() creates
+    // MultiAlignedTimeseriesIndex even when the query selects one value column,
+    // so keep that path consistent with wider aligned reads.
+    if (is_aligned_ && dynamic_cast<MultiAlignedTimeseriesIndex*>(
+                           itimeseries_index_) != nullptr) {
+        return init_chunk_reader_multi();
+    }
+
     if (!is_aligned_) {
         void* buf =
             common::mem_alloc(sizeof(ChunkReader), common::MOD_CHUNK_READER);
+        if (IS_NULL(buf)) return E_OOM;
         chunk_reader_ = new (buf) ChunkReader;
         chunk_meta_cursor_ = itimeseries_index_->get_chunk_meta_list()->begin();
         if (RET_FAIL(chunk_reader_->init(
@@ -191,6 +259,7 @@ int TsFileSeriesScanIterator::init_chunk_reader() {
     } else {
         void* buf = common::mem_alloc(sizeof(AlignedChunkReader),
                                       common::MOD_CHUNK_READER);
+        if (IS_NULL(buf)) return E_OOM;
         chunk_reader_ = new (buf) AlignedChunkReader;
         time_chunk_meta_cursor_ =
             itimeseries_index_->get_time_chunk_meta_list()->begin();
@@ -205,6 +274,96 @@ int TsFileSeriesScanIterator::init_chunk_reader() {
     return ret;
 }
 
+int TsFileSeriesScanIterator::init_chunk_reader_multi() {
+    int ret = E_OK;
+    is_multi_value_ = true;
+
+    void* buf =
+        common::mem_alloc(sizeof(AlignedChunkReader), common::MOD_CHUNK_READER);
+    if (IS_NULL(buf)) {
+        // The single-value path (init_chunk_reader) silently dereferenced
+        // the null pointer on OOM; this path is new in the multi-value
+        // reader work and would do the same via placement-new(nullptr) →
+        // undefined behavior the moment any AlignedChunkReader field is
+        // touched.  Surface E_OOM instead.
+        is_multi_value_ = false;
+        return E_OOM;
+    }
+    auto* acr = new (buf) AlignedChunkReader;
+    chunk_reader_ = acr;
+
+    uint32_t num_cols = itimeseries_index_->get_value_column_count();
+#ifdef ENABLE_THREADS
+    // Borrow the single process-wide worker pool (created in init_common()) for
+    // multi-column decode.  Null when libtsfile_init() hasn't run; combined
+    // with parallel_read_enabled_ this gates the parallel decode path — the
+    // reader falls back to serial decode otherwise.
+    if (num_cols > 1 && common::g_config_value_.parallel_read_enabled_ &&
+        common::g_thread_pool_ != nullptr) {
+        acr->set_decode_pool(common::g_thread_pool_);
+    }
+#endif
+
+    // Per-column chunk lists must align 1:1 with the time chunk list:
+    // load_by_aligned_meta_multi pairs them by index and the downstream
+    // reader has no notion of a "missing" value chunk for a CGM.  If a
+    // file evolved its schema and some column has fewer (or more) chunks
+    // than the time column, naive index pairing would mate chunks from
+    // different chunk groups, returning garbage and dereferencing past
+    // end() once the shorter list ran out.  Refuse upfront with a clear
+    // error rather than producing wrong data.
+    uint32_t time_chunk_count =
+        itimeseries_index_->get_time_chunk_meta_list()->size();
+    for (uint32_t c = 0; c < num_cols; c++) {
+        if (itimeseries_index_->get_value_chunk_meta_list(c)->size() !=
+            time_chunk_count) {
+            return E_NOT_SUPPORT;
+        }
+    }
+
+    // Init time cursor
+    time_chunk_meta_cursor_ =
+        itimeseries_index_->get_time_chunk_meta_list()->begin();
+
+    // Init all value cursors
+    value_chunk_meta_cursors_.resize(num_cols);
+    for (uint32_t c = 0; c < num_cols; c++) {
+        value_chunk_meta_cursors_[c] =
+            itimeseries_index_->get_value_chunk_meta_list(c)->begin();
+    }
+
+    // Init chunk reader
+    if (RET_FAIL(
+            acr->init(read_file_, itimeseries_index_->get_measurement_name(),
+                      itimeseries_index_->get_data_type(), time_filter_))) {
+        return ret;
+    }
+
+    // No chunks → nothing to load; iteration short-circuits via
+    // has_next_chunk() returning false.
+    if (time_chunk_count == 0) {
+        return ret;
+    }
+
+    // Load first chunk set
+    ChunkMeta* time_cm = time_chunk_meta_cursor_.get();
+    std::vector<ChunkMeta*> value_cms;
+    value_cms.reserve(num_cols);
+    for (uint32_t c = 0; c < num_cols; c++) {
+        value_cms.push_back(value_chunk_meta_cursors_[c].get());
+    }
+
+    if (RET_FAIL(acr->load_by_aligned_meta_multi(time_cm, value_cms))) {
+        return ret;
+    }
+
+    // Advance cursors
+    time_chunk_meta_cursor_++;
+    for (auto& cur : value_chunk_meta_cursors_) cur++;
+
+    return ret;
+}
+
 TsBlock* TsFileSeriesScanIterator::alloc_tsblock() {
     ChunkHeader& ch = chunk_reader_->get_chunk_header();
 
@@ -225,4 +384,29 @@ TsBlock* TsFileSeriesScanIterator::alloc_tsblock() {
     return tsblock_;
 }
 
-}  // end namespace storage
\ No newline at end of file
+TsBlock* TsFileSeriesScanIterator::alloc_tsblock_multi() {
+    auto* acr = static_cast<AlignedChunkReader*>(chunk_reader_);
+
+    // Time column
+    ColumnSchema time_cd("time", common::INT64, common::SNAPPY,
+                         common::TS_2DIFF);
+    tuple_desc_.push_back(time_cd);
+
+    // Value columns
+    uint32_t num_cols = acr->get_value_column_count();
+    for (uint32_t c = 0; c < num_cols; c++) {
+        ChunkHeader& ch = acr->get_value_chunk_header(c);
+        ColumnSchema value_cd(ch.measurement_name_, ch.data_type_,
+                              ch.compression_type_, ch.encoding_type_);
+        tuple_desc_.push_back(value_cd);
+    }
+
+    tsblock_ = new TsBlock(&tuple_desc_);
+    if (E_OK != tsblock_->init()) {
+        delete tsblock_;
+        tsblock_ = nullptr;
+    }
+    return tsblock_;
+}
+
+}  // end namespace storage
diff --git a/cpp/src/reader/tsfile_series_scan_iterator.h b/cpp/src/reader/tsfile_series_scan_iterator.h
index 9e790a3d1..77037d8e1 100644
--- a/cpp/src/reader/tsfile_series_scan_iterator.h
+++ b/cpp/src/reader/tsfile_series_scan_iterator.h
@@ -50,6 +50,7 @@ class TsFileSeriesScanIterator {
           tsblock_(nullptr),
           time_filter_(nullptr),
           is_aligned_(false),
+          is_multi_value_(false),
           row_offset_(0),
           row_limit_(-1) {}
     ~TsFileSeriesScanIterator() { destroy(); }
@@ -93,11 +94,42 @@ class TsFileSeriesScanIterator {
                  int64_t min_time_hint = std::numeric_limits<int64_t>::min());
     void revert_tsblock();
 
+    // Multi-value: number of value columns in the TsBlock
+    uint32_t get_value_column_count() const {
+        if (is_multi_value_ && chunk_reader_) {
+            auto* acr = static_cast<AlignedChunkReader*>(chunk_reader_);
+            return acr->get_value_column_count();
+        }
+        return 1;
+    }
+
+    bool is_multi_value() const { return is_multi_value_; }
+
     friend class TsFileIOReader;
 
    private:
     int init_chunk_reader();
+    int init_chunk_reader_multi();
     FORCE_INLINE bool has_next_chunk() const {
+        if (is_multi_value_) {
+            // Anchor on the time chunk list and require every value column
+            // to still have a chunk available.  Checking only value[0] used
+            // to read past end() for columns with fewer chunks (e.g. a
+            // column added after some chunk groups had already been
+            // flushed), which dereferenced freed memory and paired the
+            // wrong time/value chunks.
+            if (time_chunk_meta_cursor_ ==
+                itimeseries_index_->get_time_chunk_meta_list()->end()) {
+                return false;
+            }
+            for (uint32_t c = 0; c < value_chunk_meta_cursors_.size(); c++) {
+                if (value_chunk_meta_cursors_[c] ==
+                    itimeseries_index_->get_value_chunk_meta_list(c)->end()) {
+                    return false;
+                }
+            }
+            return true;
+        }
         if (is_aligned_) {
             return value_chunk_meta_cursor_ !=
                    itimeseries_index_->get_value_chunk_meta_list()->end();
@@ -107,7 +139,21 @@ class TsFileSeriesScanIterator {
         }
     }
     FORCE_INLINE void advance_to_next_chunk() {
-        if (is_aligned_) {
+        if (is_multi_value_) {
+            // Guard each cursor against advancing past end().  Same defense
+            // as has_next_chunk(): per-column chunk counts can diverge in
+            // files with schema evolution.
+            auto time_end =
+                itimeseries_index_->get_time_chunk_meta_list()->end();
+            if (time_chunk_meta_cursor_ != time_end) time_chunk_meta_cursor_++;
+            for (uint32_t c = 0; c < value_chunk_meta_cursors_.size(); c++) {
+                auto end =
+                    itimeseries_index_->get_value_chunk_meta_list(c)->end();
+                if (value_chunk_meta_cursors_[c] != end) {
+                    value_chunk_meta_cursors_[c]++;
+                }
+            }
+        } else if (is_aligned_) {
             time_chunk_meta_cursor_++;
             value_chunk_meta_cursor_++;
         } else {
@@ -119,15 +165,10 @@ class TsFileSeriesScanIterator {
     }
     bool should_skip_chunk_by_time(ChunkMeta* cm, int64_t min_time_hint);
     bool should_skip_chunk_by_offset(ChunkMeta* cm);
-    /**
-     * Aligned (VECTOR): whole-chunk skip by row count is only safe when the
-     * time ChunkMeta and value ChunkMeta agree on statistic count (>0). If
-     * either side lacks count or counts differ, skip is disabled for this
-     * chunk; pages are loaded and page/row-level offset handling applies.
-     */
     bool should_skip_aligned_chunk_by_offset(ChunkMeta* time_cm,
                                              ChunkMeta* value_cm);
     common::TsBlock* alloc_tsblock();
+    common::TsBlock* alloc_tsblock_multi();
 
    private:
     ReadFile* read_file_;
@@ -140,12 +181,16 @@ class TsFileSeriesScanIterator {
     common::SimpleList<ChunkMeta*>::Iterator chunk_meta_cursor_;
     common::SimpleList<ChunkMeta*>::Iterator time_chunk_meta_cursor_;
     common::SimpleList<ChunkMeta*>::Iterator value_chunk_meta_cursor_;
+    // Multi-value: one cursor per value column
+    std::vector<common::SimpleList<ChunkMeta*>::Iterator>
+        value_chunk_meta_cursors_;
     IChunkReader* chunk_reader_;
 
     common::TupleDesc tuple_desc_;
     common::TsBlock* tsblock_;
     Filter* time_filter_;
     bool is_aligned_ = false;
+    bool is_multi_value_ = false;
     int row_offset_;
     int row_limit_;
 };
diff --git a/cpp/src/utils/db_utils.h b/cpp/src/utils/db_utils.h
index 4ffc4d138..b3cb1943e 100644
--- a/cpp/src/utils/db_utils.h
+++ b/cpp/src/utils/db_utils.h
@@ -195,8 +195,6 @@ struct ColumnSchema {
 };
 
 FORCE_INLINE int64_t get_cur_timestamp() {
-    // Milliseconds since the Unix epoch. Uses the C++11 standard library so it
-    // is portable across platforms (gettimeofday is not available on MSVC).
     return std::chrono::duration_cast<std::chrono::milliseconds>(
                std::chrono::system_clock::now().time_since_epoch())
         .count();
diff --git a/cpp/src/writer/chunk_writer.cc b/cpp/src/writer/chunk_writer.cc
index da1811336..acdb4951d 100644
--- a/cpp/src/writer/chunk_writer.cc
+++ b/cpp/src/writer/chunk_writer.cc
@@ -138,6 +138,9 @@ int ChunkWriter::seal_cur_page(bool end_chunk) {
 void ChunkWriter::save_first_page_data(PageWriter& first_page_writer) {
     first_page_data_ = first_page_writer.get_cur_page_data();
     first_page_statistic_->deep_copy_from(first_page_writer.get_statistic());
+    // See ValueChunkWriter::save_first_page_data: avoid double-free on the
+    // shallow-copied buffer pointers.
+    first_page_writer.release_cur_page_data();
 }
 
 int ChunkWriter::write_first_page_data(ByteStream& pages_data,
diff --git a/cpp/src/writer/chunk_writer.h b/cpp/src/writer/chunk_writer.h
index 6eb3f5418..a65f0537f 100644
--- a/cpp/src/writer/chunk_writer.h
+++ b/cpp/src/writer/chunk_writer.h
@@ -103,6 +103,68 @@ class ChunkWriter {
         CW_DO_WRITE_FOR_TYPE();
     }
 
+    template <typename T>
+    int write_batch(const int64_t* timestamps, const T* values,
+                    uint32_t count) {
+        int ret = common::E_OK;
+        uint32_t offset = 0;
+        const uint32_t page_cap =
+            common::g_config_value_.page_writer_max_point_num_;
+        while (offset < count) {
+            uint32_t cur_points = page_writer_.get_point_numer();
+            // Seal whenever cur_points is at or past the cap; the counter is
+            // size_ (rows including the just-written batch) and may exceed
+            // page_cap, so a plain subtraction would underflow uint32_t.
+            if (cur_points >= page_cap) {
+                if (RET_FAIL(seal_cur_page(false))) {
+                    return ret;
+                }
+                cur_points = 0;
+            }
+            uint32_t page_remaining = page_cap - cur_points;
+            uint32_t batch_size = std::min(count - offset, page_remaining);
+            if (RET_FAIL(page_writer_.write_batch(
+                    timestamps + offset, values + offset, batch_size))) {
+                return ret;
+            }
+            offset += batch_size;
+            if (RET_FAIL(seal_cur_page_if_full())) {
+                return ret;
+            }
+        }
+        return ret;
+    }
+
+    int write_string_batch(const int64_t* timestamps, const char* buffer,
+                           const uint32_t* offsets, uint32_t start_idx,
+                           uint32_t count) {
+        int ret = common::E_OK;
+        uint32_t offset = 0;
+        const uint32_t page_cap =
+            common::g_config_value_.page_writer_max_point_num_;
+        while (offset < count) {
+            uint32_t cur_points = page_writer_.get_point_numer();
+            if (cur_points >= page_cap) {
+                if (RET_FAIL(seal_cur_page(false))) {
+                    return ret;
+                }
+                cur_points = 0;
+            }
+            uint32_t page_remaining = page_cap - cur_points;
+            uint32_t batch_size = std::min(count - offset, page_remaining);
+            if (RET_FAIL(page_writer_.write_string_batch(
+                    timestamps + offset, buffer, offsets, start_idx + offset,
+                    batch_size))) {
+                return ret;
+            }
+            offset += batch_size;
+            if (RET_FAIL(seal_cur_page_if_full())) {
+                return ret;
+            }
+        }
+        return ret;
+    }
+
     int end_encode_chunk();
     common::ByteStream& get_chunk_data() { return chunk_data_; }
     Statistic* get_chunk_statistic() { return chunk_statistic_; }
diff --git a/cpp/src/writer/page_writer.cc b/cpp/src/writer/page_writer.cc
index 7766e14c4..eebe5b400 100644
--- a/cpp/src/writer/page_writer.cc
+++ b/cpp/src/writer/page_writer.cc
@@ -126,6 +126,11 @@ void PageWriter::reset() {
     }
     time_out_stream_.reset();
     value_out_stream_.reset();
+    // Without this, a page that was poisoned by a mid-batch encode failure
+    // would stay refused forever even after ChunkWriter calls reset() to
+    // start a fresh page — `partial_failure_` would still be true and
+    // write_to_chunk() would return E_DATA_INCONSISTENCY indefinitely.
+    partial_failure_ = false;
 }
 
 void PageWriter::destroy() {
@@ -156,6 +161,14 @@ int PageWriter::write_to_chunk(ByteStream& pages_data, bool write_header,
               << pages_data.total_size() << " of chunk_data." << std::endl;
 #endif
     int ret = E_OK;
+    // Refuse to seal a page whose time and value streams diverged because of
+    // a mid-batch encode failure (see PageWriter::write_batch).  The higher
+    // layer (TsFileWriter::unrecoverable_) is the authoritative place to
+    // surface this to the caller; this guard prevents a misaligned page from
+    // ever entering the chunk stream.
+    if (UNLIKELY(partial_failure_)) {
+        return common::E_DATA_INCONSISTENCY;
+    }
     if (RET_FAIL(prepare_end_page())) {
         return ret;
     }
diff --git a/cpp/src/writer/page_writer.h b/cpp/src/writer/page_writer.h
index d3966d865..47c958913 100644
--- a/cpp/src/writer/page_writer.h
+++ b/cpp/src/writer/page_writer.h
@@ -150,10 +150,63 @@ class PageWriter {
         PW_DO_WRITE_FOR_TYPE();
     }
 
+    template <typename T>
+    FORCE_INLINE int write_batch(const int64_t* timestamps, const T* values,
+                                 uint32_t count) {
+        int ret = common::E_OK;
+        if (count == 0) return ret;
+        if (UNLIKELY(partial_failure_)) return common::E_DATA_INCONSISTENCY;
+        if (RET_FAIL(time_encoder_->encode_batch(timestamps, count,
+                                                 time_out_stream_))) {
+            // Time stream wasn't advanced (encode_batch is atomic w.r.t. the
+            // stream cursor on failure for these encoders) — leave the page
+            // intact so the caller can retry.
+        } else if (RET_FAIL(value_encoder_->encode_batch(values, count,
+                                                         value_out_stream_))) {
+            // Time stream already advanced; we can't roll it back here.
+            // Mark the page poisoned so write_to_chunk() refuses to seal a
+            // page where time and value rows are out of sync.
+            partial_failure_ = true;
+        } else {
+            statistic_->update_batch(timestamps, values, count);
+        }
+        return ret;
+    }
+
+    // Batch write strings from Arrow-style offset+buffer layout.
+    FORCE_INLINE int write_string_batch(const int64_t* timestamps,
+                                        const char* buffer,
+                                        const uint32_t* offsets,
+                                        uint32_t start_idx, uint32_t count) {
+        int ret = common::E_OK;
+        if (count == 0) return ret;
+        if (UNLIKELY(partial_failure_)) return common::E_DATA_INCONSISTENCY;
+        if (RET_FAIL(time_encoder_->encode_batch(timestamps, count,
+                                                 time_out_stream_))) {
+        } else if (RET_FAIL(value_encoder_->encode_string_batch(
+                       buffer, offsets, start_idx, count, value_out_stream_))) {
+            partial_failure_ = true;
+        } else {
+            for (uint32_t i = 0; i < count; i++) {
+                uint32_t idx = start_idx + i;
+                uint32_t len = offsets[idx + 1] - offsets[idx];
+                common::String val(buffer + offsets[idx], len);
+                statistic_->update(timestamps[i], val);
+            }
+        }
+        return ret;
+    }
+
+    FORCE_INLINE bool has_partial_failure() const { return partial_failure_; }
+
     FORCE_INLINE uint32_t get_point_numer() const { return statistic_->count_; }
     FORCE_INLINE uint32_t get_time_out_stream_size() const {
         return time_out_stream_.total_size();
     }
+    // Logical bytes written — used by the page-seal-when-full heuristic.
+    // Memory-pressure accounting should use estimate_max_mem_size() below,
+    // which reflects the real 64 KiB-page footprint of the underlying
+    // ByteStreams.
     FORCE_INLINE uint32_t get_page_memory_size() const {
         return time_out_stream_.total_size() + value_out_stream_.total_size();
     }
@@ -162,10 +215,17 @@ class PageWriter {
      * outputStream and value outputStream, because size outputStream is never
      * used until flushing.
      *
+     * Reports the *allocated* stream footprint (sum of backing 64 KiB pages)
+     * rather than the logical bytes written.  Sparse workloads with many
+     * measurements would otherwise look like they hold ~0 memory while
+     * actually pinning a full 64 KiB page per stream, so chunk-group memory
+     * thresholds couldn't keep peak memory under the configured cap.
+     *
      * @return allocated size in time, value and outputStream
      */
     FORCE_INLINE uint32_t estimate_max_mem_size() const {
-        return time_out_stream_.total_size() + value_out_stream_.total_size() +
+        return static_cast<uint32_t>(time_out_stream_.allocated_bytes() +
+                                     value_out_stream_.allocated_bytes()) +
                time_encoder_->get_max_byte_size() +
                value_encoder_->get_max_byte_size();
     }
@@ -179,6 +239,11 @@ class PageWriter {
     }
     FORCE_INLINE Statistic* get_statistic() { return statistic_; }
     PageData get_cur_page_data() { return cur_page_data_; }
+    // See ValuePageWriter::release_cur_page_data for rationale.
+    void release_cur_page_data() {
+        cur_page_data_.uncompressed_buf_ = nullptr;
+        cur_page_data_.compressed_buf_ = nullptr;
+    }
     void destroy_page_data() { cur_page_data_.destroy(); }
 
    private:
@@ -193,7 +258,6 @@ class PageWriter {
                           common::ByteStream& pages_data);
 
    private:
-    // static const uint32_t OUT_STREAM_PAGE_SIZE = 48;
     static const uint32_t OUT_STREAM_PAGE_SIZE = 1024;
 
    private:
@@ -206,6 +270,11 @@ class PageWriter {
     PageData cur_page_data_;
     Compressor* compressor_;
     bool is_inited_;
+    // Set when write_batch advanced the time stream but value encoding
+    // failed.  We can't unwind the partial time write, so refuse further
+    // writes and surface the poisoning to the higher layer via
+    // write_to_chunk().
+    bool partial_failure_ = false;
 };
 
 }  // end namespace storage
diff --git a/cpp/src/writer/time_chunk_writer.cc b/cpp/src/writer/time_chunk_writer.cc
index 0c7e3b212..0a0623686 100644
--- a/cpp/src/writer/time_chunk_writer.cc
+++ b/cpp/src/writer/time_chunk_writer.cc
@@ -144,6 +144,9 @@ int TimeChunkWriter::seal_cur_page(bool end_chunk) {
 void TimeChunkWriter::save_first_page_data(TimePageWriter& first_page_writer) {
     first_page_data_ = first_page_writer.get_cur_page_data();
     first_page_statistic_->deep_copy_from(first_page_writer.get_statistic());
+    // See ValueChunkWriter::save_first_page_data: avoid double-free on the
+    // shallow-copied buffer pointers.
+    first_page_writer.release_cur_page_data();
 }
 
 int TimeChunkWriter::write_first_page_data(ByteStream& pages_data,
@@ -173,9 +176,6 @@ int TimeChunkWriter::end_encode_chunk() {
             chunk_header_.data_size_ = chunk_data_.total_size();
             chunk_header_.num_of_pages_ = num_of_pages_;
         }
-    } else if (num_of_pages_ > 0) {
-        chunk_header_.data_size_ = chunk_data_.total_size();
-        chunk_header_.num_of_pages_ = num_of_pages_;
     }
 #if DEBUG_SE
     std::cout << "end_encode_time_chunk: num_of_pages_=" << num_of_pages_
diff --git a/cpp/src/writer/time_chunk_writer.h b/cpp/src/writer/time_chunk_writer.h
index c67516ba5..e6b2894e2 100644
--- a/cpp/src/writer/time_chunk_writer.h
+++ b/cpp/src/writer/time_chunk_writer.h
@@ -42,8 +42,7 @@ class TimeChunkWriter {
           first_page_data_(),
           first_page_statistic_(nullptr),
           chunk_header_(),
-          num_of_pages_(0),
-          enable_page_seal_if_full_(true) {}
+          num_of_pages_(0) {}
     ~TimeChunkWriter() { destroy(); }
     int init(const common::ColumnSchema& col_schema);
     int init(const std::string& measurement_name, common::TSEncoding encoding,
@@ -58,9 +57,35 @@ class TimeChunkWriter {
         if (RET_FAIL(time_page_writer_.write(timestamp))) {
             return ret;
         }
-        if (UNLIKELY(!enable_page_seal_if_full_)) {
+        if (RET_FAIL(seal_cur_page_if_full())) {
             return ret;
-        } else {
+        }
+        return ret;
+    }
+
+    int write_batch(const int64_t* timestamps, uint32_t count) {
+        int ret = common::E_OK;
+        uint32_t offset = 0;
+        const uint32_t page_cap =
+            common::g_config_value_.page_writer_max_point_num_;
+        while (offset < count) {
+            uint32_t cur_points = time_page_writer_.get_point_numer();
+            // Seal whenever cur_points is at or past the cap; the counter is
+            // size_ (rows including the just-written batch) and may exceed
+            // page_cap, so a plain subtraction would underflow uint32_t.
+            if (cur_points >= page_cap) {
+                if (RET_FAIL(seal_cur_page(false))) {
+                    return ret;
+                }
+                cur_points = 0;
+            }
+            uint32_t page_remaining = page_cap - cur_points;
+            uint32_t batch_size = std::min(count - offset, page_remaining);
+            if (RET_FAIL(time_page_writer_.write_batch(timestamps + offset,
+                                                       batch_size))) {
+                return ret;
+            }
+            offset += batch_size;
             if (RET_FAIL(seal_cur_page_if_full())) {
                 return ret;
             }
@@ -73,29 +98,25 @@ class TimeChunkWriter {
     Statistic* get_chunk_statistic() { return chunk_statistic_; }
     FORCE_INLINE int32_t num_of_pages() const { return num_of_pages_; }
 
+    int64_t estimate_max_series_mem_size();
+
+    bool hasData();
+
     // Current (unsealed) page point count.
     FORCE_INLINE uint32_t get_point_numer() const {
         return time_page_writer_.get_point_numer();
     }
 
-    int64_t estimate_max_series_mem_size();
-
-    bool hasData();
-
     /** True if the current (unsealed) page has at least one point. */
     bool has_current_page_data() const {
         return time_page_writer_.get_point_numer() > 0;
     }
 
-    /**
-     * Force seal the current page (for aligned model: when any aligned page
-     * seals due to memory/point threshold, all pages must seal together).
-     * @return E_OK on success.
-     */
+    /** Force seal the current page. */
     int seal_current_page() { return seal_cur_page(false); }
 
-    // For aligned writer: allow disabling the automatic page-size/point-number
-    // check so the caller can seal pages at chosen boundaries.
+    // Allow disabling the automatic page-size/point-number check so the
+    // caller can seal pages at chosen boundaries.
     FORCE_INLINE void set_enable_page_seal_if_full(bool enable) {
         enable_page_seal_if_full_ = enable;
     }
@@ -109,6 +130,9 @@ class TimeChunkWriter {
                 common::g_config_value_.page_writer_max_memory_bytes_);
     }
     FORCE_INLINE int seal_cur_page_if_full() {
+        if (UNLIKELY(!enable_page_seal_if_full_)) {
+            return common::E_OK;
+        }
         if (UNLIKELY(is_cur_page_full())) {
             return seal_cur_page(false);
         }
@@ -138,8 +162,7 @@ class TimeChunkWriter {
 
     ChunkHeader chunk_header_;
     int32_t num_of_pages_;
-    // If false, write() won't auto-seal when the current page becomes full.
-    bool enable_page_seal_if_full_;
+    bool enable_page_seal_if_full_ = true;
 };
 
 }  // end namespace storage
diff --git a/cpp/src/writer/time_page_writer.h b/cpp/src/writer/time_page_writer.h
index d9dcecff1..bda9a5023 100644
--- a/cpp/src/writer/time_page_writer.h
+++ b/cpp/src/writer/time_page_writer.h
@@ -84,15 +84,40 @@ class TimePageWriter {
         return ret;
     }
 
+    int write_batch(const int64_t* timestamps, uint32_t count) {
+        int ret = common::E_OK;
+        if (count == 0) return ret;
+        // Check order: first timestamp vs existing end_time
+        if (statistic_->count_ != 0 && is_inited_ &&
+            timestamps[0] <= statistic_->end_time_) {
+            return common::E_OUT_OF_ORDER;
+        }
+        // Check monotonicity within batch
+        for (uint32_t i = 1; i < count; i++) {
+            if (timestamps[i] <= timestamps[i - 1]) {
+                return common::E_OUT_OF_ORDER;
+            }
+        }
+        if (RET_FAIL(time_encoder_->encode_batch(timestamps, count,
+                                                 time_out_stream_))) {
+        } else {
+            statistic_->update_time_batch(timestamps, count);
+        }
+        return ret;
+    }
+
     FORCE_INLINE uint32_t get_point_numer() const { return statistic_->count_; }
     FORCE_INLINE uint32_t get_time_out_stream_size() const {
         return time_out_stream_.total_size();
     }
+    // Logical bytes written — used by the page-seal-when-full heuristic.
     FORCE_INLINE uint32_t get_page_memory_size() const {
         return time_out_stream_.total_size();
     }
+    // Allocated 64 KiB-page footprint — used by chunk-group memory pressure
+    // accounting.  See PageWriter::estimate_max_mem_size.
     FORCE_INLINE uint32_t estimate_max_mem_size() const {
-        return time_out_stream_.total_size() +
+        return static_cast<uint32_t>(time_out_stream_.allocated_bytes()) +
                time_encoder_->get_max_byte_size();
     }
     int write_to_chunk(common::ByteStream& pages_data, bool write_header,
@@ -102,6 +127,11 @@ class TimePageWriter {
     }
     FORCE_INLINE Statistic* get_statistic() { return statistic_; }
     TimePageData get_cur_page_data() { return cur_page_data_; }
+    // See ValuePageWriter::release_cur_page_data for rationale.
+    void release_cur_page_data() {
+        cur_page_data_.uncompressed_buf_ = nullptr;
+        cur_page_data_.compressed_buf_ = nullptr;
+    }
     void destroy_page_data() { cur_page_data_.destroy(); }
 
    private:
diff --git a/cpp/src/writer/tsfile_table_writer.cc b/cpp/src/writer/tsfile_table_writer.cc
index eb0319af8..b1b7911bd 100644
--- a/cpp/src/writer/tsfile_table_writer.cc
+++ b/cpp/src/writer/tsfile_table_writer.cc
@@ -45,7 +45,7 @@ TsFileTableWriter::TsFileTableWriter(
 
 }  // namespace storage
 
-storage::TsFileTableWriter::~TsFileTableWriter() = default;
+storage::TsFileTableWriter::~TsFileTableWriter() { close(); }
 
 int storage::TsFileTableWriter::register_table(
     const std::shared_ptr<TableSchema>& table_schema) {
@@ -66,21 +66,48 @@ int storage::TsFileTableWriter::write_table(storage::Tablet& tablet) const {
                tablet.get_table_name() != exclusive_table_name_) {
         return common::E_TABLE_NOT_EXIST;
     }
+    // Always lowercase the incoming tablet's table / column / schema-map
+    // names: each call may carry a fresh tablet with mixed-case identifiers,
+    // and the underlying engine expects lowercase. Lowering is idempotent so
+    // reusing the same tablet across calls remains cheap.
     tablet.set_table_name(to_lower(tablet.get_table_name()));
     for (size_t i = 0; i < tablet.get_column_count(); i++) {
         tablet.set_column_name(i, to_lower(tablet.get_column_name(i)));
     }
 
     auto schema_map = tablet.get_schema_map();
-    std::map<std::string, int> schema_map_;
+    std::map<std::string, int> new_schema_map;
     for (auto iter = schema_map.begin(); iter != schema_map.end(); iter++) {
-        schema_map_[to_lower(iter->first)] = iter->second;
+        new_schema_map[to_lower(iter->first)] = iter->second;
     }
-    tablet.set_schema_map(schema_map_);
+    tablet.set_schema_map(new_schema_map);
 
     return tsfile_writer_->write_table(tablet);
 }
 
-int storage::TsFileTableWriter::flush() { return tsfile_writer_->flush(); }
+int storage::TsFileTableWriter::flush() {
+    if (closed_) {
+        return common::E_OK;
+    }
+    return tsfile_writer_->flush();
+}
 
-int storage::TsFileTableWriter::close() { return tsfile_writer_->close(); }
+int storage::TsFileTableWriter::close() {
+    if (closed_) {
+        return common::E_OK;
+    }
+    if (!tsfile_writer_) {
+        closed_ = true;
+        return common::E_OK;
+    }
+    // Don't latch closed_ until the underlying writer reports success: a
+    // failed footer write / sync / file close should be retryable, and the
+    // destructor must still be able to drive a final close attempt.  The
+    // previous order returned E_OK on every retry after the first failure,
+    // potentially leaving the file unfinished and leaking the fd.
+    int ret = tsfile_writer_->close();
+    if (ret == common::E_OK) {
+        closed_ = true;
+    }
+    return ret;
+}
diff --git a/cpp/src/writer/tsfile_table_writer.h b/cpp/src/writer/tsfile_table_writer.h
index ce18bc007..a2d2a5fd9 100644
--- a/cpp/src/writer/tsfile_table_writer.h
+++ b/cpp/src/writer/tsfile_table_writer.h
@@ -124,6 +124,8 @@ class TsFileTableWriter {
     // Some errors may not be conveyed during the construction phase, so it's
     // necessary to maintain an internal error code.
     int error_number = common::E_OK;
+
+    bool closed_ = false;
 };
 
 }  // namespace storage
diff --git a/cpp/src/writer/tsfile_writer.cc b/cpp/src/writer/tsfile_writer.cc
index bc3398d98..c469faaec 100644
--- a/cpp/src/writer/tsfile_writer.cc
+++ b/cpp/src/writer/tsfile_writer.cc
@@ -25,8 +25,12 @@
 #include <unistd.h>
 #endif
 
+#include <chrono>
+#include <iomanip>
+
 #include "chunk_writer.h"
 #include "common/config/config.h"
+#include "common/global.h"
 #ifdef ENABLE_THREADS
 #include "common/thread_pool.h"
 #endif
@@ -56,23 +60,19 @@ int libtsfile_init() {
 }
 
 void libtsfile_destroy() {
+    ModStat::get_instance().destroy();
 #ifdef ENABLE_THREADS
-    delete common::g_write_thread_pool_;
-    common::g_write_thread_pool_ = nullptr;
+    delete common::g_thread_pool_;
+    common::g_thread_pool_ = nullptr;
 #endif
-    ModStat::get_instance().destroy();
     libtsfile::g_s_is_inited = false;
 }
 
-void set_page_max_point_count(uint32_t page_max_ponint_count) {
-    config_set_page_max_point_count(page_max_ponint_count);
+int set_page_max_point_count(uint32_t page_max_ponint_count) {
+    return config_set_page_max_point_count(page_max_ponint_count);
 }
-void set_max_degree_of_index_node(uint32_t max_degree_of_index_node) {
-    config_set_max_degree_of_index_node(max_degree_of_index_node);
-}
-
-void set_strict_page_size(bool strict_page_size) {
-    config_set_strict_page_size(strict_page_size);
+int set_max_degree_of_index_node(uint32_t max_degree_of_index_node) {
+    return config_set_max_degree_of_index_node(max_degree_of_index_node);
 }
 
 TsFileWriter::TsFileWriter()
@@ -84,8 +84,7 @@ TsFileWriter::TsFileWriter()
       record_count_for_next_mem_check_(
           g_config_value_.record_count_for_next_mem_check_),
       write_file_created_(false),
-      io_writer_owned_(true),
-      enforce_recovered_last_time_order_(false) {}
+      io_writer_owned_(true) {}
 
 TsFileWriter::~TsFileWriter() { destroy(); }
 
@@ -131,7 +130,19 @@ int TsFileWriter::init(WriteFile* write_file) {
     write_file_ = write_file;
     write_file_created_ = false;
     io_writer_owned_ = true;
+    // Re-arm per-lifecycle state when the writer is reused after a
+    // destroy().  enforce_recovered_last_time_order_ may have been set
+    // true by a previous recovery init; without resetting it we'd refuse
+    // valid writes whose timestamps don't satisfy a long-stale anchor.
+    // unrecoverable_ from a previous partial-write failure would otherwise
+    // make every operation on the new file fail immediately.
+    // start_file_done_ is true after the previous lifecycle's first flush,
+    // so without resetting it flush() would skip the magic/version write on
+    // the new file and produce headerless output.
     enforce_recovered_last_time_order_ = false;
+    unrecoverable_ = false;
+    start_file_done_ = false;
+    record_count_since_last_flush_ = 0;
     io_writer_ = new TsFileIOWriter();
     io_writer_->init(write_file_);
     return E_OK;
@@ -151,6 +162,10 @@ int TsFileWriter::init(RestorableTsFileIOWriter* rw) {
     write_file_ = rw->get_write_file();
     write_file_created_ = false;
     io_writer_owned_ = false;
+    // Clear any unrecoverable_ latched from a previous lifecycle so the
+    // re-init isn't immediately poisoned.
+    unrecoverable_ = false;
+    // Reject new writes whose timestamps fall back into the recovered range.
     enforce_recovered_last_time_order_ = true;
     io_writer_ = rw;
 
@@ -188,6 +203,8 @@ int TsFileWriter::init(RestorableTsFileIOWriter* rw) {
             if (cm == nullptr) {
                 continue;
             }
+            // Track the highest end_time across recovered chunks so that
+            // appending writes can refuse out-of-order timestamps.
             if (cm->statistic_ != nullptr && cm->statistic_->count_ > 0) {
                 group->last_time_ =
                     std::max(group->last_time_, cm->statistic_->end_time_);
@@ -682,6 +699,10 @@ int64_t TsFileWriter::calculate_mem_size_for_all_group() {
     return mem_total_size;
 }
 
+int64_t TsFileWriter::calculate_meta_mem_size() const {
+    return io_writer_->get_meta_size();
+}
+
 /**
  * check occupied memory size, if it exceeds the chunkGroupSize threshold, flush
  * them to given OutputStream.
@@ -689,7 +710,15 @@ int64_t TsFileWriter::calculate_mem_size_for_all_group() {
 int TsFileWriter::check_memory_size_and_may_flush_chunks() {
     int ret = E_OK;
     if (record_count_since_last_flush_ >= record_count_for_next_mem_check_) {
-        int64_t mem_size = calculate_mem_size_for_all_group();
+        // chunk-writer memory drops to ~0 after flush, but chunk metadata
+        // (ChunkMeta / ChunkGroupMeta / per-statistic PageArenas) keeps
+        // accumulating until end_file().  Wide-schema or many-flush
+        // workloads can pile up tens of MB of metadata that the old
+        // threshold check ignored entirely — flush would never fire even
+        // though total writer memory was well past chunk_group_size_threshold_.
+        int64_t chunk_size = calculate_mem_size_for_all_group();
+        int64_t meta_size = calculate_meta_mem_size();
+        int64_t mem_size = chunk_size + meta_size;
         record_count_for_next_mem_check_ =
             record_count_since_last_flush_ *
             common::g_config_value_.chunk_group_size_threshold_ / mem_size;
@@ -701,16 +730,17 @@ int TsFileWriter::check_memory_size_and_may_flush_chunks() {
 }
 
 int TsFileWriter::write_record(const TsRecord& record) {
+    if (UNLIKELY(unrecoverable_)) return E_DATA_INCONSISTENCY;
     int ret = E_OK;
     auto device_id = std::make_shared<StringArrayDeviceID>(record.device_id_);
-    auto schema_it = schemas_.find(device_id);
-    if (schema_it == schemas_.end() || schema_it->second == nullptr) {
-        return E_DEVICE_NOT_EXIST;
-    }
-    MeasurementSchemaGroup* device_schema = schema_it->second;
-    if (enforce_recovered_last_time_order_ &&
-        record.timestamp_ <= device_schema->last_time_) {
-        return E_OUT_OF_ORDER;
+    // After recovery, refuse writes whose timestamp would land at or before
+    // any already-flushed chunk's end_time for this device.
+    if (enforce_recovered_last_time_order_) {
+        auto schema_it = schemas_.find(device_id);
+        if (schema_it != schemas_.end() && schema_it->second != nullptr &&
+            record.timestamp_ <= schema_it->second->last_time_) {
+            return E_OUT_OF_ORDER;
+        }
     }
     // std::vector<ChunkWriter*> chunk_writers;
     SimpleVector<ChunkWriter*> chunk_writers;
@@ -732,24 +762,28 @@ int TsFileWriter::write_record(const TsRecord& record) {
                     record.points_[c]);
     }
 
-    device_schema->last_time_ =
-        std::max(device_schema->last_time_, record.timestamp_);
+    if (enforce_recovered_last_time_order_) {
+        auto schema_it = schemas_.find(device_id);
+        if (schema_it != schemas_.end() && schema_it->second != nullptr) {
+            schema_it->second->last_time_ =
+                std::max(schema_it->second->last_time_, record.timestamp_);
+        }
+    }
     record_count_since_last_flush_++;
     ret = check_memory_size_and_may_flush_chunks();
     return ret;
 }
 
 int TsFileWriter::write_record_aligned(const TsRecord& record) {
+    if (UNLIKELY(unrecoverable_)) return E_DATA_INCONSISTENCY;
     int ret = E_OK;
     auto device_id = std::make_shared<StringArrayDeviceID>(record.device_id_);
-    auto schema_it = schemas_.find(device_id);
-    if (schema_it == schemas_.end() || schema_it->second == nullptr) {
-        return E_DEVICE_NOT_EXIST;
-    }
-    MeasurementSchemaGroup* device_schema = schema_it->second;
-    if (enforce_recovered_last_time_order_ &&
-        record.timestamp_ <= device_schema->last_time_) {
-        return E_OUT_OF_ORDER;
+    if (enforce_recovered_last_time_order_) {
+        auto schema_it = schemas_.find(device_id);
+        if (schema_it != schemas_.end() && schema_it->second != nullptr &&
+            record.timestamp_ <= schema_it->second->last_time_) {
+            return E_OUT_OF_ORDER;
+        }
     }
     SimpleVector<ValueChunkWriter*> value_chunk_writers;
     SimpleVector<common::TSDataType> data_types;
@@ -763,6 +797,8 @@ int TsFileWriter::write_record_aligned(const TsRecord& record) {
     if (value_chunk_writers.size() != record.points_.size()) {
         return E_INVALID_ARG;
     }
+    // Snapshot page counters before the write so we can detect any column
+    // that crossed a page boundary and seal the rest in lockstep.
     int32_t time_pages_before = time_chunk_writer->num_of_pages();
     std::vector<int32_t> value_pages_before(value_chunk_writers.size(), 0);
     for (uint32_t c = 0; c < value_chunk_writers.size(); c++) {
@@ -771,22 +807,40 @@ int TsFileWriter::write_record_aligned(const TsRecord& record) {
             value_pages_before[c] = value_chunk_writer->num_of_pages();
         }
     }
-    time_chunk_writer->write(record.timestamp_);
+    // Time first: a rejected timestamp (E_OUT_OF_ORDER, OOM, etc.) must
+    // not silently advance the value writers — that would leave the time
+    // chunk one row behind every value chunk for the rest of the file.
+    if (RET_FAIL(time_chunk_writer->write(record.timestamp_))) {
+        return ret;
+    }
     for (uint32_t c = 0; c < value_chunk_writers.size(); c++) {
         ValueChunkWriter* value_chunk_writer = value_chunk_writers[c];
         if (IS_NULL(value_chunk_writer)) {
             continue;
         }
-        write_point_aligned(value_chunk_writer, record.timestamp_,
-                            data_types[c], record.points_[c]);
+        if (RET_FAIL(write_point_aligned(value_chunk_writer, record.timestamp_,
+                                         data_types[c], record.points_[c]))) {
+            // Time wrote the row but at least one value column failed
+            // mid-record; the per-column row counts no longer agree.
+            // Mark the writer unrecoverable so flush/close refuses to
+            // seal a misaligned chunk group.
+            unrecoverable_ = true;
+            return ret;
+        }
     }
     if (RET_FAIL(maybe_seal_aligned_pages_together(
             time_chunk_writer, value_chunk_writers, time_pages_before,
             value_pages_before))) {
+        unrecoverable_ = true;
         return ret;
     }
-    device_schema->last_time_ =
-        std::max(device_schema->last_time_, record.timestamp_);
+    if (enforce_recovered_last_time_order_) {
+        auto schema_it = schemas_.find(device_id);
+        if (schema_it != schemas_.end() && schema_it->second != nullptr) {
+            schema_it->second->last_time_ =
+                std::max(schema_it->second->last_time_, record.timestamp_);
+        }
+    }
     return ret;
 }
 
@@ -815,39 +869,10 @@ int TsFileWriter::write_point(ChunkWriter* chunk_writer, int64_t timestamp,
     }
 }
 
-int TsFileWriter::write_point_aligned(ValueChunkWriter* value_chunk_writer,
-                                      int64_t timestamp,
-                                      common::TSDataType data_type,
-                                      const DataPoint& point) {
-    bool isnull = point.isnull;
-    switch (data_type) {
-        case common::BOOLEAN:
-            return value_chunk_writer->write(timestamp, point.u_.bool_val_,
-                                             isnull);
-        case common::INT32:
-        case common::DATE:
-            return value_chunk_writer->write(timestamp, point.u_.i32_val_,
-                                             isnull);
-        case common::TIMESTAMP:
-        case common::INT64:
-            return value_chunk_writer->write(timestamp, point.u_.i64_val_,
-                                             isnull);
-        case common::FLOAT:
-            return value_chunk_writer->write(timestamp, point.u_.float_val_,
-                                             isnull);
-        case common::DOUBLE:
-            return value_chunk_writer->write(timestamp, point.u_.double_val_,
-                                             isnull);
-        case common::BLOB:
-        case common::TEXT:
-        case common::STRING:
-            return value_chunk_writer->write(timestamp, point.text_val_,
-                                             isnull);
-        default:
-            return E_INVALID_DATA_POINT;
-    }
-}
-
+// After writing one record / batch to the time chunk and every value chunk,
+// keep their page boundaries aligned: if any of them autosealed a page on
+// memory pressure, seal the rest of the open pages too so an aligned reader
+// can still pair position N across time + every value column.
 int TsFileWriter::maybe_seal_aligned_pages_together(
     TimeChunkWriter* time_chunk_writer,
     common::SimpleVector<ValueChunkWriter*>& value_chunk_writers,
@@ -883,19 +908,52 @@ int TsFileWriter::maybe_seal_aligned_pages_together(
     return ret;
 }
 
+int TsFileWriter::write_point_aligned(ValueChunkWriter* value_chunk_writer,
+                                      int64_t timestamp,
+                                      common::TSDataType data_type,
+                                      const DataPoint& point) {
+    bool isnull = point.isnull;
+    switch (data_type) {
+        case common::BOOLEAN:
+            return value_chunk_writer->write(timestamp, point.u_.bool_val_,
+                                             isnull);
+        case common::INT32:
+        case common::DATE:
+            return value_chunk_writer->write(timestamp, point.u_.i32_val_,
+                                             isnull);
+        case common::TIMESTAMP:
+        case common::INT64:
+            return value_chunk_writer->write(timestamp, point.u_.i64_val_,
+                                             isnull);
+        case common::FLOAT:
+            return value_chunk_writer->write(timestamp, point.u_.float_val_,
+                                             isnull);
+        case common::DOUBLE:
+            return value_chunk_writer->write(timestamp, point.u_.double_val_,
+                                             isnull);
+        case common::BLOB:
+        case common::TEXT:
+        case common::STRING:
+            return value_chunk_writer->write(timestamp, point.text_val_,
+                                             isnull);
+        default:
+            return E_INVALID_DATA_POINT;
+    }
+}
+
 int TsFileWriter::write_tablet_aligned(const Tablet& tablet) {
+    if (UNLIKELY(unrecoverable_)) return E_DATA_INCONSISTENCY;
     int ret = E_OK;
     auto device_id =
         std::make_shared<StringArrayDeviceID>(tablet.insert_target_name_);
-    auto schema_it = schemas_.find(device_id);
-    if (schema_it == schemas_.end() || schema_it->second == nullptr) {
-        return E_DEVICE_NOT_EXIST;
-    }
-    MeasurementSchemaGroup* device_schema = schema_it->second;
     const uint32_t total_rows = tablet.get_cur_row_size();
     if (enforce_recovered_last_time_order_ && total_rows > 0 &&
-        tablet.timestamps_[0] <= device_schema->last_time_) {
-        return E_OUT_OF_ORDER;
+        tablet.timestamps_ != nullptr) {
+        auto schema_it = schemas_.find(device_id);
+        if (schema_it != schemas_.end() && schema_it->second != nullptr &&
+            tablet.timestamps_[0] <= schema_it->second->last_time_) {
+            return E_OUT_OF_ORDER;
+        }
     }
     SimpleVector<ValueChunkWriter*> value_chunk_writers;
     TimeChunkWriter* time_chunk_writer = nullptr;
@@ -906,247 +964,109 @@ int TsFileWriter::write_tablet_aligned(const Tablet& tablet) {
                                          data_types))) {
         return ret;
     }
-    const bool strict_page_size = common::g_config_value_.strict_page_size_;
-
-    // Decide whether we have string/blob/text columns.
-    bool has_varlen_column = false;
-    for (uint32_t i = 0; i < data_types.size(); i++) {
-        if (data_types[i] == common::STRING || data_types[i] == common::TEXT ||
-            data_types[i] == common::BLOB) {
-            has_varlen_column = true;
-            break;
-        }
-    }
-
-    // Keep writers' seal-check behavior consistent across calls.
-    time_chunk_writer->set_enable_page_seal_if_full(strict_page_size);
-    for (uint32_t c = 0; c < value_chunk_writers.size(); c++) {
-        if (!IS_NULL(value_chunk_writers[c])) {
-            value_chunk_writers[c]->set_enable_page_seal_if_full(
-                strict_page_size);
-        }
-    }
-
-    if (strict_page_size) {
-        // Strict mode: keep the original row-based insertion to ensure aligned
-        // pages seal together when either side becomes full.
-        for (uint32_t row = 0; row < total_rows; row++) {
-            int32_t time_pages_before = time_chunk_writer->num_of_pages();
-            std::vector<int32_t> value_pages_before(value_chunk_writers.size(),
-                                                    0);
-            for (uint32_t c = 0; c < value_chunk_writers.size(); c++) {
-                ValueChunkWriter* value_chunk_writer = value_chunk_writers[c];
-                if (!IS_NULL(value_chunk_writer)) {
-                    value_pages_before[c] = value_chunk_writer->num_of_pages();
-                }
-            }
-
-            if (RET_FAIL(time_chunk_writer->write(tablet.timestamps_[row]))) {
-                return ret;
-            }
-            ASSERT(value_chunk_writers.size() == tablet.get_column_count());
-            for (uint32_t c = 0; c < value_chunk_writers.size(); c++) {
-                ValueChunkWriter* value_chunk_writer = value_chunk_writers[c];
-                if (IS_NULL(value_chunk_writer)) {
-                    continue;
-                }
-                if (RET_FAIL(value_write_column(value_chunk_writer, tablet, c,
-                                                row, row + 1))) {
-                    return ret;
-                }
-            }
-            if (RET_FAIL(maybe_seal_aligned_pages_together(
-                    time_chunk_writer, value_chunk_writers, time_pages_before,
-                    value_pages_before))) {
-                return ret;
-            }
+    ASSERT(data_types.size() == tablet.get_column_count());
+    for (uint32_t c = 0; c < data_types.size(); c++) {
+        if (data_types[c] == common::NULL_TYPE) {
+            continue;
         }
-        if (total_rows > 0) {
-            device_schema->last_time_ = std::max(
-                device_schema->last_time_, tablet.timestamps_[total_rows - 1]);
+        if (data_types[c] != tablet.schema_vec_->at(c).data_type_) {
+            return E_TYPE_NOT_MATCH;
         }
-        return ret;
     }
-
-    // Non-strict mode: switch to column-based insertion.
-    if (!has_varlen_column) {
-        // Optimization: when there is no string/blob/text column, we only need
-        // to split by point-number so that each split will trigger a page
-        // seal (and avoid the per-row page-size check).
-        const uint32_t points_per_page =
-            common::g_config_value_.page_writer_max_point_num_;
-
-        // Disable auto page sealing. We will seal pages at split boundaries.
-        time_chunk_writer->set_enable_page_seal_if_full(false);
-        for (uint32_t c = 0; c < value_chunk_writers.size(); c++) {
-            if (!IS_NULL(value_chunk_writers[c])) {
-                value_chunk_writers[c]->set_enable_page_seal_if_full(false);
-            }
-        }
-
-        // Determine how many points we need to fill the current unsealed time
-        // page (it may already contain data from previous tablets).
-        uint32_t time_cur_points = time_chunk_writer->get_point_numer();
-        if (time_cur_points >= points_per_page &&
-            time_chunk_writer->has_current_page_data()) {
-            // Close the already-full page together with all aligned value
-            // pages.
-            if (RET_FAIL(time_chunk_writer->seal_current_page())) {
-                return ret;
-            }
-            for (uint32_t c = 0; c < value_chunk_writers.size(); c++) {
-                ValueChunkWriter* value_chunk_writer = value_chunk_writers[c];
-                if (!IS_NULL(value_chunk_writer) &&
-                    value_chunk_writer->has_current_page_data()) {
-                    if (RET_FAIL(value_chunk_writer->seal_current_page())) {
-                        return ret;
-                    }
-                }
-            }
-            time_cur_points = 0;
-        }
-        const uint32_t first_seg_len =
-            (time_cur_points > 0 && time_cur_points < points_per_page)
-                ? (points_per_page - time_cur_points)
-                : points_per_page;
-
-        // 1) Write time in segments and seal all full segments (except the
-        // last remaining segment).
-        uint32_t seg_start = 0;
-        uint32_t seg_len = first_seg_len;
-        while (seg_start < total_rows) {
-            const uint32_t seg_end = std::min(seg_start + seg_len, total_rows);
-            if (RET_FAIL(time_write_column(time_chunk_writer, tablet, seg_start,
-                                           seg_end))) {
-                return ret;
-            }
-            seg_start = seg_end;
-            if (seg_start < total_rows) {
-                if (RET_FAIL(time_chunk_writer->seal_current_page())) {
-                    return ret;
-                }
-            }
-            seg_len = points_per_page;
-        }
-
-        // 2) Write each value column in the same segments.
-        ASSERT(value_chunk_writers.size() == tablet.get_column_count());
-        for (uint32_t col = 0; col < value_chunk_writers.size(); col++) {
-            ValueChunkWriter* value_chunk_writer = value_chunk_writers[col];
-            if (IS_NULL(value_chunk_writer)) {
-                continue;
-            }
-
-            seg_start = 0;
-            seg_len = first_seg_len;
-            while (seg_start < total_rows) {
-                const uint32_t seg_end =
-                    std::min(seg_start + seg_len, total_rows);
-                if (RET_FAIL(value_write_column(value_chunk_writer, tablet, col,
-                                                seg_start, seg_end))) {
-                    return ret;
-                }
-                seg_start = seg_end;
-                if (seg_start < total_rows) {
-                    if (value_chunk_writer->has_current_page_data() &&
-                        RET_FAIL(value_chunk_writer->seal_current_page())) {
-                        return ret;
-                    }
-                }
-                seg_len = points_per_page;
-            }
-        }
-        if (total_rows > 0) {
-            device_schema->last_time_ = std::max(
-                device_schema->last_time_, tablet.timestamps_[total_rows - 1]);
+    // Snapshot page counters before the batch so we can detect any column
+    // that crossed a page boundary mid-tablet and seal the rest in lockstep.
+    int32_t time_pages_before = time_chunk_writer->num_of_pages();
+    std::vector<int32_t> value_pages_before(value_chunk_writers.size(), 0);
+    for (uint32_t c = 0; c < value_chunk_writers.size(); c++) {
+        ValueChunkWriter* value_chunk_writer = value_chunk_writers[c];
+        if (!IS_NULL(value_chunk_writer)) {
+            value_pages_before[c] = value_chunk_writer->num_of_pages();
         }
-        return ret;
     }
-
-    // General non-strict (may have varlen STRING/TEXT/BLOB columns):
-    // time auto-seals to provide aligned page boundaries; value writers
-    // skip auto page sealing and are sealed manually at time boundaries.
-    // Attention: since value-side auto-seal is disabled, if a varlen value
-    // page hits the memory threshold earlier, it may not seal immediately
-    // and instead will be sealed later at the recorded time-page boundaries
-    // (this may sacrifice the strict page size limit for performance).
-    time_chunk_writer->set_enable_page_seal_if_full(true);
+    // Suppress memory-driven page sealing on every column for the duration of
+    // the batch. The count-driven seals inside write_batch still fire at the
+    // same `page_writer_max_point_num_` boundary on every writer (time +
+    // values), which keeps aligned page boundaries in lock-step. Re-enable
+    // both before returning so subsequent record-by-record writes restore the
+    // normal memory-pressure behavior, and let the final
+    // maybe_seal_aligned_pages_together pick up any count-driven divergence
+    // (e.g. when a sealed value column ended a page that the time column did
+    // not).
+    time_chunk_writer->set_enable_page_seal_if_full(false);
     for (uint32_t c = 0; c < value_chunk_writers.size(); c++) {
-        if (!IS_NULL(value_chunk_writers[c])) {
-            value_chunk_writers[c]->set_enable_page_seal_if_full(false);
+        ValueChunkWriter* value_chunk_writer = value_chunk_writers[c];
+        if (!IS_NULL(value_chunk_writer)) {
+            value_chunk_writer->set_enable_page_seal_if_full(false);
         }
     }
-
-    std::vector<uint32_t> time_page_row_ends;
-    const uint32_t page_max_points = std::max<uint32_t>(
-        1, common::g_config_value_.page_writer_max_point_num_);
-    time_page_row_ends.reserve(total_rows / page_max_points + 1);
-
-    // Write time and record where a time page is sealed.
-    for (uint32_t row = 0; row < total_rows; row++) {
-        const int32_t pages_before = time_chunk_writer->num_of_pages();
-        if (RET_FAIL(time_chunk_writer->write(tablet.timestamps_[row]))) {
-            return ret;
-        }
-        const int32_t pages_after = time_chunk_writer->num_of_pages();
-        if (pages_after > pages_before) {
-            const uint32_t boundary_end = row + 1;
-            if (time_page_row_ends.empty() ||
-                time_page_row_ends.back() != boundary_end) {
-                time_page_row_ends.push_back(boundary_end);
+    auto restore_seal = [&]() {
+        time_chunk_writer->set_enable_page_seal_if_full(true);
+        for (uint32_t k = 0; k < value_chunk_writers.size(); k++) {
+            if (!IS_NULL(value_chunk_writers[k])) {
+                value_chunk_writers[k]->set_enable_page_seal_if_full(true);
             }
         }
+    };
+    // Any failure (out-of-order timestamps, OOM, etc.) must abort before we
+    // write a single value column — otherwise the time chunk would record
+    // fewer rows than each value chunk and the chunk-group would deserialize
+    // as misaligned data.
+    if (RET_FAIL(time_write_column_batch(time_chunk_writer, tablet, 0,
+                                         total_rows))) {
+        restore_seal();
+        return ret;
     }
-
-    // Write values column-by-column and seal at recorded boundaries.
     ASSERT(value_chunk_writers.size() == tablet.get_column_count());
-    for (uint32_t col = 0; col < value_chunk_writers.size(); col++) {
-        ValueChunkWriter* value_chunk_writer = value_chunk_writers[col];
+    for (uint32_t c = 0; c < value_chunk_writers.size(); c++) {
+        ValueChunkWriter* value_chunk_writer = value_chunk_writers[c];
         if (IS_NULL(value_chunk_writer)) {
             continue;
         }
-        uint32_t seg_start = 0;
-        for (uint32_t boundary_end : time_page_row_ends) {
-            if (boundary_end <= seg_start) {
-                continue;
-            }
-            if (RET_FAIL(value_write_column(value_chunk_writer, tablet, col,
-                                            seg_start, boundary_end))) {
-                return ret;
-            }
-            if (value_chunk_writer->has_current_page_data() &&
-                RET_FAIL(value_chunk_writer->seal_current_page())) {
-                return ret;
-            }
-            seg_start = boundary_end;
-        }
-        if (seg_start < total_rows) {
-            if (RET_FAIL(value_write_column(value_chunk_writer, tablet, col,
-                                            seg_start, total_rows))) {
-                return ret;
-            }
+        if (RET_FAIL(value_write_column_batch(value_chunk_writer, tablet, c, 0,
+                                              total_rows))) {
+            restore_seal();
+            // Time chunk has the full row count but at least one value
+            // column stopped early.  Mark the writer unrecoverable so no
+            // later flush/close seals the divergent state.
+            unrecoverable_ = true;
+            return ret;
         }
     }
-    if (total_rows > 0) {
-        device_schema->last_time_ = std::max(
-            device_schema->last_time_, tablet.timestamps_[total_rows - 1]);
+    restore_seal();
+    if (RET_FAIL(maybe_seal_aligned_pages_together(
+            time_chunk_writer, value_chunk_writers, time_pages_before,
+            value_pages_before))) {
+        unrecoverable_ = true;
+        return ret;
+    }
+    if (enforce_recovered_last_time_order_ && total_rows > 0 &&
+        tablet.timestamps_ != nullptr) {
+        auto schema_it = schemas_.find(device_id);
+        if (schema_it != schemas_.end() && schema_it->second != nullptr) {
+            schema_it->second->last_time_ =
+                std::max(schema_it->second->last_time_,
+                         tablet.timestamps_[total_rows - 1]);
+        }
     }
     return ret;
 }
 
 int TsFileWriter::write_tablet(const Tablet& tablet) {
+    if (UNLIKELY(unrecoverable_)) return E_DATA_INCONSISTENCY;
     int ret = E_OK;
     auto device_id =
         std::make_shared<StringArrayDeviceID>(tablet.insert_target_name_);
-    auto schema_it = schemas_.find(device_id);
-    if (schema_it == schemas_.end() || schema_it->second == nullptr) {
-        return E_DEVICE_NOT_EXIST;
-    }
-    MeasurementSchemaGroup* device_schema = schema_it->second;
+    // Use the actual filled row count — max_row_num_ is the buffer capacity
+    // and would let uninitialized timestamps/values past the live range leak
+    // into the chunk.
     const uint32_t total_rows = tablet.get_cur_row_size();
     if (enforce_recovered_last_time_order_ && total_rows > 0 &&
-        tablet.timestamps_[0] <= device_schema->last_time_) {
-        return E_OUT_OF_ORDER;
+        tablet.timestamps_ != nullptr) {
+        auto schema_it = schemas_.find(device_id);
+        if (schema_it != schemas_.end() && schema_it->second != nullptr &&
+            tablet.timestamps_[0] <= schema_it->second->last_time_) {
+            return E_OUT_OF_ORDER;
+        }
     }
     SimpleVector<ChunkWriter*> chunk_writers;
     SimpleVector<common::TSDataType> data_types;
@@ -1155,22 +1075,44 @@ int TsFileWriter::write_tablet(const Tablet& tablet) {
                                  data_types))) {
         return ret;
     }
+    ASSERT(data_types.size() == tablet.get_column_count());
+    for (uint32_t c = 0; c < data_types.size(); c++) {
+        if (data_types[c] == common::NULL_TYPE) {
+            continue;
+        }
+        if (data_types[c] != tablet.schema_vec_->at(c).data_type_) {
+            return E_TYPE_NOT_MATCH;
+        }
+    }
     ASSERT(chunk_writers.size() == tablet.get_column_count());
+    uint32_t columns_written = 0;
     for (uint32_t c = 0; c < chunk_writers.size(); c++) {
         ChunkWriter* chunk_writer = chunk_writers[c];
         if (IS_NULL(chunk_writer)) {
             continue;
         }
-        if (RET_FAIL(write_column(chunk_writer, tablet, c))) {
+        if (RET_FAIL(
+                write_column_batch(chunk_writer, tablet, c, 0, total_rows))) {
+            // Earlier columns already advanced their chunk writers; this
+            // column failed mid-write, so per-column row counts diverge.
+            // Mark unrecoverable so flush/close refuse to seal the
+            // misaligned tree chunk group.
+            if (columns_written > 0) unrecoverable_ = true;
             return ret;
         }
+        columns_written++;
     }
 
-    if (total_rows > 0) {
-        device_schema->last_time_ = std::max(
-            device_schema->last_time_, tablet.timestamps_[total_rows - 1]);
+    if (enforce_recovered_last_time_order_ && total_rows > 0 &&
+        tablet.timestamps_ != nullptr) {
+        auto schema_it = schemas_.find(device_id);
+        if (schema_it != schemas_.end() && schema_it->second != nullptr) {
+            schema_it->second->last_time_ =
+                std::max(schema_it->second->last_time_,
+                         tablet.timestamps_[total_rows - 1]);
+        }
     }
-    record_count_since_last_flush_ += tablet.max_row_num_;
+    record_count_since_last_flush_ += total_rows;
     ret = check_memory_size_and_may_flush_chunks();
     return ret;
 }
@@ -1201,6 +1143,7 @@ int TsFileWriter::write_tree(const TsRecord& record) {
 }
 
 int TsFileWriter::write_table(Tablet& tablet) {
+    if (UNLIKELY(unrecoverable_)) return E_DATA_INCONSISTENCY;
     int ret = E_OK;
     if (io_writer_->get_schema()->table_schema_map_.find(
             tablet.insert_target_name_) ==
@@ -1213,175 +1156,332 @@ int TsFileWriter::write_table(Tablet& tablet) {
     }
 
     auto device_id_end_index_pairs = split_tablet_by_device(tablet);
-    int start_idx = 0;
-    for (auto& device_id_end_index_pair : device_id_end_index_pairs) {
-        auto device_id = device_id_end_index_pair.first;
-        int end_idx = device_id_end_index_pair.second;
-        if (end_idx == 0) continue;
-
-        SimpleVector<ValueChunkWriter*> value_chunk_writers;
-        TimeChunkWriter* time_chunk_writer = nullptr;
-        if (RET_FAIL(do_check_schema_table(device_id, tablet, time_chunk_writer,
-                                           value_chunk_writers))) {
-            return ret;
-        }
-        auto schema_it = schemas_.find(device_id);
-        MeasurementSchemaGroup* device_schema =
-            (schema_it == schemas_.end()) ? nullptr : schema_it->second;
 
-        std::vector<uint32_t> field_columns;
-        field_columns.reserve(tablet.get_column_count());
-        for (uint32_t col = 0; col < tablet.get_column_count(); ++col) {
-            if (tablet.column_categories_[col] ==
-                common::ColumnCategory::FIELD) {
-                field_columns.push_back(col);
-            }
-        }
-        ASSERT(field_columns.size() == value_chunk_writers.size());
-
-        // Precompute page boundaries from point counts — no serial write
-        // needed.  The first segment may be shorter if the time page already
-        // holds data from a previous write_table call.
-        const uint32_t page_max_points = std::max<uint32_t>(
-            1, common::g_config_value_.page_writer_max_point_num_);
-        const uint32_t si = static_cast<uint32_t>(start_idx);
-        const uint32_t ei = static_cast<uint32_t>(end_idx);
-        if (enforce_recovered_last_time_order_ && device_schema != nullptr &&
-            si < ei && tablet.timestamps_[si] <= device_schema->last_time_) {
-            return E_OUT_OF_ORDER;
-        }
+    if (table_aligned_) {
+        struct ValueTask {
+            ValueChunkWriter* vcw;
+            uint32_t col_idx;
+        };
+        struct SegmentRange {
+            uint32_t si;
+            uint32_t ei;
+        };
+        struct DeviceWriteCtx {
+            TimeChunkWriter* tcw;
+            std::vector<ValueTask> value_tasks;
+            std::vector<SegmentRange> segments;
+            uint32_t initial_page_points;
+        };
 
-        // If the current unsealed page is already at or past capacity (from
-        // a previous write_table call), seal it before starting new segments.
-        uint32_t time_cur_points = time_chunk_writer->get_point_numer();
-        if (time_cur_points >= page_max_points) {
-            if (time_chunk_writer->has_current_page_data()) {
-                if (RET_FAIL(time_chunk_writer->seal_current_page())) {
-                    return ret;
+        const uint32_t page_max_points =
+            std::max<uint32_t>(1, g_config_value_.page_writer_max_point_num_);
+
+        std::vector<DeviceWriteCtx> device_ctxs;
+        std::map<std::shared_ptr<IDeviceID>, size_t, IDeviceIDComparator>
+            device_ctx_index;
+        int start_idx = 0;
+        for (auto& pair : device_id_end_index_pairs) {
+            auto device_id = pair.first;
+            int end_idx = pair.second;
+            if (end_idx == 0) continue;
+
+            const uint32_t si = static_cast<uint32_t>(start_idx);
+            const uint32_t ei = static_cast<uint32_t>(end_idx);
+            // Recovery: refuse any segment whose first timestamp would land
+            // at or before a flushed chunk's end_time for this device. This
+            // mirrors the per-record / per-tablet check on the tree path.
+            if (enforce_recovered_last_time_order_ && tablet.timestamps_ &&
+                ei > si) {
+                auto schema_it = schemas_.find(device_id);
+                if (schema_it != schemas_.end() &&
+                    schema_it->second != nullptr &&
+                    tablet.timestamps_[si] <= schema_it->second->last_time_) {
+                    return E_OUT_OF_ORDER;
                 }
             }
-            for (uint32_t k = 0; k < value_chunk_writers.size(); k++) {
-                if (!IS_NULL(value_chunk_writers[k]) &&
-                    value_chunk_writers[k]->has_current_page_data()) {
-                    if (RET_FAIL(value_chunk_writers[k]->seal_current_page())) {
-                        return ret;
+            auto idx_it = device_ctx_index.find(device_id);
+            if (idx_it == device_ctx_index.end()) {
+                SimpleVector<ValueChunkWriter*> value_chunk_writers;
+                TimeChunkWriter* time_chunk_writer = nullptr;
+                if (RET_FAIL(do_check_schema_table(device_id, tablet,
+                                                   time_chunk_writer,
+                                                   value_chunk_writers))) {
+                    return ret;
+                }
+
+                uint32_t time_cur_points = time_chunk_writer->get_point_numer();
+                if (time_cur_points >= page_max_points) {
+                    // Seal the time page first, then every value page in
+                    // lockstep.  Any failure leaves columns at different
+                    // page boundaries and the chunk group can no longer be
+                    // sealed coherently — mark the writer unrecoverable.
+                    if (time_chunk_writer->has_current_page_data()) {
+                        if (RET_FAIL(time_chunk_writer->seal_current_page())) {
+                            unrecoverable_ = true;
+                            return ret;
+                        }
+                    }
+                    for (uint32_t k = 0; k < value_chunk_writers.size(); k++) {
+                        if (!IS_NULL(value_chunk_writers[k]) &&
+                            value_chunk_writers[k]->has_current_page_data()) {
+                            if (RET_FAIL(value_chunk_writers[k]
+                                             ->seal_current_page())) {
+                                unrecoverable_ = true;
+                                return ret;
+                            }
+                        }
                     }
+                    time_cur_points = 0;
                 }
-            }
-            time_cur_points = 0;
-        }
-        const uint32_t first_seg_cap =
-            (time_cur_points > 0 && time_cur_points < page_max_points)
-                ? (page_max_points - time_cur_points)
-                : page_max_points;
 
-        std::vector<uint32_t> page_boundaries;  // row indices where a page
-                                                // should seal
-        {
-            uint32_t pos = si;
-            uint32_t seg_cap = first_seg_cap;
-            while (pos < ei) {
-                uint32_t seg_end = std::min(pos + seg_cap, ei);
-                if (seg_end < ei) {
-                    page_boundaries.push_back(seg_end);
+                DeviceWriteCtx ctx;
+                ctx.tcw = time_chunk_writer;
+                ctx.initial_page_points = time_cur_points;
+                uint32_t field_col_count = 0;
+                for (uint32_t i = 0; i < tablet.get_column_count(); ++i) {
+                    if (tablet.column_categories_[i] ==
+                        common::ColumnCategory::FIELD) {
+                        ValueChunkWriter* vcw =
+                            value_chunk_writers[field_col_count];
+                        if (!IS_NULL(vcw)) {
+                            ctx.value_tasks.push_back({vcw, i});
+                        }
+                        field_col_count++;
+                    }
                 }
-                pos = seg_end;
-                seg_cap = page_max_points;
+                device_ctxs.push_back(std::move(ctx));
+                idx_it = device_ctx_index
+                             .insert(std::make_pair(device_id,
+                                                    device_ctxs.size() - 1))
+                             .first;
             }
+
+            device_ctxs[idx_it->second].segments.push_back({si, ei});
+            start_idx = end_idx;
         }
 
-        // We control page sealing explicitly at precomputed boundaries, so
-        // auto-seal must be disabled during segmented writes — otherwise a
-        // segment of exactly page_max_points would trigger auto-seal AND
-        // our explicit seal, double-sealing (sealing an empty page → crash).
-        // Note: with auto-seal off, the memory-based threshold
-        // (page_writer_max_memory_bytes_) is not enforced within a segment.
-        // For varlen columns (STRING/TEXT/BLOB), individual pages may exceed
-        // the memory limit.  Each segment is still bounded by
-        // page_max_points rows, keeping pages within a reasonable size.
-        auto write_time_in_segments = [this, &tablet, &page_boundaries, si,
-                                       ei](TimeChunkWriter* tcw) -> int {
+        auto write_time_segments =
+            [this, &tablet, page_max_points](
+                TimeChunkWriter* tcw, const std::vector<SegmentRange>& segments,
+                uint32_t initial_page_points) -> int {
             int r = E_OK;
             tcw->set_enable_page_seal_if_full(false);
-            uint32_t seg_start = si;
-            for (uint32_t boundary : page_boundaries) {
-                if ((r = time_write_column(tcw, tablet, seg_start, boundary)) !=
-                    E_OK)
-                    return r;
-                if ((r = tcw->seal_current_page()) != E_OK) return r;
-                seg_start = boundary;
-            }
-            if (seg_start < ei) {
-                r = time_write_column(tcw, tablet, seg_start, ei);
+            uint32_t page_remaining =
+                (initial_page_points > 0 &&
+                 initial_page_points < page_max_points)
+                    ? (page_max_points - initial_page_points)
+                    : page_max_points;
+            for (const auto& segment : segments) {
+                uint32_t seg_pos = segment.si;
+                while (seg_pos < segment.ei) {
+                    uint32_t batch =
+                        std::min(page_remaining, segment.ei - seg_pos);
+                    if ((r = time_write_column_batch(
+                             tcw, tablet, seg_pos, seg_pos + batch)) != E_OK) {
+                        tcw->set_enable_page_seal_if_full(true);
+                        return r;
+                    }
+                    seg_pos += batch;
+                    page_remaining -= batch;
+                    if (page_remaining == 0) {
+                        if ((r = tcw->seal_current_page()) != E_OK) {
+                            tcw->set_enable_page_seal_if_full(true);
+                            return r;
+                        }
+                        page_remaining = page_max_points;
+                    }
+                }
             }
             tcw->set_enable_page_seal_if_full(true);
             return r;
         };
 
-        auto write_value_in_segments = [this, &tablet, &page_boundaries, si,
-                                        ei](ValueChunkWriter* vcw,
-                                            uint32_t col_idx) -> int {
+        auto write_value_segments =
+            [this, &tablet, page_max_points](
+                ValueChunkWriter* vcw, uint32_t col_idx,
+                const std::vector<SegmentRange>& segments,
+                uint32_t initial_page_points) -> int {
             int r = E_OK;
             vcw->set_enable_page_seal_if_full(false);
-            uint32_t seg_start = si;
-            for (uint32_t boundary : page_boundaries) {
-                if ((r = value_write_column(vcw, tablet, col_idx, seg_start,
-                                            boundary)) != E_OK)
-                    return r;
-                if (vcw->has_current_page_data() &&
-                    (r = vcw->seal_current_page()) != E_OK)
-                    return r;
-                seg_start = boundary;
-            }
-            if (seg_start < ei) {
-                r = value_write_column(vcw, tablet, col_idx, seg_start, ei);
+            uint32_t page_remaining =
+                (initial_page_points > 0 &&
+                 initial_page_points < page_max_points)
+                    ? (page_max_points - initial_page_points)
+                    : page_max_points;
+            for (const auto& segment : segments) {
+                uint32_t seg_pos = segment.si;
+                while (seg_pos < segment.ei) {
+                    uint32_t batch =
+                        std::min(page_remaining, segment.ei - seg_pos);
+                    if ((r = value_write_column_batch(
+                             vcw, tablet, col_idx, seg_pos, seg_pos + batch)) !=
+                        E_OK) {
+                        vcw->set_enable_page_seal_if_full(true);
+                        return r;
+                    }
+                    seg_pos += batch;
+                    page_remaining -= batch;
+                    if (page_remaining == 0) {
+                        if (vcw->has_current_page_data() &&
+                            (r = vcw->seal_current_page()) != E_OK) {
+                            vcw->set_enable_page_seal_if_full(true);
+                            return r;
+                        }
+                        page_remaining = page_max_points;
+                    }
+                }
             }
             vcw->set_enable_page_seal_if_full(true);
             return r;
         };
 
-        // All columns (time + values) write the same row segments and seal
-        // at the same boundaries — fully parallel.
 #ifdef ENABLE_THREADS
-        if (g_config_value_.parallel_write_enabled_) {
+        if (g_config_value_.parallel_write_enabled_ &&
+            common::g_thread_pool_ != nullptr) {
             std::vector<std::future<int>> futures;
-            futures.push_back(g_write_thread_pool_->submit(
-                [&write_time_in_segments, time_chunk_writer]() {
-                    return write_time_in_segments(time_chunk_writer);
-                }));
-            for (uint32_t k = 0; k < value_chunk_writers.size(); k++) {
-                ValueChunkWriter* vcw = value_chunk_writers[k];
-                if (IS_NULL(vcw)) continue;
-                uint32_t col_idx = field_columns[k];
-                futures.push_back(g_write_thread_pool_->submit(
-                    [&write_value_in_segments, vcw, col_idx]() {
-                        return write_value_in_segments(vcw, col_idx);
+            for (auto& ctx : device_ctxs) {
+                futures.push_back(common::g_thread_pool_->submit(
+                    [&write_time_segments, &ctx]() {
+                        return write_time_segments(ctx.tcw, ctx.segments,
+                                                   ctx.initial_page_points);
                     }));
+                for (auto& vt : ctx.value_tasks) {
+                    futures.push_back(common::g_thread_pool_->submit(
+                        [&write_value_segments, &vt, &ctx]() {
+                            return write_value_segments(
+                                vt.vcw, vt.col_idx, ctx.segments,
+                                ctx.initial_page_points);
+                        }));
+                }
             }
             for (auto& f : futures) {
                 int r = f.get();
                 if (r != E_OK && ret == E_OK) ret = r;
             }
-            if (ret != E_OK) return ret;
+            if (ret != E_OK) {
+                // One task aborted mid-batch while others may have written
+                // all of their rows; the per-column row counts no longer
+                // line up.  Mark the writer unrecoverable so flush/close
+                // can't seal a corrupt aligned chunk group.
+                unrecoverable_ = true;
+                return ret;
+            }
         } else
 #endif
         {
-            if (RET_FAIL(write_time_in_segments(time_chunk_writer))) {
+            for (auto& ctx : device_ctxs) {
+                if (RET_FAIL(write_time_segments(ctx.tcw, ctx.segments,
+                                                 ctx.initial_page_points))) {
+                    // Time wrote partial rows before failing; value columns
+                    // still hold the prior count.  Same column-alignment
+                    // hazard as the parallel path.
+                    unrecoverable_ = true;
+                    return ret;
+                }
+                for (auto& vt : ctx.value_tasks) {
+                    if (RET_FAIL(write_value_segments(
+                            vt.vcw, vt.col_idx, ctx.segments,
+                            ctx.initial_page_points))) {
+                        unrecoverable_ = true;
+                        return ret;
+                    }
+                }
+            }
+        }
+    } else {
+        int start_idx = 0;
+        for (auto& device_id_end_index_pair : device_id_end_index_pairs) {
+            auto device_id = device_id_end_index_pair.first;
+            int end_idx = device_id_end_index_pair.second;
+            if (end_idx == 0) continue;
+
+            const uint32_t si = static_cast<uint32_t>(start_idx);
+            if (enforce_recovered_last_time_order_ && tablet.timestamps_ &&
+                end_idx > start_idx) {
+                auto schema_it = schemas_.find(device_id);
+                if (schema_it != schemas_.end() &&
+                    schema_it->second != nullptr &&
+                    tablet.timestamps_[si] <= schema_it->second->last_time_) {
+                    return E_OUT_OF_ORDER;
+                }
+            }
+            MeasurementNamesFromTablet mnames_getter(tablet);
+            SimpleVector<ChunkWriter*> chunk_writers;
+            SimpleVector<common::TSDataType> data_types;
+            if (RET_FAIL(do_check_schema(device_id, mnames_getter,
+                                         chunk_writers, data_types))) {
                 return ret;
             }
-            for (uint32_t k = 0; k < value_chunk_writers.size(); k++) {
-                ValueChunkWriter* vcw = value_chunk_writers[k];
-                if (IS_NULL(vcw)) continue;
-                if (RET_FAIL(write_value_in_segments(vcw, field_columns[k]))) {
+            ASSERT(chunk_writers.size() == tablet.get_column_count());
+
+#ifdef ENABLE_THREADS
+            if (chunk_writers.size() >= 2 &&
+                g_config_value_.parallel_write_enabled_ &&
+                common::g_thread_pool_ != nullptr) {
+                const uint32_t si = start_idx;
+                const uint32_t ei = device_id_end_index_pair.second;
+                std::vector<std::future<int>> futures;
+                for (uint32_t c = 0; c < chunk_writers.size(); c++) {
+                    ChunkWriter* cw = chunk_writers[c];
+                    if (IS_NULL(cw)) continue;
+                    futures.push_back(common::g_thread_pool_->submit(
+                        [this, cw, &tablet, c, si, ei]() {
+                            return write_column_batch(cw, tablet, c, si, ei);
+                        }));
+                }
+                for (auto& f : futures) {
+                    int r = f.get();
+                    if (r != E_OK && ret == E_OK) ret = r;
+                }
+                if (ret != E_OK) {
+                    // One column aborted partway while sibling columns
+                    // may have written all of their rows.  The per-column
+                    // chunk writers now disagree on row count, so subsequent
+                    // flush/close would seal a corrupt non-aligned chunk
+                    // group.  Same hazard as the aligned parallel path —
+                    // mark the writer unrecoverable so future ops refuse.
+                    unrecoverable_ = true;
                     return ret;
                 }
+            } else
+#endif
+            {
+                for (uint32_t c = 0; c < chunk_writers.size(); c++) {
+                    ChunkWriter* chunk_writer = chunk_writers[c];
+                    if (IS_NULL(chunk_writer)) continue;
+                    if (RET_FAIL(write_column_batch(
+                            chunk_writer, tablet, c, start_idx,
+                            device_id_end_index_pair.second))) {
+                        // Sequential path: earlier columns already wrote
+                        // their batch, this column failed → divergent row
+                        // counts.  Same unrecoverable contract.
+                        if (c > 0) unrecoverable_ = true;
+                        return ret;
+                    }
+                }
             }
+            start_idx = device_id_end_index_pair.second;
         }
-        if (device_schema != nullptr && si < ei) {
-            device_schema->last_time_ =
-                std::max(device_schema->last_time_, tablet.timestamps_[ei - 1]);
+    }
+    // After all device segments wrote successfully, advance recovery's
+    // per-device last_time_ floor to the highest timestamp this tablet
+    // contributed for each device.
+    if (enforce_recovered_last_time_order_ && tablet.timestamps_) {
+        int update_start = 0;
+        for (auto& pair : device_id_end_index_pairs) {
+            int end_idx = pair.second;
+            if (end_idx == 0) continue;
+            if (end_idx > update_start) {
+                auto schema_it = schemas_.find(pair.first);
+                if (schema_it != schemas_.end() &&
+                    schema_it->second != nullptr) {
+                    schema_it->second->last_time_ =
+                        std::max(schema_it->second->last_time_,
+                                 tablet.timestamps_[end_idx - 1]);
+                }
+            }
+            update_start = end_idx;
         }
-        start_idx = end_idx;
     }
     record_count_since_last_flush_ += tablet.cur_row_size_;
     // Reset string column buffers so the tablet can be reused for the next
@@ -1395,14 +1495,13 @@ std::vector<std::pair<std::shared_ptr<IDeviceID>, int>>
 TsFileWriter::split_tablet_by_device(const Tablet& tablet) {
     std::vector<std::pair<std::shared_ptr<IDeviceID>, int>> result;
 
-    if (tablet.id_column_indexes_.empty()) {
+    if (tablet.id_column_indexes_.empty() || tablet.single_device_) {
+        // No tag columns or caller guarantees single device — skip boundary
+        // detection entirely.
         auto sentinel = std::make_shared<StringArrayDeviceID>("last_device_id");
         result.emplace_back(std::move(sentinel), 0);
-        std::vector<std::string*> id_array;
-        id_array.push_back(new std::string(tablet.insert_target_name_));
-        auto res = std::make_shared<StringArrayDeviceID>(id_array);
-        delete id_array[0];
-        result.emplace_back(std::move(res), tablet.get_cur_row_size());
+        std::shared_ptr<IDeviceID> dev_id(tablet.get_device_id(0));
+        result.emplace_back(std::move(dev_id), tablet.get_cur_row_size());
         return result;
     }
 
@@ -1428,41 +1527,49 @@ TsFileWriter::split_tablet_by_device(const Tablet& tablet) {
 int TsFileWriter::write_column(ChunkWriter* chunk_writer, const Tablet& tablet,
                                int col_idx, uint32_t start_idx,
                                uint32_t end_idx) {
-    int ret = E_OK;
-
     common::TSDataType data_type = tablet.schema_vec_->at(col_idx).data_type_;
     int64_t* timestamps = tablet.timestamps_;
     Tablet::ValueMatrixEntry col_values = tablet.value_matrix_[col_idx];
     BitMap& col_notnull_bitmap = tablet.bitmaps_[col_idx];
     end_idx = std::min(end_idx, tablet.max_row_num_);
 
-    if (data_type == common::BOOLEAN) {
-        ret = write_typed_column(chunk_writer, timestamps, col_values.bool_data,
-                                 col_notnull_bitmap, start_idx, end_idx);
-    } else if (data_type == common::INT32) {
-        ret =
-            write_typed_column(chunk_writer, timestamps, col_values.int32_data,
-                               col_notnull_bitmap, start_idx, end_idx);
-    } else if (data_type == common::INT64) {
-        ret =
-            write_typed_column(chunk_writer, timestamps, col_values.int64_data,
-                               col_notnull_bitmap, start_idx, end_idx);
-    } else if (data_type == common::FLOAT) {
-        ret =
-            write_typed_column(chunk_writer, timestamps, col_values.float_data,
-                               col_notnull_bitmap, start_idx, end_idx);
-    } else if (data_type == common::DOUBLE) {
-        ret =
-            write_typed_column(chunk_writer, timestamps, col_values.double_data,
-                               col_notnull_bitmap, start_idx, end_idx);
-    } else if (data_type == common::STRING) {
-        ret =
-            write_typed_column(chunk_writer, timestamps, col_values.string_col,
-                               col_notnull_bitmap, start_idx, end_idx);
-    } else {
-        ASSERT(false);
+    // Cover every storage type (DATE->int32, TIMESTAMP->int64, TEXT/BLOB->
+    // string).  This is the null fallback for the non-aligned batch path, so a
+    // column of any type that contains a null lands here; the old if/else only
+    // handled 6 types and ASSERT(false)'d (silently no-op in NDEBUG) on
+    // DATE/TIMESTAMP/TEXT/BLOB, dropping those rows.
+    switch (data_type) {
+        case common::BOOLEAN:
+            return write_typed_column(chunk_writer, timestamps,
+                                      col_values.bool_data, col_notnull_bitmap,
+                                      start_idx, end_idx);
+        case common::INT32:
+        case common::DATE:
+            return write_typed_column(chunk_writer, timestamps,
+                                      col_values.int32_data, col_notnull_bitmap,
+                                      start_idx, end_idx);
+        case common::INT64:
+        case common::TIMESTAMP:
+            return write_typed_column(chunk_writer, timestamps,
+                                      col_values.int64_data, col_notnull_bitmap,
+                                      start_idx, end_idx);
+        case common::FLOAT:
+            return write_typed_column(chunk_writer, timestamps,
+                                      col_values.float_data, col_notnull_bitmap,
+                                      start_idx, end_idx);
+        case common::DOUBLE:
+            return write_typed_column(chunk_writer, timestamps,
+                                      col_values.double_data,
+                                      col_notnull_bitmap, start_idx, end_idx);
+        case common::STRING:
+        case common::TEXT:
+        case common::BLOB:
+            return write_typed_column(chunk_writer, timestamps,
+                                      col_values.string_col, col_notnull_bitmap,
+                                      start_idx, end_idx);
+        default:
+            return E_NOT_SUPPORT;
     }
-    return ret;
 }
 
 int TsFileWriter::time_write_column(TimeChunkWriter* time_chunk_writer,
@@ -1481,124 +1588,25 @@ int TsFileWriter::time_write_column(TimeChunkWriter* time_chunk_writer,
     return ret;
 }
 
-int TsFileWriter::value_write_column(ValueChunkWriter* value_chunk_writer,
-                                     const Tablet& tablet, int col_idx,
+// Non-aligned numeric column: a null row contributes no point, so null rows
+// are skipped.  Covers bool/int32/int64/float/double; instantiated only from
+// write_column in this translation unit.
+template <typename T>
+int TsFileWriter::write_typed_column(ChunkWriter* chunk_writer,
+                                     int64_t* timestamps, T* col_values,
+                                     BitMap& col_notnull_bitmap,
                                      uint32_t start_idx, uint32_t end_idx) {
     int ret = E_OK;
-
-    TSDataType data_type = tablet.schema_vec_->at(col_idx).data_type_;
-    int64_t* timestamps = tablet.timestamps_;
-    Tablet::ValueMatrixEntry col_values = tablet.value_matrix_[col_idx];
-    BitMap& col_notnull_bitmap = tablet.bitmaps_[col_idx];
-    switch (data_type) {
-        case common::BOOLEAN:
-            ret = write_typed_column(value_chunk_writer, timestamps,
-                                     (bool*)col_values.bool_data,
-                                     col_notnull_bitmap, start_idx, end_idx);
-            break;
-        case common::DATE:
-        case common::INT32:
-            ret = write_typed_column(value_chunk_writer, timestamps,
-                                     (int32_t*)col_values.int32_data,
-                                     col_notnull_bitmap, start_idx, end_idx);
-            break;
-        case common::TIMESTAMP:
-        case common::INT64:
-            ret = write_typed_column(value_chunk_writer, timestamps,
-                                     (int64_t*)col_values.int64_data,
-                                     col_notnull_bitmap, start_idx, end_idx);
-            break;
-        case common::FLOAT:
-            ret = write_typed_column(value_chunk_writer, timestamps,
-                                     (float*)col_values.float_data,
-                                     col_notnull_bitmap, start_idx, end_idx);
-            break;
-        case common::DOUBLE:
-            ret = write_typed_column(value_chunk_writer, timestamps,
-                                     (double*)col_values.double_data,
-                                     col_notnull_bitmap, start_idx, end_idx);
-            break;
-        case common::STRING:
-        case common::TEXT:
-        case common::BLOB:
-            ret = write_typed_column(value_chunk_writer, timestamps,
-                                     col_values.string_col, col_notnull_bitmap,
-                                     start_idx, end_idx);
-            break;
-        default:
-            ret = E_NOT_SUPPORT;
+    for (uint32_t r = start_idx; r < end_idx; r++) {
+        if (LIKELY(!col_notnull_bitmap.test(r))) {
+            if (RET_FAIL(chunk_writer->write(timestamps[r], col_values[r]))) {
+                return ret;
+            }
+        }
     }
     return ret;
 }
 
-#define DO_WRITE_TYPED_COLUMN()                                               \
-    do {                                                                      \
-        int ret = E_OK;                                                       \
-        for (uint32_t r = start_idx; r < end_idx; r++) {                      \
-            if (LIKELY(!col_notnull_bitmap.test(r))) {                        \
-                if (RET_FAIL(                                                 \
-                        chunk_writer->write(timestamps[r], col_values[r]))) { \
-                    return ret;                                               \
-                }                                                             \
-            }                                                                 \
-        }                                                                     \
-        return ret;                                                           \
-    } while (false)
-
-#define DO_VALUE_WRITE_TYPED_COLUMN()                            \
-    do {                                                         \
-        int ret = E_OK;                                          \
-        for (uint32_t r = start_idx; r < end_idx; r++) {         \
-            if (LIKELY(col_notnull_bitmap.test(r))) {            \
-                if (RET_FAIL(value_chunk_writer->write(          \
-                        timestamps[r], col_values[r], true))) {  \
-                    return ret;                                  \
-                }                                                \
-            } else {                                             \
-                if (RET_FAIL(value_chunk_writer->write(          \
-                        timestamps[r], col_values[r], false))) { \
-                    return ret;                                  \
-                }                                                \
-            }                                                    \
-        }                                                        \
-        return ret;                                              \
-    } while (false)
-
-int TsFileWriter::write_typed_column(ChunkWriter* chunk_writer,
-                                     int64_t* timestamps, bool* col_values,
-                                     BitMap& col_notnull_bitmap,
-                                     uint32_t start_idx, uint32_t end_idx) {
-    DO_WRITE_TYPED_COLUMN();
-}
-
-int TsFileWriter::write_typed_column(ChunkWriter* chunk_writer,
-                                     int64_t* timestamps, int32_t* col_values,
-                                     BitMap& col_notnull_bitmap,
-                                     uint32_t start_idx, uint32_t end_idx) {
-    DO_WRITE_TYPED_COLUMN();
-}
-
-int TsFileWriter::write_typed_column(ChunkWriter* chunk_writer,
-                                     int64_t* timestamps, int64_t* col_values,
-                                     BitMap& col_notnull_bitmap,
-                                     uint32_t start_idx, uint32_t end_idx) {
-    DO_WRITE_TYPED_COLUMN();
-}
-
-int TsFileWriter::write_typed_column(ChunkWriter* chunk_writer,
-                                     int64_t* timestamps, float* col_values,
-                                     BitMap& col_notnull_bitmap,
-                                     uint32_t start_idx, uint32_t end_idx) {
-    DO_WRITE_TYPED_COLUMN();
-}
-
-int TsFileWriter::write_typed_column(ChunkWriter* chunk_writer,
-                                     int64_t* timestamps, double* col_values,
-                                     BitMap& col_notnull_bitmap,
-                                     uint32_t start_idx, uint32_t end_idx) {
-    DO_WRITE_TYPED_COLUMN();
-}
-
 int TsFileWriter::write_typed_column(ChunkWriter* chunk_writer,
                                      int64_t* timestamps,
                                      Tablet::StringColumn* string_col,
@@ -1609,8 +1617,7 @@ int TsFileWriter::write_typed_column(ChunkWriter* chunk_writer,
         if (LIKELY(!col_notnull_bitmap.test(r))) {
             common::String val(
                 string_col->buffer + string_col->offsets[r],
-                static_cast<uint32_t>(string_col->offsets[r + 1] -
-                                      string_col->offsets[r]));
+                string_col->offsets[r + 1] - string_col->offsets[r]);
             if (RET_FAIL(chunk_writer->write(timestamps[r], val))) {
                 return ret;
             }
@@ -1619,67 +1626,161 @@ int TsFileWriter::write_typed_column(ChunkWriter* chunk_writer,
     return ret;
 }
 
-int TsFileWriter::write_typed_column(ValueChunkWriter* value_chunk_writer,
-                                     int64_t* timestamps, bool* col_values,
-                                     BitMap& col_notnull_bitmap,
-                                     uint32_t start_idx, uint32_t end_idx) {
-    DO_VALUE_WRITE_TYPED_COLUMN();
-}
-
-int TsFileWriter::write_typed_column(ValueChunkWriter* value_chunk_writer,
-                                     int64_t* timestamps, int32_t* col_values,
-                                     BitMap& col_notnull_bitmap,
-                                     uint32_t start_idx, uint32_t end_idx) {
-    DO_VALUE_WRITE_TYPED_COLUMN();
-}
-
-int TsFileWriter::write_typed_column(ValueChunkWriter* value_chunk_writer,
-                                     int64_t* timestamps, int64_t* col_values,
-                                     BitMap& col_notnull_bitmap,
-                                     uint32_t start_idx, uint32_t end_idx) {
-    DO_VALUE_WRITE_TYPED_COLUMN();
+int TsFileWriter::time_write_column_batch(TimeChunkWriter* time_chunk_writer,
+                                          const Tablet& tablet,
+                                          uint32_t start_idx,
+                                          uint32_t end_idx) {
+    int64_t* timestamps = tablet.timestamps_;
+    int ret = E_OK;
+    if (IS_NULL(time_chunk_writer) || IS_NULL(timestamps)) {
+        return E_INVALID_ARG;
+    }
+    end_idx = std::min(end_idx, tablet.max_row_num_);
+    uint32_t count = end_idx - start_idx;
+    if (count == 0) return ret;
+    return time_chunk_writer->write_batch(timestamps + start_idx, count);
 }
 
-int TsFileWriter::write_typed_column(ValueChunkWriter* value_chunk_writer,
-                                     int64_t* timestamps, float* col_values,
-                                     BitMap& col_notnull_bitmap,
+int TsFileWriter::write_column_batch(ChunkWriter* chunk_writer,
+                                     const Tablet& tablet, int col_idx,
                                      uint32_t start_idx, uint32_t end_idx) {
-    DO_VALUE_WRITE_TYPED_COLUMN();
-}
+    int ret = E_OK;
+    common::TSDataType data_type = tablet.schema_vec_->at(col_idx).data_type_;
+    int64_t* timestamps = tablet.timestamps_;
+    Tablet::ValueMatrixEntry col_values = tablet.value_matrix_[col_idx];
+    BitMap& col_notnull_bitmap = tablet.bitmaps_[col_idx];
+    end_idx = std::min(end_idx, tablet.max_row_num_);
+    uint32_t count = end_idx - start_idx;
+    if (count == 0) return ret;
+
+    bool has_null = false;
+    if (col_notnull_bitmap.may_have_set_bits()) {
+        for (uint32_t r = start_idx; r < end_idx; r++) {
+            if (col_notnull_bitmap.test(r)) {
+                has_null = true;
+                break;
+            }
+        }
+    }
 
-int TsFileWriter::write_typed_column(ValueChunkWriter* value_chunk_writer,
-                                     int64_t* timestamps, double* col_values,
-                                     BitMap& col_notnull_bitmap,
-                                     uint32_t start_idx, uint32_t end_idx) {
-    DO_VALUE_WRITE_TYPED_COLUMN();
+    if (!has_null) {
+        switch (data_type) {
+            case common::BOOLEAN:
+                ret = chunk_writer->write_batch(
+                    timestamps + start_idx, col_values.bool_data + start_idx,
+                    count);
+                break;
+            case common::INT32:
+            case common::DATE:
+                ret = chunk_writer->write_batch(
+                    timestamps + start_idx, col_values.int32_data + start_idx,
+                    count);
+                break;
+            case common::INT64:
+            case common::TIMESTAMP:
+                ret = chunk_writer->write_batch(
+                    timestamps + start_idx, col_values.int64_data + start_idx,
+                    count);
+                break;
+            case common::FLOAT:
+                ret = chunk_writer->write_batch(
+                    timestamps + start_idx, col_values.float_data + start_idx,
+                    count);
+                break;
+            case common::DOUBLE:
+                ret = chunk_writer->write_batch(
+                    timestamps + start_idx, col_values.double_data + start_idx,
+                    count);
+                break;
+            case common::STRING:
+            case common::TEXT:
+            case common::BLOB: {
+                auto* sc = col_values.string_col;
+                // sc->offsets is int32_t* (Arrow Utf8/Binary spec);
+                // write_string_batch still takes const uint32_t* through the
+                // page/encoder stack.  Offsets are non-negative by
+                // construction so the bit pattern is identical — cast at the
+                // boundary until the downstream chain is converted in a
+                // follow-up.
+                ret = chunk_writer->write_string_batch(
+                    timestamps + start_idx, sc->buffer,
+                    reinterpret_cast<const uint32_t*>(sc->offsets), start_idx,
+                    count);
+                break;
+            }
+            default:
+                ret = write_column(chunk_writer, tablet, col_idx, start_idx,
+                                   end_idx);
+                break;
+        }
+    } else {
+        ret = write_column(chunk_writer, tablet, col_idx, start_idx, end_idx);
+    }
+    return ret;
 }
 
-int TsFileWriter::write_typed_column(ValueChunkWriter* value_chunk_writer,
-                                     int64_t* timestamps,
-                                     Tablet::StringColumn* string_col,
-                                     common::BitMap& col_notnull_bitmap,
-                                     uint32_t start_idx, uint32_t end_idx) {
+int TsFileWriter::value_write_column_batch(ValueChunkWriter* value_chunk_writer,
+                                           const Tablet& tablet, int col_idx,
+                                           uint32_t start_idx,
+                                           uint32_t end_idx) {
     int ret = E_OK;
-    for (uint32_t r = start_idx; r < end_idx; r++) {
-        common::String val(string_col->buffer + string_col->offsets[r],
-                           static_cast<uint32_t>(string_col->offsets[r + 1] -
-                                                 string_col->offsets[r]));
-        if (LIKELY(col_notnull_bitmap.test(r))) {
-            if (RET_FAIL(value_chunk_writer->write(timestamps[r], val, true))) {
-                return ret;
-            }
-        } else {
-            if (RET_FAIL(
-                    value_chunk_writer->write(timestamps[r], val, false))) {
-                return ret;
-            }
+    common::TSDataType data_type = tablet.schema_vec_->at(col_idx).data_type_;
+    int64_t* timestamps = tablet.timestamps_;
+    Tablet::ValueMatrixEntry col_values = tablet.value_matrix_[col_idx];
+    BitMap& col_notnull_bitmap = tablet.bitmaps_[col_idx];
+    end_idx = std::min(end_idx, tablet.max_row_num_);
+    uint32_t count = end_idx - start_idx;
+    if (count == 0) return ret;
+
+    switch (data_type) {
+        case common::BOOLEAN:
+            ret = value_chunk_writer->write_batch(
+                timestamps, col_values.bool_data, col_notnull_bitmap, start_idx,
+                count);
+            break;
+        case common::DATE:
+        case common::INT32:
+            ret = value_chunk_writer->write_batch(
+                timestamps, col_values.int32_data, col_notnull_bitmap,
+                start_idx, count);
+            break;
+        case common::TIMESTAMP:
+        case common::INT64:
+            ret = value_chunk_writer->write_batch(
+                timestamps, col_values.int64_data, col_notnull_bitmap,
+                start_idx, count);
+            break;
+        case common::FLOAT:
+            ret = value_chunk_writer->write_batch(
+                timestamps, col_values.float_data, col_notnull_bitmap,
+                start_idx, count);
+            break;
+        case common::DOUBLE:
+            ret = value_chunk_writer->write_batch(
+                timestamps, col_values.double_data, col_notnull_bitmap,
+                start_idx, count);
+            break;
+        case common::STRING:
+        case common::TEXT:
+        case common::BLOB: {
+            auto* sc = col_values.string_col;
+            // See above: sc->offsets is int32_t*, downstream still uint32_t*.
+            ret = value_chunk_writer->write_string_batch(
+                timestamps, sc->buffer,
+                reinterpret_cast<const uint32_t*>(sc->offsets),
+                col_notnull_bitmap, start_idx, count);
+            break;
         }
+        default:
+            ret = E_NOT_SUPPORT;
+            break;
     }
     return ret;
 }
 
 // TODO make sure ret is meaningful to SDK user
 int TsFileWriter::flush() {
+    if (UNLIKELY(unrecoverable_)) return E_DATA_INCONSISTENCY;
     int ret = E_OK;
     if (!start_file_done_) {
         if (RET_FAIL(io_writer_->start_file())) {
@@ -1690,9 +1791,10 @@ int TsFileWriter::flush() {
 
     /* since @schemas_ used std::map which is rbtree underlying,
              so map itself is ordered by device name. */
+
     DeviceSchemasMapIter device_iter;
     for (device_iter = schemas_.begin(); device_iter != schemas_.end();
-         device_iter++) {  // cppcheck-suppress postfixOperator
+         device_iter++) {
         if (check_chunk_group_empty(device_iter->second,
                                     device_iter->second->is_aligned_)) {
             continue;
@@ -1706,6 +1808,7 @@ int TsFileWriter::flush() {
         } else if (RET_FAIL(io_writer_->end_flush_chunk_group(is_aligned))) {
         }
     }
+
     record_count_since_last_flush_ = 0;
     return ret;
 }
@@ -1751,6 +1854,56 @@ bool TsFileWriter::check_chunk_group_empty(MeasurementSchemaGroup* chunk_group,
         writer->reset();                                                       \
     }
 
+// Write already-encoded chunk data to stream (no compression — done earlier).
+#define FLUSH_CHUNK_ENCODED(writer, io_writer, name, data_type, encoding,     \
+                            compression, num_pages)                           \
+    if (RET_FAIL(io_writer->start_flush_chunk(writer->get_chunk_data(), name, \
+                                              data_type, encoding,            \
+                                              compression, num_pages))) {     \
+    } else if (RET_FAIL(io_writer->flush_chunk(writer->get_chunk_data()))) {  \
+    } else if (RET_FAIL(io_writer->end_flush_chunk(                           \
+                   writer->get_chunk_statistic()))) {                         \
+    } else {                                                                  \
+        writer->reset();                                                      \
+    }
+
+int TsFileWriter::flush_chunk_group_encoded(MeasurementSchemaGroup* chunk_group,
+                                            bool is_aligned) {
+    int ret = E_OK;
+    MeasurementSchemaMap& map = chunk_group->measurement_schema_map_;
+
+    if (chunk_group->is_aligned_) {
+        TimeChunkWriter*& time_chunk_writer = chunk_group->time_chunk_writer_;
+        ChunkHeader chunk_header = time_chunk_writer->get_chunk_header();
+        FLUSH_CHUNK_ENCODED(
+            time_chunk_writer, io_writer_, chunk_header.measurement_name_,
+            chunk_header.data_type_, chunk_header.encoding_type_,
+            chunk_header.compression_type_, time_chunk_writer->num_of_pages())
+    }
+
+    for (MeasurementSchemaMapIter ms_iter = map.begin(); ms_iter != map.end();
+         ms_iter++) {
+        MeasurementSchema* m_schema = ms_iter->second;
+        if (!chunk_group->is_aligned_ && m_schema->chunk_writer_ != nullptr) {
+            ChunkWriter*& chunk_writer = m_schema->chunk_writer_;
+            FLUSH_CHUNK_ENCODED(
+                chunk_writer, io_writer_, m_schema->measurement_name_,
+                m_schema->data_type_, m_schema->encoding_,
+                m_schema->compression_type_, chunk_writer->num_of_pages())
+        } else if (m_schema->value_chunk_writer_ != nullptr &&
+                   m_schema->value_chunk_writer_->hasData()) {
+            ValueChunkWriter*& value_chunk_writer =
+                m_schema->value_chunk_writer_;
+            FLUSH_CHUNK_ENCODED(
+                value_chunk_writer, io_writer_, m_schema->measurement_name_,
+                m_schema->data_type_, m_schema->encoding_,
+                m_schema->compression_type_, value_chunk_writer->num_of_pages())
+        }
+    }
+
+    return ret;
+}
+
 int TsFileWriter::flush_chunk_group(MeasurementSchemaGroup* chunk_group,
                                     bool is_aligned) {
     int ret = E_OK;
@@ -1774,7 +1927,8 @@ int TsFileWriter::flush_chunk_group(MeasurementSchemaGroup* chunk_group,
                         m_schema->data_type_, m_schema->encoding_,
                         m_schema->compression_type_,
                         chunk_writer->num_of_pages())
-        } else if (m_schema->value_chunk_writer_ != nullptr) {
+        } else if (m_schema->value_chunk_writer_ != nullptr &&
+                   m_schema->value_chunk_writer_->hasData()) {
             ValueChunkWriter*& value_chunk_writer =
                 m_schema->value_chunk_writer_;
             FLUSH_CHUNK(value_chunk_writer, io_writer_,
@@ -1787,6 +1941,9 @@ int TsFileWriter::flush_chunk_group(MeasurementSchemaGroup* chunk_group,
     return ret;
 }
 
-int TsFileWriter::close() { return io_writer_->end_file(); }
+int TsFileWriter::close() {
+    if (UNLIKELY(unrecoverable_)) return E_DATA_INCONSISTENCY;
+    return io_writer_->end_file();
+}
 
 }  // end namespace storage
diff --git a/cpp/src/writer/tsfile_writer.h b/cpp/src/writer/tsfile_writer.h
index a2c8f2842..e0b102c97 100644
--- a/cpp/src/writer/tsfile_writer.h
+++ b/cpp/src/writer/tsfile_writer.h
@@ -33,7 +33,6 @@
 #include "common/record.h"
 #include "common/schema.h"
 #include "common/tablet.h"
-#include "utils/util_define.h"  // mode_t and other platform-compat shims
 
 namespace storage {
 class WriteFile;
@@ -46,9 +45,12 @@ namespace storage {
 
 extern int libtsfile_init();
 extern void libtsfile_destroy();
-extern void set_page_max_point_count(uint32_t page_max_ponint_count);
-extern void set_max_degree_of_index_node(uint32_t max_degree_of_index_node);
-extern void set_strict_page_size(bool strict_page_size);
+// Returns common::E_INVALID_ARG when count would freeze the chunk writers
+// (i.e. less than 1); leaves the field untouched on rejection.
+extern int set_page_max_point_count(uint32_t page_max_ponint_count);
+// Returns common::E_INVALID_ARG when degree < 2 (which collapses the index
+// tree); leaves the field untouched on rejection.
+extern int set_max_degree_of_index_node(uint32_t max_degree_of_index_node);
 
 class TsFileWriter {
    public:
@@ -98,6 +100,7 @@ class TsFileWriter {
     std::shared_ptr<TableSchema> get_table_schema(
         const std::string& table_name) const;
     int64_t calculate_mem_size_for_all_group();
+    int64_t calculate_meta_mem_size() const;
     int check_memory_size_and_may_flush_chunks();
     /*
      * Flush buffer to disk file, but do not writer file index part.
@@ -125,25 +128,15 @@ class TsFileWriter {
         int32_t time_pages_before,
         const std::vector<int32_t>& value_pages_before);
     int flush_chunk_group(MeasurementSchemaGroup* chunk_group, bool is_aligned);
+    int flush_chunk_group_encoded(MeasurementSchemaGroup* chunk_group,
+                                  bool is_aligned);
 
+    // Numeric columns (bool/int32/int64/float/double) share one body:
+    // non-aligned ChunkWriter skips null rows entirely.  Defined in the .cc;
+    // every instantiation lives in that translation unit.
+    template <typename T>
     int write_typed_column(storage::ChunkWriter* chunk_writer,
-                           int64_t* timestamps, bool* col_values,
-                           common::BitMap& col_notnull_bitmap,
-                           uint32_t start_idx, uint32_t end_idx);
-    int write_typed_column(storage::ChunkWriter* chunk_writer,
-                           int64_t* timestamps, int32_t* col_values,
-                           common::BitMap& col_notnull_bitmap,
-                           uint32_t start_idx, uint32_t end_idx);
-    int write_typed_column(storage::ChunkWriter* chunk_writer,
-                           int64_t* timestamps, int64_t* col_values,
-                           common::BitMap& col_notnull_bitmap,
-                           uint32_t start_idx, uint32_t end_idx);
-    int write_typed_column(storage::ChunkWriter* chunk_writer,
-                           int64_t* timestamps, float* col_values,
-                           common::BitMap& col_notnull_bitmap,
-                           uint32_t start_idx, uint32_t end_idx);
-    int write_typed_column(storage::ChunkWriter* chunk_writer,
-                           int64_t* timestamps, double* col_values,
+                           int64_t* timestamps, T* col_values,
                            common::BitMap& col_notnull_bitmap,
                            uint32_t start_idx, uint32_t end_idx);
     int write_typed_column(ChunkWriter* chunk_writer, int64_t* timestamps,
@@ -196,41 +189,33 @@ class TsFileWriter {
     int64_t record_count_for_next_mem_check_;
     bool write_file_created_;
     bool io_writer_owned_;  // false when init(RestorableTsFileIOWriter*)
-    bool enforce_recovered_last_time_order_;
-
-    int write_typed_column(ValueChunkWriter* value_chunk_writer,
-                           int64_t* timestamps, bool* col_values,
-                           common::BitMap& col_notnull_bitmap,
-                           uint32_t start_idx, uint32_t end_idx);
-
-    int write_typed_column(ValueChunkWriter* value_chunk_writer,
-                           int64_t* timestamps, double* col_values,
-                           common::BitMap& col_notnull_bitmap,
-                           uint32_t start_idx, uint32_t end_idx);
-    int write_typed_column(ValueChunkWriter* value_chunk_writer,
-                           int64_t* timestamps,
-                           Tablet::StringColumn* string_col,
-                           common::BitMap& col_notnull_bitmap,
-                           uint32_t start_idx, uint32_t end_idx);
-
-    int write_typed_column(ValueChunkWriter* value_chunk_writer,
-                           int64_t* timestamps, float* col_values,
-                           common::BitMap& col_notnull_bitmap,
-                           uint32_t start_idx, uint32_t end_idx);
-
-    int write_typed_column(ValueChunkWriter* value_chunk_writer,
-                           int64_t* timestamps, int32_t* col_values,
-                           common::BitMap& col_notnull_bitmap,
-                           uint32_t start_idx, uint32_t end_idx);
-
-    int write_typed_column(ValueChunkWriter* value_chunk_writer,
-                           int64_t* timestamps, int64_t* col_values,
-                           common::BitMap& col_notnull_bitmap,
-                           uint32_t start_idx, uint32_t end_idx);
-
-    int value_write_column(ValueChunkWriter* value_chunk_writer,
+    // Only the recovery init path sets this true: subsequent writes must
+    // refuse timestamps <= the recovered per-device last_time_ so the chunk
+    // ordering invariants preserved by RestorableTsFileIOWriter are not
+    // broken by appending older data.
+    bool enforce_recovered_last_time_order_ = false;
+    bool table_aligned_ = true;
+    // Set once a partial-write failure leaves the per-column chunk writers
+    // out of sync (e.g. parallel aligned tablet write where one task fails
+    // mid-way while others succeed).  Subsequent write/flush/close calls
+    // refuse to operate so that the on-disk file isn't sealed with row
+    // counts that disagree between time and value columns.
+    bool unrecoverable_ = false;
+    // Test-only accessor for the unrecoverable contract: real triggers
+    // (parallel task failure, out-of-order timestamps across multiple chunk
+    // writers) are hard to drive deterministically, but the contract —
+    // flush/close refuse — can be unit-tested directly.
+    friend class TsFileWriterUnrecoverableTest;
+
+    int write_column_batch(storage::ChunkWriter* chunk_writer,
                            const Tablet& tablet, int col_idx,
                            uint32_t start_idx, uint32_t end_idx);
+    int time_write_column_batch(TimeChunkWriter* time_chunk_writer,
+                                const Tablet& tablet, uint32_t start_idx,
+                                uint32_t end_idx);
+    int value_write_column_batch(ValueChunkWriter* value_chunk_writer,
+                                 const Tablet& tablet, int col_idx,
+                                 uint32_t start_idx, uint32_t end_idx);
 };
 
 }  // end namespace storage
diff --git a/cpp/src/writer/value_chunk_writer.cc b/cpp/src/writer/value_chunk_writer.cc
index a59cf8d3f..182b0762b 100644
--- a/cpp/src/writer/value_chunk_writer.cc
+++ b/cpp/src/writer/value_chunk_writer.cc
@@ -110,7 +110,7 @@ int ValueChunkWriter::seal_cur_page(bool end_chunk) {
                 /*stat*/ false, /*data*/ false);
             if (IS_SUCC(ret)) {
                 save_first_page_data(value_page_writer_);
-                value_page_writer_.clear_page_data();
+                // value_page_writer_.destroy_page_data();
                 value_page_writer_.reset();
             }
         }
@@ -145,6 +145,11 @@ void ValueChunkWriter::save_first_page_data(
     ValuePageWriter& first_page_writer) {
     first_page_data_ = first_page_writer.get_cur_page_data();
     first_page_statistic_->deep_copy_from(first_page_writer.get_statistic());
+    // Take ownership of the heap buffers: get_cur_page_data() returned a
+    // shallow copy, so without this we'd alias compressed_buf_ /
+    // uncompressed_buf_ between cur_page_data_ and first_page_data_ and
+    // double-free at destroy() time.
+    first_page_writer.release_cur_page_data();
 }
 
 int ValueChunkWriter::write_first_page_data(ByteStream& pages_data,
@@ -161,8 +166,7 @@ int ValueChunkWriter::write_first_page_data(ByteStream& pages_data,
 
 int ValueChunkWriter::end_encode_chunk() {
     int ret = E_OK;
-    if (value_page_writer_.get_point_numer() > 0 ||
-        (has_current_page_data() && num_of_pages_ == 0)) {
+    if (has_current_page_data()) {
         ret = seal_cur_page(/*end_chunk*/ true);
         if (E_OK == ret) {
             chunk_header_.data_size_ = chunk_data_.total_size();
@@ -175,9 +179,6 @@ int ValueChunkWriter::end_encode_chunk() {
             chunk_header_.data_size_ = chunk_data_.total_size();
             chunk_header_.num_of_pages_ = num_of_pages_;
         }
-    } else if (num_of_pages_ > 0) {
-        chunk_header_.data_size_ = chunk_data_.total_size();
-        chunk_header_.num_of_pages_ = num_of_pages_;
     }
 #if DEBUG_SE
     std::cout << "end_encode_chunk: num_of_pages_=" << num_of_pages_
diff --git a/cpp/src/writer/value_chunk_writer.h b/cpp/src/writer/value_chunk_writer.h
index 64eb4cc50..cd7c75a54 100644
--- a/cpp/src/writer/value_chunk_writer.h
+++ b/cpp/src/writer/value_chunk_writer.h
@@ -53,8 +53,7 @@ class ValueChunkWriter {
           first_page_data_(),
           first_page_statistic_(nullptr),
           chunk_header_(),
-          num_of_pages_(0),
-          enable_page_seal_if_full_(true) {}
+          num_of_pages_(0) {}
     ~ValueChunkWriter() { destroy(); }
     int init(const common::ColumnSchema& col_schema);
     int init(const std::string& measurement_name, common::TSDataType data_type,
@@ -110,6 +109,71 @@ class ValueChunkWriter {
         VCW_DO_WRITE_FOR_TYPE(isnull);
     }
 
+    template <typename T>
+    int write_batch(const int64_t* timestamps, const T* values,
+                    const common::BitMap& col_notnull_bitmap,
+                    uint32_t start_idx, uint32_t count) {
+        int ret = common::E_OK;
+        uint32_t offset = 0;
+        const uint32_t page_cap =
+            common::g_config_value_.page_writer_max_point_num_;
+        while (offset < count) {
+            uint32_t cur_points = value_page_writer_.get_point_numer();
+            // get_point_numer() now returns size_ (rows including nulls and
+            // the just-written batch), so it can momentarily exceed page_cap;
+            // seal whenever we are at or past the cap to avoid uint32 wrap.
+            if (cur_points >= page_cap) {
+                if (RET_FAIL(seal_cur_page(false))) {
+                    return ret;
+                }
+                cur_points = 0;
+            }
+            uint32_t page_remaining = page_cap - cur_points;
+            uint32_t batch_size = std::min(count - offset, page_remaining);
+            if (RET_FAIL(value_page_writer_.write_batch(
+                    timestamps, values, col_notnull_bitmap, start_idx + offset,
+                    batch_size))) {
+                return ret;
+            }
+            offset += batch_size;
+            if (RET_FAIL(seal_cur_page_if_full())) {
+                return ret;
+            }
+        }
+        return ret;
+    }
+
+    int write_string_batch(const int64_t* timestamps, const char* buffer,
+                           const uint32_t* offsets,
+                           const common::BitMap& col_notnull_bitmap,
+                           uint32_t start_idx, uint32_t count) {
+        int ret = common::E_OK;
+        uint32_t offset = 0;
+        const uint32_t page_cap =
+            common::g_config_value_.page_writer_max_point_num_;
+        while (offset < count) {
+            uint32_t cur_points = value_page_writer_.get_point_numer();
+            if (cur_points >= page_cap) {
+                if (RET_FAIL(seal_cur_page(false))) {
+                    return ret;
+                }
+                cur_points = 0;
+            }
+            uint32_t page_remaining = page_cap - cur_points;
+            uint32_t batch_size = std::min(count - offset, page_remaining);
+            if (RET_FAIL(value_page_writer_.write_string_batch(
+                    timestamps, buffer, offsets, col_notnull_bitmap,
+                    start_idx + offset, batch_size))) {
+                return ret;
+            }
+            offset += batch_size;
+            if (RET_FAIL(seal_cur_page_if_full())) {
+                return ret;
+            }
+        }
+        return ret;
+    }
+
     int end_encode_chunk();
     common::ByteStream& get_chunk_data() { return chunk_data_; }
     Statistic* get_chunk_statistic() { return chunk_statistic_; }
@@ -119,25 +183,21 @@ class ValueChunkWriter {
 
     bool hasData();
 
-    /** True if the current (unsealed) page has at least one write (including
-     * nulls). */
+    /** True if the current (unsealed) page has at least one write
+     *  (including NULLs). */
     bool has_current_page_data() const {
-        return value_page_writer_.get_total_write_count() > 0;
+        return value_page_writer_.get_point_numer() > 0;
     }
 
     FORCE_INLINE uint32_t get_point_numer() const {
         return value_page_writer_.get_point_numer();
     }
 
-    /**
-     * Force seal the current page (for aligned table model: when time page
-     * seals due to memory/point threshold, all value pages must seal together).
-     * @return E_OK on success.
-     */
+    /** Force seal the current page. */
     int seal_current_page() { return seal_cur_page(false); }
 
-    // For aligned writer: allow disabling the automatic page-size/point-number
-    // check so the caller can seal pages at chosen boundaries.
+    // Allow disabling the automatic page-size/point-number check so the
+    // caller can seal pages at chosen boundaries.
     FORCE_INLINE void set_enable_page_seal_if_full(bool enable) {
         enable_page_seal_if_full_ = enable;
     }
@@ -183,8 +243,7 @@ class ValueChunkWriter {
 
     ChunkHeader chunk_header_;
     int32_t num_of_pages_;
-    // If false, write() won't auto-seal when the current page becomes full.
-    bool enable_page_seal_if_full_;
+    bool enable_page_seal_if_full_ = true;
 };
 
 }  // end namespace storage
diff --git a/cpp/src/writer/value_page_writer.cc b/cpp/src/writer/value_page_writer.cc
index a7bcd89c4..c538ea2fa 100644
--- a/cpp/src/writer/value_page_writer.cc
+++ b/cpp/src/writer/value_page_writer.cc
@@ -59,6 +59,10 @@ int ValuePageData::init(ByteStream& col_notnull_bitmap_bs, ByteStream& value_bs,
                                           uncompressed_buf_ + sizeof(size) +
                                               col_notnull_bitmap_buf_size_,
                                           value_buf_size_))) {
+        // value_buf_size_ == 0 is a fully-null value page: only the bitmap is
+        // written, value_out_stream_ is empty. Skip the copy — feeding an
+        // empty stream to copy_bs_to_buf trips ASSERT(b.len_ > 0) in the
+        // buffer iterator. (Restores the #734 aligned-page-seal fix.)
     } else {
         // TODO
         // NOTE: different compressor may have different compress API
@@ -119,6 +123,8 @@ void ValuePageWriter::reset() {
     }
     col_notnull_bitmap_out_stream_.reset();
     value_out_stream_.reset();
+    col_notnull_bitmap_.clear();
+    size_ = 0;
 }
 
 void ValuePageWriter::destroy() {
diff --git a/cpp/src/writer/value_page_writer.h b/cpp/src/writer/value_page_writer.h
index 97f8a5f0d..92c39b9b2 100644
--- a/cpp/src/writer/value_page_writer.h
+++ b/cpp/src/writer/value_page_writer.h
@@ -59,19 +59,6 @@ struct ValuePageData {
             compressor_->after_compress(compressed_buf_);
             compressed_buf_ = nullptr;
         }
-        compressor_ = nullptr;
-    }
-
-    /** Clear pointers without freeing (transfer ownership to another holder).
-     */
-    void clear() {
-        col_notnull_bitmap_buf_size_ = 0;
-        value_buf_size_ = 0;
-        uncompressed_size_ = 0;
-        compressed_size_ = 0;
-        uncompressed_buf_ = nullptr;
-        compressed_buf_ = nullptr;
-        compressor_ = nullptr;
     }
 };
 
@@ -163,11 +150,170 @@ class ValuePageWriter {
         VPW_DO_WRITE_FOR_TYPE(isnull);
     }
 
-    FORCE_INLINE uint32_t get_point_numer() const { return statistic_->count_; }
-    FORCE_INLINE uint32_t get_total_write_count() const { return size_; }
+    // Batch write for aligned/table model.
+    // In the tablet bitmap: bit=1 means null, bit=0 means not null.
+    // In VPW_DO_WRITE_FOR_TYPE: ISNULL=true skips encoding.
+    // So: tablet bitmap.test(r)=true -> isnull=true (null value)
+    //     tablet bitmap.test(r)=false -> isnull=false (valid value)
+    template <typename T>
+    int write_batch(const int64_t* timestamps, const T* values,
+                    const common::BitMap& col_notnull_bitmap,
+                    uint32_t start_idx, uint32_t count) {
+        int ret = common::E_OK;
+        if (count == 0) return ret;
+
+        // Count the not-null rows but defer mutating size_ /
+        // col_notnull_bitmap_ until the value encode finishes successfully.
+        // Previously the bitmap and size_ were bumped first, so a half-failed
+        // encode_batch left the page claiming `count` rows had been written
+        // when only a prefix made it into value_out_stream_ — a subsequent
+        // re-encode would interleave with the stale stream and produce a
+        // misaligned page on disk.
+        uint32_t valid_count = 0;
+        for (uint32_t i = 0; i < count; i++) {
+            uint32_t row = start_idx + i;
+            // bit=1 in tablet bitmap means null; bit=0 means not null
+            if (!const_cast<common::BitMap&>(col_notnull_bitmap).test(row)) {
+                valid_count++;
+            }
+        }
+
+        if (valid_count == 0) {
+            // Still need to advance size_ so trailing null rows are tracked.
+            for (uint32_t i = 0; i < count; i++) {
+                if ((size_ / 8) + 1 > col_notnull_bitmap_.size()) {
+                    col_notnull_bitmap_.push_back(0);
+                }
+                size_++;
+            }
+            return ret;
+        }
+
+        // If all values are valid, we can encode the batch directly
+        if (valid_count == count) {
+            if (RET_FAIL(value_encoder_->encode_batch(values + start_idx, count,
+                                                      value_out_stream_))) {
+                // Don't bump size_/bitmap on encode failure.
+                return ret;
+            }
+            statistic_->update_batch(timestamps + start_idx, values + start_idx,
+                                     count);
+        } else {
+            // Encode only non-null values one by one
+            for (uint32_t i = 0; i < count; i++) {
+                uint32_t row = start_idx + i;
+                if (!const_cast<common::BitMap&>(col_notnull_bitmap)
+                         .test(row)) {
+                    if (RET_FAIL(value_encoder_->encode(values[row],
+                                                        value_out_stream_))) {
+                        return ret;
+                    }
+                    statistic_->update(timestamps[row], values[row]);
+                }
+            }
+        }
+
+        // Commit size_ + page bitmap now that all encoding succeeded.
+        for (uint32_t i = 0; i < count; i++) {
+            uint32_t row = start_idx + i;
+            if ((size_ / 8) + 1 > col_notnull_bitmap_.size()) {
+                col_notnull_bitmap_.push_back(0);
+            }
+            if (!const_cast<common::BitMap&>(col_notnull_bitmap).test(row)) {
+                col_notnull_bitmap_[size_ / 8] |= (MASK >> (size_ % 8));
+            }
+            size_++;
+        }
+        return ret;
+    }
+
+    // Batch write strings from Arrow-style offset+buffer layout with null
+    // bitmap.  See write_batch above for the encode-before-commit rationale.
+    int write_string_batch(const int64_t* timestamps, const char* buffer,
+                           const uint32_t* offsets,
+                           const common::BitMap& col_notnull_bitmap,
+                           uint32_t start_idx, uint32_t count) {
+        int ret = common::E_OK;
+        if (count == 0) return ret;
+
+        // Count valid rows up-front without mutating size_ / page bitmap.
+        uint32_t valid_count = 0;
+        for (uint32_t i = 0; i < count; i++) {
+            uint32_t row = start_idx + i;
+            if (!const_cast<common::BitMap&>(col_notnull_bitmap).test(row)) {
+                valid_count++;
+            }
+        }
+
+        if (valid_count == 0) {
+            // Advance size_ so the trailing null rows still count.
+            for (uint32_t i = 0; i < count; i++) {
+                if ((size_ / 8) + 1 > col_notnull_bitmap_.size()) {
+                    col_notnull_bitmap_.push_back(0);
+                }
+                size_++;
+            }
+            return ret;
+        }
+
+        // Phase 2: encode non-null strings (no page-state mutation yet).
+        if (valid_count == count) {
+            // All valid — batch encode directly
+            if (RET_FAIL(value_encoder_->encode_string_batch(
+                    buffer, offsets, start_idx, count, value_out_stream_))) {
+                return ret;
+            }
+        } else {
+            // Mixed — encode only non-null strings one by one
+            for (uint32_t i = 0; i < count; i++) {
+                uint32_t row = start_idx + i;
+                if (!const_cast<common::BitMap&>(col_notnull_bitmap)
+                         .test(row)) {
+                    uint32_t len = offsets[row + 1] - offsets[row];
+                    common::String val(buffer + offsets[row], len);
+                    if (RET_FAIL(
+                            value_encoder_->encode(val, value_out_stream_))) {
+                        return ret;
+                    }
+                }
+            }
+        }
+
+        // Phase 3: update statistics for non-null rows.
+        for (uint32_t i = 0; i < count; i++) {
+            uint32_t row = start_idx + i;
+            if (!const_cast<common::BitMap&>(col_notnull_bitmap).test(row)) {
+                uint32_t len = offsets[row + 1] - offsets[row];
+                common::String val(buffer + offsets[row], len);
+                statistic_->update(timestamps[row], val);
+            }
+        }
+
+        // Phase 4: commit page-level state (bitmap + size_) only after the
+        // encoder calls all succeeded.
+        for (uint32_t i = 0; i < count; i++) {
+            uint32_t row = start_idx + i;
+            if ((size_ / 8) + 1 > col_notnull_bitmap_.size()) {
+                col_notnull_bitmap_.push_back(0);
+            }
+            if (!const_cast<common::BitMap&>(col_notnull_bitmap).test(row)) {
+                col_notnull_bitmap_[size_ / 8] |= (MASK >> (size_ % 8));
+            }
+            size_++;
+        }
+        return ret;
+    }
+
+    // Rows in the current page including NULLs (NULLs advance size_ but not
+    // statistic_->count_).  This is the count the page-seal logic uses so
+    // value-column page boundaries stay aligned with the time column.
+    FORCE_INLINE uint32_t get_point_numer() const { return size_; }
     FORCE_INLINE uint32_t get_col_notnull_bitmap_out_stream_size() const {
         return col_notnull_bitmap_out_stream_.total_size();
     }
+    // Logical bytes written — used by the page-seal-when-full heuristic.
+    // Memory-pressure accounting uses estimate_max_mem_size() below, which
+    // counts the real 64 KiB-page footprint.
     FORCE_INLINE uint32_t get_page_memory_size() const {
         return col_notnull_bitmap_out_stream_.total_size() +
                value_out_stream_.total_size();
@@ -177,12 +323,16 @@ class ValuePageWriter {
      * outputStream and value outputStream, because size outputStream is never
      * used until flushing.
      *
+     * Reports the *allocated* stream footprint — see PageWriter::
+     * estimate_max_mem_size for rationale.
+     *
      * @return allocated size in time, value and outputStream
      */
     FORCE_INLINE uint32_t estimate_max_mem_size() const {
         return sizeof(int32_t) + 1 +
-               col_notnull_bitmap_out_stream_.total_size() +
-               value_out_stream_.total_size() +
+               static_cast<uint32_t>(
+                   col_notnull_bitmap_out_stream_.allocated_bytes() +
+                   value_out_stream_.allocated_bytes()) +
                value_encoder_->get_max_byte_size();
     }
     int write_to_chunk(common::ByteStream& pages_data, bool write_header,
@@ -195,9 +345,16 @@ class ValuePageWriter {
     }
     FORCE_INLINE Statistic* get_statistic() { return statistic_; }
     ValuePageData get_cur_page_data() { return cur_page_data_; }
+    // Transfer ownership of cur_page_data_'s heap buffers (uncompressed_buf_
+    // and compressed_buf_) out of this writer. Callers use this together with
+    // get_cur_page_data() to keep a long-lived copy of the data (e.g. as the
+    // first-page snapshot) without leaving an alias here that would cause a
+    // double free on destroy.
+    void release_cur_page_data() {
+        cur_page_data_.uncompressed_buf_ = nullptr;
+        cur_page_data_.compressed_buf_ = nullptr;
+    }
     void destroy_page_data() { cur_page_data_.destroy(); }
-    /** Clear cur_page_data_ without freeing (after ownership transferred). */
-    void clear_page_data() { cur_page_data_.clear(); }
 
    private:
     FORCE_INLINE int prepare_end_page() {
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index 513cbd5ca..066e5accb 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -169,7 +169,7 @@ if (ENABLE_LZOKAY)
 endif()
 
 if (ENABLE_ZLIB)
-    include_directories(${CMAKE_SOURCE_DIR}/third_party/zlib-1.2.13)
+    include_directories(${THIRD_PARTY_INCLUDE}/zlib-1.3.1)
 endif()
 
 if (ENABLE_ANTLR4)
@@ -186,6 +186,7 @@ file(GLOB_RECURSE TEST_SRCS
         "reader/*_test.cc"
         "writer/*_test.cc"
         "cwrapper/*_test.cc"
+        "compress/*_test.cc"
 )
 
 # Parser tests depend on the ANTLR4 runtime; only build them when it is enabled.
diff --git a/cpp/test/common/allocator/byte_stream_test.cc b/cpp/test/common/allocator/byte_stream_test.cc
index b211803c3..3f57cbf84 100644
--- a/cpp/test/common/allocator/byte_stream_test.cc
+++ b/cpp/test/common/allocator/byte_stream_test.cc
@@ -87,7 +87,6 @@ TEST_F(ByteStreamTest, WriteReadLargeQuantities) {
         write_to_stream(&data, 1);
     }
 
-    // 1 MiB buffer: keep it off the stack (MSVC's default stack is only 1 MiB).
     static uint8_t read_buffer[1024 * 1024];
     for (int i = 0; i < 1024 * 1024; i++) {
         uint32_t read_len = 0;
@@ -186,6 +185,42 @@ TEST_F(ByteStreamTest, ReadMoreThanAvailableTest) {
     ASSERT_EQ(read_len, data_size);
 }
 
+// Regression: the ctor used to take page_size verbatim, but hot read/write
+// paths use `& (page_size-1)` as a bitmask.  A non-power-of-2 page_size
+// would cause page-crossing logic to misfire, corrupting written data.
+// Constructing with 1000 should still round-trip cleanly across many pages.
+// Regression: round_up_pow2 used `while (ps < n) ps <<= 1`, which overflows
+// to 0 once ps passes 2^31 and never matches, looping forever.  Verify the
+// clamped helper returns the largest representable power of two instead.
+TEST(ByteStreamCtorTest, RoundUpPow2ClampsHugeInput) {
+    EXPECT_EQ(round_up_pow2(0u), 1u);
+    EXPECT_EQ(round_up_pow2(1u), 1u);
+    EXPECT_EQ(round_up_pow2(1000u), 1024u);
+    EXPECT_EQ(round_up_pow2(1024u), 1024u);
+    EXPECT_EQ(round_up_pow2(0x80000000u), 0x80000000u);
+    EXPECT_EQ(round_up_pow2(0x80000001u), 0x80000000u);
+    EXPECT_EQ(round_up_pow2(0xFFFFFFFFu), 0x80000000u);
+}
+
+TEST(ByteStreamCtorTest, NonPowerOfTwoPageSizeRoundTrip) {
+    ByteStream bs(1000, MOD_DEFAULT, false);
+    // Span ~5 pages: 1024 * 5 = 5120 bytes.
+    const uint32_t N = 5120;
+    std::vector<uint8_t> data(N);
+    for (uint32_t i = 0; i < N; i++) {
+        data[i] = static_cast<uint8_t>((i * 31 + 7) & 0xff);
+    }
+    ASSERT_EQ(bs.write_buf(data.data(), N), common::E_OK);
+
+    std::vector<uint8_t> out(N, 0);
+    uint32_t read_len = 0;
+    ASSERT_EQ(bs.read_buf(out.data(), N, read_len), common::E_OK);
+    ASSERT_EQ(read_len, N);
+    for (uint32_t i = 0; i < N; i++) {
+        ASSERT_EQ(out[i], data[i]) << "mismatch at idx " << i;
+    }
+}
+
 TEST_F(ByteStreamTest, WrapAndClearTest) {
     const char externalBuffer[] = "Hello, World!";
     const int32_t bufferSize = sizeof(externalBuffer);
@@ -316,4 +351,70 @@ TEST_F(SerializationUtilTest, WriteReadIntLEPaddedBitWidthBoundaryValue) {
     }
 }
 
-}  // namespace common
\ No newline at end of file
+// Regression: total_size_ was widened to uint64_t but the read-cursor APIs
+// stayed uint32_t.  A stream that legitimately reaches >4 GiB would have
+// remaining_size() / read_pos() / set_read_pos() truncating to the low 32
+// bits and silently mis-positioning later reads.  Lock the widened type at
+// compile time so a partial revert can't reintroduce truncation, and
+// round-trip a moderate value via the API to catch arithmetic mistakes.
+TEST(ByteStreamWidthTest, ReadCursorApisAre64Bit) {
+    ByteStream s(64, common::MOD_DEFAULT);
+    static_assert(sizeof(decltype(s.read_pos())) >= sizeof(uint64_t),
+                  "ByteStream::read_pos() must return a 64-bit type");
+    static_assert(sizeof(decltype(s.remaining_size())) >= sizeof(uint64_t),
+                  "ByteStream::remaining_size() must return a 64-bit type");
+    static_assert(sizeof(decltype(s.get_mark_len())) >= sizeof(uint64_t),
+                  "ByteStream::get_mark_len() must return a 64-bit type");
+
+    // Round-trip a position via set_read_pos / read_pos on a small wrapped
+    // buffer.  Combined with the static_asserts above this guards the path
+    // arithmetic: a partial revert that kept the signature 64-bit but
+    // truncated read_pos_ to uint32_t internally would fail set_read_pos →
+    // read_pos on values near a 32-bit boundary.
+    constexpr int32_t kLen = 256;
+    std::vector<char> backing(kLen, 0);
+    ByteStream wrapped(common::MOD_DEFAULT);
+    wrapped.wrap_from(backing.data(), kLen);
+    wrapped.set_read_pos(static_cast<uint64_t>(kLen - 7));
+    EXPECT_EQ(wrapped.read_pos(), static_cast<uint64_t>(kLen - 7));
+    EXPECT_EQ(wrapped.remaining_size(), 7u);
+}
+
+// Regression for the 64 KiB page memory-pressure account: ByteStream pages
+// are allocated up to OUT_STREAM_PAGE_SIZE bytes even when only a handful of
+// bytes have been written, so a chunk-group with many sparse measurements
+// can pin tens of megabytes that total_size() can't see.  allocated_bytes()
+// must reflect the real allocated footprint.
+TEST(ByteStreamAllocatedBytesTest, ReportsPageAllocationsNotLogicalSize) {
+    constexpr uint32_t kPageSize = 4096;
+    ByteStream s(kPageSize, common::MOD_DEFAULT);
+    EXPECT_EQ(s.allocated_bytes(), 0u);
+
+    // First write triggers one page allocation; logical size is 4 bytes but
+    // the real footprint should be the rounded page size.
+    uint8_t payload[4] = {1, 2, 3, 4};
+    ASSERT_EQ(s.write_buf(payload, 4), common::E_OK);
+    EXPECT_EQ(s.total_size(), 4u);
+    EXPECT_GE(s.allocated_bytes(), kPageSize);
+    EXPECT_EQ(s.allocated_bytes() % kPageSize, 0u);
+}
+
+// Regression for finding 21 (MSVC reinterpret_cast<atomic<T>*> UB): the
+// OptionalAtomic storage is now a real std::atomic<T>, so atomic ops never
+// observe a non-atomic backing object.  Lock the storage type at compile
+// time so a future refactor can't reintroduce the bare T fallback.
+TEST(OptionalAtomicStorageTest, BackingStorageIsRealAtomic) {
+    OptionalAtomic<uint64_t> oa(0, /*enable_atomic=*/true);
+    static_assert(!std::is_copy_constructible<OptionalAtomic<uint64_t>>::value,
+                  "OptionalAtomic must not be copyable — the std::atomic<T> "
+                  "storage forces explicit load/store");
+    EXPECT_EQ(oa.load(), 0u);
+    oa.store(42);
+    EXPECT_EQ(oa.load(), 42u);
+    EXPECT_EQ(oa.atomic_aaf(8), 50u);
+    EXPECT_EQ(oa.load(), 50u);
+    EXPECT_EQ(oa.atomic_faa(1), 50u);
+    EXPECT_EQ(oa.load(), 51u);
+}
+
+}  // namespace common
diff --git a/cpp/test/common/tablet_test.cc b/cpp/test/common/tablet_test.cc
index 71863f0c7..11dfa485f 100644
--- a/cpp/test/common/tablet_test.cc
+++ b/cpp/test/common/tablet_test.cc
@@ -46,6 +46,144 @@ TEST(TabletTest, BasicFunctionality) {
     EXPECT_EQ(tablet.add_value(1, 1, true), common::E_OK);
 }
 
+// Regression: reset() must restore each column's bitmap to all-null. If the
+// previous batch left some cells with non-null bits cleared and the next batch
+// does not re-fill those cells, get_value() must report them as null so the
+// writer does not emit stale leftover values.
+TEST(TabletTest, ResetClearsBitmap) {
+    std::vector<MeasurementSchema> schema_vec;
+    schema_vec.push_back(MeasurementSchema(
+        "m_int", common::TSDataType::INT32, common::TSEncoding::PLAIN,
+        common::CompressionType::UNCOMPRESSED));
+    schema_vec.push_back(MeasurementSchema(
+        "m_double", common::TSDataType::DOUBLE, common::TSEncoding::PLAIN,
+        common::CompressionType::UNCOMPRESSED));
+    Tablet tablet("dev",
+                  std::make_shared<std::vector<MeasurementSchema>>(schema_vec));
+
+    // First batch fills row 5 in both columns.
+    ASSERT_EQ(tablet.add_value(5u, 0u, static_cast<int32_t>(42)), common::E_OK);
+    ASSERT_EQ(tablet.add_value(5u, 1u, 3.14), common::E_OK);
+
+    common::TSDataType ty;
+    EXPECT_NE(tablet.get_value(5, 0u, ty), nullptr);
+    EXPECT_NE(tablet.get_value(5, 1u, ty), nullptr);
+
+    // Reuse the tablet: reset and write a fresh, smaller batch that does not
+    // touch row 5 at all. Row 5 must come back as null, not as the stale 42.
+    tablet.reset();
+    ASSERT_EQ(tablet.add_value(0u, 0u, static_cast<int32_t>(7)), common::E_OK);
+    EXPECT_NE(tablet.get_value(0, 0u, ty), nullptr);
+    EXPECT_EQ(tablet.get_value(5, 0u, ty), nullptr);
+    EXPECT_EQ(tablet.get_value(5, 1u, ty), nullptr);
+}
+
+// Regression: set_column_values() with a non-null bitmap must update
+// has_set_bits_, otherwise downstream may_have_set_bits() shortcuts treat the
+// column as having no nulls and the writer emits stale/garbage values for the
+// rows the bitmap was meant to mark null.
+TEST(TabletTest, SetColumnValuesBitmapPreservesNullFlag) {
+    std::vector<MeasurementSchema> schema_vec;
+    schema_vec.push_back(MeasurementSchema(
+        "m_int", common::TSDataType::INT32, common::TSEncoding::PLAIN,
+        common::CompressionType::UNCOMPRESSED));
+    Tablet tablet("dev",
+                  std::make_shared<std::vector<MeasurementSchema>>(schema_vec));
+
+    int32_t buf[8] = {1, 2, 3, 4, 5, 6, 7, 8};
+
+    // Step 1: write all 8 rows with no nulls -> clear_all() inside the tablet
+    // sets has_set_bits_=false, matching the state a real workload leaves
+    // behind for a fully-populated column.
+    ASSERT_EQ(tablet.set_column_values(0u, buf, /*bitmap=*/nullptr, 8u),
+              common::E_OK);
+
+    // Step 2: rewrite with a bitmap that marks rows 0 and 7 as NULL.  Tablet's
+    // BitMap layout is LSB-first within each byte (row i -> bit 1<<(i%8)).
+    uint8_t external_bitmap[] = {0x81};  // bit 0 (row 0) + bit 7 (row 7) set
+    ASSERT_EQ(tablet.set_column_values(0u, buf, external_bitmap, 8u),
+              common::E_OK);
+
+    common::TSDataType ty;
+    EXPECT_EQ(tablet.get_value(0, 0u, ty), nullptr);
+    EXPECT_NE(tablet.get_value(1, 0u, ty), nullptr);
+    EXPECT_EQ(tablet.get_value(7, 0u, ty), nullptr);
+}
+
+// Regression: set_column_string_values / set_column_string_repeated used to
+// reinterpret value_matrix_[c].string_col without checking the schema type.
+// Calling them on a numeric column would corrupt that column's numeric
+// buffer.  Verify both reject non-string columns with E_TYPE_NOT_MATCH.
+TEST(TabletTest, StringApisRejectNonStringColumn) {
+    std::vector<MeasurementSchema> schema_vec;
+    schema_vec.push_back(MeasurementSchema(
+        "m_int", common::TSDataType::INT32, common::TSEncoding::PLAIN,
+        common::CompressionType::UNCOMPRESSED));
+    Tablet tablet("dev",
+                  std::make_shared<std::vector<MeasurementSchema>>(schema_vec));
+
+    const char data[] = "hello";
+    int32_t offsets[2] = {0, 5};
+    EXPECT_EQ(tablet.set_column_string_values(0u, offsets, data, nullptr, 1u),
+              common::E_TYPE_NOT_MATCH);
+    EXPECT_EQ(tablet.set_column_string_repeated(0u, "x", 1u, 4u),
+              common::E_TYPE_NOT_MATCH);
+}
+
+// Regression: str_len * count used to be computed in uint32_t and would wrap
+// silently, leaving the loop to write past the truncated allocation.
+// 65536 * 65537 = 4295032832 → wraps to 65536 in uint32_t.
+TEST(TabletTest, StringRepeatedTotalBytesOverflowRejected) {
+    std::vector<MeasurementSchema> schema_vec;
+    schema_vec.push_back(MeasurementSchema(
+        "m_str", common::TSDataType::STRING, common::TSEncoding::PLAIN,
+        common::CompressionType::UNCOMPRESSED));
+    Tablet tablet("dev",
+                  std::make_shared<std::vector<MeasurementSchema>>(schema_vec),
+                  100000u);
+    std::string big_str(65536, 'a');
+    EXPECT_EQ(tablet.set_column_string_repeated(0u, big_str.c_str(),
+                                                /*str_len=*/65536u,
+                                                /*count=*/65537u),
+              common::E_OVERFLOW);
+}
+
+// Regression: set_column_string_values only checked offsets[count] before;
+// non-monotonic / negative / non-zero-start offsets would underflow the
+// downstream `offsets[i+1] - offsets[i]` length calc and trigger wild
+// memcpy.  Verify each malformed input is rejected with E_INVALID_ARG.
+TEST(TabletTest, StringValuesRejectsMalformedOffsets) {
+    std::vector<MeasurementSchema> schema_vec;
+    schema_vec.push_back(MeasurementSchema(
+        "m_str", common::TSDataType::STRING, common::TSEncoding::PLAIN,
+        common::CompressionType::UNCOMPRESSED));
+    Tablet tablet("dev",
+                  std::make_shared<std::vector<MeasurementSchema>>(schema_vec));
+    const char data[] = "abcdefghij";
+
+    // Non-zero start offset.
+    int32_t off_bad_start[3] = {1, 5, 10};
+    EXPECT_EQ(
+        tablet.set_column_string_values(0u, off_bad_start, data, nullptr, 2u),
+        common::E_INVALID_ARG);
+
+    // Non-monotonic: {0, 10, 5}.
+    int32_t off_non_mono[3] = {0, 10, 5};
+    EXPECT_EQ(
+        tablet.set_column_string_values(0u, off_non_mono, data, nullptr, 2u),
+        common::E_INVALID_ARG);
+
+    // Negative offset somewhere in the middle.
+    int32_t off_neg[3] = {0, -1, 5};
+    EXPECT_EQ(tablet.set_column_string_values(0u, off_neg, data, nullptr, 2u),
+              common::E_INVALID_ARG);
+
+    // Sanity: well-formed offsets succeed.
+    int32_t off_ok[3] = {0, 3, 7};
+    EXPECT_EQ(tablet.set_column_string_values(0u, off_ok, data, nullptr, 2u),
+              common::E_OK);
+}
+
 TEST(TabletTest, LargeQuantities) {
     std::string device_name = "test_device";
     std::vector<MeasurementSchema> schema_vec;
diff --git a/cpp/test/common/thread_pool_test.cc b/cpp/test/common/thread_pool_test.cc
new file mode 100644
index 000000000..1fe7465cf
--- /dev/null
+++ b/cpp/test/common/thread_pool_test.cc
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#ifdef ENABLE_THREADS
+
+#include "common/thread_pool.h"
+
+#include <gtest/gtest.h>
+
+#include <atomic>
+#include <chrono>
+#include <future>
+#include <thread>
+
+// Regression: a zero-sized ThreadPool used to silently accept submit() but
+// block wait_all() forever (no worker thread, so active_ never reaches 0).
+// init_common() clamps thread_count_ to >= 1 before building the global pool,
+// but the ctor normalizes zero to a single worker as a defensive backstop so
+// any direct ThreadPool(0) still makes progress instead of hanging.
+TEST(ThreadPoolTest, ZeroThreadPoolStillExecutesAndDrains) {
+    common::ThreadPool pool(0);
+    EXPECT_GE(pool.num_threads(), static_cast<size_t>(1));
+
+    std::atomic<int> ran{0};
+    pool.submit([&ran]() { ran.fetch_add(1); });
+    auto fut = pool.submit([]() { return 42; });
+
+    auto wait_with_timeout = [&pool]() {
+        // wait_all has no timeout; run it in a helper thread we can join().
+        std::promise<void> done;
+        auto fut = done.get_future();
+        std::thread t([&pool, &done]() {
+            pool.wait_all();
+            done.set_value();
+        });
+        auto status = fut.wait_for(std::chrono::seconds(2));
+        if (status != std::future_status::ready) {
+            // Detach so a hung pool doesn't terminate the test process.
+            t.detach();
+            return false;
+        }
+        t.join();
+        return true;
+    };
+
+    ASSERT_TRUE(wait_with_timeout()) << "wait_all hung — zero-thread pool";
+    EXPECT_EQ(ran.load(), 1);
+    EXPECT_EQ(fut.get(), 42);
+}
+
+#endif  // ENABLE_THREADS
diff --git a/cpp/test/common/tsfile_common_test.cc b/cpp/test/common/tsfile_common_test.cc
index 01e193f79..c451a8136 100644
--- a/cpp/test/common/tsfile_common_test.cc
+++ b/cpp/test/common/tsfile_common_test.cc
@@ -21,6 +21,9 @@
 #include <common/schema.h>
 #include <gtest/gtest.h>
 
+#include "common/global.h"
+#include "compress/compressor_factory.h"
+
 namespace storage {
 TEST(PageHeaderTest, DefaultConstructor) {
     PageHeader header;
@@ -471,4 +474,26 @@ TEST_F(TsFileMetaTest, SerializeDeserialize) {
     ASSERT_EQ(*new_meta.tsfile_properties_["key"], std::string("value"));
     ASSERT_EQ(new_meta.tsfile_properties_["null_key"], nullptr);
 }
+
+// Regression: the default-compression configuration must name a compressor
+// that the build actually provides; otherwise CompressorFactory returns
+// nullptr at write time. init_config_value() previously gated SNAPPY on
+// ENABLE_LZ4, which broke --disable-snappy --enable-lz4 builds.
+TEST(DefaultCompressorTest, DefaultIsAllocatable) {
+    common::init_config_value();
+    Compressor* c = CompressorFactory::alloc_compressor(
+        common::g_config_value_.default_compression_type_);
+    ASSERT_NE(c, nullptr);
+#ifdef ENABLE_SNAPPY
+    EXPECT_EQ(common::g_config_value_.default_compression_type_,
+              common::CompressionType::SNAPPY);
+#elif defined(ENABLE_LZ4)
+    EXPECT_EQ(common::g_config_value_.default_compression_type_,
+              common::CompressionType::LZ4);
+#else
+    EXPECT_EQ(common::g_config_value_.default_compression_type_,
+              common::CompressionType::UNCOMPRESSED);
+#endif
+    CompressorFactory::free(c);
+}
 }  // namespace storage
diff --git a/cpp/test/compress/lz4_compressor_test.cc b/cpp/test/compress/lz4_compressor_test.cc
index c57ec0caf..0b2249f8d 100644
--- a/cpp/test/compress/lz4_compressor_test.cc
+++ b/cpp/test/compress/lz4_compressor_test.cc
@@ -126,4 +126,40 @@ TEST_F(LZ4Test, TestBytes2) {
     compressor.after_compress(compressed_buf);
     compressor.after_uncompress(decompressed_buf);
 }
+
+TEST_F(LZ4Test, AfterUncompressFreesParamNotMember) {
+    storage::LZ4Compressor compressor;
+    std::string input_a(1024, 'A');
+    std::string input_b(2048, 'B');
+    char* compressed_a = nullptr;
+    char* compressed_b = nullptr;
+    uint32_t compressed_a_len = 0;
+    uint32_t compressed_b_len = 0;
+
+    ASSERT_EQ(compressor.compress(&input_a[0], input_a.size(), compressed_a,
+                                  compressed_a_len),
+              common::E_OK);
+    ASSERT_EQ(compressor.compress(&input_b[0], input_b.size(), compressed_b,
+                                  compressed_b_len),
+              common::E_OK);
+
+    char* uncompressed_a = nullptr;
+    char* uncompressed_b = nullptr;
+    uint32_t uncompressed_a_len = 0;
+    uint32_t uncompressed_b_len = 0;
+    ASSERT_EQ(compressor.uncompress(compressed_a, compressed_a_len,
+                                    uncompressed_a, uncompressed_a_len),
+              common::E_OK);
+    ASSERT_EQ(compressor.uncompress(compressed_b, compressed_b_len,
+                                    uncompressed_b, uncompressed_b_len),
+              common::E_OK);
+
+    compressor.after_uncompress(uncompressed_a);
+    EXPECT_EQ(uncompressed_b_len, input_b.size());
+    EXPECT_EQ(memcmp(uncompressed_b, input_b.data(), uncompressed_b_len), 0);
+
+    compressor.after_uncompress(uncompressed_b);
+    compressor.after_compress(compressed_a);
+    compressor.after_compress(compressed_b);
+}
 }  // namespace
diff --git a/cpp/test/compress/snappy_compressor_test.cc b/cpp/test/compress/snappy_compressor_test.cc
index d24915d70..249200cce 100644
--- a/cpp/test/compress/snappy_compressor_test.cc
+++ b/cpp/test/compress/snappy_compressor_test.cc
@@ -126,4 +126,40 @@ TEST_F(SnappyTest, TestBytes2) {
     compressor.after_compress(compressed_buf);
     compressor.after_uncompress(decompressed_buf);
 }
+
+TEST_F(SnappyTest, AfterUncompressFreesParamNotMember) {
+    storage::SnappyCompressor compressor;
+    std::string input_a(1024, 'A');
+    std::string input_b(2048, 'B');
+    char* compressed_a = nullptr;
+    char* compressed_b = nullptr;
+    uint32_t compressed_a_len = 0;
+    uint32_t compressed_b_len = 0;
+
+    ASSERT_EQ(compressor.compress(&input_a[0], input_a.size(), compressed_a,
+                                  compressed_a_len),
+              common::E_OK);
+    ASSERT_EQ(compressor.compress(&input_b[0], input_b.size(), compressed_b,
+                                  compressed_b_len),
+              common::E_OK);
+
+    char* uncompressed_a = nullptr;
+    char* uncompressed_b = nullptr;
+    uint32_t uncompressed_a_len = 0;
+    uint32_t uncompressed_b_len = 0;
+    ASSERT_EQ(compressor.uncompress(compressed_a, compressed_a_len,
+                                    uncompressed_a, uncompressed_a_len),
+              common::E_OK);
+    ASSERT_EQ(compressor.uncompress(compressed_b, compressed_b_len,
+                                    uncompressed_b, uncompressed_b_len),
+              common::E_OK);
+
+    compressor.after_uncompress(uncompressed_a);
+    EXPECT_EQ(uncompressed_b_len, input_b.size());
+    EXPECT_EQ(memcmp(uncompressed_b, input_b.data(), uncompressed_b_len), 0);
+
+    compressor.after_uncompress(uncompressed_b);
+    compressor.after_compress(compressed_a);
+    compressor.after_compress(compressed_b);
+}
 }  // namespace
diff --git a/cpp/test/compress/uncompressed_compressor_test.cc b/cpp/test/compress/uncompressed_compressor_test.cc
new file mode 100644
index 000000000..c4f1e8ced
--- /dev/null
+++ b/cpp/test/compress/uncompressed_compressor_test.cc
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include "compress/uncompressed_compressor.h"
+
+#include <gtest/gtest.h>
+
+#include <cstring>
+
+namespace storage {
+
+// Regression: after_uncompress() used to free the cached uncompressed_buf_
+// member regardless of which buffer the caller actually passed in.  Two
+// successive uncompress() calls would cache only the second buffer; calling
+// after_uncompress(first) then freed the still-live second buffer (UAF) and
+// leaked the first.  The fix frees the parameter and only clears the
+// member when it matches.  We can't directly observe UAF in a unit test,
+// but we can verify the contract: a buffer the caller is releasing is no
+// longer used after the call, and the second buffer's contents stay
+// readable until its own after_uncompress() runs.
+TEST(UncompressedCompressorTest, AfterUncompressFreesParamNotMember) {
+    UncompressedCompressor c;
+
+    const char src_a[] = "AAAA-payload-A";
+    const char src_b[] = "BBBB-payload-B-longer";
+
+    char* uA = nullptr;
+    uint32_t lenA = 0;
+    ASSERT_EQ(
+        c.uncompress(const_cast<char*>(src_a), sizeof(src_a) - 1, uA, lenA),
+        common::E_OK);
+    ASSERT_NE(uA, nullptr);
+    ASSERT_EQ(lenA, sizeof(src_a) - 1);
+    EXPECT_EQ(memcmp(uA, src_a, lenA), 0);
+
+    char* uB = nullptr;
+    uint32_t lenB = 0;
+    ASSERT_EQ(
+        c.uncompress(const_cast<char*>(src_b), sizeof(src_b) - 1, uB, lenB),
+        common::E_OK);
+    ASSERT_NE(uB, nullptr);
+    EXPECT_NE(uA, uB);
+    EXPECT_EQ(memcmp(uB, src_b, lenB), 0);
+
+    // Release the FIRST buffer.  Under the old bug this would free uB
+    // (the member-cached pointer) and leak uA.  Under the fix it frees uA
+    // and leaves uB intact for the next read.
+    c.after_uncompress(uA);
+    // uB must still be readable — if we had freed it above, the cached
+    // member pointer would now point into freed memory and most
+    // allocators would either return the byte back to the free list or
+    // poison it.  Validate via the original content.
+    EXPECT_EQ(memcmp(uB, src_b, lenB), 0);
+
+    // Releasing uB should be a clean no-op-after on the member.
+    c.after_uncompress(uB);
+}
+
+}  // namespace storage
diff --git a/cpp/test/cwrapper/c_release_test.cc b/cpp/test/cwrapper/c_release_test.cc
index 375c7e115..bb21483f7 100644
--- a/cpp/test/cwrapper/c_release_test.cc
+++ b/cpp/test/cwrapper/c_release_test.cc
@@ -40,6 +40,7 @@ class CReleaseTest : public testing::Test {};
 
 TEST_F(CReleaseTest, TestCreateFile) {
     ERRNO error_no = RET_OK;
+    remove("create_file1.tsfile");
     // Create File and Get RET_OK
     WriteFile file = write_file_new("create_file1.tsfile", &error_no);
     ASSERT_EQ(RET_OK, error_no);
@@ -50,7 +51,8 @@ TEST_F(CReleaseTest, TestCreateFile) {
     ASSERT_EQ(RET_ALREADY_EXIST, error_no);
     ASSERT_EQ(nullptr, file);
 
-    // Folder
+    // Folder: rejected either as an open error (POSIX) or as already-existing
+    // (Windows / filesystems where the directory already exists).
     file = write_file_new("test/", &error_no);
     ASSERT_TRUE(error_no == RET_FILRET_OPEN_ERR ||
                 error_no == RET_ALREADY_EXIST);
@@ -112,6 +114,17 @@ TEST_F(CReleaseTest, TsFileWriterNew) {
     free_write_file(&file);
     remove("test_empty_writer.tsfile");
 
+    // Normal schema with memory threshold
+    file = write_file_new("test_memory_threshold_writer.tsfile", &error_code);
+    ASSERT_EQ(RET_OK, error_code);
+    writer = tsfile_writer_new_with_memory_threshold(file, &table_schema, 100,
+                                                     &error_code);
+    ASSERT_NE(nullptr, writer);
+    ASSERT_EQ(RET_OK, error_code);
+    ASSERT_EQ(RET_OK, tsfile_writer_close(writer));
+    free_write_file(&file);
+    remove("test_memory_threshold_writer.tsfile");
+
     free_table_schema(table_schema);
     free_table_schema(test_schema);
 }
@@ -142,6 +155,10 @@ TEST_F(CReleaseTest, TsFileWriterWriteDataAbnormalColumn) {
     TsFileWriter writer =
         tsfile_writer_new(file, &abnormal_schema, &error_code);
     ASSERT_EQ(RET_INVALID_SCHEMA, error_code);
+    writer = tsfile_writer_new_with_memory_threshold(file, &abnormal_schema,
+                                                     100, &error_code);
+    ASSERT_EQ(nullptr, writer);
+    ASSERT_EQ(RET_INVALID_SCHEMA, error_code);
     free(abnormal_schema.column_schemas[2].column_name);
 
     abnormal_schema.column_schemas[2] =
@@ -150,6 +167,10 @@ TEST_F(CReleaseTest, TsFileWriterWriteDataAbnormalColumn) {
     // datatype conflict
     writer = tsfile_writer_new(file, &abnormal_schema, &error_code);
     ASSERT_EQ(RET_INVALID_SCHEMA, error_code);
+    writer = tsfile_writer_new_with_memory_threshold(file, &abnormal_schema,
+                                                     100, &error_code);
+    ASSERT_EQ(nullptr, writer);
+    ASSERT_EQ(RET_INVALID_SCHEMA, error_code);
 
     free(abnormal_schema.column_schemas[1].column_name);
     abnormal_schema.column_schemas[1] =
@@ -388,4 +409,4 @@ TEST_F(CReleaseTest, TsFileWriterConfTest) {
     remove("plain_file.tsfile");
 }
 
-}  // namespace CReleaseTest
\ No newline at end of file
+}  // namespace CReleaseTest
diff --git a/cpp/test/cwrapper/cwrapper_test.cc b/cpp/test/cwrapper/cwrapper_test.cc
index 9cf06d2f8..2ac6cad21 100644
--- a/cpp/test/cwrapper/cwrapper_test.cc
+++ b/cpp/test/cwrapper/cwrapper_test.cc
@@ -314,4 +314,155 @@ TEST_F(CWrapperTest, WriterFlushTabletAndReadData) {
     free(data_types);
     free_write_file(&file);
 }
-}  // namespace cwrapper
\ No newline at end of file
+
+// Regression: tsfile_writer_new_with_memory_threshold() had its duplicate-
+// column check inverted (`==` instead of `!=`), so the very first column
+// always looked like a duplicate and the constructor returned
+// E_INVALID_SCHEMA before any legitimate schema could be used.  Compare to
+// tsfile_writer_new() in the same file which had the correct check.
+TEST(TsFileWriterCApiTest, NewWithMemoryThresholdAcceptsValidSchema) {
+    const char* path = "cwrapper_writer_with_threshold_smoke.tsfile";
+    remove(path);
+    ERRNO code = 0;
+    WriteFile file = write_file_new(path, &code);
+    ASSERT_EQ(code, RET_OK);
+
+    const int column_num = 3;
+    TableSchema schema;
+    schema.table_name = strdup("t");
+    schema.column_num = column_num;
+    schema.column_schemas =
+        static_cast<ColumnSchema*>(malloc(sizeof(ColumnSchema) * column_num));
+    schema.column_schemas[0] =
+        ColumnSchema{strdup("id1"), TS_DATATYPE_STRING, TAG};
+    schema.column_schemas[1] =
+        ColumnSchema{strdup("s1"), TS_DATATYPE_INT64, FIELD};
+    schema.column_schemas[2] =
+        ColumnSchema{strdup("s2"), TS_DATATYPE_DOUBLE, FIELD};
+
+    TsFileWriter writer = tsfile_writer_new_with_memory_threshold(
+        file, &schema, 1024 * 1024, &code);
+    EXPECT_NE(writer, nullptr) << "constructor refused a valid 3-column schema";
+    EXPECT_EQ(code, RET_OK);
+
+    // Duplicate column triggers the now-correct path.
+    TableSchema dup;
+    dup.table_name = strdup("t");
+    dup.column_num = 2;
+    dup.column_schemas =
+        static_cast<ColumnSchema*>(malloc(sizeof(ColumnSchema) * 2));
+    dup.column_schemas[0] =
+        ColumnSchema{strdup("s1"), TS_DATATYPE_INT64, FIELD};
+    dup.column_schemas[1] =
+        ColumnSchema{strdup("s1"), TS_DATATYPE_INT64, FIELD};
+    ERRNO dup_code = 0;
+    TsFileWriter dup_writer = tsfile_writer_new_with_memory_threshold(
+        file, &dup, 1024 * 1024, &dup_code);
+    EXPECT_EQ(dup_writer, nullptr);
+    EXPECT_EQ(dup_code, common::E_INVALID_SCHEMA);
+
+    if (writer != nullptr) {
+        tsfile_writer_close(writer);
+    }
+    free_table_schema(schema);
+    free_table_schema(dup);
+    free_write_file(&file);
+    remove(path);
+}
+
+// Regression: tsfile_writer_new / tsfile_writer_new_with_memory_threshold /
+// _tsfile_writer_register_table used to dereference null inputs directly,
+// crashing the host process.  Each now reports E_INVALID_ARG (or returns
+// nullptr when err_code itself is null) instead of segfaulting.
+TEST(TsFileWriterCApiTest, RejectsNullInputs) {
+    ERRNO err = 0;
+
+    // tsfile_writer_new: null file
+    EXPECT_EQ(
+        tsfile_writer_new(nullptr, reinterpret_cast<TableSchema*>(1), &err),
+        nullptr);
+    EXPECT_EQ(err, common::E_INVALID_ARG);
+
+    // tsfile_writer_new: null schema
+    err = 0;
+    EXPECT_EQ(tsfile_writer_new(reinterpret_cast<WriteFile>(1), nullptr, &err),
+              nullptr);
+    EXPECT_EQ(err, common::E_INVALID_ARG);
+
+    // tsfile_writer_new: null err_code
+    EXPECT_EQ(tsfile_writer_new(nullptr, nullptr, nullptr), nullptr);
+
+    // tsfile_writer_new_with_memory_threshold: same checks
+    err = 0;
+    EXPECT_EQ(tsfile_writer_new_with_memory_threshold(
+                  nullptr, reinterpret_cast<TableSchema*>(1), 1024, &err),
+              nullptr);
+    EXPECT_EQ(err, common::E_INVALID_ARG);
+
+    // _tsfile_writer_register_table: nulls
+    EXPECT_EQ(_tsfile_writer_register_table(nullptr,
+                                            reinterpret_cast<TableSchema*>(1)),
+              common::E_INVALID_ARG);
+    EXPECT_EQ(_tsfile_writer_register_table(reinterpret_cast<TsFileWriter>(1),
+                                            nullptr),
+              common::E_INVALID_ARG);
+}
+
+// Regression: the tag-filter C API used to dereference a null reader and
+// pass null char pointers straight to std::string(), crashing the host
+// process.  Each entry point must now return nullptr / E_INVALID_ARG on
+// missing inputs instead of segfaulting.  This test only checks the guards
+// are in place — it deliberately never touches a real reader.
+TEST(TagFilterCApiTest, RejectsNullInputs) {
+    const char* table = "t";
+    const char* col = "c";
+    const char* val = "v";
+
+    EXPECT_EQ(tsfile_tag_filter_eq(nullptr, table, col, val), nullptr);
+    EXPECT_EQ(tsfile_tag_filter_eq(reinterpret_cast<TsFileReader>(1), nullptr,
+                                   col, val),
+              nullptr);
+    EXPECT_EQ(tsfile_tag_filter_eq(reinterpret_cast<TsFileReader>(1), table,
+                                   nullptr, val),
+              nullptr);
+    EXPECT_EQ(tsfile_tag_filter_eq(reinterpret_cast<TsFileReader>(1), table,
+                                   col, nullptr),
+              nullptr);
+
+    EXPECT_EQ(tsfile_tag_filter_neq(nullptr, table, col, val), nullptr);
+    EXPECT_EQ(tsfile_tag_filter_lt(nullptr, table, col, val), nullptr);
+    EXPECT_EQ(tsfile_tag_filter_lteq(nullptr, table, col, val), nullptr);
+    EXPECT_EQ(tsfile_tag_filter_gt(nullptr, table, col, val), nullptr);
+    EXPECT_EQ(tsfile_tag_filter_gteq(nullptr, table, col, val), nullptr);
+
+    ERRNO err = common::E_OK;
+    EXPECT_EQ(
+        tsfile_tag_filter_create(nullptr, table, col, val, TAG_FILTER_EQ, &err),
+        nullptr);
+    EXPECT_EQ(err, common::E_INVALID_ARG);
+
+    err = common::E_OK;
+    EXPECT_EQ(tsfile_tag_filter_create(reinterpret_cast<TsFileReader>(1),
+                                       nullptr, col, val, TAG_FILTER_EQ, &err),
+              nullptr);
+    EXPECT_EQ(err, common::E_INVALID_ARG);
+
+    err = common::E_OK;
+    EXPECT_EQ(tsfile_tag_filter_create(reinterpret_cast<TsFileReader>(1), table,
+                                       nullptr, val, TAG_FILTER_EQ, &err),
+              nullptr);
+    EXPECT_EQ(err, common::E_INVALID_ARG);
+
+    err = common::E_OK;
+    EXPECT_EQ(tsfile_tag_filter_create(reinterpret_cast<TsFileReader>(1), table,
+                                       col, nullptr, TAG_FILTER_EQ, &err),
+              nullptr);
+    EXPECT_EQ(err, common::E_INVALID_ARG);
+
+    // err_code itself is null — must not crash, must return null.
+    EXPECT_EQ(tsfile_tag_filter_create(reinterpret_cast<TsFileReader>(1), table,
+                                       col, val, TAG_FILTER_EQ, nullptr),
+              nullptr);
+}
+
+}  // namespace cwrapper
diff --git a/cpp/test/cwrapper/query_by_row_cwrapper_test.cc b/cpp/test/cwrapper/query_by_row_cwrapper_test.cc
index 3de447ffd..4983c57ea 100644
--- a/cpp/test/cwrapper/query_by_row_cwrapper_test.cc
+++ b/cpp/test/cwrapper/query_by_row_cwrapper_test.cc
@@ -217,7 +217,7 @@ TEST_F(CWrapperQueryByRowTest, TableByRowOffsetLimit) {
     const int limit = 5;
     ResultSet rs = tsfile_reader_query_table_by_row(reader, table_name.c_str(),
                                                     column_names_c, 2, offset,
-                                                    limit, NULL, 0, &code);
+                                                    limit, nullptr, 0, &code);
     ASSERT_EQ(code, RET_OK);
     ASSERT_NE(rs, nullptr);
 
diff --git a/cpp/test/encoding/encoding_coverage_test.cc b/cpp/test/encoding/encoding_coverage_test.cc
new file mode 100644
index 000000000..6970b9387
--- /dev/null
+++ b/cpp/test/encoding/encoding_coverage_test.cc
@@ -0,0 +1,406 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+// Targeted coverage tests that exercise paths missed by the per-codec
+// roundtrip tests: type-mismatch error returns, has_remaining variants,
+// SIMD/scalar batch branches, floating-point special values, dictionary
+// decoder/encoder, and reset cycles.
+
+#include <cmath>
+#include <limits>
+#include <vector>
+
+#include "common/allocator/byte_stream.h"
+#include "encoding/dictionary_decoder.h"
+#include "encoding/dictionary_encoder.h"
+#include "encoding/gorilla_decoder.h"
+#include "encoding/gorilla_encoder.h"
+#include "encoding/int32_rle_decoder.h"
+#include "encoding/int32_rle_encoder.h"
+#include "encoding/int64_rle_decoder.h"
+#include "encoding/int64_rle_encoder.h"
+#include "encoding/plain_decoder.h"
+#include "encoding/plain_encoder.h"
+#include "encoding/ts2diff_decoder.h"
+#include "encoding/ts2diff_encoder.h"
+#include "encoding/zigzag_decoder.h"
+#include "encoding/zigzag_encoder.h"
+#include "gtest/gtest.h"
+
+namespace storage {
+
+// ── Type-mismatch returns ────────────────────────────────────────────────
+//
+// Every codec exposes read_boolean / read_int32 / read_int64 / read_float /
+// read_double / read_String. Most of them only implement one or two and
+// return E_TYPE_NOT_MATCH for the rest, but those return paths were never
+// hit by the existing per-codec tests (which only call the one supported
+// method per codec).
+TEST(EncodingCoverage, TypeMismatchReturnsAreReachable) {
+    common::ByteStream s(64, common::MOD_DEFAULT);
+    common::PageArena pa;
+    pa.init(512, common::MOD_DEFAULT);
+    bool b;
+    float f;
+    double d;
+    int64_t i64;
+    common::String str;
+
+    // Each decoder returns an error sentinel (E_TYPE_NOT_MATCH or
+    // E_NOT_SUPPORT depending on codec) for the read_* variants it
+    // doesn't implement.  We only care that the unsupported path returns
+    // an error rather than a corrupted value.  Note that GorillaDecoder
+    // implements its unsupported paths with `ASSERT(false)`; calling
+    // those in Debug builds aborts, so we exercise only the codecs that
+    // return cleanly (Zigzag, RLE).
+    auto NE_OK = [](int r) { EXPECT_NE(r, common::E_OK); };
+    IntZigzagDecoder zz;
+    NE_OK(zz.read_boolean(b, s));
+    NE_OK(zz.read_float(f, s));
+    NE_OK(zz.read_double(d, s));
+    NE_OK(zz.read_String(str, pa, s));
+
+    Int32RleDecoder rle32;
+    NE_OK(rle32.read_int64(i64, s));
+    NE_OK(rle32.read_float(f, s));
+    NE_OK(rle32.read_double(d, s));
+    NE_OK(rle32.read_String(str, pa, s));
+
+    Int64RleDecoder rle64;
+    int32_t i32;
+    NE_OK(rle64.read_boolean(b, s));
+    NE_OK(rle64.read_int32(i32, s));
+    NE_OK(rle64.read_float(f, s));
+    NE_OK(rle64.read_double(d, s));
+    NE_OK(rle64.read_String(str, pa, s));
+    (void)i32;
+    (void)i64;
+}
+
+// ── Reset cycles ────────────────────────────────────────────────────────
+//
+// Each codec defines a reset() that resets internal state; nothing in the
+// roundtrip tests calls it.  Encode → reset → re-encode should still
+// produce a stream that decodes to the second batch's values.
+TEST(EncodingCoverage, ResetClearsState) {
+    {
+        IntZigzagEncoder enc;
+        IntZigzagDecoder dec;
+        common::ByteStream s(64, common::MOD_DEFAULT);
+        EXPECT_EQ(enc.encode(123, s), common::E_OK);
+        enc.flush(s);
+        EXPECT_EQ(dec.decode(s), 123);
+        dec.reset();
+        common::ByteStream s2(64, common::MOD_DEFAULT);
+        EXPECT_EQ(enc.encode(-456, s2), common::E_OK);
+        enc.flush(s2);
+        EXPECT_EQ(dec.decode(s2), -456);
+    }
+    {
+        IntGorillaEncoder enc;
+        IntGorillaDecoder dec;
+        common::ByteStream s(64, common::MOD_DEFAULT);
+        EXPECT_EQ(enc.encode(7, s), common::E_OK);
+        EXPECT_EQ(enc.encode(7, s), common::E_OK);
+        enc.flush(s);
+        int32_t v;
+        EXPECT_EQ(dec.read_int32(v, s), common::E_OK);
+        EXPECT_EQ(v, 7);
+        dec.reset();
+        enc.reset();
+        common::ByteStream s2(64, common::MOD_DEFAULT);
+        EXPECT_EQ(enc.encode(42, s2), common::E_OK);
+        EXPECT_EQ(enc.encode(42, s2), common::E_OK);
+        enc.flush(s2);
+        EXPECT_EQ(dec.read_int32(v, s2), common::E_OK);
+        EXPECT_EQ(v, 42);
+    }
+}
+
+// ── has_remaining variants ──────────────────────────────────────────────
+TEST(EncodingCoverage, HasRemainingOnEmptyAndAfterDrain) {
+    common::ByteStream empty(64, common::MOD_DEFAULT);
+    {
+        IntZigzagDecoder zz;
+        EXPECT_FALSE(zz.has_remaining(empty));
+    }
+    {
+        IntGorillaDecoder g;
+        EXPECT_FALSE(g.has_remaining(empty));
+    }
+    {
+        Int32RleDecoder rle;
+        EXPECT_FALSE(rle.has_remaining(empty));
+    }
+    {
+        TS2DIFFDecoder<int32_t> t;
+        EXPECT_FALSE(t.has_remaining(empty));
+    }
+    {
+        PlainDecoder p;
+        EXPECT_FALSE(p.has_remaining(empty));
+    }
+}
+
+// ── Gorilla floating-point special values ──────────────────────────────
+//
+// FloatGorillaDecoder / DoubleGorillaDecoder run different VALUE_BITS and
+// ending-sentinel paths.  Verify they round-trip NaN, infinity, -0.0 and
+// denormals — none of which the existing happy-path roundtrip exercises.
+TEST(EncodingCoverage, GorillaFloatSpecialValues) {
+    FloatGorillaEncoder enc;
+    common::ByteStream s(256, common::MOD_DEFAULT);
+    std::vector<float> values = {
+        0.0f,
+        -0.0f,
+        std::numeric_limits<float>::infinity(),
+        -std::numeric_limits<float>::infinity(),
+        std::numeric_limits<float>::min(),
+        std::numeric_limits<float>::denorm_min(),
+        std::numeric_limits<float>::epsilon(),
+        1.0f,
+        -1.0f,
+        std::numeric_limits<float>::max(),
+        std::numeric_limits<float>::lowest(),
+    };
+    for (float v : values) ASSERT_EQ(enc.encode(v, s), common::E_OK);
+    enc.flush(s);
+
+    FloatGorillaDecoder dec;
+    float out;
+    for (size_t i = 0; i < values.size(); i++) {
+        ASSERT_EQ(dec.read_float(out, s), common::E_OK) << "i=" << i;
+        if (std::isnan(values[i])) {
+            EXPECT_TRUE(std::isnan(out));
+        } else {
+            // Bitwise compare to catch -0.0 vs 0.0 etc.
+            uint32_t a, b;
+            memcpy(&a, &values[i], sizeof(float));
+            memcpy(&b, &out, sizeof(float));
+            EXPECT_EQ(a, b) << "i=" << i;
+        }
+    }
+}
+
+TEST(EncodingCoverage, GorillaDoubleSpecialValues) {
+    DoubleGorillaEncoder enc;
+    common::ByteStream s(256, common::MOD_DEFAULT);
+    std::vector<double> values = {
+        0.0,
+        -0.0,
+        std::numeric_limits<double>::infinity(),
+        -std::numeric_limits<double>::infinity(),
+        std::numeric_limits<double>::min(),
+        std::numeric_limits<double>::denorm_min(),
+        std::numeric_limits<double>::epsilon(),
+        1.0,
+        -1.0,
+        std::numeric_limits<double>::max(),
+        std::numeric_limits<double>::lowest(),
+    };
+    for (double v : values) ASSERT_EQ(enc.encode(v, s), common::E_OK);
+    enc.flush(s);
+
+    DoubleGorillaDecoder dec;
+    double out;
+    for (size_t i = 0; i < values.size(); i++) {
+        ASSERT_EQ(dec.read_double(out, s), common::E_OK) << "i=" << i;
+        uint64_t a, b;
+        memcpy(&a, &values[i], sizeof(double));
+        memcpy(&b, &out, sizeof(double));
+        EXPECT_EQ(a, b) << "i=" << i;
+    }
+}
+
+// ── Gorilla skip path ───────────────────────────────────────────────────
+TEST(EncodingCoverage, GorillaSkipInt32Roundtrip) {
+    IntGorillaEncoder enc;
+    common::ByteStream stream(1024, common::MOD_DEFAULT);
+    const int N = 200;
+    std::vector<int32_t> values(N);
+    for (int i = 0; i < N; i++) {
+        values[i] = i * 11 - 5;
+        ASSERT_EQ(enc.encode(values[i], stream), common::E_OK);
+    }
+    enc.flush(stream);
+
+    // Wrap into contiguous buffer for batch_skip_raw.
+    uint32_t total = stream.total_size();
+    std::vector<uint8_t> buf(total);
+    uint32_t got = 0;
+    stream.read_buf(buf.data(), total, got);
+    common::ByteStream wrapped(common::MOD_DEFAULT);
+    wrapped.wrap_from((const char*)buf.data(), total);
+
+    IntGorillaDecoder dec;
+    int skipped = 0;
+    ASSERT_EQ(dec.skip_int32(50, skipped, wrapped), common::E_OK);
+    EXPECT_EQ(skipped, 50);
+    int32_t out[N];
+    int actual = 0;
+    ASSERT_EQ(dec.read_batch_int32(out, N - 50, actual, wrapped), common::E_OK);
+    EXPECT_EQ(actual, N - 50);
+    for (int i = 0; i < N - 50; i++) {
+        EXPECT_EQ(out[i], values[50 + i]) << "i=" << i;
+    }
+}
+
+// ── TS2DIFF batch decode hits SIMD block + scalar tail ─────────────────
+TEST(EncodingCoverage, TS2DIFFBatchInt32MultipleBlocks) {
+    TS2DIFFEncoder<int32_t> enc;
+    common::ByteStream s(8192, common::MOD_DEFAULT);
+    // Encode 500 values to span ~4 blocks (default block size 128).
+    const int N = 500;
+    std::vector<int32_t> values(N);
+    for (int i = 0; i < N; i++) {
+        values[i] = i * 7 + 3;
+        ASSERT_EQ(enc.encode(values[i], s), common::E_OK);
+    }
+    ASSERT_EQ(enc.flush(s), common::E_OK);
+
+    // Wrap-from for the SIMD/scalar block fast path.
+    uint32_t total = s.total_size();
+    std::vector<uint8_t> buf(total);
+    uint32_t got = 0;
+    s.read_buf(buf.data(), total, got);
+    common::ByteStream wrapped(common::MOD_DEFAULT);
+    wrapped.wrap_from((const char*)buf.data(), total);
+
+    TS2DIFFDecoder<int32_t> dec;
+    std::vector<int32_t> out(N);
+    int total_decoded = 0;
+    while (dec.has_remaining(wrapped) && total_decoded < N) {
+        int actual = 0;
+        ASSERT_EQ(dec.read_batch_int32(out.data() + total_decoded,
+                                       N - total_decoded, actual, wrapped),
+                  common::E_OK);
+        if (actual == 0) break;
+        total_decoded += actual;
+    }
+    EXPECT_EQ(total_decoded, N);
+    for (int i = 0; i < N; i++) EXPECT_EQ(out[i], values[i]) << "i=" << i;
+}
+
+TEST(EncodingCoverage, TS2DIFFBatchInt64MultipleBlocks) {
+    TS2DIFFEncoder<int64_t> enc;
+    common::ByteStream s(8192, common::MOD_DEFAULT);
+    const int N = 500;
+    std::vector<int64_t> values(N);
+    for (int i = 0; i < N; i++) {
+        values[i] = static_cast<int64_t>(i) * 17 + 41;
+        ASSERT_EQ(enc.encode(values[i], s), common::E_OK);
+    }
+    ASSERT_EQ(enc.flush(s), common::E_OK);
+
+    uint32_t total = s.total_size();
+    std::vector<uint8_t> buf(total);
+    uint32_t got = 0;
+    s.read_buf(buf.data(), total, got);
+    common::ByteStream wrapped(common::MOD_DEFAULT);
+    wrapped.wrap_from((const char*)buf.data(), total);
+
+    TS2DIFFDecoder<int64_t> dec;
+    std::vector<int64_t> out(N);
+    int total_decoded = 0;
+    while (dec.has_remaining(wrapped) && total_decoded < N) {
+        int actual = 0;
+        ASSERT_EQ(dec.read_batch_int64(out.data() + total_decoded,
+                                       N - total_decoded, actual, wrapped),
+                  common::E_OK);
+        if (actual == 0) break;
+        total_decoded += actual;
+    }
+    EXPECT_EQ(total_decoded, N);
+    for (int i = 0; i < N; i++) EXPECT_EQ(out[i], values[i]) << "i=" << i;
+}
+
+// ── Plain encoder: encode_batch fast paths for each type ───────────────
+TEST(EncodingCoverage, PlainEncoderBatchAllTypes) {
+    PlainEncoder enc;
+    PlainDecoder dec;
+
+    // Float batch.
+    {
+        common::ByteStream s(1024, common::MOD_DEFAULT);
+        const uint32_t N = 100;
+        float v[N];
+        for (uint32_t i = 0; i < N; i++) v[i] = i * 0.5f - 1.0f;
+        ASSERT_EQ(enc.encode_batch(v, N, s), common::E_OK);
+        float out[N];
+        int actual = 0;
+        ASSERT_EQ(dec.read_batch_float(out, N, actual, s), common::E_OK);
+        EXPECT_EQ(actual, static_cast<int>(N));
+        for (uint32_t i = 0; i < N; i++) EXPECT_FLOAT_EQ(out[i], v[i]);
+    }
+    // Int64 batch.
+    {
+        common::ByteStream s(1024, common::MOD_DEFAULT);
+        const uint32_t N = 100;
+        int64_t v[N];
+        for (uint32_t i = 0; i < N; i++) v[i] = i * 1000 - 50;
+        ASSERT_EQ(enc.encode_batch(v, N, s), common::E_OK);
+        int64_t out[N];
+        int actual = 0;
+        ASSERT_EQ(dec.read_batch_int64(out, N, actual, s), common::E_OK);
+        EXPECT_EQ(actual, static_cast<int>(N));
+        for (uint32_t i = 0; i < N; i++) EXPECT_EQ(out[i], v[i]);
+    }
+}
+
+// ── PlainDecoder skip paths (wrapped + paged) ──────────────────────────
+TEST(EncodingCoverage, PlainSkipPagedStream) {
+    PlainEncoder enc;
+    PlainDecoder dec;
+    // Paged ByteStream (tiny page) forces the fallback path.
+    common::ByteStream s(16, common::MOD_DEFAULT);
+    for (int i = 0; i < 32; i++)
+        ASSERT_EQ(enc.encode((int64_t)i, s), common::E_OK);
+    int skipped = 0;
+    ASSERT_EQ(dec.skip_int64(10, skipped, s), common::E_OK);
+    EXPECT_EQ(skipped, 10);
+    int64_t out;
+    ASSERT_EQ(dec.read_int64(out, s), common::E_OK);
+    EXPECT_EQ(out, 10);
+}
+
+// ── Dictionary codec roundtrip ─────────────────────────────────────────
+TEST(EncodingCoverage, DictionaryStringRoundTrip) {
+    DictionaryEncoder enc;
+    common::ByteStream s(1024, common::MOD_DEFAULT);
+
+    std::vector<std::string> raw = {"apple",  "banana", "apple",
+                                    "cherry", "banana", "apple"};
+    for (const auto& r : raw) {
+        common::String str(const_cast<char*>(r.c_str()), r.size());
+        ASSERT_EQ(enc.encode(str, s), common::E_OK);
+    }
+    enc.flush(s);
+
+    DictionaryDecoder dec;
+    common::PageArena pa;
+    pa.init(512, common::MOD_DEFAULT);
+    for (const auto& r : raw) {
+        common::String out;
+        ASSERT_EQ(dec.read_String(out, pa, s), common::E_OK);
+        ASSERT_EQ(out.len_, r.size());
+        EXPECT_EQ(std::string(out.buf_, out.len_), r);
+    }
+}
+
+}  // namespace storage
diff --git a/cpp/test/encoding/gorilla_codec_test.cc b/cpp/test/encoding/gorilla_codec_test.cc
index 47056a6db..945451088 100644
--- a/cpp/test/encoding/gorilla_codec_test.cc
+++ b/cpp/test/encoding/gorilla_codec_test.cc
@@ -207,4 +207,319 @@ TEST_F(GorillaCodecTest, DoubleEncodingDecodingBoundaryValues) {
     }
 }
 
+// ── Batch decode tests (exercises the raw-pointer GorillaBitReader path) ──
+
+TEST_F(GorillaCodecTest, Int32BatchDecode) {
+    storage::IntGorillaEncoder encoder;
+    common::ByteStream stream(1024, common::MOD_DEFAULT);
+    const int N = 500;
+    int32_t expected[N];
+    for (int i = 0; i < N; i++) {
+        expected[i] = i * 7 - 100;
+        EXPECT_EQ(encoder.encode(expected[i], stream), common::E_OK);
+    }
+    encoder.flush(stream);
+
+    // Copy to a contiguous buffer and wrap (simulates production path)
+    uint32_t total = stream.total_size();
+    std::vector<uint8_t> buf(total);
+    uint32_t got = 0;
+    stream.read_buf(buf.data(), total, got);
+    ASSERT_EQ(got, total);
+
+    common::ByteStream wrapped(common::MOD_DEFAULT);
+    wrapped.wrap_from((const char*)buf.data(), total);
+
+    storage::IntGorillaDecoder decoder;
+    int32_t out[N];
+    int total_decoded = 0;
+    while (decoder.has_remaining(wrapped) && total_decoded < N) {
+        int batch = std::min(129, N - total_decoded);
+        int actual = 0;
+        EXPECT_EQ(decoder.read_batch_int32(out + total_decoded, batch, actual,
+                                           wrapped),
+                  common::E_OK);
+        if (actual == 0) break;
+        total_decoded += actual;
+    }
+    ASSERT_EQ(total_decoded, N);
+    for (int i = 0; i < N; i++) {
+        EXPECT_EQ(out[i], expected[i]) << "mismatch at index " << i;
+    }
+}
+
+TEST_F(GorillaCodecTest, Int64BatchDecode) {
+    storage::LongGorillaEncoder encoder;
+    common::ByteStream stream(1024, common::MOD_DEFAULT);
+    const int N = 500;
+    int64_t expected[N];
+    for (int i = 0; i < N; i++) {
+        expected[i] = (int64_t)i * 13 - 200;
+        EXPECT_EQ(encoder.encode(expected[i], stream), common::E_OK);
+    }
+    encoder.flush(stream);
+
+    uint32_t total = stream.total_size();
+    std::vector<uint8_t> buf(total);
+    uint32_t got = 0;
+    stream.read_buf(buf.data(), total, got);
+
+    common::ByteStream wrapped(common::MOD_DEFAULT);
+    wrapped.wrap_from((const char*)buf.data(), total);
+
+    storage::LongGorillaDecoder decoder;
+    int64_t out[N];
+    int total_decoded = 0;
+    while (decoder.has_remaining(wrapped) && total_decoded < N) {
+        int batch = std::min(129, N - total_decoded);
+        int actual = 0;
+        EXPECT_EQ(decoder.read_batch_int64(out + total_decoded, batch, actual,
+                                           wrapped),
+                  common::E_OK);
+        if (actual == 0) break;
+        total_decoded += actual;
+    }
+    ASSERT_EQ(total_decoded, N);
+    for (int i = 0; i < N; i++) {
+        EXPECT_EQ(out[i], expected[i]) << "mismatch at index " << i;
+    }
+}
+
+TEST_F(GorillaCodecTest, FloatBatchDecode) {
+    storage::FloatGorillaEncoder encoder;
+    common::ByteStream stream(1024, common::MOD_DEFAULT);
+    const int N = 300;
+    std::vector<float> expected(N);
+    for (int i = 0; i < N; i++) {
+        expected[i] = (float)i * 1.5f - 50.0f;
+        EXPECT_EQ(encoder.encode(expected[i], stream), common::E_OK);
+    }
+    encoder.flush(stream);
+
+    uint32_t total = stream.total_size();
+    std::vector<uint8_t> buf(total);
+    uint32_t got = 0;
+    stream.read_buf(buf.data(), total, got);
+
+    common::ByteStream wrapped(common::MOD_DEFAULT);
+    wrapped.wrap_from((const char*)buf.data(), total);
+
+    storage::FloatGorillaDecoder decoder;
+    std::vector<float> out(N);
+    int total_decoded = 0;
+    while (decoder.has_remaining(wrapped) && total_decoded < N) {
+        int batch = std::min(129, N - total_decoded);
+        int actual = 0;
+        EXPECT_EQ(decoder.read_batch_float(out.data() + total_decoded, batch,
+                                           actual, wrapped),
+                  common::E_OK);
+        if (actual == 0) break;
+        total_decoded += actual;
+    }
+    ASSERT_EQ(total_decoded, N);
+    for (int i = 0; i < N; i++) {
+        EXPECT_FLOAT_EQ(out[i], expected[i]) << "mismatch at index " << i;
+    }
+}
+
+TEST_F(GorillaCodecTest, DoubleBatchDecode) {
+    storage::DoubleGorillaEncoder encoder;
+    common::ByteStream stream(1024, common::MOD_DEFAULT);
+    const int N = 300;
+    std::vector<double> expected(N);
+    for (int i = 0; i < N; i++) {
+        expected[i] = (double)i * 2.7 - 100.0;
+        EXPECT_EQ(encoder.encode(expected[i], stream), common::E_OK);
+    }
+    encoder.flush(stream);
+
+    uint32_t total = stream.total_size();
+    std::vector<uint8_t> buf(total);
+    uint32_t got = 0;
+    stream.read_buf(buf.data(), total, got);
+
+    common::ByteStream wrapped(common::MOD_DEFAULT);
+    wrapped.wrap_from((const char*)buf.data(), total);
+
+    storage::DoubleGorillaDecoder decoder;
+    std::vector<double> out(N);
+    int total_decoded = 0;
+    while (decoder.has_remaining(wrapped) && total_decoded < N) {
+        int batch = std::min(129, N - total_decoded);
+        int actual = 0;
+        EXPECT_EQ(decoder.read_batch_double(out.data() + total_decoded, batch,
+                                            actual, wrapped),
+                  common::E_OK);
+        if (actual == 0) break;
+        total_decoded += actual;
+    }
+    ASSERT_EQ(total_decoded, N);
+    for (int i = 0; i < N; i++) {
+        EXPECT_DOUBLE_EQ(out[i], expected[i]) << "mismatch at index " << i;
+    }
+}
+
+TEST_F(GorillaCodecTest, Int32BatchSkip) {
+    storage::IntGorillaEncoder encoder;
+    common::ByteStream stream(1024, common::MOD_DEFAULT);
+    const int N = 200;
+    int32_t expected[N];
+    for (int i = 0; i < N; i++) {
+        expected[i] = i * 3;
+        EXPECT_EQ(encoder.encode(expected[i], stream), common::E_OK);
+    }
+    encoder.flush(stream);
+
+    uint32_t total = stream.total_size();
+    std::vector<uint8_t> buf(total);
+    uint32_t got = 0;
+    stream.read_buf(buf.data(), total, got);
+
+    common::ByteStream wrapped(common::MOD_DEFAULT);
+    wrapped.wrap_from((const char*)buf.data(), total);
+
+    storage::IntGorillaDecoder decoder;
+    // Skip first 50 values
+    int skipped = 0;
+    EXPECT_EQ(decoder.skip_int32(50, skipped, wrapped), common::E_OK);
+    EXPECT_EQ(skipped, 50);
+    // Read next 50 values
+    int32_t out[50];
+    int actual = 0;
+    EXPECT_EQ(decoder.read_batch_int32(out, 50, actual, wrapped), common::E_OK);
+    EXPECT_EQ(actual, 50);
+    for (int i = 0; i < 50; i++) {
+        EXPECT_EQ(out[i], expected[50 + i]) << "mismatch at index " << i;
+    }
+}
+
+// Regression: batch_decode_raw used to write out[0] unconditionally in the
+// bootstrap branch, even when capacity was 0. Verify the entry path early
+// returns and leaves the stream + state untouched.
+TEST_F(GorillaCodecTest, Int32BatchDecodeZeroCapacity) {
+    storage::IntGorillaEncoder encoder;
+    common::ByteStream stream(1024, common::MOD_DEFAULT);
+    const int N = 8;
+    for (int i = 0; i < N; i++) {
+        ASSERT_EQ(encoder.encode(i, stream), common::E_OK);
+    }
+    encoder.flush(stream);
+
+    uint32_t total = stream.total_size();
+    std::vector<uint8_t> buf(total);
+    uint32_t got = 0;
+    stream.read_buf(buf.data(), total, got);
+    common::ByteStream wrapped(common::MOD_DEFAULT);
+    wrapped.wrap_from((const char*)buf.data(), total);
+
+    storage::IntGorillaDecoder decoder;
+    int32_t sentinel[1] = {0x7fffffff};
+    int actual = 42;
+    EXPECT_EQ(decoder.read_batch_int32(sentinel, 0, actual, wrapped),
+              common::E_OK);
+    EXPECT_EQ(actual, 0);
+    EXPECT_EQ(sentinel[0], 0x7fffffff);  // not written
+
+    // Followup decode should still read the first value 0.
+    int32_t out[N];
+    int got_actual = 0;
+    EXPECT_EQ(decoder.read_batch_int32(out, N, got_actual, wrapped),
+              common::E_OK);
+    EXPECT_EQ(got_actual, N);
+    for (int i = 0; i < N; i++) EXPECT_EQ(out[i], i);
+}
+
+TEST_F(GorillaCodecTest, Int64BatchDecodeZeroCapacity) {
+    storage::LongGorillaEncoder encoder;
+    common::ByteStream stream(1024, common::MOD_DEFAULT);
+    for (int i = 0; i < 8; i++) {
+        ASSERT_EQ(encoder.encode(static_cast<int64_t>(i), stream),
+                  common::E_OK);
+    }
+    encoder.flush(stream);
+
+    uint32_t total = stream.total_size();
+    std::vector<uint8_t> buf(total);
+    uint32_t got = 0;
+    stream.read_buf(buf.data(), total, got);
+    common::ByteStream wrapped(common::MOD_DEFAULT);
+    wrapped.wrap_from((const char*)buf.data(), total);
+
+    storage::LongGorillaDecoder decoder;
+    int64_t sentinel[1] = {0x7fffffffffffffffLL};
+    int actual = 42;
+    EXPECT_EQ(decoder.read_batch_int64(sentinel, 0, actual, wrapped),
+              common::E_OK);
+    EXPECT_EQ(actual, 0);
+    EXPECT_EQ(sentinel[0], 0x7fffffffffffffffLL);  // not written
+}
+
+// Regression: a truncated Gorilla page used to spin GorillaBitReader::read_long
+// forever (bits stays 0, n -= 0 never decreases) and GorillaBitReader::read_bit
+// would compute (cur_byte >> -1).  batch_decode_raw must now surface
+// E_BUF_NOT_ENOUGH instead of looping.
+TEST_F(GorillaCodecTest, Int32BatchDecodeTruncatedInputReturnsError) {
+    // Encode enough values to fill several bits, then chop the buffer down to
+    // a small prefix so the decoder runs out of bits mid-value.
+    storage::IntGorillaEncoder encoder;
+    common::ByteStream stream(1024, common::MOD_DEFAULT);
+    const int N = 32;
+    for (int i = 0; i < N; i++) {
+        ASSERT_EQ(encoder.encode(i * 11 + 3, stream), common::E_OK);
+    }
+    encoder.flush(stream);
+
+    uint32_t total = stream.total_size();
+    ASSERT_GT(total, 4u);
+    std::vector<uint8_t> buf(total);
+    uint32_t got = 0;
+    stream.read_buf(buf.data(), total, got);
+    ASSERT_EQ(got, total);
+
+    // 3 bytes is large enough to bootstrap the first value (depending on
+    // VALUE_BITS_LENGTH_32BIT) but typically too short for the full batch.
+    common::ByteStream truncated(common::MOD_DEFAULT);
+    truncated.wrap_from((const char*)buf.data(), 3);
+
+    storage::IntGorillaDecoder decoder;
+    int32_t out[N];
+    int actual = -1;
+    int ret = decoder.read_batch_int32(out, N, actual, truncated);
+    // Either the decoder reports the truncation, or it stops early without
+    // looping forever; both are acceptable.  What MUST NOT happen is a hang
+    // or a full-batch return — the test will time out on a hang via the
+    // GoogleTest harness.
+    EXPECT_TRUE(ret == common::E_OK || ret == common::E_BUF_NOT_ENOUGH)
+        << "unexpected ret=" << ret;
+    EXPECT_LT(actual, N);
+}
+
+TEST_F(GorillaCodecTest, Int64BatchDecodeTruncatedInputReturnsError) {
+    storage::LongGorillaEncoder encoder;
+    common::ByteStream stream(1024, common::MOD_DEFAULT);
+    const int N = 32;
+    for (int i = 0; i < N; i++) {
+        ASSERT_EQ(encoder.encode(static_cast<int64_t>(i) * 17 + 5, stream),
+                  common::E_OK);
+    }
+    encoder.flush(stream);
+    uint32_t total = stream.total_size();
+    ASSERT_GT(total, 4u);
+    std::vector<uint8_t> buf(total);
+    uint32_t got = 0;
+    stream.read_buf(buf.data(), total, got);
+    ASSERT_EQ(got, total);
+
+    common::ByteStream truncated(common::MOD_DEFAULT);
+    truncated.wrap_from((const char*)buf.data(), 3);
+
+    storage::LongGorillaDecoder decoder;
+    int64_t out[N];
+    int actual = -1;
+    int ret = decoder.read_batch_int64(out, N, actual, truncated);
+    EXPECT_TRUE(ret == common::E_OK || ret == common::E_BUF_NOT_ENOUGH)
+        << "unexpected ret=" << ret;
+    EXPECT_LT(actual, N);
+}
+
 }  // namespace storage
diff --git a/cpp/test/encoding/plain_codec_test.cc b/cpp/test/encoding/plain_codec_test.cc
index a51fa9261..6372469e6 100644
--- a/cpp/test/encoding/plain_codec_test.cc
+++ b/cpp/test/encoding/plain_codec_test.cc
@@ -110,4 +110,90 @@ TEST(PlainEncoderDecoderTest, EncodeDecodeDouble) {
     EXPECT_DOUBLE_EQ(original, decoded);
 }
 
+// Regression: read_batch_int64/float/double used to dereference
+// in.get_wrapped_buf() unconditionally, which is null for a normal paged
+// ByteStream. Verify the fallback path produces correct results.
+TEST(PlainEncoderDecoderTest, ReadBatchInt64PagedStream) {
+    PlainEncoder encoder;
+    PlainDecoder decoder;
+    // Tiny page size forces multi-page write so the stream is paged, not
+    // wrapped.
+    common::ByteStream stream(16, common::MOD_DEFAULT);
+    const int N = 32;
+    int64_t values[N];
+    for (int i = 0; i < N; i++) {
+        values[i] = static_cast<int64_t>(i) * 7 - 3;
+        encoder.encode(values[i], stream);
+    }
+    int64_t out[N];
+    int actual = 0;
+    EXPECT_EQ(decoder.read_batch_int64(out, N, actual, stream), common::E_OK);
+    EXPECT_EQ(actual, N);
+    for (int i = 0; i < N; i++) {
+        EXPECT_EQ(out[i], values[i]) << "mismatch at " << i;
+    }
+}
+
+TEST(PlainEncoderDecoderTest, ReadBatchFloatPagedStream) {
+    PlainEncoder encoder;
+    PlainDecoder decoder;
+    common::ByteStream stream(16, common::MOD_DEFAULT);
+    const int N = 32;
+    float values[N];
+    for (int i = 0; i < N; i++) {
+        values[i] = static_cast<float>(i) * 0.5f - 1.25f;
+        encoder.encode(values[i], stream);
+    }
+    float out[N];
+    int actual = 0;
+    EXPECT_EQ(decoder.read_batch_float(out, N, actual, stream), common::E_OK);
+    EXPECT_EQ(actual, N);
+    for (int i = 0; i < N; i++) {
+        EXPECT_FLOAT_EQ(out[i], values[i]);
+    }
+}
+
+// Regression: encode_batch(const double*) used to reinterpret_cast to
+// int64_t* and dispatch into the int64 path, which read the doubles through
+// an int64_t pointer — a strict-aliasing violation under -O.  The dedicated
+// double path now memcpys per element; verify a full round-trip through it.
+TEST(PlainEncoderDecoderTest, EncodeBatchDoubleRoundTrip) {
+    PlainEncoder encoder;
+    PlainDecoder decoder;
+    common::ByteStream stream(1024, common::MOD_DEFAULT);
+    const uint32_t N = 64;
+    double values[N];
+    for (uint32_t i = 0; i < N; i++) {
+        values[i] = static_cast<double>(i) * 0.125 - 3.14;
+    }
+    ASSERT_EQ(encoder.encode_batch(values, N, stream), common::E_OK);
+
+    double out[N];
+    int actual = 0;
+    EXPECT_EQ(decoder.read_batch_double(out, N, actual, stream), common::E_OK);
+    EXPECT_EQ(actual, static_cast<int>(N));
+    for (uint32_t i = 0; i < N; i++) {
+        EXPECT_DOUBLE_EQ(out[i], values[i]) << "mismatch at " << i;
+    }
+}
+
+TEST(PlainEncoderDecoderTest, ReadBatchDoublePagedStream) {
+    PlainEncoder encoder;
+    PlainDecoder decoder;
+    common::ByteStream stream(16, common::MOD_DEFAULT);
+    const int N = 32;
+    double values[N];
+    for (int i = 0; i < N; i++) {
+        values[i] = static_cast<double>(i) * 1.25 + 3.14;
+        encoder.encode(values[i], stream);
+    }
+    double out[N];
+    int actual = 0;
+    EXPECT_EQ(decoder.read_batch_double(out, N, actual, stream), common::E_OK);
+    EXPECT_EQ(actual, N);
+    for (int i = 0; i < N; i++) {
+        EXPECT_DOUBLE_EQ(out[i], values[i]);
+    }
+}
+
 }  // end namespace storage
\ No newline at end of file
diff --git a/cpp/test/encoding/ts2diff_codec_test.cc b/cpp/test/encoding/ts2diff_codec_test.cc
index 3164edafb..fb997103c 100644
--- a/cpp/test/encoding/ts2diff_codec_test.cc
+++ b/cpp/test/encoding/ts2diff_codec_test.cc
@@ -364,4 +364,120 @@ TEST_F(TS2DIFFCodecTest, TestEncodingLast) {
     EXPECT_FALSE(decoder_int_->has_remaining(out_stream_int32));
 }
 
+// Regression: skip_int32/skip_int64 used to advance the stream by the full
+// block size even when the requested skip count fell short of the block,
+// which silently dropped values from the next read in aligned nullable
+// columns.  Verify that skipping a count smaller than the first block leaves
+// the remainder of that block intact and decodable.
+TEST_F(TS2DIFFCodecTest, SkipPartialBlockInt32PreservesRemainder) {
+    common::ByteStream out_stream(1024, common::MOD_TS2DIFF_OBJ, false);
+    const int row_num = 1024;
+    std::vector<int32_t> data(row_num);
+    for (int i = 0; i < row_num; i++) {
+        data[i] = i * 3 + 7;
+    }
+    for (int i = 0; i < row_num; i++) {
+        ASSERT_EQ(encoder_int_->encode(data[i], out_stream), common::E_OK);
+    }
+    ASSERT_EQ(encoder_int_->flush(out_stream), common::E_OK);
+
+    const int skip_count = 5;
+    int skipped = 0;
+    ASSERT_EQ(decoder_int_->skip_int32(skip_count, skipped, out_stream),
+              common::E_OK);
+    EXPECT_EQ(skipped, skip_count);
+
+    int32_t v;
+    for (int i = skip_count; i < row_num; i++) {
+        ASSERT_EQ(decoder_int_->read_int32(v, out_stream), common::E_OK);
+        EXPECT_EQ(v, data[i]) << "mismatch at idx " << i;
+    }
+}
+
+TEST_F(TS2DIFFCodecTest, SkipPartialBlockInt64PreservesRemainder) {
+    common::ByteStream out_stream(1024, common::MOD_TS2DIFF_OBJ, false);
+    const int row_num = 1024;
+    std::vector<int64_t> data(row_num);
+    for (int i = 0; i < row_num; i++) {
+        data[i] = static_cast<int64_t>(i) * 13 + 11;
+    }
+    for (int i = 0; i < row_num; i++) {
+        ASSERT_EQ(encoder_long_->encode(data[i], out_stream), common::E_OK);
+    }
+    ASSERT_EQ(encoder_long_->flush(out_stream), common::E_OK);
+
+    const int skip_count = 7;
+    int skipped = 0;
+    ASSERT_EQ(decoder_long_->skip_int64(skip_count, skipped, out_stream),
+              common::E_OK);
+    EXPECT_EQ(skipped, skip_count);
+
+    int64_t v;
+    for (int i = skip_count; i < row_num; i++) {
+        ASSERT_EQ(decoder_long_->read_int64(v, out_stream), common::E_OK);
+        EXPECT_EQ(v, data[i]) << "mismatch at idx " << i;
+    }
+}
+
+// Regression: pack_bits_msb used to drop ByteStream::write_buf's return value
+// on the floor and unconditionally return 0 (success).  flush() then reported
+// E_OK and reset() wiped encoder state even when the actual data never made
+// it onto the stream.  The fix surfaces the underlying error code via the
+// helper's return value.
+//
+// We can't easily inject a real write failure without a custom allocator
+// (ByteStream::write_buf only fails on OOM), so this test pins down the
+// contract on the visible boundary: a wide bit_width must return the
+// dedicated "fallback" sentinel (-1) so flush() knows to take the per-bit
+// path, and the helper's return type must be the error code from write_buf
+// otherwise.  Future refactors that swallow the write error would either
+// stop returning -1 for fallback (caught here) or break round-trip in the
+// happy-path test below.
+TEST_F(TS2DIFFCodecTest, PackBitsMsbFallbackSentinelStillReported) {
+    common::ByteStream out(1024, common::MOD_TS2DIFF_OBJ, false);
+    int64_t values[4] = {1, 2, 3, 4};
+    EXPECT_EQ(TS2DIFFEncoder<int64_t>::pack_bits_msb(values, 4, 57, out), -1);
+    // Healthy small bit_width writes succeed.
+    int32_t small_values[4] = {1, 2, 3, 4};
+    EXPECT_EQ(TS2DIFFEncoder<int32_t>::pack_bits_msb(small_values, 4, 3, out),
+              common::E_OK);
+}
+
+// Regression: FloatTS2DIFFEncoder / DoubleTS2DIFFEncoder kept the previous
+// page's overflow markers in underflow_flags_ when reset() was called
+// directly (PageWriter drops a partial page that way).  The next page would
+// then read the stale flags and emit a wrong overflow bitmap.  reset() now
+// clears underflow_flags_; verify a reset between pages doesn't leak the
+// first page's overflow state into the second.
+TEST(FloatTS2DIFFEncoderResetTest, ResetClearsUnderflowFlags) {
+    storage::FloatTS2DIFFEncoder enc;
+    common::ByteStream out1(1024, common::MOD_TS2DIFF_OBJ, false);
+    // Encode a value that overflows the scale factor so the encoder records
+    // an underflow flag.
+    const float overflow_value = 1e30f;  // scaled > INT32_MAX
+    ASSERT_EQ(enc.encode(0.0f, out1), common::E_OK);
+    ASSERT_EQ(enc.encode(overflow_value, out1), common::E_OK);
+
+    // Drop the page without flushing.  PageWriter does exactly this when
+    // discarding a half-built page.
+    enc.reset();
+
+    // Encode a clean page that should not have any overflow markers.
+    common::ByteStream out2(1024, common::MOD_TS2DIFF_OBJ, false);
+    ASSERT_EQ(enc.encode(0.0f, out2), common::E_OK);
+    ASSERT_EQ(enc.encode(1.0f, out2), common::E_OK);
+    ASSERT_EQ(enc.encode(2.0f, out2), common::E_OK);
+    ASSERT_EQ(enc.flush(out2), common::E_OK);
+
+    // Round-trip the clean page; if reset() leaked the stale overflow flags
+    // the decoder would misinterpret the leading bytes as an overflow
+    // bitmap header and fail to recover the original values.
+    storage::FloatTS2DIFFDecoder dec;
+    float v = 0.0f;
+    for (int i = 0; i < 3; i++) {
+        ASSERT_EQ(dec.read_float(v, out2), common::E_OK);
+        EXPECT_NEAR(v, static_cast<float>(i), 1e-5f);
+    }
+}
+
 }  // namespace storage
diff --git a/cpp/test/file/restorable_tsfile_io_writer_test.cc b/cpp/test/file/restorable_tsfile_io_writer_test.cc
index 8f723e056..c60a855c5 100644
--- a/cpp/test/file/restorable_tsfile_io_writer_test.cc
+++ b/cpp/test/file/restorable_tsfile_io_writer_test.cc
@@ -994,4 +994,70 @@ TEST_F(RestorableTsFileIOWriterTest,
         }
         ASSERT_EQ(table_writer2.close(), E_OK);
     }
-}
\ No newline at end of file
+}
+
+// Regression: recovery of an aligned single-page value chunk must consult the
+// page's not-null bitmap to bind each decoded value to its real timestamp.
+// The bug paired non-null values densely with times[0..N-1], so a column whose
+// only non-null entry sat at the tail surfaced start_time/end_time equal to
+// the head of the time chunk, which then leaked through chunk-level time
+// filters.
+TEST_F(RestorableTsFileIOWriterTest, RecoveryAlignedSparseStatRespectsBitmap) {
+    const int64_t kBase = 100;
+    const int kRowCount = 10;
+    const int kNonNullRow = 7;
+    const std::string table_name = "sparse_aligned_t";
+    std::vector<MeasurementSchema*> ms_vec;
+    ms_vec.push_back(new MeasurementSchema("device", STRING));
+    ms_vec.push_back(new MeasurementSchema("s1", INT64));
+    std::vector<ColumnCategory> cats = {ColumnCategory::TAG,
+                                        ColumnCategory::FIELD};
+    TableSchema table_schema(table_name, ms_vec, cats);
+    {
+        WriteFile wf;
+        ASSERT_EQ(wf.create(file_name_, GetWriteCreateFlags(), 0666), E_OK);
+        TsFileTableWriter tw(&wf, &table_schema);
+        Tablet tablet(table_schema.get_measurement_names(),
+                      table_schema.get_data_types(), kRowCount);
+        tablet.set_table_name(table_name);
+        for (int i = 0; i < kRowCount; i++) {
+            tablet.add_timestamp(i, kBase + i);
+            tablet.add_value(i, "device", "d0");
+            // Only row kNonNullRow gets a value; the rest stay null.
+            if (i == kNonNullRow) {
+                tablet.add_value(i, "s1", static_cast<int64_t>(999));
+            }
+        }
+        ASSERT_EQ(tw.write_table(tablet), E_OK);
+        ASSERT_EQ(tw.flush(), E_OK);
+        ASSERT_EQ(tw.close(), E_OK);
+        wf.close();
+    }
+
+    CorruptCurrentFileTail(3);
+
+    RestorableTsFileIOWriter rw;
+    ASSERT_EQ(rw.open(file_name_, true), E_OK);
+
+    const std::vector<ChunkGroupMeta*>& cgms =
+        rw.get_recovered_chunk_group_metas();
+    ASSERT_FALSE(cgms.empty());
+
+    bool found_value_chunk = false;
+    for (ChunkGroupMeta* cgm : cgms) {
+        if (cgm == nullptr) continue;
+        for (auto it = cgm->chunk_meta_list_.begin();
+             it != cgm->chunk_meta_list_.end(); it++) {
+            ChunkMeta* cm = it.get();
+            if (cm == nullptr) continue;
+            if (cm->measurement_name_.to_std_string() != "s1") continue;
+            ASSERT_NE(cm->statistic_, nullptr);
+            // Exactly one non-null row at timestamp kBase + kNonNullRow.
+            EXPECT_EQ(cm->statistic_->count_, 1);
+            EXPECT_EQ(cm->statistic_->start_time_, kBase + kNonNullRow);
+            EXPECT_EQ(cm->statistic_->end_time_, kBase + kNonNullRow);
+            found_value_chunk = true;
+        }
+    }
+    EXPECT_TRUE(found_value_chunk);
+}
diff --git a/cpp/test/file/write_file_test.cc b/cpp/test/file/write_file_test.cc
index 3cb9edd25..615f069e8 100644
--- a/cpp/test/file/write_file_test.cc
+++ b/cpp/test/file/write_file_test.cc
@@ -141,3 +141,47 @@ TEST_F(WriteFileTest, TruncateFile) {
     EXPECT_EQ(file_content, "Hello, ");
     remove(file_name.c_str());
 }
+
+#include "file/tsfile_io_writer.h"
+
+// Regression: TsFileIOWriter::init() used to leave destroyed_=true after a
+// previous destroy(), so the second destroy() (during ~TsFileIOWriter())
+// short-circuited and skipped meta_allocator_.destroy() /
+// write_stream_.destroy() / file_ cleanup, leaking everything from the
+// new lifecycle.  Verify init() rearms the lifecycle by checking destroy()
+// runs again cleanly.
+TEST(TsFileIOWriterLifecycle, DestroyInitDestroyIsClean) {
+    std::string fn = "tsfile_iowriter_lifecycle.dat";
+    remove(fn.c_str());
+
+    WriteFile wf1;
+    int flags = O_WRONLY | O_CREAT | O_TRUNC;
+#ifdef _WIN32
+    flags |= O_BINARY;
+#endif
+    ASSERT_EQ(wf1.create(fn, flags, 0666), E_OK);
+
+    TsFileIOWriter w;
+    ASSERT_EQ(w.init(&wf1), E_OK);
+    w.destroy();
+
+    // Re-init against a fresh WriteFile (same writer object).  Under the
+    // old bug, destroyed_ stays true here.
+    remove(fn.c_str());
+    WriteFile wf2;
+    ASSERT_EQ(wf2.create(fn, flags, 0666), E_OK);
+    ASSERT_EQ(w.init(&wf2), E_OK);
+
+    // get_meta_size() reads meta_allocator_.get_total_used_bytes(); on a
+    // fresh init() this should be 0 (the allocator was reinitialised).
+    // If destroyed_ had been left true the allocator pages from before
+    // would still be there.
+    EXPECT_EQ(w.get_meta_size(), 0);
+
+    // Trigger second destroy() — must not crash on the re-initialised
+    // resources.
+    w.destroy();
+
+    wf2.close();
+    remove(fn.c_str());
+}
diff --git a/cpp/test/reader/filter/time_in_filter_test.cc b/cpp/test/reader/filter/time_in_filter_test.cc
new file mode 100644
index 000000000..9eceaaaa5
--- /dev/null
+++ b/cpp/test/reader/filter/time_in_filter_test.cc
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * License); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include <gtest/gtest.h>
+
+#include "reader/filter/time_operator.h"
+
+using namespace storage;
+
+// Regression: TimeIn::satisfy_start_end_time / contain_start_end_time used to
+// return true unconditionally.  In the aligned batch/multi paths the
+// contain_start_end_time=true branch flips block_all_pass on, the per-row
+// satisfy_batch_time check is skipped, and the reader emits every row in the
+// block — making `WHERE time IN (2, 8)` look identical to "no time filter"
+// whenever the block's time range overlapped the IN list at all.
+
+TEST(TimeInFilterTest, ContainStartEndTimeIsFalseForSparseRange) {
+    TimeIn in({2, 8}, /*not_in=*/false);
+    // Range [0,10] contains many times not in {2,8}; the block cannot
+    // unconditionally pass.
+    EXPECT_FALSE(in.contain_start_end_time(0, 10));
+    // Range that is a single matching point passes.
+    EXPECT_TRUE(in.contain_start_end_time(2, 2));
+    // Single non-matching point: doesn't pass.
+    EXPECT_FALSE(in.contain_start_end_time(5, 5));
+}
+
+TEST(TimeInFilterTest, SatisfyStartEndTimeTracksOverlap) {
+    TimeIn in({2, 8}, /*not_in=*/false);
+    // Some value in range → block may have matching rows.
+    EXPECT_TRUE(in.satisfy_start_end_time(0, 10));
+    EXPECT_TRUE(in.satisfy_start_end_time(2, 2));
+    EXPECT_TRUE(in.satisfy_start_end_time(8, 8));
+    // No value in range → block can be skipped.
+    EXPECT_FALSE(in.satisfy_start_end_time(3, 7));
+    EXPECT_FALSE(in.satisfy_start_end_time(9, 100));
+}
+
+TEST(TimeInFilterTest, NotInContainSemantics) {
+    TimeIn not_in({2, 8}, /*not_in=*/true);
+    // Range [3,7] has no excluded value → every row passes NOT IN.
+    EXPECT_TRUE(not_in.contain_start_end_time(3, 7));
+    // Range [0,10] includes 2 and 8 → cannot blanket-pass.
+    EXPECT_FALSE(not_in.contain_start_end_time(0, 10));
+}
+
+TEST(TimeInFilterTest, NotInSatisfyStartEndTimeSemantics) {
+    TimeIn not_in({2, 8}, /*not_in=*/true);
+    // Single excluded point: filter rejects it.
+    EXPECT_FALSE(not_in.satisfy_start_end_time(2, 2));
+    // Single non-excluded point: filter accepts it.
+    EXPECT_TRUE(not_in.satisfy_start_end_time(5, 5));
+    // A wider range always has at least one non-excluded time.
+    EXPECT_TRUE(not_in.satisfy_start_end_time(0, 10));
+}
+
+TEST(TimeInFilterTest, BatchTimeFallbackUsesScalarSemantics) {
+    TimeIn in({2, 8}, /*not_in=*/false);
+    int64_t times[] = {1, 2, 3, 7, 8, 9};
+    bool mask[6];
+    int pass = in.satisfy_batch_time(times, 6, mask);
+    EXPECT_EQ(pass, 2);
+    EXPECT_FALSE(mask[0]);
+    EXPECT_TRUE(mask[1]);
+    EXPECT_FALSE(mask[2]);
+    EXPECT_FALSE(mask[3]);
+    EXPECT_TRUE(mask[4]);
+    EXPECT_FALSE(mask[5]);
+}
diff --git a/cpp/test/reader/query_by_row_performance_test.cc b/cpp/test/reader/query_by_row_performance_test.cc
index 4caf26f71..051c15d87 100644
--- a/cpp/test/reader/query_by_row_performance_test.cc
+++ b/cpp/test/reader/query_by_row_performance_test.cc
@@ -60,6 +60,7 @@
 #include "file/write_file.h"
 #include "reader/tsfile_reader.h"
 #include "reader/tsfile_tree_reader.h"
+#include "utils/util_define.h"
 #include "writer/tsfile_table_writer.h"
 #include "writer/tsfile_tree_writer.h"
 
@@ -86,7 +87,8 @@ static int query_by_row_perf_iters() {
     return n;
 }
 
-static int compute_offset_with_env(int num_rows, int default_offset) {
+MAYBE_UNUSED static int compute_offset_with_env(int num_rows,
+                                                int default_offset) {
     int offset = default_offset;
     int abs = 0;
     if (get_env_int("QUERY_BY_ROW_PERF_OFFSET", abs)) {
diff --git a/cpp/test/reader/table_view/tsfile_reader_table_batch_test.cc b/cpp/test/reader/table_view/tsfile_reader_table_batch_test.cc
index e115552ec..6e2da1c40 100644
--- a/cpp/test/reader/table_view/tsfile_reader_table_batch_test.cc
+++ b/cpp/test/reader/table_view/tsfile_reader_table_batch_test.cc
@@ -133,6 +133,25 @@ class TsFileTableReaderBatchTest : public ::testing::Test {
                                column_categories);
     }
 
+    static TableSchema* gen_table_schema_with_string_field() {
+        std::vector<MeasurementSchema*> measurement_schemas;
+        std::vector<ColumnCategory> column_categories;
+        measurement_schemas.emplace_back(
+            new MeasurementSchema("id0", TSDataType::STRING, TSEncoding::PLAIN,
+                                  CompressionType::UNCOMPRESSED));
+        column_categories.emplace_back(ColumnCategory::TAG);
+        measurement_schemas.emplace_back(new MeasurementSchema(
+            "s_text", TSDataType::STRING, TSEncoding::PLAIN,
+            CompressionType::UNCOMPRESSED));
+        column_categories.emplace_back(ColumnCategory::FIELD);
+        measurement_schemas.emplace_back(
+            new MeasurementSchema("s_num", TSDataType::INT64, TSEncoding::PLAIN,
+                                  CompressionType::UNCOMPRESSED));
+        column_categories.emplace_back(ColumnCategory::FIELD);
+        return new TableSchema("testTableString", measurement_schemas,
+                               column_categories);
+    }
+
     static storage::Tablet gen_tablet(TableSchema* table_schema, int offset,
                                       int device_num,
                                       int num_timestamp_per_device = 10) {
@@ -171,6 +190,121 @@ class TsFileTableReaderBatchTest : public ::testing::Test {
         delete[] literal;
         return tablet;
     }
+
+    static storage::Tablet gen_tablet_with_string_field(
+        TableSchema* table_schema, int num_rows) {
+        storage::Tablet tablet(table_schema->get_table_name(),
+                               table_schema->get_measurement_names(),
+                               table_schema->get_data_types(),
+                               table_schema->get_column_categories(), num_rows);
+        for (int i = 0; i < num_rows; i++) {
+            tablet.add_timestamp(i, i);
+            tablet.add_value(i, "id0", "device_a");
+            tablet.add_value(i, "s_text", "value_" + std::to_string(i));
+            tablet.add_value(i, "s_num", static_cast<int64_t>(i * 10));
+        }
+        return tablet;
+    }
+
+    std::vector<int64_t> query_timestamps_in_batches(TableSchema* table_schema,
+                                                     int64_t start_time,
+                                                     int64_t end_time,
+                                                     int batch_size) {
+        storage::TsFileReader reader;
+        int ret = reader.open(file_name_);
+        EXPECT_EQ(ret, common::E_OK);
+
+        ResultSet* tmp_result_set = nullptr;
+        ret = reader.query(table_schema->get_table_name(),
+                           table_schema->get_measurement_names(), start_time,
+                           end_time, tmp_result_set, batch_size);
+        EXPECT_EQ(ret, common::E_OK);
+        EXPECT_NE(tmp_result_set, nullptr);
+
+        auto* table_result_set = dynamic_cast<TableResultSet*>(tmp_result_set);
+        EXPECT_NE(table_result_set, nullptr);
+
+        std::vector<int64_t> timestamps;
+        common::TsBlock* block = nullptr;
+        while ((ret = table_result_set->get_next_tsblock(block)) ==
+               common::E_OK) {
+            if (block == nullptr) {
+                ADD_FAILURE() << "Expected non-null TsBlock";
+                break;
+            }
+            common::RowIterator row_iterator(block);
+            while (row_iterator.has_next()) {
+                uint32_t len = 0;
+                bool null = false;
+                int64_t timestamp = *reinterpret_cast<const int64_t*>(
+                    row_iterator.read(0, &len, &null));
+                EXPECT_FALSE(null);
+                timestamps.push_back(timestamp);
+
+                for (uint32_t col_idx = 1;
+                     col_idx < row_iterator.get_column_count(); ++col_idx) {
+                    const char* value = row_iterator.read(col_idx, &len, &null);
+                    EXPECT_FALSE(null);
+                    if (row_iterator.get_data_type(col_idx) ==
+                        TSDataType::INT64) {
+                        int64_t int_val =
+                            *reinterpret_cast<const int64_t*>(value);
+                        EXPECT_EQ(int_val, 0);
+                    }
+                }
+                row_iterator.next();
+            }
+        }
+
+        reader.destroy_query_data_set(table_result_set);
+        EXPECT_EQ(reader.close(), common::E_OK);
+        return timestamps;
+    }
+
+    std::vector<std::pair<int64_t, std::string>> query_string_field_in_batches(
+        TableSchema* table_schema, int64_t start_time, int64_t end_time,
+        int batch_size) {
+        storage::TsFileReader reader;
+        int ret = reader.open(file_name_);
+        EXPECT_EQ(ret, common::E_OK);
+
+        ResultSet* tmp_result_set = nullptr;
+        ret = reader.query(table_schema->get_table_name(),
+                           table_schema->get_measurement_names(), start_time,
+                           end_time, tmp_result_set, batch_size);
+        EXPECT_EQ(ret, common::E_OK);
+        EXPECT_NE(tmp_result_set, nullptr);
+
+        auto* table_result_set = dynamic_cast<TableResultSet*>(tmp_result_set);
+        EXPECT_NE(table_result_set, nullptr);
+
+        std::vector<std::pair<int64_t, std::string>> result;
+        common::TsBlock* block = nullptr;
+        while ((ret = table_result_set->get_next_tsblock(block)) ==
+               common::E_OK) {
+            if (block == nullptr) {
+                ADD_FAILURE() << "Expected non-null TsBlock";
+                break;
+            }
+            common::RowIterator row_iterator(block);
+            while (row_iterator.has_next()) {
+                uint32_t len = 0;
+                bool null = false;
+                int64_t timestamp = *reinterpret_cast<const int64_t*>(
+                    row_iterator.read(0, &len, &null));
+                EXPECT_FALSE(null);
+
+                const char* value = row_iterator.read(2, &len, &null);
+                EXPECT_FALSE(null);
+                result.emplace_back(timestamp, std::string(value, len));
+                row_iterator.next();
+            }
+        }
+
+        reader.destroy_query_data_set(table_result_set);
+        EXPECT_EQ(reader.close(), common::E_OK);
+        return result;
+    }
 };
 
 TEST_F(TsFileTableReaderBatchTest, BatchQueryWithSmallBatchSize) {
@@ -361,6 +495,89 @@ TEST_F(TsFileTableReaderBatchTest, BatchQueryVerifyDataCorrectness) {
     delete table_schema;
 }
 
+TEST_F(TsFileTableReaderBatchTest,
+       BatchQueryKeepsStateAcrossTsBlocksWithinPage) {
+    auto table_schema = gen_table_schema();
+    auto tsfile_table_writer_ =
+        std::make_shared<TsFileTableWriter>(&write_file_, table_schema);
+
+    const int prev_page_point_num = g_config_value_.page_writer_max_point_num_;
+    g_config_value_.page_writer_max_point_num_ = 128;
+
+    const int device_num = 1;
+    const int points_per_device = 35;
+    auto tablet = gen_tablet(table_schema, 0, device_num, points_per_device);
+    ASSERT_EQ(tsfile_table_writer_->write_table(tablet), common::E_OK);
+    ASSERT_EQ(tsfile_table_writer_->flush(), common::E_OK);
+    ASSERT_EQ(tsfile_table_writer_->close(), common::E_OK);
+
+    const int batch_size = 8;
+    std::vector<int64_t> timestamps = query_timestamps_in_batches(
+        table_schema, 0, 1000000000000LL, batch_size);
+
+    ASSERT_EQ(timestamps.size(), static_cast<size_t>(points_per_device));
+    for (int64_t i = 0; i < points_per_device; ++i) {
+        EXPECT_EQ(timestamps[i], i);
+    }
+
+    g_config_value_.page_writer_max_point_num_ = prev_page_point_num;
+    delete table_schema;
+}
+
+TEST_F(TsFileTableReaderBatchTest, BatchQueryTimeFilterAcrossBoundaryPages) {
+    auto table_schema = gen_table_schema();
+    auto tsfile_table_writer_ =
+        std::make_shared<TsFileTableWriter>(&write_file_, table_schema);
+
+    const int prev_page_point_num = g_config_value_.page_writer_max_point_num_;
+    g_config_value_.page_writer_max_point_num_ = 8;
+
+    const int device_num = 1;
+    const int points_per_device = 25;
+    auto tablet = gen_tablet(table_schema, 0, device_num, points_per_device);
+    ASSERT_EQ(tsfile_table_writer_->write_table(tablet), common::E_OK);
+    ASSERT_EQ(tsfile_table_writer_->flush(), common::E_OK);
+    ASSERT_EQ(tsfile_table_writer_->close(), common::E_OK);
+
+    const int batch_size = 4;
+    std::vector<int64_t> timestamps =
+        query_timestamps_in_batches(table_schema, 5, 18, batch_size);
+
+    ASSERT_EQ(timestamps.size(), static_cast<size_t>(14));
+    for (int64_t i = 0; i < 14; ++i) {
+        EXPECT_EQ(timestamps[i], i + 5);
+    }
+
+    g_config_value_.page_writer_max_point_num_ = prev_page_point_num;
+    delete table_schema;
+}
+
+TEST_F(TsFileTableReaderBatchTest,
+       BatchQueryVariableLengthFieldAcrossTsBlocks) {
+    auto table_schema = gen_table_schema_with_string_field();
+    auto tsfile_table_writer_ =
+        std::make_shared<TsFileTableWriter>(&write_file_, table_schema);
+
+    const int prev_page_point_num = g_config_value_.page_writer_max_point_num_;
+    g_config_value_.page_writer_max_point_num_ = 8;
+
+    const int num_rows = 23;
+    auto tablet = gen_tablet_with_string_field(table_schema, num_rows);
+    ASSERT_EQ(tsfile_table_writer_->write_table(tablet), common::E_OK);
+    ASSERT_EQ(tsfile_table_writer_->flush(), common::E_OK);
+    ASSERT_EQ(tsfile_table_writer_->close(), common::E_OK);
+
+    auto result = query_string_field_in_batches(table_schema, 0, INT64_MAX, 5);
+    ASSERT_EQ(result.size(), static_cast<size_t>(num_rows));
+    for (int i = 0; i < num_rows; ++i) {
+        EXPECT_EQ(result[i].first, i);
+        EXPECT_EQ(result[i].second, "value_" + std::to_string(i));
+    }
+
+    g_config_value_.page_writer_max_point_num_ = prev_page_point_num;
+    delete table_schema;
+}
+
 TEST_F(TsFileTableReaderBatchTest, PerformanceComparisonSinglePointVsBatch) {
     // Create table schema without tags (only fields)
     auto table_schema = gen_table_schema_no_tag();
diff --git a/cpp/test/reader/table_view/tsfile_reader_table_test.cc b/cpp/test/reader/table_view/tsfile_reader_table_test.cc
index e55f34c2a..be0a6f64c 100644
--- a/cpp/test/reader/table_view/tsfile_reader_table_test.cc
+++ b/cpp/test/reader/table_view/tsfile_reader_table_test.cc
@@ -209,6 +209,43 @@ class TsFileTableReaderTest : public ::testing::Test {
 
 TEST_F(TsFileTableReaderTest, TableModelQuery) { test_table_model_query(); }
 
+// Regression: single_device_tsblock_reader used to initialise all_outside
+// to true, then bail out when the per-device chunk-list loop didn't
+// execute (e.g. time-only query where time_series_indexs is empty).  The
+// result was an empty resultset whenever a time filter was present, even
+// though there might be rows that satisfy it.  Verify that querying only
+// the time column with a tight filter still returns the matching rows.
+TEST_F(TsFileTableReaderTest, TimeOnlyQueryWithTimeFilterStillReturnsRows) {
+    auto table_schema = gen_table_schema(0);
+    auto tsfile_table_writer_ =
+        std::make_shared<TsFileTableWriter>(&write_file_, table_schema);
+    auto tablet = gen_tablet(table_schema, /*start_ts=*/0, /*device_num=*/1,
+                             /*per_device=*/10);
+    ASSERT_EQ(tsfile_table_writer_->write_table(tablet), common::E_OK);
+    ASSERT_EQ(tsfile_table_writer_->flush(), common::E_OK);
+    ASSERT_EQ(tsfile_table_writer_->close(), common::E_OK);
+
+    storage::TsFileReader reader;
+    ASSERT_EQ(reader.open(file_name_), common::E_OK);
+    ResultSet* tmp = nullptr;
+    // Query with an empty measurement list and a time window covering all
+    // 10 timestamps.  Under the bug this returned 0 rows.
+    std::vector<std::string> empty_cols;
+    ASSERT_EQ(reader.query(table_schema->get_table_name(), empty_cols,
+                           /*start_time=*/0, /*end_time=*/9, tmp),
+              common::E_OK);
+    auto* rs = (TableResultSet*)tmp;
+    int rows = 0;
+    bool hn = false;
+    while (IS_SUCC(rs->next(hn)) && hn) {
+        rows++;
+    }
+    EXPECT_EQ(rows, 10);
+    reader.destroy_query_data_set(rs);
+    ASSERT_EQ(reader.close(), common::E_OK);
+    delete table_schema;
+}
+
 TEST_F(TsFileTableReaderTest, TableModelQueryOneSmallPage) {
     int prev_config = g_config_value_.page_writer_max_point_num_;
     g_config_value_.page_writer_max_point_num_ = 5;
@@ -216,11 +253,13 @@ TEST_F(TsFileTableReaderTest, TableModelQueryOneSmallPage) {
     g_config_value_.page_writer_max_point_num_ = prev_config;
 }
 
-// Triggers memory-based seal in aligned table: time page seals by size while
-// value pages may not; ensure value pages are sealed together with time (no
-// time-page-sealed / value-page-not-sealed inconsistency).
-// Use 512 bytes so time seals by size before point count; 128 was too small
-// and could produce misaligned time/value pages on some encodings.
+TEST_F(TsFileTableReaderTest, TableModelQueryOneLargePage) {
+    int prev_config = g_config_value_.page_writer_max_point_num_;
+    g_config_value_.page_writer_max_point_num_ = 10000;
+    test_table_model_query(g_config_value_.page_writer_max_point_num_);
+    g_config_value_.page_writer_max_point_num_ = prev_config;
+}
+
 TEST_F(TsFileTableReaderTest, TableModelQueryMemoryBasedSeal) {
     uint32_t prev_point_num = g_config_value_.page_writer_max_point_num_;
     uint32_t prev_mem_bytes = g_config_value_.page_writer_max_memory_bytes_;
@@ -231,13 +270,6 @@ TEST_F(TsFileTableReaderTest, TableModelQueryMemoryBasedSeal) {
     g_config_value_.page_writer_max_memory_bytes_ = prev_mem_bytes;
 }
 
-TEST_F(TsFileTableReaderTest, TableModelQueryOneLargePage) {
-    int prev_config = g_config_value_.page_writer_max_point_num_;
-    g_config_value_.page_writer_max_point_num_ = 10000;
-    test_table_model_query(g_config_value_.page_writer_max_point_num_);
-    g_config_value_.page_writer_max_point_num_ = prev_config;
-}
-
 TEST_F(TsFileTableReaderTest, TableModelQueryMultiLargePage) {
     int prev_config = g_config_value_.page_writer_max_point_num_;
     g_config_value_.page_writer_max_point_num_ = 10000;
@@ -1221,4 +1253,4 @@ TEST_F(TsFileTableReaderTest, MultiTagColumnFilterOnSecondTag) {
     ASSERT_EQ(reader.close(), common::E_OK);
     delete table_schema;
     delete tag_filter;
-}
\ No newline at end of file
+}
diff --git a/cpp/test/reader/table_view/tsfile_table_query_by_row_test.cc b/cpp/test/reader/table_view/tsfile_table_query_by_row_test.cc
index 026f75b2d..9e3d9b562 100644
--- a/cpp/test/reader/table_view/tsfile_table_query_by_row_test.cc
+++ b/cpp/test/reader/table_view/tsfile_table_query_by_row_test.cc
@@ -27,7 +27,6 @@
 #include "common/schema.h"
 #include "common/tablet.h"
 #include "file/write_file.h"
-#include "reader/filter/tag_filter.h"
 #include "reader/table_result_set.h"
 #include "reader/tsfile_reader.h"
 #include "writer/tsfile_table_writer.h"
@@ -103,6 +102,41 @@ class TableQueryByRowTest : public ::testing::Test {
         delete schema;
     }
 
+    void write_single_device_file_with_string_field(int num_rows) {
+        std::vector<ColumnSchema> col_schemas = {
+            ColumnSchema("id1", TSDataType::STRING,
+                         CompressionType::UNCOMPRESSED, TSEncoding::PLAIN,
+                         ColumnCategory::TAG),
+            ColumnSchema("s_text", TSDataType::STRING,
+                         CompressionType::UNCOMPRESSED, TSEncoding::PLAIN,
+                         ColumnCategory::FIELD),
+            ColumnSchema("s_num", TSDataType::INT64,
+                         CompressionType::UNCOMPRESSED, TSEncoding::PLAIN,
+                         ColumnCategory::FIELD),
+        };
+        auto* schema = new TableSchema("t_string", col_schemas);
+        auto* writer = new TsFileTableWriter(&write_file_, schema);
+
+        Tablet tablet(
+            "t_string", {"id1", "s_text", "s_num"},
+            {TSDataType::STRING, TSDataType::STRING, TSDataType::INT64},
+            {ColumnCategory::TAG, ColumnCategory::FIELD, ColumnCategory::FIELD},
+            num_rows);
+
+        for (int i = 0; i < num_rows; i++) {
+            tablet.add_timestamp(i, static_cast<int64_t>(i));
+            tablet.add_value(i, "id1", "device_a");
+            tablet.add_value(i, "s_text", "value_" + std::to_string(i));
+            tablet.add_value(i, "s_num", static_cast<int64_t>(i * 10));
+        }
+
+        ASSERT_EQ(writer->write_table(tablet), E_OK);
+        ASSERT_EQ(writer->flush(), E_OK);
+        ASSERT_EQ(writer->close(), E_OK);
+        delete writer;
+        delete schema;
+    }
+
     void write_multi_device_file(int rows_per_device, int device_count) {
         std::vector<ColumnSchema> col_schemas = {
             ColumnSchema("id1", TSDataType::STRING,
@@ -341,6 +375,29 @@ class TableQueryByRowTest : public ::testing::Test {
         return manual;
     }
 
+    std::vector<std::pair<int64_t, std::string>> query_by_row_time_and_text(
+        const std::string& table_name, const std::vector<std::string>& cols,
+        int offset, int limit) {
+        TsFileReader reader;
+        EXPECT_EQ(reader.open(file_name_), E_OK);
+        ResultSet* rs = nullptr;
+        EXPECT_EQ(reader.queryByRow(table_name, cols, offset, limit, rs), E_OK);
+        EXPECT_NE(rs, nullptr);
+
+        std::vector<std::pair<int64_t, std::string>> result;
+        bool has_next = false;
+        while (IS_SUCC(rs->next(has_next)) && has_next) {
+            int64_t time = rs->get_value<int64_t>("time");
+            common::String* text_val = rs->get_value<common::String*>("s_text");
+            result.emplace_back(time,
+                                std::string(text_val->buf_, text_val->len_));
+        }
+
+        reader.destroy_query_data_set(rs);
+        reader.close();
+        return result;
+    }
+
     std::string file_name_;
     WriteFile write_file_;
 };
@@ -356,6 +413,23 @@ TEST_F(TableQueryByRowTest, NoOffsetNoLimit) {
     ASSERT_EQ(result, all);
 }
 
+TEST_F(TableQueryByRowTest, NoOffsetNoLimitWithSmallPages) {
+    int prev_page_config = g_config_value_.page_writer_max_point_num_;
+    g_config_value_.page_writer_max_point_num_ = 8;
+
+    int num_rows = 25;
+    write_single_device_file(num_rows);
+
+    auto result = query_by_row_time_and_s1("t1", {"id1", "s1", "s2"}, 0, -1);
+    ASSERT_EQ(result.size(), static_cast<size_t>(num_rows));
+    for (int i = 0; i < num_rows; ++i) {
+        EXPECT_EQ(result[i].first, i);
+        EXPECT_EQ(result[i].second, i * 10);
+    }
+
+    g_config_value_.page_writer_max_point_num_ = prev_page_config;
+}
+
 // Offset only: skip first N rows, return the rest; limit=-1 means no cap.
 TEST_F(TableQueryByRowTest, OffsetOnly) {
     int num_rows = 50;
@@ -399,6 +473,43 @@ TEST_F(TableQueryByRowTest, OffsetAndLimit) {
     }
 }
 
+TEST_F(TableQueryByRowTest, OffsetAndLimitWithSmallPages) {
+    int prev_page_config = g_config_value_.page_writer_max_point_num_;
+    g_config_value_.page_writer_max_point_num_ = 8;
+
+    int num_rows = 40;
+    write_single_device_file(num_rows);
+
+    int offset = 7;
+    int limit = 19;
+    auto by_row =
+        query_by_row_time_and_s1("t1", {"id1", "s1", "s2"}, offset, limit);
+    auto manual =
+        query_manual_time_and_s1("t1", {"id1", "s1", "s2"}, offset, limit);
+
+    ASSERT_EQ(by_row, manual);
+
+    g_config_value_.page_writer_max_point_num_ = prev_page_config;
+}
+
+TEST_F(TableQueryByRowTest, VariableLengthFieldWithSmallPages) {
+    int prev_page_config = g_config_value_.page_writer_max_point_num_;
+    g_config_value_.page_writer_max_point_num_ = 8;
+
+    int num_rows = 21;
+    write_single_device_file_with_string_field(num_rows);
+
+    auto result = query_by_row_time_and_text("t_string",
+                                             {"id1", "s_text", "s_num"}, 0, -1);
+    ASSERT_EQ(result.size(), static_cast<size_t>(num_rows));
+    for (int i = 0; i < num_rows; ++i) {
+        EXPECT_EQ(result[i].first, i);
+        EXPECT_EQ(result[i].second, "value_" + std::to_string(i));
+    }
+
+    g_config_value_.page_writer_max_point_num_ = prev_page_config;
+}
+
 // Offset beyond total row count: returns empty result.
 TEST_F(TableQueryByRowTest, OffsetBeyondData) {
     int num_rows = 30;
@@ -652,15 +763,16 @@ TEST_F(TableQueryByRowTest, DenseSingleDeviceSsiLevelPushdown) {
 
 // Pushdown is faster than full query + manual next: queryByRow(offset, limit)
 // skips at device/SSI/Chunk level; old query then manual next decodes every
-// row. Timing tolerance 20% to allow measurement noise.
+// row. Timing tolerance 5% to allow measurement noise.
 TEST_F(TableQueryByRowTest, DISABLED_QueryByRowFasterThanManualNext) {
-    const int num_rows = 8000;
-    const int offset = 3000;
+    const int num_rows = 80000;
+    const int offset = 30000;
     const int limit = 1000;
     write_single_device_file(num_rows);
 
     const int num_iters = 5;
-    const double tolerance = 0.2;
+    const double tolerance =
+        0.5;  // 50% tolerance for cross-platform timing noise
 
     auto run_query_by_row = [this, offset, limit]() {
         TsFileReader reader;
@@ -725,47 +837,3 @@ TEST_F(TableQueryByRowTest, DISABLED_QueryByRowFasterThanManualNext) {
            "(min_by_row="
         << min_by_row << " ms, min_manual=" << min_manual << " ms)";
 }
-
-// queryByRow with tag filter: only rows matching the tag predicate are
-// returned.
-TEST_F(TableQueryByRowTest, TagFilterEq) {
-    int rows_per_device = 20;
-    int device_count = 3;
-    write_multi_device_file(rows_per_device, device_count);
-
-    // Reconstruct the same schema used by write_multi_device_file.
-    std::vector<ColumnSchema> col_schemas = {
-        ColumnSchema("id1", TSDataType::STRING, CompressionType::UNCOMPRESSED,
-                     TSEncoding::PLAIN, ColumnCategory::TAG),
-        ColumnSchema("s1", TSDataType::INT64, CompressionType::UNCOMPRESSED,
-                     TSEncoding::PLAIN, ColumnCategory::FIELD),
-    };
-    TableSchema schema("t1", col_schemas);
-
-    // Build tag filter: id1 == "dev1"
-    TagFilterBuilder builder(&schema);
-    Filter* tag_filter = builder.eq("id1", "dev1");
-
-    TsFileReader reader;
-    ASSERT_EQ(reader.open(file_name_), E_OK);
-
-    ResultSet* rs = nullptr;
-    ASSERT_EQ(reader.queryByRow("t1", {"id1", "s1"}, 0, -1, rs, tag_filter),
-              E_OK);
-    ASSERT_NE(rs, nullptr);
-
-    std::vector<int64_t> filtered_s1;
-    bool has_next = false;
-    while (IS_SUCC(rs->next(has_next)) && has_next) {
-        filtered_s1.push_back(rs->get_value<int64_t>("s1"));
-    }
-    reader.destroy_query_data_set(rs);
-    reader.close();
-    delete tag_filter;
-
-    // dev1 has rows_per_device rows with s1 = 1*1000+t for t in [0,20).
-    ASSERT_EQ(filtered_s1.size(), static_cast<size_t>(rows_per_device));
-    for (int t = 0; t < rows_per_device; t++) {
-        EXPECT_EQ(filtered_s1[t], static_cast<int64_t>(1 * 1000 + t));
-    }
-}
diff --git a/cpp/test/reader/tree_view/tsfile_reader_tree_test.cc b/cpp/test/reader/tree_view/tsfile_reader_tree_test.cc
index 8181b6130..e4daed748 100644
--- a/cpp/test/reader/tree_view/tsfile_reader_tree_test.cc
+++ b/cpp/test/reader/tree_view/tsfile_reader_tree_test.cc
@@ -509,3 +509,48 @@ TEST_F(TsFileTreeReaderTest, QueryTableOnTreeMissingMeasurement) {
     }
     reader.close();
 }
+
+// Regression: query_table_on_tree with an inverted time range (start > end) on
+// a non-aligned tree device must yield zero rows, not E_NOT_SUPPORT.  The chunk
+// time span straddles both bounds and single-chunk timeseries carry no
+// per-chunk statistic, so the device-level early-skip does NOT short-circuit;
+// the empty value-column result previously fell through to the time-only
+// fallback -> alloc_multi_ssi() (aligned-only) -> E_NOT_SUPPORT.
+TEST_F(TsFileTreeReaderTest, QueryTableOnTreeInvertedTimeRange) {
+    std::string device_id = "root.Device1";
+    std::vector<std::string> measurement_ids = {"m1", "m2", "m3"};
+    {
+        TsFileTreeWriter writer(&write_file_);
+        for (auto const& m : measurement_ids) {
+            auto* schema = new storage::MeasurementSchema(m, TSDataType::INT32);
+            ASSERT_EQ(E_OK, writer.register_timeseries(device_id, schema));
+            delete schema;
+        }
+        for (int i = 0; i < 100; i++) {
+            TsRecord record(device_id, static_cast<int64_t>(i - 50));
+            for (auto const& m : measurement_ids) {
+                record.add_point(m, static_cast<int32_t>(i));
+            }
+            ASSERT_EQ(E_OK, writer.write(record));
+        }
+        writer.flush();
+        writer.close();
+    }
+
+    TsFileReader reader;
+    ASSERT_EQ(E_OK, reader.open(file_name_));
+    ResultSet* result = nullptr;
+    int ret = reader.query_table_on_tree(measurement_ids, 10, -10, result);
+    ASSERT_EQ(E_OK, ret);
+    auto* trs = (storage::TableResultSet*)result;
+    bool has_next = false;
+    int row_cnt = 0;
+    int next_ret = E_OK;
+    while (IS_SUCC(next_ret = trs->next(has_next)) && has_next) {
+        row_cnt++;
+    }
+    EXPECT_EQ(E_OK, next_ret);
+    EXPECT_EQ(0, row_cnt);
+    reader.destroy_query_data_set(result);
+    reader.close();
+}
diff --git a/cpp/test/reader/tree_view/tsfile_tree_query_by_row_test.cc b/cpp/test/reader/tree_view/tsfile_tree_query_by_row_test.cc
index a686b8998..9c47a9d4d 100644
--- a/cpp/test/reader/tree_view/tsfile_tree_query_by_row_test.cc
+++ b/cpp/test/reader/tree_view/tsfile_tree_query_by_row_test.cc
@@ -16,7 +16,6 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-#include <fcntl.h>
 #include <gtest/gtest.h>
 
 #include <chrono>
@@ -25,12 +24,10 @@
 #include "common/global.h"
 #include "common/record.h"
 #include "common/schema.h"
-#include "common/tablet.h"
 #include "file/write_file.h"
 #include "reader/tsfile_reader.h"
 #include "reader/tsfile_tree_reader.h"
 #include "writer/tsfile_tree_writer.h"
-#include "writer/tsfile_writer.h"
 
 using namespace storage;
 using namespace common;
@@ -210,6 +207,90 @@ class TreeQueryByRowTest : public ::testing::Test {
     WriteFile write_file_;
 };
 
+// Regression: aligned value chunks store statistic_->count_ as the
+// non-null row count, not the total row count.  Whole-chunk offset skip
+// used to apply value_cm's count, so a sparse aligned chunk with 100 rows
+// and 10 non-nulls would jump over all 100 rows on offset=10 — leaving
+// the next chunks completely unread.  The fix only takes the whole-chunk
+// shortcut when time and value statistics agree on the row count, falling
+// through to per-row offset handling otherwise.
+TEST_F(TreeQueryByRowTest, SparseAlignedChunkOffsetCrossesChunks) {
+    using namespace storage;
+    libtsfile_destroy();
+    libtsfile_init();
+    remove(file_name_.c_str());
+
+    // Tighten per-chunk capacity so two write_tablet_aligned calls produce
+    // two distinct aligned chunks (rather than being merged into one).
+    uint32_t prev_chunk_thresh = g_config_value_.chunk_group_size_threshold_;
+    g_config_value_.chunk_group_size_threshold_ = 64;
+    int64_t prev_record_check =
+        g_config_value_.record_count_for_next_mem_check_;
+    g_config_value_.record_count_for_next_mem_check_ = 1;
+
+    {
+        TsFileWriter writer;
+        int flags = O_WRONLY | O_CREAT | O_TRUNC;
+#ifdef _WIN32
+        flags |= O_BINARY;
+#endif
+        ASSERT_EQ(writer.open(file_name_, flags, 0666), E_OK);
+        const std::string device = "sparse_dev";
+        std::vector<MeasurementSchema*> reg;
+        reg.push_back(new MeasurementSchema("v0", INT64, PLAIN, UNCOMPRESSED));
+        writer.register_aligned_timeseries(device, reg);
+
+        // First aligned chunk: 20 timestamps but only every 4th row has a
+        // non-null value column (5 non-nulls).  Flush.
+        for (int i = 0; i < 20; i++) {
+            TsRecord r(static_cast<int64_t>(i), device);
+            DataPoint p("v0");
+            if (i % 4 == 0) p.set_i64(static_cast<int64_t>(i));
+            r.points_.push_back(p);
+            ASSERT_EQ(writer.write_record_aligned(r), E_OK);
+        }
+        ASSERT_EQ(writer.flush(), E_OK);
+
+        // Second aligned chunk: 20 more timestamps, every value non-null
+        // (all 20 non-nulls).
+        for (int i = 20; i < 40; i++) {
+            TsRecord r(static_cast<int64_t>(i), device);
+            DataPoint p("v0");
+            p.set_i64(static_cast<int64_t>(i));
+            r.points_.push_back(p);
+            ASSERT_EQ(writer.write_record_aligned(r), E_OK);
+        }
+        ASSERT_EQ(writer.flush(), E_OK);
+        ASSERT_EQ(writer.close(), E_OK);
+    }
+    g_config_value_.chunk_group_size_threshold_ = prev_chunk_thresh;
+    g_config_value_.record_count_for_next_mem_check_ = prev_record_check;
+
+    // Query with offset=10 — enough to fully cover the first chunk's 5
+    // non-null statistic-reported rows, but NOT enough to cover the
+    // chunk's 20 actual rows.  Under the bug the entire first chunk was
+    // skipped, and offset_=10-5=5 would land 5 rows into the second
+    // chunk, returning rows 25..39 (15 rows).  With the fix the first
+    // chunk is decoded, 10 rows are eaten, leaving rows 10..39 (30 rows).
+    TsFileTreeReader reader;
+    ASSERT_EQ(reader.open(file_name_), E_OK);
+    std::vector<std::string> devices = {"sparse_dev"};
+    std::vector<std::string> measurements = {"v0"};
+    ResultSet* result = nullptr;
+    ASSERT_EQ(reader.queryByRow(devices, measurements, 10, -1, result), E_OK);
+    ASSERT_NE(result, nullptr);
+
+    auto timestamps = collect_timestamps(result);
+    EXPECT_EQ(timestamps.size(), static_cast<size_t>(30));
+    if (timestamps.size() == 30) {
+        for (size_t i = 0; i < timestamps.size(); i++) {
+            EXPECT_EQ(timestamps[i], static_cast<int64_t>(i + 10));
+        }
+    }
+    reader.destroy_query_data_set(result);
+    reader.close();
+}
+
 // Basic test: queryByRow returns correct total count with no offset/limit.
 TEST_F(TreeQueryByRowTest, NoOffsetNoLimit) {
     std::vector<std::string> devices = {"d1"};
@@ -1310,7 +1391,8 @@ TEST_F(TreeQueryByRowTest, MultiPath_TimeHint_SkipsStaleChunk_WithOffset) {
 
 // Pushdown is faster than full query + manual next: queryByRow(offset, limit)
 // skips at Chunk/Page level; old query then manual next decodes every row.
-// Timing tolerance 20% to allow measurement noise.
+// Use the same 50% tolerance as the table-view sibling test for cross-platform
+// timing noise; the test is DISABLED_ and intended for manual runs.
 TEST_F(TreeQueryByRowTest, DISABLED_QueryByRowFasterThanManualNext) {
     std::vector<std::string> devices = {"d1"};
     std::vector<std::string> measurements = {"s1"};
@@ -1320,7 +1402,8 @@ TEST_F(TreeQueryByRowTest, DISABLED_QueryByRowFasterThanManualNext) {
     write_test_file(devices, measurements, num_rows);
 
     const int num_iters = 5;
-    const double tolerance = 0.2;
+    const double tolerance =
+        0.5;  // 50% tolerance for cross-platform timing noise
 
     auto run_query_by_row = [this, &devices, &measurements, offset, limit]() {
         TsFileTreeReader reader;
diff --git a/cpp/test/reader/tsfile_reader_test.cc b/cpp/test/reader/tsfile_reader_test.cc
index 08cda6e31..5f50724c4 100644
--- a/cpp/test/reader/tsfile_reader_test.cc
+++ b/cpp/test/reader/tsfile_reader_test.cc
@@ -29,9 +29,14 @@
 #include "common/record.h"
 #include "common/schema.h"
 #include "common/tablet.h"
+#include "common/tsblock/tsblock.h"
+#include "file/tsfile_io_reader.h"
 #include "file/tsfile_io_writer.h"
 #include "file/write_file.h"
+#include "reader/block/single_device_tsblock_reader.h"
+#include "reader/filter/time_operator.h"
 #include "reader/qds_without_timegenerator.h"
+#include "reader/tsfile_series_scan_iterator.h"
 #include "writer/tsfile_writer.h"
 
 using namespace storage;
@@ -395,3 +400,596 @@ TEST_F(TsFileReaderTest, GetTimeseriesMetadataTableModelTypeAndDeviceFilter) {
 
     reader.close();
 }
+
+static const int64_t kLargeFileNumRecords = 300000000;
+static const int64_t kLargeFileFlushBatch = 100000;
+
+TEST_F(TsFileReaderTest,
+       DISABLED_LargeFileNoEncodingNoCompression_WriteAndRead) {
+    std::string device_path = "device1";
+    std::string measurement_name = "temperature";
+    common::TSDataType data_type = common::TSDataType::INT64;
+    common::TSEncoding encoding = common::TSEncoding::PLAIN;
+    common::CompressionType compression_type =
+        common::CompressionType::UNCOMPRESSED;
+
+    tsfile_writer_->register_timeseries(
+        device_path, storage::MeasurementSchema(measurement_name, data_type,
+                                                encoding, compression_type));
+
+    const int64_t start_time = 1622505600000LL;
+    for (int64_t i = 0; i < kLargeFileNumRecords; ++i) {
+        TsRecord record(start_time + i * 1000, device_path);
+        record.add_point(measurement_name, static_cast<int64_t>(i));
+        ASSERT_EQ(tsfile_writer_->write_record(record), E_OK);
+        if ((i + 1) % kLargeFileFlushBatch == 0) {
+            ASSERT_EQ(tsfile_writer_->flush(), E_OK);
+        }
+    }
+    ASSERT_EQ(tsfile_writer_->flush(), E_OK);
+    ASSERT_EQ(tsfile_writer_->close(), E_OK);
+
+    std::vector<std::string> select_list = {"device1.temperature"};
+    const int64_t end_time = start_time + (kLargeFileNumRecords - 1) * 1000 + 1;
+
+    storage::TsFileReader reader;
+    int ret = reader.open(file_name_);
+    ASSERT_EQ(ret, common::E_OK);
+
+    storage::ResultSet* tmp_qds = nullptr;
+    ret = reader.query(select_list, start_time, end_time, tmp_qds);
+    ASSERT_EQ(ret, common::E_OK);
+    ASSERT_NE(tmp_qds, nullptr);
+
+    auto* qds = static_cast<QDSWithoutTimeGenerator*>(tmp_qds);
+    std::shared_ptr<ResultSetMetadata> meta = qds->get_metadata();
+    ASSERT_NE(meta, nullptr);
+    ASSERT_EQ(meta->get_column_type(1), INT64);
+    ASSERT_EQ(meta->get_column_type(2), INT64);
+
+    int64_t row_count = 0;
+    bool has_next = false;
+
+    while (true) {
+        ret = qds->next(has_next);
+        ASSERT_EQ(ret, common::E_OK);
+        if (!has_next) break;
+        row_count++;
+    }
+
+    ASSERT_EQ(row_count, kLargeFileNumRecords);
+
+    reader.destroy_query_data_set(qds);
+    reader.close();
+}
+
+// Multi-value aligned chunk reader doesn't honour row_offset / row_limit /
+// min_time_hint pushdown — silently dropping those args would hand the caller
+// full-chunk data when it asked for a sub-range.  The guard at the top of
+// AlignedChunkReader::get_next_page must turn the unsupported combination
+// into an explicit E_NOT_SUPPORT.
+TEST_F(TsFileReaderTest, MultiValueAlignedRowOffsetReturnsNotSupport) {
+    const std::string device = "root.dev_multi_offset";
+    std::vector<MeasurementSchema> schema_vec;
+    schema_vec.emplace_back("v0", INT64, PLAIN, UNCOMPRESSED);
+    schema_vec.emplace_back("v1", INT64, PLAIN, UNCOMPRESSED);
+    {
+        std::vector<MeasurementSchema*> reg;
+        for (auto& s : schema_vec) reg.push_back(new MeasurementSchema(s));
+        ASSERT_EQ(tsfile_writer_->register_aligned_timeseries(device, reg),
+                  E_OK);
+    }
+    const int N = 32;
+    Tablet tablet(device,
+                  std::make_shared<std::vector<MeasurementSchema>>(schema_vec),
+                  N);
+    for (int i = 0; i < N; ++i) {
+        ASSERT_EQ(tablet.add_timestamp(i, 1000 + i), E_OK);
+        ASSERT_EQ(tablet.add_value(i, 0u, static_cast<int64_t>(i)), E_OK);
+        ASSERT_EQ(tablet.add_value(i, 1u, static_cast<int64_t>(i * 2)), E_OK);
+    }
+    ASSERT_EQ(tsfile_writer_->write_tablet_aligned(tablet), E_OK);
+    ASSERT_EQ(tsfile_writer_->flush(), E_OK);
+    ASSERT_EQ(tsfile_writer_->close(), E_OK);
+
+    storage::TsFileIOReader io_reader;
+    ASSERT_EQ(io_reader.init(file_name_), E_OK);
+
+    auto device_id = std::make_shared<StringArrayDeviceID>(device);
+    std::vector<std::string> measurements = {"v0", "v1"};
+    storage::TsFileSeriesScanIterator* ssi = nullptr;
+    common::PageArena pa;
+    pa.init(512, common::MOD_DEFAULT);
+    ASSERT_EQ(io_reader.alloc_multi_ssi(device_id, measurements, ssi, pa,
+                                        /*time_filter=*/nullptr),
+              E_OK);
+    ASSERT_NE(ssi, nullptr);
+
+    // row_offset > 0 hits the multi-value guard at the top of
+    // AlignedChunkReader::get_next_page; the SSI propagates the error code.
+    ssi->set_row_range(/*offset=*/5, /*limit=*/-1);
+    common::TsBlock* block = nullptr;
+    EXPECT_EQ(ssi->get_next(block, /*alloc_tsblock=*/true),
+              common::E_NOT_SUPPORT);
+
+    if (block != nullptr) {
+        ssi->revert_tsblock();
+    }
+    io_reader.revert_ssi(ssi);
+    // RAII handles io_reader teardown — explicit reset() would destroy the
+    // tsfile_meta page arena while tsfile_meta_ still holds shared_ptrs into
+    // it, then ~TsFileMeta would call self_deleter on freed memory.
+}
+
+namespace storage {
+// Subclass that lets the test (a) inject an error from the next-tsblock load
+// and (b) wire a manually constructed TsBlock into the inherited iterator
+// fields, so we can exercise the end-of-block branch of skip_rows()
+// deterministically.  The base destructor calls revert_ssi(nullptr), which
+// short-circuits safely; we hand it a default-constructed (never-init'd)
+// TsFileIOReader purely to satisfy the constructor.
+class FaultySingleMeasurementColumnContext
+    : public SingleMeasurementColumnContext {
+   public:
+    using SingleMeasurementColumnContext::SingleMeasurementColumnContext;
+    int get_next_tsblock_ret_ = common::E_OK;
+    int get_next_tsblock_calls_ = 0;
+    int get_next_tsblock(bool /*alloc_mem*/) override {
+        ++get_next_tsblock_calls_;
+        return get_next_tsblock_ret_;
+    }
+    void prime_iters_for_block(common::TsBlock* tsb) {
+        tsblock_ = tsb;
+        time_iter_ = new common::ColIterator(0, tsb);
+        value_iter_ = new common::ColIterator(1, tsb);
+    }
+};
+}  // namespace storage
+
+// Regression: skip_rows() used to be a void method that called
+// get_next_tsblock(false) for its side effects when the current block ran
+// out.  An IO/decode error from that call was silently swallowed and the
+// outer reader treated the source as exhausted, returning fewer rows than
+// requested with no error indication.  skip_rows() now returns int and must
+// surface hard errors (E_NO_MORE_DATA is the legitimate EOF and stays
+// suppressed).
+TEST_F(TsFileReaderTest,
+       SingleMeasurementSkipRowsPropagatesGetNextTsBlockError) {
+    common::TupleDesc desc;
+    desc.push_back(common::ColumnSchema("time", common::INT64,
+                                        common::UNCOMPRESSED, common::PLAIN));
+    desc.push_back(common::ColumnSchema("v0", common::INT64,
+                                        common::UNCOMPRESSED, common::PLAIN));
+    common::TsBlock tsb(&desc, 4);
+    ASSERT_EQ(tsb.init(), common::E_OK);
+    common::RowAppender ra(&tsb);
+    for (int i = 0; i < 2; i++) {
+        ASSERT_TRUE(ra.add_row());
+        int64_t t = 1000 + i;
+        int64_t v = i;
+        ra.append(0, reinterpret_cast<const char*>(&t), sizeof(int64_t));
+        ra.append(1, reinterpret_cast<const char*>(&v), sizeof(int64_t));
+    }
+
+    storage::TsFileIOReader io_reader_stub;
+    storage::FaultySingleMeasurementColumnContext ctx(&io_reader_stub);
+    ctx.prime_iters_for_block(&tsb);
+
+    // Hard error: skip_rows must propagate.
+    ctx.get_next_tsblock_ret_ = common::E_INVALID_ARG;
+    EXPECT_EQ(ctx.skip_rows(2), common::E_INVALID_ARG);
+    EXPECT_EQ(ctx.get_next_tsblock_calls_, 1);
+}
+
+TEST_F(TsFileReaderTest, SingleMeasurementSkipRowsSwallowsEndOfStream) {
+    common::TupleDesc desc;
+    desc.push_back(common::ColumnSchema("time", common::INT64,
+                                        common::UNCOMPRESSED, common::PLAIN));
+    desc.push_back(common::ColumnSchema("v0", common::INT64,
+                                        common::UNCOMPRESSED, common::PLAIN));
+    common::TsBlock tsb(&desc, 4);
+    ASSERT_EQ(tsb.init(), common::E_OK);
+    common::RowAppender ra(&tsb);
+    for (int i = 0; i < 2; i++) {
+        ASSERT_TRUE(ra.add_row());
+        int64_t t = 1000 + i;
+        int64_t v = i;
+        ra.append(0, reinterpret_cast<const char*>(&t), sizeof(int64_t));
+        ra.append(1, reinterpret_cast<const char*>(&v), sizeof(int64_t));
+    }
+
+    storage::TsFileIOReader io_reader_stub;
+    storage::FaultySingleMeasurementColumnContext ctx(&io_reader_stub);
+    ctx.prime_iters_for_block(&tsb);
+
+    // EOF: skip_rows must squash to E_OK so the outer loop notices via
+    // available_rows() instead of bubbling the EOF up as a query failure.
+    ctx.get_next_tsblock_ret_ = common::E_NO_MORE_DATA;
+    EXPECT_EQ(ctx.skip_rows(2), common::E_OK);
+    EXPECT_EQ(ctx.get_next_tsblock_calls_, 1);
+}
+
+// Regression: the multi-value aligned batch loop required the destination
+// TsBlock to have >= BATCH (=129) rows of free capacity, otherwise it
+// returned E_OVERFLOW immediately and the SSI surfaced that error to the
+// caller.  When tsblock_max_memory_ is small enough to land max_row_count_
+// below 129 (e.g. very small per-block memory in low-RAM configs) no rows
+// could ever be decoded.  The fix caps the batch by remaining capacity,
+// matching ChunkReader's per-type batch loops.
+TEST_F(TsFileReaderTest, MultiValueAlignedProgressesWithSmallTsBlock) {
+    const std::string device = "root.dev_multi_small_block";
+    std::vector<MeasurementSchema> schema_vec;
+    schema_vec.emplace_back("v0", INT64, PLAIN, UNCOMPRESSED);
+    schema_vec.emplace_back("v1", INT64, PLAIN, UNCOMPRESSED);
+    {
+        std::vector<MeasurementSchema*> reg;
+        for (auto& s : schema_vec) reg.push_back(new MeasurementSchema(s));
+        ASSERT_EQ(tsfile_writer_->register_aligned_timeseries(device, reg),
+                  E_OK);
+    }
+    const int N = 200;  // > BATCH (129) so the batch loop iterates twice
+    Tablet tablet(device,
+                  std::make_shared<std::vector<MeasurementSchema>>(schema_vec),
+                  N);
+    for (int i = 0; i < N; ++i) {
+        ASSERT_EQ(tablet.add_timestamp(i, 1000 + i), E_OK);
+        ASSERT_EQ(tablet.add_value(i, 0u, static_cast<int64_t>(i)), E_OK);
+        ASSERT_EQ(tablet.add_value(i, 1u, static_cast<int64_t>(i * 2)), E_OK);
+    }
+    ASSERT_EQ(tsfile_writer_->write_tablet_aligned(tablet), E_OK);
+    ASSERT_EQ(tsfile_writer_->flush(), E_OK);
+    ASSERT_EQ(tsfile_writer_->close(), E_OK);
+
+    // Force max_row_count_ below BATCH: ~2 KB / 24 B per row → ~85 rows.
+    // Also force the multi_DECODE_TV_BATCH path by disabling parallel reads:
+    // with a thread pool the chunk-level pre-decode shortcut would otherwise
+    // run for any multi-column query (no upper column-count cutoff anymore).
+    uint32_t prev_capacity = common::g_config_value_.tsblock_max_memory_;
+    bool prev_parallel = common::g_config_value_.parallel_read_enabled_;
+    struct Guard {
+        uint32_t cap;
+        bool par;
+        ~Guard() {
+            common::g_config_value_.tsblock_max_memory_ = cap;
+            common::g_config_value_.parallel_read_enabled_ = par;
+        }
+    } guard{prev_capacity, prev_parallel};
+    common::g_config_value_.tsblock_max_memory_ = 2048;
+    common::g_config_value_.parallel_read_enabled_ = false;
+
+    storage::TsFileIOReader io_reader;
+    ASSERT_EQ(io_reader.init(file_name_), E_OK);
+
+    auto device_id = std::make_shared<StringArrayDeviceID>(device);
+    std::vector<std::string> measurements = {"v0", "v1"};
+    storage::TsFileSeriesScanIterator* ssi = nullptr;
+    common::PageArena pa;
+    pa.init(512, common::MOD_TSFILE_READER);
+    ASSERT_EQ(io_reader.alloc_multi_ssi(device_id, measurements, ssi, pa,
+                                        /*time_filter=*/nullptr),
+              E_OK);
+    ASSERT_NE(ssi, nullptr);
+
+    int collected = 0;
+    while (true) {
+        common::TsBlock* block = nullptr;
+        int ret = ssi->get_next(block, /*alloc_tsblock=*/true);
+        if (ret == common::E_NO_MORE_DATA) break;
+        ASSERT_EQ(ret, common::E_OK);
+        ASSERT_NE(block, nullptr);
+        ASSERT_GT(block->get_max_row_count(), 0u);
+        ASSERT_LT(block->get_max_row_count(), 129u);
+        collected += static_cast<int>(block->get_row_count());
+        ssi->revert_tsblock();
+    }
+    EXPECT_EQ(collected, N);
+
+    io_reader.revert_ssi(ssi);
+}
+
+// Regression: when a whole batch is filtered out, multi_DECODE_TV_BATCH skips
+// the non-null value bytes for each column.  The old code ignored the skip
+// return code and the `skipped` count, so a short/truncated page could leave
+// the decoder mid-value; subsequent batches would then read garbage bytes as
+// values.  This test exercises an intact page: the filter rejects rows
+// 0..127 (one full batch worth), then the rows after must come back with
+// their *correct* values — proving the decoder advanced exactly nonnull_count
+// values, not some smaller number that would shift the value alignment.
+TEST_F(TsFileReaderTest, MultiValueAlignedSkipsBatchPreservesValueAlignment) {
+    const std::string device = "root.dev_multi_skip_align";
+    std::vector<MeasurementSchema> schema_vec;
+    schema_vec.emplace_back("v0", INT64, PLAIN, UNCOMPRESSED);
+    schema_vec.emplace_back("v1", INT64, PLAIN, UNCOMPRESSED);
+    {
+        std::vector<MeasurementSchema*> reg;
+        for (auto& s : schema_vec) reg.push_back(new MeasurementSchema(s));
+        ASSERT_EQ(tsfile_writer_->register_aligned_timeseries(device, reg),
+                  E_OK);
+    }
+    // Two batches' worth of rows so the filter skips the first batch entirely
+    // and decodes the second.
+    const int N = 200;
+    Tablet tablet(device,
+                  std::make_shared<std::vector<MeasurementSchema>>(schema_vec),
+                  N);
+    for (int i = 0; i < N; ++i) {
+        // Distinctive value pattern: i and 1000000 + i.  If skip
+        // mis-advances the decoder by even one value, the v0/v1 read after
+        // the skip will land on the wrong row's bytes.
+        ASSERT_EQ(tablet.add_timestamp(i, static_cast<int64_t>(i)), E_OK);
+        ASSERT_EQ(tablet.add_value(i, 0u, static_cast<int64_t>(i)), E_OK);
+        ASSERT_EQ(tablet.add_value(i, 1u, static_cast<int64_t>(1000000 + i)),
+                  E_OK);
+    }
+    ASSERT_EQ(tsfile_writer_->write_tablet_aligned(tablet), E_OK);
+    ASSERT_EQ(tsfile_writer_->flush(), E_OK);
+    ASSERT_EQ(tsfile_writer_->close(), E_OK);
+
+    bool prev_parallel = common::g_config_value_.parallel_read_enabled_;
+    struct Guard {
+        bool par;
+        ~Guard() { common::g_config_value_.parallel_read_enabled_ = par; }
+    } guard{prev_parallel};
+    // Force the multi_DECODE_TV_BATCH path (the chunk-level shortcut would
+    // bypass the skip branch we want to exercise).
+    common::g_config_value_.parallel_read_enabled_ = false;
+
+    storage::TsFileIOReader io_reader;
+    ASSERT_EQ(io_reader.init(file_name_), E_OK);
+
+    auto device_id = std::make_shared<StringArrayDeviceID>(device);
+    std::vector<std::string> measurements = {"v0", "v1"};
+    storage::TsFileSeriesScanIterator* ssi = nullptr;
+    common::PageArena pa;
+    pa.init(512, common::MOD_TSFILE_READER);
+
+    // TimeIn filter selecting only rows 130..139 — entirely past the first
+    // 129-row batch, so the first batch hits the pass_count==0 skip branch
+    // for both value columns.
+    std::vector<int64_t> want;
+    for (int i = 130; i < 140; ++i) want.push_back(i);
+    storage::TimeIn time_filter(want, /*not_in=*/false);
+
+    ASSERT_EQ(io_reader.alloc_multi_ssi(device_id, measurements, ssi, pa,
+                                        &time_filter),
+              E_OK);
+    ASSERT_NE(ssi, nullptr);
+
+    std::vector<std::pair<int64_t, int64_t>> got;
+    while (true) {
+        common::TsBlock* block = nullptr;
+        int ret = ssi->get_next(block, /*alloc_tsblock=*/true, &time_filter);
+        if (ret == common::E_NO_MORE_DATA) break;
+        ASSERT_EQ(ret, common::E_OK);
+        ASSERT_NE(block, nullptr);
+        // Columns: time, v0, v1.
+        common::ColIterator t_iter(0, block);
+        common::ColIterator v0_iter(1, block);
+        common::ColIterator v1_iter(2, block);
+        const uint32_t rows = block->get_row_count();
+        for (uint32_t r = 0; r < rows; ++r) {
+            uint32_t len = 0;
+            int64_t t = *reinterpret_cast<int64_t*>(t_iter.read(&len));
+            int64_t v0 = *reinterpret_cast<int64_t*>(v0_iter.read(&len));
+            int64_t v1 = *reinterpret_cast<int64_t*>(v1_iter.read(&len));
+            got.push_back({t, v0});
+            // The decoder must have advanced exactly nonnull_count values
+            // when it skipped batch #1.  If it under-advanced (the latent
+            // bug), v1 would land on the wrong row's bytes here.
+            EXPECT_EQ(v1, 1000000 + t);
+            EXPECT_EQ(v0, t);
+            t_iter.next();
+            v0_iter.next();
+            v1_iter.next();
+        }
+        ssi->revert_tsblock();
+    }
+
+    ASSERT_EQ(got.size(), want.size());
+    for (size_t i = 0; i < got.size(); ++i) {
+        EXPECT_EQ(got[i].first, want[i]);
+        EXPECT_EQ(got[i].second, want[i]);
+    }
+
+    io_reader.revert_ssi(ssi);
+}
+
+// Coverage: an aligned read with > 6 value columns now takes the chunk-level
+// parallel decode path (decode_all_planned_pages) exactly like the 2..6 column
+// case — the old "<= 6 columns" dispatch cutoff that sent wide chunks down the
+// per-page serial path is gone.  With libtsfile_init() having built the global
+// pool and parallel_read_enabled_ on by default, an 8-column query exercises
+// that path end-to-end; each column carries a disjoint value range so any
+// cross-column misalignment in the wide chunk-level decode would be caught.
+TEST_F(TsFileReaderTest, MultiValueAlignedWideChunkParallelDecode) {
+    const std::string device = "root.dev_multi_wide";
+    const uint32_t kCols = 8;  // > 6: previously bypassed the chunk-level path
+    std::vector<MeasurementSchema> schema_vec;
+    for (uint32_t c = 0; c < kCols; ++c) {
+        schema_vec.emplace_back("v" + std::to_string(c), INT64, PLAIN,
+                                UNCOMPRESSED);
+    }
+    {
+        std::vector<MeasurementSchema*> reg;
+        for (auto& s : schema_vec) reg.push_back(new MeasurementSchema(s));
+        ASSERT_EQ(tsfile_writer_->register_aligned_timeseries(device, reg),
+                  E_OK);
+    }
+    const int N = 200;  // > BATCH (129) so the decode loop iterates more once
+    Tablet tablet(device,
+                  std::make_shared<std::vector<MeasurementSchema>>(schema_vec),
+                  N);
+    // Row i, column c carries c * 1000000 + i so each column's values occupy a
+    // disjoint range; a wide-chunk decode that crossed column boundaries would
+    // surface as a value landing in the wrong column's range.
+    for (int i = 0; i < N; ++i) {
+        ASSERT_EQ(tablet.add_timestamp(i, 1000 + i), E_OK);
+        for (uint32_t c = 0; c < kCols; ++c) {
+            ASSERT_EQ(
+                tablet.add_value(i, c, static_cast<int64_t>(c * 1000000 + i)),
+                E_OK);
+        }
+    }
+    ASSERT_EQ(tsfile_writer_->write_tablet_aligned(tablet), E_OK);
+    ASSERT_EQ(tsfile_writer_->flush(), E_OK);
+    ASSERT_EQ(tsfile_writer_->close(), E_OK);
+
+    // parallel_read_enabled_ defaults to true and SetUp() ran libtsfile_init(),
+    // so the SSI hands the AlignedChunkReader the global pool; with 8 value
+    // columns (> 1) the reader takes the chunk-level decode path.
+    ASSERT_TRUE(common::g_config_value_.parallel_read_enabled_);
+
+    storage::TsFileIOReader io_reader;
+    ASSERT_EQ(io_reader.init(file_name_), E_OK);
+
+    auto device_id = std::make_shared<StringArrayDeviceID>(device);
+    std::vector<std::string> measurements;
+    for (uint32_t c = 0; c < kCols; ++c)
+        measurements.push_back("v" + std::to_string(c));
+    storage::TsFileSeriesScanIterator* ssi = nullptr;
+    common::PageArena pa;
+    pa.init(512, common::MOD_TSFILE_READER);
+    ASSERT_EQ(io_reader.alloc_multi_ssi(device_id, measurements, ssi, pa,
+                                        /*time_filter=*/nullptr),
+              E_OK);
+    ASSERT_NE(ssi, nullptr);
+
+    int collected = 0;
+    while (true) {
+        common::TsBlock* block = nullptr;
+        int ret = ssi->get_next(block, /*alloc_tsblock=*/true);
+        if (ret == common::E_NO_MORE_DATA) break;
+        ASSERT_EQ(ret, common::E_OK);
+        ASSERT_NE(block, nullptr);
+        const uint32_t rows = block->get_row_count();
+
+        common::ColIterator t_iter(0, block);
+        std::vector<int64_t> times;
+        times.reserve(rows);
+        for (uint32_t r = 0; r < rows; ++r) {
+            uint32_t len = 0;
+            times.push_back(*reinterpret_cast<int64_t*>(t_iter.read(&len)));
+            t_iter.next();
+        }
+        // One independent iterator per value column so we never rely on
+        // vector<ColIterator> being movable.
+        for (uint32_t c = 0; c < kCols; ++c) {
+            common::ColIterator it(c + 1, block);
+            for (uint32_t r = 0; r < rows; ++r) {
+                uint32_t len = 0;
+                int64_t v = *reinterpret_cast<int64_t*>(it.read(&len));
+                int64_t i = times[r] - 1000;  // timestamp == 1000 + i
+                EXPECT_EQ(v, static_cast<int64_t>(c) * 1000000 + i);
+                it.next();
+            }
+        }
+        collected += static_cast<int>(rows);
+        ssi->revert_tsblock();
+    }
+    EXPECT_EQ(collected, N);
+
+    io_reader.revert_ssi(ssi);
+}
+
+// Regression: AlignedTimeseriesIndex::get_data_type() returns the time column
+// type (VECTOR), which the schema accessor used to surface verbatim — every
+// aligned column came back as VECTOR instead of its real INT32/FLOAT/etc.
+// type.  get_timeseries_schema() now unwraps AlignedTimeseriesIndex to read
+// value_ts_idx_->get_data_type() like the develop branch did.
+TEST_F(TsFileReaderTest, AlignedSchemaReportsValueDataType) {
+    const std::string device = "root.dev_aligned_schema";
+    std::vector<MeasurementSchema> schema_vec;
+    schema_vec.emplace_back("v_i32", INT32, PLAIN, UNCOMPRESSED);
+    schema_vec.emplace_back("v_dbl", DOUBLE, PLAIN, UNCOMPRESSED);
+    {
+        std::vector<MeasurementSchema*> reg;
+        for (auto& s : schema_vec) reg.push_back(new MeasurementSchema(s));
+        ASSERT_EQ(tsfile_writer_->register_aligned_timeseries(device, reg),
+                  E_OK);
+    }
+    const int N = 8;
+    Tablet tablet(device,
+                  std::make_shared<std::vector<MeasurementSchema>>(schema_vec),
+                  N);
+    for (int i = 0; i < N; ++i) {
+        ASSERT_EQ(tablet.add_timestamp(i, 1000 + i), E_OK);
+        ASSERT_EQ(tablet.add_value(i, 0u, static_cast<int32_t>(i)), E_OK);
+        ASSERT_EQ(tablet.add_value(i, 1u, static_cast<double>(i) * 0.5), E_OK);
+    }
+    ASSERT_EQ(tsfile_writer_->write_tablet_aligned(tablet), E_OK);
+    ASSERT_EQ(tsfile_writer_->flush(), E_OK);
+    ASSERT_EQ(tsfile_writer_->close(), E_OK);
+
+    storage::TsFileReader reader;
+    ASSERT_EQ(reader.open(file_name_), E_OK);
+
+    auto device_id = std::make_shared<StringArrayDeviceID>(device);
+    std::vector<MeasurementSchema> schemas;
+    ASSERT_EQ(reader.get_timeseries_schema(device_id, schemas), E_OK);
+    ASSERT_EQ(schemas.size(), 2u);
+
+    // Match by name — IO reader iteration order isn't part of the contract.
+    common::TSDataType i32_type = common::INVALID_DATATYPE;
+    common::TSDataType dbl_type = common::INVALID_DATATYPE;
+    for (const auto& s : schemas) {
+        if (s.measurement_name_ == "v_i32") i32_type = s.data_type_;
+        if (s.measurement_name_ == "v_dbl") dbl_type = s.data_type_;
+    }
+    EXPECT_EQ(i32_type, INT32);
+    EXPECT_EQ(dbl_type, DOUBLE);
+    reader.close();
+}
+
+namespace storage {
+class TsFileReaderMetaArenaTest {
+   public:
+    static int64_t arena_used(const storage::TsFileReader& r) {
+        return r.tsfile_reader_meta_pa_.get_total_used_bytes();
+    }
+};
+}  // namespace storage
+
+// Regression: tsfile_reader_meta_pa_ used to be re-initialised at the start
+// of each get_timeseries_metadata() call.  When that reset was removed,
+// every call accumulated another copy of the per-device meta into the same
+// arena, so a long-lived reader that polled metadata kept growing memory
+// without bound.  Re-init now happens at the top of both overloads; verify
+// arena usage stays flat across repeated calls instead of growing linearly.
+TEST_F(TsFileReaderTest, RepeatedGetTimeseriesMetadataDoesNotLeakArena) {
+    const std::string device = "root.dev_arena_growth";
+    {
+        std::vector<MeasurementSchema*> reg;
+        reg.push_back(new MeasurementSchema("v0", INT64, PLAIN, UNCOMPRESSED));
+        ASSERT_EQ(tsfile_writer_->register_aligned_timeseries(device, reg),
+                  E_OK);
+    }
+    TsRecord r(1000, device);
+    r.points_.emplace_back("v0", static_cast<int64_t>(0));
+    ASSERT_EQ(tsfile_writer_->write_record_aligned(r), E_OK);
+    ASSERT_EQ(tsfile_writer_->flush(), E_OK);
+    ASSERT_EQ(tsfile_writer_->close(), E_OK);
+
+    storage::TsFileReader reader;
+    ASSERT_EQ(reader.open(file_name_), E_OK);
+    std::vector<std::shared_ptr<IDeviceID>> ids = {
+        std::make_shared<StringArrayDeviceID>(device)};
+
+    // Prime the arena and capture the steady-state size.
+    (void)reader.get_timeseries_metadata(ids);
+    const int64_t after_one =
+        storage::TsFileReaderMetaArenaTest::arena_used(reader);
+    ASSERT_GT(after_one, 0);
+
+    for (int i = 0; i < 10; ++i) {
+        (void)reader.get_timeseries_metadata(ids);
+    }
+    const int64_t after_eleven =
+        storage::TsFileReaderMetaArenaTest::arena_used(reader);
+    // Without the fix, after_eleven ≈ 11 × after_one.  With the fix it
+    // should equal after_one (arena reset before each call).  Allow a small
+    // slack for arena page rounding, but reject anything close to 2× growth.
+    EXPECT_LT(after_eleven, after_one * 2)
+        << "arena grew from " << after_one << " to " << after_eleven
+        << " across 11 calls — reset on entry is missing";
+    reader.close();
+}
diff --git a/cpp/test/writer/table_view/tsfile_writer_table_test.cc b/cpp/test/writer/table_view/tsfile_writer_table_test.cc
index d1f3b92e4..0dfaccc06 100644
--- a/cpp/test/writer/table_view/tsfile_writer_table_test.cc
+++ b/cpp/test/writer/table_view/tsfile_writer_table_test.cc
@@ -20,7 +20,6 @@
 
 #include <random>
 
-#include "common/global.h"
 #include "common/record.h"
 #include "common/schema.h"
 #include "common/tablet.h"
@@ -32,11 +31,10 @@
 using namespace storage;
 using namespace common;
 
-class TsFileWriterTableTest : public ::testing::TestWithParam<bool> {
+class TsFileWriterTableTest : public ::testing::Test {
    protected:
     void SetUp() override {
         libtsfile_init();
-        set_parallel_write_enabled(GetParam());
         file_name_ = std::string("tsfile_writer_table_test_") +
                      generate_random_string(10) + std::string(".tsfile");
         remove(file_name_.c_str());
@@ -135,7 +133,7 @@ class TsFileWriterTableTest : public ::testing::TestWithParam<bool> {
     }
 };
 
-TEST_P(TsFileWriterTableTest, WriteTableTest) {
+TEST_F(TsFileWriterTableTest, WriteTableTest) {
     auto table_schema = gen_table_schema(0);
     auto tsfile_table_writer_ =
         std::make_shared<TsFileTableWriter>(&write_file_, table_schema);
@@ -146,7 +144,7 @@ TEST_P(TsFileWriterTableTest, WriteTableTest) {
     delete table_schema;
 }
 
-TEST_P(TsFileWriterTableTest, WithoutTagAndMultiPage) {
+TEST_F(TsFileWriterTableTest, WithoutTagAndMultiPage) {
     std::vector<MeasurementSchema*> measurement_schemas;
     std::vector<ColumnCategory> column_categories;
     measurement_schemas.resize(1);
@@ -194,7 +192,7 @@ TEST_P(TsFileWriterTableTest, WithoutTagAndMultiPage) {
     delete table_schema;
 }
 
-TEST_P(TsFileWriterTableTest, WriteDisorderTest) {
+TEST_F(TsFileWriterTableTest, WriteDisorderTest) {
     auto table_schema = gen_table_schema(0);
     auto tsfile_table_writer_ =
         std::make_shared<TsFileTableWriter>(&write_file_, table_schema);
@@ -239,12 +237,13 @@ TEST_P(TsFileWriterTableTest, WriteDisorderTest) {
 
     ASSERT_EQ(tsfile_table_writer_->write_table(tablet),
               common::E_OUT_OF_ORDER);
-    ASSERT_EQ(tsfile_table_writer_->flush(), common::E_OK);
-    ASSERT_EQ(tsfile_table_writer_->close(), common::E_OK);
+    ASSERT_EQ(tsfile_table_writer_->flush(), common::E_DATA_INCONSISTENCY);
+    ASSERT_EQ(tsfile_table_writer_->close(), common::E_DATA_INCONSISTENCY);
+    ASSERT_EQ(tsfile_table_writer_->close(), common::E_DATA_INCONSISTENCY);
     delete table_schema;
 }
 
-TEST_P(TsFileWriterTableTest, WriteTableTestMultiFlush) {
+TEST_F(TsFileWriterTableTest, WriteTableTestMultiFlush) {
     auto table_schema = gen_table_schema(0);
     auto tsfile_table_writer_ = std::make_shared<TsFileTableWriter>(
         &write_file_, table_schema, 2 * 1024);
@@ -257,7 +256,7 @@ TEST_P(TsFileWriterTableTest, WriteTableTestMultiFlush) {
     delete table_schema;
 }
 
-TEST_P(TsFileWriterTableTest, WriteNonExistColumnTest) {
+TEST_F(TsFileWriterTableTest, WriteNonExistColumnTest) {
     auto table_schema = gen_table_schema(0);
     auto tsfile_table_writer_ =
         std::make_shared<TsFileTableWriter>(&write_file_, table_schema);
@@ -285,7 +284,7 @@ TEST_P(TsFileWriterTableTest, WriteNonExistColumnTest) {
     delete table_schema;
 }
 
-TEST_P(TsFileWriterTableTest, WriteNonExistTableTest) {
+TEST_F(TsFileWriterTableTest, WriteNonExistTableTest) {
     auto table_schema = gen_table_schema(0);
     auto tsfile_table_writer_ =
         std::make_shared<TsFileTableWriter>(&write_file_, table_schema);
@@ -297,7 +296,7 @@ TEST_P(TsFileWriterTableTest, WriteNonExistTableTest) {
     delete table_schema;
 }
 
-TEST_P(TsFileWriterTableTest, WriterWithMemoryThreshold) {
+TEST_F(TsFileWriterTableTest, WriterWithMemoryThreshold) {
     auto table_schema = gen_table_schema(0);
     auto tsfile_table_writer_ = std::make_shared<TsFileTableWriter>(
         &write_file_, table_schema, 256 * 1024 * 1024);
@@ -307,7 +306,7 @@ TEST_P(TsFileWriterTableTest, WriterWithMemoryThreshold) {
     delete table_schema;
 }
 
-TEST_P(TsFileWriterTableTest, EmptyTagWrite) {
+TEST_F(TsFileWriterTableTest, EmptyTagWrite) {
     std::vector<MeasurementSchema*> measurement_schemas;
     std::vector<ColumnCategory> column_categories;
     measurement_schemas.resize(3);
@@ -363,7 +362,7 @@ TEST_P(TsFileWriterTableTest, EmptyTagWrite) {
     delete table_schema;
 }
 
-TEST_P(TsFileWriterTableTest, WritehDataTypeMisMatch) {
+TEST_F(TsFileWriterTableTest, WritehDataTypeMisMatch) {
     auto table_schema = gen_table_schema(0);
     auto tsfile_table_writer_ = std::make_shared<TsFileTableWriter>(
         &write_file_, table_schema, 256 * 1024 * 1024);
@@ -414,7 +413,7 @@ TEST_P(TsFileWriterTableTest, WritehDataTypeMisMatch) {
     tsfile_table_writer_->close();
 }
 
-TEST_P(TsFileWriterTableTest, WriteAndReadSimple) {
+TEST_F(TsFileWriterTableTest, WriteAndReadSimple) {
     std::vector<MeasurementSchema*> measurement_schemas;
     std::vector<ColumnCategory> column_categories;
     measurement_schemas.resize(2);
@@ -469,7 +468,7 @@ TEST_P(TsFileWriterTableTest, WriteAndReadSimple) {
     delete table_schema;
 }
 
-TEST_P(TsFileWriterTableTest, DuplicateColumnName) {
+TEST_F(TsFileWriterTableTest, DuplicateColumnName) {
     std::vector<MeasurementSchema*> measurement_schemas;
     std::vector<ColumnCategory> column_categories;
     measurement_schemas.resize(3);
@@ -507,7 +506,7 @@ TEST_P(TsFileWriterTableTest, DuplicateColumnName) {
     delete table_schema;
 }
 
-TEST_P(TsFileWriterTableTest, WriteWithNullAndEmptyTag) {
+TEST_F(TsFileWriterTableTest, WriteWithNullAndEmptyTag) {
     std::vector<MeasurementSchema*> measurement_schemas;
     std::vector<ColumnCategory> column_categories;
     for (int i = 0; i < 3; i++) {
@@ -639,7 +638,7 @@ TEST_P(TsFileWriterTableTest, WriteWithNullAndEmptyTag) {
     ASSERT_EQ(reader.close(), common::E_OK);
 }
 
-TEST_P(TsFileWriterTableTest, MultiDeviceMultiFields) {
+TEST_F(TsFileWriterTableTest, MultiDeviceMultiFields) {
     common::config_set_max_degree_of_index_node(5);
     auto table_schema = gen_table_schema(0, 1, 100);
     auto tsfile_table_writer_ =
@@ -698,7 +697,7 @@ TEST_P(TsFileWriterTableTest, MultiDeviceMultiFields) {
     delete table_schema;
 }
 
-TEST_P(TsFileWriterTableTest, WriteDataWithEmptyField) {
+TEST_F(TsFileWriterTableTest, WriteDataWithEmptyField) {
     std::vector<MeasurementSchema*> measurement_schemas;
     std::vector<ColumnCategory> column_categories;
     for (int i = 0; i < 3; i++) {
@@ -775,7 +774,7 @@ TEST_P(TsFileWriterTableTest, WriteDataWithEmptyField) {
     ASSERT_EQ(reader.close(), common::E_OK);
 }
 
-TEST_P(TsFileWriterTableTest, MultiDatatypes) {
+TEST_F(TsFileWriterTableTest, MultiDatatypes) {
     std::vector<MeasurementSchema*> measurement_schemas;
     std::vector<ColumnCategory> column_categories;
 
@@ -879,7 +878,7 @@ TEST_P(TsFileWriterTableTest, MultiDatatypes) {
     delete[] literal;
 }
 
-TEST_P(TsFileWriterTableTest, DiffCodecTypes) {
+TEST_F(TsFileWriterTableTest, DiffCodecTypes) {
     std::vector<MeasurementSchema*> measurement_schemas;
     std::vector<ColumnCategory> column_categories;
 
@@ -987,7 +986,7 @@ TEST_P(TsFileWriterTableTest, DiffCodecTypes) {
     delete[] literal;
 }
 
-TEST_P(TsFileWriterTableTest, EncodingConfigIntegration) {
+TEST_F(TsFileWriterTableTest, EncodingConfigIntegration) {
     // 1. Test setting global compression type
     ASSERT_EQ(E_OK, set_global_compression(SNAPPY));
 
@@ -1100,7 +1099,7 @@ TEST_P(TsFileWriterTableTest, EncodingConfigIntegration) {
 }
 
 #ifdef ENABLE_MEM_STAT
-TEST_P(TsFileWriterTableTest, DISABLED_MemStatWriteAndVerify) {
+TEST_F(TsFileWriterTableTest, DISABLED_MemStatWriteAndVerify) {
     TableSchema* table_schema = gen_table_schema(0, 2, 3);
     auto tsfile_table_writer =
         std::make_shared<TsFileTableWriter>(&write_file_, table_schema);
@@ -1175,8 +1174,3 @@ TEST_P(TsFileWriterTableTest, DISABLED_MemStatWriteAndVerify) {
     delete table_schema;
 }
 #endif
-
-INSTANTIATE_TEST_SUITE_P(Serial, TsFileWriterTableTest,
-                         ::testing::Values(false));
-INSTANTIATE_TEST_SUITE_P(Parallel, TsFileWriterTableTest,
-                         ::testing::Values(true));
\ No newline at end of file
diff --git a/cpp/test/writer/tsfile_writer_test.cc b/cpp/test/writer/tsfile_writer_test.cc
index 139761380..62d5167f3 100644
--- a/cpp/test/writer/tsfile_writer_test.cc
+++ b/cpp/test/writer/tsfile_writer_test.cc
@@ -20,12 +20,15 @@
 
 #include <gtest/gtest.h>
 
+#include <cstring>
+#include <fstream>
 #include <random>
 
 #include "common/path.h"
 #include "common/record.h"
 #include "common/schema.h"
 #include "common/tablet.h"
+#include "common/tsfile_common.h"
 #include "file/tsfile_io_writer.h"
 #include "file/write_file.h"
 #include "reader/qds_without_timegenerator.h"
@@ -618,6 +621,74 @@ TEST_F(TsFileWriterTest, WriteMultipleTabletsDouble) {
     ASSERT_EQ(tsfile_writer_->close(), E_OK);
 }
 
+// Regression: write_column() is the null fallback of the non-aligned batch
+// path (write_column_batch -> has_null -> write_column).  It used to handle
+// only BOOLEAN/INT32/INT64/FLOAT/DOUBLE/STRING and ASSERT(false) otherwise;
+// in NDEBUG that assert is a no-op, so a non-aligned TEXT/BLOB/DATE/TIMESTAMP
+// column that contained a null silently dropped every row of that column.
+// This writes a TEXT column with a null in the middle and verifies the two
+// non-null rows survive the round trip.
+TEST_F(TsFileWriterTest, NonAlignedTextColumnWithNullIsNotDropped) {
+    // Non-const: storage::Path's ctor takes non-const std::string&.
+    std::string device = "root.dev_text_null";
+    std::string measure = "s_text";
+    tsfile_writer_->register_timeseries(
+        device, MeasurementSchema(measure, common::TSDataType::TEXT,
+                                  common::TSEncoding::PLAIN,
+                                  common::CompressionType::UNCOMPRESSED));
+
+    std::vector<MeasurementSchema> schema_vec;
+    schema_vec.emplace_back(measure, common::TSDataType::TEXT,
+                            common::TSEncoding::PLAIN,
+                            common::CompressionType::UNCOMPRESSED);
+    const int max_rows = 3;
+    storage::Tablet tablet(
+        device, std::make_shared<std::vector<MeasurementSchema>>(schema_vec),
+        max_rows);
+    for (int row = 0; row < max_rows; row++) {
+        ASSERT_EQ(tablet.add_timestamp(row, 1000 + row), E_OK);
+    }
+    // Rows 0 and 2 get values; row 1 is left untouched, so its not-null bit
+    // stays set (default) — that is the null that forces the write_column
+    // fallback.
+    char buf0[] = "v0";
+    char buf2[] = "v2";
+    String s0(buf0, 2), s2(buf2, 2);
+    ASSERT_EQ(tablet.add_value(0, 0u, s0), E_OK);
+    ASSERT_EQ(tablet.add_value(2, 0u, s2), E_OK);
+    ASSERT_EQ(tsfile_writer_->write_tablet(tablet), E_OK);
+    ASSERT_EQ(tsfile_writer_->flush(), E_OK);
+    ASSERT_EQ(tsfile_writer_->close(), E_OK);
+
+    storage::TsFileReader reader;
+    ASSERT_EQ(reader.open(file_name_), E_OK);
+    std::vector<storage::Path> select_list{storage::Path(device, measure)};
+    storage::QueryExpression* query_expr =
+        storage::QueryExpression::create(select_list, nullptr);
+    storage::ResultSet* tmp_qds = nullptr;
+    ASSERT_EQ(reader.query(query_expr, tmp_qds), E_OK);
+    auto* qds = (QDSWithoutTimeGenerator*)tmp_qds;
+
+    // The regression signal is row survival: before the fix write_column hit
+    // ASSERT(false) on TEXT (a no-op in NDEBUG), so the column was dropped and
+    // this query returned 0 rows.  TEXT shares the identical (proven) string
+    // write path as STRING, so the two surviving rows at the right timestamps
+    // confirm the fix.  field(1) is the value column, but field(0) is non-null
+    // here too — the result row carries the timestamp as field(0).
+    std::vector<int64_t> times;
+    bool has_next = false;
+    while (IS_SUCC(qds->next(has_next)) && has_next) {
+        storage::RowRecord* rec = qds->get_row_record();
+        times.push_back(rec->get_timestamp());
+    }
+    reader.destroy_query_data_set(qds);
+    reader.close();
+
+    ASSERT_EQ(times.size(), 2u);
+    EXPECT_EQ(times[0], 1000);
+    EXPECT_EQ(times[1], 1002);
+}
+
 TEST_F(TsFileWriterTest, FlushMultipleDevice) {
     const int device_num = 50;
     const int measurement_num = 50;
@@ -699,6 +770,22 @@ TEST_F(TsFileWriterTest, FlushMultipleDevice) {
 }
 
 TEST_F(TsFileWriterTest, AnalyzeTsfileForload) {
+    // estimate_max_mem_size() now reflects the real 64 KiB-page footprint of
+    // each per-measurement output stream.  50 devices × 50 measurements ×
+    // 2 streams × 64 KiB = ~320 MiB, well past the 128 MiB default
+    // chunk_group_size_threshold_ — without raising the cap the auto-flush
+    // would fire mid-write and the post-write hasData() check below would
+    // observe a freshly drained chunk writer.  Lift the cap for the
+    // duration of this smoke test so the original semantics still apply.
+    uint32_t prev_threshold =
+        common::g_config_value_.chunk_group_size_threshold_;
+    struct Guard {
+        uint32_t prev;
+        ~Guard() { common::g_config_value_.chunk_group_size_threshold_ = prev; }
+    } guard{prev_threshold};
+    common::g_config_value_.chunk_group_size_threshold_ =
+        2ULL * 1024 * 1024 * 1024;
+
     const int device_num = 50;
     const int measurement_num = 50;
     const int max_rows = 100;
@@ -1070,6 +1157,214 @@ TEST_F(TsFileWriterTest, AlignedSealSync_ValueMemoryFirst) {
     ASSERT_EQ(reader.close(), E_OK);
 }
 
+// Regression: write_tablet_aligned() writes the entire time column first and
+// then each value column. With memory-based auto-seal still active, a large
+// STRING value column hits the memory threshold mid-batch (say at row 5),
+// while the INT64 time column does not seal until row page_writer_max_point
+// is reached.  Those divergent seals stamp misaligned page boundaries onto
+// the file and read-back returns wrong values per row.  Suppressing
+// memory-driven seals during the batch should keep all pages count-aligned.
+TEST_F(TsFileWriterTest, AlignedSealSync_TabletLargeStringValueMemoryFirst) {
+    uint32_t prev_pt = g_config_value_.page_writer_max_point_num_;
+    uint32_t prev_mem = g_config_value_.page_writer_max_memory_bytes_;
+    struct Guard {
+        uint32_t pt, mem;
+        ~Guard() {
+            g_config_value_.page_writer_max_point_num_ = pt;
+            g_config_value_.page_writer_max_memory_bytes_ = mem;
+        }
+    } guard{prev_pt, prev_mem};
+    // Big point cap, tiny memory cap: time chunk (INT64 PLAIN, 8B/point) never
+    // hits memory before it reaches the point cap, while the STRING value
+    // chunk crosses the memory threshold within a handful of rows.
+    g_config_value_.page_writer_max_point_num_ = 10000;
+    g_config_value_.page_writer_max_memory_bytes_ = 512;
+
+    std::string device_name = "device_tablet_str";
+    std::vector<MeasurementSchema> schema_vec;
+    schema_vec.emplace_back("s0", INT64, PLAIN, UNCOMPRESSED);
+    schema_vec.emplace_back("s1", STRING, PLAIN, UNCOMPRESSED);
+    schema_vec.emplace_back("s2", INT64, PLAIN, UNCOMPRESSED);
+    {
+        std::vector<MeasurementSchema*> reg;
+        for (auto& s : schema_vec) reg.push_back(new MeasurementSchema(s));
+        tsfile_writer_->register_aligned_timeseries(device_name, reg);
+    }
+
+    const int row_num = 200;
+    Tablet tablet(device_name,
+                  std::make_shared<std::vector<MeasurementSchema>>(schema_vec),
+                  row_num);
+    char* long_buf = new char[101];
+    memset(long_buf, 'A', 100);
+    long_buf[100] = '\0';
+    common::String str_val(long_buf, 100);
+    for (int i = 0; i < row_num; ++i) {
+        ASSERT_EQ(tablet.add_timestamp(i, 1622505600000 + i), E_OK);
+        ASSERT_EQ(tablet.add_value(i, 0u, static_cast<int64_t>(i)), E_OK);
+        // Sparse string column: every third row is null so we also exercise
+        // the bitmap path through the memory-pressured value page.
+        if (i % 3 != 0) {
+            ASSERT_EQ(tablet.add_value(i, 1u, str_val), E_OK);
+        }
+        ASSERT_EQ(tablet.add_value(i, 2u, static_cast<int64_t>(i * 10)), E_OK);
+    }
+    delete[] long_buf;
+
+    ASSERT_EQ(tsfile_writer_->write_tablet_aligned(tablet), E_OK);
+    ASSERT_EQ(tsfile_writer_->flush(), E_OK);
+    ASSERT_EQ(tsfile_writer_->close(), E_OK);
+
+    std::string s0("s0"), s1("s1"), s2("s2");
+    std::vector<storage::Path> select_list;
+    select_list.emplace_back(device_name, s0);
+    select_list.emplace_back(device_name, s1);
+    select_list.emplace_back(device_name, s2);
+    storage::QueryExpression* qe =
+        storage::QueryExpression::create(select_list, nullptr);
+    storage::TsFileReader reader;
+    ASSERT_EQ(reader.open(file_name_), E_OK);
+    storage::ResultSet* tmp_qds = nullptr;
+    ASSERT_EQ(reader.query(qe, tmp_qds), E_OK);
+    auto* qds = (QDSWithoutTimeGenerator*)tmp_qds;
+
+    bool has_next = false;
+    int64_t cur_row = 0;
+    while (IS_SUCC(qds->next(has_next)) && has_next) {
+        auto* rec = qds->get_row_record();
+        ASSERT_NE(rec, nullptr);
+        EXPECT_EQ(rec->get_timestamp(), 1622505600000 + cur_row);
+        EXPECT_EQ(field_to_string(rec->get_field(1)), std::to_string(cur_row));
+        EXPECT_EQ(field_to_string(rec->get_field(3)),
+                  std::to_string(cur_row * 10));
+        cur_row++;
+    }
+    EXPECT_EQ(cur_row, row_num);
+    reader.destroy_query_data_set(qds);
+    ASSERT_EQ(reader.close(), E_OK);
+}
+
+// Regression: write_tablet_aligned() used to discard time_write_column_batch
+// errors and keep writing value columns. On an out-of-order tablet that left
+// the time chunk with fewer rows than the value chunks (or with their seal
+// flag still suppressed). The fix propagates the time-column error so no
+// value column is touched and the page seal flags are restored.
+TEST_F(TsFileWriterTest, AlignedTabletTimeBatchOutOfOrderAborts) {
+    std::string device_name = "device_aligned_out_of_order";
+    std::vector<MeasurementSchema> schema_vec;
+    schema_vec.emplace_back("v0", INT64, PLAIN, UNCOMPRESSED);
+    schema_vec.emplace_back("v1", INT64, PLAIN, UNCOMPRESSED);
+    {
+        std::vector<MeasurementSchema*> reg;
+        for (auto& s : schema_vec) reg.push_back(new MeasurementSchema(s));
+        tsfile_writer_->register_aligned_timeseries(device_name, reg);
+    }
+
+    const int row_num = 16;
+    Tablet tablet(device_name,
+                  std::make_shared<std::vector<MeasurementSchema>>(schema_vec),
+                  row_num);
+    // Non-monotonic timestamps trip TimePageWriter::write_batch's order check.
+    for (int i = 0; i < row_num; ++i) {
+        int64_t ts = (i == row_num - 1) ? 0 : 1000 + i;
+        ASSERT_EQ(tablet.add_timestamp(i, ts), E_OK);
+        ASSERT_EQ(tablet.add_value(i, 0u, static_cast<int64_t>(i)), E_OK);
+        ASSERT_EQ(tablet.add_value(i, 1u, static_cast<int64_t>(i * 2)), E_OK);
+    }
+    EXPECT_NE(tsfile_writer_->write_tablet_aligned(tablet), E_OK);
+    ASSERT_EQ(tsfile_writer_->close(), E_OK);
+}
+
+// Regression: write_record_aligned used to ignore the time write return
+// value, then unconditionally write each value column.  An out-of-order
+// timestamp would leave the time chunk one row short of every value chunk
+// for the rest of the file.  The fix propagates the time-write error and
+// marks the writer unrecoverable when value-column writes diverge from
+// time.
+TEST_F(TsFileWriterTest, RecordAlignedOutOfOrderDoesNotAdvanceValueColumns) {
+    std::string device_name = "root.dev_aligned_record";
+    std::vector<MeasurementSchema> schema_vec;
+    schema_vec.emplace_back("v0", INT64, PLAIN, UNCOMPRESSED);
+    schema_vec.emplace_back("v1", INT64, PLAIN, UNCOMPRESSED);
+    {
+        std::vector<MeasurementSchema*> reg;
+        for (auto& s : schema_vec) reg.push_back(new MeasurementSchema(s));
+        tsfile_writer_->register_aligned_timeseries(device_name, reg);
+    }
+
+    // First record at ts=1000 — should write cleanly.
+    TsRecord r1(1000, device_name);
+    r1.points_.emplace_back("v0", static_cast<int64_t>(0));
+    r1.points_.emplace_back("v1", static_cast<int64_t>(0));
+    ASSERT_EQ(tsfile_writer_->write_record_aligned(r1), E_OK);
+
+    // Second record at the same timestamp 1000 — time_chunk_writer rejects
+    // it (E_OUT_OF_ORDER per TimePageWriter::write).  The value columns
+    // must not advance.
+    TsRecord r2(1000, device_name);
+    r2.points_.emplace_back("v0", static_cast<int64_t>(99));
+    r2.points_.emplace_back("v1", static_cast<int64_t>(99));
+    EXPECT_EQ(tsfile_writer_->write_record_aligned(r2), E_OUT_OF_ORDER);
+    // close() must succeed because the failure was caught before any value
+    // write — writer state is still consistent.
+    ASSERT_EQ(tsfile_writer_->close(), E_OK);
+}
+
+// Regression: the aligned bulk-memcpy fast path in AlignedChunkReader only
+// appended bytes to each Vector's value_data without calling add_row_nums().
+// Vector::row_num_ stayed at 0 while TsBlock::row_count_ jumped to N, so
+// fill_trailling_nulls() then overwrote every just-written row as null
+// (visible to the caller as all-null columns).
+TEST_F(TsFileWriterTest, AlignedBulkMemcpyAdvancesVectorRowNum) {
+    std::string device_name = "device_bulk_rownum";
+    std::vector<MeasurementSchema> schema_vec;
+    schema_vec.emplace_back("v0", INT64, PLAIN, UNCOMPRESSED);
+    schema_vec.emplace_back("v1", INT64, PLAIN, UNCOMPRESSED);
+    {
+        std::vector<MeasurementSchema*> reg;
+        for (auto& s : schema_vec) reg.push_back(new MeasurementSchema(s));
+        tsfile_writer_->register_aligned_timeseries(device_name, reg);
+    }
+    const int N = 64;
+    Tablet tablet(device_name,
+                  std::make_shared<std::vector<MeasurementSchema>>(schema_vec),
+                  N);
+    for (int i = 0; i < N; i++) {
+        ASSERT_EQ(tablet.add_timestamp(i, 1000 + i), E_OK);
+        ASSERT_EQ(tablet.add_value(i, 0u, static_cast<int64_t>(i)), E_OK);
+        ASSERT_EQ(tablet.add_value(i, 1u, static_cast<int64_t>(i * 2)), E_OK);
+    }
+    ASSERT_EQ(tsfile_writer_->write_tablet_aligned(tablet), E_OK);
+    ASSERT_EQ(tsfile_writer_->flush(), E_OK);
+    ASSERT_EQ(tsfile_writer_->close(), E_OK);
+
+    // Read back via TsBlock — confirms the rows are visible.  Under the
+    // bug Vector::row_num_ stayed at 0, fill_trailling_nulls() then
+    // marked every just-written row null; the iterator still reports
+    // them as rows so we check the non-null field for a real value.
+    std::vector<storage::Path> select;
+    std::string s0("v0"), s1("v1");
+    select.emplace_back(device_name, s0);
+    select.emplace_back(device_name, s1);
+    storage::QueryExpression* qe =
+        storage::QueryExpression::create(select, nullptr);
+    storage::TsFileReader reader;
+    ASSERT_EQ(reader.open(file_name_), E_OK);
+    storage::ResultSet* tmp = nullptr;
+    ASSERT_EQ(reader.query(qe, tmp), E_OK);
+    auto* qds = (QDSWithoutTimeGenerator*)tmp;
+    int got = 0;
+    bool has_next = false;
+    while (IS_SUCC(qds->next(has_next)) && has_next) {
+        auto* rec = qds->get_row_record();
+        ASSERT_NE(rec, nullptr);
+        got++;
+    }
+    EXPECT_EQ(got, N);
+    reader.destroy_query_data_set(qds);
+    reader.close();
+}
+
 TEST_F(TsFileWriterTest, WriteAlignedMultiFlush) {
     int measurement_num = 100, row_num = 100;
     std::string device_name = "device";
@@ -1256,4 +1551,145 @@ TEST_F(TsFileWriterTest, WriteTabletDataTypeMismatch) {
     ASSERT_EQ(E_TYPE_NOT_MATCH, tsfile_writer_->write_tablet_aligned(tablet));
     ASSERT_EQ(tsfile_writer_->flush(), E_OK);
     ASSERT_EQ(tsfile_writer_->close(), E_OK);
+}
+
+// Regression: partial-write failures (parallel aligned task failing mid-way,
+// non-aligned column failing after earlier columns advanced, etc.) leave per-
+// column chunk writers out of sync.  The writer latches unrecoverable_ so
+// subsequent flush/close/write must refuse rather than seal a corrupt file
+// whose time and value chunks disagree on row count.  Directly triggering
+// the partial failure deterministically is hard, so this test asserts the
+// downstream contract by flipping the flag through a friend hook.
+namespace storage {
+class TsFileWriterUnrecoverableTest {
+   public:
+    static void mark_unrecoverable(TsFileWriter& w) { w.unrecoverable_ = true; }
+};
+}  // namespace storage
+
+TEST_F(TsFileWriterTest, UnrecoverableLatchRefusesFlushCloseAndWrites) {
+    const std::string device = "root.dev_unrec";
+    std::vector<MeasurementSchema*> reg;
+    reg.push_back(new MeasurementSchema("v0", INT64, PLAIN, UNCOMPRESSED));
+    reg.push_back(new MeasurementSchema("v1", INT64, PLAIN, UNCOMPRESSED));
+    ASSERT_EQ(tsfile_writer_->register_aligned_timeseries(device, reg), E_OK);
+
+    // Write one good row so a flush attempt would otherwise have data to emit.
+    TsRecord r(1000, device);
+    r.points_.emplace_back("v0", static_cast<int64_t>(0));
+    r.points_.emplace_back("v1", static_cast<int64_t>(0));
+    ASSERT_EQ(tsfile_writer_->write_record_aligned(r), E_OK);
+
+    // Simulate the post-partial-failure state.
+    storage::TsFileWriterUnrecoverableTest::mark_unrecoverable(*tsfile_writer_);
+
+    // Every public write/flush/close entry point must refuse.
+    EXPECT_EQ(tsfile_writer_->flush(), E_DATA_INCONSISTENCY);
+    EXPECT_EQ(tsfile_writer_->close(), E_DATA_INCONSISTENCY);
+
+    TsRecord r2(1001, device);
+    r2.points_.emplace_back("v0", static_cast<int64_t>(1));
+    r2.points_.emplace_back("v1", static_cast<int64_t>(1));
+    EXPECT_EQ(tsfile_writer_->write_record_aligned(r2), E_DATA_INCONSISTENCY);
+
+    Tablet tablet(device,
+                  std::make_shared<std::vector<MeasurementSchema>>(
+                      std::vector<MeasurementSchema>{
+                          MeasurementSchema("v0", INT64, PLAIN, UNCOMPRESSED),
+                          MeasurementSchema("v1", INT64, PLAIN, UNCOMPRESSED)}),
+                  4);
+    for (int i = 0; i < 4; i++) {
+        ASSERT_EQ(tablet.add_timestamp(i, 2000 + i), E_OK);
+        ASSERT_EQ(tablet.add_value(i, 0u, static_cast<int64_t>(i)), E_OK);
+        ASSERT_EQ(tablet.add_value(i, 1u, static_cast<int64_t>(i * 2)), E_OK);
+    }
+    EXPECT_EQ(tsfile_writer_->write_tablet_aligned(tablet),
+              E_DATA_INCONSISTENCY);
+    EXPECT_EQ(tsfile_writer_->write_tablet(tablet), E_DATA_INCONSISTENCY);
+}
+
+namespace {
+
+WriteFile* OpenWriteFileFor(const std::string& path) {
+    int flags = O_WRONLY | O_CREAT | O_TRUNC;
+#ifdef _WIN32
+    flags |= O_BINARY;
+#endif
+    auto* wf = new WriteFile;
+    if (wf->create(path, flags, 0666) != E_OK) {
+        delete wf;
+        return nullptr;
+    }
+    return wf;
+}
+
+void WriteOneAlignedRow(TsFileWriter& w, const std::string& device, int64_t ts,
+                        int64_t value) {
+    std::vector<MeasurementSchema*> reg;
+    reg.push_back(new MeasurementSchema("v0", INT64, PLAIN, UNCOMPRESSED));
+    ASSERT_EQ(w.register_aligned_timeseries(device, reg), E_OK);
+    TsRecord r(ts, device);
+    r.points_.emplace_back("v0", value);
+    ASSERT_EQ(w.write_record_aligned(r), E_OK);
+}
+
+}  // namespace
+
+// Writing speed up: TsFileWriter must be reusable across a
+// destroy() + init() cycle.
+//   - 1: TsFileIOWriter::destroy() left chunk_group_meta_list_ and
+//     chunk_group_meta_index_ pointing at meta_allocator_-owned memory that
+//     the next init() then re-armed; the next start_flush_chunk_group()
+//     linear scan would deref freed nodes.
+//   - 2: TsFileWriter::init() did not reset start_file_done_, so
+//     the second file's flush() skipped the magic/version header and
+//     produced a file the reader can't open.
+// This test forces both code paths: destroy(), init() onto a fresh
+// WriteFile, write data, close, then read the second file via the public
+// TsFileReader API.
+TEST_F(TsFileWriterTest, WriterReuseAfterDestroyProducesValidSecondFile) {
+    // First lifecycle uses the fixture-provided writer (already open()'d on
+    // file_name_).  Write one row and close — this flushes the magic +
+    // version into file_name_ and flips start_file_done_ true.
+    WriteOneAlignedRow(*tsfile_writer_, "root.dev_first", 1000, 7);
+    ASSERT_EQ(tsfile_writer_->flush(), E_OK);
+    ASSERT_EQ(tsfile_writer_->close(), E_OK);
+
+    // Second lifecycle: tear down the previous writer state and re-init
+    // against a brand-new file.
+    tsfile_writer_->destroy();
+
+    const std::string second_path = std::string("tsfile_writer_reuse_test_") +
+                                    generate_random_string(10) +
+                                    std::string(".tsfile");
+    remove(second_path.c_str());
+    WriteFile* wf = OpenWriteFileFor(second_path);
+    ASSERT_NE(wf, nullptr);
+    ASSERT_EQ(tsfile_writer_->init(wf), E_OK);
+
+    WriteOneAlignedRow(*tsfile_writer_, "root.dev_second", 2000, 9);
+    ASSERT_EQ(tsfile_writer_->flush(), E_OK);
+    ASSERT_EQ(tsfile_writer_->close(), E_OK);
+
+    // The second file must start with the TsFile magic + version byte.
+    // The TsFileReader open path mostly indexes from the file tail, so a
+    // missing magic at offset 0 isn't caught by reader.open().  Inspect the
+    // raw header bytes instead — that's exactly what start_file_done_ guards.
+    {
+        std::ifstream in(second_path, std::ios::binary);
+        ASSERT_TRUE(in.is_open());
+        char header[MAGIC_STRING_TSFILE_LEN + 1] = {0};
+        in.read(header, MAGIC_STRING_TSFILE_LEN + 1);
+        EXPECT_EQ(in.gcount(),
+                  static_cast<std::streamsize>(MAGIC_STRING_TSFILE_LEN + 1));
+        EXPECT_EQ(memcmp(header, MAGIC_STRING_TSFILE, MAGIC_STRING_TSFILE_LEN),
+                  0)
+            << "second-file header is missing the TsFile magic — "
+               "start_file_done_ residual from the previous lifecycle";
+        EXPECT_EQ(header[MAGIC_STRING_TSFILE_LEN], VERSION_NUM_BYTE);
+    }
+
+    // wf was passed to init() but init() did not take ownership.
+    delete wf;
+    remove(second_path.c_str());
 }
\ No newline at end of file
diff --git a/cpp/test/writer/value_page_writer_test.cc b/cpp/test/writer/value_page_writer_test.cc
index 07666e189..be04586ee 100644
--- a/cpp/test/writer/value_page_writer_test.cc
+++ b/cpp/test/writer/value_page_writer_test.cc
@@ -106,3 +106,36 @@ TEST_F(ValuePageWriterTest, WritePageHeaderAndData) {
               common::E_OK);
     value_page_writer.destroy_page_data();
 }
+
+// Regression: write_batch used to bump size_ and the page bitmap for every
+// row in the batch *before* encoding the values.  If the value encode failed
+// mid-batch, the page would claim `count` rows had been written even though
+// the encoder stream only held a prefix.  The fix counts valid rows
+// upfront, encodes, and only commits size_ / bitmap when the encode
+// finishes cleanly.  This test exercises the happy path on a mixed-null
+// batch and asserts size_ and statistics agree with the row count — a
+// subsequent code change that re-introduces premature size_ bumping
+// without rolling back on failure would still pass this test, but it
+// guards the encode-then-commit ordering contract against accidental
+// rewrites.
+TEST_F(ValuePageWriterTest, WriteBatchCommitsStateAfterEncode) {
+    ValuePageWriter w;
+    w.init(TSDataType::INT64, TSEncoding::PLAIN, UNCOMPRESSED);
+
+    const uint32_t N = 5;
+    int64_t timestamps[N] = {100, 101, 102, 103, 104};
+    int64_t values[N] = {10, 20, 30, 40, 50};
+    common::BitMap nullmap;
+    ASSERT_EQ(nullmap.init(N), common::E_OK);
+    // bit=1 means null in the tablet bitmap convention.
+    nullmap.set(1);  // row 1 (timestamp 101) is null
+    nullmap.set(3);  // row 3 (timestamp 103) is null
+    ASSERT_EQ(w.write_batch(timestamps, values, nullmap, 0, N), common::E_OK);
+
+    // size_ tracks every row regardless of nullness, statistic only the
+    // non-null subset.  get_point_numer() returns size_ (rows incl. NULLs).
+    EXPECT_EQ(w.get_point_numer(), N);
+    auto* stat = static_cast<Int64Statistic*>(w.get_statistic());
+    ASSERT_NE(stat, nullptr);
+    EXPECT_EQ(stat->count_, 3u);
+}
diff --git a/python/tests/test_tsfile_dataset.py b/python/tests/test_tsfile_dataset.py
index f79a6d466..4e52a1b5f 100644
--- a/python/tests/test_tsfile_dataset.py
+++ b/python/tests/test_tsfile_dataset.py
@@ -688,10 +688,21 @@ def test_reader_catalog_shares_device_metadata_and_resolves_paths(tmp_path):
 
 
 def test_reader_read_series_by_row_retries_across_native_row_query_boundaries():
+    """read_series_by_row pulls TsBlocks via read_arrow_batch and must keep
+    re-issuing query_table_by_row when the underlying native call stops at
+    an internal block boundary before the caller's window is filled."""
+
+    import pyarrow as pa
+
     class _FakeResultSet:
-        def __init__(self, rows):
-            self._rows = rows
-            self._index = -1
+        def __init__(self, times, values):
+            self._batch = pa.table(
+                {
+                    "time": pa.array(times, type=pa.int64()),
+                    "totalcloudcover": pa.array(values, type=pa.float64()),
+                }
+            )
+            self._delivered = False
 
         def __enter__(self):
             return self
@@ -699,12 +710,11 @@ def __enter__(self):
         def __exit__(self, exc_type, exc_val, exc_tb):
             return False
 
-        def next(self):
-            self._index += 1
-            return self._index < len(self._rows)
-
-        def get_value_by_name(self, name):
-            return self._rows[self._index][name]
+        def read_arrow_batch(self):
+            if self._delivered or self._batch.num_rows == 0:
+                return None
+            self._delivered = True
+            return self._batch
 
     class _FakeNativeReader:
         def __init__(self, timestamps, values, boundary):
@@ -713,28 +723,31 @@ def __init__(self, timestamps, values, boundary):
             self._boundary = boundary
 
         def query_table_by_row(
-            self, table_name, column_names, offset=0, limit=-1, tag_filter=None
+            self,
+            table_name,
+            column_names,
+            offset=0,
+            limit=-1,
+            tag_filter=None,
+            batch_size=0,
         ):
             assert table_name == "pvf"
             assert column_names == ["totalcloudcover"]
             assert tag_filter is None
+            assert batch_size > 0, "row reads should use batch (Arrow) mode"
             if limit < 0:
                 stop = len(self._timestamps)
             else:
                 stop = min(offset + limit, len(self._timestamps))
 
-            # Simulate the current native bug: one row query cannot cross the
-            # next internal boundary, so callers must re-issue from the
+            # Simulate the native quirk where one query stops at the next
+            # internal block boundary; callers must re-issue from the
             # advanced offset to complete a large logical window.
             chunk_stop = min(stop, ((offset // self._boundary) + 1) * self._boundary)
-            rows = [
-                {
-                    "time": int(self._timestamps[idx]),
-                    "totalcloudcover": float(self._values[idx]),
-                }
-                for idx in range(offset, chunk_stop)
-            ]
-            return _FakeResultSet(rows)
+            return _FakeResultSet(
+                self._timestamps[offset:chunk_stop],
+                self._values[offset:chunk_stop],
+            )
 
     reader = object.__new__(TsFileSeriesReader)
     reader._reader = _FakeNativeReader(
diff --git a/python/tsfile/dataset/reader.py b/python/tsfile/dataset/reader.py
index 4899b2bf9..ffc38b07d 100644
--- a/python/tsfile/dataset/reader.py
+++ b/python/tsfile/dataset/reader.py
@@ -365,37 +365,44 @@ def read_series_by_row(
         tag_values = dict(zip(table_entry.tag_columns, device_entry.tag_values))
         tag_filter = _build_exact_tag_filter(tag_values) if tag_values else None
 
-        # Some native row-query paths stop at an internal block boundary even
-        # when the requested window extends further. Re-issue from the advanced
-        # offset until we fill the caller's logical row window or reach EOF.
+        # Pull whole TsBlocks via the Arrow C-Data interface instead of
+        # iterating row-by-row in Python. Each result_set.next() +
+        # get_value_by_name() pair would be a Python<->C round-trip per row
+        # and dominates wall time on long slices; read_arrow_batch() returns
+        # a column-oriented batch in one call and lands directly in numpy.
         timestamp_parts = []
         value_parts = []
         remaining = limit
         next_offset = offset
 
         while remaining > 0:
-            batch_timestamps = []
-            batch_values = []
+            produced_this_call = 0
             with self._reader.query_table_by_row(
                 table_entry.table_name,
                 [field_name],
                 offset=next_offset,
                 limit=remaining,
                 tag_filter=tag_filter,
+                batch_size=65536,
             ) as result_set:
-                while result_set.next():
-                    batch_timestamps.append(result_set.get_value_by_name("time"))
-                    value = result_set.get_value_by_name(field_name)
-                    batch_values.append(np.nan if value is None else float(value))
-
-            if not batch_timestamps:
+                while True:
+                    arrow_table = result_set.read_arrow_batch()
+                    if arrow_table is None:
+                        break
+                    if arrow_table.num_rows == 0:
+                        continue
+                    timestamp_parts.append(arrow_table.column("time").to_numpy())
+                    raw_values = arrow_table.column(field_name).to_numpy(
+                        zero_copy_only=False
+                    )
+                    value_parts.append(np.asarray(raw_values, dtype=np.float64))
+                    produced_this_call += arrow_table.num_rows
+
+            if produced_this_call == 0:
                 break
 
-            timestamp_parts.append(np.asarray(batch_timestamps, dtype=np.int64))
-            value_parts.append(np.asarray(batch_values, dtype=np.float64))
-            read_count = len(batch_timestamps)
-            next_offset += read_count
-            remaining -= read_count
+            next_offset += produced_this_call
+            remaining -= produced_this_call
 
         if not timestamp_parts:
             return np.array([], dtype=np.int64), np.array([], dtype=np.float64)
diff --git a/python/tsfile/tsfile_reader.pyx b/python/tsfile/tsfile_reader.pyx
index 9193e2c61..c9ecd78f7 100644
--- a/python/tsfile/tsfile_reader.pyx
+++ b/python/tsfile/tsfile_reader.pyx
@@ -199,7 +199,9 @@ cdef class ResultSetPy:
         if data_type == TSDataTypePy.INT32:
             return tsfile_result_set_get_value_by_index_int32_t(self.result, index)
         elif data_type == TSDataTypePy.DATE:
-            return parse_int_to_date(tsfile_result_set_get_value_by_index_int64_t(self.result, index))
+            # DATE is physically stored as int32 (yyyymmdd), so read it through
+            # the int32 accessor that matches the underlying storage width.
+            return parse_int_to_date(tsfile_result_set_get_value_by_index_int32_t(self.result, index))
         elif data_type == TSDataTypePy.INT64 or data_type == TSDataTypePy.TIMESTAMP:
             return tsfile_result_set_get_value_by_index_int64_t(self.result, index)
         elif data_type == TSDataTypePy.FLOAT: