From 04be282156f5a2f483814656dacef088ae6d5b8b Mon Sep 17 00:00:00 2001 From: Anastasios Bakogiannis Date: Thu, 28 May 2026 09:08:06 +0200 Subject: [PATCH 1/4] feat(hll): expose a memory size function --- datasketches/src/hll/array4.rs | 5 +++++ datasketches/src/hll/array6.rs | 5 +++++ datasketches/src/hll/array8.rs | 5 +++++ datasketches/src/hll/aux_map.rs | 5 +++++ datasketches/src/hll/container.rs | 5 +++++ datasketches/src/hll/sketch.rs | 13 +++++++++++++ 6 files changed, 38 insertions(+) diff --git a/datasketches/src/hll/array4.rs b/datasketches/src/hll/array4.rs index 27798d0..0d7fb43 100644 --- a/datasketches/src/hll/array4.rs +++ b/datasketches/src/hll/array4.rs @@ -425,6 +425,11 @@ impl Array4 { bytes.into_bytes() } + + /// Returns the size of the heap allocations in bytes + pub fn heap_size(&self) -> usize { + self.bytes.len() + self.aux_map.as_ref().map(|a| a.heap_size()).unwrap_or(0) + } } #[cfg(test)] diff --git a/datasketches/src/hll/array6.rs b/datasketches/src/hll/array6.rs index 2aaff0a..439798c 100644 --- a/datasketches/src/hll/array6.rs +++ b/datasketches/src/hll/array6.rs @@ -272,6 +272,11 @@ impl Array6 { bytes.into_bytes() } + + /// Returns the size of the heap allocations in bytes + pub fn heap_size(&self) -> usize { + self.bytes.len() + } } /// Calculate number of bytes needed for k slots with 6 bits each diff --git a/datasketches/src/hll/array8.rs b/datasketches/src/hll/array8.rs index 21f0d53..7b5805d 100644 --- a/datasketches/src/hll/array8.rs +++ b/datasketches/src/hll/array8.rs @@ -344,6 +344,11 @@ impl Array8 { bytes.into_bytes() } + + /// Returns the size of the heap allocations in bytes + pub fn heap_size(&self) -> usize { + self.bytes.len() + } } #[cfg(test)] diff --git a/datasketches/src/hll/aux_map.rs b/datasketches/src/hll/aux_map.rs index 6e43a03..5358b50 100644 --- a/datasketches/src/hll/aux_map.rs +++ b/datasketches/src/hll/aux_map.rs @@ -226,6 +226,11 @@ impl AuxMap { } }) } + + /// Returns the size of the heap allocations in bytes + pub fn heap_size(&self) -> usize { + self.entries.len() * std::mem::size_of::() + } } /// Iterator over AuxMap entries diff --git a/datasketches/src/hll/container.rs b/datasketches/src/hll/container.rs index 8192b04..ff7ca0d 100644 --- a/datasketches/src/hll/container.rs +++ b/datasketches/src/hll/container.rs @@ -135,4 +135,9 @@ impl Container { pub fn iter(&self) -> impl Iterator + '_ { self.coupons.iter().filter(|&&c| !c.is_empty()).copied() } + + /// Returns the size of the heap allocations in bytes + pub fn heap_size(&self) -> usize { + self.coupons.len() * std::mem::size_of::() + } } diff --git a/datasketches/src/hll/sketch.rs b/datasketches/src/hll/sketch.rs index eaa10ae..90ff374 100644 --- a/datasketches/src/hll/sketch.rs +++ b/datasketches/src/hll/sketch.rs @@ -423,6 +423,19 @@ impl HllSketch { Mode::Array8(arr) => arr.serialize(self.lg_config_k), } } + + /// Returns the size of the sketch in bytes + pub fn size(&self) -> usize { + let heap_size = match &self.mode { + Mode::List { list, .. } => list.container().heap_size(), + Mode::Set { set, .. } => set.container().heap_size(), + Mode::Array4(arr) => arr.heap_size(), + Mode::Array6(arr) => arr.heap_size(), + Mode::Array8(arr) => arr.heap_size(), + }; + + std::mem::size_of::() + heap_size + } } fn promote_container_to_set(container: &Container, hll_type: HllType) -> Mode { From 20c1abfe8948b5f8a23455d9c2bd37b2f807c3ac Mon Sep 17 00:00:00 2001 From: Anastasios Bakogiannis Date: Thu, 28 May 2026 09:56:21 +0200 Subject: [PATCH 2/4] feat(cpc): expose a memory size function --- datasketches/src/cpc/pair_table.rs | 5 +++++ datasketches/src/cpc/sketch.rs | 12 ++++++++++++ 2 files changed, 17 insertions(+) diff --git a/datasketches/src/cpc/pair_table.rs b/datasketches/src/cpc/pair_table.rs index 6e23067..07af835 100644 --- a/datasketches/src/cpc/pair_table.rs +++ b/datasketches/src/cpc/pair_table.rs @@ -246,4 +246,9 @@ impl PairTable { } } } + + /// Returns the size of the heap allocations in bytes + pub fn heap_size(&self) -> usize { + self.slots.len() * std::mem::size_of::() + } } diff --git a/datasketches/src/cpc/sketch.rs b/datasketches/src/cpc/sketch.rs index 5064e45..3fca830 100644 --- a/datasketches/src/cpc/sketch.rs +++ b/datasketches/src/cpc/sketch.rs @@ -450,6 +450,18 @@ impl CpcSketch { matrix } + + /// Returns the size of the sketch in bytes + pub fn size(&self) -> usize { + let heap_size = self.sliding_window.len() + + self + .surprising_value_table + .as_ref() + .map(|t| t.heap_size()) + .unwrap_or(0); + + std::mem::size_of::() + heap_size + } } impl CpcSketch { From ecf60b8a7c977d7107fb576eb1b922ec8d3e996f Mon Sep 17 00:00:00 2001 From: Anastasios Bakogiannis Date: Mon, 1 Jun 2026 09:41:25 +0200 Subject: [PATCH 3/4] fix(hll, cpc): rename methods --- datasketches/src/cpc/pair_table.rs | 6 +++--- datasketches/src/cpc/sketch.rs | 8 ++++---- datasketches/src/hll/array4.rs | 11 ++++++++--- datasketches/src/hll/array6.rs | 4 ++-- datasketches/src/hll/array8.rs | 4 ++-- datasketches/src/hll/aux_map.rs | 4 ++-- datasketches/src/hll/container.rs | 4 ++-- datasketches/src/hll/sketch.rs | 14 +++++++------- 8 files changed, 30 insertions(+), 25 deletions(-) diff --git a/datasketches/src/cpc/pair_table.rs b/datasketches/src/cpc/pair_table.rs index 07af835..d36d68d 100644 --- a/datasketches/src/cpc/pair_table.rs +++ b/datasketches/src/cpc/pair_table.rs @@ -247,8 +247,8 @@ impl PairTable { } } - /// Returns the size of the heap allocations in bytes - pub fn heap_size(&self) -> usize { - self.slots.len() * std::mem::size_of::() + /// Returns the estimated size of the heap allocations in bytes + pub fn estimated_size(&self) -> usize { + self.slots.capacity() * std::mem::size_of::() } } diff --git a/datasketches/src/cpc/sketch.rs b/datasketches/src/cpc/sketch.rs index 3fca830..58cc232 100644 --- a/datasketches/src/cpc/sketch.rs +++ b/datasketches/src/cpc/sketch.rs @@ -451,13 +451,13 @@ impl CpcSketch { matrix } - /// Returns the size of the sketch in bytes - pub fn size(&self) -> usize { - let heap_size = self.sliding_window.len() + /// Returns the estimated size of the sketch in bytes + pub fn estimated_size(&self) -> usize { + let heap_size = self.sliding_window.capacity() + self .surprising_value_table .as_ref() - .map(|t| t.heap_size()) + .map(|t| t.estimated_size()) .unwrap_or(0); std::mem::size_of::() + heap_size diff --git a/datasketches/src/hll/array4.rs b/datasketches/src/hll/array4.rs index 0d7fb43..268db40 100644 --- a/datasketches/src/hll/array4.rs +++ b/datasketches/src/hll/array4.rs @@ -426,9 +426,14 @@ impl Array4 { bytes.into_bytes() } - /// Returns the size of the heap allocations in bytes - pub fn heap_size(&self) -> usize { - self.bytes.len() + self.aux_map.as_ref().map(|a| a.heap_size()).unwrap_or(0) + /// Returns the estimated size of the heap allocations in bytes + pub fn estimated_size(&self) -> usize { + self.bytes.len() + + self + .aux_map + .as_ref() + .map(|a| a.estimated_size()) + .unwrap_or(0) } } diff --git a/datasketches/src/hll/array6.rs b/datasketches/src/hll/array6.rs index 439798c..42646d3 100644 --- a/datasketches/src/hll/array6.rs +++ b/datasketches/src/hll/array6.rs @@ -273,8 +273,8 @@ impl Array6 { bytes.into_bytes() } - /// Returns the size of the heap allocations in bytes - pub fn heap_size(&self) -> usize { + /// Returns the estimated size of the heap allocations in bytes + pub fn estimated_size(&self) -> usize { self.bytes.len() } } diff --git a/datasketches/src/hll/array8.rs b/datasketches/src/hll/array8.rs index 7b5805d..55cf384 100644 --- a/datasketches/src/hll/array8.rs +++ b/datasketches/src/hll/array8.rs @@ -345,8 +345,8 @@ impl Array8 { bytes.into_bytes() } - /// Returns the size of the heap allocations in bytes - pub fn heap_size(&self) -> usize { + /// Returns the estimated size of the heap allocations in bytes + pub fn estimated_size(&self) -> usize { self.bytes.len() } } diff --git a/datasketches/src/hll/aux_map.rs b/datasketches/src/hll/aux_map.rs index 5358b50..fed7fac 100644 --- a/datasketches/src/hll/aux_map.rs +++ b/datasketches/src/hll/aux_map.rs @@ -227,8 +227,8 @@ impl AuxMap { }) } - /// Returns the size of the heap allocations in bytes - pub fn heap_size(&self) -> usize { + /// Returns the estimated size of the heap allocations in bytes + pub fn estimated_size(&self) -> usize { self.entries.len() * std::mem::size_of::() } } diff --git a/datasketches/src/hll/container.rs b/datasketches/src/hll/container.rs index ff7ca0d..716f045 100644 --- a/datasketches/src/hll/container.rs +++ b/datasketches/src/hll/container.rs @@ -136,8 +136,8 @@ impl Container { self.coupons.iter().filter(|&&c| !c.is_empty()).copied() } - /// Returns the size of the heap allocations in bytes - pub fn heap_size(&self) -> usize { + /// Returns the estimated size of the heap allocations in bytes + pub fn estimated_size(&self) -> usize { self.coupons.len() * std::mem::size_of::() } } diff --git a/datasketches/src/hll/sketch.rs b/datasketches/src/hll/sketch.rs index 90ff374..91d79b0 100644 --- a/datasketches/src/hll/sketch.rs +++ b/datasketches/src/hll/sketch.rs @@ -424,14 +424,14 @@ impl HllSketch { } } - /// Returns the size of the sketch in bytes - pub fn size(&self) -> usize { + /// Returns the estimated size of the sketch in bytes + pub fn estimated_size(&self) -> usize { let heap_size = match &self.mode { - Mode::List { list, .. } => list.container().heap_size(), - Mode::Set { set, .. } => set.container().heap_size(), - Mode::Array4(arr) => arr.heap_size(), - Mode::Array6(arr) => arr.heap_size(), - Mode::Array8(arr) => arr.heap_size(), + Mode::List { list, .. } => list.container().estimated_size(), + Mode::Set { set, .. } => set.container().estimated_size(), + Mode::Array4(arr) => arr.estimated_size(), + Mode::Array6(arr) => arr.estimated_size(), + Mode::Array8(arr) => arr.estimated_size(), }; std::mem::size_of::() + heap_size From b218d9399e6e88989675f31cb99eefdbe2cf4f7e Mon Sep 17 00:00:00 2001 From: Anastasios Bakogiannis Date: Mon, 1 Jun 2026 11:28:10 +0200 Subject: [PATCH 4/4] feat(bloom, tdigest, theta): expose a memory size function --- datasketches/Cargo.toml | 2 +- datasketches/src/bloom/sketch.rs | 5 +++++ datasketches/src/tdigest/sketch.rs | 12 ++++++++++++ datasketches/src/theta/hash_table.rs | 5 +++++ datasketches/src/theta/sketch.rs | 10 ++++++++++ 5 files changed, 33 insertions(+), 1 deletion(-) diff --git a/datasketches/Cargo.toml b/datasketches/Cargo.toml index eb299e8..67893e9 100644 --- a/datasketches/Cargo.toml +++ b/datasketches/Cargo.toml @@ -35,7 +35,7 @@ all-features = true rustdoc-args = ["--cfg", "docsrs"] [features] -default = [] +default = ["hll", "cpc", "bloom", "countmin", "frequencies", "tdigest", "theta"] # Each sketch has its own feature, so that users can opt in to only the sketches they need. bloom = [] diff --git a/datasketches/src/bloom/sketch.rs b/datasketches/src/bloom/sketch.rs index 19142a3..bfc5f63 100644 --- a/datasketches/src/bloom/sketch.rs +++ b/datasketches/src/bloom/sketch.rs @@ -561,6 +561,11 @@ impl BloomFilter { self.num_bits_set += 1; } } + + /// Returns the estimated size of the filter in bytes + pub fn estimated_size(&self) -> usize { + std::mem::size_of::() + self.bit_array.len() * std::mem::size_of::() + } } #[cfg(test)] diff --git a/datasketches/src/tdigest/sketch.rs b/datasketches/src/tdigest/sketch.rs index 51e2b62..9e14759 100644 --- a/datasketches/src/tdigest/sketch.rs +++ b/datasketches/src/tdigest/sketch.rs @@ -793,6 +793,13 @@ impl TDigestMut { self.reverse_merge = !self.reverse_merge; self.buffer.clear(); } + + /// Returns the estimated size of the sketch in bytes + pub fn estimated_size(&self) -> usize { + std::mem::size_of::() + + self.centroids.capacity() * std::mem::size_of::() + + self.buffer.capacity() * std::mem::size_of::() + } } /// Immutable (frozen) T-Digest sketch for estimating quantiles and ranks. @@ -1001,6 +1008,11 @@ impl TDigest { vec![], ) } + + /// Returns the estimated size of the sketch in bytes + pub fn estimated_size(&self) -> usize { + std::mem::size_of::() + self.centroids.capacity() * std::mem::size_of::() + } } struct TDigestView<'a> { diff --git a/datasketches/src/theta/hash_table.rs b/datasketches/src/theta/hash_table.rs index fb8fec9..1ee498d 100644 --- a/datasketches/src/theta/hash_table.rs +++ b/datasketches/src/theta/hash_table.rs @@ -381,6 +381,11 @@ impl ThetaHashTable { fn get_stride(key: u64, lg_size: u8) -> usize { (2 * ((key >> (lg_size)) & STRIDE_MASK) + 1) as usize } + + /// Returns the estimated size of the heap allocations in bytes + pub fn estimated_size(&self) -> usize { + self.entries.capacity() * std::mem::size_of::() + } } /// Compute initial lg_size for hash table based on target lg_size, minimum lg_size, and resize diff --git a/datasketches/src/theta/sketch.rs b/datasketches/src/theta/sketch.rs index 1313675..6ee9a23 100644 --- a/datasketches/src/theta/sketch.rs +++ b/datasketches/src/theta/sketch.rs @@ -313,6 +313,11 @@ impl ThetaSketch { ) .expect("theta should always be valid") } + + /// Returns the estimated size of the sketch in bytes + pub fn estimated_size(&self) -> usize { + std::mem::size_of::() + self.table.estimated_size() + } } impl ThetaSketchView for ThetaSketch { @@ -888,6 +893,11 @@ impl CompactThetaSketch { empty, }) } + + /// Returns the estimated size of the sketch in bytes + pub fn estimated_size(&self) -> usize { + std::mem::size_of::() + self.entries.capacity() * std::mem::size_of::() + } } impl ThetaSketchView for CompactThetaSketch {