Skip to content
Open
168 changes: 165 additions & 3 deletions datafusion/functions-aggregate/benches/count_distinct.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,13 @@
use std::sync::Arc;

use arrow::array::{
ArrayRef, Int8Array, Int16Array, Int64Array, UInt8Array, UInt16Array,
ArrayRef, Int8Array, Int16Array, Int32Array, Int64Array, UInt8Array, UInt16Array,
UInt32Array,
};
use arrow::datatypes::{DataType, Field, Schema};
use criterion::{Criterion, criterion_group, criterion_main};
use datafusion_expr::function::AccumulatorArgs;
use datafusion_expr::{Accumulator, AggregateUDFImpl};
use datafusion_expr::{Accumulator, AggregateUDFImpl, EmitTo};
use datafusion_functions_aggregate::count::Count;
use datafusion_physical_expr::expressions::col;
use rand::rngs::StdRng;
Expand Down Expand Up @@ -87,6 +88,44 @@ fn create_i16_array(n_distinct: usize) -> Int16Array {
.collect()
}

fn create_u32_array(n_distinct: usize) -> UInt32Array {
let mut rng = StdRng::seed_from_u64(42);
(0..BATCH_SIZE)
.map(|_| Some(rng.random_range(0..n_distinct as u32)))
.collect()
}

fn create_i32_array(n_distinct: usize) -> Int32Array {
let mut rng = StdRng::seed_from_u64(42);
(0..BATCH_SIZE)
.map(|_| Some(rng.random_range(0..n_distinct as i32)))
.collect()
}

fn prepare_args(data_type: DataType) -> (Arc<Schema>, AccumulatorArgs<'static>) {
Comment thread
martin-g marked this conversation as resolved.
let schema = Arc::new(Schema::new(vec![Field::new("f", data_type, true)]));
let schema_leaked: &'static Schema = Box::leak(Box::new((*schema).clone()));
let expr = col("f", schema_leaked).unwrap();
let expr_leaked: &'static _ = Box::leak(Box::new(expr));
let return_field: Arc<Field> = Field::new("f", DataType::Int64, true).into();
let return_field_leaked: &'static _ = Box::leak(Box::new(return_field.clone()));
let expr_field = expr_leaked.return_field(schema_leaked).unwrap();
let expr_field_leaked: &'static _ = Box::leak(Box::new(expr_field));

let accumulator_args = AccumulatorArgs {
return_field: return_field_leaked.clone(),
schema: schema_leaked,
expr_fields: std::slice::from_ref(expr_field_leaked),
ignore_nulls: false,
order_bys: &[],
is_reversed: false,
name: "count(distinct f)",
is_distinct: true,
exprs: std::slice::from_ref(expr_leaked),
};
(schema, accumulator_args)
}

fn count_distinct_benchmark(c: &mut Criterion) {
for pct in [80, 99] {
let n_distinct = BATCH_SIZE * pct / 100;
Expand Down Expand Up @@ -148,7 +187,130 @@ fn count_distinct_benchmark(c: &mut Criterion) {
.unwrap()
})
});

// 32-bit integer types

// UInt32
for pct in [80, 99] {
let n_distinct = BATCH_SIZE * pct / 100;
let values = Arc::new(create_u32_array(n_distinct)) as ArrayRef;
c.bench_function(&format!("count_distinct u32 {pct}% distinct"), |b| {
b.iter(|| {
let mut accumulator = prepare_accumulator(DataType::UInt32);
accumulator
.update_batch(std::slice::from_ref(&values))
.unwrap()
})
});
}

// Int32
for pct in [80, 99] {
let n_distinct = BATCH_SIZE * pct / 100;
let values = Arc::new(create_i32_array(n_distinct)) as ArrayRef;
c.bench_function(&format!("count_distinct i32 {pct}% distinct"), |b| {
b.iter(|| {
let mut accumulator = prepare_accumulator(DataType::Int32);
accumulator
.update_batch(std::slice::from_ref(&values))
.unwrap()
})
});
}
}

/// Create group indices with uniform distribution
fn create_uniform_groups(num_groups: usize) -> Vec<usize> {
let mut rng = StdRng::seed_from_u64(42);
(0..BATCH_SIZE)
.map(|_| rng.random_range(0..num_groups))
.collect()
}

/// Create group indices with skewed distribution (80% in 20% of groups)
fn create_skewed_groups(num_groups: usize) -> Vec<usize> {
let mut rng = StdRng::seed_from_u64(42);
let hot_groups = (num_groups / 5).max(1);
(0..BATCH_SIZE)
.map(|_| {
if rng.random_range(0..100) < 80 {
rng.random_range(0..hot_groups)
} else {
rng.random_range(0..num_groups)
Comment thread
coderfender marked this conversation as resolved.
}
})
.collect()
}

fn create_array_for_type(data_type: &DataType, n_distinct: usize) -> ArrayRef {
match data_type {
DataType::Int64 => Arc::new(create_i64_array(n_distinct)) as ArrayRef,
DataType::Int32 => Arc::new(create_i32_array(n_distinct)) as ArrayRef,
DataType::UInt32 => Arc::new(create_u32_array(n_distinct)) as ArrayRef,
DataType::Int16 => Arc::new(create_i16_array(n_distinct)) as ArrayRef,
DataType::UInt16 => Arc::new(create_u16_array(n_distinct)) as ArrayRef,
DataType::Int8 => Arc::new(create_i8_array(n_distinct)) as ArrayRef,
DataType::UInt8 => Arc::new(create_u8_array(n_distinct)) as ArrayRef,
_ => panic!("Unsupported type for benchmark"),
}
}

fn count_distinct_groups_benchmark(c: &mut Criterion) {
let count_fn = Count::new();

let group_counts = [100, 1000, 10000];
let cardinalities = [("low", 20), ("mid", 80), ("high", 99)];
let distributions = ["uniform", "skewed"];
let data_types = [
("i64", DataType::Int64),
("u32", DataType::UInt32),
("i32", DataType::Int32),
("u16", DataType::UInt16),
("i16", DataType::Int16),
("u8", DataType::UInt8),
("i8", DataType::Int8),
];

for (type_name, data_type) in data_types {
for num_groups in group_counts {
for (card_name, distinct_pct) in cardinalities {
for dist in distributions {
let name = format!("{type_name}_g{num_groups}_{card_name}_{dist}");
let n_distinct = BATCH_SIZE * distinct_pct / 100;
let values = create_array_for_type(&data_type, n_distinct);
let group_indices = if dist == "uniform" {
create_uniform_groups(num_groups)
} else {
create_skewed_groups(num_groups)
};

let (_schema, args) = prepare_args(data_type.clone());

if count_fn.groups_accumulator_supported(args.clone()) {
c.bench_function(&format!("count_distinct_groups {name}"), |b| {
b.iter(|| {
let mut acc =
count_fn.create_groups_accumulator(args.clone()).unwrap();
acc.update_batch(
std::slice::from_ref(&values),
&group_indices,
None,
num_groups,
)
.unwrap();
acc.evaluate(EmitTo::All).unwrap()
})
});
}
}
}
}
}
}

criterion_group!(benches, count_distinct_benchmark);
criterion_group!(
benches,
count_distinct_benchmark,
count_distinct_groups_benchmark
);
criterion_main!(benches);
Loading