Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ The following table summarizes the key features of each algorithm:
| **[Odyssey](https://helios2.mi.parisdescartes.fr/~themisp/odyssey/)** | Distributed and parallel in-memory similarity search |
| **[SOFA](https://helios2.mi.parisdescartes.fr/~themisp/publications/icde25-sofa.pdf)** | In-memory similarity search using Symbolic Fourier Approximation (SFA) |
| **[Hercules](https://helios2.mi.parisdescartes.fr/~themisp/publications/pvldb22-hercules.pdf)** | In-memory hierarchical similarity search using EAPCA and SAX-based pruning |
| **[DumpyOS](https://helios2.mi.parisdescartes.fr/~themisp/publications/vldbj24-dumpyos.pdf)** | In-memory scalable data series similarity search using an adaptive multi-ary iSAX index |



Expand Down
21 changes: 21 additions & 0 deletions benchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,27 @@ target_include_directories(bm_Sofa_L2Square
)
endif()

# ////// DUMPYOS //////
# Benchmark executable for the DumpyOS index. The benchmark utilities and the
# shared sources under ../commons are compiled directly into this target.
add_executable(bm_DumpyOS_L2Square
bm_DumpyOS_L2Square.cpp
bm_utils.cpp
../commons/paramSetup.cpp
../commons/test_bm_utils.cpp
../commons/dataloaders.cpp
)
# NOTE(review): bm_DumpyOS_L2Square.cpp already expands BENCHMARK_MAIN(), so
# benchmark::benchmark_main looks redundant here — confirm before removing.
target_link_libraries(bm_DumpyOS_L2Square
PRIVATE
benchmark::benchmark
benchmark::benchmark_main
dino_lib
GTest::gtest
)
# Lets the sources resolve headers from ../lib and ../commons.
target_include_directories(bm_DumpyOS_L2Square
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/../lib
${CMAKE_CURRENT_SOURCE_DIR}/../commons
)

# ////// HERCULES //////
add_executable(bm_Hercules_L2Square
bm_Hercules_L2Square.cpp
Expand Down
147 changes: 147 additions & 0 deletions benchmark/bm_DumpyOS_L2Square.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
#include <benchmark/benchmark.h>
#include <cstdio>
#include <string>
#include "bm_utils.hpp"
#include "../commons/dataloaders.hpp"
#include "../commons/VectorDataLoader.h"
#include "../commons/test_bm_utils.hpp"
#include "../lib/algos/DumpyOS.hpp"
#include "../lib/algos/DataSource.hpp"

/// Returns true when `s` terminates with `suffix`.
/// An empty `suffix` matches every string, including the empty one.
static bool endsWith(const std::string& s, const std::string& suffix) {
    const std::string::size_type tail_len = suffix.size();
    // A suffix longer than the string itself can never match.
    if (tail_len > s.size()) {
        return false;
    }
    // Compare the last `tail_len` characters of `s` against `suffix`.
    const std::string::size_type tail_start = s.size() - tail_len;
    for (std::string::size_type pos = 0; pos < tail_len; ++pos) {
        if (s[tail_start + pos] != suffix[pos]) {
            return false;
        }
    }
    return true;
}

/// Google Benchmark fixture for the DumpyOS search-only benchmark.
///
/// SetUp() picks a configuration from test_configs_deep_seismic_astro270m using
/// state.range(0), loads the dataset and the queries (fvecs when either path
/// ends in ".fvecs", otherwise the binary loader driven by metadata parsed from
/// the file names), builds the index and allocates the result buffers.
///
/// On any failure SetUp() logs to std::cerr and returns early, leaving `search`
/// null. Every buffer that is still referenced by a member is released exactly
/// once in TearDown(); error paths that free a buffer themselves also null the
/// member so TearDown() cannot free it a second time.
struct DumpyOSSearchOnlyFixture : public benchmark::Fixture {
    daisy::DumpyOS* search = nullptr;  // index under test; remains null when SetUp() fails
    float* database = nullptr;         // dataset buffer, released with delete[] in TearDown()
    float* query = nullptr;            // query buffer, released with delete[] in TearDown()
    daisy::idx_t* I = nullptr;         // result ids, n_query * k entries
    float* D = nullptr;                // result distances, n_query * k entries
    daisy::idx_t n_query = 0;
    size_t k = 0;
    std::string dataset_name;
    size_t n_database = 0;
    int thread_count = 0;

    /// Loads the data for configuration index state.range(0) and builds the index.
    void SetUp(const benchmark::State& state) override {
        int config_idx = static_cast<int>(state.range(0));
        const SSTestConfig& config = test_configs_deep_seismic_astro270m[config_idx];

        const bool use_fvecs = endsWith(config.dataset_path, ".fvecs") || endsWith(config.query_path, ".fvecs");
        size_t dim_u = 0, n_database_u = 0, n_q_u = 0;
        database = nullptr;

        if (use_fvecs) {
            database = fvecs_read(config.dataset_path.c_str(), &dim_u, &n_database_u, 0);
            if (!database) {
                std::cerr << "Failed to load dataset (fvecs)" << std::endl;
                return;
            }
            const size_t query_limit = (config.query_limit > 0) ? static_cast<size_t>(config.query_limit) : 0;
            // Read the query dimension into its own variable so the dataset
            // dimension is not overwritten and a mismatch can be detected.
            size_t dim_q_u = 0;
            query = fvecs_read(config.query_path.c_str(), &dim_q_u, &n_q_u, query_limit);
            if (!query) {
                std::cerr << "Failed to load queries (fvecs)" << std::endl;
                delete[] database;
                database = nullptr;  // TearDown() also deletes `database`; avoid a double free
                return;
            }
            if (dim_q_u != dim_u) {
                // Loaded buffers stay in the members and are released by TearDown().
                std::cerr << "Dimension mismatch between dataset and queries" << std::endl;
                return;
            }
        } else {
            std::string dataset_filename = pathToFilename(config.dataset_path);
            std::string query_filename = pathToFilename(config.query_path);

            // `n_db` is deliberately not called n_database so it does not shadow the member.
            daisy::idx_t dim, n_db, _, __;
            if (!parseFilenameForConfig(dataset_filename, "bruteForce", dim, n_db, _, __)) {
                std::cerr << "Failed to parse dataset config from filename: " << dataset_filename << std::endl;
                return;
            }

            daisy::idx_t dim_q, n_q, ___, ____;
            if (!parseFilenameForConfig(query_filename, "bruteForce", dim_q, n_q, ___, ____)) {
                std::cerr << "Failed to parse query config from filename: " << query_filename << std::endl;
                return;
            }

            if (dim != static_cast<daisy::idx_t>(dim_q)) {
                std::cerr << "Dimension mismatch between dataset and queries" << std::endl;
                return;
            }

            dim_u = static_cast<size_t>(dim);
            n_database_u = static_cast<size_t>(n_db);
            // Cap the number of queries when the configuration asks for fewer than the file holds.
            if (config.query_limit > 0 && static_cast<daisy::idx_t>(config.query_limit) < n_q)
                n_q = static_cast<daisy::idx_t>(config.query_limit);
            n_q_u = static_cast<size_t>(n_q);

            database = loadBinData(config.dataset_path.c_str(), n_db, dim, false);
            if (!database) {
                std::cerr << "Failed to load dataset" << std::endl;
                return;
            }

            query = loadBinData(config.query_path.c_str(), n_q, dim_q, false);
            if (!query) {
                std::cerr << "Failed to load queries" << std::endl;
                delete[] database;
                database = nullptr;  // TearDown() also deletes `database`; avoid a double free
                return;
            }
        }

        search = new daisy::DumpyOS(daisy::DistanceType::L2_SQUARED);
        search->setNumThreads(config.thread_count);

        fprintf(stderr, "[DUMPYOS] Before buildIndex (n_database=%zu dim=%zu).\n", n_database_u, dim_u);
        fflush(stderr);

        // NOTE(review): data_source is a stack local; this assumes buildIndex()
        // does not keep the pointer after it returns — confirm.
        daisy::InMemoryDataSource data_source(database, static_cast<daisy::idx_t>(n_database_u), static_cast<daisy::idx_t>(dim_u));
        search->buildIndex(&data_source);

        fprintf(stderr, "[DUMPYOS] Indexing finished (n_database=%zu dim=%zu).\n", n_database_u, dim_u);
        fflush(stderr);

        k = static_cast<size_t>(config.k_value);
        n_query = static_cast<daisy::idx_t>(n_q_u);
        I = new daisy::idx_t[n_query * k];
        D = new float[n_query * k];

        dataset_name = config.name;
        n_database = n_database_u;
        thread_count = config.thread_count;

        fprintf(stderr, "[DUMPYOS] n_database=%zu n_query=%zu dim=%zu k=%zu threads=%d\n",
                n_database_u, (size_t)n_query, dim_u, k, config.thread_count);
        fflush(stderr);
    }

    /// Releases everything SetUp() allocated and resets the members so the
    /// fixture object can be set up again for the next configuration.
    void TearDown(const benchmark::State&) override {
        delete search;
        delete[] database;
        delete[] query;
        delete[] I;
        delete[] D;
        search = nullptr;
        database = nullptr;
        query = nullptr;
        I = nullptr;
        D = nullptr;
        // Do not let sizes from this run leak into a later run whose SetUp() fails.
        n_query = 0;
        k = 0;
    }
};

/// Search-only benchmark: each iteration runs searchIndex() over all loaded
/// queries, writing ids to I and distances to D, and logs progress to stderr.
BENCHMARK_DEFINE_F(DumpyOSSearchOnlyFixture, BM_DumpyOS_SearchOnly)(benchmark::State& state) {
    // SetUp() returns early without creating the index when loading fails;
    // report the run as an error instead of dereferencing a null pointer.
    if (search == nullptr || query == nullptr || I == nullptr || D == nullptr) {
        state.SkipWithError("DumpyOS fixture setup failed; search benchmark skipped");
        return;
    }

    for (auto _ : state) {
        fprintf(stderr, "[DUMPYOS] --- Query phase ---\n");
        fprintf(stderr, "[DUMPYOS] dataset=%s n_database=%zu\n", dataset_name.c_str(), n_database);
        fprintf(stderr, "[DUMPYOS] search_threads=%d n_query=%zu k=%zu\n", thread_count, (size_t)n_query, k);
        fflush(stderr);
        search->searchIndex(query, n_query, static_cast<daisy::idx_t>(k), I, D);
        fprintf(stderr, "[DUMPYOS] Querying finished (n_query=%zu k=%zu).\n", (size_t)n_query, k);
        fflush(stderr);
    }
}

// Register one run per configuration index; the fixture reads the index via
// state.range(0). Each run executes a single iteration, reported in milliseconds.
// NOTE(review): indices 0..7 are hard-coded — presumably
// test_configs_deep_seismic_astro270m has at least 8 entries; confirm.
BENCHMARK_REGISTER_F(DumpyOSSearchOnlyFixture, BM_DumpyOS_SearchOnly)
->Args({0})->Args({1})->Args({2})->Args({3})->Args({4})->Args({5})->Args({6})->Args({7})
->Iterations(1)
->Unit(benchmark::kMillisecond);

// Google Benchmark macro that supplies the program entry point.
BENCHMARK_MAIN();
60 changes: 60 additions & 0 deletions demos/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -436,6 +436,66 @@ if(BUILD_DEMO)
message(STATUS "Include directories added for demo_Hercules_L2Square.")
endif()

# ////// DUMPYOS L2Square //////
# Demo executable built from demo_DumpyOS_L2Square.cpp. Each step below is
# bracketed by STATUS messages that are emitted only when DEBUG_MSG is set.
if(DEBUG_MSG)
message(STATUS "---")
message(STATUS "## Demo: DumpyOS L2Square")
message(STATUS "Attempting to add executable: demo_DumpyOS_L2Square")
endif()
add_executable(demo_DumpyOS_L2Square demo_DumpyOS_L2Square.cpp)
if(DEBUG_MSG)
message(STATUS "Executable demo_DumpyOS_L2Square added.")
endif()

# Link against the library target and the shared commons target.
if(DEBUG_MSG)
message(STATUS "Linking libraries for demo_DumpyOS_L2Square...")
endif()
target_link_libraries(demo_DumpyOS_L2Square PRIVATE dino_lib commons_lib)
if(DEBUG_MSG)
message(STATUS "Libraries linked for demo_DumpyOS_L2Square.")
endif()

# Lets the demo resolve headers from ../lib and ../commons.
if(DEBUG_MSG)
message(STATUS "Adding include directories for demo_DumpyOS_L2Square...")
endif()
target_include_directories(demo_DumpyOS_L2Square PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/../lib
${CMAKE_CURRENT_SOURCE_DIR}/../commons
)
if(DEBUG_MSG)
message(STATUS "Include directories added for demo_DumpyOS_L2Square.")
endif()

# ////// DUMPYOS DTW //////
# Demo executable built from demo_DumpyOS_DTW.cpp. Each step below is
# bracketed by STATUS messages that are emitted only when DEBUG_MSG is set.
if(DEBUG_MSG)
message(STATUS "---")
message(STATUS "## Demo: DumpyOS DTW")
message(STATUS "Attempting to add executable: demo_DumpyOS_DTW")
endif()
add_executable(demo_DumpyOS_DTW demo_DumpyOS_DTW.cpp)
if(DEBUG_MSG)
message(STATUS "Executable demo_DumpyOS_DTW added.")
endif()

# Link against the library target and the shared commons target.
if(DEBUG_MSG)
message(STATUS "Linking libraries for demo_DumpyOS_DTW...")
endif()
target_link_libraries(demo_DumpyOS_DTW PRIVATE dino_lib commons_lib)
if(DEBUG_MSG)
message(STATUS "Libraries linked for demo_DumpyOS_DTW.")
endif()

# Lets the demo resolve headers from ../lib and ../commons.
if(DEBUG_MSG)
message(STATUS "Adding include directories for demo_DumpyOS_DTW...")
endif()
target_include_directories(demo_DumpyOS_DTW PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/../lib
${CMAKE_CURRENT_SOURCE_DIR}/../commons
)
if(DEBUG_MSG)
message(STATUS "Include directories added for demo_DumpyOS_DTW.")
endif()

else()
if(DEBUG_MSG)
message(STATUS "BUILD_DEMO is FALSE. Demo executables will NOT be built.")
Expand Down
44 changes: 44 additions & 0 deletions demos/demo_DumpyOS_DTW.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#include "../commons/dataloaders.hpp"
#include "../lib/daisy.hpp"
#include <chrono>
#include <algorithm>

/// Demo: builds a DumpyOS index configured for DTW over generated data, runs a
/// k-nearest-neighbour search for a handful of queries and prints the ids
/// returned for each query.
int main()
{
    daisy::idx_t n_database = 200000;
    unsigned long long dim = 96;
    unsigned long long n_query = 10;
    daisy::idx_t k = 5;

    float *database = loadRandomData(n_database, dim, 100, true);
    float *query = loadRandomData(n_query, dim, 50, true);

    // daisy::idx_t is a typedef whose underlying type is not fixed here; cast
    // explicitly so the argument always matches the %llu conversion.
    printf("Loaded %llu database points and %llu query points with dimension %llu\n",
           static_cast<unsigned long long>(n_database), n_query, dim);

    daisy::DumpyOS dumpyos_search(daisy::DistanceType::DTW);
    dumpyos_search.setNumThreads(4);

    // Warping window: 10% of the series length, but never less than 1.
    int warp_window = std::max(1, static_cast<int>(dim * 0.1));
    dumpyos_search.setWarpingWindow(warp_window);

    dumpyos_search.buildIndex(database, n_database, dim);

    // Result buffers hold k entries per query, stored query after query.
    daisy::idx_t *I = new daisy::idx_t[n_query * k];
    float *D = new float[n_query * k];
    dumpyos_search.searchIndex(query, n_query, k, I, D);

    for (daisy::idx_t i = 0; i < n_query; i++) {
        printf("Query %llu: ", static_cast<unsigned long long>(i));
        for (daisy::idx_t j = 0; j < k; j++) {
            printf("%llu ", static_cast<unsigned long long>(I[i * k + j]));
        }
        printf("\n");
    }

    delete[] database;
    delete[] query;
    delete[] I;
    delete[] D;

    return 0;
}
37 changes: 37 additions & 0 deletions demos/demo_DumpyOS_DTW.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import sys
import os
import numpy as np

sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from daisy import DistanceType, DumpyOS

def main():
    """Build a DumpyOS index with the DTW distance over random data and print
    the distances and indices returned for every query."""
    # Problem size: number of series, series length, number of queries, neighbours.
    num_series, series_len = 200000, 96
    num_queries, top_k = 10, 5

    # Re-seed before each draw so database and queries are reproducible.
    np.random.seed(100)
    db = np.random.randn(num_series, series_len).astype(np.float32)

    np.random.seed(50)
    query = np.random.randn(num_queries, series_len).astype(np.float32)

    index = DumpyOS(DistanceType.DTW)
    # Warping window: 10% of the series length, but never less than 1.
    warp_window = max(1, int(series_len * 0.1))
    index.setWarpingWindow(warp_window)

    index.setNumThreads(4)
    index.buildIndex(db)

    I, D = index.searchIndex(query, top_k)

    query_num = 0
    while query_num < num_queries:
        print(f"Query {query_num}:")
        print("Distances:", D[query_num])
        print("Indices:", I[query_num])
        print()
        query_num += 1


if __name__ == "__main__":
    main()
42 changes: 42 additions & 0 deletions demos/demo_DumpyOS_L2Square.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#include "../commons/dataloaders.hpp"
#include "../lib/daisy.hpp"
#include <chrono>

/// Demo: builds a DumpyOS index configured for squared-L2 distance over
/// generated data, runs a k-nearest-neighbour search for a handful of queries
/// and prints the ids returned for each query.
int main(){

    daisy::idx_t n_database = 200000;
    unsigned long long dim = 96;
    unsigned long long n_query = 10;
    daisy::idx_t k = 5;

    float *database = loadRandomData(n_database, dim, 100, true);
    float *query = loadRandomData(n_query, dim, 50, true);

    // daisy::idx_t is a typedef whose underlying type is not fixed here; cast
    // explicitly so the argument always matches the %llu conversion.
    printf("Loaded %llu database points and %llu query points with dimension %llu\n",
           static_cast<unsigned long long>(n_database), n_query, dim);

    daisy::DumpyOS dumpyos_search(daisy::DistanceType::L2_SQUARED);
    dumpyos_search.setNumThreads(4);

    dumpyos_search.buildIndex(database, n_database, dim);

    // Result buffers hold k entries per query, stored query after query.
    daisy::idx_t *I = new daisy::idx_t[n_query * k];
    float *D = new float[n_query * k];
    dumpyos_search.searchIndex(query, n_query, k, I, D);

    for (daisy::idx_t i = 0; i < n_query; i++)
    {
        printf("Query %llu: ", static_cast<unsigned long long>(i));
        for (daisy::idx_t j = 0; j < k; j++)
        {
            printf("%llu ", static_cast<unsigned long long>(I[i * k + j]));
        }
        printf("\n");
    }

    delete[] database;
    delete[] query;
    delete[] I;
    delete[] D;

    return 0;
}
Loading
Loading