Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,25 @@ jobs:
--exclude lance-bench --exclude datafusion-bench --exclude random-access-bench `
--exclude compress-bench --exclude xtask --exclude vortex-datafusion `
--exclude gpu-scan-cli --exclude vortex-sqllogictest
- name: Rust Tests (Windows - DuckDB)
if: matrix.os == 'windows-x64'
shell: pwsh
# Non-blocking: vortex-duckdb's C++ extras call DuckDB's internal C++ API, but the
# prebuilt Windows duckdb.dll only exports the C API, so the link step fails with
# unresolved externals (LNK1120). Linking would require building DuckDB from source
# on Windows with all symbols exported. Until then this still validates that the
# crate configures, generates bindings (bindgen/libclang) and compiles on MSVC.
continue-on-error: true
run: |
# vortex-duckdb's build.rs runs bindgen, which needs libclang. Use the
# preinstalled LLVM if present, otherwise install it via Chocolatey.
$llvm = "C:\Program Files\LLVM\bin"
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this a robust way to get the path, can we query chocolatey?

if (-not (Test-Path "$llvm\libclang.dll")) {
choco install llvm --no-progress -y
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

chocolatey has low ceiling rate-limiting, and strictly speaking requires a commercial license (maybe we can get away here without for vortex) but given the number of builds we run the rate-limiting is a real risk (i've recall this from embark studios where the ci shut down bc of that).

}
echo "LIBCLANG_PATH=$llvm" >> $env:GITHUB_ENV
$env:LIBCLANG_PATH = $llvm
cargo nextest run --cargo-profile ci --locked -p vortex-duckdb --all-features --no-fail-fast
- name: Rust Tests (Other)
if: matrix.os != 'windows-x64'
run: |
Expand Down
6 changes: 5 additions & 1 deletion vortex-duckdb/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,6 @@ crate-type = ["staticlib", "cdylib", "rlib"]
async-fs = { workspace = true }
async-trait = { workspace = true }
bitvec = { workspace = true }
custom-labels = { workspace = true }
futures = { workspace = true }
itertools = { workspace = true }
kanal = { workspace = true }
Expand All @@ -42,6 +41,11 @@ url = { workspace = true }
vortex = { workspace = true, features = ["files", "tokio", "object_store"] }
vortex-utils = { workspace = true, features = ["dashmap"] }

# `custom-labels` provides thread-local profiling labels but does not build under MSVC, so
# gate it to unix targets (matching vortex-io). Windows skips the labelset integration.
[target.'cfg(unix)'.dependencies]
custom-labels = { workspace = true }

[dev-dependencies]
anyhow = { workspace = true }
jiff = { workspace = true }
Expand Down
135 changes: 117 additions & 18 deletions vortex-duckdb/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
use std::env;
use std::fs;
use std::io;
#[cfg(unix)]
use std::os::unix::fs::symlink;
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we don't use a symlink what do we do for Windows.

use std::path::Path;
use std::path::PathBuf;
Expand All @@ -22,8 +23,6 @@ const DUCKDB_SOURCE_RELEASE_URL: &str = "https://github.com/duckdb/duckdb/archiv
const DUCKDB_SOURCE_COMMIT_URL: &str = "https://github.com/duckdb/duckdb/archive";
const DEFAULT_DUCKDB_VERSION: &str = "1.5.3";

const BUILD_ARTIFACTS: [&str; 3] = ["libduckdb.dylib", "libduckdb.so", "libduckdb_static.a"];

const SOURCE_FILES: [&str; 17] = [
"cpp/client_context.cpp",
"cpp/config.cpp",
Expand All @@ -47,6 +46,35 @@ const SOURCE_FILES: [&str; 17] = [
const DOWNLOAD_MAX_RETRIES: i32 = 3;
const DOWNLOAD_TIMEOUT: u64 = 90;

/// Returns `true` if the build *target* uses the MSVC toolchain (Windows `cl`/`link`).
///
/// Reads `CARGO_CFG_TARGET_ENV` rather than `cfg!(target_env = ..)` because a build
/// script's `cfg!` reflects the host, not the target being compiled for.
fn target_is_msvc() -> bool {
env::var("CARGO_CFG_TARGET_ENV").as_deref() == Ok("msvc")
}

/// Returns `true` if the build *target* OS is Windows.
///
/// Reads `CARGO_CFG_TARGET_OS` rather than `cfg!(target_os = ..)` for the same
/// host-vs-target reason as [`target_is_msvc`].
fn target_is_windows() -> bool {
env::var("CARGO_CFG_TARGET_OS").as_deref() == Ok("windows")
}

/// The DuckDB library files to look for on the current target.
///
/// DuckDB names its shared library per-OS (`.dylib` / `.so` / `.dll`) and, on unix, also
/// emits a static archive. We probe for these after a download or source build and copy
/// whichever exist, so the list must reflect the target rather than mixing every OS.
fn build_artifacts() -> &'static [&'static str] {
match env::var("CARGO_CFG_TARGET_OS").as_deref() {
Ok("macos") => &["libduckdb.dylib", "libduckdb_static.a"],
Ok("windows") => &["duckdb.dll"],
_ => &["libduckdb.so", "libduckdb_static.a"],
}
}

#[derive(Debug)]
struct BindgenCargoCallbacks;

Expand Down Expand Up @@ -178,6 +206,8 @@ fn download(version: &DuckDBVersion, library_dir: &Path) {
"aarch64-apple-darwin" | "x86_64-apple-darwin" => ("osx", "universal"),
"x86_64-unknown-linux-gnu" => ("linux", "amd64"),
"aarch64-unknown-linux-gnu" => ("linux", "arm64"),
"x86_64-pc-windows-msvc" => ("windows", "amd64"),
"aarch64-pc-windows-msvc" => ("windows", "arm64"),
_ => {
println!("cargo:error=Unsupported target {target}");
exit(1);
Expand All @@ -192,7 +222,7 @@ fn download(version: &DuckDBVersion, library_dir: &Path) {
download_url(&url, &archive_path);

let duckdb_lib_dir = archive_path.parent().unwrap().to_path_buf();
for artifact in BUILD_ARTIFACTS {
for artifact in build_artifacts() {
if duckdb_lib_dir.join(artifact).exists() {
return;
}
Expand Down Expand Up @@ -264,7 +294,7 @@ fn try_build_duckdb(
let build_src_dir = build_dir.join("src");

let mut build = true;
for artifact in BUILD_ARTIFACTS {
for artifact in build_artifacts() {
let path = build_src_dir.join(artifact);
if path.exists() {
println!("cargo:info=Found {artifact} in {}", path.display());
Expand All @@ -287,7 +317,7 @@ fn try_build_duckdb(
fs::create_dir_all(library_dir).unwrap();

let mut found_artifact = false;
for artifact in BUILD_ARTIFACTS {
for artifact in build_artifacts() {
let src = build_src_dir.join(artifact);
if !src.exists() {
continue;
Expand All @@ -298,7 +328,7 @@ fn try_build_duckdb(
}

if !found_artifact {
let artifacts = BUILD_ARTIFACTS.join(",");
let artifacts = build_artifacts().join(",");
println!("cargo:error=Failed to find any of {artifacts} after build");
exit(1);
}
Expand Down Expand Up @@ -351,15 +381,31 @@ fn bindgen_c2rust(crate_dir: &Path, duckdb_include_dir: &Path) {
}
}

/// Configure the C++ compiler flags and the DuckDB header include path on `build`.
///
/// MSVC's `cl` does not understand the GCC/Clang `-W*`/`-isystem` flags, so it gets `/W4`
/// and a plain include. It also needs `/EHsc` to enable standard C++ exception unwinding
/// (which the DuckDB headers rely on). We deliberately do not enable `/WX`
/// (warnings-as-errors) there, so warnings originating inside the DuckDB headers don't fail
/// the build. Other toolchains keep the stricter `-Werror` set with the headers added via
/// `-isystem` for the same reason.
fn configure_cpp_warnings(build: &mut cc::Build, duckdb_include_dir: &Path) {
if target_is_msvc() {
build.flag("/W4").flag("/EHsc").include(duckdb_include_dir);
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we ignore header warnings for Windows? Ideally, we could match the UNIX setup here fully.

} else {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not just build with Clang actually. We need Clang anyway bc of libclang and clang is abi compatible with msvc on windows.

build
.flags(["-Wall", "-Wextra", "-Wpedantic", "-Werror"])
.flag("-isystem")
.flag(duckdb_include_dir);
}
}

/// Generate libvortex_duckdb.*
fn compile_cpp(duckdb_include_dir: &Path) {
cc::Build::new()
.std("c++20")
.flags(["-Wall", "-Wextra", "-Wpedantic", "-Werror"])
.cpp(true)
// We don't want compiler warnings inside duckdb headers, pass as flags
.flag("-isystem")
.flag(duckdb_include_dir)
let mut build = cc::Build::new();
build.std("c++20").cpp(true);
configure_cpp_warnings(&mut build, duckdb_include_dir);
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any way to retain the builder pattern style?

build
.include("include")
.include("cpp/include")
.files(SOURCE_FILES)
Expand Down Expand Up @@ -396,6 +442,56 @@ fn cbindgen_rust2c(crate_dir: &Path) {
}
}

/// The linker argument that embeds an rpath pointing at the DuckDB library directory, or
/// `None` on platforms whose linker rejects `-Wl,-rpath` (e.g. Windows/MSVC, where the DLL
/// is resolved via `PATH` / the executable directory instead).
fn rpath_link_arg(library_dir: &Path) -> Option<String> {
if target_is_windows() {
None
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How do we ensure then that libduckdb is added to the path?

} else {
Some(format!("-Wl,-rpath,{}", library_dir.display()))
}
}

/// On Windows, copy `duckdb.dll` next to the built artifacts so the loader can find it at
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For all comments, double check whether we can make them more concise and readable.

/// run time. Windows has no rpath: a DLL is resolved from the executable's own directory or
/// from `PATH`. We place it in the Cargo profile dir and its `deps/` subdir (where `nextest`
/// runs test executables). No-op elsewhere, where rpath / `*_LIBRARY_PATH` handle this.
fn copy_runtime_dll(library_dir: &Path, out_dir: &Path) {
if !target_is_windows() {
return;
}
let dll = library_dir.join("duckdb.dll");
if !dll.exists() {
return;
}
// OUT_DIR is `<target>/<profile>/build/<pkg>/out`; the profile dir four levels up holds
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This differs for arch local and cross-compilation:

  • local : target/<profile>/build/<pkg-hash>/out
  • cross: target/<triple>/<profile>/build/<pkg-hash>/out

// the binaries and its `deps/` holds the test executables.
let Some(profile_dir) = out_dir.ancestors().nth(3) else {
return;
};
for dir in [profile_dir.to_path_buf(), profile_dir.join("deps")] {
if dir.is_dir() {
drop(fs::copy(&dll, dir.join("duckdb.dll")));
}
}
}

/// Create a convenience symlink `crate_dir/duckdb` -> the extracted DuckDB source tree.
/// This is purely for editor/tooling navigation; the build uses the source path directly.
/// Only created on unix, where the symlink API needs no special privilege.
#[cfg(unix)]
fn symlink_duckdb_source(source_dir: &Path, duckdb_dir: &Path) {
drop(fs::remove_file(duckdb_dir));
drop(fs::remove_dir_all(duckdb_dir));
symlink(source_dir, duckdb_dir).unwrap();
}

/// No-op on non-unix platforms (e.g. Windows): the symlink is only a convenience and
/// Windows directory symlinks require elevated privileges.
#[cfg(not(unix))]
fn symlink_duckdb_source(_source_dir: &Path, _duckdb_dir: &Path) {}

fn main() {
println!("cargo:rerun-if-env-changed=DUCKDB_VERSION");
println!("cargo:rerun-if-env-changed=VX_DUCKDB_DEBUG");
Expand Down Expand Up @@ -427,8 +523,11 @@ fn main() {
println!("cargo:rustc-link-lib=dylib=duckdb");

// Set rpath for binaries built directly from this crate. This is not
// inherited by downstream crates.
println!("cargo:rustc-link-arg=-Wl,-rpath,{library_dir_str}");
// inherited by downstream crates. Skipped on platforms whose linker rejects
// `-Wl,-rpath` (see `rpath_link_arg`).
if let Some(rpath_arg) = rpath_link_arg(&library_dir) {
println!("cargo:rustc-link-arg={rpath_arg}");
}

// Export the library path for downstream crates via the `links` manifest key.
// Downstream crates can access this via `env::var("DEP_DUCKDB_LIB_DIR")` in their build.rs
Expand Down Expand Up @@ -467,9 +566,7 @@ fn main() {
fs::write(&extract_marker, version.to_string()).unwrap();
}

drop(fs::remove_file(&duckdb_dir));
drop(fs::remove_dir_all(&duckdb_dir));
symlink(&source_dir, &duckdb_dir).unwrap();
symlink_duckdb_source(&source_dir, &duckdb_dir);

let has_debug_env =
env::var("VX_DUCKDB_DEBUG").is_ok_and(|v| matches!(v.as_str(), "1" | "true"));
Expand All @@ -485,6 +582,8 @@ fn main() {
download(&version, &library_dir);
};

copy_runtime_dll(&library_dir, &out_dir);

let duckdb_include_dir = inner_dir.join("src").join("include");
println!("cargo:rerun-if-changed=cpp/include");
bindgen_c2rust(&crate_dir, &duckdb_include_dir);
Expand Down
4 changes: 2 additions & 2 deletions vortex-duckdb/cpp/include/duckdb_vx/vector.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,9 @@ void duckdb_vx_vector_set_validity_data(duckdb_vector ffi_vector,
// Converts a duckdb flat vector into a Sequence vector.
void duckdb_vx_sequence_vector(duckdb_vector c_vector, int64_t start, int64_t step, idx_t capacity);

void duckdb_vector_flatten(duckdb_vector vector, unsigned long len);
void duckdb_vector_flatten(duckdb_vector vector, idx_t len);

const char *duckdb_vector_to_string(duckdb_vector vector, unsigned long len, duckdb_vx_error *err);
const char *duckdb_vector_to_string(duckdb_vector vector, idx_t len, duckdb_vx_error *err);

duckdb_value duckdb_vx_vector_get_value(duckdb_vector ffi_vector, idx_t index);

Expand Down
8 changes: 6 additions & 2 deletions vortex-duckdb/cpp/vector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -127,12 +127,12 @@ extern "C" duckdb_value duckdb_vx_vector_get_value(duckdb_vector ffi_vector, idx
return reinterpret_cast<duckdb_value>(value.release());
}

void duckdb_vector_flatten(duckdb_vector vector, unsigned long len) {
void duckdb_vector_flatten(duckdb_vector vector, idx_t len) {
auto dvector = reinterpret_cast<Vector *>(vector);
dvector->Flatten(len);
}

const char *duckdb_vector_to_string(duckdb_vector vector, unsigned long len, duckdb_vx_error *err) {
const char *duckdb_vector_to_string(duckdb_vector vector, idx_t len, duckdb_vx_error *err) {
try {
auto dvector = reinterpret_cast<Vector *>(vector);
auto str = dvector->ToString(len);
Expand Down Expand Up @@ -160,6 +160,10 @@ void duckdb_vx_vector_set_all_valid(duckdb_vector ffi_vector) {
case FSST_VECTOR:
return FSSTVector::Validity(vector).Reset();
default:
#if defined(_MSC_VER)
__assume(false);
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we build with clang, we don't need any of these specializations.

#else
__builtin_unreachable();
#endif
}
}
29 changes: 17 additions & 12 deletions vortex-duckdb/src/table_function.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,10 @@ use std::sync::atomic::AtomicBool;
use std::sync::atomic::AtomicU64;
use std::sync::atomic::Ordering;

#[cfg(unix)]
use custom_labels::CURRENT_LABELSET;
#[cfg(unix)]
use custom_labels::sys as custom_labels_sys;
use futures::StreamExt;
use itertools::Itertools;
use num_traits::AsPrimitive;
Expand All @@ -33,6 +36,7 @@ use vortex::io::runtime::BlockingRuntime as _;
use vortex::io::runtime::current::ThreadSafeIterator;
use vortex::layout::scan::multi::MultiLayoutChild;
use vortex::layout::scan::multi::MultiLayoutDataSource;
#[cfg(unix)]
use vortex::metrics::tracing::get_global_labels;
use vortex::scalar_fn::fns::binary::Binary;
use vortex::scalar_fn::fns::operators::Operator;
Expand Down Expand Up @@ -286,19 +290,20 @@ pub fn init_global(init_input: &TableInitInput) -> VortexResult<TableFunctionGlo
}

pub fn init_local(global: &TableFunctionGlobal) -> TableFunctionLocal {
unsafe {
use custom_labels::sys;

if sys::current().is_null() {
let ls = sys::new(0);
sys::replace(ls);
};
}

let global_labels = get_global_labels();
// `custom-labels` is unix-only (it does not build under MSVC), so the thread-local
// labelset integration is skipped on other platforms.
#[cfg(unix)]
{
unsafe {
if custom_labels_sys::current().is_null() {
let ls = custom_labels_sys::new(0);
custom_labels_sys::replace(ls);
};
}

for (key, value) in global_labels {
CURRENT_LABELSET.set(key, value);
for (key, value) in get_global_labels() {
CURRENT_LABELSET.set(key, value);
}
}

TableFunctionLocal {
Expand Down
Loading