Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 77 additions & 0 deletions src/duckdb_py/include/duckdb_python/numpy/numpy_array.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
//===----------------------------------------------------------------------===//
// DuckDB
//
// duckdb_python/numpy/numpy_array.hpp
//
//
//===----------------------------------------------------------------------===//

#pragma once

#include "duckdb_python/pybind11/pybind_wrapper.hpp"
#include "duckdb.hpp"

namespace duckdb {

//! Thin façade over pybind11's `py::array`.
//!
//! This class is the SINGLE place in the codebase that names `py::array` as the
//! underlying numpy-array representation. A future migration to nanobind's
//! `nb::ndarray` should only require changing the member type and the handful of
//! small methods defined here -- every call site goes through this wrapper
//! instead of touching `py::array` directly.
//!
//! For operations that don't (yet) have a first-class method on the façade
//! (Python attribute access via `.attr(...)`, iteration, resizing, handing the
//! array back to Python, ...) use `GetArray()` to reach the underlying object.
class NumpyArray {
public:
	//! Default-construct the wrapper; the wrapped `py::array` is default-constructed as well.
	NumpyArray() = default;
	//! Wrap an existing numpy array. A `py::object` argument is implicitly
	//! converted to a `py::array` (np.asarray semantics), matching the behaviour
	//! the call sites relied on before this façade existed.
	//! The constructor is `explicit`, so call sites have to spell out `NumpyArray(...)`.
	explicit NumpyArray(py::array arr) : array(std::move(arr)) {
	}

	//! Copy and move are the defaulted member-wise operations on the wrapped `py::array`.
	NumpyArray(NumpyArray &&) = default;
	NumpyArray &operator=(NumpyArray &&) = default;
	NumpyArray(const NumpyArray &) = default;
	NumpyArray &operator=(const NumpyArray &) = default;

public:
	//! Allocate a fresh, contiguous 1-D numpy array of `count` elements with the
	//! given dtype.
	static NumpyArray Allocate(const py::dtype &dtype, idx_t count) {
		// `py::array` accepts the dtype by const reference, so hand it over directly
		// rather than building a temporary `py::dtype` copy first.
		return NumpyArray(py::array(dtype, count));
	}

	//! Produce a numpy array from an arbitrary Python object (np.asarray semantics).
	//! `obj` is taken by value and moved into the `py::array` conversion.
	static NumpyArray FromObject(py::object obj) {
		return NumpyArray(py::array(std::move(obj)));
	}

	//! Read-only pointer to the underlying data buffer (wraps `py::array::data()`).
	//! The pointer is untyped; callers cast it to the element type they expect.
	const void *Data() const {
		return array.data();
	}

	//! Mutable pointer to the underlying data buffer (wraps `py::array::mutable_data()`).
	void *MutableData() {
		return array.mutable_data();
	}

	//! Access the underlying array, e.g. for `.attr(...)` calls, iteration, or to
	//! hand it back to Python.
	py::array &GetArray() {
		return array;
	}
	//! Const overload of `GetArray()` for read-only access to the underlying array.
	const py::array &GetArray() const {
		return array;
	}

private:
	//! The single data member -- the one spot that later becomes `nb::ndarray`.
	py::array array;
};

} // namespace duckdb
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#pragma once

#include "duckdb_python/pybind11/pybind_wrapper.hpp"
#include "duckdb_python/numpy/numpy_array.hpp"
#include "duckdb.hpp"

namespace duckdb {
Expand All @@ -17,7 +18,7 @@ struct RawArrayWrapper {

explicit RawArrayWrapper(const LogicalType &type);

py::array array;
NumpyArray array;
data_ptr_t data;
LogicalType type;
idx_t type_width;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,20 @@

#include "duckdb_python/pandas/pandas_column.hpp"
#include "duckdb_python/pybind11/pybind_wrapper.hpp"
#include "duckdb_python/numpy/numpy_array.hpp"

namespace duckdb {

class PandasNumpyColumn : public PandasColumn {
public:
PandasNumpyColumn(py::array array_p) : PandasColumn(PandasColumnBackend::NUMPY), array(std::move(array_p)) {
D_ASSERT(py::hasattr(array, "strides"));
stride = array.attr("strides").attr("__getitem__")(0).cast<idx_t>();
PandasNumpyColumn(NumpyArray array_p) : PandasColumn(PandasColumnBackend::NUMPY), array(std::move(array_p)) {
auto &arr = array.GetArray();
D_ASSERT(py::hasattr(arr, "strides"));
stride = arr.attr("strides").attr("__getitem__")(0).cast<idx_t>();
}

public:
py::array array;
NumpyArray array;
idx_t stride;
};

Expand Down
5 changes: 3 additions & 2 deletions src/duckdb_py/include/duckdb_python/pandas/pandas_bind.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include "duckdb_python/pybind11/pybind_wrapper.hpp"
#include "duckdb_python/pybind11/python_object_container.hpp"
#include "duckdb_python/numpy/numpy_type.hpp"
#include "duckdb_python/numpy/numpy_array.hpp"
#include "duckdb/common/helper.hpp"
#include "duckdb_python/pandas/pandas_column.hpp"

Expand All @@ -11,9 +12,9 @@ namespace duckdb {
class ClientContext;

struct RegisteredArray {
explicit RegisteredArray(py::array numpy_array) : numpy_array(std::move(numpy_array)) {
explicit RegisteredArray(NumpyArray numpy_array) : numpy_array(std::move(numpy_array)) {
}
py::array numpy_array;
NumpyArray numpy_array;
};

struct PandasColumnBindData {
Expand Down
8 changes: 4 additions & 4 deletions src/duckdb_py/numpy/array_wrapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -739,15 +739,15 @@ void ArrayWrapper::Append(idx_t current_offset, Vector &input, idx_t source_size
}

py::object ArrayWrapper::ToArray() const {
D_ASSERT(data->array && mask->array);
D_ASSERT(data->array.GetArray() && mask->array.GetArray());
data->Resize(data->count);
if (!requires_mask) {
return std::move(data->array);
return std::move(data->array.GetArray());
}
mask->Resize(mask->count);
// construct numpy arrays from the data and the mask
auto values = std::move(data->array);
auto nullmask = std::move(mask->array);
auto values = std::move(data->array.GetArray());
auto nullmask = std::move(mask->array.GetArray());

// create masked array and return it
auto masked_array = py::module::import("numpy.ma").attr("masked_array")(values, nullmask);
Expand Down
7 changes: 4 additions & 3 deletions src/duckdb_py/numpy/numpy_bind.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include "duckdb_python/numpy/numpy_bind.hpp"
#include "duckdb_python/numpy/array_wrapper.hpp"
#include "duckdb_python/numpy/numpy_array.hpp"
#include "duckdb_python/pandas/pandas_analyzer.hpp"
#include "duckdb_python/pandas/column/pandas_numpy_column.hpp"
#include "duckdb_python/pandas/pandas_bind.hpp"
Expand Down Expand Up @@ -34,7 +35,7 @@ void NumpyBind::Bind(ClientContext &context, py::handle df, vector<PandasColumnB
auto column = get_fun(df_columns[col_idx]);

if (bind_data.numpy_type.type == NumpyNullableType::FLOAT_16) {
bind_data.pandas_col = std::make_unique<PandasNumpyColumn>(py::array(column.attr("astype")("float32")));
bind_data.pandas_col = std::make_unique<PandasNumpyColumn>(NumpyArray(column.attr("astype")("float32")));
bind_data.numpy_type.type = NumpyNullableType::FLOAT_32;
duckdb_col_type = NumpyToLogicalType(bind_data.numpy_type);
} else if (bind_data.numpy_type.type == NumpyNullableType::STRING) {
Expand All @@ -53,9 +54,9 @@ void NumpyBind::Bind(ClientContext &context, py::handle df, vector<PandasColumnB
duckdb_col_type = LogicalType::ENUM(enum_entries_vec, size);
auto pandas_col = uniq.attr("__getitem__")(1);
bind_data.internal_categorical_type = string(py::str(pandas_col.attr("dtype")));
bind_data.pandas_col = std::make_unique<PandasNumpyColumn>(pandas_col);
bind_data.pandas_col = std::make_unique<PandasNumpyColumn>(NumpyArray(pandas_col));
} else {
bind_data.pandas_col = std::make_unique<PandasNumpyColumn>(column);
bind_data.pandas_col = std::make_unique<PandasNumpyColumn>(NumpyArray(column));
duckdb_col_type = NumpyToLogicalType(bind_data.numpy_type);
}

Expand Down
23 changes: 12 additions & 11 deletions src/duckdb_py/numpy/numpy_scan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,14 @@
#include "duckdb_python/numpy/numpy_type.hpp"
#include "duckdb/function/scalar/nested_functions.hpp"
#include "duckdb_python/numpy/numpy_scan.hpp"
#include "duckdb_python/numpy/numpy_array.hpp"
#include "duckdb_python/pandas/column/pandas_numpy_column.hpp"

namespace duckdb {

template <class T>
void ScanNumpyColumn(py::array &numpy_col, idx_t stride, idx_t offset, Vector &out, idx_t count) {
auto src_ptr = (T *)numpy_col.data();
void ScanNumpyColumn(NumpyArray &numpy_col, idx_t stride, idx_t offset, Vector &out, idx_t count) {
auto src_ptr = (T *)numpy_col.Data();
if (stride == sizeof(T)) {
FlatVector::SetData(out, data_ptr_cast(src_ptr + offset), count_t(count));
} else {
Expand All @@ -32,8 +33,8 @@ void ScanNumpyColumn(py::array &numpy_col, idx_t stride, idx_t offset, Vector &o
}

template <class T, class V>
void ScanNumpyCategoryTemplated(py::array &column, idx_t offset, Vector &out, idx_t count) {
auto src_ptr = (T *)column.data();
void ScanNumpyCategoryTemplated(NumpyArray &column, idx_t offset, Vector &out, idx_t count) {
auto src_ptr = (T *)column.Data();
auto tgt_ptr = (V *)FlatVector::GetData(out);
auto &tgt_mask = FlatVector::ValidityMutable(out);
for (idx_t i = 0; i < count; i++) {
Expand All @@ -47,7 +48,7 @@ void ScanNumpyCategoryTemplated(py::array &column, idx_t offset, Vector &out, id
}

template <class T>
void ScanNumpyCategory(py::array &column, idx_t count, idx_t offset, Vector &out, string &src_type) {
void ScanNumpyCategory(NumpyArray &column, idx_t count, idx_t offset, Vector &out, string &src_type) {
if (src_type == "int8") {
ScanNumpyCategoryTemplated<int8_t, T>(column, offset, out, count);
} else if (src_type == "int16") {
Expand All @@ -63,7 +64,7 @@ void ScanNumpyCategory(py::array &column, idx_t count, idx_t offset, Vector &out

static void ApplyMask(PandasColumnBindData &bind_data, ValidityMask &validity, idx_t count, idx_t offset) {
D_ASSERT(bind_data.mask);
auto mask = reinterpret_cast<const bool *>(bind_data.mask->numpy_array.data());
auto mask = reinterpret_cast<const bool *>(bind_data.mask->numpy_array.Data());
for (idx_t i = 0; i < count; i++) {
auto is_null = mask[offset + i];
if (is_null) {
Expand Down Expand Up @@ -236,18 +237,18 @@ void NumpyScan::Scan(ClientContext &context, PandasColumnBindData &bind_data, id
ScanNumpyMasked<int64_t>(bind_data, count, offset, out);
break;
case NumpyNullableType::FLOAT_32:
ScanNumpyFpColumn<float>(bind_data, reinterpret_cast<const float *>(array.data()), numpy_col.stride, count,
ScanNumpyFpColumn<float>(bind_data, reinterpret_cast<const float *>(array.Data()), numpy_col.stride, count,
offset, out);
break;
case NumpyNullableType::FLOAT_64:
ScanNumpyFpColumn<double>(bind_data, reinterpret_cast<const double *>(array.data()), numpy_col.stride, count,
ScanNumpyFpColumn<double>(bind_data, reinterpret_cast<const double *>(array.Data()), numpy_col.stride, count,
offset, out);
break;
case NumpyNullableType::DATETIME_NS:
case NumpyNullableType::DATETIME_MS:
case NumpyNullableType::DATETIME_US:
case NumpyNullableType::DATETIME_S: {
auto src_ptr = reinterpret_cast<const int64_t *>(array.data());
auto src_ptr = reinterpret_cast<const int64_t *>(array.Data());
auto tgt_ptr = FlatVector::GetDataMutable<timestamp_t>(out);

using timestamp_convert_func = std::function<timestamp_t(int64_t)>;
Expand Down Expand Up @@ -307,7 +308,7 @@ void NumpyScan::Scan(ClientContext &context, PandasColumnBindData &bind_data, id
case NumpyNullableType::TIMEDELTA_US:
case NumpyNullableType::TIMEDELTA_MS:
case NumpyNullableType::TIMEDELTA_S: {
auto src_ptr = reinterpret_cast<const int64_t *>(array.data());
auto src_ptr = reinterpret_cast<const int64_t *>(array.Data());
auto tgt_ptr = FlatVector::GetDataMutable<interval_t>(out);
auto &mask = FlatVector::ValidityMutable(out);

Expand Down Expand Up @@ -352,7 +353,7 @@ void NumpyScan::Scan(ClientContext &context, PandasColumnBindData &bind_data, id
case NumpyNullableType::STRING:
case NumpyNullableType::OBJECT: {
// Get the source pointer of the numpy array
auto src_ptr = (PyObject **)array.data(); // NOLINT
auto src_ptr = (PyObject **)array.Data(); // NOLINT
const bool is_object_col = bind_data.numpy_type.type == NumpyNullableType::OBJECT;
if (is_object_col && out.GetType().id() != LogicalTypeId::VARCHAR) {
//! We have determined the underlying logical type of this object column
Expand Down
8 changes: 4 additions & 4 deletions src/duckdb_py/numpy/raw_array_wrapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -151,14 +151,14 @@ string RawArrayWrapper::DuckDBToNumpyDtype(const LogicalType &type) {
void RawArrayWrapper::Initialize(idx_t capacity) {
string dtype = DuckDBToNumpyDtype(type);

array = py::array(py::dtype(dtype), capacity);
data = data_ptr_cast(array.mutable_data());
array = NumpyArray::Allocate(py::dtype(dtype), capacity);
data = data_ptr_cast(array.MutableData());
}

void RawArrayWrapper::Resize(idx_t new_capacity) {
vector<py::ssize_t> new_shape {py::ssize_t(new_capacity)};
array.resize(new_shape, false);
data = data_ptr_cast(array.mutable_data());
array.GetArray().resize(new_shape, false);
data = data_ptr_cast(array.MutableData());
}

} // namespace duckdb
29 changes: 15 additions & 14 deletions src/duckdb_py/pandas/bind.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "duckdb_python/pandas/pandas_bind.hpp"
#include "duckdb_python/pandas/pandas_analyzer.hpp"
#include "duckdb_python/pandas/column/pandas_numpy_column.hpp"
#include "duckdb_python/numpy/numpy_array.hpp"
#include "duckdb_python/pyconnection/pyconnection.hpp"

namespace duckdb {
Expand Down Expand Up @@ -53,19 +54,19 @@ static LogicalType BindColumn(ClientContext &context, PandasBindColumn &column_p

if (column_has_mask) {
// masked object, fetch the internal data and mask array
bind_data.mask = std::make_unique<RegisteredArray>(column.attr("array").attr("_mask"));
bind_data.mask = std::make_unique<RegisteredArray>(NumpyArray(column.attr("array").attr("_mask")));
}

if (bind_data.numpy_type.type == NumpyNullableType::CATEGORY) {
// for category types, we create an ENUM type for string or use the converted numpy type for the rest
D_ASSERT(py::hasattr(column, "cat"));
D_ASSERT(py::hasattr(column.attr("cat"), "categories"));
auto categories = py::array(column.attr("cat").attr("categories"));
auto categories_pd_type = ConvertNumpyType(categories.attr("dtype"));
NumpyArray categories(column.attr("cat").attr("categories"));
auto categories_pd_type = ConvertNumpyType(categories.GetArray().attr("dtype"));
if (categories_pd_type.type == NumpyNullableType::OBJECT) {
// Let's hope the object type is a string.
bind_data.numpy_type.type = NumpyNullableType::CATEGORY;
vector<string> enum_entries = py::cast<vector<string>>(categories);
vector<string> enum_entries = py::cast<vector<string>>(categories.GetArray());
idx_t size = enum_entries.size();
Vector enum_entries_vec(LogicalType::VARCHAR, size);
auto enum_entries_ptr = FlatVector::GetDataMutable<string_t>(enum_entries_vec);
Expand All @@ -74,33 +75,33 @@ static LogicalType BindColumn(ClientContext &context, PandasBindColumn &column_p
}
D_ASSERT(py::hasattr(column.attr("cat"), "codes"));
column_type = LogicalType::ENUM(enum_entries_vec, size);
auto pandas_col = py::array(column.attr("cat").attr("codes"));
bind_data.internal_categorical_type = string(py::str(pandas_col.attr("dtype")));
bind_data.pandas_col = std::make_unique<PandasNumpyColumn>(pandas_col);
NumpyArray pandas_col(column.attr("cat").attr("codes"));
bind_data.internal_categorical_type = string(py::str(pandas_col.GetArray().attr("dtype")));
bind_data.pandas_col = std::make_unique<PandasNumpyColumn>(std::move(pandas_col));
} else {
auto pandas_col = py::array(column.attr("to_numpy")());
auto numpy_type = pandas_col.attr("dtype");
bind_data.pandas_col = std::make_unique<PandasNumpyColumn>(pandas_col);
NumpyArray pandas_col(column.attr("to_numpy")());
auto numpy_type = pandas_col.GetArray().attr("dtype");
bind_data.pandas_col = std::make_unique<PandasNumpyColumn>(std::move(pandas_col));
// for category types (non-strings), we use the converted numpy type
bind_data.numpy_type = ConvertNumpyType(numpy_type);
column_type = NumpyToLogicalType(bind_data.numpy_type);
}
} else if (bind_data.numpy_type.type == NumpyNullableType::FLOAT_16) {
auto pandas_array = column.attr("array");
bind_data.pandas_col = std::make_unique<PandasNumpyColumn>(py::array(column.attr("to_numpy")("float32")));
bind_data.pandas_col = std::make_unique<PandasNumpyColumn>(NumpyArray(column.attr("to_numpy")("float32")));
bind_data.numpy_type.type = NumpyNullableType::FLOAT_32;
column_type = NumpyToLogicalType(bind_data.numpy_type);
} else {
auto pandas_array = column.attr("array");
if (py::hasattr(pandas_array, "_data")) {
// This means we can access the numpy array directly
bind_data.pandas_col = std::make_unique<PandasNumpyColumn>(column.attr("array").attr("_data"));
bind_data.pandas_col = std::make_unique<PandasNumpyColumn>(NumpyArray(column.attr("array").attr("_data")));
} else if (py::hasattr(pandas_array, "asi8")) {
// This is a datetime object, has the option to get the array as int64_t's
bind_data.pandas_col = std::make_unique<PandasNumpyColumn>(py::array(pandas_array.attr("asi8")));
bind_data.pandas_col = std::make_unique<PandasNumpyColumn>(NumpyArray(pandas_array.attr("asi8")));
} else {
// Otherwise we have to get it through 'to_numpy()'
bind_data.pandas_col = std::make_unique<PandasNumpyColumn>(py::array(column.attr("to_numpy")()));
bind_data.pandas_col = std::make_unique<PandasNumpyColumn>(NumpyArray(column.attr("to_numpy")()));
}
column_type = NumpyToLogicalType(bind_data.numpy_type);
}
Expand Down
5 changes: 3 additions & 2 deletions src/duckdb_py/pyconnection.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include "duckdb_python/pyresult.hpp"
#include "duckdb_python/python_conversion.hpp"
#include "duckdb_python/numpy/numpy_type.hpp"
#include "duckdb_python/numpy/numpy_array.hpp"
#include "duckdb_python/jupyter_progress_bar_display.hpp"
#include "duckdb_python/pyfilesystem.hpp"
#include "duckdb/parser/parsed_data/create_scalar_function_info.hpp"
Expand Down Expand Up @@ -2352,7 +2353,7 @@ bool IsValidNumpyDimensions(const py::handle &object, int &dim) {
if (!py::isinstance(object, import_cache.numpy.ndarray())) {
return false;
}
auto shape = (py::cast<py::array>(object)).attr("shape");
auto shape = NumpyArray(py::reinterpret_borrow<py::object>(object)).GetArray().attr("shape");
if (py::len(shape) != 1) {
return false;
}
Expand All @@ -2366,7 +2367,7 @@ NumpyObjectType DuckDBPyConnection::IsAcceptedNumpyObject(const py::object &obje
}
auto import_cache_ = ImportCache();
if (py::isinstance(object, import_cache_->numpy.ndarray())) {
auto len = py::len((py::cast<py::array>(object)).attr("shape"));
auto len = py::len(NumpyArray(object).GetArray().attr("shape"));
switch (len) {
case 1:
return NumpyObjectType::NDARRAY1D;
Expand Down
Loading
Loading