Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ci/docker/python-wheel-windows-vs2022-base.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ ENV CMAKE_BUILD_TYPE=${build_type} `
VCPKG_DEFAULT_TRIPLET=amd64-windows-static-md-${build_type} `
VCPKG_FEATURE_FLAGS="manifests"
COPY ci/vcpkg/vcpkg.json arrow/ci/vcpkg/

# cannot use the S3 feature here because while aws-sdk-cpp=1.9.160 contains
# ssl related fixes as well as we can patch the vcpkg portfile to support
# arm machines it hits ARROW-15141 where we would need to fall back to 1.8.186
Expand Down
11 changes: 11 additions & 0 deletions ci/vcpkg/vcpkg.patch
Original file line number Diff line number Diff line change
@@ -1,3 +1,14 @@
diff --git a/scripts/toolchains/windows.cmake b/scripts/toolchains/windows.cmake
index 3cc90cc..36af495 100644
--- a/scripts/toolchains/windows.cmake
+++ b/scripts/toolchains/windows.cmake
@@ -88,4 +88,4 @@ if(NOT _VCPKG_WINDOWS_TOOLCHAIN)
set(CMAKE_C_FLAGS_DEBUG "${VCPKG_CRT_LINK_FLAG_PREFIX}d /Z7 /Ob0 /Od /RTC1 ${VCPKG_C_FLAGS_DEBUG}" CACHE STRING "")
- set(CMAKE_CXX_FLAGS_RELEASE "${VCPKG_CRT_LINK_FLAG_PREFIX} /O2 /Oi /Gy /DNDEBUG /Z7 ${VCPKG_CXX_FLAGS_RELEASE}" CACHE STRING "")
- set(CMAKE_C_FLAGS_RELEASE "${VCPKG_CRT_LINK_FLAG_PREFIX} /O2 /Oi /Gy /DNDEBUG /Z7 ${VCPKG_C_FLAGS_RELEASE}" CACHE STRING "")
+ set(CMAKE_CXX_FLAGS_RELEASE "${VCPKG_CRT_LINK_FLAG_PREFIX} /O2 /Oi /Gy /DNDEBUG ${VCPKG_CXX_FLAGS_RELEASE}" CACHE STRING "")
+ set(CMAKE_C_FLAGS_RELEASE "${VCPKG_CRT_LINK_FLAG_PREFIX} /O2 /Oi /Gy /DNDEBUG ${VCPKG_C_FLAGS_RELEASE}" CACHE STRING "")

diff --git a/scripts/cmake/vcpkg_execute_build_process.cmake b/scripts/cmake/vcpkg_execute_build_process.cmake
index 60fd5b587a..c8dc021af8 100644
--- a/scripts/cmake/vcpkg_execute_build_process.cmake
Expand Down
12 changes: 7 additions & 5 deletions cpp/src/arrow/compute/kernels/scalar_cast_nested.cc
Original file line number Diff line number Diff line change
Expand Up @@ -346,11 +346,13 @@ struct CastStruct {
for (int out_field_index = 0; out_field_index < out_field_count; ++out_field_index) {
const auto& out_field = out_type.field(out_field_index);

// Take the first field with matching name, if any. Extract it from the map so it
// can't be reused.
auto maybe_in_field_index = in_fields.extract(out_field->name());
if (!maybe_in_field_index.empty()) {
fields_to_select[out_field_index] = maybe_in_field_index.mapped();
// Take the first field with matching name, if any. Erase it from the map so it
// can't be reused. Use lower_bound (which guarantees first-match) instead of
// find/extract (which do not guarantee first for duplicate keys).
auto it = in_fields.lower_bound(out_field->name());
if (it != in_fields.end() && it->first == out_field->name()) {
fields_to_select[out_field_index] = it->second;
in_fields.erase(it);
} else if (out_field->nullable()) {
fields_to_select[out_field_index] = kFillNullSentinel;
} else {
Expand Down
66 changes: 56 additions & 10 deletions cpp/src/arrow/extension/fixed_shape_tensor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
// specific language governing permissions and limitations
// under the License.

#include <limits>
#include <numeric>
#include <sstream>

Expand Down Expand Up @@ -109,8 +110,8 @@ Result<std::shared_ptr<DataType>> FixedShapeTensorType::Deserialize(
return Status::Invalid("Expected FixedSizeList storage type, got ",
storage_type->ToString());
}
auto value_type =
internal::checked_pointer_cast<FixedSizeListType>(storage_type)->value_type();
auto fsl_type = internal::checked_pointer_cast<FixedSizeListType>(storage_type);
auto value_type = fsl_type->value_type();
rj::Document document;
if (document.Parse(serialized_data.data(), serialized_data.length()).HasParseError() ||
!document.IsObject() || !document.HasMember("shape") ||
Expand All @@ -119,29 +120,66 @@ Result<std::shared_ptr<DataType>> FixedShapeTensorType::Deserialize(
}

std::vector<int64_t> shape;
for (auto& x : document["shape"].GetArray()) {
for (const auto& x : document["shape"].GetArray()) {
if (!x.IsInt64()) {
return Status::Invalid("shape must contain integers, got ",
internal::JsonTypeName(x));
}
shape.emplace_back(x.GetInt64());
}

std::vector<int64_t> permutation;
if (document.HasMember("permutation")) {
for (auto& x : document["permutation"].GetArray()) {
const auto& json_permutation = document["permutation"];
if (!json_permutation.IsArray()) {
return Status::Invalid("permutation must be an array, got ",
internal::JsonTypeName(json_permutation));
}
for (const auto& x : json_permutation.GetArray()) {
if (!x.IsInt64()) {
return Status::Invalid("permutation must contain integers, got ",
internal::JsonTypeName(x));
}
permutation.emplace_back(x.GetInt64());
}
if (shape.size() != permutation.size()) {
return Status::Invalid("Invalid permutation");
}
RETURN_NOT_OK(internal::IsPermutationValid(permutation));
}
std::vector<std::string> dim_names;
if (document.HasMember("dim_names")) {
for (auto& x : document["dim_names"].GetArray()) {
const auto& json_dim_names = document["dim_names"];
if (!json_dim_names.IsArray()) {
return Status::Invalid("dim_names must be an array, got ",
internal::JsonTypeName(json_dim_names));
}
for (const auto& x : json_dim_names.GetArray()) {
if (!x.IsString()) {
return Status::Invalid("dim_names must contain strings, got ",
internal::JsonTypeName(x));
}
dim_names.emplace_back(x.GetString());
}
if (shape.size() != dim_names.size()) {
return Status::Invalid("Invalid dim_names");
}
}

return fixed_shape_tensor(value_type, shape, permutation, dim_names);
// Validate product of shape dimensions matches storage type list_size.
// This check is intentionally after field parsing so that metadata-level errors
// (type mismatches, size mismatches) are reported first.
ARROW_ASSIGN_OR_RAISE(auto ext_type, FixedShapeTensorType::Make(
value_type, shape, permutation, dim_names));
const auto& fst_type = internal::checked_cast<const FixedShapeTensorType&>(*ext_type);
ARROW_ASSIGN_OR_RAISE(const int64_t expected_size,
internal::ComputeShapeProduct(fst_type.shape()));
if (expected_size != fsl_type->list_size()) {
return Status::Invalid("Product of shape dimensions (", expected_size,
") does not match FixedSizeList size (", fsl_type->list_size(),
")");
}
return ext_type;
}

std::shared_ptr<Array> FixedShapeTensorType::MakeArray(
Expand Down Expand Up @@ -310,8 +348,7 @@ const Result<std::shared_ptr<Tensor>> FixedShapeTensorArray::ToTensor() const {
}

std::vector<int64_t> shape = ext_type.shape();
auto cell_size = std::accumulate(shape.begin(), shape.end(), static_cast<int64_t>(1),
std::multiplies<>());
ARROW_ASSIGN_OR_RAISE(const int64_t cell_size, internal::ComputeShapeProduct(shape));
shape.insert(shape.begin(), 1, this->length());
internal::Permute<int64_t>(permutation, &shape);

Expand All @@ -330,6 +367,11 @@ Result<std::shared_ptr<DataType>> FixedShapeTensorType::Make(
const std::shared_ptr<DataType>& value_type, const std::vector<int64_t>& shape,
const std::vector<int64_t>& permutation, const std::vector<std::string>& dim_names) {
const size_t ndim = shape.size();
for (auto dim : shape) {
if (dim < 0) {
return Status::Invalid("shape must have non-negative values, got ", dim);
}
}
if (!permutation.empty() && ndim != permutation.size()) {
return Status::Invalid("permutation size must match shape size. Expected: ", ndim,
" Got: ", permutation.size());
Expand All @@ -342,8 +384,12 @@ Result<std::shared_ptr<DataType>> FixedShapeTensorType::Make(
RETURN_NOT_OK(internal::IsPermutationValid(permutation));
}

const int64_t size = std::accumulate(shape.begin(), shape.end(),
static_cast<int64_t>(1), std::multiplies<>());
ARROW_ASSIGN_OR_RAISE(const int64_t size, internal::ComputeShapeProduct(shape));
if (size > std::numeric_limits<int32_t>::max()) {
return Status::Invalid("Product of shape dimensions (", size,
") exceeds maximum FixedSizeList size (",
std::numeric_limits<int32_t>::max(), ")");
}
return std::make_shared<FixedShapeTensorType>(value_type, static_cast<int32_t>(size),
shape, permutation, dim_names);
}
Expand Down
93 changes: 93 additions & 0 deletions cpp/src/arrow/extension/tensor_extension_array_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,73 @@ TEST_F(TestFixedShapeTensorType, MetadataSerializationRoundtrip) {
CheckDeserializationRaises(ext_type_, storage_type,
R"({"shape":[3],"dim_names":["x","y"]})",
"Invalid dim_names");

// Validate shape values must be integers. Error message should include the
// JSON type name of the offending value.
CheckDeserializationRaises(ext_type_, storage_type, R"({"shape":[3.5,4]})",
"shape must contain integers, got Number");
CheckDeserializationRaises(ext_type_, storage_type, R"({"shape":["3","4"]})",
"shape must contain integers, got String");
CheckDeserializationRaises(ext_type_, storage_type, R"({"shape":[null]})",
"shape must contain integers, got Null");
CheckDeserializationRaises(ext_type_, storage_type, R"({"shape":[true]})",
"shape must contain integers, got True");
CheckDeserializationRaises(ext_type_, storage_type, R"({"shape":[false]})",
"shape must contain integers, got False");

// Validate shape values must be non-negative
CheckDeserializationRaises(ext_type_, fixed_size_list(int64(), 1), R"({"shape":[-1]})",
"shape must have non-negative values");

// Validate product of shape matches storage list_size
CheckDeserializationRaises(ext_type_, storage_type, R"({"shape":[3,3]})",
"Product of shape dimensions");

// Validate permutation member must be an array with integer values
CheckDeserializationRaises(ext_type_, storage_type,
R"({"shape":[3,4],"permutation":"invalid"})",
"permutation must be an array, got String");
CheckDeserializationRaises(ext_type_, storage_type,
R"({"shape":[3,4],"permutation":{"a":1}})",
"permutation must be an array, got Object");
CheckDeserializationRaises(ext_type_, storage_type,
R"({"shape":[3,4],"permutation":[1.5,0.5]})",
"permutation must contain integers, got Number");
CheckDeserializationRaises(ext_type_, storage_type,
R"({"shape":[3,4],"permutation":["a","b"]})",
"permutation must contain integers, got String");

// Validate permutation values must be unique integers in [0, N-1]
CheckDeserializationRaises(ext_type_, storage_type,
R"({"shape":[3,4],"permutation":[0,0]})",
"Permutation indices");
CheckDeserializationRaises(ext_type_, storage_type,
R"({"shape":[3,4],"permutation":[0,5]})",
"Permutation indices");
CheckDeserializationRaises(ext_type_, storage_type,
R"({"shape":[3,4],"permutation":[-1,0]})",
"Permutation indices");

// Validate dim_names member must be an array with string values
CheckDeserializationRaises(ext_type_, storage_type,
R"({"shape":[3,4],"dim_names":"invalid"})",
"dim_names must be an array, got String");
CheckDeserializationRaises(ext_type_, storage_type,
R"({"shape":[3,4],"dim_names":[1,2]})",
"dim_names must contain strings, got Number");
CheckDeserializationRaises(ext_type_, storage_type,
R"({"shape":[3,4],"dim_names":[null,null]})",
"dim_names must contain strings, got Null");
}

// Unit test for FixedShapeTensorType::Make's shape validation: any negative
// dimension must be rejected with Status::Invalid, both when it is the sole
// dimension and when it is embedded among valid ones. The Deserialize path
// is covered separately in MetadataSerializationRoundtrip.
TEST_F(TestFixedShapeTensorType, MakeValidatesShape) {
// Negative shape values should be rejected
EXPECT_RAISES_WITH_MESSAGE_THAT(
Invalid, testing::HasSubstr("shape must have non-negative values"),
FixedShapeTensorType::Make(value_type_, {-1}));
EXPECT_RAISES_WITH_MESSAGE_THAT(
Invalid, testing::HasSubstr("shape must have non-negative values"),
FixedShapeTensorType::Make(value_type_, {3, -1, 4}));
}

TEST_F(TestFixedShapeTensorType, RoundtripBatch) {
Expand Down Expand Up @@ -794,6 +861,32 @@ TEST_F(TestVariableShapeTensorType, MetadataSerializationRoundtrip) {
"Invalid: permutation");
CheckDeserializationRaises(ext_type_, storage_type, R"({"dim_names":["x","y"]})",
"Invalid: dim_names");

// Validate permutation member must be an array with integer values. Error
// message should include the JSON type name of the offending value.
CheckDeserializationRaises(ext_type_, storage_type, R"({"permutation":"invalid"})",
"permutation must be an array, got String");
CheckDeserializationRaises(ext_type_, storage_type, R"({"permutation":[1.5,0.5,2.5]})",
"permutation must contain integers, got Number");
CheckDeserializationRaises(ext_type_, storage_type,
R"({"permutation":[null,null,null]})",
"permutation must contain integers, got Null");

// Validate dim_names member must be an array with string values
CheckDeserializationRaises(ext_type_, storage_type, R"({"dim_names":"invalid"})",
"dim_names must be an array, got String");
CheckDeserializationRaises(ext_type_, storage_type, R"({"dim_names":[1,2,3]})",
"dim_names must contain strings, got Number");

// Validate uniform_shape member must be an array with integer-or-null values
CheckDeserializationRaises(ext_type_, storage_type, R"({"uniform_shape":"invalid"})",
"uniform_shape must be an array, got String");
CheckDeserializationRaises(ext_type_, storage_type,
R"({"uniform_shape":[1.5,null,null]})",
"uniform_shape must contain integers or nulls, got Number");
CheckDeserializationRaises(ext_type_, storage_type,
R"({"uniform_shape":["x",null,null]})",
"uniform_shape must contain integers or nulls, got String");
}

TEST_F(TestVariableShapeTensorType, RoundtripBatch) {
Expand Down
32 changes: 26 additions & 6 deletions cpp/src/arrow/extension/tensor_internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,31 @@

namespace arrow::internal {

namespace {

// Human-readable names for rapidjson::Type values, ordered to match the enum:
// kNullType=0, kFalseType=1, kTrueType=2, kObjectType=3,
// kArrayType=4, kStringType=5, kNumberType=6.
constexpr const char* kRapidJsonTypeNames[] = {"Null",  "False",  "True", "Object",
                                               "Array", "String", "Number"};

}  // namespace

const char* JsonTypeName(const ::arrow::rapidjson::Value& v) {
  // GetType() is guaranteed to be one of the seven enumerators above, so a
  // direct table lookup is safe.
  const auto type_index = static_cast<std::size_t>(v.GetType());
  return kRapidJsonTypeNames[type_index];
}

// Multiply all dimensions of `shape` together, failing with Status::Invalid
// if the running product overflows int64_t. An empty shape yields 1 (the
// multiplicative identity).
Result<int64_t> ComputeShapeProduct(std::span<const int64_t> shape) {
  int64_t total = 1;
  for (std::size_t i = 0; i < shape.size(); ++i) {
    // MultiplyWithOverflow returns true on overflow and otherwise stores the
    // product back into `total`.
    const bool overflowed = MultiplyWithOverflow(total, shape[i], &total);
    if (overflowed) {
      return Status::Invalid(
          "Product of tensor shape dimensions would not fit in 64-bit integer");
    }
  }
  return total;
}

bool IsPermutationTrivial(std::span<const int64_t> permutation) {
for (size_t i = 1; i < permutation.size(); ++i) {
if (permutation[i - 1] + 1 != permutation[i]) {
Expand Down Expand Up @@ -105,12 +130,7 @@ Result<std::shared_ptr<Buffer>> SliceTensorBuffer(const Array& data_array,
const DataType& value_type,
std::span<const int64_t> shape) {
const int64_t byte_width = value_type.byte_width();
int64_t size = 1;
for (const auto dim : shape) {
if (MultiplyWithOverflow(size, dim, &size)) {
return Status::Invalid("Tensor size would not fit in 64-bit integer");
}
}
ARROW_ASSIGN_OR_RAISE(const int64_t size, ComputeShapeProduct(shape));
if (size != data_array.length()) {
return Status::Invalid("Expected data array of length ", size, ", got ",
data_array.length());
Expand Down
14 changes: 14 additions & 0 deletions cpp/src/arrow/extension/tensor_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,25 @@
#include <span>
#include <vector>

#include "arrow/json/rapidjson_defs.h" // IWYU pragma: keep
#include "arrow/result.h"
#include "arrow/type_fwd.h"

#include <rapidjson/document.h>

namespace arrow::internal {

/// \brief Return the name of a RapidJSON value's type (e.g., "Null", "Array", "Number").
ARROW_EXPORT
const char* JsonTypeName(const ::arrow::rapidjson::Value& v);

/// \brief Compute the product of the given shape dimensions.
///
/// Returns Status::Invalid if the product would overflow int64_t.
/// An empty shape returns 1 (the multiplicative identity).
ARROW_EXPORT
Result<int64_t> ComputeShapeProduct(std::span<const int64_t> shape);

ARROW_EXPORT
bool IsPermutationTrivial(std::span<const int64_t> permutation);

Expand Down
Loading
Loading