diff --git a/packages/react-native-executorch/common/rnexecutorch/models/BaseModel.cpp b/packages/react-native-executorch/common/rnexecutorch/models/BaseModel.cpp index 2ecc3d84c9..12707ecad0 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/BaseModel.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/BaseModel.cpp @@ -181,6 +181,87 @@ std::size_t BaseModel::getMemoryLowerBound() const noexcept { void BaseModel::unload() noexcept { module_.reset(nullptr); } +void BaseModel::ensureMethodLoaded(const std::string &methodName) { + if (methodName.empty()) { + throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput, + "methodName cannot be empty"); + } + if (currentlyLoadedMethod_ == methodName) { + return; + } + if (!module_) { + throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded, + "Model module is not loaded"); + } + if (!currentlyLoadedMethod_.empty()) { + module_->unload_method(currentlyLoadedMethod_); + } + auto loadResult = module_->load_method(methodName); + if (loadResult != executorch::runtime::Error::Ok) { + throw RnExecutorchError( + loadResult, "Failed to load method '" + methodName + + "'. 
Ensure the method exists in the exported model."); + } + currentlyLoadedMethod_ = methodName; +} + +std::vector +BaseModel::validateAndGetInputShape(const std::string &methodName, + size_t minDimensions) const { + auto inputShapes = getAllInputShapes(methodName); + + if (inputShapes.empty()) { + throw RnExecutorchError(RnExecutorchErrorCode::UnexpectedNumInputs, + "Model seems to not take any input tensors."); + } + + const auto &shape = inputShapes[0]; + if (shape.size() < minDimensions) { + throw RnExecutorchError( + RnExecutorchErrorCode::WrongDimensions, + "Unexpected model input size, expected at least " + + std::to_string(minDimensions) + + " dimensions but got: " + std::to_string(shape.size()) + "."); + } + + return shape; +} + +std::vector +BaseModel::forwardOrThrow(const EValue &input, + const std::string &contextMessage) const { + auto result = forward(input); + if (!result.ok()) { + throw RnExecutorchError(result.error(), contextMessage); + } + return std::move(result.get()); +} + +std::vector +BaseModel::forwardOrThrow(const std::vector &inputs, + const std::string &contextMessage) const { + auto result = forward(inputs); + if (!result.ok()) { + throw RnExecutorchError(result.error(), contextMessage); + } + return std::move(result.get()); +} + +std::vector +BaseModel::executeOrThrow(const std::string &methodName, + const std::vector &inputs, + const std::string &contextMessage) const { + auto result = execute(methodName, inputs); + if (!result.ok()) { + std::string message = + contextMessage.empty() + ? "Model " + methodName + " method failed. Ensure input is correct." 
+ : contextMessage; + throw RnExecutorchError(result.error(), message); + } + return std::move(result.get()); +} + std::vector BaseModel::getTensorShape(const executorch::aten::Tensor &tensor) const { auto sizes = tensor.sizes(); diff --git a/packages/react-native-executorch/common/rnexecutorch/models/BaseModel.h b/packages/react-native-executorch/common/rnexecutorch/models/BaseModel.h index 6d44976b90..5a0a7eec00 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/BaseModel.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/BaseModel.h @@ -53,6 +53,32 @@ class BaseModel { std::size_t memorySizeLowerBound{0}; + /// Loads methodName, unloading any previously loaded method first. + /// Useful for multi-method models (e.g., "forward_384", "forward_640"). + void ensureMethodLoaded(const std::string &methodName); + + std::vector + validateAndGetInputShape(const std::string &methodName = "forward", + size_t minDimensions = 2) const; + + std::vector + forwardOrThrow(const EValue &input, + const std::string &contextMessage = + "Model forward failed. Ensure input is correct.") const; + + std::vector + forwardOrThrow(const std::vector &inputs, + const std::string &contextMessage = + "Model forward failed. Ensure input is correct.") const; + + std::vector + executeOrThrow(const std::string &methodName, + const std::vector &inputs, + const std::string &contextMessage = "") const; + + /// Name of the currently loaded method (for multi-method models). 
+ std::string currentlyLoadedMethod_; + private: std::vector getTensorShape(const executorch::aten::Tensor &tensor) const; diff --git a/packages/react-native-executorch/common/rnexecutorch/models/VisionModel.cpp b/packages/react-native-executorch/common/rnexecutorch/models/VisionModel.cpp index cc9c862b32..6078621761 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/VisionModel.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/VisionModel.cpp @@ -1,6 +1,8 @@ #include "VisionModel.h" #include #include +#include +#include #include #include @@ -25,6 +27,25 @@ cv::Size VisionModel::modelInputSize() const { modelInputShape_[modelInputShape_.size() - 2]); } +cv::Size VisionModel::getModelInputSize(const std::string &methodName) const { + std::string method = methodName.empty() ? currentlyLoadedMethod_ : methodName; + if (method.empty()) { + throw RnExecutorchError( + RnExecutorchErrorCode::InvalidUserInput, + "No method specified and no method currently loaded"); + } + + auto inputShapes = getAllInputShapes(method); + if (inputShapes.empty() || inputShapes[0].size() < 2) { + throw RnExecutorchError(RnExecutorchErrorCode::UnexpectedNumInputs, + "Could not determine input shape for method: " + + method); + } + + const auto &shape = inputShapes[0]; + return cv::Size(shape[shape.size() - 1], shape[shape.size() - 2]); +} + cv::Mat VisionModel::extractFromFrame(jsi::Runtime &runtime, const jsi::Value &frameData) const { cv::Mat frame = ::rnexecutorch::utils::frameToMat(runtime, frameData); @@ -51,4 +72,46 @@ cv::Mat VisionModel::extractFromPixels(const JSTensorViewIn &tensorView) const { return ::rnexecutorch::utils::pixelsToMat(tensorView); } +void VisionModel::initNormalization(const std::vector &normMean, + const std::vector &normStd) { + if (normMean.size() == 3) { + normMean_ = cv::Scalar(normMean[0], normMean[1], normMean[2]); + } else if (!normMean.empty()) { + log(LOG_LEVEL::Warn, + "normMean must have 3 elements — ignoring 
provided value."); + } + + if (normStd.size() == 3) { + normStd_ = cv::Scalar(normStd[0], normStd[1], normStd[2]); + } else if (!normStd.empty()) { + log(LOG_LEVEL::Warn, + "normStd must have 3 elements — ignoring provided value."); + } +} + +TensorPtr VisionModel::createInputTensor(const cv::Mat &preprocessed) const { + return (normMean_ && normStd_) + ? image_processing::getTensorFromMatrix( + modelInputShape_, preprocessed, *normMean_, *normStd_) + : image_processing::getTensorFromMatrix(modelInputShape_, + preprocessed); +} + +cv::Mat VisionModel::loadImageToRGB(const std::string &imageSource) const { + cv::Mat imageBGR = image_processing::readImage(imageSource); + cv::Mat imageRGB; + cv::cvtColor(imageBGR, imageRGB, cv::COLOR_BGR2RGB); + return imageRGB; +} + +std::tuple +VisionModel::loadFrameRotated(jsi::Runtime &runtime, + const jsi::Value &frameData) const { + auto orient = utils::readFrameOrientation(runtime, frameData); + cv::Mat frame = extractFromFrame(runtime, frameData); + cv::Size originalSize = frame.size(); + cv::Mat rotated = utils::rotateFrameForModel(frame, orient); + return {rotated, orient, originalSize}; +} + } // namespace rnexecutorch::models diff --git a/packages/react-native-executorch/common/rnexecutorch/models/VisionModel.h b/packages/react-native-executorch/common/rnexecutorch/models/VisionModel.h index cf003948af..60ba94e640 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/VisionModel.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/VisionModel.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include @@ -10,6 +11,8 @@ namespace rnexecutorch { namespace models { +using executorch::extension::TensorPtr; + /** * @brief Base class for computer vision models that support real-time camera * input @@ -21,16 +24,35 @@ namespace models { * Thread Safety: * - All inference operations are protected by a mutex via scoped_lock * + * Normalization: + * Subclasses should call initNormalization() 
with ImageNet mean/std when the + * model expects ImageNet-normalized inputs (e.g., Classification, Detection, + * Segmentation). Skip initNormalization() when the model: + * - Has built-in normalization layers (e.g., some embeddings models) + * - Expects raw pixel values [0, 255] (e.g., StyleTransfer) + * - Uses non-ImageNet normalization (handle in custom preprocess()) + * + * The createInputTensor() method safely handles both cases via std::optional. + * * Usage: * Subclasses should: * 1. Inherit from VisionModel instead of BaseModel - * 2. Optionally override preprocess() for model-specific preprocessing - * 3. Implement runInference() which acquires the lock internally + * 2. Call initNormalization() if model expects normalized inputs + * 3. Optionally override preprocess() for model-specific preprocessing + * 4. Implement runInference() which acquires the lock internally * * Example: * @code * class Classification : public VisionModel { * public: + * Classification(const std::string& modelSource, + * std::shared_ptr callInvoker, + * const std::vector& normMean, + * const std::vector& normStd) + * : VisionModel(modelSource, callInvoker) { + * initNormalization(normMean, normStd); // ImageNet normalization + * } + * * std::unordered_map * generateFromFrame(jsi::Runtime& runtime, const jsi::Value& frameValue) { * auto frameObject = frameValue.asObject(runtime); @@ -63,6 +85,13 @@ class VisionModel : public BaseModel { /// Set once by each subclass constructor to avoid per-frame metadata lookups. std::vector modelInputShape_; + /// Per-channel normalization mean (RGB). nullopt = no normalization applied. + std::optional normMean_; + + /// Per-channel normalization std-dev (RGB). nullopt = no normalization + /// applied. + std::optional normStd_; + /** * @brief Mutex to ensure thread-safe inference * @@ -99,6 +128,35 @@ class VisionModel : public BaseModel { * sizes. 
*/ virtual cv::Size modelInputSize() const; + + /** + * @brief Get input size for a specific method (last two shape dims). + * + * Useful for multi-method models with different input sizes per method. + * Falls back to currentlyLoadedMethod_ when methodName is empty. + */ + cv::Size getModelInputSize(const std::string &methodName = "") const; + + /** + * @brief Set normMean_/normStd_ from float vectors. + * + * Expects size == 3. Logs a warning and ignores if non-empty but wrong size. + */ + void initNormalization(const std::vector &normMean, + const std::vector &normStd); + + /// Builds input tensor from a preprocessed image. + /// Applies normalization if normMean_/normStd_ are set, skips it otherwise. + TensorPtr createInputTensor(const cv::Mat &preprocessed) const; + + /// Reads image from path and converts BGR → RGB. + cv::Mat loadImageToRGB(const std::string &imageSource) const; + + /// Extracts a camera frame, applies rotation, and returns + /// {rotated frame, orientation, original size}. 
+ std::tuple + loadFrameRotated(jsi::Runtime &runtime, const jsi::Value &frameData) const; + /** * @brief Extract an RGB cv::Mat from a VisionCamera frame * diff --git a/packages/react-native-executorch/common/rnexecutorch/models/classification/Classification.cpp b/packages/react-native-executorch/common/rnexecutorch/models/classification/Classification.cpp index e34f68fe64..4589d58c61 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/classification/Classification.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/classification/Classification.cpp @@ -6,6 +6,7 @@ #include #include +#include namespace rnexecutorch::models::classification { @@ -16,34 +17,8 @@ Classification::Classification(const std::string &modelSource, std::shared_ptr callInvoker) : VisionModel(modelSource, callInvoker), labelNames_(std::move(labelNames)) { - if (normMean.size() == 3) { - normMean_ = cv::Scalar(normMean[0], normMean[1], normMean[2]); - } else if (!normMean.empty()) { - log(LOG_LEVEL::Warn, - "normMean must have 3 elements — ignoring provided value."); - } - if (normStd.size() == 3) { - normStd_ = cv::Scalar(normStd[0], normStd[1], normStd[2]); - } else if (!normStd.empty()) { - log(LOG_LEVEL::Warn, - "normStd must have 3 elements — ignoring provided value."); - } - - auto inputShapes = getAllInputShapes(); - if (inputShapes.size() == 0) { - throw RnExecutorchError(RnExecutorchErrorCode::UnexpectedNumInputs, - "Model seems to not take any input tensors."); - } - modelInputShape_ = inputShapes[0]; - if (modelInputShape_.size() < 2) { - char errorMessage[100]; - std::snprintf(errorMessage, sizeof(errorMessage), - "Unexpected model input size, expected at least 2 dimensions " - "but got: %zu.", - modelInputShape_.size()); - throw RnExecutorchError(RnExecutorchErrorCode::WrongDimensions, - errorMessage); - } + initNormalization(normMean, normStd); + modelInputShape_ = validateAndGetInputShape(); } std::unordered_map @@ -51,30 +26,17 @@ 
Classification::runInference(cv::Mat image) { std::scoped_lock lock(inference_mutex_); cv::Mat preprocessed = preprocess(image); + auto inputTensor = createInputTensor(preprocessed); - auto inputTensor = - (normMean_ && normStd_) - ? image_processing::getTensorFromMatrix( - modelInputShape_, preprocessed, *normMean_, *normStd_) - : image_processing::getTensorFromMatrix(modelInputShape_, - preprocessed); - - auto forwardResult = BaseModel::forward(inputTensor); - if (!forwardResult.ok()) { - throw RnExecutorchError(forwardResult.error(), - "The model's forward function did not succeed. " - "Ensure the model input is correct."); - } - return postprocess(forwardResult->at(0).toTensor()); + auto outputs = forwardOrThrow(inputTensor, + "The model's forward function did not succeed. " + "Ensure the model input is correct."); + return postprocess(outputs.at(0).toTensor()); } std::unordered_map Classification::generateFromString(std::string imageSource) { - cv::Mat imageBGR = image_processing::readImage(imageSource); - - cv::Mat imageRGB; - cv::cvtColor(imageBGR, imageRGB, cv::COLOR_BGR2RGB); - + cv::Mat imageRGB = loadImageToRGB(imageSource); return runInference(imageRGB); } @@ -94,8 +56,7 @@ Classification::generateFromPixels(JSTensorViewIn pixelData) { std::unordered_map Classification::postprocess(const Tensor &tensor) { - std::span resultData( - static_cast(tensor.const_data_ptr()), tensor.numel()); + auto resultData = utils::tensor::toSpan(tensor); std::vector resultVec(resultData.begin(), resultData.end()); if (resultVec.size() != labelNames_.size()) { diff --git a/packages/react-native-executorch/common/rnexecutorch/models/classification/Classification.h b/packages/react-native-executorch/common/rnexecutorch/models/classification/Classification.h index 2ea0e17bbb..784c2bb60a 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/classification/Classification.h +++ 
b/packages/react-native-executorch/common/rnexecutorch/models/classification/Classification.h @@ -40,8 +40,6 @@ class Classification : public VisionModel { std::unordered_map postprocess(const Tensor &tensor); std::vector labelNames_; - std::optional normMean_; - std::optional normStd_; }; } // namespace models::classification diff --git a/packages/react-native-executorch/common/rnexecutorch/models/embeddings/image/ImageEmbeddings.cpp b/packages/react-native-executorch/common/rnexecutorch/models/embeddings/image/ImageEmbeddings.cpp index d2914469af..0385fa320f 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/embeddings/image/ImageEmbeddings.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/embeddings/image/ImageEmbeddings.cpp @@ -10,21 +10,7 @@ ImageEmbeddings::ImageEmbeddings( const std::string &modelSource, std::shared_ptr callInvoker) : VisionModel(modelSource, callInvoker) { - auto inputTensors = getAllInputShapes(); - if (inputTensors.size() == 0) { - throw RnExecutorchError(RnExecutorchErrorCode::UnexpectedNumInputs, - "Model seems to not take any input tensors."); - } - modelInputShape_ = inputTensors[0]; - if (modelInputShape_.size() < 2) { - char errorMessage[100]; - std::snprintf(errorMessage, sizeof(errorMessage), - "Unexpected model input size, expected at least 2 dimensions " - "but got: %zu.", - modelInputShape_.size()); - throw RnExecutorchError(RnExecutorchErrorCode::WrongDimensions, - errorMessage); - } + modelInputShape_ = validateAndGetInputShape(); } std::shared_ptr @@ -32,31 +18,21 @@ ImageEmbeddings::runInference(cv::Mat image) { std::scoped_lock lock(inference_mutex_); cv::Mat preprocessed = preprocess(image); + auto inputTensor = createInputTensor(preprocessed); - auto inputTensor = - image_processing::getTensorFromMatrix(modelInputShape_, preprocessed); + auto outputs = forwardOrThrow( + inputTensor, + "The model's forward function did not succeed. 
Ensure the model input " + "is correct."); - auto forwardResult = BaseModel::forward(inputTensor); - - if (!forwardResult.ok()) { - throw RnExecutorchError( - forwardResult.error(), - "The model's forward function did not succeed. Ensure the model input " - "is correct."); - } - - auto forwardResultTensor = forwardResult->at(0).toTensor(); + auto forwardResultTensor = outputs.at(0).toTensor(); return std::make_shared( forwardResultTensor.const_data_ptr(), forwardResultTensor.nbytes()); } std::shared_ptr ImageEmbeddings::generateFromString(std::string imageSource) { - cv::Mat imageBGR = image_processing::readImage(imageSource); - - cv::Mat imageRGB; - cv::cvtColor(imageBGR, imageRGB, cv::COLOR_BGR2RGB); - + cv::Mat imageRGB = loadImageToRGB(imageSource); return runInference(imageRGB); } diff --git a/packages/react-native-executorch/common/rnexecutorch/models/instance_segmentation/BaseInstanceSegmentation.cpp b/packages/react-native-executorch/common/rnexecutorch/models/instance_segmentation/BaseInstanceSegmentation.cpp index 3d2f9d1715..9b4e554cdd 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/instance_segmentation/BaseInstanceSegmentation.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/instance_segmentation/BaseInstanceSegmentation.cpp @@ -9,6 +9,7 @@ #include #include #include +#include namespace rnexecutorch::models::instance_segmentation { @@ -17,41 +18,19 @@ BaseInstanceSegmentation::BaseInstanceSegmentation( std::vector normStd, bool applyNMS, std::shared_ptr callInvoker) : VisionModel(modelSource, callInvoker), applyNMS_(applyNMS) { - - if (normMean.size() == 3) { - normMean_ = cv::Scalar(normMean[0], normMean[1], normMean[2]); - } else if (!normMean.empty()) { - log(LOG_LEVEL::Warn, - "normMean must have 3 elements — ignoring provided value."); - } - if (normStd.size() == 3) { - normStd_ = cv::Scalar(normStd[0], normStd[1], normStd[2]); - } else if (!normStd.empty()) { - log(LOG_LEVEL::Warn, - "normStd must 
have 3 elements — ignoring provided value."); - } + initNormalization(normMean, normStd); } cv::Size BaseInstanceSegmentation::modelInputSize() const { - if (currentlyLoadedMethod_.empty()) { - return VisionModel::modelInputSize(); - } - auto inputShapes = getAllInputShapes(currentlyLoadedMethod_); - if (inputShapes.empty() || inputShapes[0].size() < 2) { - return VisionModel::modelInputSize(); + if (!currentlyLoadedMethod_.empty()) { + return getModelInputSize(currentlyLoadedMethod_); } - const auto &shape = inputShapes[0]; - return {shape[shape.size() - 2], shape[shape.size() - 1]}; + return VisionModel::modelInputSize(); } TensorPtr BaseInstanceSegmentation::buildInputTensor(const cv::Mat &image) { cv::Mat preprocessed = preprocess(image); - return (normMean_.has_value() && normStd_.has_value()) - ? image_processing::getTensorFromMatrix( - modelInputShape_, preprocessed, normMean_.value(), - normStd_.value()) - : image_processing::getTensorFromMatrix(modelInputShape_, - preprocessed); + return createInputTensor(preprocessed); } std::vector BaseInstanceSegmentation::runInference( @@ -63,35 +42,26 @@ std::vector BaseInstanceSegmentation::runInference( ensureMethodLoaded(methodName); - auto inputShapes = getAllInputShapes(methodName); - if (inputShapes.empty() || inputShapes[0].empty()) { - throw RnExecutorchError(RnExecutorchErrorCode::UnexpectedNumInputs, - "Method '" + methodName + - "' has invalid input tensor shape."); - } - - modelInputShape_ = inputShapes[0]; + modelInputShape_ = validateAndGetInputShape(methodName, 2); const auto &shape = modelInputShape_; cv::Size modelInputSize(shape[shape.size() - 2], shape[shape.size() - 1]); cv::Size originalSize(image.cols, image.rows); - validateThresholds(confidenceThreshold, iouThreshold); + utils::computer_vision::validateThreshold(confidenceThreshold, + "confidenceThreshold"); + utils::computer_vision::validateThreshold(iouThreshold, "iouThreshold"); - auto forwardResult = - BaseModel::execute(methodName, 
{buildInputTensor(image)}); - if (!forwardResult.ok()) { - throw RnExecutorchError( - forwardResult.error(), - "The model's forward function did not succeed. " - "Ensure the model input is correct and method name '" + - methodName + "' is valid."); - } + auto outputs = + executeOrThrow(methodName, {buildInputTensor(image)}, + "The model's forward function did not succeed. " + "Ensure the model input is correct and method name '" + + methodName + "' is valid."); - validateOutputTensors(forwardResult.get()); + validateOutputTensors(outputs); - auto instances = collectInstances( - forwardResult.get(), originalSize, modelInputSize, confidenceThreshold, - classIndices, returnMaskAtOriginalResolution); + auto instances = collectInstances(outputs, originalSize, modelInputSize, + confidenceThreshold, classIndices, + returnMaskAtOriginalResolution); return finalizeInstances(std::move(instances), iouThreshold, maxInstances); } @@ -100,10 +70,7 @@ std::vector BaseInstanceSegmentation::generateFromString( int32_t maxInstances, std::vector classIndices, bool returnMaskAtOriginalResolution, std::string methodName) { - cv::Mat imageBGR = image_processing::readImage(imageSource); - cv::Mat imageRGB; - cv::cvtColor(imageBGR, imageRGB, cv::COLOR_BGR2RGB); - + cv::Mat imageRGB = loadImageToRGB(imageSource); return runInference(imageRGB, confidenceThreshold, iouThreshold, maxInstances, classIndices, returnMaskAtOriginalResolution, methodName); } @@ -114,15 +81,16 @@ std::vector BaseInstanceSegmentation::generateFromFrame( std::vector classIndices, bool returnMaskAtOriginalResolution, std::string methodName) { - auto orient = ::rnexecutorch::utils::readFrameOrientation(runtime, frameData); - cv::Mat frame = extractFromFrame(runtime, frameData); - cv::Mat rotated = utils::rotateFrameForModel(frame, orient); + auto [rotated, orient, _] = loadFrameRotated(runtime, frameData); auto instances = runInference(rotated, confidenceThreshold, iouThreshold, maxInstances, classIndices, 
returnMaskAtOriginalResolution, methodName); + + // Inverse-rotate bboxes for all instances + utils::inverseRotateBboxes(instances, orient, rotated.size()); + + // Inverse-rotate masks (instance-specific logic) for (auto &inst : instances) { - utils::inverseRotateBbox(inst.bbox, orient, rotated.size()); - // Inverse-rotate the mask to match the screen orientation cv::Mat maskMat(inst.maskHeight, inst.maskWidth, CV_8UC1, inst.mask->data()); cv::Mat invMask = utils::inverseRotateMat(maskMat, orient); @@ -144,19 +112,6 @@ std::vector BaseInstanceSegmentation::generateFromPixels( classIndices, returnMaskAtOriginalResolution, methodName); } -std::tuple -BaseInstanceSegmentation::extractDetectionData(const float *bboxData, - const float *scoresData, - int32_t index) { - utils::computer_vision::BBox bbox{ - bboxData[index * 4], bboxData[index * 4 + 1], bboxData[index * 4 + 2], - bboxData[index * 4 + 3]}; - float score = scoresData[index * 2]; - int32_t label = static_cast(scoresData[index * 2 + 1]); - - return {bbox, score, label}; -} - cv::Rect BaseInstanceSegmentation::computeMaskCropRect( const utils::computer_vision::BBox &bboxModel, cv::Size modelInputSize, cv::Size maskSize) { @@ -232,22 +187,6 @@ cv::Mat BaseInstanceSegmentation::processMaskFromLogits( return thresholdToBinary(probMat); } -void BaseInstanceSegmentation::validateThresholds(double confidenceThreshold, - double iouThreshold) const { - if (confidenceThreshold < 0 || confidenceThreshold > 1) { - throw RnExecutorchError( - RnExecutorchErrorCode::InvalidConfig, - "Confidence threshold must be greater or equal to 0 " - "and less than or equal to 1."); - } - - if (iouThreshold < 0 || iouThreshold > 1) { - throw RnExecutorchError(RnExecutorchErrorCode::InvalidConfig, - "IoU threshold must be greater or equal to 0 " - "and less than or equal to 1."); - } -} - void BaseInstanceSegmentation::validateOutputTensors( const std::vector &tensors) const { if (tensors.size() != 3) { @@ -258,48 +197,6 @@ void 
BaseInstanceSegmentation::validateOutputTensors( } } -std::set BaseInstanceSegmentation::prepareAllowedClasses( - const std::vector &classIndices) const { - std::set allowedClasses; - if (!classIndices.empty()) { - allowedClasses.insert(classIndices.begin(), classIndices.end()); - } - return allowedClasses; -} - -void BaseInstanceSegmentation::ensureMethodLoaded( - const std::string &methodName) { - if (methodName.empty()) { - throw RnExecutorchError( - RnExecutorchErrorCode::InvalidConfig, - "Method name cannot be empty. Use 'forward' for single-method models " - "or 'forward_{inputSize}' for multi-method models."); - } - - if (currentlyLoadedMethod_ == methodName) { - return; - } - - if (!module_) { - throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded, - "Model not loaded. Cannot load method '" + - methodName + "'."); - } - - if (!currentlyLoadedMethod_.empty()) { - module_->unload_method(currentlyLoadedMethod_); - } - - auto loadResult = module_->load_method(methodName); - if (loadResult != executorch::runtime::Error::Ok) { - throw RnExecutorchError( - loadResult, "Failed to load method '" + methodName + - "'. 
Ensure the method exists in the exported model."); - } - - currentlyLoadedMethod_ = methodName; -} - std::vector BaseInstanceSegmentation::finalizeInstances( std::vector instances, double iouThreshold, int32_t maxInstances) const { @@ -326,7 +223,7 @@ std::vector BaseInstanceSegmentation::collectInstances( static_cast(originalSize.width) / modelInputSize.width; float heightRatio = static_cast(originalSize.height) / modelInputSize.height; - auto allowedClasses = prepareAllowedClasses(classIndices); + std::set allowedClasses(classIndices.begin(), classIndices.end()); // CONTRACT auto bboxTensor = tensors[0].toTensor(); // [1, N, 4] @@ -351,7 +248,7 @@ std::vector BaseInstanceSegmentation::collectInstances( for (int32_t i = 0; i < numInstances; ++i) { auto [bboxModel, score, labelIdx] = - extractDetectionData(bboxData, scoresData, i); + utils::computer_vision::extractDetectionData(bboxData, scoresData, i); if (!isValidDetection(score, labelIdx)) { continue; diff --git a/packages/react-native-executorch/common/rnexecutorch/models/instance_segmentation/BaseInstanceSegmentation.h b/packages/react-native-executorch/common/rnexecutorch/models/instance_segmentation/BaseInstanceSegmentation.h index 341d0f2235..541ef8b683 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/instance_segmentation/BaseInstanceSegmentation.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/instance_segmentation/BaseInstanceSegmentation.h @@ -61,20 +61,8 @@ class BaseInstanceSegmentation : public VisionModel { const std::vector &classIndices, bool returnMaskAtOriginalResolution); - void validateThresholds(double confidenceThreshold, - double iouThreshold) const; void validateOutputTensors(const std::vector &tensors) const; - std::set - prepareAllowedClasses(const std::vector &classIndices) const; - - // Model loading and input helpers - void ensureMethodLoaded(const std::string &methodName); - - std::tuple - extractDetectionData(const float *bboxData, const 
float *scoresData, - int32_t index); - cv::Rect computeMaskCropRect(const utils::computer_vision::BBox &bboxModel, cv::Size modelInputSize, cv::Size maskSize); @@ -96,10 +84,7 @@ class BaseInstanceSegmentation : public VisionModel { const utils::computer_vision::BBox &bboxOriginal, cv::Size modelInputSize, cv::Size originalSize, bool warpToOriginal); - std::optional normMean_; - std::optional normStd_; bool applyNMS_; - std::string currentlyLoadedMethod_; }; } // namespace models::instance_segmentation diff --git a/packages/react-native-executorch/common/rnexecutorch/models/object_detection/ObjectDetection.cpp b/packages/react-native-executorch/common/rnexecutorch/models/object_detection/ObjectDetection.cpp index 24c4e1083a..c71a3716e4 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/object_detection/ObjectDetection.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/object_detection/ObjectDetection.cpp @@ -1,8 +1,6 @@ #include "ObjectDetection.h" #include "Constants.h" -#include - #include #include #include @@ -10,7 +8,9 @@ #include #include #include +#include #include +#include namespace rnexecutorch::models::object_detection { @@ -20,66 +20,14 @@ ObjectDetection::ObjectDetection( std::shared_ptr callInvoker) : VisionModel(modelSource, callInvoker), labelNames_(std::move(labelNames)) { - if (normMean.size() == 3) { - normMean_ = cv::Scalar(normMean[0], normMean[1], normMean[2]); - } else if (!normMean.empty()) { - log(LOG_LEVEL::Warn, - "normMean must have 3 elements — ignoring provided value."); - } - if (normStd.size() == 3) { - normStd_ = cv::Scalar(normStd[0], normStd[1], normStd[2]); - } else if (!normStd.empty()) { - log(LOG_LEVEL::Warn, - "normStd must have 3 elements — ignoring provided value."); - } + initNormalization(normMean, normStd); } cv::Size ObjectDetection::modelInputSize() const { if (currentlyLoadedMethod_.empty()) { return VisionModel::modelInputSize(); } - auto inputShapes = 
getAllInputShapes(currentlyLoadedMethod_); - if (inputShapes.empty() || inputShapes[0].size() < 2) { - throw RnExecutorchError(RnExecutorchErrorCode::UnexpectedNumInputs, - "Could not determine input shape for method: " + - currentlyLoadedMethod_); - } - const auto &shape = inputShapes[0]; - return {static_cast(shape[shape.size() - 2]), - static_cast(shape[shape.size() - 1])}; -} - -void ObjectDetection::ensureMethodLoaded(const std::string &methodName) { - if (methodName.empty()) { - throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput, - "methodName cannot be empty"); - } - if (currentlyLoadedMethod_ == methodName) { - return; - } - if (!module_) { - throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded, - "Model module is not loaded"); - } - if (!currentlyLoadedMethod_.empty()) { - module_->unload_method(currentlyLoadedMethod_); - } - auto loadResult = module_->load_method(methodName); - if (loadResult != executorch::runtime::Error::Ok) { - throw RnExecutorchError( - loadResult, "Failed to load method '" + methodName + - "'. 
Ensure the method exists in the exported model."); - } - currentlyLoadedMethod_ = methodName; -} - -std::set ObjectDetection::prepareAllowedClasses( - const std::vector &classIndices) const { - std::set allowedClasses; - if (!classIndices.empty()) { - allowedClasses.insert(classIndices.begin(), classIndices.end()); - } - return allowedClasses; + return getModelInputSize(currentlyLoadedMethod_); } std::vector @@ -93,23 +41,12 @@ ObjectDetection::postprocess(const std::vector &tensors, static_cast(originalSize.height) / inputSize.height; // Prepare allowed classes set for filtering - auto allowedClasses = prepareAllowedClasses(classIndices); + std::set allowedClasses(classIndices.begin(), classIndices.end()); std::vector detections; - auto bboxTensor = tensors.at(0).toTensor(); - std::span bboxes( - static_cast(bboxTensor.const_data_ptr()), - bboxTensor.numel()); - - auto scoreTensor = tensors.at(1).toTensor(); - std::span scores( - static_cast(scoreTensor.const_data_ptr()), - scoreTensor.numel()); - - auto labelTensor = tensors.at(2).toTensor(); - std::span labels( - static_cast(labelTensor.const_data_ptr()), - labelTensor.numel()); + auto bboxes = utils::tensor::toSpan(tensors.at(0)); + auto scores = utils::tensor::toSpan(tensors.at(1)); + auto labels = utils::tensor::toSpan(tensors.at(2)); for (std::size_t i = 0; i < scores.size(); ++i) { if (scores[i] < detectionThreshold) { @@ -146,14 +83,9 @@ ObjectDetection::postprocess(const std::vector &tensors, std::vector ObjectDetection::runInference( cv::Mat image, double detectionThreshold, double iouThreshold, const std::vector &classIndices, const std::string &methodName) { - if (detectionThreshold < 0.0 || detectionThreshold > 1.0) { - throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput, - "detectionThreshold must be in range [0, 1]"); - } - if (iouThreshold < 0.0 || iouThreshold > 1.0) { - throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput, - "iouThreshold must be in range [0, 1]"); - } + 
utils::computer_vision::validateThreshold(detectionThreshold, + "detectionThreshold"); + utils::computer_vision::validateThreshold(iouThreshold, "iouThreshold"); std::scoped_lock lock(inference_mutex_); @@ -162,44 +94,25 @@ std::vector ObjectDetection::runInference( cv::Size originalSize = image.size(); - // Query input shapes for the currently loaded method - auto inputShapes = getAllInputShapes(methodName); - if (inputShapes.empty() || inputShapes[0].size() < 2) { - throw RnExecutorchError(RnExecutorchErrorCode::UnexpectedNumInputs, - "Could not determine input shape for method: " + - methodName); - } - modelInputShape_ = inputShapes[0]; + // Query and validate input shapes for the currently loaded method + modelInputShape_ = validateAndGetInputShape(methodName, 2); cv::Mat preprocessed = preprocess(image); + auto inputTensor = createInputTensor(preprocessed); - auto inputTensor = - (normMean_ && normStd_) - ? image_processing::getTensorFromMatrix( - modelInputShape_, preprocessed, *normMean_, *normStd_) - : image_processing::getTensorFromMatrix(modelInputShape_, - preprocessed); - - auto executeResult = execute(methodName, {inputTensor}); - if (!executeResult.ok()) { - throw RnExecutorchError(executeResult.error(), - "The model's " + methodName + - " method did not succeed. " - "Ensure the model input is correct."); - } + auto outputs = executeOrThrow(methodName, {inputTensor}, + "The model's " + methodName + + " method did not succeed. 
" + "Ensure the model input is correct."); - return postprocess(executeResult.get(), originalSize, detectionThreshold, - iouThreshold, classIndices); + return postprocess(outputs, originalSize, detectionThreshold, iouThreshold, + classIndices); } std::vector ObjectDetection::generateFromString( std::string imageSource, double detectionThreshold, double iouThreshold, std::vector classIndices, std::string methodName) { - cv::Mat imageBGR = image_processing::readImage(imageSource); - - cv::Mat imageRGB; - cv::cvtColor(imageBGR, imageRGB, cv::COLOR_BGR2RGB); - + cv::Mat imageRGB = loadImageToRGB(imageSource); return runInference(imageRGB, detectionThreshold, iouThreshold, classIndices, methodName); } @@ -208,15 +121,11 @@ std::vector ObjectDetection::generateFromFrame( jsi::Runtime &runtime, const jsi::Value &frameData, double detectionThreshold, double iouThreshold, std::vector classIndices, std::string methodName) { - auto orient = ::rnexecutorch::utils::readFrameOrientation(runtime, frameData); - cv::Mat frame = extractFromFrame(runtime, frameData); - cv::Mat rotated = ::rnexecutorch::utils::rotateFrameForModel(frame, orient); + auto [rotated, orient, _] = loadFrameRotated(runtime, frameData); auto detections = runInference(rotated, detectionThreshold, iouThreshold, classIndices, methodName); - for (auto &det : detections) { - ::rnexecutorch::utils::inverseRotateBbox(det.bbox, orient, rotated.size()); - } + utils::inverseRotateBboxes(detections, orient, rotated.size()); return detections; } diff --git a/packages/react-native-executorch/common/rnexecutorch/models/object_detection/ObjectDetection.h b/packages/react-native-executorch/common/rnexecutorch/models/object_detection/ObjectDetection.h index 6e3c01356e..6484d5d213 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/object_detection/ObjectDetection.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/object_detection/ObjectDetection.h @@ -125,36 +125,8 @@ class 
ObjectDetection : public VisionModel { double detectionThreshold, double iouThreshold, const std::vector &classIndices); - /** - * @brief Ensures the specified method is loaded, unloading any previous - * method if necessary. - * - * @param methodName Name of the method to load (e.g., "forward", - * "forward_384"). - * @throws RnExecutorchError if the method cannot be loaded. - */ - void ensureMethodLoaded(const std::string &methodName); - - /** - * @brief Prepares a set of allowed class indices for filtering detections. - * - * @param classIndices Vector of class indices to allow. - * @return A set containing the allowed class indices. - */ - std::set - prepareAllowedClasses(const std::vector &classIndices) const; - - /// Optional per-channel mean for input normalisation (set in constructor). - std::optional normMean_; - - /// Optional per-channel standard deviation for input normalisation. - std::optional normStd_; - /// Ordered label strings mapping class indices to human-readable names. std::vector labelNames_; - - /// Name of the currently loaded method (for multi-method models). 
- std::string currentlyLoadedMethod_; }; } // namespace models::object_detection diff --git a/packages/react-native-executorch/common/rnexecutorch/models/ocr/Detector.cpp b/packages/react-native-executorch/common/rnexecutorch/models/ocr/Detector.cpp index e838a7a0f8..62dfa0cc6e 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/ocr/Detector.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/ocr/Detector.cpp @@ -10,19 +10,11 @@ namespace rnexecutorch::models::ocr { Detector::Detector(const std::string &modelSource, std::shared_ptr callInvoker) - : BaseModel(modelSource, callInvoker) { - + : VisionModel(modelSource, callInvoker) { + // Validate all supported input widths for (auto input_size : constants::kDetectorInputWidths) { std::string methodName = "forward_" + std::to_string(input_size); - auto inputShapes = getAllInputShapes(methodName); - if (inputShapes[0].size() < 2) { - std::string errorMessage = - "Unexpected detector model input size for method: " + methodName + - "expected at least 2 dimensions but got: ." 
+ - std::to_string(inputShapes[0].size()); - throw RnExecutorchError(RnExecutorchErrorCode::UnexpectedNumInputs, - errorMessage); - } + validateAndGetInputShape(methodName, 2); } } @@ -61,17 +53,10 @@ std::vector Detector::generate(const cv::Mat &inputImage, } cv::Size Detector::calculateModelImageSize(int32_t methodInputWidth) { - utils::validateInputWidth(methodInputWidth, constants::kDetectorInputWidths, "Detector"); std::string methodName = "forward_" + std::to_string(methodInputWidth); - - auto inputShapes = getAllInputShapes(methodName); - std::vector modelInputShape = inputShapes[0]; - cv::Size modelInputSize = - cv::Size(modelInputShape[modelInputShape.size() - 1], - modelInputShape[modelInputShape.size() - 2]); - return modelInputSize; + return getModelInputSize(methodName); } std::vector diff --git a/packages/react-native-executorch/common/rnexecutorch/models/ocr/Detector.h b/packages/react-native-executorch/common/rnexecutorch/models/ocr/Detector.h index dc17aa0742..b77a379fd5 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/ocr/Detector.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/ocr/Detector.h @@ -3,7 +3,7 @@ #include #include #include -#include +#include #include namespace rnexecutorch::models::ocr { @@ -17,13 +17,13 @@ namespace rnexecutorch::models::ocr { using executorch::aten::Tensor; using executorch::extension::TensorPtr; -class Detector : public BaseModel { +class Detector : public models::VisionModel { public: explicit Detector(const std::string &modelSource, std::shared_ptr callInvoker); - [[nodiscard("Registered non-void function")]] - virtual std::vector - generate(const cv::Mat &inputImage, int32_t inputWidth); + [[nodiscard("Registered non-void function")]] + virtual std::vector generate(const cv::Mat &inputImage, + int32_t inputWidth); cv::Size calculateModelImageSize(int32_t methodInputWidth); diff --git 
a/packages/react-native-executorch/common/rnexecutorch/models/semantic_segmentation/BaseSemanticSegmentation.cpp b/packages/react-native-executorch/common/rnexecutorch/models/semantic_segmentation/BaseSemanticSegmentation.cpp index 66458cb569..aecc6f625e 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/semantic_segmentation/BaseSemanticSegmentation.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/semantic_segmentation/BaseSemanticSegmentation.cpp @@ -8,6 +8,7 @@ #include #include #include +#include namespace rnexecutorch::models::semantic_segmentation { @@ -18,33 +19,11 @@ BaseSemanticSegmentation::BaseSemanticSegmentation( : VisionModel(modelSource, callInvoker), allClasses_(std::move(allClasses)) { initModelImageSize(); - if (normMean.size() == 3) { - normMean_ = cv::Scalar(normMean[0], normMean[1], normMean[2]); - } else if (!normMean.empty()) { - log(LOG_LEVEL::Warn, - "normMean must have 3 elements — ignoring provided value."); - } - if (normStd.size() == 3) { - normStd_ = cv::Scalar(normStd[0], normStd[1], normStd[2]); - } else if (!normStd.empty()) { - log(LOG_LEVEL::Warn, - "normStd must have 3 elements — ignoring provided value."); - } + initNormalization(normMean, normStd); } void BaseSemanticSegmentation::initModelImageSize() { - auto inputShapes = getAllInputShapes(); - if (inputShapes.empty()) { - throw RnExecutorchError(RnExecutorchErrorCode::UnexpectedNumInputs, - "Model seems to not take any input tensors."); - } - modelInputShape_ = inputShapes[0]; - if (modelInputShape_.size() < 2) { - throw RnExecutorchError(RnExecutorchErrorCode::WrongDimensions, - "Unexpected model input size, expected at least 2 " - "dimensions but got: " + - std::to_string(modelInputShape_.size()) + "."); - } + modelInputShape_ = validateAndGetInputShape(); numModelPixels = modelInputSize().area(); } @@ -55,33 +34,22 @@ BaseSemanticSegmentation::runInference( std::scoped_lock lock(inference_mutex_); cv::Mat preprocessed = 
VisionModel::preprocess(image); - auto inputTensor = - (normMean_ && normStd_) - ? image_processing::getTensorFromMatrix( - modelInputShape_, preprocessed, *normMean_, *normStd_) - : image_processing::getTensorFromMatrix(modelInputShape_, - preprocessed); - - auto forwardResult = BaseModel::forward(inputTensor); - if (!forwardResult.ok()) { - throw RnExecutorchError(forwardResult.error(), - "The model's forward function did not succeed. " - "Ensure the model input is correct."); - } + auto inputTensor = createInputTensor(preprocessed); + + auto outputs = forwardOrThrow(inputTensor, + "The model's forward function did not succeed. " + "Ensure the model input is correct."); - return computeResult(forwardResult->at(0).toTensor(), originalSize, - allClasses_, classesOfInterest, resize); + return computeResult(outputs.at(0).toTensor(), originalSize, allClasses_, + classesOfInterest, resize); } semantic_segmentation::SegmentationResult BaseSemanticSegmentation::generateFromString( std::string imageSource, std::set> classesOfInterest, bool resize) { - cv::Mat imageBGR = image_processing::readImage(imageSource); - cv::Size originalSize = imageBGR.size(); - cv::Mat imageRGB; - cv::cvtColor(imageBGR, imageRGB, cv::COLOR_BGR2RGB); - + cv::Mat imageRGB = loadImageToRGB(imageSource); + cv::Size originalSize = imageRGB.size(); return runInference(imageRGB, originalSize, classesOfInterest, resize); } @@ -97,15 +65,13 @@ semantic_segmentation::SegmentationResult BaseSemanticSegmentation::generateFromFrame( jsi::Runtime &runtime, const jsi::Value &frameData, std::set> classesOfInterest, bool resize) { - auto orient = ::rnexecutorch::utils::readFrameOrientation(runtime, frameData); - cv::Mat frame = extractFromFrame(runtime, frameData); - cv::Mat rotated = utils::rotateFrameForModel(frame, orient); + auto [rotated, orient, originalSize] = loadFrameRotated(runtime, frameData); // Always run inference without resize — rotate first, then resize. 
auto result = runInference(rotated, rotated.size(), classesOfInterest, false); const cv::Size outputSize = modelInputSize(); // JS reads maskW=frame.height, maskH=frame.width (sensor-native swap). - const cv::Size frameSize = frame.size(); + const cv::Size frameSize = originalSize; auto inverseAndResize = [&orient, &frameSize, &outputSize, resize](std::shared_ptr &buf, @@ -139,8 +105,7 @@ BaseSemanticSegmentation::computeResult( std::vector &allClasses, std::set> &classesOfInterest, bool resize) { - const auto *dataPtr = tensor.const_data_ptr(); - auto resultData = std::span(dataPtr, tensor.numel()); + auto resultData = utils::tensor::toSpan(tensor); // Read output dimensions directly from tensor shape std::size_t numChannels = diff --git a/packages/react-native-executorch/common/rnexecutorch/models/semantic_segmentation/BaseSemanticSegmentation.h b/packages/react-native-executorch/common/rnexecutorch/models/semantic_segmentation/BaseSemanticSegmentation.h index a30ae375bf..ba207d919b 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/semantic_segmentation/BaseSemanticSegmentation.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/semantic_segmentation/BaseSemanticSegmentation.h @@ -47,8 +47,6 @@ class BaseSemanticSegmentation : public VisionModel { std::set> &classesOfInterest, bool resize); std::size_t numModelPixels; - std::optional normMean_; - std::optional normStd_; std::vector allClasses_; private: diff --git a/packages/react-native-executorch/common/rnexecutorch/models/style_transfer/StyleTransfer.cpp b/packages/react-native-executorch/common/rnexecutorch/models/style_transfer/StyleTransfer.cpp index 70a6ec916d..a5a6d63b00 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/style_transfer/StyleTransfer.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/style_transfer/StyleTransfer.cpp @@ -16,40 +16,21 @@ using executorch::extension::TensorPtr; StyleTransfer::StyleTransfer(const 
std::string &modelSource, std::shared_ptr callInvoker) : VisionModel(modelSource, callInvoker) { - auto inputShapes = getAllInputShapes(); - if (inputShapes.size() == 0) { - throw RnExecutorchError(RnExecutorchErrorCode::UnexpectedNumInputs, - "Model seems to not take any input tensors"); - } - modelInputShape_ = inputShapes[0]; - if (modelInputShape_.size() < 2) { - char errorMessage[100]; - std::snprintf(errorMessage, sizeof(errorMessage), - "Unexpected model input size, expected at least 2 dimensions " - "but got: %zu.", - modelInputShape_.size()); - throw RnExecutorchError(RnExecutorchErrorCode::UnexpectedNumInputs, - errorMessage); - } + modelInputShape_ = validateAndGetInputShape(); } cv::Mat StyleTransfer::runInference(cv::Mat image, cv::Size outputSize) { std::scoped_lock lock(inference_mutex_); cv::Mat preprocessed = preprocess(image); + auto inputTensor = createInputTensor(preprocessed); - auto inputTensor = - image_processing::getTensorFromMatrix(modelInputShape_, preprocessed); + auto outputs = forwardOrThrow(inputTensor, + "The model's forward function did not succeed. " + "Ensure the model input is correct."); - auto forwardResult = BaseModel::forward(inputTensor); - if (!forwardResult.ok()) { - throw RnExecutorchError(forwardResult.error(), - "The model's forward function did not succeed. 
" - "Ensure the model input is correct."); - } - - cv::Mat mat = image_processing::getMatrixFromTensor( - modelInputSize(), forwardResult->at(0).toTensor()); + cv::Mat mat = image_processing::getMatrixFromTensor(modelInputSize(), + outputs.at(0).toTensor()); if (mat.size() != outputSize) { cv::resize(mat, mat, outputSize); } @@ -68,11 +49,8 @@ PixelDataResult toPixelDataResult(const cv::Mat &bgrMat) { StyleTransferResult StyleTransfer::generateFromString(std::string imageSource, bool saveToFile) { - cv::Mat imageBGR = image_processing::readImage(imageSource); - cv::Size originalSize = imageBGR.size(); - - cv::Mat imageRGB; - cv::cvtColor(imageBGR, imageRGB, cv::COLOR_BGR2RGB); + cv::Mat imageRGB = loadImageToRGB(imageSource); + cv::Size originalSize = imageRGB.size(); cv::Mat result = runInference(imageRGB, originalSize); if (saveToFile) { @@ -83,9 +61,7 @@ StyleTransferResult StyleTransfer::generateFromString(std::string imageSource, PixelDataResult StyleTransfer::generateFromFrame(jsi::Runtime &runtime, const jsi::Value &frameData) { - auto orient = ::rnexecutorch::utils::readFrameOrientation(runtime, frameData); - cv::Mat frame = extractFromFrame(runtime, frameData); - cv::Mat rotated = utils::rotateFrameForModel(frame, orient); + auto [rotated, orient, _] = loadFrameRotated(runtime, frameData); cv::Mat output = runInference(rotated, modelInputSize()); cv::Mat oriented = utils::inverseRotateMat(output, orient); return toPixelDataResult(oriented); diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt index d68ab33509..aa030737f1 100644 --- a/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt +++ b/packages/react-native-executorch/common/rnexecutorch/tests/CMakeLists.txt @@ -171,6 +171,13 @@ add_rn_test(FrameTransformTests unit/FrameTransformTest.cpp LIBS opencv_deps ) +add_rn_test(TensorHelpersTest unit/TensorHelpersTest.cpp) + 
+add_rn_test(ComputerVisionProcessingTest unit/ComputerVisionProcessingTest.cpp + SOURCES + ${RNEXECUTORCH_DIR}/utils/computer_vision/Processing.cpp +) + add_rn_test(BaseModelTests integration/BaseModelTest.cpp) add_rn_test(VisionModelTests integration/VisionModelTest.cpp @@ -333,6 +340,7 @@ add_rn_test(OCRTests integration/OCRTest.cpp ${RNEXECUTORCH_DIR}/models/ocr/utils/DetectorUtils.cpp ${RNEXECUTORCH_DIR}/models/ocr/utils/RecognitionHandlerUtils.cpp ${RNEXECUTORCH_DIR}/models/ocr/utils/RecognizerUtils.cpp + ${RNEXECUTORCH_DIR}/models/VisionModel.cpp ${RNEXECUTORCH_DIR}/utils/FrameProcessor.cpp ${RNEXECUTORCH_DIR}/utils/FrameExtractor.cpp ${RNEXECUTORCH_DIR}/utils/FrameTransform.cpp @@ -350,6 +358,7 @@ add_rn_test(VerticalOCRTests integration/VerticalOCRTest.cpp ${RNEXECUTORCH_DIR}/models/ocr/utils/DetectorUtils.cpp ${RNEXECUTORCH_DIR}/models/ocr/utils/RecognitionHandlerUtils.cpp ${RNEXECUTORCH_DIR}/models/ocr/utils/RecognizerUtils.cpp + ${RNEXECUTORCH_DIR}/models/VisionModel.cpp ${RNEXECUTORCH_DIR}/utils/FrameProcessor.cpp ${RNEXECUTORCH_DIR}/utils/FrameExtractor.cpp ${RNEXECUTORCH_DIR}/utils/FrameTransform.cpp diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/unit/ComputerVisionProcessingTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/unit/ComputerVisionProcessingTest.cpp new file mode 100644 index 0000000000..57d71b7368 --- /dev/null +++ b/packages/react-native-executorch/common/rnexecutorch/tests/unit/ComputerVisionProcessingTest.cpp @@ -0,0 +1,163 @@ +#include +#include +#include +#include + +using namespace rnexecutorch::utils::computer_vision; + +// ============================================================================ +// extractDetectionData — Extract bbox, score, label from raw tensor data +// ============================================================================ + +TEST(ExtractDetectionData, SingleDetection) { + // Format: bboxData = [x1, y1, x2, y2] per detection + // scoresData = [score, label] 
per detection + std::vector bboxData = {10.0f, 20.0f, 100.0f, 200.0f}; + std::vector scoresData = {0.95f, 5.0f}; + + auto [bbox, score, label] = + extractDetectionData(bboxData.data(), scoresData.data(), 0); + + EXPECT_FLOAT_EQ(bbox.x1, 10.0f); + EXPECT_FLOAT_EQ(bbox.y1, 20.0f); + EXPECT_FLOAT_EQ(bbox.x2, 100.0f); + EXPECT_FLOAT_EQ(bbox.y2, 200.0f); + EXPECT_FLOAT_EQ(score, 0.95f); + EXPECT_EQ(label, 5); +} + +TEST(ExtractDetectionData, MultipleDetections_FirstIndex) { + std::vector bboxData = { + 10.0f, 20.0f, 100.0f, 200.0f, // Detection 0 + 150.0f, 50.0f, 250.0f, 150.0f, // Detection 1 + 300.0f, 100.0f, 400.0f, 300.0f // Detection 2 + }; + std::vector scoresData = { + 0.95f, 5.0f, // Detection 0: score, label + 0.85f, 3.0f, // Detection 1 + 0.75f, 12.0f // Detection 2 + }; + + auto [bbox, score, label] = + extractDetectionData(bboxData.data(), scoresData.data(), 0); + + EXPECT_FLOAT_EQ(bbox.x1, 10.0f); + EXPECT_FLOAT_EQ(bbox.y1, 20.0f); + EXPECT_FLOAT_EQ(bbox.x2, 100.0f); + EXPECT_FLOAT_EQ(bbox.y2, 200.0f); + EXPECT_FLOAT_EQ(score, 0.95f); + EXPECT_EQ(label, 5); +} + +TEST(ExtractDetectionData, MultipleDetections_SecondIndex) { + std::vector bboxData = { + 10.0f, 20.0f, 100.0f, 200.0f, // Detection 0 + 150.0f, 50.0f, 250.0f, 150.0f, // Detection 1 + 300.0f, 100.0f, 400.0f, 300.0f // Detection 2 + }; + std::vector scoresData = { + 0.95f, 5.0f, // Detection 0 + 0.85f, 3.0f, // Detection 1 + 0.75f, 12.0f // Detection 2 + }; + + auto [bbox, score, label] = + extractDetectionData(bboxData.data(), scoresData.data(), 1); + + EXPECT_FLOAT_EQ(bbox.x1, 150.0f); + EXPECT_FLOAT_EQ(bbox.y1, 50.0f); + EXPECT_FLOAT_EQ(bbox.x2, 250.0f); + EXPECT_FLOAT_EQ(bbox.y2, 150.0f); + EXPECT_FLOAT_EQ(score, 0.85f); + EXPECT_EQ(label, 3); +} + +TEST(ExtractDetectionData, MultipleDetections_ThirdIndex) { + std::vector bboxData = { + 10.0f, 20.0f, 100.0f, 200.0f, // Detection 0 + 150.0f, 50.0f, 250.0f, 150.0f, // Detection 1 + 300.0f, 100.0f, 400.0f, 300.0f // Detection 2 + }; + std::vector 
scoresData = { + 0.95f, 5.0f, // Detection 0 + 0.85f, 3.0f, // Detection 1 + 0.75f, 12.0f // Detection 2 + }; + + auto [bbox, score, label] = + extractDetectionData(bboxData.data(), scoresData.data(), 2); + + EXPECT_FLOAT_EQ(bbox.x1, 300.0f); + EXPECT_FLOAT_EQ(bbox.y1, 100.0f); + EXPECT_FLOAT_EQ(bbox.x2, 400.0f); + EXPECT_FLOAT_EQ(bbox.y2, 300.0f); + EXPECT_FLOAT_EQ(score, 0.75f); + EXPECT_EQ(label, 12); +} + +TEST(ExtractDetectionData, LowConfidenceDetection) { + std::vector bboxData = {50.0f, 60.0f, 150.0f, 160.0f}; + std::vector scoresData = {0.05f, 1.0f}; // Very low confidence + + auto [bbox, score, label] = + extractDetectionData(bboxData.data(), scoresData.data(), 0); + + EXPECT_FLOAT_EQ(score, 0.05f); + EXPECT_EQ(label, 1); +} + +TEST(ExtractDetectionData, ZeroBasedLabelIndex) { + std::vector bboxData = {0.0f, 0.0f, 100.0f, 100.0f}; + std::vector scoresData = {0.9f, 0.0f}; // Label index 0 + + auto [bbox, score, label] = + extractDetectionData(bboxData.data(), scoresData.data(), 0); + + EXPECT_EQ(label, 0); +} + +TEST(ExtractDetectionData, LargeLabelIndex) { + std::vector bboxData = {0.0f, 0.0f, 100.0f, 100.0f}; + std::vector scoresData = {0.9f, 999.0f}; // Large label index + + auto [bbox, score, label] = + extractDetectionData(bboxData.data(), scoresData.data(), 0); + + EXPECT_EQ(label, 999); +} + +TEST(ExtractDetectionData, FloatToInt32Conversion) { + std::vector bboxData = {0.0f, 0.0f, 100.0f, 100.0f}; + std::vector scoresData = {0.9f, 42.7f}; // Float label gets truncated + + auto [bbox, score, label] = + extractDetectionData(bboxData.data(), scoresData.data(), 0); + + EXPECT_EQ(label, 42); // Should truncate, not round +} + +TEST(ExtractDetectionData, NegativeCoordinates) { + std::vector bboxData = {-10.0f, -20.0f, 50.0f, 60.0f}; + std::vector scoresData = {0.8f, 2.0f}; + + auto [bbox, score, label] = + extractDetectionData(bboxData.data(), scoresData.data(), 0); + + EXPECT_FLOAT_EQ(bbox.x1, -10.0f); + EXPECT_FLOAT_EQ(bbox.y1, -20.0f); + 
EXPECT_FLOAT_EQ(bbox.x2, 50.0f); + EXPECT_FLOAT_EQ(bbox.y2, 60.0f); +} + +TEST(ExtractDetectionData, FractionalCoordinates) { + std::vector bboxData = {10.5f, 20.75f, 100.25f, 200.9f}; + std::vector scoresData = {0.88f, 7.0f}; + + auto [bbox, score, label] = + extractDetectionData(bboxData.data(), scoresData.data(), 0); + + EXPECT_FLOAT_EQ(bbox.x1, 10.5f); + EXPECT_FLOAT_EQ(bbox.y1, 20.75f); + EXPECT_FLOAT_EQ(bbox.x2, 100.25f); + EXPECT_FLOAT_EQ(bbox.y2, 200.9f); +} diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/unit/FrameTransformTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/unit/FrameTransformTest.cpp index b5c0993128..1ec6d52ce6 100644 --- a/packages/react-native-executorch/common/rnexecutorch/tests/unit/FrameTransformTest.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/tests/unit/FrameTransformTest.cpp @@ -345,3 +345,68 @@ TEST(InverseRotatePoints, Down_CCW) { EXPECT_FLOAT_EQ(pts[3].x, 80); EXPECT_FLOAT_EQ(pts[3].y, 570); } + +// ============================================================================ +// inverseRotateBboxes — batch inverse rotation for containers with .bbox +// ============================================================================ + +struct Detection { + BBox bbox; + float score; +}; + +// Test batch rotation of multiple detections +TEST(InverseRotateBboxes, BatchRotation_Up) { + std::vector detections = {{{10, 20, 100, 200}, 0.9f}, + {{150, 50, 200, 150}, 0.8f}, + {{250, 100, 300, 250}, 0.7f}}; + + inverseRotateBboxes(detections, makeOrient("up", false), {640, 480}); + + // First detection: (10,20)-(100,200) → CW + EXPECT_FLOAT_EQ(detections[0].bbox.x1, 280); + EXPECT_FLOAT_EQ(detections[0].bbox.y1, 10); + EXPECT_FLOAT_EQ(detections[0].bbox.x2, 460); + EXPECT_FLOAT_EQ(detections[0].bbox.y2, 100); + + // Second detection: (150,50)-(200,150) → CW + EXPECT_FLOAT_EQ(detections[1].bbox.x1, 330); + EXPECT_FLOAT_EQ(detections[1].bbox.y1, 150); + 
EXPECT_FLOAT_EQ(detections[1].bbox.x2, 430);
+  EXPECT_FLOAT_EQ(detections[1].bbox.y2, 200);
+
+  // Third detection: (250,100)-(300,250) → CW
+  EXPECT_FLOAT_EQ(detections[2].bbox.x1, 230);
+  EXPECT_FLOAT_EQ(detections[2].bbox.y1, 250);
+  EXPECT_FLOAT_EQ(detections[2].bbox.x2, 380);
+  EXPECT_FLOAT_EQ(detections[2].bbox.y2, 300);
+}
+
+// Test with empty container
+TEST(InverseRotateBboxes, EmptyContainer) {
+  std::vector<Detection> detections;
+  inverseRotateBboxes(detections, makeOrient("up", false), {640, 480});
+  EXPECT_EQ(detections.size(), 0);
+}
+
+// Test with single detection
+TEST(InverseRotateBboxes, SingleDetection) {
+  std::vector<Detection> detections = {{{10, 20, 100, 200}, 0.9f}};
+  inverseRotateBboxes(detections, makeOrient("left", false), {640, 480});
+
+  // "left" → no-op
+  EXPECT_FLOAT_EQ(detections[0].bbox.x1, 10);
+  EXPECT_FLOAT_EQ(detections[0].bbox.y1, 20);
+  EXPECT_FLOAT_EQ(detections[0].bbox.x2, 100);
+  EXPECT_FLOAT_EQ(detections[0].bbox.y2, 200);
+  EXPECT_FLOAT_EQ(detections[0].score, 0.9f);
+}
+
+// Test that other fields are preserved
+TEST(InverseRotateBboxes, PreservesOtherFields) {
+  std::vector<Detection> detections = {{{10, 20, 100, 200}, 0.95f}};
+  inverseRotateBboxes(detections, makeOrient("down", false), {640, 480});
+
+  // Score should be unchanged
+  EXPECT_FLOAT_EQ(detections[0].score, 0.95f);
+}
diff --git a/packages/react-native-executorch/common/rnexecutorch/tests/unit/TensorHelpersTest.cpp b/packages/react-native-executorch/common/rnexecutorch/tests/unit/TensorHelpersTest.cpp
new file mode 100644
index 0000000000..a1c5077e3a
--- /dev/null
+++ b/packages/react-native-executorch/common/rnexecutorch/tests/unit/TensorHelpersTest.cpp
@@ -0,0 +1,121 @@
+#include <executorch/extension/tensor/tensor_ptr.h>
+#include <executorch/runtime/core/evalue.h>
+#include <gtest/gtest.h>
+#include <rnexecutorch/utils/TensorHelpers.h>
+#include <type_traits>
+#include <vector>
+
+using namespace rnexecutorch::utils::tensor;
+using executorch::aten::ScalarType;
+using executorch::extension::make_tensor_ptr;
+using executorch::runtime::EValue;
+
+// ============================================================================
+// toSpan(Tensor) — Convert tensor to typed span
+// ============================================================================
+
+TEST(TensorHelpers, ToSpan_FloatTensor) {
+  std::vector<float> data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f};
+  auto tensor = make_tensor_ptr({5}, data.data(), ScalarType::Float);
+
+  auto span = toSpan(*tensor);
+
+  EXPECT_EQ(span.size(), 5);
+  EXPECT_FLOAT_EQ(span[0], 1.0f);
+  EXPECT_FLOAT_EQ(span[1], 2.0f);
+  EXPECT_FLOAT_EQ(span[2], 3.0f);
+  EXPECT_FLOAT_EQ(span[3], 4.0f);
+  EXPECT_FLOAT_EQ(span[4], 5.0f);
+}
+
+TEST(TensorHelpers, ToSpan_Int32Tensor) {
+  std::vector<int32_t> data = {10, 20, 30, 40};
+  auto tensor = make_tensor_ptr({4}, data.data(), ScalarType::Int);
+
+  auto span = toSpan<int32_t>(*tensor);
+
+  EXPECT_EQ(span.size(), 4);
+  EXPECT_EQ(span[0], 10);
+  EXPECT_EQ(span[1], 20);
+  EXPECT_EQ(span[2], 30);
+  EXPECT_EQ(span[3], 40);
+}
+
+TEST(TensorHelpers, ToSpan_MultidimensionalTensor) {
+  std::vector<float> data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f};
+  auto tensor = make_tensor_ptr({2, 3}, data.data(), ScalarType::Float);
+
+  auto span = toSpan(*tensor);
+
+  // Should flatten to 1D span
+  EXPECT_EQ(span.size(), 6);
+  EXPECT_FLOAT_EQ(span[0], 1.0f);
+  EXPECT_FLOAT_EQ(span[5], 6.0f);
+}
+
+TEST(TensorHelpers, ToSpan_EmptyTensor) {
+  std::vector<float> data;
+  auto tensor = make_tensor_ptr({0}, data.data(), ScalarType::Float);
+
+  auto span = toSpan(*tensor);
+
+  EXPECT_EQ(span.size(), 0);
+}
+
+// ============================================================================
+// toSpan(EValue) — Extract tensor from EValue then convert to span
+// ============================================================================
+
+TEST(TensorHelpers, ToSpan_FromEValue) {
+  std::vector<float> data = {1.5f, 2.5f, 3.5f};
+  auto tensor = make_tensor_ptr({3}, data.data(), ScalarType::Float);
+  EValue evalue(*tensor);
+
+  auto span = toSpan(evalue);
+
+  EXPECT_EQ(span.size(), 3);
+  EXPECT_FLOAT_EQ(span[0], 1.5f);
+  EXPECT_FLOAT_EQ(span[1], 2.5f);
+  EXPECT_FLOAT_EQ(span[2], 3.5f);
+}
+
+TEST(TensorHelpers, ToSpan_FromEValue_LargeTensor) {
+  std::vector<float> data(100);
+  for (int i = 0; i < 100; ++i) {
+    data[i] = static_cast<float>(i);
+  }
+  auto tensor = make_tensor_ptr({100}, data.data(), ScalarType::Float);
+  EValue evalue(*tensor);
+
+  auto span = toSpan(evalue);
+
+  EXPECT_EQ(span.size(), 100);
+  EXPECT_FLOAT_EQ(span[0], 0.0f);
+  EXPECT_FLOAT_EQ(span[50], 50.0f);
+  EXPECT_FLOAT_EQ(span[99], 99.0f);
+}
+
+// ============================================================================
+// Type safety and const correctness
+// ============================================================================
+
+TEST(TensorHelpers, SpanIsConst) {
+  std::vector<float> data = {1.0f, 2.0f, 3.0f};
+  auto tensor = make_tensor_ptr({3}, data.data(), ScalarType::Float);
+
+  auto span = toSpan(*tensor);
+
+  // Verify span is const (compile-time check, but we can verify element type)
+  static_assert(
+      std::is_const_v<std::remove_reference_t<decltype(span[0])>>);
+}
+
+TEST(TensorHelpers, CorrectDataPointer) {
+  std::vector<float> data = {1.0f, 2.0f, 3.0f};
+  auto tensor = make_tensor_ptr({3}, data.data(), ScalarType::Float);
+
+  auto span = toSpan(*tensor);
+
+  // Span should point to the same data as the original tensor
+  EXPECT_EQ(span.data(), tensor->const_data_ptr<float>());
+}
diff --git a/packages/react-native-executorch/common/rnexecutorch/utils/FrameTransform.h b/packages/react-native-executorch/common/rnexecutorch/utils/FrameTransform.h
index ed3fb124f4..7eaf66009a 100644
--- a/packages/react-native-executorch/common/rnexecutorch/utils/FrameTransform.h
+++ b/packages/react-native-executorch/common/rnexecutorch/utils/FrameTransform.h
@@ -112,4 +112,20 @@ void inverseRotatePoints(std::array<cv::Point2f, 4> &points,
 #endif
 }
 
+/**
+ * @brief Inverse-rotate all bboxes in a container of detections/instances.
+ *
+ * Items must expose a .bbox member of type computer_vision::BBox.
+ */
+template <typename Container>
+  requires requires(Container c) {
+    { c.begin()->bbox } -> std::convertible_to<computer_vision::BBox &>;
+  }
+void inverseRotateBboxes(Container &items, const FrameOrientation &orient,
+                         cv::Size rotatedSize) {
+  for (auto &item : items) {
+    inverseRotateBbox(item.bbox, orient, rotatedSize);
+  }
+}
+
 } // namespace rnexecutorch::utils
diff --git a/packages/react-native-executorch/common/rnexecutorch/utils/TensorHelpers.h b/packages/react-native-executorch/common/rnexecutorch/utils/TensorHelpers.h
new file mode 100644
index 0000000000..67e129584c
--- /dev/null
+++ b/packages/react-native-executorch/common/rnexecutorch/utils/TensorHelpers.h
@@ -0,0 +1,28 @@
+#pragma once
+
+#include <executorch/runtime/core/evalue.h>
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <span>
+
+namespace rnexecutorch {
+namespace utils {
+namespace tensor {
+
+/// Returns a read-only span over the tensor's flat data buffer.
+/// The span is valid only as long as the tensor exists.
+template <typename T = float>
+std::span<const T> toSpan(const executorch::aten::Tensor &tensor) {
+  return std::span(static_cast<const T *>(tensor.const_data_ptr()),
+                   tensor.numel());
+}
+
+/// Convenience overload that extracts the tensor from an EValue first.
+/// Assumes evalue.isTensor() == true.
+template <typename T = float>
+std::span<const T> toSpan(const executorch::runtime::EValue &evalue) {
+  return toSpan<T>(evalue.toTensor());
+}
+
+} // namespace tensor
+} // namespace utils
+} // namespace rnexecutorch
diff --git a/packages/react-native-executorch/common/rnexecutorch/utils/computer_vision/Processing.cpp b/packages/react-native-executorch/common/rnexecutorch/utils/computer_vision/Processing.cpp
index 108fd6ff8a..feb52209e6 100644
--- a/packages/react-native-executorch/common/rnexecutorch/utils/computer_vision/Processing.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/utils/computer_vision/Processing.cpp
@@ -1,6 +1,8 @@
 #include "Processing.h"
 #include <algorithm>
 #include <cmath>
+#include <rnexecutorch/RnExecutorchError.h>
+#include <string>
 
 namespace rnexecutorch::utils::computer_vision {
 
@@ -18,4 +20,21 @@ float computeIoU(const BBox &a, const BBox &b) {
   return (unionArea > 0.0f) ? (intersectionArea / unionArea) : 0.0f;
 }
 
+void validateThreshold(double value, const std::string &name) {
+  if (value < 0.0 || value > 1.0) {
+    throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput,
+                            name + " must be in range [0, 1]");
+  }
+}
+
+std::tuple<BBox, float, int32_t> extractDetectionData(const float *bboxData,
+                                                      const float *scoresData,
+                                                      int32_t index) {
+  BBox bbox{bboxData[index * 4], bboxData[index * 4 + 1],
+            bboxData[index * 4 + 2], bboxData[index * 4 + 3]};
+  float score = scoresData[index * 2];
+  int32_t label = static_cast<int32_t>(scoresData[index * 2 + 1]);
+  return {bbox, score, label};
+}
+
 } // namespace rnexecutorch::utils::computer_vision
diff --git a/packages/react-native-executorch/common/rnexecutorch/utils/computer_vision/Processing.h b/packages/react-native-executorch/common/rnexecutorch/utils/computer_vision/Processing.h
index 3bd3022d4a..20b69792f9 100644
--- a/packages/react-native-executorch/common/rnexecutorch/utils/computer_vision/Processing.h
+++ b/packages/react-native-executorch/common/rnexecutorch/utils/computer_vision/Processing.h
@@ -2,12 +2,25 @@
 
 #include "Types.h"
 #include <algorithm>
+#include <cstdint>
+#include <rnexecutorch/RnExecutorchError.h>
+#include <string>
+#include <tuple>
 #include <vector>
 
 namespace rnexecutorch::utils::computer_vision {
 
 float computeIoU(const BBox &a, const BBox &b);
 
+/// Extracts {bbox, score, label} at index from raw model output buffers.
+/// bboxData layout: [x1, y1, x2, y2] per detection.
+/// scoresData layout: [score, label] per detection.
+std::tuple<BBox, float, int32_t> extractDetectionData(const float *bboxData,
+                                                      const float *scoresData,
+                                                      int32_t index);
+
+void validateThreshold(double value, const std::string &name);
+
 template <typename T>
 std::vector<T> nonMaxSuppression(std::vector<T> items, double iouThreshold) {
   if (items.empty()) {