Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
a29c4e2
feat(models): add preprocessing helpers to VisionModel
benITo47 Apr 7, 2026
db8cb09
refactor(models): migrate 6 vision models to use VisionModel helpers
benITo47 Apr 7, 2026
db5a2a1
feat(models): add multi-method support helpers to BaseModel
benITo47 Apr 7, 2026
251a756
feat(utils): add detection helper utilities to computer_vision/Proces…
benITo47 Apr 7, 2026
a17443d
refactor(ocr): use BaseModel getModelInputSize helper in Detector
benITo47 Apr 7, 2026
afc81b1
refactor(models): add validateAndGetInputShape helper to BaseModel
benITo47 Apr 7, 2026
91d5123
feat: Deduplicate cpp
benITo47 Apr 10, 2026
9766dd8
docs(vision): document normalization usage in VisionModel
benITo47 Apr 10, 2026
1127fd9
feat(utils): add TensorHelpers for type-safe span conversions
benITo47 Apr 10, 2026
570e52f
feat(utils): add extractDetectionData to computer_vision
benITo47 Apr 10, 2026
07a0d02
feat(utils): add inverseRotateBboxes batch helper
benITo47 Apr 10, 2026
2062f57
feat(models): add forwardOrThrow helpers to BaseModel
benITo47 Apr 10, 2026
604d6bf
refactor(classification): use tensor and error utilities
benITo47 Apr 10, 2026
bff7197
refactor(semantic-seg): use tensor and error utilities
benITo47 Apr 10, 2026
cf0567c
refactor(object-detection): use tensor, error, and rotation utilities
benITo47 Apr 10, 2026
02a95b4
refactor(instance-seg): use detection and error utilities
benITo47 Apr 10, 2026
356764e
refactor(style-transfer): use error utilities
benITo47 Apr 10, 2026
4cd6d70
refactor(image-embeddings): use error utilities
benITo47 Apr 10, 2026
10e089c
test: add unit tests for new utility functions
benITo47 Apr 14, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,87 @@ std::size_t BaseModel::getMemoryLowerBound() const noexcept {

void BaseModel::unload() noexcept { module_.reset(nullptr); }

/// Loads methodName into the module, unloading any previously loaded method
/// first. Intended for multi-method models (e.g. "forward_384", "forward_640").
///
/// @param methodName Name of the exported method to load; must be non-empty.
/// @throws RnExecutorchError with InvalidUserInput when methodName is empty,
///         ModuleNotLoaded when the module has not been loaded, or the
///         runtime error code when load_method itself fails.
void BaseModel::ensureMethodLoaded(const std::string &methodName) {
  if (methodName.empty()) {
    throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput,
                            "methodName cannot be empty");
  }
  if (currentlyLoadedMethod_ == methodName) {
    return; // Requested method is already active; nothing to do.
  }
  if (!module_) {
    throw RnExecutorchError(RnExecutorchErrorCode::ModuleNotLoaded,
                            "Model module is not loaded");
  }
  if (!currentlyLoadedMethod_.empty()) {
    module_->unload_method(currentlyLoadedMethod_);
    // Clear immediately: if load_method below fails we must not be left
    // with a stale name pointing at a method that was just unloaded.
    currentlyLoadedMethod_.clear();
  }
  auto loadResult = module_->load_method(methodName);
  if (loadResult != executorch::runtime::Error::Ok) {
    throw RnExecutorchError(
        loadResult, "Failed to load method '" + methodName +
                        "'. Ensure the method exists in the exported model.");
  }
  currentlyLoadedMethod_ = methodName;
}

/// Returns the shape of the model's first input tensor for methodName,
/// validating that the model has at least one input and that the shape has
/// at least minDimensions dimensions.
///
/// @throws RnExecutorchError with UnexpectedNumInputs when the model takes
///         no inputs, or WrongDimensions when the first input is too small.
std::vector<int32_t>
BaseModel::validateAndGetInputShape(const std::string &methodName,
                                    size_t minDimensions) const {
  const auto shapes = getAllInputShapes(methodName);

  if (shapes.empty()) {
    throw RnExecutorchError(RnExecutorchErrorCode::UnexpectedNumInputs,
                            "Model seems to not take any input tensors.");
  }

  const auto &firstShape = shapes.front();
  if (firstShape.size() < minDimensions) {
    std::string message = "Unexpected model input size, expected at least ";
    message += std::to_string(minDimensions);
    message += " dimensions but got: ";
    message += std::to_string(firstShape.size());
    message += ".";
    throw RnExecutorchError(RnExecutorchErrorCode::WrongDimensions, message);
  }

  return firstShape;
}

std::vector<EValue>
BaseModel::forwardOrThrow(const EValue &input,
const std::string &contextMessage) const {
auto result = forward(input);
if (!result.ok()) {
throw RnExecutorchError(result.error(), contextMessage);
}
return std::move(result.get());
}

std::vector<EValue>
BaseModel::forwardOrThrow(const std::vector<EValue> &inputs,
const std::string &contextMessage) const {
auto result = forward(inputs);
if (!result.ok()) {
throw RnExecutorchError(result.error(), contextMessage);
}
return std::move(result.get());
}

std::vector<EValue>
BaseModel::executeOrThrow(const std::string &methodName,
const std::vector<EValue> &inputs,
const std::string &contextMessage) const {
auto result = execute(methodName, inputs);
if (!result.ok()) {
std::string message =
contextMessage.empty()
? "Model " + methodName + " method failed. Ensure input is correct."
: contextMessage;
throw RnExecutorchError(result.error(), message);
}
return std::move(result.get());
}

std::vector<int32_t>
BaseModel::getTensorShape(const executorch::aten::Tensor &tensor) const {
auto sizes = tensor.sizes();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,32 @@ class BaseModel {

std::size_t memorySizeLowerBound{0};

/// Loads methodName, unloading any previously loaded method first.
/// Useful for multi-method models (e.g., "forward_384", "forward_640").
void ensureMethodLoaded(const std::string &methodName);

std::vector<int32_t>
validateAndGetInputShape(const std::string &methodName = "forward",
size_t minDimensions = 2) const;

std::vector<EValue>
forwardOrThrow(const EValue &input,
const std::string &contextMessage =
"Model forward failed. Ensure input is correct.") const;

std::vector<EValue>
forwardOrThrow(const std::vector<EValue> &inputs,
const std::string &contextMessage =
"Model forward failed. Ensure input is correct.") const;

std::vector<EValue>
executeOrThrow(const std::string &methodName,
const std::vector<EValue> &inputs,
const std::string &contextMessage = "") const;

/// Name of the currently loaded method (for multi-method models).
std::string currentlyLoadedMethod_;

private:
std::vector<int32_t>
getTensorShape(const executorch::aten::Tensor &tensor) const;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#include "VisionModel.h"
#include <rnexecutorch/Error.h>
#include <rnexecutorch/ErrorCodes.h>
#include <rnexecutorch/Log.h>
#include <rnexecutorch/data_processing/ImageProcessing.h>
#include <rnexecutorch/utils/FrameProcessor.h>
#include <rnexecutorch/utils/FrameTransform.h>

Expand All @@ -25,6 +27,25 @@ cv::Size VisionModel::modelInputSize() const {
modelInputShape_[modelInputShape_.size() - 2]);
}

/// Returns the spatial input size (width, height) for a specific method,
/// taken from the last two dimensions of its first input shape. Falls back
/// to currentlyLoadedMethod_ when methodName is empty.
///
/// @throws RnExecutorchError with InvalidUserInput when no method can be
///         resolved, UnexpectedNumInputs when the method has no inputs, or
///         WrongDimensions when its first input has fewer than 2 dims.
cv::Size VisionModel::getModelInputSize(const std::string &methodName) const {
  std::string method = methodName.empty() ? currentlyLoadedMethod_ : methodName;
  if (method.empty()) {
    throw RnExecutorchError(
        RnExecutorchErrorCode::InvalidUserInput,
        "No method specified and no method currently loaded");
  }

  auto inputShapes = getAllInputShapes(method);
  if (inputShapes.empty()) {
    throw RnExecutorchError(RnExecutorchErrorCode::UnexpectedNumInputs,
                            "Could not determine input shape for method: " +
                                method);
  }

  const auto &shape = inputShapes[0];
  // A spatial size needs at least {..., H, W}; report a dimension problem
  // with WrongDimensions, consistent with validateAndGetInputShape.
  if (shape.size() < 2) {
    throw RnExecutorchError(RnExecutorchErrorCode::WrongDimensions,
                            "Could not determine input shape for method: " +
                                method);
  }

  // cv::Size is (width, height); shapes end in ..., H, W.
  return cv::Size(shape[shape.size() - 1], shape[shape.size() - 2]);
}

cv::Mat VisionModel::extractFromFrame(jsi::Runtime &runtime,
const jsi::Value &frameData) const {
cv::Mat frame = ::rnexecutorch::utils::frameToMat(runtime, frameData);
Expand All @@ -51,4 +72,46 @@ cv::Mat VisionModel::extractFromPixels(const JSTensorViewIn &tensorView) const {
return ::rnexecutorch::utils::pixelsToMat(tensorView);
}

void VisionModel::initNormalization(const std::vector<float> &normMean,
const std::vector<float> &normStd) {
if (normMean.size() == 3) {
normMean_ = cv::Scalar(normMean[0], normMean[1], normMean[2]);
} else if (!normMean.empty()) {
log(LOG_LEVEL::Warn,
"normMean must have 3 elements — ignoring provided value.");
}

if (normStd.size() == 3) {
normStd_ = cv::Scalar(normStd[0], normStd[1], normStd[2]);
} else if (!normStd.empty()) {
log(LOG_LEVEL::Warn,
"normStd must have 3 elements — ignoring provided value.");
}
}

/// Builds the model input tensor from a preprocessed image.
/// Applies per-channel normalization only when both normMean_ and normStd_
/// were set via initNormalization(); otherwise forwards raw pixel values.
TensorPtr VisionModel::createInputTensor(const cv::Mat &preprocessed) const {
  if (normMean_.has_value() && normStd_.has_value()) {
    return image_processing::getTensorFromMatrix(modelInputShape_, preprocessed,
                                                 *normMean_, *normStd_);
  }
  return image_processing::getTensorFromMatrix(modelInputShape_, preprocessed);
}

/// Reads an image from imageSource and converts it to RGB channel order.
/// OpenCV decodes images as BGR, so a BGR→RGB swap is required before the
/// pixels are fed to a model.
cv::Mat VisionModel::loadImageToRGB(const std::string &imageSource) const {
  cv::Mat decoded = image_processing::readImage(imageSource);
  cv::Mat rgb;
  cv::cvtColor(decoded, rgb, cv::COLOR_BGR2RGB);
  return rgb;
}

/// Extracts a camera frame, rotates it into model orientation, and returns
/// {rotated frame, orientation, size of the frame before rotation}.
/// The pre-rotation size lets callers map model outputs back onto the
/// original frame coordinates.
std::tuple<cv::Mat, utils::FrameOrientation, cv::Size>
VisionModel::loadFrameRotated(jsi::Runtime &runtime,
                              const jsi::Value &frameData) const {
  // Read orientation metadata first, then pull the pixel data.
  const auto orientation = utils::readFrameOrientation(runtime, frameData);
  cv::Mat rawFrame = extractFromFrame(runtime, frameData);
  const cv::Size preRotationSize = rawFrame.size();
  cv::Mat modelReady = utils::rotateFrameForModel(rawFrame, orientation);
  return {modelReady, orientation, preRotationSize};
}

} // namespace rnexecutorch::models
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#pragma once

#include <executorch/extension/tensor/tensor_ptr.h>
#include <jsi/jsi.h>
#include <mutex>
#include <opencv2/opencv.hpp>
Expand All @@ -10,6 +11,8 @@
namespace rnexecutorch {
namespace models {

using executorch::extension::TensorPtr;

/**
* @brief Base class for computer vision models that support real-time camera
* input
Expand All @@ -21,16 +24,35 @@ namespace models {
* Thread Safety:
* - All inference operations are protected by a mutex via scoped_lock
*
* Normalization:
* Subclasses should call initNormalization() with ImageNet mean/std when the
* model expects ImageNet-normalized inputs (e.g., Classification, Detection,
* Segmentation). Skip initNormalization() when the model:
* - Has built-in normalization layers (e.g., some embeddings models)
* - Expects raw pixel values [0, 255] (e.g., StyleTransfer)
* - Uses non-ImageNet normalization (handle in custom preprocess())
*
* The createInputTensor() method safely handles both cases via std::optional.
*
* Usage:
* Subclasses should:
* 1. Inherit from VisionModel instead of BaseModel
* 2. Optionally override preprocess() for model-specific preprocessing
* 3. Implement runInference() which acquires the lock internally
* 2. Call initNormalization() if model expects normalized inputs
* 3. Optionally override preprocess() for model-specific preprocessing
* 4. Implement runInference() which acquires the lock internally
*
* Example:
* @code
* class Classification : public VisionModel {
* public:
* Classification(const std::string& modelSource,
* std::shared_ptr<react::CallInvoker> callInvoker,
* const std::vector<float>& normMean,
* const std::vector<float>& normStd)
* : VisionModel(modelSource, callInvoker) {
* initNormalization(normMean, normStd); // ImageNet normalization
* }
*
* std::unordered_map<std::string_view, float>
* generateFromFrame(jsi::Runtime& runtime, const jsi::Value& frameValue) {
* auto frameObject = frameValue.asObject(runtime);
Expand Down Expand Up @@ -63,6 +85,13 @@ class VisionModel : public BaseModel {
/// Set once by each subclass constructor to avoid per-frame metadata lookups.
std::vector<int32_t> modelInputShape_;

/// Per-channel normalization mean (RGB). nullopt = no normalization applied.
std::optional<cv::Scalar> normMean_;

/// Per-channel normalization std-dev (RGB). nullopt = no normalization
/// applied.
std::optional<cv::Scalar> normStd_;

/**
* @brief Mutex to ensure thread-safe inference
*
Expand Down Expand Up @@ -99,6 +128,35 @@ class VisionModel : public BaseModel {
* sizes.
*/
virtual cv::Size modelInputSize() const;

/**
* @brief Get input size for a specific method (last two shape dims).
*
* Useful for multi-method models with different input sizes per method.
* Falls back to currentlyLoadedMethod_ when methodName is empty.
*/
cv::Size getModelInputSize(const std::string &methodName = "") const;

/**
* @brief Set normMean_/normStd_ from float vectors.
*
* Expects size == 3. Logs a warning and ignores if non-empty but wrong size.
*/
void initNormalization(const std::vector<float> &normMean,
const std::vector<float> &normStd);

/// Builds input tensor from a preprocessed image.
/// Applies normalization if normMean_/normStd_ are set, skips it otherwise.
TensorPtr createInputTensor(const cv::Mat &preprocessed) const;

/// Reads image from path and converts BGR → RGB.
cv::Mat loadImageToRGB(const std::string &imageSource) const;

/// Extracts a camera frame, applies rotation, and returns
/// {rotated frame, orientation, original size}.
std::tuple<cv::Mat, utils::FrameOrientation, cv::Size>
loadFrameRotated(jsi::Runtime &runtime, const jsi::Value &frameData) const;

/**
* @brief Extract an RGB cv::Mat from a VisionCamera frame
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

#include <rnexecutorch/data_processing/ImageProcessing.h>
#include <rnexecutorch/data_processing/Numerical.h>
#include <rnexecutorch/utils/TensorHelpers.h>

namespace rnexecutorch::models::classification {

Expand All @@ -16,65 +17,26 @@ Classification::Classification(const std::string &modelSource,
std::shared_ptr<react::CallInvoker> callInvoker)
: VisionModel(modelSource, callInvoker),
labelNames_(std::move(labelNames)) {
if (normMean.size() == 3) {
normMean_ = cv::Scalar(normMean[0], normMean[1], normMean[2]);
} else if (!normMean.empty()) {
log(LOG_LEVEL::Warn,
"normMean must have 3 elements — ignoring provided value.");
}
if (normStd.size() == 3) {
normStd_ = cv::Scalar(normStd[0], normStd[1], normStd[2]);
} else if (!normStd.empty()) {
log(LOG_LEVEL::Warn,
"normStd must have 3 elements — ignoring provided value.");
}

auto inputShapes = getAllInputShapes();
if (inputShapes.size() == 0) {
throw RnExecutorchError(RnExecutorchErrorCode::UnexpectedNumInputs,
"Model seems to not take any input tensors.");
}
modelInputShape_ = inputShapes[0];
if (modelInputShape_.size() < 2) {
char errorMessage[100];
std::snprintf(errorMessage, sizeof(errorMessage),
"Unexpected model input size, expected at least 2 dimensions "
"but got: %zu.",
modelInputShape_.size());
throw RnExecutorchError(RnExecutorchErrorCode::WrongDimensions,
errorMessage);
}
initNormalization(normMean, normStd);
modelInputShape_ = validateAndGetInputShape();
}

std::unordered_map<std::string_view, float>
Classification::runInference(cv::Mat image) {
std::scoped_lock lock(inference_mutex_);

cv::Mat preprocessed = preprocess(image);
auto inputTensor = createInputTensor(preprocessed);

auto inputTensor =
(normMean_ && normStd_)
? image_processing::getTensorFromMatrix(
modelInputShape_, preprocessed, *normMean_, *normStd_)
: image_processing::getTensorFromMatrix(modelInputShape_,
preprocessed);

auto forwardResult = BaseModel::forward(inputTensor);
if (!forwardResult.ok()) {
throw RnExecutorchError(forwardResult.error(),
"The model's forward function did not succeed. "
"Ensure the model input is correct.");
}
return postprocess(forwardResult->at(0).toTensor());
auto outputs = forwardOrThrow(inputTensor,
"The model's forward function did not succeed. "
"Ensure the model input is correct.");
return postprocess(outputs.at(0).toTensor());
}

std::unordered_map<std::string_view, float>
Classification::generateFromString(std::string imageSource) {
cv::Mat imageBGR = image_processing::readImage(imageSource);

cv::Mat imageRGB;
cv::cvtColor(imageBGR, imageRGB, cv::COLOR_BGR2RGB);

cv::Mat imageRGB = loadImageToRGB(imageSource);
return runInference(imageRGB);
}

Expand All @@ -94,8 +56,7 @@ Classification::generateFromPixels(JSTensorViewIn pixelData) {

std::unordered_map<std::string_view, float>
Classification::postprocess(const Tensor &tensor) {
std::span<const float> resultData(
static_cast<const float *>(tensor.const_data_ptr()), tensor.numel());
auto resultData = utils::tensor::toSpan<float>(tensor);
std::vector<float> resultVec(resultData.begin(), resultData.end());

if (resultVec.size() != labelNames_.size()) {
Expand Down
Loading
Loading