Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ cmake-out*
cmake-out-android/
build-android/
build-x86/
build-hexagon/
dist/
arm-scratch/
executorch.egg-info
Expand Down
11 changes: 11 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,17 @@ project(executorch)

set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR})

# Hexagon toolchain with release build complains about code in third party
# libraries.
# Downgrade these specific diagnostics from errors back to warnings so that
# -Werror builds of third-party code still succeed on Hexagon.
# NOTE(review): CMAKE_BUILD_TYPE is empty under multi-config generators
# (VS/Xcode/Ninja Multi-Config), so this guard only fires for single-config
# "Release" builds — confirm that is the intended scope.
if("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "Hexagon" AND "${CMAKE_BUILD_TYPE}"
   STREQUAL "Release"
)
  # Directory-scoped on purpose: the offending code lives in third-party
  # libraries added below this point, not in a single target we own.
  add_compile_options(
    -Wno-error=format -Wno-error=implicit-int-conversion
    -Wno-error=unused-variable -Wno-error=unused-function
  )
endif()

# --- ExecuTorch Version ---
# Parse version from version.txt (single source of truth)
file(READ "${EXECUTORCH_ROOT}/version.txt" ET_VERSION_STRING)
Expand Down
29 changes: 20 additions & 9 deletions backends/qualcomm/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -77,22 +77,34 @@ if(${ANDROID})
find_library(android_log log)
endif()

add_compile_options("-Wall" "-Werror" "-Wno-sign-compare")
add_compile_options("-Wall" "-Werror" "-fvisibility=hidden")
add_compile_definitions(C10_USING_CUSTOM_GENERATED_MACROS)

# GNU emit wanring for ignored attributes Unfortunately, we use [[maybe_unused]]
# which can be ignored by GNU. So we make it a warning, not an error in GNU.
# GNU emits a warning for ignored attributes. Unfortunately, we use
# [[maybe_unused]], which can be ignored by GNU. So we make it a warning, not
# an error, in GNU.
if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
add_compile_options("-Wno-error=attributes")
add_link_options("-flto=auto")
endif()

if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
# strip symbols
add_link_options("-s")
add_link_options(LINKER:-s,--gc-sections)
if(${CMAKE_SYSTEM_PROCESSOR} MATCHES Hexagon)
add_compile_options(
"-Os"
"-ffunction-sections"
"-fdata-sections"
"-frtti"
"-fno-exceptions"
"-fomit-frame-pointer"
"-fno-asynchronous-unwind-tables"
)
else()

# --gc-sections is added by torch.
add_compile_options("-O3" "-ffunction-sections" "-fdata-sections" "-frtti")
add_compile_options("-O3" "-ffunction-sections" "-fdata-sections" "-frtti")
endif()
endif()

include_directories(
Expand Down Expand Up @@ -230,9 +242,8 @@ target_link_libraries(
qnn_schema shared_buffer qnn_dlc_manager
)
target_link_libraries(
qnn_executorch_backend
PRIVATE qnn_executorch_header qnn_schema qnn_manager executorch_core
extension_tensor qnn_backend_options
qnn_executorch_backend PRIVATE qnn_executorch_header qnn_schema qnn_manager
executorch_core qnn_backend_options
)

if(${CMAKE_SYSTEM_PROCESSOR} MATCHES Hexagon)
Expand Down
2 changes: 1 addition & 1 deletion backends/qualcomm/aot/python/PyQnnManagerAdaptor.h
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,7 @@ class PyQnnManager {
std::vector<std::vector<std::shared_ptr<OpWrapper>>>& op_wrappers) {
QnnExecuTorchContextBinary binary_info;

for (int i = 0; i < graph_names.size(); ++i) {
for (uint32_t i = 0; i < graph_names.size(); ++i) {
if (qnn_manager_->Compile(graph_names[i], op_wrappers[i]) !=
executorch::runtime::Error::Ok) {
QNN_EXECUTORCH_LOG_ERROR("Fail to compile QNN graph");
Expand Down
2 changes: 1 addition & 1 deletion backends/qualcomm/aot/wrappers/TensorWrapper.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ class TensorWrapper {
rank);
return;
}
for (int i = 0; i < rank; ++i) {
for (size_t i = 0; i < rank; ++i) {
QNN_TENSOR_VER_PTR(tensor_)->dimensions[i] = dims[i];
}
}
Expand Down
10 changes: 7 additions & 3 deletions backends/qualcomm/runtime/QnnExecuTorch.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,14 +63,18 @@ struct CustomMemTensorInfo {
/// alignment as MemoryAllocator::kDefaultAlignment.
/// See runtime/core/memory_allocator.h. The function returns a valid pointer
/// if allocation is successful.
void* QnnExecuTorchAllocCustomMem(size_t bytes, size_t alignment);
__attribute__((__visibility__("default"))) void* QnnExecuTorchAllocCustomMem(
size_t bytes,
size_t alignment);

/// Add tensor to custom memory with custom type descriptor. Create memory
/// handle to tensor wrapper during execution
void QnnExecuTorchAddCustomMemTensorAddr(void* tensor_addr, void* custom_mem);
__attribute__((__visibility__("default"))) void
QnnExecuTorchAddCustomMemTensorAddr(void* tensor_addr, void* custom_mem);

/// Free the allocated shared memory.
void QnnExecuTorchFreeCustomMem(void* buffer_ptr);
__attribute__((__visibility__("default"))) void QnnExecuTorchFreeCustomMem(
void* buffer_ptr);

#ifdef __cplusplus
}
Expand Down
28 changes: 20 additions & 8 deletions backends/qualcomm/runtime/QnnManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
#include <executorch/backends/qualcomm/runtime/backends/QnnBackendCommon.h>
#include <executorch/backends/qualcomm/runtime/backends/QnnCustomProtocol.h>
#include <executorch/backends/qualcomm/runtime/backends/QnnImplementation.h>
#include <executorch/extension/tensor/tensor.h>
#include <algorithm>
#include <cstdlib>
#include <cstring>
Expand Down Expand Up @@ -427,19 +426,32 @@ Error QnnManager::Execute(
QNN_TENSOR_VER_PTR(output_tensor)->dimensions +
QNN_TENSOR_VER_PTR(output_tensor)->rank);

auto dump_tensor = executorch::extension::from_blob(
QNN_TENSOR_VER_PTR(output_tensor)->clientBuf.data,
sizes,
// Compute contiguous strides from sizes (e.g. [2,3,4] -> [12,4,1]).
std::vector<executorch::aten::StridesType> stride_size(sizes.size());
if (!sizes.empty()) {
stride_size.back() = 1;
for (int i = sizes.size() - 2; i >= 0; --i) {
stride_size[i] = stride_size[i + 1] * sizes[i + 1];
}
}
// Avoid using from_blob as it significantly increases shared library
// size.
executorch::aten::TensorImpl tensor_impl(
qnn_dtype_to_scalar_type_[QNN_TENSOR_VER_PTR(output_tensor)
->dataType]);
->dataType],
sizes.size(),
sizes.data(),
QNN_TENSOR_VER_PTR(output_tensor)->clientBuf.data,
nullptr,
stride_size.data());

executorch::runtime::event_tracer_log_output_delegate<
executorch::aten::Tensor>(
event_tracer,
QNN_TENSOR_VER_PTR(output_tensor)->name,
/*delegate_debug_id=*/
static_cast<executorch::runtime::DebugHandle>(-1),
*dump_tensor);
executorch::aten::Tensor(&tensor_impl));
}
}

Expand Down Expand Up @@ -547,7 +559,7 @@ Error QnnManager::CompileDlc() {

// Mapping memory address for the input and output of mutable buffer
std::unordered_map<int, const void*> mutable_buffer_id_to_memory_map;
for (int i = 0; i < graphInfo.numInputTensors; ++i) {
for (uint32_t i = 0; i < graphInfo.numInputTensors; ++i) {
auto tw = CreateTensorWrapper(graphInfo.inputTensors[i]);
tw->UpdateQnnTensorMeta(graphInfo.inputTensors[i]);

Expand All @@ -560,7 +572,7 @@ Error QnnManager::CompileDlc() {
}
graph_inputs.push_back(tw);
}
for (int i = 0; i < graphInfo.numOutputTensors; ++i) {
for (uint32_t i = 0; i < graphInfo.numOutputTensors; ++i) {
auto tw = CreateTensorWrapper(graphInfo.outputTensors[i]);
tw->UpdateQnnTensorMeta(graphInfo.outputTensors[i]);
int mutable_buffer_id = ExtractMutableBufferNumber(tw->GetName());
Expand Down
4 changes: 2 additions & 2 deletions backends/qualcomm/runtime/SharedBuffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ std::size_t std::hash<CustomMemTensorInfo>::operator()(
hash_val ^= std::hash<void*>()(info.custom_mem);
hash_val ^= std::hash<size_t>()(info.pos);
hash_val ^= std::hash<size_t>()(info.tensor_bytes);
for (int i = 0; i < info.rank; ++i) {
for (size_t i = 0; i < info.rank; ++i) {
hash_val ^= std::hash<uint32_t>()(info.shape[i]);
}
hash_val ^= std::hash<uint32_t>()(info.rank);
Expand All @@ -36,7 +36,7 @@ bool operator==(
(lhs.tensor_addr == rhs.tensor_addr && lhs.custom_mem == rhs.custom_mem &&
lhs.pos == rhs.pos && lhs.tensor_bytes == rhs.tensor_bytes &&
lhs.rank == rhs.rank && lhs.dtype == rhs.dtype);
for (int i = 0; i < lhs.rank; ++i) {
for (size_t i = 0; i < lhs.rank; ++i) {
is_same &= lhs.shape[i] == rhs.shape[i];
}
return is_same;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,3 +45,4 @@ target_link_libraries(
${HEXAGON_TOOLS_ROOT}/Tools/target/hexagon/lib/${DSP_VERSION}/G0/pic/libc++.so.1
${HEXAGON_TOOLS_ROOT}/Tools/target/hexagon/lib/${DSP_VERSION}/G0/pic/libc++abi.so.1
)
target_compile_options(qnn_executorch_skel PRIVATE "-fvisibility=default")
56 changes: 56 additions & 0 deletions backends/qualcomm/tests/test_qnn_delegate.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
from executorch.backends.qualcomm.debugger.utils import generate_optrace
from executorch.backends.qualcomm.serialization.qc_schema import (
QnnExecuTorchBackendType,
QnnExecuTorchHtpPerformanceMode,
)
from executorch.backends.qualcomm.tests.utils import (
convert_pt2e,
Expand Down Expand Up @@ -4847,6 +4848,33 @@ def setUp(self):
saver=False,
)

def test_qnn_backend_compile_time_option_htp_performance(self):
    """Verify the htp_performance_mode compile-time option reaches the runtime.

    Compiles SimpleModel (fp16) with kHtpHighPowerSaver and scans the runtime
    log for the voltage-corner value that mode produces.
    """
    backend_options = generate_htp_compiler_spec(
        use_fp16=True,
        htp_performance_mode=QnnExecuTorchHtpPerformanceMode.kHtpHighPowerSaver,
    )
    TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec(
        soc_model=self.chipset_table[TestQNN.model],
        backend_options=backend_options,
    )
    module = SimpleModel()  # noqa: F405
    sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28))

    def output_callback(log_msg):
        msg = log_msg.stdout
        # Refer to HtpDevice.cpp for the value emitted under kHtpHighPowerSaver.
        min_voltage = "coreVoltageCornerMin 80"
        # assertIn reports a clearer failure than assertTrue(x in y); the old
        # message also carried a stray space inside the quoted value.
        self.assertIn(min_voltage, msg, f"Expecting '{min_voltage}' in log")

    # Log level 4 makes the runtime print the performance-vote details.
    runtime_extra_commands = " --log_level 4"
    self.lower_module_and_test_output(
        module,
        sample_input,
        extra_cmds=runtime_extra_commands,
        # partial() with no bound arguments is identical to the bare callable.
        output_callback=output_callback,
        save_inference_speed=True,
    )

def test_qnn_backend_dump_intermediate_outputs_topk(self):
TestQNN.dump_intermediate_outputs = True
backend_options = generate_htp_compiler_spec(use_fp16=True)
Expand Down Expand Up @@ -5436,6 +5464,34 @@ def setUp(self):
saver=False,
)

def test_qnn_backend_compile_time_option_htp_performance(self):
backend_options = generate_htp_compiler_spec(
use_fp16=False,
htp_performance_mode=QnnExecuTorchHtpPerformanceMode.kHtpHighPowerSaver,
)
TestQNN.compiler_specs = generate_qnn_executorch_compiler_spec(
soc_model=self.chipset_table[TestQNN.model],
backend_options=backend_options,
)
module = SimpleModel() # noqa: F405
sample_input = (torch.ones(1, 32, 28, 28), torch.ones(1, 32, 28, 28))
module = self.get_qdq_module(module, sample_input)

def output_callback(log_msg):
msg = log_msg.stdout
# Refer to HtpDevice.cpp for the following values
min_voltage = "coreVoltageCornerMin 80"
self.assertTrue(min_voltage in msg, f"Expecting '{min_voltage} ' in log")

runtime_extra_commands = " --log_level 4"
self.lower_module_and_test_output(
module,
sample_input,
extra_cmds=runtime_extra_commands,
output_callback=partial(output_callback),
save_inference_speed=True,
)

def test_qnn_backend_dump_intermediate_outputs_simple_model(self):
TestQNN.dump_intermediate_outputs = True
backend_options = generate_htp_compiler_spec(use_fp16=False)
Expand Down
3 changes: 2 additions & 1 deletion backends/qualcomm/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -994,6 +994,7 @@ def generate_htp_compiler_spec(
use_multi_contexts: bool = False,
use_weight_sharing: bool = False,
use_slc_allocator: bool = False,
htp_performance_mode: QnnExecuTorchHtpPerformanceMode = QnnExecuTorchHtpPerformanceMode.kHtpBurst,
) -> QnnExecuTorchBackendOptions:
"""
Helper function generating backend options for QNN HTP
Expand Down Expand Up @@ -1025,7 +1026,7 @@ def generate_htp_compiler_spec(
# This actually is not an option which can affect the compiled blob.
# But we don't have other place to pass this option at execution stage.
# TODO: enable voting mechanism in runtime and make this as an option
htp_options.performance_mode = QnnExecuTorchHtpPerformanceMode.kHtpBurst
htp_options.performance_mode = htp_performance_mode
htp_options.use_multi_contexts = use_multi_contexts
htp_options.use_weight_sharing = use_weight_sharing
htp_options.use_dlbc = use_dlbc
Expand Down
15 changes: 14 additions & 1 deletion examples/qualcomm/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
from executorch.backends.qualcomm.serialization.qc_schema import (
QcomChipset,
QnnExecuTorchBackendType,
QnnExecuTorchHtpPerformanceMode,
QnnExecuTorchOpPackageOptions,
)
from executorch.backends.qualcomm.utils.constants import (
Expand Down Expand Up @@ -483,6 +484,7 @@ def build_executorch_binary(
optrace=False,
op_package_options: QnnExecuTorchOpPackageOptions = None,
direct_mode_build_path=None,
htp_performance_mode: QnnExecuTorchHtpPerformanceMode = QnnExecuTorchHtpPerformanceMode.kHtpBurst,
):
"""
A function to generate an ExecuTorch binary for Qualcomm platforms.
Expand All @@ -508,6 +510,8 @@ def build_executorch_binary(
optrace (bool, optional): Enable optrace mode for performance analysis if set to True.
op_package_options: Optional structure to specify op packages
loaded and used by the backend.
direct_mode_build_path (string, optional): Path to build folder for direct mode.
htp_performance_mode (QnnExecuTorchHtpPerformanceMode, optional): Option to set the performance mode for htp backend.

Returns:
None: The function writes the output to a specified .pte file.
Expand All @@ -517,7 +521,8 @@ def build_executorch_binary(
backend_options = {
QnnExecuTorchBackendType.kGpuBackend: generate_gpu_compiler_spec(),
QnnExecuTorchBackendType.kHtpBackend: generate_htp_compiler_spec(
use_fp16=False if quant_dtype is not None else True
use_fp16=False if quant_dtype is not None else True,
htp_performance_mode=htp_performance_mode,
),
}[backend]
compile_spec = generate_qnn_executorch_compiler_spec(
Expand Down Expand Up @@ -1038,6 +1043,14 @@ def setup_common_args_and_variables():
type=str,
)

parser.add_argument(
"--htp_performance_mode",
type=int,
choices=list(QnnExecuTorchHtpPerformanceMode),
help="Specify performance mode for htp from 0-8, default to burst(2). For more info, refer to qc_schema.py",
default=2,
)

# QNN_SDK_ROOT might also be an argument, but it is used in various places.
# So maybe it's fine to just use the environment.
if "QNN_SDK_ROOT" not in os.environ:
Expand Down
Loading