diff --git a/omniscidb/QueryEngine/CompilationOptions.h b/omniscidb/QueryEngine/CompilationOptions.h
index 7555039bd..22ba3fa1e 100644
--- a/omniscidb/QueryEngine/CompilationOptions.h
+++ b/omniscidb/QueryEngine/CompilationOptions.h
@@ -25,6 +25,7 @@
 #include
 #endif

+#include <iostream>
 #include "Compiler/CodegenTraitsDescriptor.h"
 #include "Shared/Config.h"
 #include "Shared/DeviceType.h"
@@ -68,6 +69,7 @@ struct CompilationOptions {
     case ExecutorDeviceType::CPU:
       return compiler::cpu_cgen_traits_desc;
     case ExecutorDeviceType::GPU:
+      std::cout << "Check is_l0?" << is_l0 << std::endl;
       return (is_l0 ? compiler::l0_cgen_traits_desc : compiler::cuda_cgen_traits_desc);
   }
   return {};
diff --git a/omniscidb/QueryEngine/Compiler/Backend.cpp b/omniscidb/QueryEngine/Compiler/Backend.cpp
index 67f12faf6..484a4192c 100644
--- a/omniscidb/QueryEngine/Compiler/Backend.cpp
+++ b/omniscidb/QueryEngine/Compiler/Backend.cpp
@@ -63,7 +63,7 @@ CodegenTraits CodegenTraits::get(CodegenTraitsDescriptor codegen_traits_desc) {
   CHECK(descCallingConvToLLVM.find(codegen_traits_desc.conv_) !=
         descCallingConvToLLVM.end());
   return CodegenTraits(codegen_traits_desc.local_addr_space_,
-                       codegen_traits_desc.smem_addr_space_,
+                       codegen_traits_desc.shared_addr_space_,
                        codegen_traits_desc.global_addr_space_,
                        descCallingConvToLLVM.at(codegen_traits_desc.conv_),
                        codegen_traits_desc.triple_);
@@ -1014,6 +1014,7 @@ std::shared_ptr<CompilationContext> L0Backend::generateNativeGPUCode(
   spirv_src->addOperand(llvm::MDNode::get(ctx, spirv_src_ops));

   compiler::optimize_ir(func, module, live_funcs, false /*smem_used*/, co);
+  std::cout << "GenerateNativeGPUCode" << std::endl;

   // Remove the remaining freeze instruction after the optimization
   std::vector<llvm::Instruction*> to_erase;
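The Backend.h hunks that follow rename smem_addr_space_ to shared_addr_space_ and assert that the shared (workgroup-local) address space is 3, as SPIR-V/Level Zero expects. As a standalone illustration (not part of the patch), this is the LLVM API call the traits ultimately boil down to:

    #include <llvm/IR/DerivedTypes.h>
    #include <llvm/IR/LLVMContext.h>
    #include <llvm/IR/Type.h>

    // Builds an "i64 addrspace(3)*" type; addrspace(3) is workgroup/shared
    // memory in the SPIR-V addressing model targeted by the L0 backend.
    llvm::PointerType* make_smem_i64_ptr(llvm::LLVMContext& ctx) {
      return llvm::PointerType::get(llvm::Type::getInt64Ty(ctx), /*AddressSpace=*/3);
    }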
diff --git a/omniscidb/QueryEngine/Compiler/Backend.h b/omniscidb/QueryEngine/Compiler/Backend.h
index 3d5ea3976..c52e834d9 100644
--- a/omniscidb/QueryEngine/Compiler/Backend.h
+++ b/omniscidb/QueryEngine/Compiler/Backend.h
@@ -34,18 +34,18 @@ namespace compiler {
 class CodegenTraits {
   explicit CodegenTraits(unsigned local_addr_space,
-                         unsigned smem_addr_space,
+                         unsigned shared_addr_space,
                          unsigned global_addr_space,
                          llvm::CallingConv::ID calling_conv,
                          llvm::StringRef triple = "")
       : local_addr_space_(local_addr_space)
-      , smem_addr_space_(smem_addr_space)
+      , shared_addr_space_(shared_addr_space)
       , global_addr_space_(global_addr_space)
       , conv_(calling_conv)
       , triple_(triple) {}

   const unsigned local_addr_space_;
-  const unsigned smem_addr_space_;
+  const unsigned shared_addr_space_;
   const unsigned global_addr_space_;
   const llvm::CallingConv::ID conv_;
   const llvm::StringRef triple_;
@@ -60,12 +60,12 @@ class CodegenTraits {
   CodegenTraits& operator=(const CodegenTraits&) = delete;

   static CodegenTraits get(unsigned local_addr_space,
-                           unsigned smem_addr_space,
+                           unsigned shared_addr_space,
                            unsigned global_addr_space,
                            llvm::CallingConv::ID calling_conv,
                            llvm::StringRef triple = "") {
     return CodegenTraits(
-        local_addr_space, smem_addr_space, global_addr_space, calling_conv, triple);
+        local_addr_space, shared_addr_space, global_addr_space, calling_conv, triple);
   }

   static CodegenTraits get(CodegenTraitsDescriptor codegen_traits_desc);
@@ -76,8 +76,10 @@ class CodegenTraits {
                           const std::string triple = "");

   CodegenTraitsDescriptor getDescriptor() {
+    std::cout << "Backend.h getDescriptor() shared_addr_space = " << shared_addr_space_
+              << std::endl;
     return CodegenTraitsDescriptor(local_addr_space_,
-                                   smem_addr_space_,
+                                   shared_addr_space_,
                                    global_addr_space_,
                                    llvmCallingConvToDesc.at(conv_),
                                    triple_.str());
@@ -87,7 +89,9 @@ class CodegenTraits {
     return llvm::PointerType::get(ElementType, local_addr_space_);
   }
   llvm::PointerType* smemPointerType(llvm::Type* ElementType) const {
-    return llvm::PointerType::get(ElementType, smem_addr_space_);
+    std::cout << "Backend.h shared_addr_space = " << shared_addr_space_ << std::endl;
+    assert(shared_addr_space_ == 3);  // Shared mem addr space on Intel GPU is 3
+    return llvm::PointerType::get(ElementType, shared_addr_space_);
   }
   llvm::PointerType* globalPointerType(llvm::Type* ElementType) const {
     return llvm::PointerType::get(ElementType, global_addr_space_);
@@ -234,4 +238,6 @@ void setSharedMemory(ExecutorDeviceType dt,
                      GPUTarget& gpu_target,
                      const std::shared_ptr<Backend>& backend);

+void replace_function(llvm::Module* from, llvm::Module* to, const std::string& fname);
+
 }  // namespace compiler
diff --git a/omniscidb/QueryEngine/Compiler/CodegenTraitsDescriptor.h b/omniscidb/QueryEngine/Compiler/CodegenTraitsDescriptor.h
index 956fbba25..a821c8a35 100644
--- a/omniscidb/QueryEngine/Compiler/CodegenTraitsDescriptor.h
+++ b/omniscidb/QueryEngine/Compiler/CodegenTraitsDescriptor.h
@@ -12,19 +12,22 @@ namespace compiler {
 enum class CallingConvDesc { C, SPIR_FUNC };
 struct CodegenTraitsDescriptor {
   CodegenTraitsDescriptor(unsigned local_addr_space,
-                          unsigned smem_addr_space,
+                          unsigned shared_addr_space,
                           unsigned global_addr_space,
                           CallingConvDesc calling_conv,
                           std::string_view triple)
       : local_addr_space_(local_addr_space)
-      , smem_addr_space_(smem_addr_space)
+      , shared_addr_space_(shared_addr_space)
       , global_addr_space_(global_addr_space)
       , conv_(calling_conv)
       , triple_(triple) {}
-  CodegenTraitsDescriptor(){};
+  CodegenTraitsDescriptor() {
+    std::cout << "CodegenTraitsDescriptor shared_addr_space=" << shared_addr_space_
+              << std::endl;
+  };
   unsigned local_addr_space_{0};
-  unsigned smem_addr_space_{0};
+  unsigned shared_addr_space_{0};
   unsigned global_addr_space_{0};
   CallingConvDesc conv_;
   std::string_view triple_{"DUMMY"};
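Backend.h above only declares compiler::replace_function; its definition is not part of this diff. A minimal, hypothetical sketch of what such a cross-module body replacement could look like with LLVM's CloneFunctionInto (argument mapping follows the standard cloning recipe; the real definition lives in Backend.cpp and may differ, and the exact CloneFunctionInto signature depends on the LLVM version):

    #include <llvm/ADT/SmallVector.h>
    #include <llvm/IR/Module.h>
    #include <llvm/Transforms/Utils/Cloning.h>
    #include <llvm/Transforms/Utils/ValueMapper.h>
    #include <string>

    // Hypothetical sketch: copy the body of `fname` from one module over the
    // matching declaration in another. Error handling is elided.
    void replace_function_sketch(llvm::Module* from, llvm::Module* to, const std::string& fname) {
      llvm::Function* src = from->getFunction(fname);
      llvm::Function* dst = to->getFunction(fname);
      llvm::ValueToValueMapTy vmap;
      auto dst_arg_it = dst->arg_begin();
      for (auto& src_arg : src->args()) {
        // map each source argument onto the destination declaration's argument
        dst_arg_it->setName(src_arg.getName());
        vmap[&src_arg] = &*dst_arg_it++;
      }
      llvm::SmallVector<llvm::ReturnInst*, 8> returns;
      llvm::CloneFunctionInto(
          dst, src, vmap, llvm::CloneFunctionChangeType::DifferentModule, returns);
    }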
std::cout << "traits_" << traits_; + llvm::Value* init_value_ll = nullptr; if (slot_size == sizeof(int32_t)) { init_value_ll = diff --git a/omniscidb/Tests/CMakeLists.txt b/omniscidb/Tests/CMakeLists.txt index 30ca53fcc..d497db149 100644 --- a/omniscidb/Tests/CMakeLists.txt +++ b/omniscidb/Tests/CMakeLists.txt @@ -60,6 +60,7 @@ endif() add_executable(StringDictionaryBenchmark StringDictionaryBenchmark.cpp) if(ENABLE_L0) + add_executable(GpuSharedMemoryTestIntel GpuSharedMemoryTestIntel.cpp ResultSetTestUtils.cpp) add_executable(L0MgrExecuteTest L0MgrExecuteTest.cpp) add_executable(SpirvBuildTest SpirvBuildTest.cpp) add_executable(DataMgrWithL0Test DataMgrWithL0Test.cpp) @@ -69,11 +70,13 @@ if(ENABLE_L0) target_link_libraries(SpirvBuildTest gtest ${llvm_libs}) target_link_libraries(DataMgrWithL0Test DataMgr gtest) target_link_libraries(IntelGPUEnablingTest gtest QueryEngine ArrowQueryRunner) + target_link_libraries(GpuSharedMemoryTestIntel gtest QueryEngine ArrowQueryRunner) add_test(L0MgrExecuteTest L0MgrExecuteTest ${TEST_ARGS}) add_test(SpirvBuildTest SpirvBuildTest ${TEST_ARGS}) add_test(DataMgrWithL0Test DataMgrWithL0Test ${TEST_ARGS}) add_test(IntelGPUEnablingTest IntelGPUEnablingTest ${TEST_ARGS}) + add_test(GpuSharedMemoryTestIntel GpuSharedMemoryTestIntel ${TEST_ARGS}) endif() add_executable(CostModelTest CostModel/CostModelTest.cpp) @@ -134,6 +137,10 @@ else() target_link_libraries(StringDictionaryBenchmark benchmark gtest StringDictionary Logger Utils $<$,$,9.0>>:stdc++fs> ${CMAKE_DL_LIBS} ${Boost_LIBRARIES} ${ZLIB_LIBRARIES}) endif() +if(ENABLE_L0) + target_link_libraries(GpuSharedMemoryTestIntel gtest Logger QueryEngine) +endif() + if(ENABLE_CUDA) target_link_libraries(GpuSharedMemoryTest gtest Logger QueryEngine) endif() @@ -180,6 +187,7 @@ if(ENABLE_CUDA) add_test(GpuSharedMemoryTest GpuSharedMemoryTest ${TEST_ARGS}) endif() if(ENABLE_L0) + add_test(GpuSharedMemoryTestIntel GpuSharedMemoryTestIntel ${TEST_ARGS}) add_test(NAME PuntToCpu COMMAND ArrowBasedExecuteTest "--gtest_filter=Select.Punt*" ${TEST_ARGS}) set_tests_properties(PuntToCpu PROPERTIES LABELS "enabling") add_test(NAME StreamingTopNFallback COMMAND ArrowBasedExecuteTest "--gtest_filter=Select.TopKHeap:Select.TimeInterval:Select.OverflowAndUnderFlow" ${TEST_ARGS}) @@ -236,6 +244,10 @@ if(ENABLE_CUDA) list(APPEND TEST_PROGRAMS GpuSharedMemoryTest) endif() +if(ENABLE_L0) + list(APPEND TEST_PROGRAMS GpuSharedMemoryTestIntel) +endif() + #if(NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Darwin") # list(APPEND TEST_PROGRAMS UdfTest) #endif() @@ -323,6 +335,7 @@ add_custom_target(topk_tests if(ENABLE_L0) set(ENABLING_TESTS + GpuSharedMemoryTestIntel ArrowStorageSqlTest # taxi queries SpirvBuildTest L0MgrExecuteTest diff --git a/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp b/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp new file mode 100644 index 000000000..6e627bc3a --- /dev/null +++ b/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp @@ -0,0 +1,827 @@ +/* + * Copyright 2019 OmniSci, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
diff --git a/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp b/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp
new file mode 100644
index 000000000..6e627bc3a
--- /dev/null
+++ b/omniscidb/Tests/GpuSharedMemoryTestIntel.cpp
@@ -0,0 +1,827 @@
+/*
+ * Copyright 2019 OmniSci, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "GpuSharedMemoryTestIntel.h"
+#include
+#include "QueryEngine/CompilationOptions.h"
+#include "QueryEngine/Compiler/CodegenTraitsDescriptor.h"
+#include "QueryEngine/Compiler/HelperFunctions.h"
+#include "QueryEngine/LLVMGlobalContext.h"
+#include "QueryEngine/OutputBufferInitialization.h"
+#include "QueryEngine/ResultSetReduction.h"
+#include "QueryEngine/ResultSetReductionJIT.h"
+
+extern bool g_is_test_env;
+
+auto int8_type = hdk::ir::Context::defaultCtx().int8();
+auto int16_type = hdk::ir::Context::defaultCtx().int16();
+auto int32_type = hdk::ir::Context::defaultCtx().int32();
+auto int64_type = hdk::ir::Context::defaultCtx().int64();
+auto float_type = hdk::ir::Context::defaultCtx().fp32();
+auto double_type = hdk::ir::Context::defaultCtx().fp64();
+
+namespace {
+
+compiler::CodegenTraits get_codegen_traits() {
+  std::cout << "codegen_traits " << compiler::l0_cgen_traits_desc.shared_addr_space_
+            << std::endl;
+  return compiler::CodegenTraits::get(compiler::l0_cgen_traits_desc);
+}
+
+void init_storage_buffer(int8_t* buffer,
+                         const std::vector<TargetInfo>& targets,
+                         const QueryMemoryDescriptor& query_mem_desc) {
+  // get the initial values for all the aggregate columns
+  const auto init_agg_vals = init_agg_val_vec(targets, query_mem_desc);
+  CHECK(!query_mem_desc.didOutputColumnar());
+  CHECK(query_mem_desc.getQueryDescriptionType() ==
+        QueryDescriptionType::GroupByPerfectHash);
+
+  const auto row_size = query_mem_desc.getRowSize();
+  CHECK(query_mem_desc.hasKeylessHash());
+  for (size_t entry_idx = 0; entry_idx < query_mem_desc.getEntryCount(); ++entry_idx) {
+    const auto row_ptr = buffer + entry_idx * row_size;
+    size_t init_agg_idx{0};
+    int64_t init_val{0};
+    // initialize each row's aggregate columns:
+    auto col_ptr = row_ptr + query_mem_desc.getColOffInBytes(0);
+    for (size_t slot_idx = 0; slot_idx < query_mem_desc.getSlotCount(); slot_idx++) {
+      if (query_mem_desc.getPaddedSlotWidthBytes(slot_idx) > 0) {
+        init_val = init_agg_vals[init_agg_idx++];
+      }
+      switch (query_mem_desc.getPaddedSlotWidthBytes(slot_idx)) {
+        case 4:
+          *reinterpret_cast<int32_t*>(col_ptr) = static_cast<int32_t>(init_val);
+          break;
+        case 8:
+          *reinterpret_cast<int64_t*>(col_ptr) = init_val;
+          break;
+        case 0:
+          break;
+        default:
+          UNREACHABLE();
+      }
+      col_ptr += query_mem_desc.getNextColOffInBytes(col_ptr, entry_idx, slot_idx);
+    }
+  }
+}
+
+}  // namespace
+void GpuReductionTester::codegenWrapperKernel() {
+  const unsigned address_space = 1;
+  auto pi8_type = llvm::Type::getInt8PtrTy(context_, address_space);
+  std::vector<llvm::Type*> input_arguments;
+  input_arguments.push_back(llvm::PointerType::get(pi8_type, address_space));
+  input_arguments.push_back(llvm::Type::getInt64Ty(context_));  // num input buffers
+  input_arguments.push_back(llvm::Type::getInt8PtrTy(context_, address_space));
+
+  llvm::FunctionType* ft =
+      llvm::FunctionType::get(llvm::Type::getVoidTy(context_), input_arguments, false);
+  wrapper_kernel_ = llvm::Function::Create(
+      ft, llvm::Function::ExternalLinkage, "wrapper_kernel", module_);
+
+  wrapper_kernel_->setCallingConv(llvm::CallingConv::SPIR_KERNEL);
+
+  auto arg_it = wrapper_kernel_->arg_begin();
+  auto input_ptrs = &*arg_it;
+  input_ptrs->setName("input_pointers");
+  arg_it++;
+  auto num_buffers = &*arg_it;
+  num_buffers->setName("num_buffers");
+  arg_it++;
+  auto output_buffer = &*arg_it;
+  output_buffer->setName("output_buffer");
+
+  llvm::IRBuilder<> ir_builder(context_);
+
+  auto bb_entry = llvm::BasicBlock::Create(context_, ".entry", wrapper_kernel_);
+  auto bb_body = llvm::BasicBlock::Create(context_, ".body", wrapper_kernel_);
+  auto bb_exit = llvm::BasicBlock::Create(context_, ".exit", wrapper_kernel_);
+
+  // return if blockIdx.x > num_buffers
+  ir_builder.SetInsertPoint(bb_entry);
+  auto get_block_index_func = getFunction("get_block_index");
+  auto block_index = ir_builder.CreateCall(get_block_index_func, {}, "block_index");
+  const auto is_block_inbound =
+      ir_builder.CreateICmpSLT(block_index, num_buffers, "is_block_inbound");
+  ir_builder.CreateCondBr(is_block_inbound, bb_body, bb_exit);
+
+  // locate the corresponding input buffer:
+  ir_builder.SetInsertPoint(bb_body);
+  auto input_buffer_gep = ir_builder.CreateGEP(
+      input_ptrs->getType()->getScalarType()->getPointerElementType(),
+      input_ptrs,
+      block_index);
+  auto input_buffer = ir_builder.CreateLoad(
+      llvm::Type::getInt8PtrTy(context_, address_space), input_buffer_gep);
+  std::cout << "Addr space input_buffer_ptr=" << address_space << std::endl;
+  auto input_buffer_ptr = ir_builder.CreatePointerCast(
+      input_buffer, llvm::Type::getInt64PtrTy(context_, 4), "input_buffer_ptr");
+  const auto buffer_size = ll_int(
+      static_cast<int32_t>(query_mem_desc_.getBufferSizeBytes(ExecutorDeviceType::GPU)),
+      context_);
+
+  // initializing shared memory and copy input buffer into shared memory buffer:
+  auto init_smem_func = getFunction("init_smem_func");
+  auto smem_input_buffer_ptr = ir_builder.CreateCall(init_smem_func,
+                                                     {
+                                                         input_buffer_ptr,
+                                                         buffer_size,
+                                                     },
+                                                     "smem_input_buffer_ptr");
+
+  auto output_buffer_ptr = ir_builder.CreatePointerCast(
+      output_buffer, llvm::Type::getInt64PtrTy(context_, 4), "output_buffer_ptr");
+
+  // call the reduction function
+  CHECK(reduction_func_);
+  std::vector<llvm::Value*> reduction_args{
+      output_buffer_ptr, smem_input_buffer_ptr, buffer_size};
+  ir_builder.CreateCall(reduction_func_, reduction_args);
+  ir_builder.CreateBr(bb_exit);
+
+  ir_builder.SetInsertPoint(bb_exit);
+  ir_builder.CreateRet(nullptr);
+}
+
+namespace {
+
+std::unique_ptr<L0DeviceCompilationContext> compile_and_link_gpu_code(
+    const std::string& l0_llir,
+    llvm::Module* module,
+    l0::L0Manager* l0_mgr,
+    const std::string& kernel_name,
+    const size_t gpu_block_size = 256,  // 256 for iGPU, 1024 for PVC
+    const size_t gpu_device_idx = 0) {
+  CHECK(module);
+  CHECK(l0_mgr);
+
+  SPIRV::TranslatorOpts opts;
+  opts.enableAllExtensions();
+  opts.setDesiredBIsRepresentation(SPIRV::BIsRepresentation::OpenCL12);
+  opts.setDebugInfoEIS(SPIRV::DebugInfoEIS::OpenCL_DebugInfo_100);
+
+  std::cout << "compile_and_link_gpu_code - before writeSpirv" << std::endl;
+
+  std::ostringstream ss;
+  std::string err;
+  auto success = writeSpirv(module, opts, ss, err);
+  CHECK(success) << "Spirv translation failed with error: " << err << "\n";
+
+  std::cout << "compile_and_link_gpu_code - before spv_to_bin" << std::endl;
+
+  L0BinResult bin_result;
+  DUMP_MODULE(module, "before_spv_to_bin.ll");
+  bin_result = spv_to_bin(ss.str(), kernel_name, gpu_block_size, l0_mgr);
+
+  std::cout << "compile_and_link_gpu_code - after spv_to_bin" << std::endl;
+
+  auto l0_context = std::make_unique<L0DeviceCompilationContext>(
+      bin_result.device, bin_result.kernel, bin_result.module, l0_mgr, 0, 1);
+
+  return l0_context;
+}
+
+std::vector<std::unique_ptr<ResultSet>> create_and_fill_input_result_sets(
+    const size_t num_input_buffers,
+    std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
+    const QueryMemoryDescriptor& query_mem_desc,
+    const std::vector<TargetInfo>& target_infos,
+    std::vector<StrideNumberGenerator>& generators,
+    const std::vector<size_t>& steps) {
+  std::vector<std::unique_ptr<ResultSet>> result_sets;
+  for (size_t i = 0; i < num_input_buffers; i++) {
+    result_sets.push_back(std::make_unique<ResultSet>(target_infos,
+                                                      ExecutorDeviceType::CPU,
+                                                      query_mem_desc,
+                                                      row_set_mem_owner,
+                                                      nullptr,
+                                                      0,
+                                                      0));
+    const auto storage = result_sets.back()->allocateStorage();
+    fill_storage_buffer(storage->getUnderlyingBuffer(),
+                        target_infos,
+                        query_mem_desc,
+                        generators[i],
+                        steps[i]);
+  }
+  return result_sets;
+}
+std::pair<std::unique_ptr<ResultSet>, std::unique_ptr<ResultSet>>
+create_and_init_output_result_sets(std::shared_ptr<RowSetMemoryOwner> row_set_mem_owner,
+                                   const QueryMemoryDescriptor& query_mem_desc,
+                                   const std::vector<TargetInfo>& target_infos) {
+  // CPU result set, will eventually host CPU reduction results for validations
+  auto cpu_result_set = std::make_unique<ResultSet>(target_infos,
+                                                    ExecutorDeviceType::CPU,
+                                                    query_mem_desc,
+                                                    row_set_mem_owner,
+                                                    nullptr,
+                                                    0,
+                                                    0);
+  auto cpu_storage_result = cpu_result_set->allocateStorage();
+  init_storage_buffer(
+      cpu_storage_result->getUnderlyingBuffer(), target_infos, query_mem_desc);
+
+  // GPU result set, will eventually host GPU reduction results
+  auto gpu_result_set = std::make_unique<ResultSet>(target_infos,
+                                                    ExecutorDeviceType::GPU,
+                                                    query_mem_desc,
+                                                    row_set_mem_owner,
+                                                    nullptr,
+                                                    0,
+                                                    0);
+  auto gpu_storage_result = gpu_result_set->allocateStorage();
+  init_storage_buffer(
+      gpu_storage_result->getUnderlyingBuffer(), target_infos, query_mem_desc);
+  return std::make_pair(std::move(cpu_result_set), std::move(gpu_result_set));
+}
+
+void perform_reduction_on_cpu(std::vector<std::unique_ptr<ResultSet>>& result_sets,
+                              const ResultSetStorage* cpu_result_storage) {
+  CHECK(result_sets.size() > 0);
+  Config config;
+  // for codegen only
+  auto executor = Executor::getExecutor(nullptr);
+  ResultSetReductionJIT reduction_jit(result_sets.front()->getQueryMemDesc(),
+                                      result_sets.front()->getTargetInfos(),
+                                      result_sets.front()->getTargetInitVals(),
+                                      config,
+                                      executor.get());
+  const auto reduction_code = reduction_jit.codegen();
+  for (auto& result_set : result_sets) {
+    ResultSetReduction::reduce(*cpu_result_storage,
+                               *(result_set->getStorage()),
+                               {},
+                               reduction_code,
+                               config,
+                               executor.get());
+  }
+}
+
+struct TestInputData {
+  size_t device_id;
+  size_t num_input_buffers;
+  std::vector<TargetInfo> target_infos;
+  int8_t suggested_agg_widths;
+  size_t min_entry;
+  size_t max_entry;
+  size_t step_size;
+  bool keyless_hash;
+  int32_t target_index_for_key;
+  TestInputData()
+      : device_id(0)
+      , num_input_buffers(0)
+      , suggested_agg_widths(0)
+      , min_entry(0)
+      , max_entry(0)
+      , step_size(2)
+      , keyless_hash(false)
+      , target_index_for_key(0) {}
+  TestInputData& setDeviceId(const size_t id) {
+    device_id = id;
+    return *this;
+  }
+  TestInputData& setNumInputBuffers(size_t num_buffers) {
+    num_input_buffers = num_buffers;
+    return *this;
+  }
+  TestInputData& setTargetInfos(std::vector<TargetInfo> tis) {
+    target_infos = tis;
+    return *this;
+  }
+  TestInputData& setAggWidth(int8_t agg_width) {
+    suggested_agg_widths = agg_width;
+    return *this;
+  }
+  TestInputData& setMinEntry(size_t min_e) {
+    min_entry = min_e;
+    return *this;
+  }
+  TestInputData& setMaxEntry(size_t max_e) {
+    max_entry = max_e;
+    return *this;
+  }
+  TestInputData& setKeylessHash(bool is_keyless) {
+    keyless_hash = is_keyless;
+    return *this;
+  }
+  TestInputData& setTargetIndexForKey(size_t target_idx) {
+    target_index_for_key = target_idx;
+    return *this;
+  }
+  TestInputData& setStepSize(size_t step) {
+    step_size = step;
+    return *this;
+  }
+};
+void perform_test_and_verify_results(TestInputData input) {
+  auto executor = Executor::getExecutor(nullptr, nullptr);
+  auto& context = executor->getContext();
+  auto cgen_state = std::unique_ptr<CgenState>(
+      new CgenState({}, false, false, executor->getExtensionModuleContext(), context));
+  cgen_state->set_module_shallow_copy(
+      executor->getExtensionModuleContext()->getRTModule(/*is_l0=*/true));
+  auto module = cgen_state->module_;
+  module->setDataLayout(
+      "e-p:64:64:64-i1:8:8-i8:8:8-"
+      "i16:16:16-i32:32:32-i64:64:64-"
+      "f32:32:32-f64:64:64-v16:16:16-"
+      "v32:32:32-v64:64:64-v128:128:128-n16:32:64");
+  module->setTargetTriple("spir64-unknown-unknown");
+  auto l0_mgr = std::make_unique<l0::L0Manager>();
+  const auto row_set_mem_owner =
+      std::make_shared<RowSetMemoryOwner>(nullptr, Executor::getArenaBlockSize());
+  auto query_mem_desc = perfect_hash_one_col_desc(
+      input.target_infos, input.suggested_agg_widths, input.min_entry, input.max_entry);
+  if (input.keyless_hash) {
+    query_mem_desc.setHasKeylessHash(true);
+    query_mem_desc.setTargetIdxForKey(input.target_index_for_key);
+  }
+
+  std::vector<StrideNumberGenerator> generators(
+      input.num_input_buffers, StrideNumberGenerator(1, input.step_size));
+  std::vector<size_t> steps(input.num_input_buffers, input.step_size);
+
+  std::cout << "GpuSharedMemoryTestIntel create_and_fill_input_result_sets START"
+            << std::endl;
+  auto input_result_sets = create_and_fill_input_result_sets(input.num_input_buffers,
+                                                             row_set_mem_owner,
+                                                             query_mem_desc,
+                                                             input.target_infos,
+                                                             generators,
+                                                             steps);
+  std::cout << "GpuSharedMemoryTestIntel create_and_fill_input_result_sets END"
+            << std::endl;
+
+  std::cout << "GpuSharedMemoryTestIntel create_and_init_output_result_sets START"
+            << std::endl;
+  const auto [cpu_result_set, gpu_result_set] = create_and_init_output_result_sets(
+      row_set_mem_owner, query_mem_desc, input.target_infos);
+  std::cout << "GpuSharedMemoryTestIntel create_and_init_output_result_sets END"
+            << std::endl;
+
+  // performing reduction using the GPU reduction code:
+  Config config;
+  std::cout << "GpuSharedMemoryTestIntel gpu_smem_tester START" << std::endl;
+  GpuReductionTester gpu_smem_tester(config,
+                                     module,
+                                     context,
+                                     query_mem_desc,
+                                     input.target_infos,
+                                     init_agg_val_vec(input.target_infos, query_mem_desc),
+                                     l0_mgr.get(),
+                                     get_codegen_traits(),
+                                     executor.get());
+  std::cout << "GpuSharedMemoryTestIntel gpu_smem_tester END" << std::endl;
+  std::cout << "GpuSharedMemoryTestIntel codegen START" << std::endl;
+  gpu_smem_tester.codegen(CompilationOptions::defaults(
+      ExecutorDeviceType::GPU,
+      true));  // generate code for gpu reduction and initialization
+  std::cout << "GpuSharedMemoryTestIntel codegen END" << std::endl;
+
+  std::cout << "GpuSharedMemoryTestIntel codegenWrapperKernel START" << std::endl;
+  DUMP_MODULE(module, "before.codegenWrapperKernel.ll")
+  gpu_smem_tester.codegenWrapperKernel();
+  std::cout << "GpuSharedMemoryTestIntel codegenWrapperKernel END" << std::endl;
+  DUMP_MODULE(module, "after.codegenWrapperKernel.ll")
+
+  std::cout << "GpuSharedMemoryTestIntel performReductionTest START" << std::endl;
+  gpu_smem_tester.performReductionTest(
+      input_result_sets, gpu_result_set->getStorage(), input.device_id);
+  std::cout << "GpuSharedMemoryTestIntel performReductionTest END" << std::endl;
+
+  // CPU reduction for validation:
+  perform_reduction_on_cpu(input_result_sets, cpu_result_set->getStorage());
+
+  const auto cmp_result =
+      std::memcmp(cpu_result_set->getStorage()->getUnderlyingBuffer(),
+                  gpu_result_set->getStorage()->getUnderlyingBuffer(),
+                  query_mem_desc.getBufferSizeBytes(ExecutorDeviceType::GPU));
+  ASSERT_EQ(cmp_result, 0);
+}
+
+}  // namespace
+
+void insert_globals(llvm::Module* from, llvm::Module* to) {
+  for (const llvm::GlobalVariable& I : from->globals()) {
+    std::cerr << "Adding global " << I.getName().str() << std::endl;
+    llvm::GlobalVariable* new_gv =
+        new llvm::GlobalVariable(*to,
+                                 I.getValueType(),
+                                 I.isConstant(),
+                                 I.getLinkage(),
+                                 (llvm::Constant*)nullptr,
+                                 I.getName(),
+                                 (llvm::GlobalVariable*)nullptr,
+                                 I.getThreadLocalMode(),
+                                 I.getType()->getAddressSpace());
+    new_gv->copyAttributesFrom(&I);
+  }
+}
+
+void insert_declaration_tmp(llvm::Module* from,
+                            llvm::Module* to,
+                            const std::string& fname) {
+  auto fn = from->getFunction(fname);
+  CHECK(fn);
+
+  llvm::Function::Create(
+      fn->getFunctionType(), llvm::GlobalValue::ExternalLinkage, fn->getName(), *to);
+}
+void GpuReductionTester::performReductionTest(
+    const std::vector<std::unique_ptr<ResultSet>>& result_sets,
+    const ResultSetStorage* gpu_result_storage,
+    const size_t device_id) {
+  DUMP_MODULE(module_, "gen.ll");
+
+  auto& ext_module = executor_->getExtensionModuleContext()->getSpirvHelperFuncModule();
+
+  DUMP_MODULE(module_, "after.linking.before.insert_declaration.spirv.ll")
+
+  insert_globals(ext_module.get(), module_);
+  for (auto& F : *ext_module) {
+    insert_declaration_tmp(ext_module.get(), module_, F.getName().str());
+  }
+
+  DUMP_MODULE(module_, "after.insert_global.spirv.ll")
+
+  for (auto& F : *ext_module) {
+    if (!F.isDeclaration()) {
+      compiler::replace_function(ext_module.get(), module_, F.getName().str());
+    }
+  }
+
+  // set proper calling conv & mangle spirv built-ins
+  for (auto& Fn : *module_) {
+    Fn.setCallingConv(llvm::CallingConv::SPIR_FUNC);
+    if (Fn.getName().startswith("__spirv_")) {
+      CHECK(Fn.isDeclaration());
+      Fn.setName(compiler::mangle_spirv_builtin(Fn));
+    }
+  }
+
+  std::cout << "PerformReductionTest - before calling conv" << std::endl;
+
+  for (auto& Fn : *module_) {
+    for (auto I = llvm::inst_begin(Fn), E = llvm::inst_end(Fn); I != E; ++I) {
+      if (auto* CI = llvm::dyn_cast<llvm::CallInst>(&*I)) {
+        CI->setCallingConv(llvm::CallingConv::SPIR_FUNC);
+      }
+    }
+  }
+
+  // wrapper_func->setCallingConv(llvm::CallingConv::SPIR_KERNEL);  // todo fix
+
+  // Initialize shared memory buffer
+  const auto slm_buffer = module_->getNamedGlobal("slm.buf.i64");
+  CHECK(slm_buffer);
+  llvm::ArrayType* ArrayTy_0 =
+      llvm::ArrayType::get(llvm::IntegerType::get(module_->getContext(), 64), 1024);
+  llvm::ConstantAggregateZero* const_array = llvm::ConstantAggregateZero::get(ArrayTy_0);
+  slm_buffer->setInitializer(const_array);
+
+#ifdef DEBUG
+  // Check global string name
+  for (llvm::GlobalVariable& G : module_->getGlobalList()) {
+    // std::ostringstream oss;
+    std::cerr << " global var = " << G.getName().str() << std::endl;
+  }
+#endif
+
+  DUMP_MODULE(module_, "after.linking.spirv.ll")
+  std::cout << "PerformReductionTest - after linking" << std::endl;
+
+  // Fix wrapper_kernel calling convention
+  auto fn = module_->getFunction("wrapper_kernel");
+  fn->setCallingConv(llvm::CallingConv::SPIR_KERNEL);
+
+  DUMP_MODULE(module_, "after.fixing.callingconv.ll")
+  llvm::LLVMContext& ctx = module_->getContext();
+  // set metadata -- pretend we're opencl (see
+  // https://github.com/KhronosGroup/SPIRV-LLVM-Translator/blob/master/docs/SPIRVRepresentationInLLVM.rst#spir-v-instructions-mapped-to-llvm-metadata)
+  llvm::Metadata* spirv_src_ops[] = {
+      llvm::ConstantAsMetadata::get(
+          llvm::ConstantInt::get(llvm::Type::getInt32Ty(ctx), 3 /*OpenCL_C*/)),
+      llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(llvm::Type::getInt32Ty(ctx),
+                                                           102000 /*OpenCL ver 1.2*/))};
+  llvm::NamedMDNode* spirv_src = module_->getOrInsertNamedMetadata("spirv.Source");
+  spirv_src->addOperand(llvm::MDNode::get(ctx, spirv_src_ops));
+  std::cout << "after add metadata" << std::endl;
+
+  // TODO add optimize?
+  // for (auto& Fn : *module_) {
+  // cgen_state_->query_func_ = query_func
+  // auto module = cgen_state->module_;
+
+  auto cgen_state = std::unique_ptr<CgenState>(
+      new CgenState({}, false, false, executor_->getExtensionModuleContext(), ctx));
+  cgen_state->set_module_shallow_copy(
+      executor_->getExtensionModuleContext()->getRTModule(/*is_l0=*/true));
+
+  // if optimize = remove wrapper_kernel function
+  // compiler::optimize_ir(cgen_state->query_func_,
+  //                       module_,
+  //                       {},
+  //                       true /*smem_used*/,
+  //                       CompilationOptions::defaults(ExecutorDeviceType::GPU, true));
+  // DUMP_MODULE(module_, "after.optimize_ir.ll")
+
+  std::stringstream ss;
+  llvm::raw_os_ostream os(ss);
+  module_->print(os, nullptr);
+  os.flush();
+  std::string module_str(ss.str());
+
+  std::cout << "PerformReductionTest - before compile_and_link_gpu_code" << std::endl;
+
+  std::unique_ptr<L0DeviceCompilationContext> gpu_context(compile_and_link_gpu_code(
+      module_str, module_, l0_mgr_, getWrapperKernel()->getName().str()));
+  std::cout << "PerformReductionTest - after compile_and_link_gpu_code" << std::endl;
+
+  DUMP_MODULE(module_, "after.compile_link_gpu_code.ll")
+
+  CHECK_EQ(getWrapperKernel()->getName().str(), "wrapper_kernel");
+  // Code generation done
+
+  const auto buffer_size = query_mem_desc_.getBufferSizeBytes(ExecutorDeviceType::GPU);
+  const size_t num_buffers = result_sets.size();
+  std::vector<int8_t*> d_input_buffers;
+  for (size_t i = 0; i < num_buffers; i++) {
+    d_input_buffers.push_back(l0_mgr_->allocateDeviceMem(buffer_size, device_id));
+    l0_mgr_->copyHostToDevice(d_input_buffers[i],
+                              result_sets[i]->getStorage()->getUnderlyingBuffer(),
+                              buffer_size,
+                              device_id);
+  }
+
+  constexpr size_t num_kernel_params = 3;
+  CHECK_EQ(getWrapperKernel()->arg_size(), num_kernel_params);
+
+  // parameter 1: an array of device pointers
+  typedef int8_t* L0deviceptr;
+  std::vector<L0deviceptr> h_input_buffer_dptrs;
+  h_input_buffer_dptrs.reserve(num_buffers);
+  std::transform(d_input_buffers.begin(),
+                 d_input_buffers.end(),
+                 std::back_inserter(h_input_buffer_dptrs),
+                 [](int8_t* dptr) { return reinterpret_cast<L0deviceptr>(dptr); });
+
+  auto d_input_buffer_dptrs =
+      l0_mgr_->allocateDeviceMem(num_buffers * sizeof(L0deviceptr), device_id);
+  l0_mgr_->copyHostToDevice(d_input_buffer_dptrs,
+                            reinterpret_cast<const int8_t*>(h_input_buffer_dptrs.data()),
+                            num_buffers * sizeof(L0deviceptr),
+                            device_id);
+
+  // parameter 2: number of buffers
+  auto d_num_buffers = l0_mgr_->allocateDeviceMem(sizeof(int64_t), device_id);
+  l0_mgr_->copyHostToDevice(d_num_buffers,
+                            reinterpret_cast<const int8_t*>(&num_buffers),
+                            sizeof(int64_t),
+                            device_id);
+
+  // parameter 3: device pointer to the output buffer
+  auto d_result_buffer = l0_mgr_->allocateDeviceMem(buffer_size, device_id);
+  l0_mgr_->copyHostToDevice(
+      d_result_buffer, gpu_result_storage->getUnderlyingBuffer(), buffer_size, device_id);
+
+  // collecting all kernel parameters:
+  std::vector<L0deviceptr> h_kernel_params{
+      reinterpret_cast<L0deviceptr>(d_input_buffer_dptrs),
+      reinterpret_cast<L0deviceptr>(d_num_buffers),
+      reinterpret_cast<L0deviceptr>(d_result_buffer)};
+
+  // casting each kernel parameter to be a void* device ptr itself:
+  std::vector<void*> kernel_param_ptrs;
+  kernel_param_ptrs.reserve(num_kernel_params);
+  // std::transform(h_kernel_params.begin(),
+  //                h_kernel_params.end(),
+  //                std::back_inserter(kernel_param_ptrs),
+  //                [](L0deviceptr& param) { return &param; });
+  // launching a kernel:
+  // we launch as many threadblocks as there are input buffers:
+  // in other words, each input buffer is handled by a single threadblock.
+
+  // std::unique_ptr gpu_context
+  // auto l0_ctx = dynamic_cast(gpu_context);
+  // l0::L0Kernel* kernel = l0_ctx->getNativeCode(device_id);
+  // l0::L0Device* device = l0_ctx->getDevice(device_id);
+
+  auto kernel = gpu_context->kernel();
+  auto device = gpu_context->device();
+
+  auto q = device->command_queue();
+  auto q_list = device->create_command_list();
+  // l0::GroupCount gc = {ko.gridDimX, ko.gridDimY, ko.gridDimZ};
+  l0::GroupCount gc = {1, 1, 1};
+  // LOG(INFO) << "Launching L0 kernel with group size: {" << ko.gridDimX << ","
+  //           << ko.gridDimY << "," << ko.gridDimZ << "}\n";
+  // q_list->launch(kernel, kernel_param_ptrs.data(), gc);  // << here is the problem
+
+  q_list->launch(kernel, h_kernel_params, gc);
+  q_list->submit(*q.get());
+
+  // transfer back the results:
+  l0_mgr_->copyDeviceToHost(
+      gpu_result_storage->getUnderlyingBuffer(), d_result_buffer, buffer_size, device_id);
+
+  // release the gpu memory used:
+  for (auto& d_buffer : d_input_buffers) {
+    l0_mgr_->freeDeviceMem(d_buffer);
+  }
+  l0_mgr_->freeDeviceMem(d_input_buffer_dptrs);
+  l0_mgr_->freeDeviceMem(d_num_buffers);
+  l0_mgr_->freeDeviceMem(d_result_buffer);
+}
+
+TEST(SingleColumn, VariableEntries_CountQuery_4B_Group) {
+  // for (auto num_entries : {1, 2, 3, 5, 13, 31, 63, 126, 241, 511, 1021}) {
+  for (auto num_entries : {1}) {
+    TestInputData input;
+    input.setDeviceId(0)
+        .setNumInputBuffers(4)
+        .setTargetInfos(generate_custom_agg_target_infos(
+            {4}, {hdk::ir::AggType::kCount}, {int32_type}, {int32_type}))
+        .setAggWidth(4)
+        .setMinEntry(0)
+        .setMaxEntry(num_entries)
+        .setStepSize(2)
+        .setKeylessHash(true)
+        .setTargetIndexForKey(0);
+    perform_test_and_verify_results(input);
+  }
+}
+
+// TEST(SingleColumn, VariableEntries_CountQuery_8B_Group) {
+//   for (auto num_entries : {1, 2, 3, 5, 13, 31, 63, 126, 241, 511, 1021}) {
+//     TestInputData input;
+//     input.setDeviceId(0)
+//         .setNumInputBuffers(4)
+//         .setTargetInfos(generate_custom_agg_target_infos(
+//             {8}, {hdk::ir::AggType::kCount}, {int64_type}, {int64_type}))
+//         .setAggWidth(8)
+//         .setMinEntry(0)
+//         .setMaxEntry(num_entries)
+//         .setStepSize(2)
+//         .setKeylessHash(true)
+//         .setTargetIndexForKey(0);
+//     perform_test_and_verify_results(input);
+//   }
+// }
+
+// TEST(SingleColumn, VariableSteps_FixedEntries_1) {
+//   TestInputData input;
+//   input.setDeviceId(0)
+//       .setNumInputBuffers(4)
+//       .setAggWidth(8)
+//       .setMinEntry(0)
+//       .setMaxEntry(126)
+//       .setKeylessHash(true)
+//       .setTargetIndexForKey(0)
+//       .setTargetInfos(generate_custom_agg_target_infos(
+//           {8},
+//           {hdk::ir::AggType::kCount,
+//            hdk::ir::AggType::kMax,
+//            hdk::ir::AggType::kMin,
+//            hdk::ir::AggType::kSum,
+//            hdk::ir::AggType::kAvg},
+//           {int64_type, int64_type, int64_type, int64_type, double_type},
+//           {int32_type, int32_type, int32_type, int32_type, int32_type}));
+
+//   for (auto& step_size : {2, 3, 5, 7, 11, 13}) {
+//     input.setStepSize(step_size);
+//     perform_test_and_verify_results(input);
+//   }
+// }
+// TEST(SingleColumn, VariableSteps_FixedEntries_2) {
+//   TestInputData input;
+//   input.setDeviceId(0)
+//       .setNumInputBuffers(4)
+//       .setAggWidth(8)
+//       .setMinEntry(0)
+//       .setMaxEntry(126)
+//       .setKeylessHash(true)
+//       .setTargetIndexForKey(0)
+//       .setTargetInfos(generate_custom_agg_target_infos(
+//           {8},
+//           {hdk::ir::AggType::kCount,
+//            hdk::ir::AggType::kAvg,
+//            hdk::ir::AggType::kMax,
+//            hdk::ir::AggType::kSum,
+//            hdk::ir::AggType::kMin},
+//           {int64_type, double_type, int64_type, int64_type, int64_type},
+//           {int32_type, int32_type, int32_type, int32_type, int32_type}));
+
+//   for (auto& step_size : {2, 3, 5, 7, 11, 13}) {
+//     input.setStepSize(step_size);
+//     perform_test_and_verify_results(input);
+//   }
+// }
+
+// TEST(SingleColumn, VariableSteps_FixedEntries_3) {
+//   TestInputData input;
+//   input.setDeviceId(0)
+//       .setNumInputBuffers(4)
+//       .setAggWidth(8)
+//       .setMinEntry(0)
+//       .setMaxEntry(367)
+//       .setKeylessHash(true)
+//       .setTargetIndexForKey(0)
+//       .setTargetInfos(generate_custom_agg_target_infos(
+//           {8},
+//           {hdk::ir::AggType::kCount,
+//            hdk::ir::AggType::kMax,
+//            hdk::ir::AggType::kAvg,
+//            hdk::ir::AggType::kSum,
+//            hdk::ir::AggType::kMin},
+//           {int64_type, double_type, double_type, double_type, double_type},
+//           {int32_type, double_type, double_type, double_type, double_type}));
+
+//   for (auto& step_size : {2, 3, 5, 7, 11, 13}) {
+//     input.setStepSize(step_size);
+//     perform_test_and_verify_results(input);
+//   }
+// }
+
+// TEST(SingleColumn, VariableSteps_FixedEntries_4) {
+//   TestInputData input;
+//   input.setDeviceId(0)
+//       .setNumInputBuffers(4)
+//       .setAggWidth(8)
+//       .setMinEntry(0)
+//       .setMaxEntry(517)
+//       .setKeylessHash(true)
+//       .setTargetIndexForKey(0)
+//       .setTargetInfos(generate_custom_agg_target_infos(
+//           {8},
+//           {hdk::ir::AggType::kCount,
+//            hdk::ir::AggType::kSum,
+//            hdk::ir::AggType::kMax,
+//            hdk::ir::AggType::kAvg,
+//            hdk::ir::AggType::kMin},
+//           {int64_type, float_type, float_type, float_type, float_type},
+//           {int16_type, float_type, float_type, float_type, float_type}));
+
+//   for (auto& step_size : {2, 3, 5, 7, 11, 13}) {
+//     input.setStepSize(step_size);
+//     perform_test_and_verify_results(input);
+//   }
+// }
+
+// TEST(SingleColumn, VariableNumBuffers) {
+//   TestInputData input;
+//   input.setDeviceId(0)
+//       .setAggWidth(8)
+//       .setMinEntry(0)
+//       .setMaxEntry(266)
+//       .setKeylessHash(true)
+//       .setTargetIndexForKey(0)
+//       .setTargetInfos(generate_custom_agg_target_infos(
+//           {8},
+//           {hdk::ir::AggType::kCount,
+//            hdk::ir::AggType::kSum,
+//            hdk::ir::AggType::kAvg,
+//            hdk::ir::AggType::kMax,
+//            hdk::ir::AggType::kMin},
+//           {int32_type, int64_type, double_type, float_type, double_type},
+//           {int8_type, int8_type, int16_type, float_type, double_type}));
+
+//   for (auto& num_buffers : {2, 3, 4, 5, 6, 7, 8, 16, 32, 64, 128}) {
+//     input.setNumInputBuffers(num_buffers);
+//     perform_test_and_verify_results(input);
+//   }
+// }
+
+int main(int argc, char** argv) {
+  g_is_test_env = true;
+
+  TestHelpers::init_logger_stderr_only(argc, argv);
+  testing::InitGoogleTest(&argc, argv);
+
+  int err{0};
+  try {
+    err = RUN_ALL_TESTS();
+  } catch (const std::exception& e) {
+    LOG(ERROR) << e.what();
+  }
+  return err;
+}
diff --git a/omniscidb/Tests/GpuSharedMemoryTestIntel.h b/omniscidb/Tests/GpuSharedMemoryTestIntel.h
new file mode 100644
index 000000000..d8226f4bd
--- /dev/null
+++ b/omniscidb/Tests/GpuSharedMemoryTestIntel.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright 2019 OmniSci, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "L0Mgr/L0Mgr.h"
+#include "Logger/Logger.h"
+#include "QueryEngine/CodeGenerator.h"
+#include "QueryEngine/GpuSharedMemoryUtils.h"
+#include "QueryEngine/L0Kernel.h"
+#include "QueryEngine/LLVMFunctionAttributesUtil.h"
+#include "QueryEngine/OutputBufferInitialization.h"
+#include "ResultSetTestUtils.h"
+#include "Shared/TargetInfo.h"
+#include "TestHelpers.h"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+class StrideNumberGenerator : public NumberGenerator {
+ public:
+  StrideNumberGenerator(const int64_t start, const int64_t stride)
+      : crt_(start), stride_(stride), start_(start) {}
+
+  int64_t getNextValue() override {
+    const auto crt = crt_;
+    crt_ += stride_;
+    return crt;
+  }
+
+  void reset() override { crt_ = start_; }
+
+ private:
+  int64_t crt_;
+  int64_t stride_;
+  int64_t start_;
+};
+
+class GpuReductionTester : public GpuSharedMemCodeBuilder {
+ public:
+  GpuReductionTester(const Config& config,
+                     llvm::Module* module,
+                     llvm::LLVMContext& context,
+                     const QueryMemoryDescriptor& qmd,
+                     const std::vector<TargetInfo>& targets,
+                     const std::vector<int64_t>& init_agg_values,
+                     l0::L0Manager* l0_mgr,
+                     const compiler::CodegenTraits& traits,
+                     Executor* executor)
+      : GpuSharedMemCodeBuilder(module,
+                                context,
+                                qmd,
+                                targets,
+                                init_agg_values,
+                                config,
+                                traits,
+                                executor)
+      , l0_mgr_(l0_mgr)
+      , executor_(executor) {
+    // CHECK(getReductionFunction());
+  }
+  void codegenWrapperKernel();
+  llvm::Function* getWrapperKernel() const { return wrapper_kernel_; }
+
+  void performReductionTest(const std::vector<std::unique_ptr<ResultSet>>& result_sets,
+                            const ResultSetStorage* gpu_result_storage,
+                            const size_t device_id);
+
+ private:
+  l0::L0Manager* l0_mgr_;
+  llvm::Function* wrapper_kernel_;
+  Executor* executor_;
+};
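For context on the launch path exercised by performReductionTest: each entry of h_kernel_params is a device pointer that must be bound by value as a kernel argument before the kernel is appended to the command list. A hedged sketch of what the launch/submit pair is assumed to do at the raw Level Zero layer -- the L0Kernel/L0CommandList wrapper types above are HDK's, and only the two ze* calls below are the actual driver API:

    #include <level_zero/ze_api.h>
    #include <vector>

    // Sketch only: bind each device pointer as a by-value kernel argument,
    // then append the kernel launch with the given group count.
    ze_result_t launch_with_params(ze_command_list_handle_t list,
                                   ze_kernel_handle_t kernel,
                                   const std::vector<void*>& params,
                                   ze_group_count_t gc) {
      for (uint32_t i = 0; i < params.size(); ++i) {
        ze_result_t status = zeKernelSetArgumentValue(kernel, i, sizeof(void*), &params[i]);
        if (status != ZE_RESULT_SUCCESS) {
          return status;
        }
      }
      return zeCommandListAppendLaunchKernel(list, kernel, &gc, nullptr, 0, nullptr);
    }

With the group count fixed at {1, 1, 1} as in the test, only a single work-group runs, which matches the single-entry (num_entries = 1) case currently enabled in VariableEntries_CountQuery_4B_Group.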