Conversation
| } | ||
|
|
||
|
|
||
| core::smart_refctd_ptr<ISemaphore> CVulkanLogicalDevice::createSemaphore(const uint64_t initialValue) |
There was a problem hiding this comment.
intialValue should go into creationPArams and the old signature should be implemented in terms of new and marked deprecated
| if (m_externalHandle != ExternalHandleNull) | ||
| { | ||
| bool re = CloseExternalHandle(m_externalHandle); | ||
| assert(re); |
There was a problem hiding this comment.
can log an error using the m_vulkanDevice's debug callback
| // The Vulkan spec states: If the pNext chain includes a VkExternalMemoryImageCreateInfo or VkExternalMemoryImageCreateInfoNV structure whose handleTypes member is not 0, initialLayout must be VK_IMAGE_LAYOUT_UNDEFINED | ||
| vk_createInfo.initialLayout = external ? VK_IMAGE_LAYOUT_UNDEFINED : (params.preinitialized ? VK_IMAGE_LAYOUT_PREINITIALIZED : VK_IMAGE_LAYOUT_UNDEFINED); |
There was a problem hiding this comment.
I would put a check in the valid() of the params that preinitialized is not true if external is true
| const auto handleType = static_cast<VkExternalSemaphoreHandleTypeFlagBits>(creationParams.externalHandleTypes.value); | ||
| if (handleType != 0) | ||
| { | ||
| #ifdef _WIN32 | ||
| VkSemaphoreGetWin32HandleInfoKHR props = { | ||
| .sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR, | ||
| .semaphore = semaphore, | ||
| .handleType = handleType, | ||
| }; | ||
|
|
||
| if (VK_SUCCESS != m_devf.vk.vkGetSemaphoreWin32HandleKHR(m_vkdev, &props, &externalHandle)) | ||
| { | ||
| m_devf.vk.vkDestroySemaphore(m_vkdev, semaphore, nullptr); | ||
| return nullptr; | ||
| } | ||
| #else | ||
| VkSemaphoreGetFdInfoKHR props = { | ||
| .sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR, | ||
| .semaphore = vkSemaphore, | ||
| .handleType = handleType, | ||
| }; | ||
| if (VK_SUCCESS != m_devf.vk.vkGetSemaphoreFdKHR(m_vkdev, &props, &externalHandle)) | ||
| { | ||
| m_devf.vk.vkDestroySemaphore(m_vkdev, semaphore, nullptr); | ||
| return nullptr; | ||
| } | ||
| #endif | ||
| } |
There was a problem hiding this comment.
this should be handled through a switch with a default case so we don't die when more handle types are added
There was a problem hiding this comment.
ok I see you can support multiple handles at once so switch doesn't make sense but you should still gate/assert that the external handle type you're doing the code for is there (e.g. WIN32 or FD) and others are not
| #ifdef _WIN32 | ||
| VkImportMemoryWin32HandleInfoKHR importInfo = { | ||
| .sType = VK_STRUCTURE_TYPE_IMPORT_MEMORY_WIN32_HANDLE_INFO_KHR, | ||
| .handleType = static_cast<VkExternalMemoryHandleTypeFlagBits>(info.externalHandleType), |
There was a problem hiding this comment.
you need to check the handle types at the start IMHO before just doing through with win32 or POSIX codepaths
There was a problem hiding this comment.
why are we passing dedicatedOnly around ?
There was a problem hiding this comment.
ok I see, IMHO the solution is to have IGPUBuffer and IGPUImage SCONSTRUCTIONPArams and SCachedConstruction params (which can contain the creation and cached creation params) to pass things between the inside of the factory creation function and the CVulkan constructor
| friend class IDeviceMemoryAllocator; | ||
| friend class ILogicalDevice; |
There was a problem hiding this comment.
write docs about why you need the friendship
| struct SCreationParams: SInfo | ||
| { | ||
| core::bitflag<E_MEMORY_PROPERTY_FLAGS> memoryPropertyFlags = E_MEMORY_PROPERTY_FLAGS::EMPF_NONE; | ||
| const bool dedicated = false; |
There was a problem hiding this comment.
you dont need to make the struct member const, its better if you make the m_params const!
|
|
||
| inline const SCreationParams& getCreationParams() const { return m_params; } | ||
|
|
||
| virtual external_handle_t getExternalHandle() const = 0; |
There was a problem hiding this comment.
I don't get the point of having this, and a getCreationParams() when I can go getCreationParams().externalHandle
| struct SInfo | ||
| { | ||
| uint64_t allocationSize = 0; | ||
| core::bitflag<IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS> allocateFlags = IDeviceMemoryAllocation::EMAF_NONE; | ||
| // Handle Type for external resources | ||
| IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE externalHandleType = IDeviceMemoryAllocation::EHT_NONE; | ||
| //! Imports the given handle if externalHandle != nullptr && externalHandleType != EHT_NONE | ||
| //! Creates exportable memory if externalHandle == nullptr && externalHandleType != EHT_NONE | ||
| external_handle_t externalHandle = 0; | ||
| }; | ||
|
|
||
| struct SCreationParams: SInfo |
There was a problem hiding this comment.
any reason for the split between SInfo and ScreationParams we could shove everything into 24 bytes, by packing the allocateflags, handle types, memoryPropertyFlags and dedicated into 8 bytes
There was a problem hiding this comment.
ok I see its because you reuse the SInfo in the allocator, but stull it would help to pack the allocate flags and external handle type into 4 bytes in case the external handle is 4 bytes on linux
| inline bool isDedicated() const {return m_dedicated;} | ||
| inline bool isDedicated() const {return m_params.dedicated;} | ||
|
|
||
| //! Returns the size of the memory allocation | ||
| inline size_t getAllocationSize() const {return m_allocationSize;} | ||
| inline size_t getAllocationSize() const {return m_params.allocationSize;} | ||
|
|
||
| //! | ||
| inline core::bitflag<E_MEMORY_ALLOCATE_FLAGS> getAllocateFlags() const { return m_allocateFlags; } | ||
| inline core::bitflag<E_MEMORY_ALLOCATE_FLAGS> getAllocateFlags() const { return m_params.allocateFlags; } | ||
|
|
||
| //! | ||
| inline core::bitflag<E_MEMORY_PROPERTY_FLAGS> getMemoryPropertyFlags() const { return m_memoryPropertyFlags; } | ||
| inline core::bitflag<E_MEMORY_PROPERTY_FLAGS> getMemoryPropertyFlags() const { return m_params.memoryPropertyFlags; } |
There was a problem hiding this comment.
whatever I can get from getCreationParams() mark with [[deprecated]]
| enum E_MEMORY_HEAP_FLAGS : uint32_t | ||
| { | ||
| EMHF_NONE = 0, | ||
| EMHF_DEVICE_LOCAL_BIT = 0x00000001, | ||
| EMHF_MULTI_INSTANCE_BIT = 0x00000002, | ||
| }; |
There was a problem hiding this comment.
you can take this enum down to uint8_t
| //! Flags for imported/exported allocation | ||
| enum E_EXTERNAL_HANDLE_TYPE : uint32_t |
There was a problem hiding this comment.
and this to uint16_t
| size_t size : 54 = 0ull; | ||
| size_t flags : 5 = 0u; // IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS | ||
| size_t memoryTypeIndex : 5 = 0u; | ||
| size_t memoryTypeIndex = 0u; |
There was a problem hiding this comment.
don't use `size_t for the index, use uint8_T at most
| class IMemoryTypeIterator | ||
| { | ||
| public: | ||
| IMemoryTypeIterator(const IDeviceMemoryBacked::SDeviceMemoryRequirements& reqs, core::bitflag<IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS> allocateFlags) | ||
| : m_allocateFlags(static_cast<uint32_t>(allocateFlags.value)), m_reqs(reqs) {} | ||
| IMemoryTypeIterator(const IDeviceMemoryBacked::SDeviceMemoryRequirements& reqs, | ||
| core::bitflag<IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS> allocateFlags, | ||
| IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE handleType, | ||
| external_handle_t handle) : | ||
| m_allocateFlags(static_cast<uint32_t>(allocateFlags.value)), | ||
| m_reqs(reqs), | ||
| m_handleType(handleType), | ||
| m_handle(handle) | ||
| {} |
There was a problem hiding this comment.
why does your memory type iterator care about the handle value ? I can understand caring about the handle type and whether we're importing or exporting, but not the actual handle
| IDeviceMemoryBacked* dedication = nullptr, | ||
| const core::bitflag<IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS> allocateFlags = IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, | ||
| IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE externalHandleType = IDeviceMemoryAllocation::EHT_NONE, | ||
| external_handle_t externalHandle = {}) |
There was a problem hiding this comment.
last 4 arguments should be wrapped up in a structure, it kinda looks like half of your Sinfo
| enum E_EXTERNAL_MEMORY_FEATURE_FLAGS : uint32_t | ||
| { | ||
| EEMF_NONE = 0x0, | ||
| EEMF_DEDICATED_ONLY_BIT = 0x1, | ||
| EEMF_EXPORTABLE_BIT = 0x2, | ||
| EEMF_IMPORTABLE_BIT = 0x4, | ||
| }; | ||
|
|
||
| struct SExternalMemoryProperties | ||
| { | ||
| IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE exportableTypes : 7; | ||
| IDeviceMemoryAllocation::E_EXTERNAL_HANDLE_TYPE compatibleTypes : 7; | ||
| // TODO(kevin): This should actually be core::bitflag to be semantically correct. What should we do? Should we use bool for each flag instead of enum? | ||
| E_EXTERNAL_MEMORY_FEATURE_FLAGS features : 3; |
There was a problem hiding this comment.
uint16_t for the class
its fine as it is, can make a core::bitflag<> get method
| // TODO(kevinyu): Should we cached the properties like Atil does. If yes, needs mutex and mutable specifier. Class become not that simple anymore. | ||
| // { | ||
| // std::shared_lock lock(m_externalBufferPropertiesMutex); | ||
| // auto it = m_externalBufferProperties.find({ usage, handleType }); | ||
| // if (it != m_externalBufferProperties.end()) | ||
| // return it->second; | ||
| // } | ||
| // | ||
| // std::unique_lock lock(m_externalBufferPropertiesMutex); | ||
| // return m_externalBufferProperties[{ usage, handleType }] = getExternalBufferProperties_impl(usage, handleType); | ||
| return getExternalMemoryProperties_impl(usages, handleType); |
There was a problem hiding this comment.
we do it for image formats, I guess we should make it cached here too
| struct SImageFormatInfo | ||
| { | ||
| asset::E_FORMAT format; | ||
| IGPUImage::E_TYPE type; | ||
| IGPUImage::TILING tiling; | ||
| core::bitflag<IGPUImage::E_USAGE_FLAGS> usage; | ||
| core::bitflag<IGPUImage::E_CREATE_FLAGS> flags; | ||
| }; |
There was a problem hiding this comment.
assert this is 5 bytes, if not find the padding
| static CUresult acquireAndGetMipmappedArray(GraphicsAPIObjLink<video::IGPUImage>* linksBegin, GraphicsAPIObjLink<video::IGPUImage>* linksEnd, CUstream stream); | ||
| static CUresult acquireAndGetArray(GraphicsAPIObjLink<video::IGPUImage>* linksBegin, GraphicsAPIObjLink<video::IGPUImage>* linksEnd, uint32_t* arrayIndices, uint32_t* mipLevels, CUstream stream); | ||
| #endif | ||
| size_t roundToGranularity(CUmemLocationType location, size_t size) const; |
There was a problem hiding this comment.
isn't this just core::roundUp or just align ?
| CCUDAImportedSemaphore(core::smart_refctd_ptr<CCUDADevice> device, | ||
| core::smart_refctd_ptr<ISemaphore> src, | ||
| CUexternalSemaphore semaphore) | ||
| : m_device(std::move(device)) | ||
| , m_src(std::move(src)) | ||
| , m_handle(semaphore) | ||
| {} | ||
| ~CCUDAImportedSemaphore() override; | ||
|
|
There was a problem hiding this comment.
again these are public and should be private
| namespace nbl::video | ||
| { | ||
|
|
||
| class NBL_API2 CCUDAImportedSemaphore : public core::IReferenceCounted |
There was a problem hiding this comment.
use final whenever possible
| // ASK(kevin): What initial_modified_time should I use? Is this how this parameter is used? | ||
| std::chrono::clock_cast<system::IFile::time_point_t::clock>(std::chrono::system_clock::now()), |
There was a problem hiding this comment.
this is fine, we don't have a file watcher system yet to reload the file contents if they change
| auto& cu = m_device->getHandler()->getCUDAFunctionTable(); | ||
| return cu.pcuExternalMemoryGetMappedBuffer(mappedBuffer, m_handle, &bufferDesc); | ||
|
|
||
| } |
There was a problem hiding this comment.
oh, the syntax for this hurts me, can we turn it into a method that returns CUdeviceptr (and null on failure) ?
If I want CUDA like API usage then I guess I can call getInternalObject and go to town
| req.prefersDedicatedAllocation = nullptr != dedication; | ||
| req.requiresDedicatedAllocation = nullptr != dedication; |
There was a problem hiding this comment.
I don't get this, if you pass a dedication it means you've already allocated the memory, so you may have allocated it on the wrong heap which is not compatible.
IMHO it wholesale doesn't make sense, because if you have a dedicate allocation made before you must have known the external handle to even make it, so its a chicken an egg problem.
HOWEVER you should have a method that returns the Device Memory Requirements (same as IGPUBuffer or IGPUImage) for the CCUDAExportableMemory so that these are known before we attempt an export
|
|
||
| return device->allocate(req, | ||
| dedication, | ||
| IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, |
There was a problem hiding this comment.
you've robbed me of specifying EMAF_DEVICE_ADDRESS_BIT and getting BDA for a CUDA buffer
| return device->allocate(req, | ||
| dedication, | ||
| IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_NONE, | ||
| CCUDADevice::EXTERNAL_MEMORY_HANDLE_TYPE, | ||
| m_params.externalHandle).memory; | ||
| } |
There was a problem hiding this comment.
you need the equivalent of ICleanup from IGPUBuffer and IGPUImage but for the IDeviceMemoryAllocation so you can keep a smart pointer back to the Imported thing which provides the external Handle
Right now you're not ensuring lifetimes when CUDA exports and Vulkan imports
| m_handler(std::move(handler)), | ||
| m_allocationGranularity{} | ||
| { | ||
| m_defaultCompileOptions.push_back("--std=c++14"); |
There was a problem hiding this comment.
can you do C++20 or will NVCC die?
| size_t CCUDADevice::roundToGranularity(CUmemLocationType location, size_t size) const | ||
| { | ||
| assert(link->obj); | ||
| auto glbuf = static_cast<video::COpenGLBuffer*>(link->obj.get()); | ||
| auto retval = cuda.pcuGraphicsGLRegisterBuffer(&link->cudaHandle,glbuf->getOpenGLName(),flags); | ||
| if (retval!=CUDA_SUCCESS) | ||
| link->obj = nullptr; | ||
| return retval; | ||
| return ((size - 1) / m_allocationGranularity[location] + 1) * m_allocationGranularity[location]; |
There was a problem hiding this comment.
ah okay you do it based on location, still please use the hlsl or core utitliities, and inline the function, no need to be going across DLL boundaries for stuff like that
| } | ||
| CUresult CCUDAHandler::registerImage(GraphicsAPIObjLink<video::IGPUImage>* link, uint32_t flags) | ||
|
|
||
| CUresult CCUDADevice::reserveAddressAndMapMemory(CUdeviceptr* outPtr, size_t size, size_t alignment, CUmemLocationType location, CUmemGenericAllocationHandle memory) const |
There was a problem hiding this comment.
I find STATUS func(RETURN* really awkward, maybe make a template<T> struct nbl::video::SCUDAResult {T result; CUresult status;};with avoid` specialization thats just the status
Description
Bridging CUDA and Vulkan
Testing
Test Pull Request