From f18160276e78f860f64c45111c874e3351b44ffb Mon Sep 17 00:00:00 2001 From: Karim Mohamed Date: Wed, 3 Dec 2025 23:24:18 +0300 Subject: [PATCH 01/26] New example, copy of 61_UI, updated a lot, visualizer, still not "solid angle", rest should be shader work --- 72_SolidAngleVisualizer/CMakeLists.txt | 20 + 72_SolidAngleVisualizer/README.md | 0 .../hlsl/SolidAngleVis.frag.hlsl | 175 +++ .../app_resources/hlsl/common.hlsl | 14 + 72_SolidAngleVisualizer/config.json.template | 28 + 72_SolidAngleVisualizer/include/common.hpp | 20 + 72_SolidAngleVisualizer/include/transform.hpp | 172 +++ 72_SolidAngleVisualizer/main.cpp | 1105 +++++++++++++++++ 72_SolidAngleVisualizer/pipeline.groovy | 50 + 72_SolidAngleVisualizer/src/transform.cpp | 0 CMakeLists.txt | 1 + 11 files changed, 1585 insertions(+) create mode 100644 72_SolidAngleVisualizer/CMakeLists.txt create mode 100644 72_SolidAngleVisualizer/README.md create mode 100644 72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl create mode 100644 72_SolidAngleVisualizer/app_resources/hlsl/common.hlsl create mode 100644 72_SolidAngleVisualizer/config.json.template create mode 100644 72_SolidAngleVisualizer/include/common.hpp create mode 100644 72_SolidAngleVisualizer/include/transform.hpp create mode 100644 72_SolidAngleVisualizer/main.cpp create mode 100644 72_SolidAngleVisualizer/pipeline.groovy create mode 100644 72_SolidAngleVisualizer/src/transform.cpp diff --git a/72_SolidAngleVisualizer/CMakeLists.txt b/72_SolidAngleVisualizer/CMakeLists.txt new file mode 100644 index 000000000..5d0021f61 --- /dev/null +++ b/72_SolidAngleVisualizer/CMakeLists.txt @@ -0,0 +1,20 @@ +if(NBL_BUILD_IMGUI) + set(NBL_EXTRA_SOURCES + "${CMAKE_CURRENT_SOURCE_DIR}/src/transform.cpp" + ) + + set(NBL_INCLUDE_SERACH_DIRECTORIES + "${CMAKE_CURRENT_SOURCE_DIR}/include" + ) + + list(APPEND NBL_LIBRARIES + imtestengine + imguizmo + "${NBL_EXT_IMGUI_UI_LIB}" + ) + + # TODO; Arek I removed `NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET` from the 
last parameter here, doesn't this macro have 4 arguments anyway !? + nbl_create_executable_project("${NBL_EXTRA_SOURCES}" "" "${NBL_INCLUDE_SERACH_DIRECTORIES}" "${NBL_LIBRARIES}") + # TODO: Arek temporarily disabled cause I haven't figured out how to make this target yet + # LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} nblExamplesGeometrySpirvBRD) +endif() \ No newline at end of file diff --git a/72_SolidAngleVisualizer/README.md b/72_SolidAngleVisualizer/README.md new file mode 100644 index 000000000..e69de29bb diff --git a/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl b/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl new file mode 100644 index 000000000..d783a5b37 --- /dev/null +++ b/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl @@ -0,0 +1,175 @@ +#pragma wave shader_stage(fragment) + +#include "common.hlsl" + +#include + +using namespace nbl::hlsl; +using namespace ext::FullScreenTriangle; + +[[vk::push_constant]] struct PushConstants pc; + +static const float CIRCLE_RADIUS = 0.45f; + +// --- Geometry Utils --- + +// Adjacency of edges to faces +static const int2 edgeToFaces[12] = { + {4,2}, {3,4}, {2,5}, {5,3}, + {2,0}, {0,3}, {1,2}, {3,1}, + {0,4}, {5,0}, {4,1}, {1,5} +}; + +static const float3 localNormals[6] = { + float3(0, 0, -1), // Face 0 (Z-) + float3(0, 0, 1), // Face 1 (Z+) + float3(-1, 0, 0), // Face 2 (X-) + float3(1, 0, 0), // Face 3 (X+) + float3(0, -1, 0), // Face 4 (Y-) + float3(0, 1, 0) // Face 5 (Y+) +}; + +static float3 corners[8]; +static float3 faceCenters[6] = { float3(0,0,0), float3(0,0,0), float3(0,0,0), + float3(0,0,0), float3(0,0,0), float3(0,0,0) }; +static float2 projCorners[8]; + + +// Converts UV into centered, aspect-corrected NDC circle space +float2 toCircleSpace(float2 uv) +{ + float aspect = pc.viewport.z / pc.viewport.w; + float2 centered = uv - 0.5f; + centered.x *= aspect; + return centered; +} + +// Distance to a 2D line segment +float sdSegment(float2 p, 
float2 a, float2 b) +{ + float2 pa = p - a; + float2 ba = b - a; + float h = clamp(dot(pa, ba) / dot(ba, ba), 0.0f, 1.0f); + return length(pa - ba * h); +} + +// TODO: Hemispherical Projection (Solid Angle / Orthographic/Lambertian Projection) +float2 project(float3 p) +{ + return normalize(p).xy; +} + +void computeCubeGeo() +{ + for (int i = 0; i < 8; i++) + { + float3 localPos = float3(i % 2, (i / 2) % 2, (i / 4) % 2) * 2.0f - 1.0f; + float3 worldPos = mul(pc.modelMatrix, float4(localPos, 1.0f)).xyz; + + corners[i] = worldPos; + + faceCenters[i/4] += worldPos / 4.0f; + faceCenters[2+i%2] += worldPos / 4.0f; + faceCenters[4+(i/2)%2] += worldPos / 4.0f; + + float3 viewPos = worldPos; + projCorners[i] = project(viewPos); + } +} + +int getVisibilityCount(int2 faces, float3 cameraPos) +{ + float3x3 rotMatrix = (float3x3)pc.modelMatrix; + float3 n_world_f1 = mul(rotMatrix, localNormals[faces.x]); + float3 n_world_f2 = mul(rotMatrix, localNormals[faces.y]); + + float3 viewVec_f1 = faceCenters[faces.x] - cameraPos; + float3 viewVec_f2 = faceCenters[faces.y] - cameraPos; + + // Face is visible if its outward normal points towards the origin (camera). + bool visible1 = dot(n_world_f1, viewVec_f1) < 0.0f; + bool visible2 = dot(n_world_f2, viewVec_f2) < 0.0f; + + // Determine Line Style: + bool isSilhouette = visible1 != visible2; // One face visible, the other hidden + bool isInner = visible1 && visible2; // Both faces visible + + int visibilityCount = 0; + if (isSilhouette) + { + visibilityCount = 1; + } + else if (isInner) + { + visibilityCount = 2; + } + + return visibilityCount; +} + +void drawLine(float2 p, int a, int b, int visibilityCount, inout float4 color, float aaWidth) +{ + if (visibilityCount > 0) + { + float3 A = corners[a]; + float3 B = corners[b]; + + float avgDepth = (length(A) + length(B)) * 0.5f; + float referenceDepth = 3.0f; + float depthScale = referenceDepth / avgDepth; + + float baseWidth = (visibilityCount == 1) ? 
0.005f : 0.002f; + float intensity = (visibilityCount == 1) ? 1.0f : 0.5f; + float4 edgeColor = (visibilityCount == 1) ? float4(0.0f, 0.5f, 1.0f, 1.0f) : float4(1.0f, 0.0f, 0.0f, 1.0f); // Blue vs Red + + float width = min(baseWidth * depthScale, 0.03f); + + float dist = sdSegment(p, projCorners[a], projCorners[b]); + + float alpha = 1.0f - smoothstep(width - aaWidth, width + aaWidth, dist); + + color += edgeColor * alpha * intensity; + } +} + +void drawRing(float2 p, inout float4 color, float aaWidth) +{ + float positionLength = length(p); + + // Mask to cut off drawing outside the circle + // float circleMask = 1.0f - smoothstep(CIRCLE_RADIUS, CIRCLE_RADIUS + aaWidth, positionLength); + // color *= circleMask; + + // Add a white background circle ring + float ringWidth = 0.005f; + float ringDistance = abs(positionLength - CIRCLE_RADIUS); + float ringAlpha = 1.0f - smoothstep(ringWidth - aaWidth, ringWidth + aaWidth, ringDistance); + + // Ring color is now white + color = max(color, float4(1.0, 1.0, 1.0, 1.0) * ringAlpha); +} + +[[vk::location(0)]] float32_t4 main(SVertexAttributes vx) : SV_Target0 +{ + float3 cameraPos = float3(0, 0, 0); // Camera at origin + float2 p = toCircleSpace(vx.uv); + float4 color = float4(0, 0, 0, 0); + + computeCubeGeo(); + + float aaWidth = max(fwidth(p.x), fwidth(p.y)); + + for (int j = 0; j < 12; j++) + { + int a = j % 4 * (j < 4 ? 1 : 2) - (j / 4 == 1 ? 
j % 2 : 0); + int b = a + (4 >> (j / 4)); + + int2 faces = edgeToFaces[j]; + int visibilityCount = getVisibilityCount(faces, cameraPos); + drawLine(p, a, b, visibilityCount, color, aaWidth); + } + + drawRing(p, color, aaWidth); + + return color; +} \ No newline at end of file diff --git a/72_SolidAngleVisualizer/app_resources/hlsl/common.hlsl b/72_SolidAngleVisualizer/app_resources/hlsl/common.hlsl new file mode 100644 index 000000000..80368d08f --- /dev/null +++ b/72_SolidAngleVisualizer/app_resources/hlsl/common.hlsl @@ -0,0 +1,14 @@ +#ifndef _SOLID_ANGLE_VIS_COMMON_HLSL_ +#define _SOLID_ANGLE_VIS_COMMON_HLSL_ +#include "nbl/builtin/hlsl/cpp_compat.hlsl" + + + +struct PushConstants +{ + nbl::hlsl::float32_t3x4 modelMatrix; + nbl::hlsl::float32_t4 viewport; +}; + + +#endif // _SOLID_ANGLE_VIS_COMMON_HLSL_ diff --git a/72_SolidAngleVisualizer/config.json.template b/72_SolidAngleVisualizer/config.json.template new file mode 100644 index 000000000..f961745c1 --- /dev/null +++ b/72_SolidAngleVisualizer/config.json.template @@ -0,0 +1,28 @@ +{ + "enableParallelBuild": true, + "threadsPerBuildProcess" : 2, + "isExecuted": false, + "scriptPath": "", + "cmake": { + "configurations": [ "Release", "Debug", "RelWithDebInfo" ], + "buildModes": [], + "requiredOptions": [] + }, + "profiles": [ + { + "backend": "vulkan", + "platform": "windows", + "buildModes": [], + "runConfiguration": "Release", + "gpuArchitectures": [] + } + ], + "dependencies": [], + "data": [ + { + "dependencies": [], + "command": [""], + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/72_SolidAngleVisualizer/include/common.hpp b/72_SolidAngleVisualizer/include/common.hpp new file mode 100644 index 000000000..2e8e985dd --- /dev/null +++ b/72_SolidAngleVisualizer/include/common.hpp @@ -0,0 +1,20 @@ +#ifndef _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_ +#define _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_ + + +#include "nbl/examples/examples.hpp" + +// the example's headers +#include "transform.hpp" 
+#include "nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl"
+
+using namespace nbl;
+using namespace nbl::core;
+using namespace nbl::hlsl;
+using namespace nbl::system;
+using namespace nbl::asset;
+using namespace nbl::ui;
+using namespace nbl::video;
+using namespace nbl::examples;
+
+#endif // _NBL_THIS_EXAMPLE_COMMON_H_INCLUDED_
\ No newline at end of file
diff --git a/72_SolidAngleVisualizer/include/transform.hpp b/72_SolidAngleVisualizer/include/transform.hpp
new file mode 100644
index 000000000..002a9d215
--- /dev/null
+++ b/72_SolidAngleVisualizer/include/transform.hpp
@@ -0,0 +1,187 @@
+#ifndef _NBL_THIS_EXAMPLE_TRANSFORM_H_INCLUDED_
+#define _NBL_THIS_EXAMPLE_TRANSFORM_H_INCLUDED_
+
+
+#include "nbl/ui/ICursorControl.h"
+
+#include "nbl/ext/ImGui/ImGui.h"
+
+#include "imgui/imgui_internal.h"
+#include "imguizmo/ImGuizmo.h"
+
+
+// Inputs of `EditTransform`.
+struct TransformRequestParams
+{
+    // forwarded to `ImGuizmo::ViewManipulate`, only read when `enableViewManipulate` is set
+    float camDistance = 8.f;
+    // texture ID of the image drawn as the window content, all bits set by default
+    uint8_t sceneTexDescIx = ~0;
+    // useWindow: host the image in the "Gizmo" window instead of a transparent full screen window
+    // editTransformDecomposition: show the operation/mode/snap/bounds widgets and enable their hotkeys
+    // enableViewManipulate: draw `ImGuizmo::ViewManipulate` in the top right corner of the image
+    bool useWindow = true, editTransformDecomposition = false, enableViewManipulate = false;
+};
+
+// Outputs of `EditTransform`.
+struct TransformReturnInfo
+{
+    // size of the content region the image was drawn into
+    nbl::hlsl::uint16_t2 sceneResolution = { 2048,1024 };
+    // `ImGui::IsWindowHovered()` of the window hosting the image
+    bool isGizmoWindowHovered = false;
+    // cursor is over the gizmo, or the gizmo is in use while the left mouse button is down
+    bool isGizmoBeingUsed = false;
+};
+
+// Draws the optional transform editing widgets, the scene image and the ImGuizmo manipulator on top of that image.
+//
+// `cameraView`, `cameraProjection` and `matrix` are forwarded to ImGuizmo as they are: `matrix` is edited in place
+// and `cameraView` is also handed to `ImGuizmo::ViewManipulate` when `params.enableViewManipulate` is set.
+//
+// The UI state (operation, mode, snapping, bounds) lives in function local statics, so it is shared by all callers.
+// The function is `inline` because its definition sits in a header.
+inline TransformReturnInfo EditTransform(float* cameraView, const float* cameraProjection, float* matrix, const TransformRequestParams& params)
+{
+    static ImGuizmo::OPERATION mCurrentGizmoOperation(ImGuizmo::TRANSLATE);
+    static ImGuizmo::MODE mCurrentGizmoMode(ImGuizmo::LOCAL);
+    static bool useSnap = false;
+    static float snap[3] = { 1.f, 1.f, 1.f };
+    static float bounds[] = { -0.5f, -0.5f, -0.5f, 0.5f, 0.5f, 0.5f };
+    static float boundsSnap[] = { 0.1f, 0.1f, 0.1f };
+    static bool boundSizing = false;
+    static bool boundSizingSnap = false;
+
+    if (params.editTransformDecomposition)
+    {
+        if (ImGui::IsKeyPressed(ImGuiKey_T))
+            mCurrentGizmoOperation = ImGuizmo::TRANSLATE;
+        if (ImGui::IsKeyPressed(ImGuiKey_R))
+            mCurrentGizmoOperation = ImGuizmo::ROTATE;
+        if (ImGui::IsKeyPressed(ImGuiKey_S))
+            mCurrentGizmoOperation = ImGuizmo::SCALE;
+        if (ImGui::RadioButton("Translate", mCurrentGizmoOperation == ImGuizmo::TRANSLATE))
+            mCurrentGizmoOperation = ImGuizmo::TRANSLATE;
+        ImGui::SameLine();
+        if (ImGui::RadioButton("Rotate", mCurrentGizmoOperation == ImGuizmo::ROTATE))
+            mCurrentGizmoOperation = ImGuizmo::ROTATE;
+        ImGui::SameLine();
+        if (ImGui::RadioButton("Scale", mCurrentGizmoOperation == ImGuizmo::SCALE))
+            mCurrentGizmoOperation = ImGuizmo::SCALE;
+        if (ImGui::RadioButton("Universal", mCurrentGizmoOperation == ImGuizmo::UNIVERSAL))
+            mCurrentGizmoOperation = ImGuizmo::UNIVERSAL;
+        // expose the matrix as editable translation/rotation/scale and write the edits back
+        float matrixTranslation[3], matrixRotation[3], matrixScale[3];
+        ImGuizmo::DecomposeMatrixToComponents(matrix, matrixTranslation, matrixRotation, matrixScale);
+        ImGui::InputFloat3("Tr", matrixTranslation);
+        ImGui::InputFloat3("Rt", matrixRotation);
+        ImGui::InputFloat3("Sc", matrixScale);
+        ImGuizmo::RecomposeMatrixFromComponents(matrixTranslation, matrixRotation, matrixScale, matrix);
+
+        if (mCurrentGizmoOperation != ImGuizmo::SCALE)
+        {
+            if (ImGui::RadioButton("Local", mCurrentGizmoMode == ImGuizmo::LOCAL))
+                mCurrentGizmoMode = ImGuizmo::LOCAL;
+            ImGui::SameLine();
+            if (ImGui::RadioButton("World", mCurrentGizmoMode == ImGuizmo::WORLD))
+                mCurrentGizmoMode = ImGuizmo::WORLD;
+        }
+        // Shift+S toggles snapping. The modifier is tested with `IsKeyDown`: requiring `IsKeyPressed` for both keys
+        // only fired when both went down on the very same frame. No key repeat, so holding the keys toggles once.
+        if (ImGui::IsKeyPressed(ImGuiKey_S, false) && ImGui::IsKeyDown(ImGuiKey_LeftShift))
+            useSnap = !useSnap;
+        ImGui::Checkbox("##UseSnap", &useSnap);
+        ImGui::SameLine();
+
+        switch (mCurrentGizmoOperation)
+        {
+        case ImGuizmo::TRANSLATE:
+            ImGui::InputFloat3("Snap", &snap[0]);
+            break;
+        case ImGuizmo::ROTATE:
+            ImGui::InputFloat("Angle Snap", &snap[0]);
+            break;
+        case ImGuizmo::SCALE:
+            ImGui::InputFloat("Scale Snap", &snap[0]);
+            break;
+        default: // no snap widget for the other operations (e.g. UNIVERSAL)
+            break;
+        }
+        ImGui::Checkbox("Bound Sizing", &boundSizing);
+        if (boundSizing)
+        {
+            ImGui::PushID(3);
+            ImGui::Checkbox("##BoundSizing", &boundSizingSnap);
+            ImGui::SameLine();
+            ImGui::InputFloat3("Snap", boundsSnap);
+            ImGui::PopID();
+        }
+    }
+
+    ImGuiIO& io = ImGui::GetIO();
+    // persists across calls: the flags computed at the end of one call are applied to the "Gizmo" window on the next
+    static ImGuiWindowFlags gizmoWindowFlags = 0;
+
+    /*
+        for the "useWindow" case we just render to a gui area,
+        otherwise to fake full screen transparent window
+
+        note that for both cases we make sure gizmo being
+        rendered is aligned to our texture scene using
+        imgui "cursor" screen positions
+    */
+// TODO: this shouldn't be handled here I think
+    SImResourceInfo info;
+    info.textureID = params.sceneTexDescIx;
+    info.samplerIx = (uint16_t)nbl::ext::imgui::UI::DefaultSamplerIx::USER;
+
+    if (params.useWindow)
+    {
+        ImGui::SetNextWindowSize(ImVec2(800, 800), ImGuiCond_Appearing);
+        ImGui::SetNextWindowPos(ImVec2(400, 20), ImGuiCond_Appearing);
+        ImGui::PushStyleColor(ImGuiCol_WindowBg, (ImVec4)ImColor(0.35f, 0.3f, 0.3f));
+        ImGui::Begin("Gizmo", nullptr, gizmoWindowFlags);
+        ImGuizmo::SetDrawlist();
+    }
+    else
+    {
+        ImGui::SetNextWindowPos(ImVec2(0, 0));
+        ImGui::SetNextWindowSize(io.DisplaySize);
+        ImGui::PushStyleColor(ImGuiCol_WindowBg, ImVec4(0, 0, 0, 0)); // fully transparent fake window
+        ImGui::Begin("FullScreenWindow", nullptr, ImGuiWindowFlags_NoTitleBar | ImGuiWindowFlags_NoResize | ImGuiWindowFlags_NoMove | ImGuiWindowFlags_NoScrollbar | ImGuiWindowFlags_NoScrollWithMouse | ImGuiWindowFlags_NoCollapse | ImGuiWindowFlags_NoBringToFrontOnFocus | ImGuiWindowFlags_NoBackground | ImGuiWindowFlags_NoInputs);
+    }
+
+    // common to both windows: the image fills the remaining content region and the gizmo rectangle is aligned to it
+    const ImVec2 contentRegionSize = ImGui::GetContentRegionAvail();
+    const ImVec2 cursorPos = ImGui::GetCursorScreenPos();
+
+    ImGui::Image(info, contentRegionSize);
+    ImGuizmo::SetRect(cursorPos.x, cursorPos.y, contentRegionSize.x, contentRegionSize.y);
+
+    TransformReturnInfo retval;
+    retval.sceneResolution = {contentRegionSize.x,contentRegionSize.y};
+    retval.isGizmoWindowHovered = ImGui::IsWindowHovered();
+
+    const float viewManipulateRight = cursorPos.x + contentRegionSize.x;
+    const float viewManipulateTop = cursorPos.y;
+
+    if (params.useWindow)
+    {
+        // set NoMove for the next `Begin` while the hovered window has the cursor inside of its inner rectangle
+        ImGuiWindow* window = ImGui::GetCurrentWindow();
+        gizmoWindowFlags = (ImGui::IsWindowHovered() && ImGui::IsMouseHoveringRect(window->InnerRect.Min, window->InnerRect.Max) ? ImGuiWindowFlags_NoMove : 0);
+    }
+
+    ImGuizmo::Manipulate(cameraView, cameraProjection, mCurrentGizmoOperation, mCurrentGizmoMode, matrix, nullptr, useSnap ? &snap[0] : nullptr, boundSizing ? bounds : nullptr, boundSizingSnap ? boundsSnap : nullptr);
+    retval.isGizmoBeingUsed = ImGuizmo::IsOver() || (ImGuizmo::IsUsing() && ImGui::IsMouseDown(ImGuiMouseButton_Left));
+
+    if(params.enableViewManipulate)
+        ImGuizmo::ViewManipulate(cameraView, params.camDistance, ImVec2(viewManipulateRight - 128, viewManipulateTop), ImVec2(128, 128), 0x10101010);
+
+    ImGui::End();
+    ImGui::PopStyleColor();
+
+    return retval;
+}
+
+#endif // _NBL_THIS_EXAMPLE_TRANSFORM_H_INCLUDED_
\ No newline at end of file
diff --git a/72_SolidAngleVisualizer/main.cpp b/72_SolidAngleVisualizer/main.cpp
new file mode 100644
index 000000000..b6d723e70
--- /dev/null
+++ b/72_SolidAngleVisualizer/main.cpp
@@ -0,0 +1,1105 @@
+// Copyright (C) 2018-2020 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h + + +#include "common.hpp" +#include "app_resources/hlsl/common.hlsl" + +#include "nbl/ext/FullScreenTriangle/FullScreenTriangle.h" + +/* +Renders scene texture to an offscreen framebuffer whose color attachment is then sampled into a imgui window. + +Written with Nabla's UI extension and got integrated with ImGuizmo to handle scene's object translations. +*/ +class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinResourcesApplication +{ + using device_base_t = MonoWindowApplication; + using asset_base_t = BuiltinResourcesApplication; + + inline static std::string SolidAngleVisShaderPath = "app_resources/hlsl/SolidAngleVis.frag.hlsl"; +public: + inline SolidAngleVisualizer(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) + : IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD), + device_base_t({ 2048,1024 }, EF_UNKNOWN, _localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) { + } + + inline bool onAppInitialized(smart_refctd_ptr&& system) override + { + if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system))) + return false; + if (!device_base_t::onAppInitialized(smart_refctd_ptr(system))) + return false; + + m_semaphore = m_device->createSemaphore(m_realFrameIx); + if (!m_semaphore) + return logFail("Failed to Create a Semaphore!"); + + auto pool = m_device->createCommandPool(getGraphicsQueue()->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + for (auto i = 0u; i < MaxFramesInFlight; i++) + { + if (!pool) + return logFail("Couldn't create Command Pool!"); + if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { m_cmdBufs.data() + i,1 })) + return logFail("Couldn't create Command Buffer!"); + } + + const uint32_t addtionalBufferOwnershipFamilies[] = { getGraphicsQueue()->getFamilyIndex() }; + 
m_scene = CGeometryCreatorScene::create( + { + .transferQueue = getTransferUpQueue(), + .utilities = m_utils.get(), + .logger = m_logger.get(), + .addtionalBufferOwnershipFamilies = addtionalBufferOwnershipFamilies + }, + CSimpleDebugRenderer::DefaultPolygonGeometryPatch + ); + + // for the scene drawing pass + { + IGPURenderpass::SCreationParams params = {}; + const IGPURenderpass::SCreationParams::SDepthStencilAttachmentDescription depthAttachments[] = { + {{ + { + .format = sceneRenderDepthFormat, + .samples = IGPUImage::ESCF_1_BIT, + .mayAlias = false + }, + /*.loadOp =*/ {IGPURenderpass::LOAD_OP::CLEAR}, + /*.storeOp =*/ {IGPURenderpass::STORE_OP::STORE}, + /*.initialLayout =*/ {IGPUImage::LAYOUT::UNDEFINED}, + /*.finalLayout =*/ {IGPUImage::LAYOUT::ATTACHMENT_OPTIMAL} + }}, + IGPURenderpass::SCreationParams::DepthStencilAttachmentsEnd + }; + params.depthStencilAttachments = depthAttachments; + const IGPURenderpass::SCreationParams::SColorAttachmentDescription colorAttachments[] = { + {{ + { + .format = finalSceneRenderFormat, + .samples = IGPUImage::E_SAMPLE_COUNT_FLAGS::ESCF_1_BIT, + .mayAlias = false + }, + /*.loadOp =*/ IGPURenderpass::LOAD_OP::CLEAR, + /*.storeOp =*/ IGPURenderpass::STORE_OP::STORE, + /*.initialLayout =*/ IGPUImage::LAYOUT::UNDEFINED, + /*.finalLayout =*/ IGPUImage::LAYOUT::READ_ONLY_OPTIMAL // ImGUI shall read + }}, + IGPURenderpass::SCreationParams::ColorAttachmentsEnd + }; + params.colorAttachments = colorAttachments; + IGPURenderpass::SCreationParams::SSubpassDescription subpasses[] = { + {}, + IGPURenderpass::SCreationParams::SubpassesEnd + }; + subpasses[0].depthStencilAttachment = { {.render = {.attachmentIndex = 0,.layout = IGPUImage::LAYOUT::ATTACHMENT_OPTIMAL}} }; + subpasses[0].colorAttachments[0] = { .render = {.attachmentIndex = 0,.layout = IGPUImage::LAYOUT::ATTACHMENT_OPTIMAL} }; + params.subpasses = subpasses; + + const static IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] = { + // wipe-transition of 
Color to ATTACHMENT_OPTIMAL and depth + { + .srcSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, + .dstSubpass = 0, + .memoryBarrier = { + // last place where the depth can get modified in previous frame, `COLOR_ATTACHMENT_OUTPUT_BIT` is implicitly later + // while color is sampled by ImGUI + .srcStageMask = PIPELINE_STAGE_FLAGS::LATE_FRAGMENT_TESTS_BIT | PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT, + // don't want any writes to be available, as we are clearing both attachments + .srcAccessMask = ACCESS_FLAGS::NONE, + // destination needs to wait as early as possible + // TODO: `COLOR_ATTACHMENT_OUTPUT_BIT` shouldn't be needed, because its a logically later stage, see TODO in `ECommonEnums.h` + .dstStageMask = PIPELINE_STAGE_FLAGS::EARLY_FRAGMENT_TESTS_BIT | PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, + // because depth and color get cleared first no read mask + .dstAccessMask = ACCESS_FLAGS::DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT + } + // leave view offsets and flags default + }, + { + .srcSubpass = 0, + .dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, + .memoryBarrier = { + // last place where the color can get modified, depth is implicitly earlier + .srcStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, + // only write ops, reads can't be made available, also won't be using depth so don't care about it being visible to anyone else + .srcAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT, + // the ImGUI will sample the color, then next frame we overwrite both attachments + .dstStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT | PIPELINE_STAGE_FLAGS::EARLY_FRAGMENT_TESTS_BIT, + // but we only care about the availability-visibility chain between renderpass and imgui + .dstAccessMask = ACCESS_FLAGS::SAMPLED_READ_BIT + } + // leave view offsets and flags default + }, + IGPURenderpass::SCreationParams::DependenciesEnd + }; + params.dependencies = 
dependencies; + auto solidAngleRenderpassParams = params; + m_mainRenderpass = m_device->createRenderpass(std::move(params)); + if (!m_mainRenderpass) + return logFail("Failed to create Main Renderpass!"); + + m_solidAngleRenderpass = m_device->createRenderpass(std::move(solidAngleRenderpassParams)); + if (!m_solidAngleRenderpass) + return logFail("Failed to create Solid Angle Renderpass!"); + + } + + const auto& geometries = m_scene->getInitParams().geometries; + m_renderer = CSimpleDebugRenderer::create(m_assetMgr.get(), m_solidAngleRenderpass.get(), 0, { &geometries.front().get(),geometries.size() }); + // special case + { + const auto& pipelines = m_renderer->getInitParams().pipelines; + auto ix = 0u; + for (const auto& name : m_scene->getInitParams().geometryNames) + { + if (name == "Cone") + m_renderer->getGeometry(ix).pipeline = pipelines[CSimpleDebugRenderer::SInitParams::PipelineType::Cone]; + ix++; + } + } + // we'll only display one thing at a time + m_renderer->m_instances.resize(1); + + // Create graphics pipeline + { + auto loadAndCompileHLSLShader = [&](const std::string& pathToShader, const std::string& defineMacro = "") -> smart_refctd_ptr + { + IAssetLoader::SAssetLoadParams lp = {}; + lp.workingDirectory = localInputCWD; + auto assetBundle = m_assetMgr->getAsset(pathToShader, lp); + const auto assets = assetBundle.getContents(); + if (assets.empty()) + { + m_logger->log("Could not load shader: ", ILogger::ELL_ERROR, pathToShader); + std::exit(-1); + } + + auto source = smart_refctd_ptr_static_cast(assets[0]); + // The down-cast should not fail! 
+ assert(source); + + auto compiler = make_smart_refctd_ptr(smart_refctd_ptr(m_system)); + CHLSLCompiler::SOptions options = {}; + options.stage = IShader::E_SHADER_STAGE::ESS_FRAGMENT; + options.preprocessorOptions.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion; + options.spirvOptimizer = nullptr; +#ifndef _NBL_DEBUG + ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO; + auto opt = make_smart_refctd_ptr(std::span(&optPasses, 1)); + options.spirvOptimizer = opt.get(); +#endif + options.debugInfoFlags |= IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_LINE_BIT; + options.preprocessorOptions.sourceIdentifier = source->getFilepathHint(); + options.preprocessorOptions.logger = m_logger.get(); + options.preprocessorOptions.includeFinder = compiler->getDefaultIncludeFinder(); + + core::vector defines; + if (!defineMacro.empty()) + defines.push_back({ defineMacro, "" }); + + options.preprocessorOptions.extraDefines = defines; + + source = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options); + + auto shader = m_device->compileShader({ source.get(), nullptr, nullptr, nullptr }); + if (!shader) + { + m_logger->log("HLSL shader creationed failed: %s!", ILogger::ELL_ERROR, pathToShader); + std::exit(-1); + } + + return shader; + }; + + auto scRes = static_cast(m_surface->getSwapchainResources()); + ext::FullScreenTriangle::ProtoPipeline fsTriProtoPPln(m_assetMgr.get(), m_device.get(), m_logger.get()); + if (!fsTriProtoPPln) + return logFail("Failed to create Full Screen Triangle protopipeline or load its vertex shader!"); + + // Load Fragment Shader + auto fragmentShader = loadAndCompileHLSLShader(SolidAngleVisShaderPath); + if (!fragmentShader) + return logFail("Failed to Load and Compile Fragment Shader: lumaMeterShader!"); + + const IGPUPipelineBase::SShaderSpecInfo fragSpec = { + .shader = fragmentShader.get(), + .entryPoint = "main" + }; + + const asset::SPushConstantRange ranges[] = 
{ { + .stageFlags = hlsl::ShaderStage::ESS_FRAGMENT, + .offset = 0, + .size = sizeof(PushConstants) + } }; + + auto visualizationLayout = m_device->createPipelineLayout( + ranges, + nullptr, + nullptr, + nullptr, + nullptr + ); + m_visualizationPipeline = fsTriProtoPPln.createPipeline(fragSpec, visualizationLayout.get(), m_solidAngleRenderpass.get()); + if (!m_visualizationPipeline) + return logFail("Could not create Graphics Pipeline!"); + + } + + // Create ImGUI + { + auto scRes = static_cast(m_surface->getSwapchainResources()); + ext::imgui::UI::SCreationParameters params = {}; + params.resources.texturesInfo = { .setIx = 0u,.bindingIx = TexturesImGUIBindingIndex }; + params.resources.samplersInfo = { .setIx = 0u,.bindingIx = 1u }; + params.utilities = m_utils; + params.transfer = getTransferUpQueue(); + params.pipelineLayout = ext::imgui::UI::createDefaultPipelineLayout(m_utils->getLogicalDevice(), params.resources.texturesInfo, params.resources.samplersInfo, MaxImGUITextures); + params.assetManager = make_smart_refctd_ptr(smart_refctd_ptr(m_system)); + params.renderpass = smart_refctd_ptr(scRes->getRenderpass()); + params.subpassIx = 0u; + params.pipelineCache = nullptr; + interface.imGUI = ext::imgui::UI::create(std::move(params)); + if (!interface.imGUI) + return logFail("Failed to create `nbl::ext::imgui::UI` class"); + } + + // create rest of User Interface + { + auto* imgui = interface.imGUI.get(); + // create the suballocated descriptor set + { + // note that we use default layout provided by our extension, but you are free to create your own by filling ext::imgui::UI::S_CREATION_PARAMETERS::resources + const auto* layout = interface.imGUI->getPipeline()->getLayout()->getDescriptorSetLayout(0u); + auto pool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT, { &layout,1 }); + auto ds = pool->createDescriptorSet(smart_refctd_ptr(layout)); + interface.subAllocDS = make_smart_refctd_ptr(std::move(ds)); + 
if (!interface.subAllocDS) + return logFail("Failed to create the descriptor set"); + // make sure Texture Atlas slot is taken for eternity + { + auto dummy = SubAllocatedDescriptorSet::invalid_value; + interface.subAllocDS->multi_allocate(0, 1, &dummy); + assert(dummy == ext::imgui::UI::FontAtlasTexId); + } + // write constant descriptors, note we don't create info & write pair for the samplers because UI extension's are immutable and baked into DS layout + IGPUDescriptorSet::SDescriptorInfo info = {}; + info.desc = smart_refctd_ptr(interface.imGUI->getFontAtlasView()); + info.info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL; + const IGPUDescriptorSet::SWriteDescriptorSet write = { + .dstSet = interface.subAllocDS->getDescriptorSet(), + .binding = TexturesImGUIBindingIndex, + .arrayElement = ext::imgui::UI::FontAtlasTexId, + .count = 1, + .info = &info + }; + if (!m_device->updateDescriptorSets({ &write,1 }, {})) + return logFail("Failed to write the descriptor set"); + } + imgui->registerListener([this]() {interface(); }); + } + + interface.camera.mapKeysToWASD(); + + onAppInitializedFinish(); + return true; + } + + // + virtual inline bool onAppTerminated() + { + SubAllocatedDescriptorSet::value_type fontAtlasDescIx = ext::imgui::UI::FontAtlasTexId; + IGPUDescriptorSet::SDropDescriptorSet dummy[1]; + interface.subAllocDS->multi_deallocate(dummy, TexturesImGUIBindingIndex, 1, &fontAtlasDescIx); + return device_base_t::onAppTerminated(); + } + + inline IQueue::SSubmitInfo::SSemaphoreInfo renderFrame(const std::chrono::microseconds nextPresentationTimestamp) override + { + // CPU events + update(nextPresentationTimestamp); + + const auto& virtualWindowRes = interface.transformReturnInfo.sceneResolution; + // TODO: check main frame buffer too + if (!m_solidAngleViewFramebuffer || m_solidAngleViewFramebuffer->getCreationParameters().width != virtualWindowRes[0] || m_solidAngleViewFramebuffer->getCreationParameters().height != virtualWindowRes[1]) + 
recreateFramebuffer(virtualWindowRes); + + // + const auto resourceIx = m_realFrameIx % MaxFramesInFlight; + + auto* const cb = m_cmdBufs.data()[resourceIx].get(); + cb->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); + cb->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + // clear to black for both things + const IGPUCommandBuffer::SClearColorValue clearValue = { .float32 = {0.f,0.f,0.f,1.f} }; + if (m_solidAngleViewFramebuffer) + { + cb->beginDebugMarker("Draw Circle View Frame"); + { + const IGPUCommandBuffer::SClearDepthStencilValue farValue = { .depth = 0.f }; + const IGPUCommandBuffer::SRenderpassBeginInfo renderpassInfo = + { + .framebuffer = m_solidAngleViewFramebuffer.get(), + .colorClearValues = &clearValue, + .depthStencilClearValues = &farValue, + .renderArea = { + .offset = {0,0}, + .extent = {virtualWindowRes[0],virtualWindowRes[1]} + } + }; + beginRenderpass(cb, renderpassInfo); + } + // draw scene + { + PushConstants pc{ + .modelMatrix = hlsl::float32_t3x4(hlsl::transpose(interface.m_OBBModelMatrix)), + .viewport = { 0.f,0.f,static_cast(virtualWindowRes[0]),static_cast(virtualWindowRes[1]) } + }; + auto pipeline = m_visualizationPipeline; + cb->bindGraphicsPipeline(pipeline.get()); + cb->pushConstants(pipeline->getLayout(), hlsl::ShaderStage::ESS_FRAGMENT, 0, sizeof(PushConstants), &pc); + //cb->bindDescriptorSets(nbl::asset::EPBP_GRAPHICS, pipeline->getLayout(), 3, 1, &ds); + ext::FullScreenTriangle::recordDrawCall(cb); + } + cb->endRenderPass(); + cb->endDebugMarker(); + } + // draw main view + if (m_mainViewFramebuffer) + { + cb->beginDebugMarker("Main Scene Frame"); + { + const IGPUCommandBuffer::SClearDepthStencilValue farValue = { .depth = 0.f }; + const IGPUCommandBuffer::SRenderpassBeginInfo renderpassInfo = + { + .framebuffer = m_mainViewFramebuffer.get(), + .colorClearValues = &clearValue, + .depthStencilClearValues = &farValue, + .renderArea = { + .offset = {0,0}, + .extent = {virtualWindowRes[0],virtualWindowRes[1]} + 
} + }; + beginRenderpass(cb, renderpassInfo); + } + // draw scene + { + float32_t3x4 viewMatrix; + float32_t4x4 viewProjMatrix; + // TODO: get rid of legacy matrices + { + const auto& camera = interface.camera; + memcpy(&viewMatrix, camera.getViewMatrix().pointer(), sizeof(viewMatrix)); + memcpy(&viewProjMatrix, camera.getConcatenatedMatrix().pointer(), sizeof(viewProjMatrix)); + } + const auto viewParams = CSimpleDebugRenderer::SViewParams(viewMatrix, viewProjMatrix); + + // tear down scene every frame + auto& instance = m_renderer->m_instances[0]; + auto transposed = hlsl::transpose(interface.m_OBBModelMatrix); + memcpy(&instance.world, &transposed, sizeof(instance.world)); + instance.packedGeo = m_renderer->getGeometries().data();// +interface.gcIndex; + m_renderer->render(cb, viewParams); // draw the cube/OBB + + + // TODO: a better way to get identity matrix + float32_t3x4 origin = { + 0.2f,0.0f,0.0f,0.0f, + 0.0f,0.2f,0.0f,0.0f, + 0.0f,0.0f,0.2f,0.0f + }; + memcpy(&instance.world, &origin, sizeof(instance.world)); + instance.packedGeo = m_renderer->getGeometries().data() + 3; // sphere + m_renderer->render(cb, viewParams); + } + cb->endRenderPass(); + cb->endDebugMarker(); + } + { + cb->beginDebugMarker("SolidAngleVisualizer IMGUI Frame"); + { + auto scRes = static_cast(m_surface->getSwapchainResources()); + const IGPUCommandBuffer::SRenderpassBeginInfo renderpassInfo = + { + .framebuffer = scRes->getFramebuffer(device_base_t::getCurrentAcquire().imageIndex), + .colorClearValues = &clearValue, + .depthStencilClearValues = nullptr, + .renderArea = { + .offset = {0,0}, + .extent = {m_window->getWidth(),m_window->getHeight()} + } + }; + beginRenderpass(cb, renderpassInfo); + } + // draw ImGUI + { + auto* imgui = interface.imGUI.get(); + auto* pipeline = imgui->getPipeline(); + cb->bindGraphicsPipeline(pipeline); + // note that we use default UI pipeline layout where uiParams.resources.textures.setIx == uiParams.resources.samplers.setIx + const auto* ds = 
interface.subAllocDS->getDescriptorSet(); + cb->bindDescriptorSets(EPBP_GRAPHICS, pipeline->getLayout(), imgui->getCreationParameters().resources.texturesInfo.setIx, 1u, &ds); + // a timepoint in the future to release streaming resources for geometry + const ISemaphore::SWaitInfo drawFinished = { .semaphore = m_semaphore.get(),.value = m_realFrameIx + 1u }; + if (!imgui->render(cb, drawFinished)) + { + m_logger->log("TODO: need to present acquired image before bailing because its already acquired.", ILogger::ELL_ERROR); + return {}; + } + } + cb->endRenderPass(); + cb->endDebugMarker(); + } + cb->end(); + + IQueue::SSubmitInfo::SSemaphoreInfo retval = + { + .semaphore = m_semaphore.get(), + .value = ++m_realFrameIx, + .stageMask = PIPELINE_STAGE_FLAGS::ALL_GRAPHICS_BITS + }; + const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[] = + { + {.cmdbuf = cb } + }; + const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = { + { + .semaphore = device_base_t::getCurrentAcquire().semaphore, + .value = device_base_t::getCurrentAcquire().acquireCount, + .stageMask = PIPELINE_STAGE_FLAGS::NONE + } + }; + const IQueue::SSubmitInfo infos[] = + { + { + .waitSemaphores = acquired, + .commandBuffers = commandBuffers, + .signalSemaphores = {&retval,1} + } + }; + + if (getGraphicsQueue()->submit(infos) != IQueue::RESULT::SUCCESS) + { + retval.semaphore = nullptr; // so that we don't wait on semaphore that will never signal + m_realFrameIx--; + } + + + m_window->setCaption("[Nabla Engine] UI App Test Demo"); + return retval; + } + +protected: + const video::IGPURenderpass::SCreationParams::SSubpassDependency* getDefaultSubpassDependencies() const override + { + // Subsequent submits don't wait for each other, but they wait for acquire and get waited on by present + const static IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] = { + // don't want any writes to be available, we'll clear, only thing to worry about is the layout transition + { + .srcSubpass = 
IGPURenderpass::SCreationParams::SSubpassDependency::External, + .dstSubpass = 0, + .memoryBarrier = { + .srcStageMask = PIPELINE_STAGE_FLAGS::NONE, // should sync against the semaphore wait anyway + .srcAccessMask = ACCESS_FLAGS::NONE, + // layout transition needs to finish before the color write + .dstStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, + .dstAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT + } + // leave view offsets and flags default + }, + // want layout transition to begin after all color output is done + { + .srcSubpass = 0, + .dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, + .memoryBarrier = { + // last place where the color can get modified, depth is implicitly earlier + .srcStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, + // only write ops, reads can't be made available + .srcAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT + // spec says nothing is needed when presentation is the destination + } + // leave view offsets and flags default + }, + IGPURenderpass::SCreationParams::DependenciesEnd + }; + return dependencies; + } + +private: + inline void update(const std::chrono::microseconds nextPresentationTimestamp) + { + auto& camera = interface.camera; + camera.setMoveSpeed(interface.moveSpeed); + camera.setRotateSpeed(interface.rotateSpeed); + + + m_inputSystem->getDefaultMouse(&mouse); + m_inputSystem->getDefaultKeyboard(&keyboard); + + struct + { + std::vector mouse{}; + std::vector keyboard{}; + } uiEvents; + + // TODO: should be a member really + static std::chrono::microseconds previousEventTimestamp{}; + + // I think begin/end should always be called on camera, just events shouldn't be fed, why? 
+ // If you stop begin/end, whatever keys were up/down get their up/down values frozen leading to + // `perActionDt` becoming obnoxiously large the first time the even processing resumes due to + // `timeDiff` being computed since `lastVirtualUpTimeStamp` + camera.beginInputProcessing(nextPresentationTimestamp); + { + mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void + { + if (interface.move) + camera.mouseProcess(events); // don't capture the events, only let camera handle them with its impl + + for (const auto& e : events) // here capture + { + if (e.timeStamp < previousEventTimestamp) + continue; + + previousEventTimestamp = e.timeStamp; + uiEvents.mouse.emplace_back(e); + + //if (e.type == nbl::ui::SMouseEvent::EET_SCROLL && m_renderer) + //{ + // interface.gcIndex += int16_t(core::sign(e.scrollEvent.verticalScroll)); + // interface.gcIndex = core::clamp(interface.gcIndex, 0ull, m_renderer->getGeometries().size() - 1); + //} + } + }, + m_logger.get() + ); + keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void + { + //if (interface.move) + camera.keyboardProcess(events); // don't capture the events, only let camera handle them with its impl + + for (const auto& e : events) // here capture + { + if (e.timeStamp < previousEventTimestamp) + continue; + + previousEventTimestamp = e.timeStamp; + uiEvents.keyboard.emplace_back(e); + } + }, + m_logger.get() + ); + } + camera.endInputProcessing(nextPresentationTimestamp); + + const auto cursorPosition = m_window->getCursorControl()->getPosition(); + + ext::imgui::UI::SUpdateParameters params = + { + .mousePosition = float32_t2(cursorPosition.x,cursorPosition.y) - float32_t2(m_window->getX(),m_window->getY()), + .displaySize = {m_window->getWidth(),m_window->getHeight()}, + .mouseEvents = uiEvents.mouse, + .keyboardEvents = uiEvents.keyboard + }; + + //interface.objectName = m_scene->getInitParams().geometryNames[interface.gcIndex]; + interface.imGUI->update(params); + 
} + + void recreateFramebuffer(const uint16_t2 resolution) + { + auto createImageAndView = [&](E_FORMAT format)->smart_refctd_ptr + { + auto image = m_device->createImage({ { + .type = IGPUImage::ET_2D, + .samples = IGPUImage::ESCF_1_BIT, + .format = format, + .extent = {resolution.x,resolution.y,1}, + .mipLevels = 1, + .arrayLayers = 1, + .usage = IGPUImage::EUF_RENDER_ATTACHMENT_BIT | IGPUImage::EUF_SAMPLED_BIT + } }); + if (!m_device->allocate(image->getMemoryReqs(), image.get()).isValid()) + return nullptr; + IGPUImageView::SCreationParams params = { + .image = std::move(image), + .viewType = IGPUImageView::ET_2D, + .format = format + }; + params.subresourceRange.aspectMask = isDepthOrStencilFormat(format) ? IGPUImage::EAF_DEPTH_BIT : IGPUImage::EAF_COLOR_BIT; + return m_device->createImageView(std::move(params)); + }; + + smart_refctd_ptr solidAngleView; + smart_refctd_ptr mainView; + // detect window minimization + if (resolution.x < 0x4000 && resolution.y < 0x4000) + { + solidAngleView = createImageAndView(finalSceneRenderFormat); + auto solidAngleDepthView = createImageAndView(sceneRenderDepthFormat); + m_solidAngleViewFramebuffer = m_device->createFramebuffer({ { + .renderpass = m_solidAngleRenderpass, + .depthStencilAttachments = &solidAngleDepthView.get(), + .colorAttachments = &solidAngleView.get(), + .width = resolution.x, + .height = resolution.y + } }); + + mainView = createImageAndView(finalSceneRenderFormat); + auto mainDepthView = createImageAndView(sceneRenderDepthFormat); + m_mainViewFramebuffer = m_device->createFramebuffer({ { + .renderpass = m_mainRenderpass, + .depthStencilAttachments = &mainDepthView.get(), + .colorAttachments = &mainView.get(), + .width = resolution.x, + .height = resolution.y + } }); + + } + else + { + m_solidAngleViewFramebuffer = nullptr; + m_mainViewFramebuffer = nullptr; + } + + // release previous slot and its image + interface.subAllocDS->multi_deallocate(0, static_cast(CInterface::Count), 
interface.renderColorViewDescIndices, { .semaphore = m_semaphore.get(),.value = m_realFrameIx }); + // + if (solidAngleView) + { + interface.subAllocDS->multi_allocate(0, static_cast(CInterface::Count), interface.renderColorViewDescIndices); + // update descriptor set + IGPUDescriptorSet::SDescriptorInfo infos[static_cast(CInterface::Count)] = {}; + infos[0].desc = solidAngleView; + infos[0].info.image.imageLayout = IGPUImage::LAYOUT::READ_ONLY_OPTIMAL; + infos[1].desc = mainView; + infos[1].info.image.imageLayout = IGPUImage::LAYOUT::READ_ONLY_OPTIMAL; + const IGPUDescriptorSet::SWriteDescriptorSet write[static_cast(CInterface::Count)] = { + {.dstSet = interface.subAllocDS->getDescriptorSet(), + .binding = TexturesImGUIBindingIndex, + .arrayElement = interface.renderColorViewDescIndices[static_cast(CInterface::ERV_SOLID_ANGLE_VIEW)], + .count = 1, + .info = &infos[static_cast(CInterface::ERV_MAIN_VIEW)] + }, + { + .dstSet = interface.subAllocDS->getDescriptorSet(), + .binding = TexturesImGUIBindingIndex, + .arrayElement = interface.renderColorViewDescIndices[static_cast(CInterface::ERV_MAIN_VIEW)], + .count = 1, + .info = &infos[1] + } + }; + m_device->updateDescriptorSets({ write, static_cast(CInterface::Count) }, {}); + } + interface.transformParams.sceneTexDescIx = interface.renderColorViewDescIndices[CInterface::ERV_MAIN_VIEW]; + } + + inline void beginRenderpass(IGPUCommandBuffer* cb, const IGPUCommandBuffer::SRenderpassBeginInfo& info) + { + cb->beginRenderPass(info, IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE); + cb->setScissor(0, 1, &info.renderArea); + const SViewport viewport = { + .x = 0, + .y = 0, + .width = static_cast(info.renderArea.extent.width), + .height = static_cast(info.renderArea.extent.height) + }; + cb->setViewport(0u, 1u, &viewport); + } + + // Maximum frames which can be simultaneously submitted, used to cycle through our per-frame resources like command buffers + constexpr static inline uint32_t MaxFramesInFlight = 3u; + constexpr static 
inline auto sceneRenderDepthFormat = EF_D32_SFLOAT; + constexpr static inline auto finalSceneRenderFormat = EF_R8G8B8A8_SRGB; + constexpr static inline auto TexturesImGUIBindingIndex = 0u; + // we create the Descriptor Set with a few slots extra to spare, so we don't have to `waitIdle` the device whenever ImGUI virtual window resizes + constexpr static inline auto MaxImGUITextures = 2u + MaxFramesInFlight; + + // + smart_refctd_ptr m_scene; + smart_refctd_ptr m_solidAngleRenderpass; + smart_refctd_ptr m_mainRenderpass; + smart_refctd_ptr m_renderer; + smart_refctd_ptr m_solidAngleViewFramebuffer; + smart_refctd_ptr m_mainViewFramebuffer; + smart_refctd_ptr m_visualizationPipeline; + // + smart_refctd_ptr m_semaphore; + uint64_t m_realFrameIx = 0; + std::array, MaxFramesInFlight> m_cmdBufs; + // + InputSystem::ChannelReader mouse; + InputSystem::ChannelReader keyboard; + // UI stuff + struct CInterface + { + void cameraToHome() + { + core::vectorSIMDf cameraPosition(-3.0f, 3.0f, 6.0f); + core::vectorSIMDf cameraTarget(0.f, 0.f, 6.f); + const static core::vectorSIMDf up(0.f, 1.f, 0.f); + + camera.setPosition(cameraPosition); + camera.setTarget(cameraTarget); + camera.setBackupUpVector(up); + + camera.recomputeViewMatrix(); + } + + void operator()() + { + ImGuiIO& io = ImGui::GetIO(); + + // TODO: why is this a lambda and not just an assignment in a scope ? 
+ camera.setProjectionMatrix([&]() + { + matrix4SIMD projection; + + if (isPerspective) + if (isLH) + projection = matrix4SIMD::buildProjectionMatrixPerspectiveFovLH(core::radians(fov), io.DisplaySize.x / io.DisplaySize.y, zNear, zFar); + else + projection = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH(core::radians(fov), io.DisplaySize.x / io.DisplaySize.y, zNear, zFar); + else + { + float viewHeight = viewWidth * io.DisplaySize.y / io.DisplaySize.x; + + if (isLH) + projection = matrix4SIMD::buildProjectionMatrixOrthoLH(viewWidth, viewHeight, zNear, zFar); + else + projection = matrix4SIMD::buildProjectionMatrixOrthoRH(viewWidth, viewHeight, zNear, zFar); + } + + return projection; + }()); + + ImGuizmo::SetOrthographic(false); + ImGuizmo::BeginFrame(); + + ImGui::SetNextWindowPos(ImVec2(1024, 100), ImGuiCond_Appearing); + ImGui::SetNextWindowSize(ImVec2(256, 256), ImGuiCond_Appearing); + + // create a window and insert the inspector + ImGui::SetNextWindowPos(ImVec2(10, 10), ImGuiCond_Appearing); + ImGui::SetNextWindowSize(ImVec2(320, 340), ImGuiCond_Appearing); + ImGui::Begin("Editor"); + + //if (ImGui::RadioButton("Full view", !transformParams.useWindow)) + // transformParams.useWindow = false; + + //ImGui::SameLine(); + + //if (ImGui::RadioButton("Window", transformParams.useWindow)) + // transformParams.useWindow = true; + + ImGui::Text("Camera"); + bool viewDirty = false; + + if (ImGui::RadioButton("LH", isLH)) + isLH = true; + + ImGui::SameLine(); + + if (ImGui::RadioButton("RH", !isLH)) + isLH = false; + + if (ImGui::RadioButton("Perspective", isPerspective)) + isPerspective = true; + + ImGui::SameLine(); + + if (ImGui::RadioButton("Orthographic", !isPerspective)) + isPerspective = false; + + ImGui::Checkbox("Enable \"view manipulate\"", &transformParams.enableViewManipulate); + //ImGui::Checkbox("Enable camera movement", &move); + ImGui::SliderFloat("Move speed", &moveSpeed, 0.1f, 10.f); + ImGui::SliderFloat("Rotate speed", &rotateSpeed, 0.1f, 10.f); 
+ + // ImGui::Checkbox("Flip Gizmo's Y axis", &flipGizmoY); // let's not expose it to be changed in UI but keep the logic in case + + if (isPerspective) + ImGui::SliderFloat("Fov", &fov, 20.f, 150.f); + else + ImGui::SliderFloat("Ortho width", &viewWidth, 1, 20); + + ImGui::SliderFloat("zNear", &zNear, 0.1f, 100.f); + ImGui::SliderFloat("zFar", &zFar, 110.f, 10000.f); + + viewDirty |= ImGui::SliderFloat("Distance", &transformParams.camDistance, 1.f, 69.f); + + if (viewDirty || firstFrame) + { + cameraToHome(); + } + firstFrame = false; + + ImGui::Text("X: %f Y: %f", io.MousePos.x, io.MousePos.y); + if (ImGuizmo::IsUsing()) + { + ImGui::Text("Using gizmo"); + } + else + { + ImGui::Text(ImGuizmo::IsOver() ? "Over gizmo" : ""); + ImGui::SameLine(); + ImGui::Text(ImGuizmo::IsOver(ImGuizmo::TRANSLATE) ? "Over translate gizmo" : ""); + ImGui::SameLine(); + ImGui::Text(ImGuizmo::IsOver(ImGuizmo::ROTATE) ? "Over rotate gizmo" : ""); + ImGui::SameLine(); + ImGui::Text(ImGuizmo::IsOver(ImGuizmo::SCALE) ? 
"Over scale gizmo" : ""); + } + ImGui::Separator(); + + /* + * ImGuizmo expects view & perspective matrix to be column major both with 4x4 layout + * and Nabla uses row major matricies - 3x4 matrix for view & 4x4 for projection + + - VIEW: + + ImGuizmo + + | X[0] Y[0] Z[0] 0.0f | + | X[1] Y[1] Z[1] 0.0f | + | X[2] Y[2] Z[2] 0.0f | + | -Dot(X, eye) -Dot(Y, eye) -Dot(Z, eye) 1.0f | + + Nabla + + | X[0] X[1] X[2] -Dot(X, eye) | + | Y[0] Y[1] Y[2] -Dot(Y, eye) | + | Z[0] Z[1] Z[2] -Dot(Z, eye) | + + = transpose(nbl::core::matrix4SIMD()) + + - PERSPECTIVE [PROJECTION CASE]: + + ImGuizmo + + | (temp / temp2) (0.0) (0.0) (0.0) | + | (0.0) (temp / temp3) (0.0) (0.0) | + | ((right + left) / temp2) ((top + bottom) / temp3) ((-zfar - znear) / temp4) (-1.0f) | + | (0.0) (0.0) ((-temp * zfar) / temp4) (0.0) | + + Nabla + + | w (0.0) (0.0) (0.0) | + | (0.0) -h (0.0) (0.0) | + | (0.0) (0.0) (-zFar/(zFar-zNear)) (-zNear*zFar/(zFar-zNear)) | + | (0.0) (0.0) (-1.0) (0.0) | + + = transpose() + + * + * the ViewManipulate final call (inside EditTransform) returns world space column major matrix for an object, + * note it also modifies input view matrix but projection matrix is immutable + */ + + if (ImGui::IsKeyPressed(ImGuiKey_Home)) + { + cameraToHome(); + } + + if (ImGui::IsKeyPressed(ImGuiKey_End)) + { + m_OBBModelMatrix = { + 1.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 1.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 1.0f, 0.0f, + 0.0f, 0.0f, 12.0f, 1.0f + }; + } + + static struct + { + float32_t4x4 view, projection, model; + } imguizmoM16InOut; + + ImGuizmo::SetID(0u); + + // TODO: camera will return hlsl::float32_tMxN + auto view = *reinterpret_cast(camera.getViewMatrix().pointer()); + imguizmoM16InOut.view = hlsl::transpose(getMatrix3x4As4x4(view)); + + // TODO: camera will return hlsl::float32_tMxN + imguizmoM16InOut.projection = hlsl::transpose(*reinterpret_cast(camera.getProjectionMatrix().pointer())); + imguizmoM16InOut.model = m_OBBModelMatrix; + + { + if (flipGizmoY) // note we allow to flip gizmo 
just to match our coordinates + imguizmoM16InOut.projection[1][1] *= -1.f; // https://johannesugb.github.io/gpu-programming/why-do-opengl-proj-matrices-fail-in-vulkan/ + + transformParams.editTransformDecomposition = true; + transformReturnInfo = EditTransform(&imguizmoM16InOut.view[0][0], &imguizmoM16InOut.projection[0][0], &imguizmoM16InOut.model[0][0], transformParams); + + // TODO: camera stops when cursor hovers gizmo, but we also want to stop when gizmo is being used + move = (ImGui::IsMouseDown(ImGuiMouseButton_Left) || transformReturnInfo.isGizmoWindowHovered) && (!transformReturnInfo.isGizmoBeingUsed); + } + + // to Nabla + update camera & model matrices + // TODO: make it more nicely, extract: + // - Position by computing inverse of the view matrix and grabbing its translation + // - Target from 3rd row without W component of view matrix multiplied by some arbitrary distance value (can be the length of position from origin) and adding the position + // But then set the view matrix this way anyway, because up-vector may not be compatible + //const auto& view = camera.getViewMatrix(); + //const_cast(view) = core::transpose(imguizmoM16InOut.view).extractSub3x4(); // a hack, correct way would be to use inverse matrix and get position + target because now it will bring you back to last position & target when switching from gizmo move to manual move (but from manual to gizmo is ok) + m_OBBModelMatrix = imguizmoM16InOut.model; + + // object meta display + //{ + // ImGui::Begin("Object"); + // ImGui::Text("type: \"%s\"", objectName.data()); + // ImGui::End(); + //} + + // solid angle view window + { + ImGui::SetNextWindowSize(ImVec2(800, 800), ImGuiCond_Appearing); + ImGui::SetNextWindowPos(ImVec2(1240, 20), ImGuiCond_Appearing); + static bool isOpen = true; + ImGui::Begin("Solid angle view", &isOpen, 0); + + ImVec2 contentRegionSize = ImGui::GetContentRegionAvail(); + ImGui::Image({ renderColorViewDescIndices[ERV_SOLID_ANGLE_VIEW] }, contentRegionSize); + 
ImGui::End(); + } + + // view matrices editor + { + ImGui::Begin("Matrices"); + + auto addMatrixTable = [&](const char* topText, const char* tableName, const int rows, const int columns, const float* pointer, const bool withSeparator = true) + { + ImGui::Text(topText); + if (ImGui::BeginTable(tableName, columns)) + { + for (int y = 0; y < rows; ++y) + { + ImGui::TableNextRow(); + for (int x = 0; x < columns; ++x) + { + ImGui::TableSetColumnIndex(x); + ImGui::Text("%.3f", *(pointer + (y * columns) + x)); + } + } + ImGui::EndTable(); + } + + if (withSeparator) + ImGui::Separator(); + }; + + addMatrixTable("Model Matrix", "ModelMatrixTable", 4, 4, &m_OBBModelMatrix[0][0]); + addMatrixTable("Camera View Matrix", "ViewMatrixTable", 3, 4, camera.getViewMatrix().pointer()); + addMatrixTable("Camera View Projection Matrix", "ViewProjectionMatrixTable", 4, 4, camera.getProjectionMatrix().pointer(), false); + + ImGui::End(); + } + + // Nabla Imgui backend MDI buffer info + // To be 100% accurate and not overly conservative we'd have to explicitly `cull_frees` and defragment each time, + // so unless you do that, don't use this basic info to optimize the size of your IMGUI buffer. 
+ { + auto* streaminingBuffer = imGUI->getStreamingBuffer(); + + const size_t total = streaminingBuffer->get_total_size(); // total memory range size for which allocation can be requested + const size_t freeSize = streaminingBuffer->getAddressAllocator().get_free_size(); // max total free bloock memory size we can still allocate from total memory available + const size_t consumedMemory = total - freeSize; // memory currently consumed by streaming buffer + + float freePercentage = 100.0f * (float)(freeSize) / (float)total; + float allocatedPercentage = (float)(consumedMemory) / (float)total; + + ImVec2 barSize = ImVec2(400, 30); + float windowPadding = 10.0f; + float verticalPadding = ImGui::GetStyle().FramePadding.y; + + ImGui::SetNextWindowSize(ImVec2(barSize.x + 2 * windowPadding, 110 + verticalPadding), ImGuiCond_Always); + ImGui::Begin("Nabla Imgui MDI Buffer Info", nullptr, ImGuiWindowFlags_NoResize | ImGuiWindowFlags_NoScrollbar); + + ImGui::Text("Total Allocated Size: %zu bytes", total); + ImGui::Text("In use: %zu bytes", consumedMemory); + ImGui::Text("Buffer Usage:"); + + ImGui::SetCursorPosX(windowPadding); + + if (freePercentage > 70.0f) + ImGui::PushStyleColor(ImGuiCol_PlotHistogram, ImVec4(0.0f, 1.0f, 0.0f, 0.4f)); // Green + else if (freePercentage > 30.0f) + ImGui::PushStyleColor(ImGuiCol_PlotHistogram, ImVec4(1.0f, 1.0f, 0.0f, 0.4f)); // Yellow + else + ImGui::PushStyleColor(ImGuiCol_PlotHistogram, ImVec4(1.0f, 0.0f, 0.0f, 0.4f)); // Red + + ImGui::ProgressBar(allocatedPercentage, barSize, ""); + + ImGui::PopStyleColor(); + + ImDrawList* drawList = ImGui::GetWindowDrawList(); + + ImVec2 progressBarPos = ImGui::GetItemRectMin(); + ImVec2 progressBarSize = ImGui::GetItemRectSize(); + + const char* text = "%.2f%% free"; + char textBuffer[64]; + snprintf(textBuffer, sizeof(textBuffer), text, freePercentage); + + ImVec2 textSize = ImGui::CalcTextSize(textBuffer); + ImVec2 textPos = ImVec2 + ( + progressBarPos.x + (progressBarSize.x - textSize.x) * 0.5f, 
+ progressBarPos.y + (progressBarSize.y - textSize.y) * 0.5f + ); + + ImVec4 bgColor = ImGui::GetStyleColorVec4(ImGuiCol_WindowBg); + drawList->AddRectFilled + ( + ImVec2(textPos.x - 5, textPos.y - 2), + ImVec2(textPos.x + textSize.x + 5, textPos.y + textSize.y + 2), + ImGui::GetColorU32(bgColor) + ); + + ImGui::SetCursorScreenPos(textPos); + ImGui::Text("%s", textBuffer); + + ImGui::Dummy(ImVec2(0.0f, verticalPadding)); + + ImGui::End(); + } + ImGui::End(); + } + + smart_refctd_ptr imGUI; + + // descriptor set + smart_refctd_ptr subAllocDS; + enum E_RENDER_VIEWS : uint8_t + { + ERV_MAIN_VIEW, + ERV_SOLID_ANGLE_VIEW, + Count + }; + SubAllocatedDescriptorSet::value_type renderColorViewDescIndices[E_RENDER_VIEWS::Count] = { SubAllocatedDescriptorSet::invalid_value, SubAllocatedDescriptorSet::invalid_value }; + // + Camera camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD()); + // mutables + float32_t4x4 m_OBBModelMatrix{ + 1.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 1.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 1.0f, 0.0f, + 0.0f, 0.0f, 12.0f, 1.0f + }; + + //std::string_view objectName; + TransformRequestParams transformParams; + TransformReturnInfo transformReturnInfo; + + float fov = 90.f, zNear = 0.1f, zFar = 10000.f, moveSpeed = 1.f, rotateSpeed = 1.f; + float viewWidth = 10.f; + float camYAngle = 90.f / 180.f * 3.14159f; + float camXAngle = 0.f / 180.f * 3.14159f; + //uint16_t gcIndex = {}; // note: this is dirty however since I assume only single object in scene I can leave it now, when this example is upgraded to support multiple objects this needs to be changed + bool isPerspective = true, isLH = true, flipGizmoY = true, move = true; + bool firstFrame = true; + } interface; +}; + +NBL_MAIN_FUNC(SolidAngleVisualizer) \ No newline at end of file diff --git a/72_SolidAngleVisualizer/pipeline.groovy b/72_SolidAngleVisualizer/pipeline.groovy new file mode 100644 index 000000000..7b7c9702a --- /dev/null +++ b/72_SolidAngleVisualizer/pipeline.groovy 
@@ -0,0 +1,50 @@ +import org.DevshGraphicsProgramming.Agent +import org.DevshGraphicsProgramming.BuilderInfo +import org.DevshGraphicsProgramming.IBuilder + +class CUIBuilder extends IBuilder +{ + public CUIBuilder(Agent _agent, _info) + { + super(_agent, _info) + } + + @Override + public boolean prepare(Map axisMapping) + { + return true + } + + @Override + public boolean build(Map axisMapping) + { + IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION") + IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE") + + def nameOfBuildDirectory = getNameOfBuildDirectory(buildType) + def nameOfConfig = getNameOfConfig(config) + + agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v") + + return true + } + + @Override + public boolean test(Map axisMapping) + { + return true + } + + @Override + public boolean install(Map axisMapping) + { + return true + } +} + +def create(Agent _agent, _info) +{ + return new CUIBuilder(_agent, _info) +} + +return this \ No newline at end of file diff --git a/72_SolidAngleVisualizer/src/transform.cpp b/72_SolidAngleVisualizer/src/transform.cpp new file mode 100644 index 000000000..e69de29bb diff --git a/CMakeLists.txt b/CMakeLists.txt index 574925e97..fddafdac1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -89,6 +89,7 @@ if(NBL_BUILD_EXAMPLES) add_subdirectory(70_FLIPFluids) add_subdirectory(71_RayTracingPipeline) + add_subdirectory(72_SolidAngleVisualizer) # add new examples *before* NBL_GET_ALL_TARGETS invocation, it gathers recursively all targets created so far in this subdirectory NBL_GET_ALL_TARGETS(TARGETS) From 93861bd59f85721993472e3de67f23bec6170363 Mon Sep 17 00:00:00 2001 From: Karim Mohamed Date: Sat, 6 Dec 2025 21:02:46 +0300 Subject: [PATCH 02/26] Make camera account for up direction, corrected framebuffer resolutions for both views, solid angle shader now outputs correct cube 
vertices correctly --- .../hlsl/SolidAngleVis.frag.hlsl | 157 +++++++++++------- 72_SolidAngleVisualizer/include/transform.hpp | 2 +- 72_SolidAngleVisualizer/main.cpp | 134 ++++++++------- .../include/nbl/examples/cameras/CCamera.hpp | 50 +++--- 4 files changed, 190 insertions(+), 153 deletions(-) diff --git a/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl b/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl index d783a5b37..2ad766c8a 100644 --- a/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl +++ b/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl @@ -9,7 +9,7 @@ using namespace ext::FullScreenTriangle; [[vk::push_constant]] struct PushConstants pc; -static const float CIRCLE_RADIUS = 0.45f; +static const float CIRCLE_RADIUS = 0.75f; // --- Geometry Utils --- @@ -33,17 +33,23 @@ static float3 corners[8]; static float3 faceCenters[6] = { float3(0,0,0), float3(0,0,0), float3(0,0,0), float3(0,0,0), float3(0,0,0), float3(0,0,0) }; static float2 projCorners[8]; +static bool cornerVisible[8]; // Converts UV into centered, aspect-corrected NDC circle space float2 toCircleSpace(float2 uv) { - float aspect = pc.viewport.z / pc.viewport.w; - float2 centered = uv - 0.5f; - centered.x *= aspect; - return centered; + // Map [0,1] UV to [-1,1] + float2 p = uv * 2.0f - 1.0f; + + // Correct aspect ratio + float aspect = pc.viewport.z / pc.viewport.w; // width / height + p.x *= aspect; + + return p; } + // Distance to a 2D line segment float sdSegment(float2 p, float2 a, float2 b) { @@ -54,9 +60,18 @@ float sdSegment(float2 p, float2 a, float2 b) } // TODO: Hemispherical Projection (Solid Angle / Orthographic/Lambertian Projection) -float2 project(float3 p) +bool projectToOrthoSphere(float3 p, out float2 uv) { - return normalize(p).xy; + float3 n = normalize(p); // direction to sphere + + // hemisphere (Z > 0) + if (n.z <= 0.0) + return false; + + // orthographic projection (drop Z) + uv = n.xy; + + return 
true; // valid } void computeCubeGeo() @@ -66,71 +81,72 @@ void computeCubeGeo() float3 localPos = float3(i % 2, (i / 2) % 2, (i / 4) % 2) * 2.0f - 1.0f; float3 worldPos = mul(pc.modelMatrix, float4(localPos, 1.0f)).xyz; - corners[i] = worldPos; + corners[i] = worldPos.xyz; faceCenters[i/4] += worldPos / 4.0f; faceCenters[2+i%2] += worldPos / 4.0f; faceCenters[4+(i/2)%2] += worldPos / 4.0f; - float3 viewPos = worldPos; - projCorners[i] = project(viewPos); + float3 viewPos = worldPos.xyz; + cornerVisible[i] = projectToOrthoSphere(viewPos, projCorners[i]); + projCorners[i] *= CIRCLE_RADIUS; // scale to circle radius } } -int getVisibilityCount(int2 faces, float3 cameraPos) -{ - float3x3 rotMatrix = (float3x3)pc.modelMatrix; - float3 n_world_f1 = mul(rotMatrix, localNormals[faces.x]); - float3 n_world_f2 = mul(rotMatrix, localNormals[faces.y]); +// int getVisibilityCount(int2 faces, float3 cameraPos) +// { +// float3x3 rotMatrix = (float3x3)pc.modelMatrix; +// float3 n_world_f1 = mul(rotMatrix, localNormals[faces.x]); +// float3 n_world_f2 = mul(rotMatrix, localNormals[faces.y]); - float3 viewVec_f1 = faceCenters[faces.x] - cameraPos; - float3 viewVec_f2 = faceCenters[faces.y] - cameraPos; +// float3 viewVec_f1 = faceCenters[faces.x] - cameraPos; +// float3 viewVec_f2 = faceCenters[faces.y] - cameraPos; - // Face is visible if its outward normal points towards the origin (camera). - bool visible1 = dot(n_world_f1, viewVec_f1) < 0.0f; - bool visible2 = dot(n_world_f2, viewVec_f2) < 0.0f; +// // Face is visible if its outward normal points towards the origin (camera). 
+// bool visible1 = dot(n_world_f1, viewVec_f1) < 0.0f; +// bool visible2 = dot(n_world_f2, viewVec_f2) < 0.0f; - // Determine Line Style: - bool isSilhouette = visible1 != visible2; // One face visible, the other hidden - bool isInner = visible1 && visible2; // Both faces visible +// // Determine Line Style: +// bool isSilhouette = visible1 != visible2; // One face visible, the other hidden +// bool isInner = visible1 && visible2; // Both faces visible - int visibilityCount = 0; - if (isSilhouette) - { - visibilityCount = 1; - } - else if (isInner) - { - visibilityCount = 2; - } - - return visibilityCount; -} - -void drawLine(float2 p, int a, int b, int visibilityCount, inout float4 color, float aaWidth) -{ - if (visibilityCount > 0) - { - float3 A = corners[a]; - float3 B = corners[b]; - - float avgDepth = (length(A) + length(B)) * 0.5f; - float referenceDepth = 3.0f; - float depthScale = referenceDepth / avgDepth; - - float baseWidth = (visibilityCount == 1) ? 0.005f : 0.002f; - float intensity = (visibilityCount == 1) ? 1.0f : 0.5f; - float4 edgeColor = (visibilityCount == 1) ? float4(0.0f, 0.5f, 1.0f, 1.0f) : float4(1.0f, 0.0f, 0.0f, 1.0f); // Blue vs Red +// int visibilityCount = 0; +// if (isSilhouette) +// { +// visibilityCount = 1; +// } +// else if (isInner) +// { +// visibilityCount = 2; +// } + +// return visibilityCount; +// } + +// void drawLine(float2 p, int a, int b, int visibilityCount, inout float4 color, float aaWidth) +// { +// if (visibilityCount > 0) +// { +// float3 A = corners[a]; +// float3 B = corners[b]; + +// float avgDepth = (length(A) + length(B)) * 0.5f; +// float referenceDepth = 3.0f; +// float depthScale = referenceDepth / avgDepth; + +// float baseWidth = (visibilityCount == 1) ? 0.005f : 0.002f; +// float intensity = (visibilityCount == 1) ? 1.0f : 0.5f; +// float4 edgeColor = (visibilityCount == 1) ? 
float4(0.0f, 0.5f, 1.0f, 1.0f) : float4(1.0f, 0.0f, 0.0f, 1.0f); // Blue vs Red - float width = min(baseWidth * depthScale, 0.03f); +// float width = min(baseWidth * depthScale, 0.03f); - float dist = sdSegment(p, projCorners[a], projCorners[b]); +// float dist = sdSegment(p, projCorners[a], projCorners[b]); - float alpha = 1.0f - smoothstep(width - aaWidth, width + aaWidth, dist); +// float alpha = 1.0f - smoothstep(width - aaWidth, width + aaWidth, dist); - color += edgeColor * alpha * intensity; - } -} +// color += edgeColor * alpha * intensity; +// } +// } void drawRing(float2 p, inout float4 color, float aaWidth) { @@ -149,6 +165,12 @@ void drawRing(float2 p, inout float4 color, float aaWidth) color = max(color, float4(1.0, 1.0, 1.0, 1.0) * ringAlpha); } +float plotPoint(float2 uv, float2 p, float r) +{ + return step(length(uv - p), r); +} + + [[vk::location(0)]] float32_t4 main(SVertexAttributes vx) : SV_Target0 { float3 cameraPos = float3(0, 0, 0); // Camera at origin @@ -159,16 +181,25 @@ void drawRing(float2 p, inout float4 color, float aaWidth) float aaWidth = max(fwidth(p.x), fwidth(p.y)); - for (int j = 0; j < 12; j++) + float pointMask = 0.0; + for (int i=0; i<8; i++) { - int a = j % 4 * (j < 4 ? 1 : 2) - (j / 4 == 1 ? j % 2 : 0); - int b = a + (4 >> (j / 4)); - - int2 faces = edgeToFaces[j]; - int visibilityCount = getVisibilityCount(faces, cameraPos); - drawLine(p, a, b, visibilityCount, color, aaWidth); + if (cornerVisible[i]) + pointMask += plotPoint(p, projCorners[i], 0.015f); } + color += pointMask * float4(1,0,0,1); // red points + + // for (int j = 0; j < 12; j++) + // { + // int a = j % 4 * (j < 4 ? 1 : 2) - (j / 4 == 1 ? 
j % 2 : 0); + // int b = a + (4 >> (j / 4)); + + // // int2 faces = edgeToFaces[j]; + // // int visibilityCount = getVisibilityCount(faces, cameraPos); + // // drawLine(p, a, b, visibilityCount, color, aaWidth); + // } + drawRing(p, color, aaWidth); return color; diff --git a/72_SolidAngleVisualizer/include/transform.hpp b/72_SolidAngleVisualizer/include/transform.hpp index 002a9d215..5061ebd49 100644 --- a/72_SolidAngleVisualizer/include/transform.hpp +++ b/72_SolidAngleVisualizer/include/transform.hpp @@ -19,7 +19,7 @@ struct TransformRequestParams struct TransformReturnInfo { - nbl::hlsl::uint16_t2 sceneResolution = { 2048,1024 }; + nbl::hlsl::uint16_t2 sceneResolution = { 0, 0 }; bool isGizmoWindowHovered; bool isGizmoBeingUsed; }; diff --git a/72_SolidAngleVisualizer/main.cpp b/72_SolidAngleVisualizer/main.cpp index b6d723e70..1025eb067 100644 --- a/72_SolidAngleVisualizer/main.cpp +++ b/72_SolidAngleVisualizer/main.cpp @@ -5,7 +5,6 @@ #include "common.hpp" #include "app_resources/hlsl/common.hlsl" - #include "nbl/ext/FullScreenTriangle/FullScreenTriangle.h" /* @@ -319,10 +318,13 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR // CPU events update(nextPresentationTimestamp); - const auto& virtualWindowRes = interface.transformReturnInfo.sceneResolution; - // TODO: check main frame buffer too - if (!m_solidAngleViewFramebuffer || m_solidAngleViewFramebuffer->getCreationParameters().width != virtualWindowRes[0] || m_solidAngleViewFramebuffer->getCreationParameters().height != virtualWindowRes[1]) - recreateFramebuffer(virtualWindowRes); + { + const auto& virtualSolidAngleWindowRes = interface.solidAngleViewTransformReturnInfo.sceneResolution; + const auto& virtualMainWindowRes = interface.mainViewTransformReturnInfo.sceneResolution; + if (!m_solidAngleViewFramebuffer || m_solidAngleViewFramebuffer->getCreationParameters().width != virtualSolidAngleWindowRes[0] || m_solidAngleViewFramebuffer->getCreationParameters().height != 
virtualSolidAngleWindowRes[1] || + !m_mainViewFramebuffer || m_mainViewFramebuffer->getCreationParameters().width != virtualMainWindowRes[0] || m_mainViewFramebuffer->getCreationParameters().height != virtualMainWindowRes[1]) + recreateFramebuffer(); + } // const auto resourceIx = m_realFrameIx % MaxFramesInFlight; @@ -334,6 +336,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR const IGPUCommandBuffer::SClearColorValue clearValue = { .float32 = {0.f,0.f,0.f,1.f} }; if (m_solidAngleViewFramebuffer) { + auto creationParams = m_solidAngleViewFramebuffer->getCreationParameters(); cb->beginDebugMarker("Draw Circle View Frame"); { const IGPUCommandBuffer::SClearDepthStencilValue farValue = { .depth = 0.f }; @@ -344,7 +347,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR .depthStencilClearValues = &farValue, .renderArea = { .offset = {0,0}, - .extent = {virtualWindowRes[0],virtualWindowRes[1]} + .extent = {creationParams.width, creationParams.height} } }; beginRenderpass(cb, renderpassInfo); @@ -353,7 +356,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR { PushConstants pc{ .modelMatrix = hlsl::float32_t3x4(hlsl::transpose(interface.m_OBBModelMatrix)), - .viewport = { 0.f,0.f,static_cast(virtualWindowRes[0]),static_cast(virtualWindowRes[1]) } + .viewport = { 0.f,0.f,static_cast(creationParams.width),static_cast(creationParams.height) } }; auto pipeline = m_visualizationPipeline; cb->bindGraphicsPipeline(pipeline.get()); @@ -369,6 +372,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR { cb->beginDebugMarker("Main Scene Frame"); { + auto creationParams = m_mainViewFramebuffer->getCreationParameters(); const IGPUCommandBuffer::SClearDepthStencilValue farValue = { .depth = 0.f }; const IGPUCommandBuffer::SRenderpassBeginInfo renderpassInfo = { @@ -377,7 +381,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, 
public BuiltinR .depthStencilClearValues = &farValue, .renderArea = { .offset = {0,0}, - .extent = {virtualWindowRes[0],virtualWindowRes[1]} + .extent = {creationParams.width, creationParams.height} } }; beginRenderpass(cb, renderpassInfo); @@ -404,12 +408,12 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR // TODO: a better way to get identity matrix float32_t3x4 origin = { - 0.2f,0.0f,0.0f,0.0f, - 0.0f,0.2f,0.0f,0.0f, - 0.0f,0.0f,0.2f,0.0f + 1.0f,0.0f,0.0f,0.0f, + 0.0f,1.0f,0.0f,0.0f, + 0.0f,0.0f,1.0f,0.0f }; memcpy(&instance.world, &origin, sizeof(instance.world)); - instance.packedGeo = m_renderer->getGeometries().data() + 3; // sphere + instance.packedGeo = m_renderer->getGeometries().data() + 2; // disk m_renderer->render(cb, viewParams); } cb->endRenderPass(); @@ -575,7 +579,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR ); keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void { - //if (interface.move) + if (interface.move) camera.keyboardProcess(events); // don't capture the events, only let camera handle them with its impl for (const auto& e : events) // here capture @@ -606,9 +610,10 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR interface.imGUI->update(params); } - void recreateFramebuffer(const uint16_t2 resolution) + void recreateFramebuffer() { - auto createImageAndView = [&](E_FORMAT format)->smart_refctd_ptr + + auto createImageAndView = [&](const uint16_t2 resolution, E_FORMAT format)->smart_refctd_ptr { auto image = m_device->createImage({ { .type = IGPUImage::ET_2D, @@ -632,29 +637,32 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR smart_refctd_ptr solidAngleView; smart_refctd_ptr mainView; + const uint16_t2 solidAngleViewRes = interface.solidAngleViewTransformReturnInfo.sceneResolution; + const uint16_t2 mainViewRes = interface.mainViewTransformReturnInfo.sceneResolution; + 
// detect window minimization - if (resolution.x < 0x4000 && resolution.y < 0x4000) + if (solidAngleViewRes.x < 0x4000 && solidAngleViewRes.y < 0x4000 || + mainViewRes.x < 0x4000 && mainViewRes.y < 0x4000) { - solidAngleView = createImageAndView(finalSceneRenderFormat); - auto solidAngleDepthView = createImageAndView(sceneRenderDepthFormat); + solidAngleView = createImageAndView(solidAngleViewRes, finalSceneRenderFormat); + auto solidAngleDepthView = createImageAndView(solidAngleViewRes, sceneRenderDepthFormat); m_solidAngleViewFramebuffer = m_device->createFramebuffer({ { .renderpass = m_solidAngleRenderpass, .depthStencilAttachments = &solidAngleDepthView.get(), .colorAttachments = &solidAngleView.get(), - .width = resolution.x, - .height = resolution.y + .width = solidAngleViewRes.x, + .height = solidAngleViewRes.y } }); - mainView = createImageAndView(finalSceneRenderFormat); - auto mainDepthView = createImageAndView(sceneRenderDepthFormat); + mainView = createImageAndView(mainViewRes, finalSceneRenderFormat); + auto mainDepthView = createImageAndView(mainViewRes, sceneRenderDepthFormat); m_mainViewFramebuffer = m_device->createFramebuffer({ { .renderpass = m_mainRenderpass, .depthStencilAttachments = &mainDepthView.get(), .colorAttachments = &mainView.get(), - .width = resolution.x, - .height = resolution.y + .width = mainViewRes.x, + .height = mainViewRes.y } }); - } else { @@ -715,6 +723,13 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR // we create the Descriptor Set with a few slots extra to spare, so we don't have to `waitIdle` the device whenever ImGUI virtual window resizes constexpr static inline auto MaxImGUITextures = 2u + MaxFramesInFlight; + constexpr static inline float32_t4x4 OBBModelMatrixDefault + { + 1.0f, 0.0f, 0.0f, 0.0f, + 0.0f, 1.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 1.0f, 0.0f, + 0.0f, 0.0f, 6.0f, 1.0f + }; // smart_refctd_ptr m_scene; smart_refctd_ptr m_solidAngleRenderpass; @@ -722,7 +737,7 @@ class 
SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR smart_refctd_ptr m_renderer; smart_refctd_ptr m_solidAngleViewFramebuffer; smart_refctd_ptr m_mainViewFramebuffer; - smart_refctd_ptr m_visualizationPipeline; + smart_refctd_ptr m_visualizationPipeline; // smart_refctd_ptr m_semaphore; uint64_t m_realFrameIx = 0; @@ -733,19 +748,6 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR // UI stuff struct CInterface { - void cameraToHome() - { - core::vectorSIMDf cameraPosition(-3.0f, 3.0f, 6.0f); - core::vectorSIMDf cameraTarget(0.f, 0.f, 6.f); - const static core::vectorSIMDf up(0.f, 1.f, 0.f); - - camera.setPosition(cameraPosition); - camera.setTarget(cameraTarget); - camera.setBackupUpVector(up); - - camera.recomputeViewMatrix(); - } - void operator()() { ImGuiIO& io = ImGui::GetIO(); @@ -773,7 +775,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR return projection; }()); - ImGuizmo::SetOrthographic(false); + ImGuizmo::SetOrthographic(!isPerspective); ImGuizmo::BeginFrame(); ImGui::SetNextWindowPos(ImVec2(1024, 100), ImGuiCond_Appearing); @@ -830,7 +832,12 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR if (viewDirty || firstFrame) { - cameraToHome(); + camera.setPosition(cameraIntialPosition); + camera.setTarget(cameraInitialTarget); + camera.setBackupUpVector(cameraInitialUp); + camera.setUpVector(cameraInitialUp); + + camera.recomputeViewMatrix(); } firstFrame = false; @@ -895,19 +902,15 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR * note it also modifies input view matrix but projection matrix is immutable */ - if (ImGui::IsKeyPressed(ImGuiKey_Home)) - { - cameraToHome(); - } + // No need because camera already has this functionality + // if (ImGui::IsKeyPressed(ImGuiKey_Home)) + // { + // cameraToHome(); + // } if (ImGui::IsKeyPressed(ImGuiKey_End)) { - m_OBBModelMatrix = { - 1.0f, 0.0f, 0.0f, 
0.0f, - 0.0f, 1.0f, 0.0f, 0.0f, - 0.0f, 0.0f, 1.0f, 0.0f, - 0.0f, 0.0f, 12.0f, 1.0f - }; + m_OBBModelMatrix = OBBModelMatrixDefault; } static struct @@ -930,10 +933,14 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR imguizmoM16InOut.projection[1][1] *= -1.f; // https://johannesugb.github.io/gpu-programming/why-do-opengl-proj-matrices-fail-in-vulkan/ transformParams.editTransformDecomposition = true; - transformReturnInfo = EditTransform(&imguizmoM16InOut.view[0][0], &imguizmoM16InOut.projection[0][0], &imguizmoM16InOut.model[0][0], transformParams); + mainViewTransformReturnInfo = EditTransform(&imguizmoM16InOut.view[0][0], &imguizmoM16InOut.projection[0][0], &imguizmoM16InOut.model[0][0], transformParams); + // MODEL: Zup -> Yup + + m_OBBModelMatrix = imguizmoM16InOut.model; // TODO: camera stops when cursor hovers gizmo, but we also want to stop when gizmo is being used - move = (ImGui::IsMouseDown(ImGuiMouseButton_Left) || transformReturnInfo.isGizmoWindowHovered) && (!transformReturnInfo.isGizmoBeingUsed); + move = (ImGui::IsMouseDown(ImGuiMouseButton_Left) || mainViewTransformReturnInfo.isGizmoWindowHovered) && (!mainViewTransformReturnInfo.isGizmoBeingUsed); + } // to Nabla + update camera & model matrices @@ -957,9 +964,12 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR ImGui::SetNextWindowSize(ImVec2(800, 800), ImGuiCond_Appearing); ImGui::SetNextWindowPos(ImVec2(1240, 20), ImGuiCond_Appearing); static bool isOpen = true; - ImGui::Begin("Solid angle view", &isOpen, 0); + ImGui::Begin("Projected Solid Angle View", &isOpen, 0); ImVec2 contentRegionSize = ImGui::GetContentRegionAvail(); + solidAngleViewTransformReturnInfo.sceneResolution = uint16_t2(static_cast(contentRegionSize.x), static_cast(contentRegionSize.y)); + solidAngleViewTransformReturnInfo.isGizmoBeingUsed = false; // not used in this view + solidAngleViewTransformReturnInfo.isGizmoWindowHovered = false; // not used in this 
view ImGui::Image({ renderColorViewDescIndices[ERV_SOLID_ANGLE_VIEW] }, contentRegionSize); ImGui::End(); } @@ -1081,21 +1091,19 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR // Camera camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD()); // mutables - float32_t4x4 m_OBBModelMatrix{ - 1.0f, 0.0f, 0.0f, 0.0f, - 0.0f, 1.0f, 0.0f, 0.0f, - 0.0f, 0.0f, 1.0f, 0.0f, - 0.0f, 0.0f, 12.0f, 1.0f - }; + float32_t4x4 m_OBBModelMatrix = OBBModelMatrixDefault; //std::string_view objectName; TransformRequestParams transformParams; - TransformReturnInfo transformReturnInfo; + TransformReturnInfo mainViewTransformReturnInfo; + TransformReturnInfo solidAngleViewTransformReturnInfo; + + const static inline core::vectorSIMDf cameraIntialPosition{ -3.0f, 6.0f, 3.0f }; + const static inline core::vectorSIMDf cameraInitialTarget{ 0.f, 0.0f, 3.f }; + const static inline core::vectorSIMDf cameraInitialUp{ 0.f, 0.f, 1.f }; float fov = 90.f, zNear = 0.1f, zFar = 10000.f, moveSpeed = 1.f, rotateSpeed = 1.f; float viewWidth = 10.f; - float camYAngle = 90.f / 180.f * 3.14159f; - float camXAngle = 0.f / 180.f * 3.14159f; //uint16_t gcIndex = {}; // note: this is dirty however since I assume only single object in scene I can leave it now, when this example is upgraded to support multiple objects this needs to be changed bool isPerspective = true, isLH = true, flipGizmoY = true, move = true; bool firstFrame = true; diff --git a/common/include/nbl/examples/cameras/CCamera.hpp b/common/include/nbl/examples/cameras/CCamera.hpp index 3b3cd38d8..f35cd341a 100644 --- a/common/include/nbl/examples/cameras/CCamera.hpp +++ b/common/include/nbl/examples/cameras/CCamera.hpp @@ -149,38 +149,36 @@ class Camera if(ev.type == nbl::ui::SMouseEvent::EET_MOVEMENT && mouseDown) { nbl::core::vectorSIMDf pos = getPosition(); - nbl::core::vectorSIMDf localTarget = getTarget() - pos; - - // Get Relative Rotation for localTarget in Radians - 
float relativeRotationX, relativeRotationY; - relativeRotationY = atan2(localTarget.X, localTarget.Z); - const double z1 = nbl::core::sqrt(localTarget.X*localTarget.X + localTarget.Z*localTarget.Z); - relativeRotationX = atan2(z1, localTarget.Y) - nbl::core::PI()/2; - - constexpr float RotateSpeedScale = 0.003f; - relativeRotationX -= ev.movementEvent.relativeMovementY * rotateSpeed * RotateSpeedScale * -1.0f; - float tmpYRot = ev.movementEvent.relativeMovementX * rotateSpeed * RotateSpeedScale * -1.0f; + nbl::core::vectorSIMDf upVector = getUpVector(); + nbl::core::vectorSIMDf forward = nbl::core::normalize(getTarget() - pos); + + nbl::core::vectorSIMDf right = nbl::core::normalize(nbl::core::cross(forward, upVector)); + nbl::core::vectorSIMDf up = nbl::core::normalize(nbl::core::cross(right, forward)); + + constexpr float RotateSpeedScale = 0.003f; + float pitchDelta = ev.movementEvent.relativeMovementY * rotateSpeed * RotateSpeedScale * -1.0f; + float yawDelta = ev.movementEvent.relativeMovementX * rotateSpeed * RotateSpeedScale * -1.0f; if (leftHanded) - relativeRotationY -= tmpYRot; - else - relativeRotationY += tmpYRot; + yawDelta = -yawDelta; - const double MaxVerticalAngle = nbl::core::radians(88.0f); + // Clamp pitch BEFORE applying rotation + const float MaxVerticalAngle = nbl::core::radians(88.0f); + float currentPitch = asin(nbl::core::dot(forward, upVector).X); + float newPitch = nbl::core::clamp(currentPitch + pitchDelta, -MaxVerticalAngle, MaxVerticalAngle); + pitchDelta = newPitch - currentPitch; - if (relativeRotationX > MaxVerticalAngle*2 && relativeRotationX < 2 * nbl::core::PI()-MaxVerticalAngle) - relativeRotationX = 2 * nbl::core::PI()-MaxVerticalAngle; - else - if (relativeRotationX > MaxVerticalAngle && relativeRotationX < 2 * nbl::core::PI()-MaxVerticalAngle) - relativeRotationX = MaxVerticalAngle; + // Create rotation quaternions using axis-angle method + nbl::core::quaternion pitchRot = nbl::core::quaternion::fromAngleAxis(pitchDelta, 
right); + nbl::core::quaternion yawRot = nbl::core::quaternion::fromAngleAxis(yawDelta, upVector); + nbl::core::quaternion combinedRot = yawRot * pitchRot; - localTarget.set(0,0, nbl::core::max(1.f, nbl::core::length(pos)[0]), 1.f); + // Apply to forward vector + forward = nbl::core::normalize(combinedRot.transformVect(forward)); - nbl::core::matrix3x4SIMD mat; - mat.setRotation(nbl::core::quaternion(relativeRotationX, relativeRotationY, 0)); - mat.transformVect(localTarget); - - setTarget(localTarget + pos); + // Set new target + float targetDistance = nbl::core::length(getTarget() - pos).X; + setTarget(pos + forward * targetDistance); } } } From adb15edd201e82cbc9ed3526bbfccfc67ccdf4ff Mon Sep 17 00:00:00 2001 From: Karim Mohamed Date: Sun, 7 Dec 2025 00:12:56 +0300 Subject: [PATCH 03/26] sphere arc "cube edge" in solid angle view, more reliable resizing of windows --- .../hlsl/SolidAngleVis.frag.hlsl | 218 ++++++++---------- 72_SolidAngleVisualizer/main.cpp | 24 +- 2 files changed, 107 insertions(+), 135 deletions(-) diff --git a/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl b/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl index 2ad766c8a..badf1e4be 100644 --- a/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl +++ b/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl @@ -32,8 +32,7 @@ static const float3 localNormals[6] = { static float3 corners[8]; static float3 faceCenters[6] = { float3(0,0,0), float3(0,0,0), float3(0,0,0), float3(0,0,0), float3(0,0,0), float3(0,0,0) }; -static float2 projCorners[8]; -static bool cornerVisible[8]; + // Converts UV into centered, aspect-corrected NDC circle space @@ -46,32 +45,7 @@ float2 toCircleSpace(float2 uv) float aspect = pc.viewport.z / pc.viewport.w; // width / height p.x *= aspect; - return p; -} - - -// Distance to a 2D line segment -float sdSegment(float2 p, float2 a, float2 b) -{ - float2 pa = p - a; - float2 ba = b - a; - float h = 
clamp(dot(pa, ba) / dot(ba, ba), 0.0f, 1.0f); - return length(pa - ba * h); -} - -// TODO: Hemispherical Projection (Solid Angle / Orthographic/Lambertian Projection) -bool projectToOrthoSphere(float3 p, out float2 uv) -{ - float3 n = normalize(p); // direction to sphere - - // hemisphere (Z > 0) - if (n.z <= 0.0) - return false; - - // orthographic projection (drop Z) - uv = n.xy; - - return true; // valid + return p * CIRCLE_RADIUS; } void computeCubeGeo() @@ -86,121 +60,121 @@ void computeCubeGeo() faceCenters[i/4] += worldPos / 4.0f; faceCenters[2+i%2] += worldPos / 4.0f; faceCenters[4+(i/2)%2] += worldPos / 4.0f; - - float3 viewPos = worldPos.xyz; - cornerVisible[i] = projectToOrthoSphere(viewPos, projCorners[i]); - projCorners[i] *= CIRCLE_RADIUS; // scale to circle radius } } -// int getVisibilityCount(int2 faces, float3 cameraPos) -// { -// float3x3 rotMatrix = (float3x3)pc.modelMatrix; -// float3 n_world_f1 = mul(rotMatrix, localNormals[faces.x]); -// float3 n_world_f2 = mul(rotMatrix, localNormals[faces.y]); - -// float3 viewVec_f1 = faceCenters[faces.x] - cameraPos; -// float3 viewVec_f2 = faceCenters[faces.y] - cameraPos; - -// // Face is visible if its outward normal points towards the origin (camera). 
-// bool visible1 = dot(n_world_f1, viewVec_f1) < 0.0f; -// bool visible2 = dot(n_world_f2, viewVec_f2) < 0.0f; - -// // Determine Line Style: -// bool isSilhouette = visible1 != visible2; // One face visible, the other hidden -// bool isInner = visible1 && visible2; // Both faces visible - -// int visibilityCount = 0; -// if (isSilhouette) -// { -// visibilityCount = 1; -// } -// else if (isInner) -// { -// visibilityCount = 2; -// } - -// return visibilityCount; -// } - -// void drawLine(float2 p, int a, int b, int visibilityCount, inout float4 color, float aaWidth) -// { -// if (visibilityCount > 0) -// { -// float3 A = corners[a]; -// float3 B = corners[b]; - -// float avgDepth = (length(A) + length(B)) * 0.5f; -// float referenceDepth = 3.0f; -// float depthScale = referenceDepth / avgDepth; - -// float baseWidth = (visibilityCount == 1) ? 0.005f : 0.002f; -// float intensity = (visibilityCount == 1) ? 1.0f : 0.5f; -// float4 edgeColor = (visibilityCount == 1) ? float4(0.0f, 0.5f, 1.0f, 1.0f) : float4(1.0f, 0.0f, 0.0f, 1.0f); // Blue vs Red - -// float width = min(baseWidth * depthScale, 0.03f); - -// float dist = sdSegment(p, projCorners[a], projCorners[b]); - -// float alpha = 1.0f - smoothstep(width - aaWidth, width + aaWidth, dist); - -// color += edgeColor * alpha * intensity; -// } -// } - -void drawRing(float2 p, inout float4 color, float aaWidth) +float4 drawRing(float2 p, float aaWidth) { float positionLength = length(p); - - // Mask to cut off drawing outside the circle - // float circleMask = 1.0f - smoothstep(CIRCLE_RADIUS, CIRCLE_RADIUS + aaWidth, positionLength); - // color *= circleMask; // Add a white background circle ring - float ringWidth = 0.005f; + float ringWidth = 0.01f; float ringDistance = abs(positionLength - CIRCLE_RADIUS); float ringAlpha = 1.0f - smoothstep(ringWidth - aaWidth, ringWidth + aaWidth, ringDistance); - // Ring color is now white - color = max(color, float4(1.0, 1.0, 1.0, 1.0) * ringAlpha); + return ringAlpha.xxxx; } 
-float plotPoint(float2 uv, float2 p, float r) +// Check if a face on the hemisphere is visible from camera at origin +bool isFaceVisible(float3 faceCenter, float3 faceNormal) { - return step(length(uv - p), r); + // Face is visible if normal points toward camera (at origin) + float3 viewVec = -normalize(faceCenter); // Vector from face to camera + return dot(faceNormal, viewVec) > 0.0f; } +int getEdgeVisibility(int edgeIdx, float3 cameraPos) +{ + int2 faces = edgeToFaces[edgeIdx]; + + // Transform normals to world space + float3x3 rotMatrix = (float3x3)pc.modelMatrix; + float3 n_world_f1 = mul(rotMatrix, localNormals[faces.x]); + float3 n_world_f2 = mul(rotMatrix, localNormals[faces.y]); + + bool visible1 = isFaceVisible(faceCenters[faces.x], n_world_f1); + bool visible2 = isFaceVisible(faceCenters[faces.y], n_world_f2); + + // Silhouette: exactly one face visible + if (visible1 != visible2) return 1; + + // Inner edge: both faces visible + if (visible1 && visible2) return 2; + + // Hidden edge: both faces hidden + return 0; +} + +// Draw great circle arc in fragment shader +float4 drawGreatCircleArc(float3 fragPos, int2 edgeVerts, int visibility, float aaWidth) +{ + if (visibility == 0) return float4(0,0,0,0); // Hidden edge + + float3 v0 = normalize(corners[edgeVerts.x]); + float3 v1 = normalize(corners[edgeVerts.y]); + float3 p = normalize(fragPos); // Current point on hemisphere + + // Great circle plane normal + float3 arcNormal = normalize(cross(v0, v1)); + + // Distance to great circle + float dist = abs(dot(p, arcNormal)); + + // Check if point is within arc bounds + float dotMid = dot(v0, v1); + bool onArc = (dot(p, v0) >= dotMid) && (dot(p, v1) >= dotMid); + + if (!onArc) return float4(0,0,0,0); + + // Depth-based width scaling + float avgDepth = (length(corners[edgeVerts.x]) + length(corners[edgeVerts.y])) * 0.5f; + float depthScale = 3.0f / avgDepth; + + float baseWidth = (visibility == 1) ? 
0.01f : 0.005f; + float width = min(baseWidth * depthScale, 0.02f); + + float alpha = 1.0f - smoothstep(width - aaWidth, width + aaWidth, dist); + + float4 edgeColor = (visibility == 1) ? + float4(0.0f, 0.5f, 1.0f, 1.0f) : // Silhouette: blue + float4(1.0f, 0.0f, 0.0f, 1.0f); // Inner: red + + float intensity = (visibility == 1) ? 1.0f : 0.5f; + return edgeColor * alpha * intensity; +} [[vk::location(0)]] float32_t4 main(SVertexAttributes vx) : SV_Target0 { - float3 cameraPos = float3(0, 0, 0); // Camera at origin - float2 p = toCircleSpace(vx.uv); + float3 cameraPos = float3(0, 0, 0); float4 color = float4(0, 0, 0, 0); - - computeCubeGeo(); + float2 p = toCircleSpace(vx.uv); - float aaWidth = max(fwidth(p.x), fwidth(p.y)); - - float pointMask = 0.0; - for (int i=0; i<8; i++) + // Convert 2D disk position to 3D hemisphere position + // p is in range [-CIRCLE_RADIUS, CIRCLE_RADIUS] + float2 normalized = p / CIRCLE_RADIUS; // Now in range [-1, 1] + float r2 = dot(normalized, normalized); + + if (r2 > 1.0f) + discard; + + // Convert UV to 3D position on hemisphere + float3 spherePos = normalize(float3(normalized.x, normalized.y, sqrt(1 - r2))); + + computeCubeGeo(); // Your existing function + + float aaWidth = length(float2(ddx(p.x), ddy(p.y))); + + // Draw edges as great circle arcs + for (int j = 0; j < 12; j++) { - if (cornerVisible[i]) - pointMask += plotPoint(p, projCorners[i], 0.015f); + int a = j % 4 * (j < 4 ? 1 : 2) - (j / 4 == 1 ? j % 2 : 0); + int b = a + (4 >> (j / 4)); + + int visibility = getEdgeVisibility(j, cameraPos); + color += drawGreatCircleArc(spherePos, int2(a, b), visibility, aaWidth); } - - color += pointMask * float4(1,0,0,1); // red points - - // for (int j = 0; j < 12; j++) - // { - // int a = j % 4 * (j < 4 ? 1 : 2) - (j / 4 == 1 ? 
j % 2 : 0); - // int b = a + (4 >> (j / 4)); - - // // int2 faces = edgeToFaces[j]; - // // int visibilityCount = getVisibilityCount(faces, cameraPos); - // // drawLine(p, a, b, visibilityCount, color, aaWidth); - // } - - drawRing(p, color, aaWidth); - + + color += drawRing(p, aaWidth); + return color; } \ No newline at end of file diff --git a/72_SolidAngleVisualizer/main.cpp b/72_SolidAngleVisualizer/main.cpp index 1025eb067..8fb8bf144 100644 --- a/72_SolidAngleVisualizer/main.cpp +++ b/72_SolidAngleVisualizer/main.cpp @@ -323,7 +323,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR const auto& virtualMainWindowRes = interface.mainViewTransformReturnInfo.sceneResolution; if (!m_solidAngleViewFramebuffer || m_solidAngleViewFramebuffer->getCreationParameters().width != virtualSolidAngleWindowRes[0] || m_solidAngleViewFramebuffer->getCreationParameters().height != virtualSolidAngleWindowRes[1] || !m_mainViewFramebuffer || m_mainViewFramebuffer->getCreationParameters().width != virtualMainWindowRes[0] || m_mainViewFramebuffer->getCreationParameters().height != virtualMainWindowRes[1]) - recreateFramebuffer(); + recreateFramebuffers(); } // @@ -402,10 +402,9 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR auto& instance = m_renderer->m_instances[0]; auto transposed = hlsl::transpose(interface.m_OBBModelMatrix); memcpy(&instance.world, &transposed, sizeof(instance.world)); - instance.packedGeo = m_renderer->getGeometries().data();// +interface.gcIndex; + instance.packedGeo = m_renderer->getGeometries().data(); // cube // +interface.gcIndex; m_renderer->render(cb, viewParams); // draw the cube/OBB - // TODO: a better way to get identity matrix float32_t3x4 origin = { 1.0f,0.0f,0.0f,0.0f, @@ -536,7 +535,6 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR camera.setMoveSpeed(interface.moveSpeed); camera.setRotateSpeed(interface.rotateSpeed); - 
m_inputSystem->getDefaultMouse(&mouse); m_inputSystem->getDefaultKeyboard(&keyboard); @@ -610,7 +608,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR interface.imGUI->update(params); } - void recreateFramebuffer() + void recreateFramebuffers() { auto createImageAndView = [&](const uint16_t2 resolution, E_FORMAT format)->smart_refctd_ptr @@ -671,30 +669,30 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR } // release previous slot and its image - interface.subAllocDS->multi_deallocate(0, static_cast(CInterface::Count), interface.renderColorViewDescIndices, { .semaphore = m_semaphore.get(),.value = m_realFrameIx }); + interface.subAllocDS->multi_deallocate(0, static_cast(CInterface::Count), interface.renderColorViewDescIndices, { .semaphore = m_semaphore.get(),.value = m_realFrameIx + 1 }); // - if (solidAngleView) + if (solidAngleView && mainView) { interface.subAllocDS->multi_allocate(0, static_cast(CInterface::Count), interface.renderColorViewDescIndices); // update descriptor set IGPUDescriptorSet::SDescriptorInfo infos[static_cast(CInterface::Count)] = {}; - infos[0].desc = solidAngleView; + infos[0].desc = mainView; infos[0].info.image.imageLayout = IGPUImage::LAYOUT::READ_ONLY_OPTIMAL; - infos[1].desc = mainView; + infos[1].desc = solidAngleView; infos[1].info.image.imageLayout = IGPUImage::LAYOUT::READ_ONLY_OPTIMAL; const IGPUDescriptorSet::SWriteDescriptorSet write[static_cast(CInterface::Count)] = { {.dstSet = interface.subAllocDS->getDescriptorSet(), .binding = TexturesImGUIBindingIndex, - .arrayElement = interface.renderColorViewDescIndices[static_cast(CInterface::ERV_SOLID_ANGLE_VIEW)], + .arrayElement = interface.renderColorViewDescIndices[static_cast(CInterface::ERV_MAIN_VIEW)], .count = 1, .info = &infos[static_cast(CInterface::ERV_MAIN_VIEW)] }, { .dstSet = interface.subAllocDS->getDescriptorSet(), .binding = TexturesImGUIBindingIndex, - .arrayElement = 
interface.renderColorViewDescIndices[static_cast(CInterface::ERV_MAIN_VIEW)], + .arrayElement = interface.renderColorViewDescIndices[static_cast(CInterface::ERV_SOLID_ANGLE_VIEW)], .count = 1, - .info = &infos[1] + .info = &infos[static_cast(CInterface::ERV_SOLID_ANGLE_VIEW)] } }; m_device->updateDescriptorSets({ write, static_cast(CInterface::Count) }, {}); @@ -728,7 +726,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, - 0.0f, 0.0f, 6.0f, 1.0f + 0.0f, 0.0f, 3.0f, 1.0f }; // smart_refctd_ptr m_scene; From 008e2ee154b6cf5ba725752a3f1b4dac5d37ff42 Mon Sep 17 00:00:00 2001 From: Karim Mohamed Date: Sun, 7 Dec 2025 00:29:22 +0300 Subject: [PATCH 04/26] Scaling by pressing G to prevent conflict with WASD camera movement, also added Q and E for moving up and down --- 72_SolidAngleVisualizer/include/transform.hpp | 4 +++- common/include/nbl/examples/cameras/CCamera.hpp | 9 ++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/72_SolidAngleVisualizer/include/transform.hpp b/72_SolidAngleVisualizer/include/transform.hpp index 5061ebd49..639c0fa3a 100644 --- a/72_SolidAngleVisualizer/include/transform.hpp +++ b/72_SolidAngleVisualizer/include/transform.hpp @@ -35,13 +35,15 @@ TransformReturnInfo EditTransform(float* cameraView, const float* cameraProjecti static bool boundSizing = false; static bool boundSizingSnap = false; + ImGui::Text("Press T/R/G to change gizmo mode"); + if (params.editTransformDecomposition) { if (ImGui::IsKeyPressed(ImGuiKey_T)) mCurrentGizmoOperation = ImGuizmo::TRANSLATE; if (ImGui::IsKeyPressed(ImGuiKey_R)) mCurrentGizmoOperation = ImGuizmo::ROTATE; - if (ImGui::IsKeyPressed(ImGuiKey_S)) + if (ImGui::IsKeyPressed(ImGuiKey_G)) mCurrentGizmoOperation = ImGuizmo::SCALE; if (ImGui::RadioButton("Translate", mCurrentGizmoOperation == ImGuizmo::TRANSLATE)) mCurrentGizmoOperation = ImGuizmo::TRANSLATE; diff --git 
a/common/include/nbl/examples/cameras/CCamera.hpp b/common/include/nbl/examples/cameras/CCamera.hpp index f35cd341a..e5f077e46 100644 --- a/common/include/nbl/examples/cameras/CCamera.hpp +++ b/common/include/nbl/examples/cameras/CCamera.hpp @@ -39,6 +39,8 @@ class Camera enum E_CAMERA_MOVE_KEYS : uint8_t { ECMK_MOVE_FORWARD = 0, + ECMK_MOVE_UP, + ECMK_MOVE_DOWN, ECMK_MOVE_BACKWARD, ECMK_MOVE_LEFT, ECMK_MOVE_RIGHT, @@ -47,6 +49,8 @@ class Camera inline void mapKeysToWASD() { + keysMap[ECMK_MOVE_UP] = nbl::ui::EKC_E; + keysMap[ECMK_MOVE_DOWN] = nbl::ui::EKC_Q; keysMap[ECMK_MOVE_FORWARD] = nbl::ui::EKC_W; keysMap[ECMK_MOVE_BACKWARD] = nbl::ui::EKC_S; keysMap[ECMK_MOVE_LEFT] = nbl::ui::EKC_A; @@ -211,7 +215,7 @@ class Camera assert(timeDiff >= 0); // handle camera movement - for (const auto logicalKey : { ECMK_MOVE_FORWARD, ECMK_MOVE_BACKWARD, ECMK_MOVE_LEFT, ECMK_MOVE_RIGHT }) + for (const auto logicalKey : { ECMK_MOVE_FORWARD, ECMK_MOVE_UP, ECMK_MOVE_DOWN, ECMK_MOVE_BACKWARD, ECMK_MOVE_LEFT, ECMK_MOVE_RIGHT }) { const auto code = keysMap[logicalKey]; @@ -275,6 +279,9 @@ class Camera up = nbl::core::normalize(backupUpVector); } + pos += up * perActionDt[E_CAMERA_MOVE_KEYS::ECMK_MOVE_UP] * moveSpeed * MoveSpeedScale; + pos -= up * perActionDt[E_CAMERA_MOVE_KEYS::ECMK_MOVE_DOWN] * moveSpeed * MoveSpeedScale; + nbl::core::vectorSIMDf strafevect = localTarget; if (leftHanded) strafevect = nbl::core::cross(strafevect, up); From 4290f4ab26360fbf8dac4c45c395fc4a20faf6e3 Mon Sep 17 00:00:00 2001 From: Karim Mohamed Date: Sun, 7 Dec 2025 16:33:09 +0300 Subject: [PATCH 05/26] better clipping of arcs behind the hemisphere --- .../app_resources/hlsl/SolidAngleVis.frag.hlsl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl b/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl index badf1e4be..c12c007a0 100644 --- a/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl +++ 
b/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl @@ -114,6 +114,10 @@ float4 drawGreatCircleArc(float3 fragPos, int2 edgeVerts, int visibility, float float3 v1 = normalize(corners[edgeVerts.y]); float3 p = normalize(fragPos); // Current point on hemisphere + // Skip fragment if not in front of hemisphere or edge if both endpoints are behind horizon + if (p.z < 0.0f || (v0.z < 0.0f && v1.z < 0.0f)) + return float4(0,0,0,0); + // Great circle plane normal float3 arcNormal = normalize(cross(v0, v1)); From ba068c44c08a777bb6794b3e0f019cbdc3605480 Mon Sep 17 00:00:00 2001 From: Karim Mohamed Date: Mon, 8 Dec 2025 08:47:02 +0300 Subject: [PATCH 06/26] WIP quick push for shader code --- .../hlsl/SolidAngleVis.frag.hlsl | 154 +++++++++++++++--- 1 file changed, 135 insertions(+), 19 deletions(-) diff --git a/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl b/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl index c12c007a0..7c96a8316 100644 --- a/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl +++ b/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl @@ -20,6 +20,25 @@ static const int2 edgeToFaces[12] = { {0,4}, {5,0}, {4,1}, {1,5} }; +//float3(i % 2, (i / 2) % 2, (i / 4) % 2) * 2.0f - 1.0f +static const float3 constCorners[8] = { + float3(-1, -1, -1), // 0 + float3( 1, -1, -1), // 1 + float3(-1, 1, -1), // 2 + float3( 1, 1, -1), // 3 + float3(-1, -1, 1), // 4 + float3( 1, -1, 1), // 5 + float3(-1, 1, 1), // 6 + float3( 1, 1, 1) // 7 +}; + +// All 12 edges of the cube (vertex index pairs) +static const int2 allEdges[12] = { + {0, 1}, {2, 3}, {4, 5}, {6, 7}, // Edges along X axis + {0, 2}, {1, 3}, {4, 6}, {5, 7}, // Edges along Y axis + {0, 4}, {1, 5}, {2, 6}, {3, 7} // Edges along Z axis +}; + static const float3 localNormals[6] = { float3(0, 0, -1), // Face 0 (Z-) float3(0, 0, 1), // Face 1 (Z+) @@ -34,6 +53,30 @@ static float3 faceCenters[6] = { float3(0,0,0), float3(0,0,0), 
float3(0,0,0), float3(0,0,0), float3(0,0,0), float3(0,0,0) }; +static const float3 colorLUT[8] = { + float3(0, 0, 0), // 0: Black + float3(1, 0, 0), // 1: Red + float3(0, 1, 0), // 2: Green + float3(1, 1, 0), // 3: Yellow + float3(0, 0, 1), // 4: Blue + float3(1, 0, 1), // 5: Magenta + float3(0, 1, 1), // 6: Cyan + float3(1, 1, 1) // 7: White +}; + + + +// Vertices are ordered CCW relative to the camera view. +static const int silhouettes[8][6] = { + {2, 3, 1, 5, 4, 6}, // 0: Black + {6, 7, 5, 1, 0, 2}, // 1: Red + {7, 6, 4, 0, 1, 3}, // 2: Green + {3, 7, 5, 4, 0, 2}, // 3: Yellow + {3, 2, 0, 4, 5, 7}, // 4: Cyan + {1, 3, 7, 6, 4, 0}, // 5: Magenta + {0, 1, 5, 7, 6, 2}, // 6: White + {4, 6, 2, 3, 1, 5} // 7: Gray +}; // Converts UV into centered, aspect-corrected NDC circle space float2 toCircleSpace(float2 uv) @@ -52,7 +95,7 @@ void computeCubeGeo() { for (int i = 0; i < 8; i++) { - float3 localPos = float3(i % 2, (i / 2) % 2, (i / 4) % 2) * 2.0f - 1.0f; + float3 localPos = constCorners[i]; //float3(i % 2, (i / 2) % 2, (i / 4) % 2) * 2.0f - 1.0f; float3 worldPos = mul(pc.modelMatrix, float4(localPos, 1.0f)).xyz; corners[i] = worldPos.xyz; @@ -72,7 +115,7 @@ float4 drawRing(float2 p, float aaWidth) float ringDistance = abs(positionLength - CIRCLE_RADIUS); float ringAlpha = 1.0f - smoothstep(ringWidth - aaWidth, ringWidth + aaWidth, ringDistance); - return ringAlpha.xxxx; + return ringAlpha * float4(1, 1, 1, 1); } // Check if a face on the hemisphere is visible from camera at origin @@ -105,7 +148,7 @@ int getEdgeVisibility(int edgeIdx, float3 cameraPos) return 0; } -// Draw great circle arc in fragment shader +// Draw great circle arc in fragment shader with horizon clipping float4 drawGreatCircleArc(float3 fragPos, int2 edgeVerts, int visibility, float aaWidth) { if (visibility == 0) return float4(0,0,0,0); // Hidden edge @@ -114,8 +157,12 @@ float4 drawGreatCircleArc(float3 fragPos, int2 edgeVerts, int visibility, float float3 v1 = 
normalize(corners[edgeVerts.y]); float3 p = normalize(fragPos); // Current point on hemisphere - // Skip fragment if not in front of hemisphere or edge if both endpoints are behind horizon - if (p.z < 0.0f || (v0.z < 0.0f && v1.z < 0.0f)) + // HORIZON CLIPPING: Current fragment must be on front hemisphere + if (p.z < 0.0f) + return float4(0,0,0,0); + + // HORIZON CLIPPING: Skip edge if both endpoints are behind horizon + if (v0.z < 0.0f && v1.z < 0.0f) return float4(0,0,0,0); // Great circle plane normal @@ -149,36 +196,105 @@ float4 drawGreatCircleArc(float3 fragPos, int2 edgeVerts, int visibility, float [[vk::location(0)]] float32_t4 main(SVertexAttributes vx) : SV_Target0 { - float3 cameraPos = float3(0, 0, 0); float4 color = float4(0, 0, 0, 0); float2 p = toCircleSpace(vx.uv); // Convert 2D disk position to 3D hemisphere position - // p is in range [-CIRCLE_RADIUS, CIRCLE_RADIUS] - float2 normalized = p / CIRCLE_RADIUS; // Now in range [-1, 1] + float2 normalized = p / CIRCLE_RADIUS; float r2 = dot(normalized, normalized); - if (r2 > 1.0f) - discard; - // Convert UV to 3D position on hemisphere float3 spherePos = normalize(float3(normalized.x, normalized.y, sqrt(1 - r2))); - computeCubeGeo(); // Your existing function + computeCubeGeo(); + + float3 obbCenter = mul(pc.modelMatrix, float4(0, 0, 0, 1)).xyz; + + float3 viewDir = obbCenter; + + // Is this correct? + float dotX = dot(viewDir, float3(pc.modelMatrix[0][0], pc.modelMatrix[1][0], pc.modelMatrix[2][0])); + float dotY = dot(viewDir, float3(pc.modelMatrix[0][1], pc.modelMatrix[1][1], pc.modelMatrix[2][1])); + float dotZ = dot(viewDir, float3(pc.modelMatrix[0][2], pc.modelMatrix[1][2], pc.modelMatrix[2][2])); + + // Determine octant from ray direction signs + int octant = (dotX >= 0 ? 4 : 0) + + (dotY >= 0 ? 2 : 0) + + (dotZ >= 0 ? 
1 : 0); + + if (all(vx.uv >= float2(0.49f, 0.49f) ) && all(vx.uv <= float2(0.51f, 0.51f))) + { + return float4(colorLUT[octant], 1.0f); + } + + float aaWidth = length(float2(ddx(vx.uv.x), ddy(vx.uv.y))); - float aaWidth = length(float2(ddx(p.x), ddy(p.y))); + + // Draw the 6 silhouette edges + for (int i = 0; i < 6; i++) + { + int v0Idx = silhouettes[octant][i]; + int v1Idx = silhouettes[octant][(i + 1) % 6]; + + float4 edgeContribution = drawGreatCircleArc(spherePos, int2(v0Idx, v1Idx), 1, aaWidth); + color += float4(colorLUT[i] * edgeContribution.a, edgeContribution.a); + } - // Draw edges as great circle arcs - for (int j = 0; j < 12; j++) + // Draw the remaining edges (non-silhouette) in a different color + float3 hiddenEdgeColor = float3(0.3, 0.3, 0.3); // Gray color for hidden edges + + for (int i = 0; i < 12; i++) { - int a = j % 4 * (j < 4 ? 1 : 2) - (j / 4 == 1 ? j % 2 : 0); - int b = a + (4 >> (j / 4)); + int2 edge = allEdges[i]; + + // Check if this edge is already drawn as a silhouette edge + bool isSilhouette = false; + for (int j = 0; j < 6; j++) + { + int v0 = silhouettes[octant][j]; + int v1 = silhouettes[octant][(j + 1) % 6]; + + if ((edge.x == v0 && edge.y == v1) || (edge.x == v1 && edge.y == v0)) + { + isSilhouette = true; + break; + } + } - int visibility = getEdgeVisibility(j, cameraPos); - color += drawGreatCircleArc(spherePos, int2(a, b), visibility, aaWidth); + // Only draw if it's not a silhouette edge + if (!isSilhouette) + { + float4 edgeContribution = drawGreatCircleArc(spherePos, edge, 1, aaWidth); + color += float4(hiddenEdgeColor * edgeContribution.a, edgeContribution.a); + } + } + + // Draw corner labels for debugging + for (int i = 0; i < 8; i++) + { + float3 corner = normalize(corners[i]); + float2 cornerPos = corner.xy; + // Project corner onto 2D circle space + + // Distance from current fragment to corner + float dist = length(spherePos.xy - cornerPos); + + // Draw a small colored dot at the corner + float dotSize = 0.03f; + 
float dotAlpha = 1.0f - smoothstep(dotSize - aaWidth, dotSize + aaWidth, dist); + + if (dotAlpha > 0.0f) + { + float brightness = float(i) / 7.0f; + float3 dotColor = colorLUT[i]; + color += float4(dotColor * dotAlpha, dotAlpha); + } } color += drawRing(p, aaWidth); + + // if (r2 > 1.1f) + // color.a = 0.0f; // Outside circle, make transparent return color; } \ No newline at end of file From 91ae8657dee9b4de82c81b97b23b83d3824a6011 Mon Sep 17 00:00:00 2001 From: Karim Mohamed Date: Tue, 9 Dec 2025 00:20:01 +0300 Subject: [PATCH 07/26] Fixed main camera aspect ratio, added 27 configurations for cube silhouette --- .../hlsl/SolidAngleVis.frag.hlsl | 248 ++++++++++++------ 72_SolidAngleVisualizer/include/transform.hpp | 2 +- 72_SolidAngleVisualizer/main.cpp | 9 +- 3 files changed, 167 insertions(+), 92 deletions(-) diff --git a/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl b/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl index 7c96a8316..fa0805356 100644 --- a/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl +++ b/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl @@ -53,29 +53,84 @@ static float3 faceCenters[6] = { float3(0,0,0), float3(0,0,0), float3(0,0,0), float3(0,0,0), float3(0,0,0), float3(0,0,0) }; -static const float3 colorLUT[8] = { +static const float3 colorLUT[27] = { + // Row 1: Pure and bright colors float3(0, 0, 0), // 0: Black - float3(1, 0, 0), // 1: Red - float3(0, 1, 0), // 2: Green - float3(1, 1, 0), // 3: Yellow - float3(0, 0, 1), // 4: Blue - float3(1, 0, 1), // 5: Magenta - float3(0, 1, 1), // 6: Cyan - float3(1, 1, 1) // 7: White + float3(1, 1, 1), // 1: White + float3(0.5, 0.5, 0.5), // 2: Gray + + // Row 2: Primary colors + float3(1, 0, 0), // 3: Red + float3(0, 1, 0), // 4: Green + float3(0, 0, 1), // 5: Blue + + // Row 3: Secondary colors + float3(1, 1, 0), // 6: Yellow + float3(1, 0, 1), // 7: Magenta + float3(0, 1, 1), // 8: Cyan + + // Row 4: Orange family + 
float3(1, 0.5, 0), // 9: Orange + float3(1, 0.65, 0), // 10: Light Orange + float3(0.8, 0.4, 0), // 11: Dark Orange + + // Row 5: Pink/Rose family + float3(1, 0.4, 0.7), // 12: Pink + float3(1, 0.75, 0.8), // 13: Light Pink + float3(0.7, 0.1, 0.3), // 14: Deep Rose + + // Row 6: Purple/Violet family + float3(0.5, 0, 0.5), // 15: Purple + float3(0.6, 0.4, 0.8), // 16: Light Purple + float3(0.3, 0, 0.5), // 17: Indigo + + // Row 7: Green variations + float3(0, 0.5, 0), // 18: Dark Green + float3(0.5, 1, 0), // 19: Lime + float3(0, 0.5, 0.25), // 20: Forest Green + + // Row 8: Blue variations + float3(0, 0, 0.5), // 21: Navy + float3(0.3, 0.7, 1), // 22: Sky Blue + float3(0, 0.4, 0.6), // 23: Teal + + // Row 9: Earth tones + float3(0.6, 0.4, 0.2), // 24: Brown + float3(0.8, 0.7, 0.3), // 25: Tan/Beige + float3(0.4, 0.3, 0.1) // 26: Dark Brown }; // Vertices are ordered CCW relative to the camera view. -static const int silhouettes[8][6] = { - {2, 3, 1, 5, 4, 6}, // 0: Black - {6, 7, 5, 1, 0, 2}, // 1: Red - {7, 6, 4, 0, 1, 3}, // 2: Green - {3, 7, 5, 4, 0, 2}, // 3: Yellow - {3, 2, 0, 4, 5, 7}, // 4: Cyan - {1, 3, 7, 6, 4, 0}, // 5: Magenta - {0, 1, 5, 7, 6, 2}, // 6: White - {4, 6, 2, 3, 1, 5} // 7: Gray +static const int silhouettes[27][7] = { + {6, 1, 3, 2, 6, 4, 5}, // 0: Black + {6, 2, 6, 4, 5, 7, 3}, // 1: White + {6, 0, 4, 5, 7, 3, 2}, // 2: Gray + {6, 1, 3, 7, 6, 4, 5,}, // 3: Red + {4, 4, 5, 7, 6, -1, -1}, // 4: Green + {6, 0, 4, 5, 7, 6, 2}, // 5: Blue + {6, 0, 1, 3, 7, 6, 4}, // 6: Yellow + {6, 0, 1, 5, 7, 6, 4}, // 7: Magenta + {6, 0, 1, 5, 7, 6, 2}, // 8: Cyan + {6, 1, 3, 2, 6, 7, 5}, // 9: Orange + {4, 2, 6, 7, 3, -1, -1}, // 10: Light Orange + {6, 0, 4, 6, 7, 3, 2}, // 11: Dark Orange + {4, 1, 3, 7, 5, -1, -1}, // 12: Pink + {6, 0, 4, 6, 7, 3, 2}, // 13: Light Pink + {4, 0, 4, 6, 2, -1, -1}, // 14: Deep Rose + {6, 0, 1, 3, 7, 5, 4}, // 15: Purple + {4, 0, 1, 5, 4, -1, -1}, // 16: Light Purple + {6, 0, 1, 5, 4, 6, 2}, // 17: Indigo + {6, 0, 2, 6, 7, 5, 
1}, // 18: Dark Green + {6, 0, 2, 6, 7, 3, 1}, // 19: Lime + {6, 0, 4, 6, 7, 3, 1}, // 20: Forest Green + {6, 0, 2, 3, 7, 5, 1}, // 21: Navy + {4, 0, 2, 3, 1, -1, -1}, // 22: Sky Blue + {6, 0, 4, 6, 2, 3, 1}, // 23: Teal + {6, 0, 2, 3, 7, 5, 4}, // 24: Brown + {6, 0, 2, 3, 1, 5, 4}, // 25: Tan/Beige + {6, 1, 5, 4, 6, 2, 3} // 26: Dark Brown }; // Converts UV into centered, aspect-corrected NDC circle space @@ -106,6 +161,33 @@ void computeCubeGeo() } } +float4 drawCorners(float3 spherePos, float aaWidth) +{ + float4 color = float4(0,0,0,0); + // Draw corner labels for debugging + for (int i = 0; i < 8; i++) + { + float3 corner = normalize(corners[i]); + float2 cornerPos = corner.xy; + // Project corner onto 2D circle space + + // Distance from current fragment to corner + float dist = length(spherePos.xy - cornerPos); + + // Draw a small colored dot at the corner + float dotSize = 0.03f; + float dotAlpha = 1.0f - smoothstep(dotSize - aaWidth, dotSize + aaWidth, dist); + + if (dotAlpha > 0.0f) + { + float brightness = float(i) / 7.0f; + float3 dotColor = colorLUT[i]; + color += float4(dotColor * dotAlpha, dotAlpha); + } + } + return color; +} + float4 drawRing(float2 p, float aaWidth) { float positionLength = length(p); @@ -194,54 +276,11 @@ float4 drawGreatCircleArc(float3 fragPos, int2 edgeVerts, int visibility, float return edgeColor * alpha * intensity; } -[[vk::location(0)]] float32_t4 main(SVertexAttributes vx) : SV_Target0 +float4 drawHiddenEdges(float3 spherePos, int configIndex, float aaWidth) { - float4 color = float4(0, 0, 0, 0); - float2 p = toCircleSpace(vx.uv); - - // Convert 2D disk position to 3D hemisphere position - float2 normalized = p / CIRCLE_RADIUS; - float r2 = dot(normalized, normalized); - - // Convert UV to 3D position on hemisphere - float3 spherePos = normalize(float3(normalized.x, normalized.y, sqrt(1 - r2))); - - computeCubeGeo(); - - float3 obbCenter = mul(pc.modelMatrix, float4(0, 0, 0, 1)).xyz; - - float3 viewDir = obbCenter; - - // 
Is this correct? - float dotX = dot(viewDir, float3(pc.modelMatrix[0][0], pc.modelMatrix[1][0], pc.modelMatrix[2][0])); - float dotY = dot(viewDir, float3(pc.modelMatrix[0][1], pc.modelMatrix[1][1], pc.modelMatrix[2][1])); - float dotZ = dot(viewDir, float3(pc.modelMatrix[0][2], pc.modelMatrix[1][2], pc.modelMatrix[2][2])); - - // Determine octant from ray direction signs - int octant = (dotX >= 0 ? 4 : 0) + - (dotY >= 0 ? 2 : 0) + - (dotZ >= 0 ? 1 : 0); - - if (all(vx.uv >= float2(0.49f, 0.49f) ) && all(vx.uv <= float2(0.51f, 0.51f))) - { - return float4(colorLUT[octant], 1.0f); - } - - float aaWidth = length(float2(ddx(vx.uv.x), ddy(vx.uv.y))); - - - // Draw the 6 silhouette edges - for (int i = 0; i < 6; i++) - { - int v0Idx = silhouettes[octant][i]; - int v1Idx = silhouettes[octant][(i + 1) % 6]; - - float4 edgeContribution = drawGreatCircleArc(spherePos, int2(v0Idx, v1Idx), 1, aaWidth); - color += float4(colorLUT[i] * edgeContribution.a, edgeContribution.a); - } - + float4 color = float4(0,0,0,0); // Draw the remaining edges (non-silhouette) in a different color - float3 hiddenEdgeColor = float3(0.3, 0.3, 0.3); // Gray color for hidden edges + float3 hiddenEdgeColor = float3(0.3, 0.3, 0); // dark yellow color for hidden edges for (int i = 0; i < 12; i++) { @@ -249,12 +288,14 @@ float4 drawGreatCircleArc(float3 fragPos, int2 edgeVerts, int visibility, float // Check if this edge is already drawn as a silhouette edge bool isSilhouette = false; - for (int j = 0; j < 6; j++) + int vertexCount = silhouettes[configIndex][0]; + // Draw the 6 silhouette edges + for (int i = 0; i < vertexCount; i++) { - int v0 = silhouettes[octant][j]; - int v1 = silhouettes[octant][(j + 1) % 6]; + int v0Idx = silhouettes[configIndex][i + 1]; + int v1Idx = silhouettes[configIndex][((i + 1) % vertexCount) + 1]; - if ((edge.x == v0 && edge.y == v1) || (edge.x == v1 && edge.y == v0)) + if ((edge.x == v0Idx && edge.y == v1Idx) || (edge.x == v1Idx && edge.y == v0Idx)) { isSilhouette = true; 
break; @@ -268,33 +309,66 @@ float4 drawGreatCircleArc(float3 fragPos, int2 edgeVerts, int visibility, float color += float4(hiddenEdgeColor * edgeContribution.a, edgeContribution.a); } } + return color; +} - // Draw corner labels for debugging - for (int i = 0; i < 8; i++) +[[vk::location(0)]] float32_t4 main(SVertexAttributes vx) : SV_Target0 +{ + float4 color = float4(0, 0, 0, 0); + float2 p = toCircleSpace(vx.uv); + + // Convert 2D disk position to 3D hemisphere position + float2 normalized = p / CIRCLE_RADIUS; + float r2 = dot(normalized, normalized); + float aaWidth = length(float2(ddx(vx.uv.x), ddy(vx.uv.y))); + + if (all(vx.uv >= float2(0.49f, 0.49f) ) && all(vx.uv <= float2(0.51f, 0.51f))) { - float3 corner = normalize(corners[i]); - float2 cornerPos = corner.xy; - // Project corner onto 2D circle space - - // Distance from current fragment to corner - float dist = length(spherePos.xy - cornerPos); - - // Draw a small colored dot at the corner - float dotSize = 0.03f; - float dotAlpha = 1.0f - smoothstep(dotSize - aaWidth, dotSize + aaWidth, dist); + return float4(colorLUT[configIndex], 1.0f); + } + + // Convert UV to 3D position on hemisphere + float3 spherePos = normalize(float3(normalized.x, normalized.y, sqrt(1 - r2))); + + computeCubeGeo(); + + // Get OBB center in world space + float3 obbCenter = mul(pc.modelMatrix, float4(0, 0, 0, 1)).xyz; + + float3x3 rotMatrix = (float3x3)pc.modelMatrix; + float3 proj = mul(obbCenter, rotMatrix); // Get all 3 projections at once + + // Get squared column lengths + float lenSqX = dot(rotMatrix[0], rotMatrix[0]); + float lenSqY = dot(rotMatrix[1], rotMatrix[1]); + float lenSqZ = dot(rotMatrix[2], rotMatrix[2]); + + int3 region = int3( + proj.x < -lenSqX ? 0 : (proj.x > lenSqX ? 2 : 1), + proj.y < -lenSqY ? 0 : (proj.y > lenSqY ? 2 : 1), + proj.z < -lenSqZ ? 0 : (proj.z > lenSqZ ? 
2 : 1) + ); + + int configIndex = region.x + region.y * 3 + region.z * 9; // 0-26 + + int vertexCount = silhouettes[configIndex][0]; + for (int i = 0; i < vertexCount; i++) + { + int v0Idx = silhouettes[configIndex][i + 1]; + int v1Idx = silhouettes[configIndex][((i + 1) % vertexCount) + 1]; - if (dotAlpha > 0.0f) - { - float brightness = float(i) / 7.0f; - float3 dotColor = colorLUT[i]; - color += float4(dotColor * dotAlpha, dotAlpha); - } + float4 edgeContribution = drawGreatCircleArc(spherePos, int2(v0Idx, v1Idx), 1, aaWidth); + color += float4(colorLUT[i] * edgeContribution.a, edgeContribution.a); } + color += drawHiddenEdges(spherePos, configIndex, aaWidth); + + color += drawCorners(spherePos, aaWidth); + color += drawRing(p, aaWidth); - // if (r2 > 1.1f) - // color.a = 0.0f; // Outside circle, make transparent + if (r2 > 1.1f) + color.a = 0.0f; // Outside circle, make transparent return color; } \ No newline at end of file diff --git a/72_SolidAngleVisualizer/include/transform.hpp b/72_SolidAngleVisualizer/include/transform.hpp index 639c0fa3a..105b2f757 100644 --- a/72_SolidAngleVisualizer/include/transform.hpp +++ b/72_SolidAngleVisualizer/include/transform.hpp @@ -19,7 +19,7 @@ struct TransformRequestParams struct TransformReturnInfo { - nbl::hlsl::uint16_t2 sceneResolution = { 0, 0 }; + nbl::hlsl::uint16_t2 sceneResolution = { 1, 1 }; bool isGizmoWindowHovered; bool isGizmoBeingUsed; }; diff --git a/72_SolidAngleVisualizer/main.cpp b/72_SolidAngleVisualizer/main.cpp index 8fb8bf144..5f73797a6 100644 --- a/72_SolidAngleVisualizer/main.cpp +++ b/72_SolidAngleVisualizer/main.cpp @@ -753,16 +753,17 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR // TODO: why is this a lambda and not just an assignment in a scope ? 
camera.setProjectionMatrix([&]() { - matrix4SIMD projection; + const auto& sceneRes = mainViewTransformReturnInfo.sceneResolution; + matrix4SIMD projection; if (isPerspective) if (isLH) - projection = matrix4SIMD::buildProjectionMatrixPerspectiveFovLH(core::radians(fov), io.DisplaySize.x / io.DisplaySize.y, zNear, zFar); + projection = matrix4SIMD::buildProjectionMatrixPerspectiveFovLH(core::radians(fov), sceneRes.x / sceneRes.y, zNear, zFar); else - projection = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH(core::radians(fov), io.DisplaySize.x / io.DisplaySize.y, zNear, zFar); + projection = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH(core::radians(fov), sceneRes.x / sceneRes.y, zNear, zFar); else { - float viewHeight = viewWidth * io.DisplaySize.y / io.DisplaySize.x; + float viewHeight = viewWidth * sceneRes.y / sceneRes.x; if (isLH) projection = matrix4SIMD::buildProjectionMatrixOrthoLH(viewWidth, viewHeight, zNear, zFar); From 0124cc9c0ad83d4a38f1e8ac3ddcdf56125740ac Mon Sep 17 00:00:00 2001 From: Karim Mohamed Date: Tue, 9 Dec 2025 00:30:34 +0300 Subject: [PATCH 08/26] Shader fixes, cast uint16 resolution to float --- .../app_resources/hlsl/SolidAngleVis.frag.hlsl | 16 +++++++++------- 72_SolidAngleVisualizer/main.cpp | 2 +- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl b/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl index fa0805356..ec30c2b64 100644 --- a/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl +++ b/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl @@ -322,10 +322,7 @@ float4 drawHiddenEdges(float3 spherePos, int configIndex, float aaWidth) float r2 = dot(normalized, normalized); float aaWidth = length(float2(ddx(vx.uv.x), ddy(vx.uv.y))); - if (all(vx.uv >= float2(0.49f, 0.49f) ) && all(vx.uv <= float2(0.51f, 0.51f))) - { - return float4(colorLUT[configIndex], 1.0f); - } + // Convert UV to 3D 
position on hemisphere float3 spherePos = normalize(float3(normalized.x, normalized.y, sqrt(1 - r2))); @@ -350,7 +347,7 @@ float4 drawHiddenEdges(float3 spherePos, int configIndex, float aaWidth) ); int configIndex = region.x + region.y * 3 + region.z * 9; // 0-26 - + int vertexCount = silhouettes[configIndex][0]; for (int i = 0; i < vertexCount; i++) { @@ -367,8 +364,13 @@ float4 drawHiddenEdges(float3 spherePos, int configIndex, float aaWidth) color += drawRing(p, aaWidth); - if (r2 > 1.1f) - color.a = 0.0f; // Outside circle, make transparent + if (all(vx.uv >= float2(0.49f, 0.49f) ) && all(vx.uv <= float2(0.51f, 0.51f))) + { + return float4(colorLUT[configIndex], 1.0f); + } + + // if (r2 > 1.1f) + // color.a = 0.0f; // Outside circle, make transparent return color; } \ No newline at end of file diff --git a/72_SolidAngleVisualizer/main.cpp b/72_SolidAngleVisualizer/main.cpp index 5f73797a6..85685e705 100644 --- a/72_SolidAngleVisualizer/main.cpp +++ b/72_SolidAngleVisualizer/main.cpp @@ -753,7 +753,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR // TODO: why is this a lambda and not just an assignment in a scope ? 
camera.setProjectionMatrix([&]() { - const auto& sceneRes = mainViewTransformReturnInfo.sceneResolution; + const auto& sceneRes = float16_t2(mainViewTransformReturnInfo.sceneResolution); matrix4SIMD projection; if (isPerspective) From a35eddd1bd83fbf636e820b59c6eef939ed09668 Mon Sep 17 00:00:00 2001 From: Karim Mohamed Date: Tue, 9 Dec 2025 00:44:42 +0300 Subject: [PATCH 09/26] Better color for non-silhouette edges --- .../app_resources/hlsl/SolidAngleVis.frag.hlsl | 2 +- 72_SolidAngleVisualizer/main.cpp | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl b/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl index ec30c2b64..51cb1946d 100644 --- a/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl +++ b/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl @@ -280,7 +280,7 @@ float4 drawHiddenEdges(float3 spherePos, int configIndex, float aaWidth) { float4 color = float4(0,0,0,0); // Draw the remaining edges (non-silhouette) in a different color - float3 hiddenEdgeColor = float3(0.3, 0.3, 0); // dark yellow color for hidden edges + float3 hiddenEdgeColor = float3(0.1, 0.1, 0.1); // dark yellow color for hidden edges for (int i = 0; i < 12; i++) { diff --git a/72_SolidAngleVisualizer/main.cpp b/72_SolidAngleVisualizer/main.cpp index 85685e705..e9266520d 100644 --- a/72_SolidAngleVisualizer/main.cpp +++ b/72_SolidAngleVisualizer/main.cpp @@ -933,9 +933,6 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR transformParams.editTransformDecomposition = true; mainViewTransformReturnInfo = EditTransform(&imguizmoM16InOut.view[0][0], &imguizmoM16InOut.projection[0][0], &imguizmoM16InOut.model[0][0], transformParams); - // MODEL: Zup -> Yup - - m_OBBModelMatrix = imguizmoM16InOut.model; // TODO: camera stops when cursor hovers gizmo, but we also want to stop when gizmo is being used move = 
(ImGui::IsMouseDown(ImGuiMouseButton_Left) || mainViewTransformReturnInfo.isGizmoWindowHovered) && (!mainViewTransformReturnInfo.isGizmoBeingUsed); From 1c6458d81b83aea176ac7ebda7450a9b395a85bd Mon Sep 17 00:00:00 2001 From: Karim Mohamed Date: Wed, 17 Dec 2025 22:23:10 +0300 Subject: [PATCH 10/26] A lot more debuggability, and: - Camera movement is disabled correctly - Hacked ViewManipulate to use for the cube itself - Added a storage buffer for debugging and getting stuff from GPU to CPU - Most importantly, disabled skew, used TRS for that - Random OBB buttons - Detection of mismatch of silhouette vertices (between slow more correct algo vs fast LUT based algo) --- .../app_resources/hlsl/Drawing.hlsl | 172 +++++ .../hlsl/SolidAngleVis.frag.hlsl | 644 +++++++++--------- .../app_resources/hlsl/common.hlsl | 49 +- .../app_resources/hlsl/utils.hlsl | 23 + 72_SolidAngleVisualizer/include/transform.hpp | 73 +- 72_SolidAngleVisualizer/main.cpp | 375 ++++++++-- .../include/nbl/examples/cameras/CCamera.hpp | 5 + 7 files changed, 939 insertions(+), 402 deletions(-) create mode 100644 72_SolidAngleVisualizer/app_resources/hlsl/Drawing.hlsl create mode 100644 72_SolidAngleVisualizer/app_resources/hlsl/utils.hlsl diff --git a/72_SolidAngleVisualizer/app_resources/hlsl/Drawing.hlsl b/72_SolidAngleVisualizer/app_resources/hlsl/Drawing.hlsl new file mode 100644 index 000000000..c3cb5befa --- /dev/null +++ b/72_SolidAngleVisualizer/app_resources/hlsl/Drawing.hlsl @@ -0,0 +1,172 @@ +#ifndef _DEBUG_HLSL_ +#define _DEBUG_HLSL_ +#include "common.hlsl" + +float2 sphereToCircle(float3 spherePoint) +{ + if (spherePoint.z >= 0.0f) + { + return spherePoint.xy * CIRCLE_RADIUS; + } + else + { + float r2 = (1.0f - spherePoint.z) / (1.0f + spherePoint.z); + float uv2Plus1 = r2 + 1.0f; + return (spherePoint.xy * uv2Plus1 / 2.0f) * CIRCLE_RADIUS; + } +} + +float4 drawGreatCircleArc(float3 fragPos, float3 points[2], int visibility, float aaWidth) +{ + if (visibility == 0) return float4(0,0,0,0); 
+ + float3 v0 = normalize(points[0]); + float3 v1 = normalize(points[1]); + float3 p = normalize(fragPos); + + float3 arcNormal = normalize(cross(v0, v1)); + float dist = abs(dot(p, arcNormal)); + + float dotMid = dot(v0, v1); + bool onArc = (dot(p, v0) >= dotMid) && (dot(p, v1) >= dotMid); + + if (!onArc) return float4(0,0,0,0); + + float avgDepth = (length(points[0]) + length(points[1])) * 0.5f; + float depthScale = 3.0f / avgDepth; + + float baseWidth = (visibility == 1) ? 0.01f : 0.005f; + float width = min(baseWidth * depthScale, 0.02f); + + float alpha = 1.0f - smoothstep(width - aaWidth, width + aaWidth, dist); + + float4 edgeColor = (visibility == 1) ? + float4(0.0f, 0.5f, 1.0f, 1.0f) : + float4(1.0f, 0.0f, 0.0f, 1.0f); + + float intensity = (visibility == 1) ? 1.0f : 0.5f; + return edgeColor * alpha * intensity; +} + +float4 drawHiddenEdges(float3 spherePos, uint32_t silEdgeMask, float aaWidth) +{ + float4 color = float4(0,0,0,0); + float3 hiddenEdgeColor = float3(0.1, 0.1, 0.1); + + for (int i = 0; i < 12; i++) + { + if ((silEdgeMask & (1u << i)) == 0) + { + int2 edge = allEdges[i]; + float3 edgePoints[2] = { corners[edge.x], corners[edge.y] }; + float4 edgeContribution = drawGreatCircleArc(spherePos, edgePoints, 1, aaWidth); + color += float4(hiddenEdgeColor * edgeContribution.a, edgeContribution.a); + } + } + return color; +} + +float4 drawCorners(float3 spherePos, float2 p, float aaWidth) +{ + float4 color = float4(0,0,0,0); + for (int i = 0; i < 8; i++) + { + float3 corner3D = normalize(corners[i]); + float2 cornerPos = sphereToCircle(corner3D); + float dist = length(p - cornerPos); + float dotSize = 0.02f; + float dotAlpha = 1.0f - smoothstep(dotSize - aaWidth, dotSize + aaWidth, dist); + if (dotAlpha > 0.0f) + { + float3 dotColor = colorLUT[i]; + color += float4(dotColor * dotAlpha, dotAlpha); + } + } + return color; +} + +float4 drawRing(float2 p, float aaWidth) +{ + float positionLength = length(p); + float ringWidth = 0.002f; + float ringDistance 
= abs(positionLength - CIRCLE_RADIUS); + float ringAlpha = 1.0f - smoothstep(ringWidth - aaWidth, ringWidth + aaWidth, ringDistance); + return ringAlpha * float4(1, 1, 1, 1); +} + +// Check if a face on the hemisphere is visible from camera at origin +bool isFaceVisible(float3 faceCenter, float3 faceNormal) +{ + float3 viewVec = normalize(-faceCenter); // Vector from camera to face + return dot(faceNormal, viewVec) > 0.0f; +} + +int getEdgeVisibility(int edgeIdx) +{ + int2 faces = edgeToFaces[edgeIdx]; + + // Transform normals to world space + float3x3 rotMatrix = (float3x3)pc.modelMatrix; + float3 n_world_f1 = mul(rotMatrix, localNormals[faces.x]); + float3 n_world_f2 = mul(rotMatrix, localNormals[faces.y]); + + bool visible1 = isFaceVisible(faceCenters[faces.x], n_world_f1); + bool visible2 = isFaceVisible(faceCenters[faces.y], n_world_f2); + + // Silhouette: exactly one face visible + if (visible1 != visible2) return 1; + + // Inner edge: both faces visible + if (visible1 && visible2) return 2; + + // Hidden edge: both faces hidden + return 0; +} + +#if DEBUG_DATA +uint32_t computeGroundTruthEdgeMask() +{ + uint32_t mask = 0u; + NBL_UNROLL + for (int j = 0; j < 12; j++) + { + // getEdgeVisibility returns 1 for a silhouette edge based on 3D geometry + if (getEdgeVisibility(j) == 1) + { + mask |= (1u << j); + } + } + return mask; +} + +void validateEdgeVisibility(uint32_t sil, int vertexCount, uint32_t generatedSilMask) +{ + uint32_t mismatchAccumulator = 0; + + // The Ground Truth now represents the full 3D silhouette, clipped or not. + uint32_t groundTruthMask = computeGroundTruthEdgeMask(); + + // The comparison checks if the generated mask perfectly matches the full 3D ground truth. 
+ uint32_t mismatchMask = groundTruthMask ^ generatedSilMask; + + if (mismatchMask != 0) + { + NBL_UNROLL + for (int j = 0; j < 12; j++) + { + if ((mismatchMask >> j) & 1u) + { + int2 edge = allEdges[j]; + // Accumulate vertex indices where error occurred + mismatchAccumulator |= (1u << edge.x) | (1u << edge.y); + } + } + } + + // Simple Write (assuming all fragments calculate the same result) + InterlockedOr(DebugDataBuffer[0].edgeVisibilityMismatch, mismatchAccumulator); +} +#endif + + +#endif // _DEBUG_HLSL_ diff --git a/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl b/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl index 51cb1946d..cd291dbd2 100644 --- a/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl +++ b/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl @@ -1,376 +1,374 @@ #pragma wave shader_stage(fragment) #include "common.hlsl" - #include +#include "utils.hlsl" using namespace nbl::hlsl; using namespace ext::FullScreenTriangle; [[vk::push_constant]] struct PushConstants pc; +[[vk::binding(0, 0)]] RWStructuredBuffer DebugDataBuffer; -static const float CIRCLE_RADIUS = 0.75f; +static const float CIRCLE_RADIUS = 0.5f; // --- Geometry Utils --- -// Adjacency of edges to faces -static const int2 edgeToFaces[12] = { - {4,2}, {3,4}, {2,5}, {5,3}, - {2,0}, {0,3}, {1,2}, {3,1}, - {0,4}, {5,0}, {4,1}, {1,5} -}; - -//float3(i % 2, (i / 2) % 2, (i / 4) % 2) * 2.0f - 1.0f static const float3 constCorners[8] = { - float3(-1, -1, -1), // 0 - float3( 1, -1, -1), // 1 - float3(-1, 1, -1), // 2 - float3( 1, 1, -1), // 3 - float3(-1, -1, 1), // 4 - float3( 1, -1, 1), // 5 - float3(-1, 1, 1), // 6 - float3( 1, 1, 1) // 7 + float3(-1, -1, -1), float3(1, -1, -1), float3(-1, 1, -1), float3(1, 1, -1), + float3(-1, -1, 1), float3(1, -1, 1), float3(-1, 1, 1), float3(1, 1, 1) }; -// All 12 edges of the cube (vertex index pairs) static const int2 allEdges[12] = { - {0, 1}, {2, 3}, {4, 5}, {6, 7}, // Edges 
along X axis - {0, 2}, {1, 3}, {4, 6}, {5, 7}, // Edges along Y axis - {0, 4}, {1, 5}, {2, 6}, {3, 7} // Edges along Z axis + {0, 1}, {2, 3}, {4, 5}, {6, 7}, // X axis + {0, 2}, {1, 3}, {4, 6}, {5, 7}, // Y axis + {0, 4}, {1, 5}, {2, 6}, {3, 7} // Z axis }; -static const float3 localNormals[6] = { - float3(0, 0, -1), // Face 0 (Z-) - float3(0, 0, 1), // Face 1 (Z+) - float3(-1, 0, 0), // Face 2 (X-) - float3(1, 0, 0), // Face 3 (X+) - float3(0, -1, 0), // Face 4 (Y-) - float3(0, 1, 0) // Face 5 (Y+) +// Adjacency of edges to faces +// Corrected Adjacency of edges to faces +static const int2 edgeToFaces[12] = { + // Edge Index: | allEdges[i] | Shared Faces: + + /* 0 (0-1) */ {4, 0}, // Y- (4) and Z- (0) + /* 1 (2-3) */ {5, 0}, // Y+ (5) and Z- (0) + /* 2 (4-5) */ {4, 1}, // Y- (4) and Z+ (1) + /* 3 (6-7) */ {5, 1}, // Y+ (5) and Z+ (1) + + /* 4 (0-2) */ {2, 0}, // X- (2) and Z- (0) + /* 5 (1-3) */ {3, 0}, // X+ (3) and Z- (0) + /* 6 (4-6) */ {2, 1}, // X- (2) and Z+ (1) + /* 7 (5-7) */ {3, 1}, // X+ (3) and Z+ (1) + + /* 8 (0-4) */ {2, 4}, // X- (2) and Y- (4) + /* 9 (1-5) */ {3, 4}, // X+ (3) and Y- (4) + /* 10 (2-6) */ {2, 5}, // X- (2) and Y+ (5) + /* 11 (3-7) */ {3, 5} // X+ (3) and Y+ (5) }; - static float3 corners[8]; -static float3 faceCenters[6] = { float3(0,0,0), float3(0,0,0), float3(0,0,0), - float3(0,0,0), float3(0,0,0), float3(0,0,0) }; - - -static const float3 colorLUT[27] = { - // Row 1: Pure and bright colors - float3(0, 0, 0), // 0: Black - float3(1, 1, 1), // 1: White - float3(0.5, 0.5, 0.5), // 2: Gray - - // Row 2: Primary colors - float3(1, 0, 0), // 3: Red - float3(0, 1, 0), // 4: Green - float3(0, 0, 1), // 5: Blue - - // Row 3: Secondary colors - float3(1, 1, 0), // 6: Yellow - float3(1, 0, 1), // 7: Magenta - float3(0, 1, 1), // 8: Cyan - - // Row 4: Orange family - float3(1, 0.5, 0), // 9: Orange - float3(1, 0.65, 0), // 10: Light Orange - float3(0.8, 0.4, 0), // 11: Dark Orange - - // Row 5: Pink/Rose family - float3(1, 0.4, 0.7), // 12: 
Pink - float3(1, 0.75, 0.8), // 13: Light Pink - float3(0.7, 0.1, 0.3), // 14: Deep Rose - - // Row 6: Purple/Violet family - float3(0.5, 0, 0.5), // 15: Purple - float3(0.6, 0.4, 0.8), // 16: Light Purple - float3(0.3, 0, 0.5), // 17: Indigo - - // Row 7: Green variations - float3(0, 0.5, 0), // 18: Dark Green - float3(0.5, 1, 0), // 19: Lime - float3(0, 0.5, 0.25), // 20: Forest Green - - // Row 8: Blue variations - float3(0, 0, 0.5), // 21: Navy - float3(0.3, 0.7, 1), // 22: Sky Blue - float3(0, 0.4, 0.6), // 23: Teal - - // Row 9: Earth tones - float3(0.6, 0.4, 0.2), // 24: Brown - float3(0.8, 0.7, 0.3), // 25: Tan/Beige - float3(0.4, 0.3, 0.1) // 26: Dark Brown +static float3 faceCenters[6] = { + float3(0,0,0), float3(0,0,0), float3(0,0,0), + float3(0,0,0), float3(0,0,0), float3(0,0,0) +}; + +static const float3 localNormals[6] = { + float3(0, 0, -1), // Face 0 (Z-) + float3(0, 0, 1), // Face 1 (Z+) + float3(-1, 0, 0), // Face 2 (X-) + float3(1, 0, 0), // Face 3 (X+) + float3(0, -1, 0), // Face 4 (Y-) + float3(0, 1, 0) // Face 5 (Y+) }; - +// TODO: unused, remove later // Vertices are ordered CCW relative to the camera view. 
static const int silhouettes[27][7] = { - {6, 1, 3, 2, 6, 4, 5}, // 0: Black - {6, 2, 6, 4, 5, 7, 3}, // 1: White - {6, 0, 4, 5, 7, 3, 2}, // 2: Gray - {6, 1, 3, 7, 6, 4, 5,}, // 3: Red - {4, 4, 5, 7, 6, -1, -1}, // 4: Green - {6, 0, 4, 5, 7, 6, 2}, // 5: Blue - {6, 0, 1, 3, 7, 6, 4}, // 6: Yellow - {6, 0, 1, 5, 7, 6, 4}, // 7: Magenta - {6, 0, 1, 5, 7, 6, 2}, // 8: Cyan - {6, 1, 3, 2, 6, 7, 5}, // 9: Orange - {4, 2, 6, 7, 3, -1, -1}, // 10: Light Orange - {6, 0, 4, 6, 7, 3, 2}, // 11: Dark Orange - {4, 1, 3, 7, 5, -1, -1}, // 12: Pink - {6, 0, 4, 6, 7, 3, 2}, // 13: Light Pink - {4, 0, 4, 6, 2, -1, -1}, // 14: Deep Rose - {6, 0, 1, 3, 7, 5, 4}, // 15: Purple - {4, 0, 1, 5, 4, -1, -1}, // 16: Light Purple - {6, 0, 1, 5, 4, 6, 2}, // 17: Indigo - {6, 0, 2, 6, 7, 5, 1}, // 18: Dark Green - {6, 0, 2, 6, 7, 3, 1}, // 19: Lime - {6, 0, 4, 6, 7, 3, 1}, // 20: Forest Green - {6, 0, 2, 3, 7, 5, 1}, // 21: Navy - {4, 0, 2, 3, 1, -1, -1}, // 22: Sky Blue - {6, 0, 4, 6, 2, 3, 1}, // 23: Teal - {6, 0, 2, 3, 7, 5, 4}, // 24: Brown - {6, 0, 2, 3, 1, 5, 4}, // 25: Tan/Beige - {6, 1, 5, 4, 6, 2, 3} // 26: Dark Brown + {6, 1, 3, 2, 6, 4, 5}, // 0: Black + {6, 2, 6, 4, 5, 7, 3}, // 1: White + {6, 0, 4, 5, 7, 3, 2}, // 2: Gray + {6, 1, 3, 7, 6, 4, 5,}, // 3: Red + {4, 4, 5, 7, 6, -1, -1}, // 4: Green + {6, 0, 4, 5, 7, 6, 2}, // 5: Blue + {6, 0, 1, 3, 7, 6, 4}, // 6: Yellow + {6, 0, 1, 5, 7, 6, 4}, // 7: Magenta + {6, 0, 1, 5, 7, 6, 2}, // 8: Cyan + {6, 1, 3, 2, 6, 7, 5}, // 9: Orange + {4, 2, 6, 7, 3, -1, -1}, // 10: Light Orange + {6, 0, 4, 6, 7, 3, 2}, // 11: Dark Orange + {4, 1, 3, 7, 5, -1, -1}, // 12: Pink + {6, 0, 4, 6, 7, 3, 2}, // 13: Light Pink + {4, 0, 4, 6, 2, -1, -1}, // 14: Deep Rose + {6, 0, 1, 3, 7, 5, 4}, // 15: Purple + {4, 0, 1, 5, 4, -1, -1}, // 16: Light Purple + {6, 0, 1, 5, 4, 6, 2}, // 17: Indigo + {6, 0, 2, 6, 7, 5, 1}, // 18: Dark Green + {6, 0, 2, 6, 7, 3, 1}, // 19: Lime + {6, 0, 4, 6, 7, 3, 1}, // 20: Forest Green + {6, 0, 2, 3, 7, 5, 1}, // 21: Navy + {4, 
0, 2, 3, 1, -1, -1}, // 22: Sky Blue + {6, 0, 4, 6, 2, 3, 1}, // 23: Teal + {6, 0, 2, 3, 7, 5, 4}, // 24: Brown + {6, 0, 2, 3, 1, 5, 4}, // 25: Tan/Beige + {6, 1, 5, 4, 6, 2, 3} // 26: Dark Brown }; -// Converts UV into centered, aspect-corrected NDC circle space -float2 toCircleSpace(float2 uv) -{ - // Map [0,1] UV to [-1,1] - float2 p = uv * 2.0f - 1.0f; - - // Correct aspect ratio - float aspect = pc.viewport.z / pc.viewport.w; // width / height - p.x *= aspect; - - return p * CIRCLE_RADIUS; -} +// Binary packed silhouettes +static const uint32_t binSilhouettes[27] = { + 0b11000000000000101100110010011001, + 0b11000000000000011111101100110010, + 0b11000000000000010011111101100000, + 0b11000000000000101100110111011001, + 0b10000000000000000000110111101100, + 0b11000000000000010110111101100000, + 0b11000000000000100110111011001000, + 0b11000000000000100110111101001000, + 0b11000000000000010110111101001000, + 0b11000000000000101111110010011001, + 0b10000000000000000000011111110010, + 0b11000000000000010011111110100000, + 0b10000000000000000000101111011001, + 0b11000000000000010011111110100000, + 0b10000000000000000000010110100000, + 0b11000000000000100101111011001000, + 0b10000000000000000000100101001000, + 0b11000000000000010110100101001000, + 0b11000000000000001101111110010000, + 0b11000000000000001011111110010000, + 0b11000000000000001011111110100000, + 0b11000000000000001101111011010000, + 0b10000000000000000000001011010000, + 0b11000000000000001011010110100000, + 0b11000000000000100101111011010000, + 0b11000000000000100101001011010000, + 0b11000000000000011010110100101001, +}; -void computeCubeGeo() +int getSilhouetteVertex(uint32_t packedSil, int index) { - for (int i = 0; i < 8; i++) - { - float3 localPos = constCorners[i]; //float3(i % 2, (i / 2) % 2, (i / 4) % 2) * 2.0f - 1.0f; - float3 worldPos = mul(pc.modelMatrix, float4(localPos, 1.0f)).xyz; - - corners[i] = worldPos.xyz; - - faceCenters[i/4] += worldPos / 4.0f; - faceCenters[2+i%2] += worldPos / 4.0f; 
- faceCenters[4+(i/2)%2] += worldPos / 4.0f; - } + return (packedSil >> (3 * index)) & 0x7; } -float4 drawCorners(float3 spherePos, float aaWidth) +// Get silhouette size +int getSilhouetteSize(uint32_t sil) { - float4 color = float4(0,0,0,0); - // Draw corner labels for debugging - for (int i = 0; i < 8; i++) - { - float3 corner = normalize(corners[i]); - float2 cornerPos = corner.xy; - // Project corner onto 2D circle space - - // Distance from current fragment to corner - float dist = length(spherePos.xy - cornerPos); - - // Draw a small colored dot at the corner - float dotSize = 0.03f; - float dotAlpha = 1.0f - smoothstep(dotSize - aaWidth, dotSize + aaWidth, dist); - - if (dotAlpha > 0.0f) - { - float brightness = float(i) / 7.0f; - float3 dotColor = colorLUT[i]; - color += float4(dotColor * dotAlpha, dotAlpha); - } - } - return color; + return (sil >> 29) & 0x7; + } -float4 drawRing(float2 p, float aaWidth) +// Check if vertex has negative z +bool getVertexZNeg(int vertexIdx) { - float positionLength = length(p); - - // Add a white background circle ring - float ringWidth = 0.01f; - float ringDistance = abs(positionLength - CIRCLE_RADIUS); - float ringAlpha = 1.0f - smoothstep(ringWidth - aaWidth, ringWidth + aaWidth, ringDistance); - - return ringAlpha * float4(1, 1, 1, 1); + return normalize(corners[vertexIdx]).z < 0.0f; } -// Check if a face on the hemisphere is visible from camera at origin -bool isFaceVisible(float3 faceCenter, float3 faceNormal) +#include "Drawing.hlsl" + + +void setDebugData(uint32_t sil, int3 region, int configIndex, uint32_t clippedVertexCount) { - // Face is visible if normal points toward camera (at origin) - float3 viewVec = -normalize(faceCenter); // Vector from face to camera - return dot(faceNormal, viewVec) > 0.0f; +#if DEBUG_DATA + DebugDataBuffer[0].silhouetteVertexCount = uint32_t(getSilhouetteSize(sil)); + DebugDataBuffer[0].region = uint3(region); + DebugDataBuffer[0].silhouetteIndex = uint32_t(configIndex); + 
DebugDataBuffer[0].clippedVertexCount = clippedVertexCount; + for (int i = 0; i < 6; i++) + { + DebugDataBuffer[0].vertices[i] = uint32_t(getSilhouetteVertex(sil, i)); + } + DebugDataBuffer[0].silhouette = sil; +#endif } -int getEdgeVisibility(int edgeIdx, float3 cameraPos) +float2 toCircleSpace(float2 uv) { - int2 faces = edgeToFaces[edgeIdx]; - - // Transform normals to world space - float3x3 rotMatrix = (float3x3)pc.modelMatrix; - float3 n_world_f1 = mul(rotMatrix, localNormals[faces.x]); - float3 n_world_f2 = mul(rotMatrix, localNormals[faces.y]); - - bool visible1 = isFaceVisible(faceCenters[faces.x], n_world_f1); - bool visible2 = isFaceVisible(faceCenters[faces.y], n_world_f2); - - // Silhouette: exactly one face visible - if (visible1 != visible2) return 1; - - // Inner edge: both faces visible - if (visible1 && visible2) return 2; - - // Hidden edge: both faces hidden - return 0; + float2 p = uv * 2.0f - 1.0f; + float aspect = pc.viewport.z / pc.viewport.w; + p.x *= aspect; + return p; } -// Draw great circle arc in fragment shader with horizon clipping -float4 drawGreatCircleArc(float3 fragPos, int2 edgeVerts, int visibility, float aaWidth) +uint32_t packSilhouette(const int s[7]) { - if (visibility == 0) return float4(0,0,0,0); // Hidden edge - - float3 v0 = normalize(corners[edgeVerts.x]); - float3 v1 = normalize(corners[edgeVerts.y]); - float3 p = normalize(fragPos); // Current point on hemisphere - - // HORIZON CLIPPING: Current fragment must be on front hemisphere - if (p.z < 0.0f) - return float4(0,0,0,0); - - // HORIZON CLIPPING: Skip edge if both endpoints are behind horizon - if (v0.z < 0.0f && v1.z < 0.0f) - return float4(0,0,0,0); - - // Great circle plane normal - float3 arcNormal = normalize(cross(v0, v1)); - - // Distance to great circle - float dist = abs(dot(p, arcNormal)); - - // Check if point is within arc bounds - float dotMid = dot(v0, v1); - bool onArc = (dot(p, v0) >= dotMid) && (dot(p, v1) >= dotMid); - - if (!onArc) return 
float4(0,0,0,0); - - // Depth-based width scaling - float avgDepth = (length(corners[edgeVerts.x]) + length(corners[edgeVerts.y])) * 0.5f; - float depthScale = 3.0f / avgDepth; - - float baseWidth = (visibility == 1) ? 0.01f : 0.005f; - float width = min(baseWidth * depthScale, 0.02f); - - float alpha = 1.0f - smoothstep(width - aaWidth, width + aaWidth, dist); - - float4 edgeColor = (visibility == 1) ? - float4(0.0f, 0.5f, 1.0f, 1.0f) : // Silhouette: blue - float4(1.0f, 0.0f, 0.0f, 1.0f); // Inner: red - - float intensity = (visibility == 1) ? 1.0f : 0.5f; - return edgeColor * alpha * intensity; + uint32_t packed = 0; + int size = s[0] & 0x7; // 3 bits for size + + // Pack vertices LSB-first (vertex1 in lowest 3 bits above size) + for (int i = 1; i <= 6; ++i) { + int v = s[i]; + if (v < 0) v = 0; // replace unused vertices with 0 + packed |= (v & 0x7) << (3 * (i - 1)); // vertex i-1 shifted by 3*(i-1) + } + + // Put size in the MSB (bits 29-31 for a 32-bit uint, leaving 29 bits for vertices) + packed |= (size & 0x7) << 29; + + return packed; } -float4 drawHiddenEdges(float3 spherePos, int configIndex, float aaWidth) +void computeCubeGeo() { - float4 color = float4(0,0,0,0); - // Draw the remaining edges (non-silhouette) in a different color - float3 hiddenEdgeColor = float3(0.1, 0.1, 0.1); // dark yellow color for hidden edges - - for (int i = 0; i < 12; i++) - { - int2 edge = allEdges[i]; - - // Check if this edge is already drawn as a silhouette edge - bool isSilhouette = false; - int vertexCount = silhouettes[configIndex][0]; - // Draw the 6 silhouette edges - for (int i = 0; i < vertexCount; i++) - { - int v0Idx = silhouettes[configIndex][i + 1]; - int v1Idx = silhouettes[configIndex][((i + 1) % vertexCount) + 1]; - - if ((edge.x == v0Idx && edge.y == v1Idx) || (edge.x == v1Idx && edge.y == v0Idx)) - { - isSilhouette = true; - break; - } - } - - // Only draw if it's not a silhouette edge - if (!isSilhouette) - { - float4 edgeContribution = 
drawGreatCircleArc(spherePos, edge, 1, aaWidth); - color += float4(hiddenEdgeColor * edgeContribution.a, edgeContribution.a); - } - } - return color; + for (int i = 0; i < 8; i++) + for (int i = 0; i < 8; i++) + { + float3 localPos = constCorners[i]; + float3 worldPos = mul(pc.modelMatrix, float4(localPos, 1.0f)).xyz; + corners[i] = worldPos.xyz; + faceCenters[i / 4] += worldPos / 4.0f; + faceCenters[2 + i % 2] += worldPos / 4.0f; + faceCenters[4 + (i / 2) % 2] += worldPos / 4.0f; + } } [[vk::location(0)]] float32_t4 main(SVertexAttributes vx) : SV_Target0 { - float4 color = float4(0, 0, 0, 0); - float2 p = toCircleSpace(vx.uv); - - // Convert 2D disk position to 3D hemisphere position - float2 normalized = p / CIRCLE_RADIUS; - float r2 = dot(normalized, normalized); - float aaWidth = length(float2(ddx(vx.uv.x), ddy(vx.uv.y))); - - - - // Convert UV to 3D position on hemisphere - float3 spherePos = normalize(float3(normalized.x, normalized.y, sqrt(1 - r2))); - - computeCubeGeo(); - - // Get OBB center in world space - float3 obbCenter = mul(pc.modelMatrix, float4(0, 0, 0, 1)).xyz; - - float3x3 rotMatrix = (float3x3)pc.modelMatrix; - float3 proj = mul(obbCenter, rotMatrix); // Get all 3 projections at once - - // Get squared column lengths - float lenSqX = dot(rotMatrix[0], rotMatrix[0]); - float lenSqY = dot(rotMatrix[1], rotMatrix[1]); - float lenSqZ = dot(rotMatrix[2], rotMatrix[2]); - - int3 region = int3( - proj.x < -lenSqX ? 0 : (proj.x > lenSqX ? 2 : 1), - proj.y < -lenSqY ? 0 : (proj.y > lenSqY ? 2 : 1), - proj.z < -lenSqZ ? 0 : (proj.z > lenSqZ ? 
2 : 1) - ); - - int configIndex = region.x + region.y * 3 + region.z * 9; // 0-26 - - int vertexCount = silhouettes[configIndex][0]; - for (int i = 0; i < vertexCount; i++) - { - int v0Idx = silhouettes[configIndex][i + 1]; - int v1Idx = silhouettes[configIndex][((i + 1) % vertexCount) + 1]; - - float4 edgeContribution = drawGreatCircleArc(spherePos, int2(v0Idx, v1Idx), 1, aaWidth); - color += float4(colorLUT[i] * edgeContribution.a, edgeContribution.a); - } - - color += drawHiddenEdges(spherePos, configIndex, aaWidth); - - color += drawCorners(spherePos, aaWidth); - - color += drawRing(p, aaWidth); - - if (all(vx.uv >= float2(0.49f, 0.49f) ) && all(vx.uv <= float2(0.51f, 0.51f))) - { - return float4(colorLUT[configIndex], 1.0f); - } - - // if (r2 > 1.1f) - // color.a = 0.0f; // Outside circle, make transparent - - return color; + float4 color = float4(0, 0, 0, 0); + float aaWidth = length(float2(ddx(vx.uv.x), ddy(vx.uv.y))); + float2 p = toCircleSpace(vx.uv); + + float2 normalized = p / CIRCLE_RADIUS; + float r2 = dot(normalized, normalized); + + float3 spherePos; + if (r2 <= 1.0f) + { + spherePos = float3(normalized.x, normalized.y, sqrt(1.0f - r2)); + } + else + { + float uv2Plus1 = r2 + 1.0f; + spherePos = float3(normalized.x * 2.0f, normalized.y * 2.0f, 1.0f - r2) / uv2Plus1; + } + spherePos = normalize(spherePos); + + computeCubeGeo(); + + float3 obbCenter = mul(pc.modelMatrix, float4(0, 0, 0, 1)).xyz; + + float3x3 upper3x3 = (float3x3)pc.modelMatrix; + +#if 1 + // Compute reciprocal scales + float3 rcpScales = rsqrt(float3( + dot(upper3x3[0], upper3x3[0]), + dot(upper3x3[1], upper3x3[1]), + dot(upper3x3[2], upper3x3[2]) + )); + + // Build inverse-rotation-only matrix + float3x3 invRot; + invRot[0] = upper3x3[0] * rcpScales.x; + invRot[1] = upper3x3[1] * rcpScales.y; + invRot[2] = upper3x3[2] * rcpScales.z; + + // Project center into OBB local space + float3 normalizedProj = mul(invRot, obbCenter); +#else + float3 normalizedProj = mul(inverse(upper3x3), 
obbCenter); +#endif + int3 region = int3( + normalizedProj.x < -1.0f ? 0 : (normalizedProj.x > 1.0f ? 2 : 1), + normalizedProj.y < -1.0f ? 0 : (normalizedProj.y > 1.0f ? 2 : 1), + normalizedProj.z < -1.0f ? 0 : (normalizedProj.z > 1.0f ? 2 : 1) + ); + int configIndex = region.x + region.y * 3 + region.z * 9; + + // uint32_t sil = packSilhouette(silhouettes[configIndex]); + uint32_t sil = binSilhouettes[configIndex]; + + int vertexCount = getSilhouetteSize(sil); + bool longSilhouette = (vertexCount == 6); + uint32_t silEdgeMask = 0; + +#if DEBUG_DATA + { + for (int i = 0; i < vertexCount; i++) + { + int vIdx = i % vertexCount; + int v1Idx = (i + 1) % vertexCount; + + int v0Corner = getSilhouetteVertex(sil, vIdx); + int v1Corner = getSilhouetteVertex(sil, v1Idx); + // Mark edge as part of silhouette + for (int e = 0; e < 12; e++) + { + int2 edge = allEdges[e]; + if ((edge.x == v0Corner && edge.y == v1Corner) || + (edge.x == v1Corner && edge.y == v0Corner)) + { + silEdgeMask |= (1u << e); + } + } + } + validateEdgeVisibility(sil, vertexCount, silEdgeMask); + } +#endif + // Build clip mask for vertices below horizon (z < 0) + uint32_t clipMask = 0u; + NBL_UNROLL + for (int i = 0; i < 6; i++) + { + if (i >= vertexCount) break; + clipMask |= (getVertexZNeg(getSilhouetteVertex(sil, i)) ? 1u : 0u) << i; + } + + int clipCount = countbits(clipMask); + + // Total clipped vertices + int clippedVertCount = vertexCount + (clipMask != 0u ? (2 - clipCount) : 0); + + // Find rotation amount to place positive vertices first + int rotateAmount = 0; + if (clipMask != 0u) + { + uint32_t invertedMask = ~clipMask & ((1u << vertexCount) - 1u); + bool wrapAround = ((clipMask & 1u) != 0u) && ((clipMask >> (vertexCount - 1)) & 1u); + + rotateAmount = wrapAround ? 
+ ((firstbithigh(invertedMask) + 1) % vertexCount) : + firstbitlow(clipMask); + } + + // Rotate silhouette bits + uint32_t vertexBits = sil & 0x1FFFFFFF; + uint32_t rotatedVertexBits = rotr(vertexBits, rotateAmount * 3, vertexCount * 3); + uint32_t rotatedSil = (sil & 0xE0000000) | rotatedVertexBits; + + // Rotate the clip mask to match + uint32_t rotatedClipMask = rotr(clipMask, rotateAmount, vertexCount); + + // Draw clipped silhouette edges + for (int i = 0; i < clippedVertCount; i++) + { + int nextI = (i + 1) % clippedVertCount; + + int vIdx = i % vertexCount; + int v1Idx = nextI % vertexCount; + + // Extract clip bits directly + bool v0Clipped = (rotatedClipMask >> vIdx) & 1u; + bool v1Clipped = (rotatedClipMask >> v1Idx) & 1u; + + // Skip if both clipped + if (v0Clipped && v1Clipped) continue; + + int v0Corner = getSilhouetteVertex(rotatedSil, vIdx); + int v1Corner = getSilhouetteVertex(rotatedSil, v1Idx); + + float3 v0 = normalize(corners[v0Corner]); + float3 v1 = normalize(corners[v1Corner]); + + float3 points[2] = { corners[v0Corner], corners[v1Corner] }; + + // Clip using bit state + if (v0Clipped) + { + float t = v0.z / (v0.z - v1.z); + points[0] = normalize(lerp(corners[v0Corner], corners[v1Corner], t)); + } + else if (v1Clipped) + { + float t = v0.z / (v0.z - v1.z); + points[1] = normalize(lerp(corners[v0Corner], corners[v1Corner], t)); + } + + // Draw edge + float4 edgeContribution = drawGreatCircleArc(spherePos, points, 1, aaWidth); + color += float4(colorLUT[i] * edgeContribution.a, edgeContribution.a); + + } + + + setDebugData(sil, region, configIndex, clippedVertCount); + + color += drawHiddenEdges(spherePos, silEdgeMask, aaWidth); + color += drawCorners(spherePos, p, aaWidth); + color += drawRing(p, aaWidth); + + if (all(vx.uv >= float2(0.49f, 0.49f)) && all(vx.uv <= float2(0.51f, 0.51f))) + { + return float4(colorLUT[configIndex], 1.0f); + } + + return color; } \ No newline at end of file diff --git 
a/72_SolidAngleVisualizer/app_resources/hlsl/common.hlsl b/72_SolidAngleVisualizer/app_resources/hlsl/common.hlsl index 80368d08f..3c87a48bc 100644 --- a/72_SolidAngleVisualizer/app_resources/hlsl/common.hlsl +++ b/72_SolidAngleVisualizer/app_resources/hlsl/common.hlsl @@ -2,13 +2,52 @@ #define _SOLID_ANGLE_VIS_COMMON_HLSL_ #include "nbl/builtin/hlsl/cpp_compat.hlsl" +#define DEBUG_DATA 1 - -struct PushConstants +namespace nbl { - nbl::hlsl::float32_t3x4 modelMatrix; - nbl::hlsl::float32_t4 viewport; -}; + namespace hlsl + { + + struct ResultData + { + uint32_t3 region; + uint32_t silhouetteIndex; + + uint32_t silhouetteVertexCount; + uint32_t silhouette; + uint32_t clippedVertexCount; + uint32_t edgeVisibilityMismatch; + + uint32_t vertices[6]; + }; + + struct PushConstants + { + float32_t3x4 modelMatrix; + float32_t4 viewport; + }; + static const float32_t3 colorLUT[27] = { + float32_t3(0, 0, 0), float32_t3(1, 1, 1), float32_t3(0.5, 0.5, 0.5), + float32_t3(1, 0, 0), float32_t3(0, 1, 0), float32_t3(0, 0, 1), + float32_t3(1, 1, 0), float32_t3(1, 0, 1), float32_t3(0, 1, 1), + float32_t3(1, 0.5, 0), float32_t3(1, 0.65, 0), float32_t3(0.8, 0.4, 0), + float32_t3(1, 0.4, 0.7), float32_t3(1, 0.75, 0.8), float32_t3(0.7, 0.1, 0.3), + float32_t3(0.5, 0, 0.5), float32_t3(0.6, 0.4, 0.8), float32_t3(0.3, 0, 0.5), + float32_t3(0, 0.5, 0), float32_t3(0.5, 1, 0), float32_t3(0, 0.5, 0.25), + float32_t3(0, 0, 0.5), float32_t3(0.3, 0.7, 1), float32_t3(0, 0.4, 0.6), + float32_t3(0.6, 0.4, 0.2), float32_t3(0.8, 0.7, 0.3), float32_t3(0.4, 0.3, 0.1) + }; +#ifndef __HLSL_VERSION + static const char* colorNames[27] = {"Black", + "White", "Gray", "Red", "Green", "Blue", "Yellow", "Magenta", "Cyan", + "Orange", "Light Orange", "Dark Orange", "Pink", "Light Pink", "Deep Rose", "Purple", "Light Purple", + "Indigo", "Dark Green", "Lime", "Forest Green", "Navy", "Sky Blue", "Teal", "Brown", + "Tan/Beige", "Dark Brown" + }; +#endif // __HLSL_VERSION + } +} #endif // _SOLID_ANGLE_VIS_COMMON_HLSL_ 
diff --git a/72_SolidAngleVisualizer/app_resources/hlsl/utils.hlsl b/72_SolidAngleVisualizer/app_resources/hlsl/utils.hlsl new file mode 100644 index 000000000..4031e048f --- /dev/null +++ b/72_SolidAngleVisualizer/app_resources/hlsl/utils.hlsl @@ -0,0 +1,23 @@ +#ifndef _UTILS_HLSL_ +#define _UTILS_HLSL_ + +// TODO: implemented somewhere else? +// Bit rotation helpers +uint32_t rotl(uint32_t value, uint32_t bits, uint32_t width) +{ + bits = bits % width; + uint32_t mask = (1u << width) - 1u; + value &= mask; + return ((value << bits) | (value >> (width - bits))) & mask; +} + +uint32_t rotr(uint32_t value, uint32_t bits, uint32_t width) +{ + bits = bits % width; + uint32_t mask = (1u << width) - 1u; + value &= mask; + return ((value >> bits) | (value << (width - bits))) & mask; +} + + +#endif // _UTILS_HLSL_ diff --git a/72_SolidAngleVisualizer/include/transform.hpp b/72_SolidAngleVisualizer/include/transform.hpp index 105b2f757..538173223 100644 --- a/72_SolidAngleVisualizer/include/transform.hpp +++ b/72_SolidAngleVisualizer/include/transform.hpp @@ -1,27 +1,21 @@ #ifndef _NBL_THIS_EXAMPLE_TRANSFORM_H_INCLUDED_ #define _NBL_THIS_EXAMPLE_TRANSFORM_H_INCLUDED_ - #include "nbl/ui/ICursorControl.h" - #include "nbl/ext/ImGui/ImGui.h" - #include "imgui/imgui_internal.h" #include "imguizmo/ImGuizmo.h" - struct TransformRequestParams { - float camDistance = 8.f; uint8_t sceneTexDescIx = ~0; - bool useWindow = true, editTransformDecomposition = false, enableViewManipulate = false; + bool useWindow = true, editTransformDecomposition = false, enableViewManipulate = true; }; struct TransformReturnInfo { nbl::hlsl::uint16_t2 sceneResolution = { 1, 1 }; - bool isGizmoWindowHovered; - bool isGizmoBeingUsed; + bool allowCameraMovement = false; }; TransformReturnInfo EditTransform(float* cameraView, const float* cameraProjection, float* matrix, const TransformRequestParams& params) @@ -35,7 +29,7 @@ TransformReturnInfo EditTransform(float* cameraView, const float* cameraProjecti 
static bool boundSizing = false; static bool boundSizingSnap = false; - ImGui::Text("Press T/R/G to change gizmo mode"); + ImGui::Text("Use gizmo (T/R/G) or ViewManipulate widget to transform the cube"); if (params.editTransformDecomposition) { @@ -55,11 +49,13 @@ TransformReturnInfo EditTransform(float* cameraView, const float* cameraProjecti mCurrentGizmoOperation = ImGuizmo::SCALE; if (ImGui::RadioButton("Universal", mCurrentGizmoOperation == ImGuizmo::UNIVERSAL)) mCurrentGizmoOperation = ImGuizmo::UNIVERSAL; + + // For UI editing, decompose temporarily float matrixTranslation[3], matrixRotation[3], matrixScale[3]; ImGuizmo::DecomposeMatrixToComponents(matrix, matrixTranslation, matrixRotation, matrixScale); - ImGui::InputFloat3("Tr", matrixTranslation); - ImGui::InputFloat3("Rt", matrixRotation); - ImGui::InputFloat3("Sc", matrixScale); + ImGui::DragFloat3("Tr", matrixTranslation, 0.01f); + ImGui::DragFloat3("Rt", matrixRotation, 0.01f); + ImGui::DragFloat3("Sc", matrixScale, 0.01f); ImGuizmo::RecomposeMatrixFromComponents(matrixTranslation, matrixRotation, matrixScale, matrix); if (mCurrentGizmoOperation != ImGuizmo::SCALE) @@ -101,17 +97,18 @@ TransformReturnInfo EditTransform(float* cameraView, const float* cameraProjecti ImGuiIO& io = ImGui::GetIO(); float viewManipulateRight = io.DisplaySize.x; float viewManipulateTop = 0; + bool isWindowHovered = false; static ImGuiWindowFlags gizmoWindowFlags = 0; /* - for the "useWindow" case we just render to a gui area, + for the "useWindow" case we just render to a gui area, otherwise to fake full screen transparent window - note that for both cases we make sure gizmo being - rendered is aligned to our texture scene using - imgui "cursor" screen positions + note that for both cases we make sure gizmo being + rendered is aligned to our texture scene using + imgui "cursor" screen positions */ -// TODO: this shouldn't be handled here I think + // TODO: this shouldn't be handled here I think SImResourceInfo info; 
info.textureID = params.sceneTexDescIx; info.samplerIx = (uint16_t)nbl::ext::imgui::UI::DefaultSamplerIx::USER; @@ -128,17 +125,17 @@ TransformReturnInfo EditTransform(float* cameraView, const float* cameraProjecti ImVec2 contentRegionSize = ImGui::GetContentRegionAvail(); ImVec2 windowPos = ImGui::GetWindowPos(); ImVec2 cursorPos = ImGui::GetCursorScreenPos(); + isWindowHovered = ImGui::IsWindowHovered(); ImGui::Image(info, contentRegionSize); ImGuizmo::SetRect(cursorPos.x, cursorPos.y, contentRegionSize.x, contentRegionSize.y); - retval.sceneResolution = {contentRegionSize.x,contentRegionSize.y}; - retval.isGizmoWindowHovered = ImGui::IsWindowHovered(); + retval.sceneResolution = { contentRegionSize.x,contentRegionSize.y }; viewManipulateRight = cursorPos.x + contentRegionSize.x; viewManipulateTop = cursorPos.y; ImGuiWindow* window = ImGui::GetCurrentWindow(); - gizmoWindowFlags = (ImGui::IsWindowHovered() && ImGui::IsMouseHoveringRect(window->InnerRect.Min, window->InnerRect.Max) ? ImGuiWindowFlags_NoMove : 0); + gizmoWindowFlags = (isWindowHovered && ImGui::IsMouseHoveringRect(window->InnerRect.Min, window->InnerRect.Max) ? 
ImGuiWindowFlags_NoMove : 0); } else { @@ -149,21 +146,45 @@ TransformReturnInfo EditTransform(float* cameraView, const float* cameraProjecti ImVec2 contentRegionSize = ImGui::GetContentRegionAvail(); ImVec2 cursorPos = ImGui::GetCursorScreenPos(); + isWindowHovered = ImGui::IsWindowHovered(); ImGui::Image(info, contentRegionSize); ImGuizmo::SetRect(cursorPos.x, cursorPos.y, contentRegionSize.x, contentRegionSize.y); - retval.sceneResolution = {contentRegionSize.x,contentRegionSize.y}; - retval.isGizmoWindowHovered = ImGui::IsWindowHovered(); + retval.sceneResolution = { contentRegionSize.x,contentRegionSize.y }; viewManipulateRight = cursorPos.x + contentRegionSize.x; viewManipulateTop = cursorPos.y; } + // Standard Manipulate gizmo - let ImGuizmo modify the matrix directly ImGuizmo::Manipulate(cameraView, cameraProjection, mCurrentGizmoOperation, mCurrentGizmoMode, matrix, NULL, useSnap ? &snap[0] : NULL, boundSizing ? bounds : NULL, boundSizingSnap ? boundsSnap : NULL); - retval.isGizmoBeingUsed = ImGuizmo::IsOver() || (ImGuizmo::IsUsing() && ImGui::IsMouseDown(ImGuiMouseButton_Left)); - if(params.enableViewManipulate) - ImGuizmo::ViewManipulate(cameraView, params.camDistance, ImVec2(viewManipulateRight - 128, viewManipulateTop), ImVec2(128, 128), 0x10101010); + retval.allowCameraMovement = isWindowHovered && !ImGuizmo::IsUsing(); + + // ViewManipulate for rotating the view + if (params.enableViewManipulate) + { + // Store original translation and scale before ViewManipulate + // Decompose original matrix + nbl::hlsl::float32_t3 translation, rotation, scale; + ImGuizmo::DecomposeMatrixToComponents(matrix, &translation.x, &rotation.x, &scale.x); + + float temp[16]; + nbl::hlsl::float32_t3 baseTranslation(0.0f); + nbl::hlsl::float32_t3 baseScale(1.0f); + ImGuizmo::RecomposeMatrixFromComponents(&baseTranslation.x, &rotation.x, &baseScale.x, temp); + // Manipulate rotation only + ImGuizmo::ViewManipulate(temp, 1.0f, ImVec2(viewManipulateRight - 128, 
viewManipulateTop), ImVec2(128, 128), 0x10101010); + + // Extract rotation from manipulated temp + nbl::hlsl::float32_t3 newRot; + ImGuizmo::DecomposeMatrixToComponents(temp, &baseTranslation.x, &newRot.x, &baseScale.x); + + // Recompose original matrix with new rotation but keep translation & scale + ImGuizmo::RecomposeMatrixFromComponents(&translation.x, &newRot.x, &scale.x, matrix); + + retval.allowCameraMovement &= isWindowHovered && !ImGuizmo::IsUsingViewManipulate(); + } ImGui::End(); ImGui::PopStyleColor(); @@ -171,4 +192,4 @@ TransformReturnInfo EditTransform(float* cameraView, const float* cameraProjecti return retval; } -#endif // __NBL_THIS_EXAMPLE_TRANSFORM_H_INCLUDED__ \ No newline at end of file +#endif // _NBL_THIS_EXAMPLE_TRANSFORM_H_INCLUDED_ \ No newline at end of file diff --git a/72_SolidAngleVisualizer/main.cpp b/72_SolidAngleVisualizer/main.cpp index e9266520d..1c52547af 100644 --- a/72_SolidAngleVisualizer/main.cpp +++ b/72_SolidAngleVisualizer/main.cpp @@ -211,7 +211,6 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR return shader; }; - auto scRes = static_cast(m_surface->getSwapchainResources()); ext::FullScreenTriangle::ProtoPipeline fsTriProtoPPln(m_assetMgr.get(), m_device.get(), m_logger.get()); if (!fsTriProtoPPln) return logFail("Failed to create Full Screen Triangle protopipeline or load its vertex shader!"); @@ -232,17 +231,73 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR .size = sizeof(PushConstants) } }; - auto visualizationLayout = m_device->createPipelineLayout( - ranges, - nullptr, - nullptr, - nullptr, - nullptr + nbl::video::IGPUDescriptorSetLayout::SBinding bindings[1] = { + { + .binding = 0, + .type = nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, + .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = ShaderStage::ESS_FRAGMENT, + .count = 1 + } + }; + smart_refctd_ptr dsLayout = 
m_device->createDescriptorSetLayout(bindings); + if (!dsLayout) + logFail("Failed to create a Descriptor Layout!\n"); + + + auto visualizationLayout = m_device->createPipelineLayout(ranges +#if DEBUG_DATA + , dsLayout +#endif ); m_visualizationPipeline = fsTriProtoPPln.createPipeline(fragSpec, visualizationLayout.get(), m_solidAngleRenderpass.get()); if (!m_visualizationPipeline) return logFail("Could not create Graphics Pipeline!"); + // Allocate the memory +#if DEBUG_DATA + { + constexpr size_t BufferSize = sizeof(ResultData); + + nbl::video::IGPUBuffer::SCreationParams params = {}; + params.size = BufferSize; + params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT; + m_outputStorageBuffer = m_device->createBuffer(std::move(params)); + if (!m_outputStorageBuffer) + logFail("Failed to create a GPU Buffer of size %d!\n", params.size); + + m_outputStorageBuffer->setObjectDebugName("ResultData output buffer"); + + nbl::video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = m_outputStorageBuffer->getMemoryReqs(); + reqs.memoryTypeBits &= m_physicalDevice->getHostVisibleMemoryTypeBits(); + + m_allocation = m_device->allocate(reqs, m_outputStorageBuffer.get(), nbl::video::IDeviceMemoryAllocation::EMAF_NONE); + if (!m_allocation.isValid()) + logFail("Failed to allocate Device Memory compatible with our GPU Buffer!\n"); + + assert(m_outputStorageBuffer->getBoundMemory().memory == m_allocation.memory.get()); + smart_refctd_ptr pool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE, { &dsLayout.get(),1 }); + + m_ds = pool->createDescriptorSet(std::move(dsLayout)); + { + IGPUDescriptorSet::SDescriptorInfo info[1]; + info[0].desc = smart_refctd_ptr(m_outputStorageBuffer); + info[0].info.buffer = { .offset = 0,.size = BufferSize }; + IGPUDescriptorSet::SWriteDescriptorSet writes[1] = { + {.dstSet = m_ds.get(),.binding = 0,.arrayElement = 0,.count = 1,.info = info} + }; + m_device->updateDescriptorSets(writes, {}); + } + 
} + + if (!m_allocation.memory->map({ 0ull,m_allocation.memory->getAllocationSize() }, IDeviceMemoryAllocation::EMCAF_READ)) + logFail("Failed to map the Device Memory!\n"); + + // if the mapping is not coherent the range needs to be invalidated to pull in new data for the CPU's caches + const ILogicalDevice::MappedMemoryRange memoryRange(m_allocation.memory.get(), 0ull, m_allocation.memory->getAllocationSize()); + if (!m_allocation.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + m_device->invalidateMappedMemoryRanges(1, &memoryRange); +#endif } // Create ImGUI @@ -336,6 +391,15 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR const IGPUCommandBuffer::SClearColorValue clearValue = { .float32 = {0.f,0.f,0.f,1.f} }; if (m_solidAngleViewFramebuffer) { +#if DEBUG_DATA + asset::SBufferRange range + { + .offset = 0, + .size = m_outputStorageBuffer->getSize(), + .buffer = m_outputStorageBuffer + }; + cb->fillBuffer(range, 0u); +#endif auto creationParams = m_solidAngleViewFramebuffer->getCreationParameters(); cb->beginDebugMarker("Draw Circle View Frame"); { @@ -361,11 +425,17 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR auto pipeline = m_visualizationPipeline; cb->bindGraphicsPipeline(pipeline.get()); cb->pushConstants(pipeline->getLayout(), hlsl::ShaderStage::ESS_FRAGMENT, 0, sizeof(PushConstants), &pc); - //cb->bindDescriptorSets(nbl::asset::EPBP_GRAPHICS, pipeline->getLayout(), 3, 1, &ds); + cb->bindDescriptorSets(nbl::asset::EPBP_GRAPHICS, pipeline->getLayout(), 0, 1, &m_ds.get()); ext::FullScreenTriangle::recordDrawCall(cb); } cb->endRenderPass(); cb->endDebugMarker(); + +#if DEBUG_DATA + m_device->waitIdle(); + std::memcpy(&m_GPUOutResulData, static_cast(m_allocation.memory->getMappedPointer()), sizeof(ResultData)); + m_device->waitIdle(); +#endif } // draw main view if (m_mainViewFramebuffer) @@ -557,6 +627,8 @@ class SolidAngleVisualizer final : 
public MonoWindowApplication, public BuiltinR { if (interface.move) camera.mouseProcess(events); // don't capture the events, only let camera handle them with its impl + else + camera.mouseKeysUp(); for (const auto& e : events) // here capture { @@ -713,6 +785,13 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR cb->setViewport(0u, 1u, &viewport); } +#if DEBUG_DATA + ~SolidAngleVisualizer() override + { + m_allocation.memory->unmap(); + } +#endif + // Maximum frames which can be simultaneously submitted, used to cycle through our per-frame resources like command buffers constexpr static inline uint32_t MaxFramesInFlight = 3u; constexpr static inline auto sceneRenderDepthFormat = EF_D32_SFLOAT; @@ -721,13 +800,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR // we create the Descriptor Set with a few slots extra to spare, so we don't have to `waitIdle` the device whenever ImGUI virtual window resizes constexpr static inline auto MaxImGUITextures = 2u + MaxFramesInFlight; - constexpr static inline float32_t4x4 OBBModelMatrixDefault - { - 1.0f, 0.0f, 0.0f, 0.0f, - 0.0f, 1.0f, 0.0f, 0.0f, - 0.0f, 0.0f, 1.0f, 0.0f, - 0.0f, 0.0f, 3.0f, 1.0f - }; + static inline ResultData m_GPUOutResulData; // smart_refctd_ptr m_scene; smart_refctd_ptr m_solidAngleRenderpass; @@ -737,6 +810,9 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR smart_refctd_ptr m_mainViewFramebuffer; smart_refctd_ptr m_visualizationPipeline; // + nbl::video::IDeviceMemoryAllocator::SAllocation m_allocation = {}; + smart_refctd_ptr m_outputStorageBuffer; + smart_refctd_ptr m_ds = nullptr; smart_refctd_ptr m_semaphore; uint64_t m_realFrameIx = 0; std::array, MaxFramesInFlight> m_cmdBufs; @@ -794,7 +870,6 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR // transformParams.useWindow = true; ImGui::Text("Camera"); - bool viewDirty = false; if (ImGui::RadioButton("LH", isLH)) 
isLH = true; @@ -827,13 +902,11 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR ImGui::SliderFloat("zNear", &zNear, 0.1f, 100.f); ImGui::SliderFloat("zFar", &zFar, 110.f, 10000.f); - viewDirty |= ImGui::SliderFloat("Distance", &transformParams.camDistance, 1.f, 69.f); - if (viewDirty || firstFrame) + if (firstFrame) { camera.setPosition(cameraIntialPosition); camera.setTarget(cameraInitialTarget); - camera.setBackupUpVector(cameraInitialUp); camera.setUpVector(cameraInitialUp); camera.recomputeViewMatrix(); @@ -909,45 +982,35 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR if (ImGui::IsKeyPressed(ImGuiKey_End)) { - m_OBBModelMatrix = OBBModelMatrixDefault; + m_TRS = TRS{}; } - static struct { - float32_t4x4 view, projection, model; - } imguizmoM16InOut; + static struct + { + float32_t4x4 view, projection, model; + } imguizmoM16InOut; - ImGuizmo::SetID(0u); + ImGuizmo::SetID(0u); - // TODO: camera will return hlsl::float32_tMxN - auto view = *reinterpret_cast(camera.getViewMatrix().pointer()); - imguizmoM16InOut.view = hlsl::transpose(getMatrix3x4As4x4(view)); + // TODO: camera will return hlsl::float32_tMxN + auto view = *reinterpret_cast(camera.getViewMatrix().pointer()); + imguizmoM16InOut.view = hlsl::transpose(getMatrix3x4As4x4(view)); - // TODO: camera will return hlsl::float32_tMxN - imguizmoM16InOut.projection = hlsl::transpose(*reinterpret_cast(camera.getProjectionMatrix().pointer())); - imguizmoM16InOut.model = m_OBBModelMatrix; + // TODO: camera will return hlsl::float32_tMxN + imguizmoM16InOut.projection = hlsl::transpose(*reinterpret_cast(camera.getProjectionMatrix().pointer())); + ImGuizmo::RecomposeMatrixFromComponents(&m_TRS.translation.x, &m_TRS.rotation.x, &m_TRS.scale.x, &imguizmoM16InOut.model[0][0]); - { if (flipGizmoY) // note we allow to flip gizmo just to match our coordinates imguizmoM16InOut.projection[1][1] *= -1.f; // 
https://johannesugb.github.io/gpu-programming/why-do-opengl-proj-matrices-fail-in-vulkan/ transformParams.editTransformDecomposition = true; mainViewTransformReturnInfo = EditTransform(&imguizmoM16InOut.view[0][0], &imguizmoM16InOut.projection[0][0], &imguizmoM16InOut.model[0][0], transformParams); + move = mainViewTransformReturnInfo.allowCameraMovement; - // TODO: camera stops when cursor hovers gizmo, but we also want to stop when gizmo is being used - move = (ImGui::IsMouseDown(ImGuiMouseButton_Left) || mainViewTransformReturnInfo.isGizmoWindowHovered) && (!mainViewTransformReturnInfo.isGizmoBeingUsed); - + ImGuizmo::DecomposeMatrixToComponents(&imguizmoM16InOut.model[0][0], &m_TRS.translation.x, &m_TRS.rotation.x, &m_TRS.scale.x); + ImGuizmo::RecomposeMatrixFromComponents(&m_TRS.translation.x, &m_TRS.rotation.x, &m_TRS.scale.x, &imguizmoM16InOut.model[0][0]); } - - // to Nabla + update camera & model matrices - // TODO: make it more nicely, extract: - // - Position by computing inverse of the view matrix and grabbing its translation - // - Target from 3rd row without W component of view matrix multiplied by some arbitrary distance value (can be the length of position from origin) and adding the position - // But then set the view matrix this way anyway, because up-vector may not be compatible - //const auto& view = camera.getViewMatrix(); - //const_cast(view) = core::transpose(imguizmoM16InOut.view).extractSub3x4(); // a hack, correct way would be to use inverse matrix and get position + target because now it will bring you back to last position & target when switching from gizmo move to manual move (but from manual to gizmo is ok) - m_OBBModelMatrix = imguizmoM16InOut.model; - // object meta display //{ // ImGui::Begin("Object"); @@ -964,12 +1027,193 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR ImVec2 contentRegionSize = ImGui::GetContentRegionAvail(); solidAngleViewTransformReturnInfo.sceneResolution = 
uint16_t2(static_cast(contentRegionSize.x), static_cast(contentRegionSize.y)); - solidAngleViewTransformReturnInfo.isGizmoBeingUsed = false; // not used in this view - solidAngleViewTransformReturnInfo.isGizmoWindowHovered = false; // not used in this view + solidAngleViewTransformReturnInfo.allowCameraMovement = false; // not used in this view ImGui::Image({ renderColorViewDescIndices[ERV_SOLID_ANGLE_VIEW] }, contentRegionSize); ImGui::End(); } + // Show data coming from GPU +#if DEBUG_DATA + { + if (ImGui::Begin("Result Data")) + { + auto drawColorField = [&](const char* fieldName, uint32_t index) + { + ImGui::Text("%s: %u", fieldName, index); + + if (index >= 27) + { + ImGui::SameLine(); + ImGui::Text(""); + return; + } + + const auto& c = colorLUT[index]; // uses the combined LUT we made earlier + + ImGui::SameLine(); + + // Color preview button + ImGui::ColorButton( + fieldName, + ImVec4(c.r, c.g, c.b, 1.0f), + 0, + ImVec2(20, 20) + ); + + ImGui::SameLine(); + ImGui::Text("%s", colorNames[index]); + }; + + // Vertices + if (ImGui::CollapsingHeader("Vertices", ImGuiTreeNodeFlags_DefaultOpen)) + { + for (uint32_t i = 0; i < 6; ++i) + { + if (i < m_GPUOutResulData.silhouetteVertexCount) + { + ImGui::Text("corners[%u]", i); + ImGui::SameLine(); + drawColorField(":", m_GPUOutResulData.vertices[i]); + ImGui::SameLine(); + static const float32_t3 constCorners[8] = { + float32_t3(-1, -1, -1), float32_t3(1, -1, -1), float32_t3(-1, 1, -1), float32_t3(1, 1, -1), + float32_t3(-1, -1, 1), float32_t3(1, -1, 1), float32_t3(-1, 1, 1), float32_t3(1, 1, 1) + }; + float32_t3 vertexLocation = constCorners[m_GPUOutResulData.vertices[i]]; + ImGui::Text(" : (%.3f, %.3f, %.3f", vertexLocation.x, vertexLocation.y, vertexLocation.z); + } + else + { + ImGui::Text("corners[%u] :: ", i); + ImGui::SameLine(); + ImGui::ColorButton( + "", + ImVec4(0.0f, 0.0f, 0.0f, 0.0f), + 0, + ImVec2(20, 20) + ); + ImGui::SameLine(); + ImGui::Text(""); + + } + + } + } + + if (ImGui::CollapsingHeader("Color 
LUT Map")) + { + for (int i = 0; i < 27; i++) + drawColorField(" ", i); + } + + ImGui::Separator(); + + // Silhouette info + drawColorField("silhouetteIndex", m_GPUOutResulData.silhouetteIndex); + + ImGui::Text("silhouette Vertex Count: %u", m_GPUOutResulData.silhouetteVertexCount); + ImGui::Text("silhouette Clipped VertexCount: %u", m_GPUOutResulData.clippedVertexCount); + ImGui::Text("Silhouette Mismatch: %s", m_GPUOutResulData.edgeVisibilityMismatch ? "true" : "false"); + + { + float32_t3 xAxis = m_OBBModelMatrix[0].xyz; + float32_t3 yAxis = m_OBBModelMatrix[1].xyz; + float32_t3 zAxis = m_OBBModelMatrix[2].xyz; + + float32_t3 nx = normalize(xAxis); + float32_t3 ny = normalize(yAxis); + float32_t3 nz = normalize(zAxis); + + const float epsilon = 1e-4; + bool hasSkew = false; + if (abs(dot(nx, ny)) > epsilon || abs(dot(nx, nz)) > epsilon || abs(dot(ny, nz)) > epsilon) + hasSkew = true; + ImGui::Text("Matrix Has Skew: %s", hasSkew ? "true" : "false"); + } + + static bool modalShown = false; + static uint32_t lastSilhouetteIndex = ~0u; + + // Reset modal flag if silhouette configuration changed + if (m_GPUOutResulData.silhouetteIndex != lastSilhouetteIndex) + { + modalShown = false; + lastSilhouetteIndex = m_GPUOutResulData.silhouetteIndex; + } + + if (!m_GPUOutResulData.edgeVisibilityMismatch) + { + // Reset flag when mismatch is cleared + modalShown = false; + } + if (m_GPUOutResulData.edgeVisibilityMismatch && m_GPUOutResulData.silhouetteIndex != 13 && !modalShown) // 13 means we're inside the cube, so don't care + { + // Open modal popup only once per configuration + ImGui::OpenPopup("Edge Visibility Mismatch Warning"); + modalShown = true; + } + + // Modal popup + if (ImGui::BeginPopupModal("Edge Visibility Mismatch Warning", NULL, ImGuiWindowFlags_AlwaysAutoResize)) + { + ImGui::TextColored(ImVec4(1.0f, 0.5f, 0.0f, 1.0f), "Warning: Edge Visibility Mismatch Detected!"); + ImGui::Separator(); + + ImGui::Text("The silhouette lookup table (LUT) does not match the 
computed edge visibility."); + ImGui::Text("This indicates the pre-computed silhouette data may be incorrect."); + ImGui::Spacing(); + + // Show configuration info + ImGui::TextWrapped("Configuration Index: %u", m_GPUOutResulData.silhouetteIndex); + ImGui::TextWrapped("Region: (%d, %d, %d)", + m_GPUOutResulData.region.x, + m_GPUOutResulData.region.y, + m_GPUOutResulData.region.z); + ImGui::Spacing(); + + ImGui::Text("Mismatched Vertices (bitmask): 0x%08X", m_GPUOutResulData.edgeVisibilityMismatch); + + // Show which specific vertices are mismatched + ImGui::Text("Vertices involved in mismatched edges:"); + ImGui::Indent(); + for (int i = 0; i < 8; i++) + { + if (m_GPUOutResulData.edgeVisibilityMismatch & (1u << i)) + { + ImGui::BulletText("Vertex %d", i); + } + } + ImGui::Unindent(); + ImGui::Spacing(); + + if (ImGui::Button("OK", ImVec2(120, 0))) + { + ImGui::CloseCurrentPopup(); + } + + ImGui::EndPopup(); + } + + ImGui::Separator(); + + // Region (uint32_t3) + ImGui::Text("region: (%u, %u, %u)", + m_GPUOutResulData.region.x, m_GPUOutResulData.region.y, m_GPUOutResulData.region.z); + + ImGui::Separator(); + + // Silhouette mask printed in binary + char buf[33]; + for (int i = 0; i < 32; i++) + buf[i] = (m_GPUOutResulData.silhouette & (1u << (31 - i))) ? 
'1' : '0'; + buf[32] = '\0'; + + ImGui::Text("silhouette: 0x%08X", m_GPUOutResulData.silhouette); + ImGui::Text("binary: %s", buf); + } + ImGui::End(); + } +#endif // view matrices editor { ImGui::Begin("Matrices"); @@ -995,6 +1239,32 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR ImGui::Separator(); }; + static RandomSampler rng(69); // Initialize RNG with seed + if (ImGui::Button("Randomize Translation")) + { + m_TRS.translation = float32_t3(rng.nextFloat(-3.f, 3.f), rng.nextFloat(-3.f, 3.f), rng.nextFloat(-1.f, 3.f)); + } + ImGui::SameLine(); + + if (ImGui::Button("Randomize Rotation")) + { + m_TRS.rotation = float32_t3(rng.nextFloat(-180.f, 180.f), rng.nextFloat(-180.f, 180.f), rng.nextFloat(-180.f, 180.f)); + } + ImGui::SameLine(); + + if (ImGui::Button("Randomize Scale")) + { + m_TRS.scale = float32_t3(rng.nextFloat(0.5f, 2.0f), rng.nextFloat(0.5f, 2.0f), rng.nextFloat(0.5f, 2.0f)); + } + + ImGui::SameLine(); + if (ImGui::Button("Randomize All")) + { + m_TRS.translation = float32_t3(rng.nextFloat(-3.f, 3.f), rng.nextFloat(-3.f, 3.f), rng.nextFloat(-1.f, 3.f)); + m_TRS.rotation = float32_t3(rng.nextFloat(-180.f, 180.f), rng.nextFloat(-180.f, 180.f), rng.nextFloat(-180.f, 180.f)); + m_TRS.scale = float32_t3(rng.nextFloat(0.5f, 2.0f), rng.nextFloat(0.5f, 2.0f), rng.nextFloat(0.5f, 2.0f)); + } + addMatrixTable("Model Matrix", "ModelMatrixTable", 4, 4, &m_OBBModelMatrix[0][0]); addMatrixTable("Camera View Matrix", "ViewMatrixTable", 3, 4, camera.getViewMatrix().pointer()); addMatrixTable("Camera View Projection Matrix", "ViewProjectionMatrixTable", 4, 4, camera.getProjectionMatrix().pointer(), false); @@ -1071,6 +1341,8 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR ImGui::End(); } ImGui::End(); + + ImGuizmo::RecomposeMatrixFromComponents(&m_TRS.translation.x, &m_TRS.rotation.x, &m_TRS.scale.x, &m_OBBModelMatrix[0][0]); } smart_refctd_ptr imGUI; @@ -1085,15 +1357,22 @@ class 
SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR }; SubAllocatedDescriptorSet::value_type renderColorViewDescIndices[E_RENDER_VIEWS::Count] = { SubAllocatedDescriptorSet::invalid_value, SubAllocatedDescriptorSet::invalid_value }; // - Camera camera = Camera(core::vectorSIMDf(0, 0, 0), core::vectorSIMDf(0, 0, 0), core::matrix4SIMD()); + Camera camera = Camera(cameraIntialPosition, cameraInitialTarget, core::matrix4SIMD(), 1, 1, nbl::core::vectorSIMDf(0.0f, 0.0f, 1.0f)); // mutables - float32_t4x4 m_OBBModelMatrix = OBBModelMatrixDefault; + struct TRS // Source of truth + { + float32_t3 translation{ 0.0f, 0.0f, 3.0f }; + float32_t3 rotation{ 0.0f }; // MUST stay orthonormal + float32_t3 scale{ 1.0f }; + } m_TRS; + float32_t4x4 m_OBBModelMatrix; // always overwritten from TRS //std::string_view objectName; TransformRequestParams transformParams; TransformReturnInfo mainViewTransformReturnInfo; TransformReturnInfo solidAngleViewTransformReturnInfo; + const static inline core::vectorSIMDf cameraIntialPosition{ -3.0f, 6.0f, 3.0f }; const static inline core::vectorSIMDf cameraInitialTarget{ 0.f, 0.0f, 3.f }; const static inline core::vectorSIMDf cameraInitialUp{ 0.f, 0.f, 1.f }; diff --git a/common/include/nbl/examples/cameras/CCamera.hpp b/common/include/nbl/examples/cameras/CCamera.hpp index e5f077e46..c61f93333 100644 --- a/common/include/nbl/examples/cameras/CCamera.hpp +++ b/common/include/nbl/examples/cameras/CCamera.hpp @@ -302,6 +302,11 @@ class Camera lastVirtualUpTimeStamp = nextPresentationTimeStamp; } + // TODO: temporary but a good fix for the camera events when mouse stops dragging gizmo + void mouseKeysUp() + { + mouseDown = false; + } private: inline void initDefaultKeysMap() { mapKeysToWASD(); } From 2e306fc96bfae85a9669ad552751cece33d1b383 Mon Sep 17 00:00:00 2001 From: Karim Mohamed Date: Thu, 18 Dec 2025 01:10:56 +0300 Subject: [PATCH 11/26] better (still not perfect) manual inverse of rotation matrix --- 
.../hlsl/SolidAngleVis.frag.hlsl | 22 ++++++------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl b/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl index cd291dbd2..bf58e3231 100644 --- a/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl +++ b/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl @@ -228,21 +228,13 @@ void computeCubeGeo() float3x3 upper3x3 = (float3x3)pc.modelMatrix; #if 1 - // Compute reciprocal scales - float3 rcpScales = rsqrt(float3( - dot(upper3x3[0], upper3x3[0]), - dot(upper3x3[1], upper3x3[1]), - dot(upper3x3[2], upper3x3[2]) - )); - - // Build inverse-rotation-only matrix - float3x3 invRot; - invRot[0] = upper3x3[0] * rcpScales.x; - invRot[1] = upper3x3[1] * rcpScales.y; - invRot[2] = upper3x3[2] * rcpScales.z; - - // Project center into OBB local space - float3 normalizedProj = mul(invRot, obbCenter); +float3 rcpScales = rsqrt(float3( + dot(upper3x3[0], upper3x3[0]), + dot(upper3x3[1], upper3x3[1]), + dot(upper3x3[2], upper3x3[2]) +)); + +float3 normalizedProj = mul(transpose(upper3x3), obbCenter) * rcpScales; #else float3 normalizedProj = mul(inverse(upper3x3), obbCenter); #endif From 12486d4670f0453722351814996d91f198a16749 Mon Sep 17 00:00:00 2001 From: Karim Mohamed Date: Thu, 18 Dec 2025 02:24:41 +0300 Subject: [PATCH 12/26] Fixed faster inverse of rotation matrix, thanks Matt! 
--- .../hlsl/SolidAngleVis.frag.hlsl | 23 +++++++++---------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl b/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl index bf58e3231..01d166aac 100644 --- a/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl +++ b/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl @@ -223,21 +223,20 @@ void computeCubeGeo() computeCubeGeo(); - float3 obbCenter = mul(pc.modelMatrix, float4(0, 0, 0, 1)).xyz; + float4x3 columnModel = transpose(pc.modelMatrix); - float3x3 upper3x3 = (float3x3)pc.modelMatrix; + float3 obbCenter = columnModel[3].xyz; -#if 1 -float3 rcpScales = rsqrt(float3( - dot(upper3x3[0], upper3x3[0]), - dot(upper3x3[1], upper3x3[1]), - dot(upper3x3[2], upper3x3[2]) -)); + float3x3 upper3x3 = (float3x3)columnModel; + + float3 rcpScales = rcp(float3( + dot(upper3x3[0], upper3x3[0]), + dot(upper3x3[1], upper3x3[1]), + dot(upper3x3[2], upper3x3[2]) + )); + + float3 normalizedProj = mul(upper3x3, obbCenter) * rcpScales; -float3 normalizedProj = mul(transpose(upper3x3), obbCenter) * rcpScales; -#else - float3 normalizedProj = mul(inverse(upper3x3), obbCenter); -#endif int3 region = int3( normalizedProj.x < -1.0f ? 0 : (normalizedProj.x > 1.0f ? 2 : 1), normalizedProj.y < -1.0f ? 0 : (normalizedProj.y > 1.0f ? 
2 : 1), From 1961a898fd0a91c8e4d5c1a3fcb02df9142e8388 Mon Sep 17 00:00:00 2001 From: Karim Mohamed Date: Sat, 20 Dec 2025 10:18:48 +0300 Subject: [PATCH 13/26] Fast clipping, less branches, also - More debug data going to imgui - Little bit of shader code refactoring - "Revert to last" button to go back to last random transformation of the OBB - Added getVertexZNeg() and getVertex() preprocessor branches for faster versions --- .../app_resources/hlsl/Drawing.hlsl | 122 ++-- .../hlsl/SolidAngleVis.frag.hlsl | 639 ++++++++++-------- .../app_resources/hlsl/common.hlsl | 42 +- 72_SolidAngleVisualizer/main.cpp | 90 ++- 4 files changed, 532 insertions(+), 361 deletions(-) diff --git a/72_SolidAngleVisualizer/app_resources/hlsl/Drawing.hlsl b/72_SolidAngleVisualizer/app_resources/hlsl/Drawing.hlsl index c3cb5befa..f3f1b4e96 100644 --- a/72_SolidAngleVisualizer/app_resources/hlsl/Drawing.hlsl +++ b/72_SolidAngleVisualizer/app_resources/hlsl/Drawing.hlsl @@ -16,79 +16,124 @@ float2 sphereToCircle(float3 spherePoint) } } -float4 drawGreatCircleArc(float3 fragPos, float3 points[2], int visibility, float aaWidth) +float drawGreatCircleArc(float3 fragPos, float3 points[2], float aaWidth, float width = 0.01f) { - if (visibility == 0) return float4(0,0,0,0); - float3 v0 = normalize(points[0]); float3 v1 = normalize(points[1]); float3 p = normalize(fragPos); - + float3 arcNormal = normalize(cross(v0, v1)); float dist = abs(dot(p, arcNormal)); - + float dotMid = dot(v0, v1); bool onArc = (dot(p, v0) >= dotMid) && (dot(p, v1) >= dotMid); - - if (!onArc) return float4(0,0,0,0); - + + if (!onArc) + return 0.0f; + float avgDepth = (length(points[0]) + length(points[1])) * 0.5f; float depthScale = 3.0f / avgDepth; - - float baseWidth = (visibility == 1) ? 0.01f : 0.005f; - float width = min(baseWidth * depthScale, 0.02f); - + + width = min(width * depthScale, 0.02f); float alpha = 1.0f - smoothstep(width - aaWidth, width + aaWidth, dist); - - float4 edgeColor = (visibility == 1) ? 
- float4(0.0f, 0.5f, 1.0f, 1.0f) : - float4(1.0f, 0.0f, 0.0f, 1.0f); - - float intensity = (visibility == 1) ? 1.0f : 0.5f; - return edgeColor * alpha * intensity; + + return alpha; } float4 drawHiddenEdges(float3 spherePos, uint32_t silEdgeMask, float aaWidth) { - float4 color = float4(0,0,0,0); + float4 color = 0; float3 hiddenEdgeColor = float3(0.1, 0.1, 0.1); - + + NBL_UNROLL for (int i = 0; i < 12; i++) { - if ((silEdgeMask & (1u << i)) == 0) + // skip silhouette edges + if (silEdgeMask & (1u << i)) + continue; + + int2 edge = allEdges[i]; + + float3 v0 = normalize(getVertex(edge.x)); + float3 v1 = normalize(getVertex(edge.y)); + + bool neg0 = v0.z < 0.0f; + bool neg1 = v1.z < 0.0f; + + // fully hidden + if (neg0 && neg1) + continue; + + float3 p0 = v0; + float3 p1 = v1; + + // clip if needed + if (neg0 ^ neg1) { - int2 edge = allEdges[i]; - float3 edgePoints[2] = { corners[edge.x], corners[edge.y] }; - float4 edgeContribution = drawGreatCircleArc(spherePos, edgePoints, 1, aaWidth); - color += float4(hiddenEdgeColor * edgeContribution.a, edgeContribution.a); + float t = v0.z / (v0.z - v1.z); + float3 clip = normalize(lerp(v0, v1, t)); + + p0 = neg0 ? clip : v0; + p1 = neg1 ? 
clip : v1; } + + float3 pts[2] = {p0, p1}; + float4 c = drawGreatCircleArc(spherePos, pts, aaWidth, 0.005f); + color += float4(hiddenEdgeColor * c.a, c.a); } + return color; } float4 drawCorners(float3 spherePos, float2 p, float aaWidth) { - float4 color = float4(0,0,0,0); + float4 color = 0; + + float dotSize = 0.02f; + float innerDotSize = dotSize * 0.5f; + for (int i = 0; i < 8; i++) { - float3 corner3D = normalize(corners[i]); + float3 corner3D = normalize(getVertex(i)); float2 cornerPos = sphereToCircle(corner3D); + float dist = length(p - cornerPos); - float dotSize = 0.02f; - float dotAlpha = 1.0f - smoothstep(dotSize - aaWidth, dotSize + aaWidth, dist); - if (dotAlpha > 0.0f) + + // outer dot + float outerAlpha = 1.0f - smoothstep(dotSize - aaWidth, + dotSize + aaWidth, + dist); + + if (outerAlpha <= 0.0f) + continue; + + float3 dotColor = colorLUT[i]; + color += float4(dotColor * outerAlpha, outerAlpha); + + // ------------------------------------------------- + // inner black dot for hidden corners + // ------------------------------------------------- + if (corner3D.z < 0.0f) { - float3 dotColor = colorLUT[i]; - color += float4(dotColor * dotAlpha, dotAlpha); + float innerAlpha = 1.0f - smoothstep(innerDotSize - aaWidth, + innerDotSize + aaWidth, + dist); + + // ensure it stays inside the outer dot + innerAlpha *= outerAlpha; + + float3 innerColor = float3(0.0, 0.0, 0.0); + color -= float4(innerAlpha.xxx, 0.0f); } } + return color; } float4 drawRing(float2 p, float aaWidth) { float positionLength = length(p); - float ringWidth = 0.002f; + float ringWidth = 0.003f; float ringDistance = abs(positionLength - CIRCLE_RADIUS); float ringAlpha = 1.0f - smoothstep(ringWidth - aaWidth, ringWidth + aaWidth, ringDistance); return ringAlpha * float4(1, 1, 1, 1); @@ -114,10 +159,12 @@ int getEdgeVisibility(int edgeIdx) bool visible2 = isFaceVisible(faceCenters[faces.y], n_world_f2); // Silhouette: exactly one face visible - if (visible1 != visible2) return 1; + if 
(visible1 != visible2) + return 1; // Inner edge: both faces visible - if (visible1 && visible2) return 2; + if (visible1 && visible2) + return 2; // Hidden edge: both faces hidden return 0; @@ -162,11 +209,10 @@ void validateEdgeVisibility(uint32_t sil, int vertexCount, uint32_t generatedSil } } } - + // Simple Write (assuming all fragments calculate the same result) InterlockedOr(DebugDataBuffer[0].edgeVisibilityMismatch, mismatchAccumulator); } #endif - #endif // _DEBUG_HLSL_ diff --git a/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl b/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl index 01d166aac..d7ceed943 100644 --- a/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl +++ b/72_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl @@ -15,351 +15,438 @@ static const float CIRCLE_RADIUS = 0.5f; // --- Geometry Utils --- static const float3 constCorners[8] = { - float3(-1, -1, -1), float3(1, -1, -1), float3(-1, 1, -1), float3(1, 1, -1), - float3(-1, -1, 1), float3(1, -1, 1), float3(-1, 1, 1), float3(1, 1, 1) -}; + float3(-1, -1, -1), float3(1, -1, -1), float3(-1, 1, -1), float3(1, 1, -1), + float3(-1, -1, 1), float3(1, -1, 1), float3(-1, 1, 1), float3(1, 1, 1)}; static const int2 allEdges[12] = { - {0, 1}, {2, 3}, {4, 5}, {6, 7}, // X axis - {0, 2}, {1, 3}, {4, 6}, {5, 7}, // Y axis - {0, 4}, {1, 5}, {2, 6}, {3, 7} // Z axis + {0, 1}, {2, 3}, {4, 5}, {6, 7}, // X axis + {0, 2}, + {1, 3}, + {4, 6}, + {5, 7}, // Y axis + {0, 4}, + {1, 5}, + {2, 6}, + {3, 7} // Z axis }; // Adjacency of edges to faces // Corrected Adjacency of edges to faces static const int2 edgeToFaces[12] = { - // Edge Index: | allEdges[i] | Shared Faces: - - /* 0 (0-1) */ {4, 0}, // Y- (4) and Z- (0) - /* 1 (2-3) */ {5, 0}, // Y+ (5) and Z- (0) - /* 2 (4-5) */ {4, 1}, // Y- (4) and Z+ (1) - /* 3 (6-7) */ {5, 1}, // Y+ (5) and Z+ (1) - - /* 4 (0-2) */ {2, 0}, // X- (2) and Z- (0) - /* 5 (1-3) */ {3, 0}, // X+ (3) and Z- (0) - /* 6 
(4-6) */ {2, 1}, // X- (2) and Z+ (1) - /* 7 (5-7) */ {3, 1}, // X+ (3) and Z+ (1) - - /* 8 (0-4) */ {2, 4}, // X- (2) and Y- (4) - /* 9 (1-5) */ {3, 4}, // X+ (3) and Y- (4) - /* 10 (2-6) */ {2, 5}, // X- (2) and Y+ (5) - /* 11 (3-7) */ {3, 5} // X+ (3) and Y+ (5) + // Edge Index: | allEdges[i] | Shared Faces: + + /* 0 (0-1) */ {4, 0}, // Y- (4) and Z- (0) + /* 1 (2-3) */ {5, 0}, // Y+ (5) and Z- (0) + /* 2 (4-5) */ {4, 1}, // Y- (4) and Z+ (1) + /* 3 (6-7) */ {5, 1}, // Y+ (5) and Z+ (1) + + /* 4 (0-2) */ {2, 0}, // X- (2) and Z- (0) + /* 5 (1-3) */ {3, 0}, // X+ (3) and Z- (0) + /* 6 (4-6) */ {2, 1}, // X- (2) and Z+ (1) + /* 7 (5-7) */ {3, 1}, // X+ (3) and Z+ (1) + + /* 8 (0-4) */ {2, 4}, // X- (2) and Y- (4) + /* 9 (1-5) */ {3, 4}, // X+ (3) and Y- (4) + /* 10 (2-6) */ {2, 5}, // X- (2) and Y+ (5) + /* 11 (3-7) */ {3, 5} // X+ (3) and Y+ (5) }; static float3 corners[8]; static float3 faceCenters[6] = { - float3(0,0,0), float3(0,0,0), float3(0,0,0), - float3(0,0,0), float3(0,0,0), float3(0,0,0) -}; + float3(0, 0, 0), float3(0, 0, 0), float3(0, 0, 0), + float3(0, 0, 0), float3(0, 0, 0), float3(0, 0, 0)}; static const float3 localNormals[6] = { - float3(0, 0, -1), // Face 0 (Z-) - float3(0, 0, 1), // Face 1 (Z+) - float3(-1, 0, 0), // Face 2 (X-) - float3(1, 0, 0), // Face 3 (X+) - float3(0, -1, 0), // Face 4 (Y-) - float3(0, 1, 0) // Face 5 (Y+) + float3(0, 0, -1), // Face 0 (Z-) + float3(0, 0, 1), // Face 1 (Z+) + float3(-1, 0, 0), // Face 2 (X-) + float3(1, 0, 0), // Face 3 (X+) + float3(0, -1, 0), // Face 4 (Y-) + float3(0, 1, 0) // Face 5 (Y+) }; - // TODO: unused, remove later // Vertices are ordered CCW relative to the camera view. 
static const int silhouettes[27][7] = { - {6, 1, 3, 2, 6, 4, 5}, // 0: Black - {6, 2, 6, 4, 5, 7, 3}, // 1: White - {6, 0, 4, 5, 7, 3, 2}, // 2: Gray - {6, 1, 3, 7, 6, 4, 5,}, // 3: Red - {4, 4, 5, 7, 6, -1, -1}, // 4: Green - {6, 0, 4, 5, 7, 6, 2}, // 5: Blue - {6, 0, 1, 3, 7, 6, 4}, // 6: Yellow - {6, 0, 1, 5, 7, 6, 4}, // 7: Magenta - {6, 0, 1, 5, 7, 6, 2}, // 8: Cyan - {6, 1, 3, 2, 6, 7, 5}, // 9: Orange - {4, 2, 6, 7, 3, -1, -1}, // 10: Light Orange - {6, 0, 4, 6, 7, 3, 2}, // 11: Dark Orange - {4, 1, 3, 7, 5, -1, -1}, // 12: Pink - {6, 0, 4, 6, 7, 3, 2}, // 13: Light Pink - {4, 0, 4, 6, 2, -1, -1}, // 14: Deep Rose - {6, 0, 1, 3, 7, 5, 4}, // 15: Purple - {4, 0, 1, 5, 4, -1, -1}, // 16: Light Purple - {6, 0, 1, 5, 4, 6, 2}, // 17: Indigo - {6, 0, 2, 6, 7, 5, 1}, // 18: Dark Green - {6, 0, 2, 6, 7, 3, 1}, // 19: Lime - {6, 0, 4, 6, 7, 3, 1}, // 20: Forest Green - {6, 0, 2, 3, 7, 5, 1}, // 21: Navy - {4, 0, 2, 3, 1, -1, -1}, // 22: Sky Blue - {6, 0, 4, 6, 2, 3, 1}, // 23: Teal - {6, 0, 2, 3, 7, 5, 4}, // 24: Brown - {6, 0, 2, 3, 1, 5, 4}, // 25: Tan/Beige - {6, 1, 5, 4, 6, 2, 3} // 26: Dark Brown + {6, 1, 3, 2, 6, 4, 5}, // 0: Black + {6, 2, 6, 4, 5, 7, 3}, // 1: White + {6, 0, 4, 5, 7, 3, 2}, // 2: Gray + {6, 1, 3, 7, 6, 4, 5}, // 3: Red + {4, 4, 5, 7, 6, -1, -1}, // 4: Green + {6, 0, 4, 5, 7, 6, 2}, // 5: Blue + {6, 0, 1, 3, 7, 6, 4}, // 6: Yellow + {6, 0, 1, 5, 7, 6, 4}, // 7: Magenta + {6, 0, 1, 5, 7, 6, 2}, // 8: Cyan + {6, 1, 3, 2, 6, 7, 5}, // 9: Orange + {4, 2, 6, 7, 3, -1, -1}, // 10: Light Orange + {6, 0, 4, 6, 7, 3, 2}, // 11: Dark Orange + {4, 1, 3, 7, 5, -1, -1}, // 12: Pink + {6, 0, 4, 6, 7, 3, 2}, // 13: Light Pink + {4, 0, 4, 6, 2, -1, -1}, // 14: Deep Rose + {6, 0, 1, 3, 7, 5, 4}, // 15: Purple + {4, 0, 1, 5, 4, -1, -1}, // 16: Light Purple + {6, 0, 1, 5, 4, 6, 2}, // 17: Indigo + {6, 0, 2, 6, 7, 5, 1}, // 18: Dark Green + {6, 0, 2, 6, 7, 3, 1}, // 19: Lime + {6, 0, 4, 6, 7, 3, 1}, // 20: Forest Green + {6, 0, 2, 3, 7, 5, 1}, // 21: Navy + {4, 
0, 2, 3, 1, -1, -1}, // 22: Sky Blue + {6, 0, 4, 6, 2, 3, 1}, // 23: Teal + {6, 0, 2, 3, 7, 5, 4}, // 24: Brown + {6, 0, 2, 3, 1, 5, 4}, // 25: Tan/Beige + {6, 1, 5, 4, 6, 2, 3} // 26: Dark Brown }; // Binary packed silhouettes static const uint32_t binSilhouettes[27] = { - 0b11000000000000101100110010011001, - 0b11000000000000011111101100110010, - 0b11000000000000010011111101100000, - 0b11000000000000101100110111011001, - 0b10000000000000000000110111101100, - 0b11000000000000010110111101100000, - 0b11000000000000100110111011001000, - 0b11000000000000100110111101001000, - 0b11000000000000010110111101001000, - 0b11000000000000101111110010011001, - 0b10000000000000000000011111110010, - 0b11000000000000010011111110100000, - 0b10000000000000000000101111011001, - 0b11000000000000010011111110100000, - 0b10000000000000000000010110100000, - 0b11000000000000100101111011001000, - 0b10000000000000000000100101001000, - 0b11000000000000010110100101001000, - 0b11000000000000001101111110010000, - 0b11000000000000001011111110010000, - 0b11000000000000001011111110100000, - 0b11000000000000001101111011010000, - 0b10000000000000000000001011010000, - 0b11000000000000001011010110100000, - 0b11000000000000100101111011010000, - 0b11000000000000100101001011010000, - 0b11000000000000011010110100101001, + 0b11000000000000101100110010011001, + 0b11000000000000011111101100110010, + 0b11000000000000010011111101100000, + 0b11000000000000101100110111011001, + 0b10000000000000000000110111101100, + 0b11000000000000010110111101100000, + 0b11000000000000100110111011001000, + 0b11000000000000100110111101001000, + 0b11000000000000010110111101001000, + 0b11000000000000101111110010011001, + 0b10000000000000000000011111110010, + 0b11000000000000010011111110100000, + 0b10000000000000000000101111011001, + 0b11000000000000010011111110100000, + 0b10000000000000000000010110100000, + 0b11000000000000100101111011001000, + 0b10000000000000000000100101001000, + 0b11000000000000010110100101001000, + 
0b11000000000000001101111110010000, + 0b11000000000000001011111110010000, + 0b11000000000000001011111110100000, + 0b11000000000000001101111011010000, + 0b10000000000000000000001011010000, + 0b11000000000000001011010110100000, + 0b11000000000000100101111011010000, + 0b11000000000000100101001011010000, + 0b11000000000000011010110100101001, }; int getSilhouetteVertex(uint32_t packedSil, int index) { - return (packedSil >> (3 * index)) & 0x7; + return (packedSil >> (3 * index)) & 0x7; } // Get silhouette size int getSilhouetteSize(uint32_t sil) { - return (sil >> 29) & 0x7; - + return (sil >> 29) & 0x7; } // Check if vertex has negative z bool getVertexZNeg(int vertexIdx) { - return normalize(corners[vertexIdx]).z < 0.0f; +#if FAST + float3 localPos = float3( + (vertexIdx & 1) ? 1.0f : -1.0f, + (vertexIdx & 2) ? 1.0f : -1.0f, + (vertexIdx & 4) ? 1.0f : -1.0f); + + float transformedZ = dot(pc.modelMatrix[2].xyz, localPos) + pc.modelMatrix[2].w; + return transformedZ < 0.0f; +#else + return corners[vertexIdx].z < 0.0f; +#endif } -#include "Drawing.hlsl" +float3 getVertex(int vertexIdx) +{ +#if FAST + // Reconstruct local cube corner from index bits + float sx = (vertexIdx & 1) ? 1.0f : -1.0f; + float sy = (vertexIdx & 2) ? 1.0f : -1.0f; + float sz = (vertexIdx & 4) ? 
1.0f : -1.0f; + + float4x3 model = transpose(pc.modelMatrix); + + // Transform to world + // Full position, not just Z like getVertexZNeg + return model[0].xyz * sx + + model[1].xyz * sy + + model[2].xyz * sz + + model[3].xyz; + // return mul(pc.modelMatrix, float4(sx, sy, sz, 1.0f)); +#else + return corners[vertexIdx]; +#endif +} +#include "Drawing.hlsl" -void setDebugData(uint32_t sil, int3 region, int configIndex, uint32_t clippedVertexCount) +void setDebugData(uint32_t sil, int3 region, int configIndex) { #if DEBUG_DATA - DebugDataBuffer[0].silhouetteVertexCount = uint32_t(getSilhouetteSize(sil)); - DebugDataBuffer[0].region = uint3(region); - DebugDataBuffer[0].silhouetteIndex = uint32_t(configIndex); - DebugDataBuffer[0].clippedVertexCount = clippedVertexCount; - for (int i = 0; i < 6; i++) - { - DebugDataBuffer[0].vertices[i] = uint32_t(getSilhouetteVertex(sil, i)); - } - DebugDataBuffer[0].silhouette = sil; + DebugDataBuffer[0].silhouetteVertexCount = uint32_t(getSilhouetteSize(sil)); + DebugDataBuffer[0].region = uint3(region); + DebugDataBuffer[0].silhouetteIndex = uint32_t(configIndex); + for (int i = 0; i < 6; i++) + { + DebugDataBuffer[0].vertices[i] = uint32_t(getSilhouetteVertex(sil, i)); + } + DebugDataBuffer[0].silhouette = sil; #endif } float2 toCircleSpace(float2 uv) { - float2 p = uv * 2.0f - 1.0f; - float aspect = pc.viewport.z / pc.viewport.w; - p.x *= aspect; - return p; + float2 p = uv * 2.0f - 1.0f; + float aspect = pc.viewport.z / pc.viewport.w; + p.x *= aspect; + return p; } -uint32_t packSilhouette(const int s[7]) +uint32_t packSilhouette(const int s[7]) { - uint32_t packed = 0; - int size = s[0] & 0x7; // 3 bits for size - - // Pack vertices LSB-first (vertex1 in lowest 3 bits above size) - for (int i = 1; i <= 6; ++i) { - int v = s[i]; - if (v < 0) v = 0; // replace unused vertices with 0 - packed |= (v & 0x7) << (3 * (i - 1)); // vertex i-1 shifted by 3*(i-1) - } - - // Put size in the MSB (bits 29-31 for a 32-bit uint, leaving 29 
bits for vertices) - packed |= (size & 0x7) << 29; - - return packed; + uint32_t packed = 0; + int size = s[0] & 0x7; // 3 bits for size + + // Pack vertices LSB-first (vertex1 in lowest 3 bits above size) + for (int i = 1; i <= 6; ++i) + { + int v = s[i]; + if (v < 0) + v = 0; // replace unused vertices with 0 + packed |= (v & 0x7) << (3 * (i - 1)); // vertex i-1 shifted by 3*(i-1) + } + + // Put size in the MSB (bits 29-31 for a 32-bit uint, leaving 29 bits for vertices) + packed |= (size & 0x7) << 29; + + return packed; } void computeCubeGeo() { - for (int i = 0; i < 8; i++) - for (int i = 0; i < 8; i++) - { - float3 localPos = constCorners[i]; - float3 worldPos = mul(pc.modelMatrix, float4(localPos, 1.0f)).xyz; - corners[i] = worldPos.xyz; - faceCenters[i / 4] += worldPos / 4.0f; - faceCenters[2 + i % 2] += worldPos / 4.0f; - faceCenters[4 + (i / 2) % 2] += worldPos / 4.0f; - } + for (int i = 0; i < 8; i++) + { + float3 localPos = constCorners[i]; + float3 worldPos = mul(pc.modelMatrix, float4(localPos, 1.0f)).xyz; + corners[i] = worldPos.xyz; + faceCenters[i / 4] += worldPos / 4.0f; + faceCenters[2 + i % 2] += worldPos / 4.0f; + faceCenters[4 + (i / 2) % 2] += worldPos / 4.0f; + } +} + +// Helper to draw an edge with proper color mapping +float4 drawEdge(int originalEdgeIdx, float3 pts[2], float3 spherePos, float aaWidth, float width = 0.01f) +{ + float4 edgeContribution = drawGreatCircleArc(spherePos, pts, aaWidth, width); + return float4(colorLUT[originalEdgeIdx] * edgeContribution.a, edgeContribution.a); +}; + +float4 drawSilhouette(uint32_t vertexCount, uint32_t sil, float3 spherePos, float aaWidth) +{ + float4 color = 0; + + // Build clip mask (z < 0) + uint32_t clipMask = 0u; + NBL_UNROLL + for (int i = 0; i < 4; i++) + clipMask |= (getVertexZNeg(getSilhouetteVertex(sil, i)) ? 1u : 0u) << i; + + if (vertexCount == 6) + { + NBL_UNROLL + for (int i = 4; i < 6; i++) + clipMask |= (getVertexZNeg(getSilhouetteVertex(sil, i)) ? 
1u : 0u) << i; + } + + int clipCount = countbits(clipMask); + + // Early exit if fully clipped + if (clipCount == vertexCount) + return color; + + // No clipping needed - fast path + if (clipCount == 0) + { + for (int i = 0; i < vertexCount; i++) + { + int i0 = i; + int i1 = (i + 1) % vertexCount; + + float3 v0 = getVertex(getSilhouetteVertex(sil, i0)); + float3 v1 = getVertex(getSilhouetteVertex(sil, i1)); + float3 pts[2] = {v0, v1}; + + color += drawEdge(i1, pts, spherePos, aaWidth); + } + return color; + } + + // Rotate clip mask so positives come first + uint32_t invertedMask = ~clipMask & ((1u << vertexCount) - 1u); + bool wrapAround = ((clipMask & 1u) != 0u) && + ((clipMask & (1u << (vertexCount - 1))) != 0u); + int rotateAmount = wrapAround + ? firstbitlow(invertedMask) // -> First POSITIVE + : firstbithigh(clipMask) + 1; // -> First vertex AFTER last negative + + uint32_t rotatedClipMask = rotr(clipMask, rotateAmount, vertexCount); + uint32_t rotatedSil = rotr(sil, rotateAmount * 3, vertexCount * 3); + + int positiveCount = vertexCount - clipCount; + + // ALWAYS compute both clip points + int lastPosIdx = positiveCount - 1; + int firstNegIdx = positiveCount; + float3 vLastPos = getVertex(getSilhouetteVertex(rotatedSil, lastPosIdx)); + float3 vFirstNeg = getVertex(getSilhouetteVertex(rotatedSil, firstNegIdx)); + float t = vLastPos.z / (vLastPos.z - vFirstNeg.z); + float3 clipA = lerp(vLastPos, vFirstNeg, t); + + float3 vLastNeg = getVertex(getSilhouetteVertex(rotatedSil, vertexCount - 1)); + float3 vFirstPos = getVertex(getSilhouetteVertex(rotatedSil, 0)); + t = vLastNeg.z / (vLastNeg.z - vFirstPos.z); + float3 clipB = lerp(vLastNeg, vFirstPos, t); + + // Draw positive edges + NBL_UNROLL + for (int i = 0; i < positiveCount; i++) + { + + float3 v0 = getVertex(getSilhouetteVertex(rotatedSil, i)); + bool useClipA = (i == positiveCount - 1); + float3 v1 = useClipA ? 
clipA : getVertex(getSilhouetteVertex(rotatedSil, (i + 1) % vertexCount)); + + float3 pts[2] = {v0, v1}; + color += drawEdge(i + 1, pts, spherePos, aaWidth); + } + + // NP edge + if (clipCount > 0 && clipCount < vertexCount) + { + float3 vFirst = getVertex(getSilhouetteVertex(rotatedSil, 0)); + float3 npPts[2] = {clipB, vFirst}; + color += drawEdge(0, npPts, spherePos, aaWidth); + } + + // Horizon arc + if (clipCount > 0 && clipCount < vertexCount) + { + float3 arcPts[2] = {clipA, clipB}; + color += drawEdge(23, arcPts, spherePos, aaWidth, 0.6f); + } + +#if DEBUG_DATA + DebugDataBuffer[0].clipMask = clipMask; + DebugDataBuffer[0].clipCount = clipCount; + { + int transitions = 0; + for (int i = 0; i < vertexCount; i++) + { + bool a = (rotatedClipMask >> i) & 1u; + bool b = (rotatedClipMask >> ((i + 1) % vertexCount)) & 1u; + if (a != b) + transitions++; + } + // transitions must be 0 or 2 + DebugDataBuffer[0].MoreThanTwoBitTransitions = transitions > 2; + DebugDataBuffer[0].rotatedClipMask = rotatedClipMask; + DebugDataBuffer[0].rotateAmount = rotateAmount; + DebugDataBuffer[0].positiveVertCount = positiveCount; + DebugDataBuffer[0].wrapAround = (uint32_t)wrapAround; + DebugDataBuffer[0].rotatedSil = rotatedSil; + } +#endif + return color; } [[vk::location(0)]] float32_t4 main(SVertexAttributes vx) : SV_Target0 { - float4 color = float4(0, 0, 0, 0); - float aaWidth = length(float2(ddx(vx.uv.x), ddy(vx.uv.y))); - float2 p = toCircleSpace(vx.uv); + float4 color = float4(0, 0, 0, 0); + for (int i = 0; i < 1; i++) + { + + float aaWidth = length(float2(ddx(vx.uv.x), ddy(vx.uv.y))); + float2 p = toCircleSpace(vx.uv); - float2 normalized = p / CIRCLE_RADIUS; - float r2 = dot(normalized, normalized); + float2 normalized = p / CIRCLE_RADIUS; + float r2 = dot(normalized, normalized); - float3 spherePos; - if (r2 <= 1.0f) - { - spherePos = float3(normalized.x, normalized.y, sqrt(1.0f - r2)); - } - else - { - float uv2Plus1 = r2 + 1.0f; - spherePos = float3(normalized.x * 2.0f, 
normalized.y * 2.0f, 1.0f - r2) / uv2Plus1; - } - spherePos = normalize(spherePos); + float3 spherePos; + if (r2 <= 1.0f) + { + spherePos = float3(normalized.x, normalized.y, sqrt(1.0f - r2)); + } + else + { + float uv2Plus1 = r2 + 1.0f; + spherePos = float3(normalized.x * 2.0f, normalized.y * 2.0f, 1.0f - r2) / uv2Plus1; + } + spherePos = normalize(spherePos); - computeCubeGeo(); + computeCubeGeo(); - float4x3 columnModel = transpose(pc.modelMatrix); + float4x3 columnModel = transpose(pc.modelMatrix); - float3 obbCenter = columnModel[3].xyz; + float3 obbCenter = columnModel[3].xyz; - float3x3 upper3x3 = (float3x3)columnModel; + float3x3 upper3x3 = (float3x3)columnModel; - float3 rcpScales = rcp(float3( - dot(upper3x3[0], upper3x3[0]), - dot(upper3x3[1], upper3x3[1]), - dot(upper3x3[2], upper3x3[2]) - )); + float3 rcpSqScales = rcp(float3( + dot(upper3x3[0], upper3x3[0]), + dot(upper3x3[1], upper3x3[1]), + dot(upper3x3[2], upper3x3[2]))); - float3 normalizedProj = mul(upper3x3, obbCenter) * rcpScales; + float3 normalizedProj = mul(upper3x3, obbCenter) * rcpSqScales; - int3 region = int3( - normalizedProj.x < -1.0f ? 0 : (normalizedProj.x > 1.0f ? 2 : 1), - normalizedProj.y < -1.0f ? 0 : (normalizedProj.y > 1.0f ? 2 : 1), - normalizedProj.z < -1.0f ? 0 : (normalizedProj.z > 1.0f ? 2 : 1) - ); - int configIndex = region.x + region.y * 3 + region.z * 9; + int3 region = int3( + normalizedProj.x < -1.0f ? 0 : (normalizedProj.x > 1.0f ? 2 : 1), + normalizedProj.y < -1.0f ? 0 : (normalizedProj.y > 1.0f ? 2 : 1), + normalizedProj.z < -1.0f ? 0 : (normalizedProj.z > 1.0f ? 
2 : 1)); - // uint32_t sil = packSilhouette(silhouettes[configIndex]); - uint32_t sil = binSilhouettes[configIndex]; + int configIndex = region.x + region.y * 3 + region.z * 9; - int vertexCount = getSilhouetteSize(sil); - bool longSilhouette = (vertexCount == 6); - uint32_t silEdgeMask = 0; + // uint32_t sil = packSilhouette(silhouettes[configIndex]); + uint32_t sil = binSilhouettes[configIndex]; + + int vertexCount = getSilhouetteSize(sil); + uint32_t silEdgeMask = 0; #if DEBUG_DATA - { - for (int i = 0; i < vertexCount; i++) - { - int vIdx = i % vertexCount; - int v1Idx = (i + 1) % vertexCount; - - int v0Corner = getSilhouetteVertex(sil, vIdx); - int v1Corner = getSilhouetteVertex(sil, v1Idx); - // Mark edge as part of silhouette - for (int e = 0; e < 12; e++) - { - int2 edge = allEdges[e]; - if ((edge.x == v0Corner && edge.y == v1Corner) || - (edge.x == v1Corner && edge.y == v0Corner)) - { - silEdgeMask |= (1u << e); - } - } - } - validateEdgeVisibility(sil, vertexCount, silEdgeMask); - } + { + for (int i = 0; i < vertexCount; i++) + { + int vIdx = i % vertexCount; + int v1Idx = (i + 1) % vertexCount; + + int v0Corner = getSilhouetteVertex(sil, vIdx); + int v1Corner = getSilhouetteVertex(sil, v1Idx); + // Mark edge as part of silhouette + for (int e = 0; e < 12; e++) + { + int2 edge = allEdges[e]; + if ((edge.x == v0Corner && edge.y == v1Corner) || + (edge.x == v1Corner && edge.y == v0Corner)) + { + silEdgeMask |= (1u << e); + } + } + } + validateEdgeVisibility(sil, vertexCount, silEdgeMask); + } #endif - // Build clip mask for vertices below horizon (z < 0) - uint32_t clipMask = 0u; - NBL_UNROLL - for (int i = 0; i < 6; i++) - { - if (i >= vertexCount) break; - clipMask |= (getVertexZNeg(getSilhouetteVertex(sil, i)) ? 1u : 0u) << i; - } - - int clipCount = countbits(clipMask); - - // Total clipped vertices - int clippedVertCount = vertexCount + (clipMask != 0u ? 
(2 - clipCount) : 0); - - // Find rotation amount to place positive vertices first - int rotateAmount = 0; - if (clipMask != 0u) - { - uint32_t invertedMask = ~clipMask & ((1u << vertexCount) - 1u); - bool wrapAround = ((clipMask & 1u) != 0u) && ((clipMask >> (vertexCount - 1)) & 1u); - - rotateAmount = wrapAround ? - ((firstbithigh(invertedMask) + 1) % vertexCount) : - firstbitlow(clipMask); - } - - // Rotate silhouette bits - uint32_t vertexBits = sil & 0x1FFFFFFF; - uint32_t rotatedVertexBits = rotr(vertexBits, rotateAmount * 3, vertexCount * 3); - uint32_t rotatedSil = (sil & 0xE0000000) | rotatedVertexBits; - - // Rotate the clip mask to match - uint32_t rotatedClipMask = rotr(clipMask, rotateAmount, vertexCount); - - // Draw clipped silhouette edges - for (int i = 0; i < clippedVertCount; i++) - { - int nextI = (i + 1) % clippedVertCount; - - int vIdx = i % vertexCount; - int v1Idx = nextI % vertexCount; - - // Extract clip bits directly - bool v0Clipped = (rotatedClipMask >> vIdx) & 1u; - bool v1Clipped = (rotatedClipMask >> v1Idx) & 1u; - - // Skip if both clipped - if (v0Clipped && v1Clipped) continue; - - int v0Corner = getSilhouetteVertex(rotatedSil, vIdx); - int v1Corner = getSilhouetteVertex(rotatedSil, v1Idx); - - float3 v0 = normalize(corners[v0Corner]); - float3 v1 = normalize(corners[v1Corner]); - - float3 points[2] = { corners[v0Corner], corners[v1Corner] }; - - // Clip using bit state - if (v0Clipped) - { - float t = v0.z / (v0.z - v1.z); - points[0] = normalize(lerp(corners[v0Corner], corners[v1Corner], t)); - } - else if (v1Clipped) - { - float t = v0.z / (v0.z - v1.z); - points[1] = normalize(lerp(corners[v0Corner], corners[v1Corner], t)); - } - - // Draw edge - float4 edgeContribution = drawGreatCircleArc(spherePos, points, 1, aaWidth); - color += float4(colorLUT[i] * edgeContribution.a, edgeContribution.a); - - } - - - setDebugData(sil, region, configIndex, clippedVertCount); - - color += drawHiddenEdges(spherePos, silEdgeMask, aaWidth); - 
color += drawCorners(spherePos, p, aaWidth); - color += drawRing(p, aaWidth); - - if (all(vx.uv >= float2(0.49f, 0.49f)) && all(vx.uv <= float2(0.51f, 0.51f))) - { - return float4(colorLUT[configIndex], 1.0f); - } - - return color; + + uint32_t positiveCount = 0; + color += drawSilhouette(vertexCount, sil, spherePos, aaWidth); + setDebugData(sil, region, configIndex); + + color += drawHiddenEdges(spherePos, silEdgeMask, aaWidth); + color += drawCorners(spherePos, p, aaWidth); + color += drawRing(p, aaWidth); + + if (all(vx.uv >= float2(0.49f, 0.49f)) && all(vx.uv <= float2(0.51f, 0.51f))) + { + return float4(colorLUT[configIndex], 1.0f); + } + } + + return color; } \ No newline at end of file diff --git a/72_SolidAngleVisualizer/app_resources/hlsl/common.hlsl b/72_SolidAngleVisualizer/app_resources/hlsl/common.hlsl index 3c87a48bc..c8532e796 100644 --- a/72_SolidAngleVisualizer/app_resources/hlsl/common.hlsl +++ b/72_SolidAngleVisualizer/app_resources/hlsl/common.hlsl @@ -3,6 +3,7 @@ #include "nbl/builtin/hlsl/cpp_compat.hlsl" #define DEBUG_DATA 1 +#define FAST 1 namespace nbl { @@ -13,12 +14,19 @@ namespace nbl { uint32_t3 region; uint32_t silhouetteIndex; - + uint32_t silhouetteVertexCount; uint32_t silhouette; - uint32_t clippedVertexCount; + uint32_t positiveVertCount; uint32_t edgeVisibilityMismatch; + uint32_t clipMask; + uint32_t clipCount; + uint32_t rotatedSil; + uint32_t wrapAround; + uint32_t rotatedClipMask; + uint32_t rotateAmount; + uint32_t MoreThanTwoBitTransitions; uint32_t vertices[6]; }; @@ -29,24 +37,22 @@ namespace nbl }; static const float32_t3 colorLUT[27] = { - float32_t3(0, 0, 0), float32_t3(1, 1, 1), float32_t3(0.5, 0.5, 0.5), - float32_t3(1, 0, 0), float32_t3(0, 1, 0), float32_t3(0, 0, 1), - float32_t3(1, 1, 0), float32_t3(1, 0, 1), float32_t3(0, 1, 1), - float32_t3(1, 0.5, 0), float32_t3(1, 0.65, 0), float32_t3(0.8, 0.4, 0), - float32_t3(1, 0.4, 0.7), float32_t3(1, 0.75, 0.8), float32_t3(0.7, 0.1, 0.3), - float32_t3(0.5, 0, 0.5), 
float32_t3(0.6, 0.4, 0.8), float32_t3(0.3, 0, 0.5), - float32_t3(0, 0.5, 0), float32_t3(0.5, 1, 0), float32_t3(0, 0.5, 0.25), - float32_t3(0, 0, 0.5), float32_t3(0.3, 0.7, 1), float32_t3(0, 0.4, 0.6), - float32_t3(0.6, 0.4, 0.2), float32_t3(0.8, 0.7, 0.3), float32_t3(0.4, 0.3, 0.1) - }; + float32_t3(0, 0, 0), float32_t3(1, 1, 1), float32_t3(0.5, 0.5, 0.5), + float32_t3(1, 0, 0), float32_t3(0, 1, 0), float32_t3(0, 0, 1), + float32_t3(1, 1, 0), float32_t3(1, 0, 1), float32_t3(0, 1, 1), + float32_t3(1, 0.5, 0), float32_t3(1, 0.65, 0), float32_t3(0.8, 0.4, 0), + float32_t3(1, 0.4, 0.7), float32_t3(1, 0.75, 0.8), float32_t3(0.7, 0.1, 0.3), + float32_t3(0.5, 0, 0.5), float32_t3(0.6, 0.4, 0.8), float32_t3(0.3, 0, 0.5), + float32_t3(0, 0.5, 0), float32_t3(0.5, 1, 0), float32_t3(0, 0.5, 0.25), + float32_t3(0, 0, 0.5), float32_t3(0.3, 0.7, 1), float32_t3(0, 0.4, 0.6), + float32_t3(0.6, 0.4, 0.2), float32_t3(0.8, 0.7, 0.3), float32_t3(0.4, 0.3, 0.1)}; #ifndef __HLSL_VERSION - static const char* colorNames[27] = {"Black", - "White", "Gray", "Red", "Green", "Blue", "Yellow", "Magenta", "Cyan", - "Orange", "Light Orange", "Dark Orange", "Pink", "Light Pink", "Deep Rose", "Purple", "Light Purple", - "Indigo", "Dark Green", "Lime", "Forest Green", "Navy", "Sky Blue", "Teal", "Brown", - "Tan/Beige", "Dark Brown" - }; + static const char *colorNames[27] = {"Black", + "White", "Gray", "Red", "Green", "Blue", "Yellow", "Magenta", "Cyan", + "Orange", "Light Orange", "Dark Orange", "Pink", "Light Pink", "Deep Rose", "Purple", "Light Purple", + "Indigo", "Dark Green", "Lime", "Forest Green", "Navy", "Sky Blue", "Teal", "Brown", + "Tan/Beige", "Dark Brown"}; #endif // __HLSL_VERSION } } diff --git a/72_SolidAngleVisualizer/main.cpp b/72_SolidAngleVisualizer/main.cpp index 1c52547af..64f4cb100 100644 --- a/72_SolidAngleVisualizer/main.cpp +++ b/72_SolidAngleVisualizer/main.cpp @@ -475,13 +475,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR 
instance.packedGeo = m_renderer->getGeometries().data(); // cube // +interface.gcIndex; m_renderer->render(cb, viewParams); // draw the cube/OBB - // TODO: a better way to get identity matrix - float32_t3x4 origin = { - 1.0f,0.0f,0.0f,0.0f, - 0.0f,1.0f,0.0f,0.0f, - 0.0f,0.0f,1.0f,0.0f - }; - memcpy(&instance.world, &origin, sizeof(instance.world)); + instance.world = float32_t3x4(1.0f); instance.packedGeo = m_renderer->getGeometries().data() + 2; // disk m_renderer->render(cb, viewParams); } @@ -1112,8 +1106,9 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR drawColorField("silhouetteIndex", m_GPUOutResulData.silhouetteIndex); ImGui::Text("silhouette Vertex Count: %u", m_GPUOutResulData.silhouetteVertexCount); - ImGui::Text("silhouette Clipped VertexCount: %u", m_GPUOutResulData.clippedVertexCount); + ImGui::Text("silhouette Positive VertexCount: %u", m_GPUOutResulData.positiveVertCount); ImGui::Text("Silhouette Mismatch: %s", m_GPUOutResulData.edgeVisibilityMismatch ? "true" : "false"); + ImGui::Text("More Than Two Bit Transitions: %s", m_GPUOutResulData.MoreThanTwoBitTransitions ? 
"true" : "false"); { float32_t3 xAxis = m_OBBModelMatrix[0].xyz; @@ -1141,12 +1136,12 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR lastSilhouetteIndex = m_GPUOutResulData.silhouetteIndex; } - if (!m_GPUOutResulData.edgeVisibilityMismatch) + if (!m_GPUOutResulData.edgeVisibilityMismatch || !m_GPUOutResulData.MoreThanTwoBitTransitions) { // Reset flag when mismatch is cleared modalShown = false; } - if (m_GPUOutResulData.edgeVisibilityMismatch && m_GPUOutResulData.silhouetteIndex != 13 && !modalShown) // 13 means we're inside the cube, so don't care + if ((m_GPUOutResulData.edgeVisibilityMismatch || m_GPUOutResulData.MoreThanTwoBitTransitions) && m_GPUOutResulData.silhouetteIndex != 13 && !modalShown) // 13 means we're inside the cube, so don't care { // Open modal popup only once per configuration ImGui::OpenPopup("Edge Visibility Mismatch Warning"); @@ -1165,10 +1160,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR // Show configuration info ImGui::TextWrapped("Configuration Index: %u", m_GPUOutResulData.silhouetteIndex); - ImGui::TextWrapped("Region: (%d, %d, %d)", - m_GPUOutResulData.region.x, - m_GPUOutResulData.region.y, - m_GPUOutResulData.region.z); + ImGui::TextWrapped("Region: (%u, %u, %u)", m_GPUOutResulData.region.x, m_GPUOutResulData.region.y, m_GPUOutResulData.region.z); ImGui::Spacing(); ImGui::Text("Mismatched Vertices (bitmask): 0x%08X", m_GPUOutResulData.edgeVisibilityMismatch); @@ -1203,13 +1195,26 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR ImGui::Separator(); // Silhouette mask printed in binary - char buf[33]; - for (int i = 0; i < 32; i++) - buf[i] = (m_GPUOutResulData.silhouette & (1u << (31 - i))) ? 
'1' : '0'; - buf[32] = '\0'; - ImGui::Text("silhouette: 0x%08X", m_GPUOutResulData.silhouette); - ImGui::Text("binary: %s", buf); + + auto printBin = [](uint32_t bin, const char* name) + { + char buf[33]; + for (int i = 0; i < 32; i++) + buf[i] = (bin & (1u << (31 - i))) ? '1' : '0'; + buf[32] = '\0'; + ImGui::Text("%s: 0x%08X", name, bin); + ImGui::Text("binary: 0b%s", buf); + ImGui::Separator(); + }; + printBin(m_GPUOutResulData.silhouette, "Silhouette"); + printBin(m_GPUOutResulData.rotatedSil, "rotatedSilhouette"); + + printBin(m_GPUOutResulData.clipCount, "clipCount"); + printBin(m_GPUOutResulData.clipMask, "clipMask"); + printBin(m_GPUOutResulData.rotatedClipMask, "rotatedClipMask"); + printBin(m_GPUOutResulData.rotateAmount, "rotateAmount"); + printBin(m_GPUOutResulData.wrapAround, "wrapAround"); } ImGui::End(); } @@ -1240,29 +1245,56 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR }; static RandomSampler rng(69); // Initialize RNG with seed + + // Helper function to check if cube intersects unit sphere at origin + auto isCubeOutsideUnitSphere = [](const float32_t3& translation, const float32_t3& scale) -> bool { + float cubeRadius = glm::length(scale) * 0.5f; + float distanceToCenter = glm::length(translation); + return (distanceToCenter - cubeRadius) > 1.0f; + }; + + static TRS lastTRS = {}; if (ImGui::Button("Randomize Translation")) { - m_TRS.translation = float32_t3(rng.nextFloat(-3.f, 3.f), rng.nextFloat(-3.f, 3.f), rng.nextFloat(-1.f, 3.f)); + lastTRS = m_TRS; // Backup before randomizing + int attempts = 0; + do { + m_TRS.translation = float32_t3(rng.nextFloat(-3.f, 3.f), rng.nextFloat(-3.f, 3.f), rng.nextFloat(-1.f, 3.f)); + attempts++; + } while (!isCubeOutsideUnitSphere(m_TRS.translation, m_TRS.scale) && attempts < 100); } ImGui::SameLine(); - if (ImGui::Button("Randomize Rotation")) { + lastTRS = m_TRS; // Backup before randomizing m_TRS.rotation = float32_t3(rng.nextFloat(-180.f, 180.f), rng.nextFloat(-180.f, 
180.f), rng.nextFloat(-180.f, 180.f)); } ImGui::SameLine(); - if (ImGui::Button("Randomize Scale")) { - m_TRS.scale = float32_t3(rng.nextFloat(0.5f, 2.0f), rng.nextFloat(0.5f, 2.0f), rng.nextFloat(0.5f, 2.0f)); + lastTRS = m_TRS; // Backup before randomizing + int attempts = 0; + do { + m_TRS.scale = float32_t3(rng.nextFloat(0.5f, 2.0f), rng.nextFloat(0.5f, 2.0f), rng.nextFloat(0.5f, 2.0f)); + attempts++; + } while (!isCubeOutsideUnitSphere(m_TRS.translation, m_TRS.scale) && attempts < 100); } - - ImGui::SameLine(); + //ImGui::SameLine(); if (ImGui::Button("Randomize All")) { - m_TRS.translation = float32_t3(rng.nextFloat(-3.f, 3.f), rng.nextFloat(-3.f, 3.f), rng.nextFloat(-1.f, 3.f)); - m_TRS.rotation = float32_t3(rng.nextFloat(-180.f, 180.f), rng.nextFloat(-180.f, 180.f), rng.nextFloat(-180.f, 180.f)); - m_TRS.scale = float32_t3(rng.nextFloat(0.5f, 2.0f), rng.nextFloat(0.5f, 2.0f), rng.nextFloat(0.5f, 2.0f)); + lastTRS = m_TRS; // Backup before randomizing + int attempts = 0; + do { + m_TRS.translation = float32_t3(rng.nextFloat(-3.f, 3.f), rng.nextFloat(-3.f, 3.f), rng.nextFloat(-1.f, 3.f)); + m_TRS.rotation = float32_t3(rng.nextFloat(-180.f, 180.f), rng.nextFloat(-180.f, 180.f), rng.nextFloat(-180.f, 180.f)); + m_TRS.scale = float32_t3(rng.nextFloat(0.5f, 2.0f), rng.nextFloat(0.5f, 2.0f), rng.nextFloat(0.5f, 2.0f)); + attempts++; + } while (!isCubeOutsideUnitSphere(m_TRS.translation, m_TRS.scale) && attempts < 100); + } + ImGui::SameLine(); + if (ImGui::Button("Revert to Last")) + { + m_TRS = lastTRS; // Restore backed-up TRS } addMatrixTable("Model Matrix", "ModelMatrixTable", 4, 4, &m_OBBModelMatrix[0][0]); From 086af9e6590119bd394f2622db80ab0054445502 Mon Sep 17 00:00:00 2001 From: Karim Mohamed Date: Wed, 31 Dec 2025 14:05:14 +0300 Subject: [PATCH 14/26] Sample and visualize samples on the OBB, - Correct manipulation of OBB using `ImGuizmo::ViewManipulate()` - More visualizations of cube faces and 2D Primary Sample Space --- 
.../app_resources/hlsl/Drawing.hlsl | 207 ++++++++++- .../app_resources/hlsl/Sampling.hlsl | 247 +++++++++++++ .../hlsl/SolidAngleVis.frag.hlsl | 333 +++++++++--------- .../app_resources/hlsl/common.hlsl | 18 +- 73_SolidAngleVisualizer/include/transform.hpp | 34 +- 73_SolidAngleVisualizer/main.cpp | 29 +- 6 files changed, 675 insertions(+), 193 deletions(-) create mode 100644 73_SolidAngleVisualizer/app_resources/hlsl/Sampling.hlsl diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/Drawing.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/Drawing.hlsl index f3f1b4e96..89dfd4ae6 100644 --- a/73_SolidAngleVisualizer/app_resources/hlsl/Drawing.hlsl +++ b/73_SolidAngleVisualizer/app_resources/hlsl/Drawing.hlsl @@ -40,13 +40,24 @@ float drawGreatCircleArc(float3 fragPos, float3 points[2], float aaWidth, float return alpha; } +float drawCross2D(float2 fragPos, float2 center, float size, float thickness) +{ + float2 p = abs(fragPos - center); + + // Check if point is inside the cross (horizontal or vertical bar) + bool inHorizontal = (p.x <= size && p.y <= thickness); + bool inVertical = (p.y <= size && p.x <= thickness); + + return (inHorizontal || inVertical) ? 
1.0f : 0.0f; +} + float4 drawHiddenEdges(float3 spherePos, uint32_t silEdgeMask, float aaWidth) { float4 color = 0; float3 hiddenEdgeColor = float3(0.1, 0.1, 0.1); NBL_UNROLL - for (int i = 0; i < 12; i++) + for (int32_t i = 0; i < 12; i++) { // skip silhouette edges if (silEdgeMask & (1u << i)) @@ -85,14 +96,14 @@ float4 drawHiddenEdges(float3 spherePos, uint32_t silEdgeMask, float aaWidth) return color; } -float4 drawCorners(float3 spherePos, float2 p, float aaWidth) +float4 drawCorners(float2 p, float aaWidth) { float4 color = 0; float dotSize = 0.02f; float innerDotSize = dotSize * 0.5f; - for (int i = 0; i < 8; i++) + for (int32_t i = 0; i < 8; i++) { float3 corner3D = normalize(getVertex(i)); float2 cornerPos = sphereToCircle(corner3D); @@ -130,6 +141,34 @@ float4 drawCorners(float3 spherePos, float2 p, float aaWidth) return color; } +float4 drawClippedSilhouetteVertices(float2 p, ClippedSilhouette silhouette, float aaWidth) +{ + float4 color = 0; + float dotSize = 0.03f; + + for (uint i = 0; i < silhouette.count; i++) + { + float3 corner3D = normalize(silhouette.vertices[i]); + float2 cornerPos = sphereToCircle(corner3D); + + float dist = length(p - cornerPos); + + // Smooth circle for the vertex + float alpha = 1.0f - smoothstep(dotSize * 0.8f, dotSize, dist); + + if (alpha > 0.0f) + { + // Color gradient: Red (index 0) to Cyan (last index) + // This helps verify the CCW winding order visually + float t = float(i) / float(max(1u, silhouette.count - 1)); + float3 vertexColor = lerp(float3(1, 0, 0), float3(0, 1, 1), t); + + color += float4(vertexColor * alpha, alpha); + } + } + return color; +} + float4 drawRing(float2 p, float aaWidth) { float positionLength = length(p); @@ -139,6 +178,59 @@ float4 drawRing(float2 p, float aaWidth) return ringAlpha * float4(1, 1, 1, 1); } +// Returns the number of visible faces and populates the faceIndices array +uint getVisibleFaces(int3 region, out uint faceIndices[3]) +{ + uint count = 0; + + // Check X axis + if 
(region.x == 0) + faceIndices[count++] = 3; // X+ + else if (region.x == 2) + faceIndices[count++] = 2; // X- + + // Check Y axis + if (region.y == 0) + faceIndices[count++] = 5; // Y+ + else if (region.y == 2) + faceIndices[count++] = 4; // Y- + + // Check Z axis + if (region.z == 0) + faceIndices[count++] = 1; // Z+ + else if (region.z == 2) + faceIndices[count++] = 0; // Z- + + return count; +} + +float4 drawVisibleFaceOverlay(float3 spherePos, int3 region, float aaWidth) +{ + uint faceIndices[3]; + uint count = getVisibleFaces(region, faceIndices); + float4 color = 0; + + for (uint i = 0; i < count; i++) + { + uint fIdx = faceIndices[i]; + float3 n = localNormals[fIdx]; + + // Transform normal to world space (using the same logic as your corners) + float3 worldNormal = -normalize(mul((float3x3)pc.modelMatrix, n)); + worldNormal.z = -worldNormal.z; // Invert Z for correct orientation + + // Very basic visualization: highlight if the sphere position + // is generally pointing towards that face's normal + float alignment = dot(spherePos, worldNormal); + if (alignment > 0.95f) + { + // Use different colors for different face indices + color += float4(colorLUT[fIdx % 24], 0.5f); + } + } + return color; +} + // Check if a face on the hemisphere is visible from camera at origin bool isFaceVisible(float3 faceCenter, float3 faceNormal) { @@ -146,8 +238,109 @@ bool isFaceVisible(float3 faceCenter, float3 faceNormal) return dot(faceNormal, viewVec) > 0.0f; } -int getEdgeVisibility(int edgeIdx) +float4 drawFaces(float3 spherePos, float aaWidth) +{ + float4 color = 0.0f; + float3 p = normalize(spherePos); + + float3x3 rotMatrix = (float3x3)pc.modelMatrix; + + // Check each of the 6 faces + for (int32_t faceIdx = 0; faceIdx < 6; faceIdx++) + { + float3 n_world = mul(rotMatrix, localNormals[faceIdx]); + + // Check if face is visible + if (!isFaceVisible(faceCenters[faceIdx], n_world)) + continue; + + // Get the 4 corners of this face + float3 faceVerts[4]; + for (int32_t i = 
0; i < 4; i++) + { + int32_t cornerIdx = faceToCorners[faceIdx][i]; + faceVerts[i] = normalize(getVertex(cornerIdx)); + } + + // Compute face center for winding + float3 faceCenter = float3(0, 0, 0); + for (int32_t i = 0; i < 4; i++) + faceCenter += faceVerts[i]; + faceCenter = normalize(faceCenter); + + // Check if point is inside this face + bool isInside = true; + float minDist = 1e10; + + for (int32_t i = 0; i < 4; i++) + { + float3 v0 = faceVerts[i]; + float3 v1 = faceVerts[(i + 1) % 4]; + + // Skip edges behind camera + if (v0.z < 0.0f && v1.z < 0.0f) + { + isInside = false; + break; + } + + // Great circle normal + float3 edgeNormal = normalize(cross(v0, v1)); + + // Ensure normal points inward + if (dot(edgeNormal, faceCenter) < 0.0f) + edgeNormal = -edgeNormal; + + float d = dot(p, edgeNormal); + + if (d < -1e-6f) + { + isInside = false; + break; + } + + minDist = min(minDist, abs(d)); + } + + if (isInside) + { + float alpha = smoothstep(0.0f, aaWidth * 2.0f, minDist); + + // Use colorLUT based on face index (0-5) + float3 faceColor = colorLUT[faceIdx]; + + float shading = saturate(p.z * 0.8f + 0.2f); + color += float4(faceColor * shading * alpha, alpha); + } + } + + return color; +} + +int32_t getEdgeVisibility(int32_t edgeIdx) { + + // Adjacency of edges to faces + // Corrected Adjacency of edges to faces + static const int2 edgeToFaces[12] = { + // Edge Index: | allEdges[i] | Shared Faces: + + /* 0 (0-1) */ {4, 0}, // Y- (4) and Z- (0) + /* 1 (2-3) */ {5, 0}, // Y+ (5) and Z- (0) + /* 2 (4-5) */ {4, 1}, // Y- (4) and Z+ (1) + /* 3 (6-7) */ {5, 1}, // Y+ (5) and Z+ (1) + + /* 4 (0-2) */ {2, 0}, // X- (2) and Z- (0) + /* 5 (1-3) */ {3, 0}, // X+ (3) and Z- (0) + /* 6 (4-6) */ {2, 1}, // X- (2) and Z+ (1) + /* 7 (5-7) */ {3, 1}, // X+ (3) and Z+ (1) + + /* 8 (0-4) */ {2, 4}, // X- (2) and Y- (4) + /* 9 (1-5) */ {3, 4}, // X+ (3) and Y- (4) + /* 10 (2-6) */ {2, 5}, // X- (2) and Y+ (5) + /* 11 (3-7) */ {3, 5} // X+ (3) and Y+ (5) + }; + int2 faces = 
edgeToFaces[edgeIdx]; // Transform normals to world space @@ -175,7 +368,7 @@ uint32_t computeGroundTruthEdgeMask() { uint32_t mask = 0u; NBL_UNROLL - for (int j = 0; j < 12; j++) + for (int32_t j = 0; j < 12; j++) { // getEdgeVisibility returns 1 for a silhouette edge based on 3D geometry if (getEdgeVisibility(j) == 1) @@ -186,7 +379,7 @@ uint32_t computeGroundTruthEdgeMask() return mask; } -void validateEdgeVisibility(uint32_t sil, int vertexCount, uint32_t generatedSilMask) +void validateEdgeVisibility(uint32_t sil, int32_t vertexCount, uint32_t generatedSilMask) { uint32_t mismatchAccumulator = 0; @@ -199,7 +392,7 @@ void validateEdgeVisibility(uint32_t sil, int vertexCount, uint32_t generatedSil if (mismatchMask != 0) { NBL_UNROLL - for (int j = 0; j < 12; j++) + for (int32_t j = 0; j < 12; j++) { if ((mismatchMask >> j) & 1u) { diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/Sampling.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/Sampling.hlsl new file mode 100644 index 000000000..d213d8b94 --- /dev/null +++ b/73_SolidAngleVisualizer/app_resources/hlsl/Sampling.hlsl @@ -0,0 +1,247 @@ +#ifndef _SAMPLING_HLSL_ +#define _SAMPLING_HLSL_ + +// Include the spherical triangle utilities +#include +#include +#include "nbl/builtin/hlsl/random/pcg.hlsl" +#include "nbl/builtin/hlsl/random/xoroshiro.hlsl" + +using namespace nbl::hlsl; +// Sampling mode enum +#define SAMPLING_MODE_SOLID_ANGLE 0 +#define SAMPLING_MODE_PROJECTED_SOLID_ANGLE 1 + +// Maximum number of triangles we can have after clipping +// Without clipping, max 3 faces can be visible at once +// With clipping, can be more. 
7 - 2 = 5 max triangles because fanning from one vertex +#define MAX_TRIANGLES 5 + +struct SamplingData +{ + float32_t triangleWeights[MAX_TRIANGLES]; + uint32_t triangleIndices[MAX_TRIANGLES]; // Store the 'i' value for each valid triangle + uint32_t count; + float32_t totalWeight; +}; + +float32_t2 nextRandomUnorm2(inout nbl::hlsl::Xoroshiro64StarStar rnd) +{ + return float32_t2( + float32_t(rnd()) * 2.3283064365386963e-10, + float32_t(rnd()) * 2.3283064365386963e-10); +} + +float32_t computeProjectedSolidAngleFallback(float32_t3 v0, float32_t3 v1, float32_t3 v2, float32_t3 N) +{ + // 1. Get edge normals (unit vectors) + // We use the cross product of the vertices (unit vectors on sphere) + float32_t3 n0 = cross(v0, v1); + float32_t3 n1 = cross(v1, v2); + float32_t3 n2 = cross(v2, v0); + + // 2. Normalize edge normals (magnitude is sin of the arc length) + float32_t l0 = length(n0); + float32_t l1 = length(n1); + float32_t l2 = length(n2); + + // Guard against degenerate triangles + if (l0 < 1e-7 || l1 < 1e-7 || l2 < 1e-7) + return 0.0f; + + n0 /= l0; + n1 /= l1; + n2 /= l2; + + // 3. Get arc lengths (angles in radians) + float32_t a = asin(clamp(l0, -1.0, 1.0)); // side v0-v1 + float32_t b = asin(clamp(l1, -1.0, 1.0)); // side v1-v2 + float32_t c = asin(clamp(l2, -1.0, 1.0)); // side v2-v0 + + // Handle acos/asin quadrant if dot product is negative + if (dot(v0, v1) < 0) + a = 3.14159265 - a; + if (dot(v1, v2) < 0) + b = 3.14159265 - b; + if (dot(v2, v0) < 0) + c = 3.14159265 - c; + + // 4. 
Compute projected solid angle + float32_t Gamma = 0.5f * (a * dot(n0, N) + b * dot(n1, N) + c * dot(n2, N)); + + // Return the absolute value of the total (to handle CW/CCW triangles) + return abs(Gamma); +} + +// Build sampling data - store weights and vertex indices +SamplingData buildSamplingDataFromSilhouette(ClippedSilhouette silhouette, int32_t samplingMode) +{ + SamplingData data; + data.count = 0; + data.totalWeight = 0; + + if (silhouette.count < 3) + return data; + + float32_t3 v0 = silhouette.vertices[0]; + float32_t3 origin = float32_t3(0, 0, 0); + + for (uint32_t i = 1; i < silhouette.count - 1; i++) + { + float32_t3 v1 = silhouette.vertices[i]; + float32_t3 v2 = silhouette.vertices[i + 1]; + + shapes::SphericalTriangle shapeTri = shapes::SphericalTriangle::create(v0, v1, v2, origin); + + if (shapeTri.pyramidAngles()) + continue; + + float32_t weight; + if (samplingMode == SAMPLING_MODE_PROJECTED_SOLID_ANGLE) + { + float32_t3 faceNormal = normalize(cross(v1 - v0, v2 - v0)); // TODO: precompute? + weight = computeProjectedSolidAngleFallback(normalize(v0), normalize(v1), normalize(v2), faceNormal); + } + else + { + weight = shapeTri.solidAngleOfTriangle(); + } + + if (weight <= 0.0f) + continue; + + data.triangleWeights[data.count] = weight; + data.triangleIndices[data.count] = i; // Store the original vertex index, we need to account for skipped degenerate triangles. 
+ data.totalWeight += weight; + data.count++; + } + +#if DEBUG_DATA + // Debug-only sanity check: an edge whose endpoints are (nearly) antipodal has no unique great-circle arc (lune case) + for (uint32_t i = 0; i < silhouette.count; i++) + { + uint32_t j = (i + 1) % silhouette.count; + float32_t3 n1 = normalize(silhouette.vertices[i]); + float32_t3 n2 = normalize(silhouette.vertices[j]); + + // Endpoints count as antipodal when their directions are almost exactly opposite + bool antipodal = dot(n1, n2) < -0.99f; + + assert(!antipodal && "Spherical lune detected: antipodal silhouette edge"); + } +#endif + + DebugDataBuffer[0].maxTrianglesExcceded = data.count > MAX_TRIANGLES; + return data; +} + +float32_t3 sampleFromData(SamplingData data, ClippedSilhouette silhouette, float32_t2 xi, out float32_t pdf, out uint32_t selectedIdx) +{ + if (data.count == 0 || data.totalWeight <= 0.0f) + { + pdf = 0; + selectedIdx = 0; + return float32_t3(0, 0, 1); + } + + // Select triangle using uniform random sampling weighted by importance + float32_t toFind = xi.x * data.totalWeight; + uint32_t triIdx = 0; + float32_t cumulativeWeight = 0.0f; + float32_t prevCumulativeWeight = 0.0f; + + NBL_UNROLL + for (uint32_t i = 0; i < data.count; i++) + { + prevCumulativeWeight = cumulativeWeight; + cumulativeWeight += data.triangleWeights[i]; + if (toFind <= cumulativeWeight) + { + triIdx = i; + break; + } + } + + selectedIdx = triIdx; + + // Remap xi.x to [0,1] within the selected triangle's weight range + float32_t triMin = prevCumulativeWeight; + float32_t triMax = cumulativeWeight; + float32_t triWeight = triMax - triMin; + float32_t u = (toFind - triMin) / max(triWeight, 1e-7f); + + // Reconstruct the triangle using the stored vertex index + uint32_t vertexIdx = data.triangleIndices[triIdx]; // We need to account for skipped degenerate triangles. 
+ float32_t3 v0 = silhouette.vertices[0]; + float32_t3 v1 = silhouette.vertices[vertexIdx]; + float32_t3 v2 = silhouette.vertices[vertexIdx + 1]; + float32_t3 origin = float32_t3(0, 0, 0); + + shapes::SphericalTriangle shapeTri = shapes::SphericalTriangle::create(v0, v1, v2, origin); + sampling::SphericalTriangle samplingTri = sampling::SphericalTriangle::create(shapeTri); + + // Sample from the selected triangle using remapped u and original xi.y + float32_t rcpPdf; + float32_t3 direction = samplingTri.generate(rcpPdf, float32_t2(u, xi.y)); + + float32_t trianglePdf = 1.0f / rcpPdf; + pdf = trianglePdf * (data.triangleWeights[triIdx] / data.totalWeight); + + return normalize(direction); +} + +float32_t4 visualizeSamples(float32_t2 screenUV, float32_t3 spherePos, ClippedSilhouette silhouette, + int32_t samplingMode, SamplingData samplingData, int32_t numSamples) +{ + float32_t4 accumColor = 0; + + if (samplingData.count == 0) + return 0; + + float32_t2 pssSize = float32_t2(0.3, 0.3); // 30% of screen + float32_t2 pssPos = float32_t2(0.01, 0.01); // Offset from corner + bool isInsidePSS = all(and(screenUV >= pssPos, screenUV <= (pssPos + pssSize))); + + for (int32_t i = 0; i < numSamples; i++) + { + nbl::hlsl::random::PCG32 seedGen = nbl::hlsl::random::PCG32::construct(pc.frameIndex * 65536u + i); + const uint32_t seed1 = seedGen(); + const uint32_t seed2 = seedGen(); + nbl::hlsl::Xoroshiro64StarStar rnd = nbl::hlsl::Xoroshiro64StarStar::construct(uint32_t2(seed1, seed2)); + float32_t2 xi = nextRandomUnorm2(rnd); + + float32_t pdf; + uint32_t triIdx; + float32_t3 sampleDir = sampleFromData(samplingData, silhouette, xi, pdf, triIdx); + + float32_t dist3D = distance(sampleDir, normalize(spherePos)); + float32_t alpha3D = 1.0f - smoothstep(0.0f, 0.02f, dist3D); + + if (alpha3D > 0.0f && !isInsidePSS) + { + float32_t3 sampleColor = colorLUT[triIdx].rgb; + accumColor += float32_t4(sampleColor * alpha3D, alpha3D); + } + + if (isInsidePSS) + { + // Map the raw xi to the 
PSS square dimensions + float32_t2 xiPixelPos = pssPos + xi * pssSize; + float32_t dist2D = distance(screenUV, xiPixelPos); + + float32_t alpha2D = drawCross2D(screenUV, xiPixelPos, 0.005f, 0.001f); + if (alpha2D > 0.0f) + { + float32_t3 sampleColor = colorLUT[triIdx].rgb; + accumColor += float32_t4(sampleColor * alpha2D, alpha2D); + } + } + } + + // just the outline of the PSS + if (isInsidePSS && accumColor.a < 0.1) + accumColor = float32_t4(0.1, 0.1, 0.1, 1.0); + + return accumColor; +} +#endif diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl index 8cc46bd25..31cbe577a 100644 --- a/73_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl +++ b/73_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl @@ -13,12 +13,17 @@ using namespace ext::FullScreenTriangle; static const float CIRCLE_RADIUS = 0.5f; // --- Geometry Utils --- +struct ClippedSilhouette +{ + float32_t3 vertices[7]; + uint32_t count; +}; -static const float3 constCorners[8] = { - float3(-1, -1, -1), float3(1, -1, -1), float3(-1, 1, -1), float3(1, 1, -1), - float3(-1, -1, 1), float3(1, -1, 1), float3(-1, 1, 1), float3(1, 1, 1)}; +static const float32_t3 constCorners[8] = { + float32_t3(-1, -1, -1), float32_t3(1, -1, -1), float32_t3(-1, 1, -1), float32_t3(1, 1, -1), + float32_t3(-1, -1, 1), float32_t3(1, -1, 1), float32_t3(-1, 1, 1), float32_t3(1, 1, 1)}; -static const int2 allEdges[12] = { +static const int32_t2 allEdges[12] = { {0, 1}, {2, 3}, {4, 5}, @@ -33,43 +38,33 @@ static const int2 allEdges[12] = { {3, 7}, // Z axis }; -// Adjacency of edges to faces -// Corrected Adjacency of edges to faces -static const int2 edgeToFaces[12] = { - // Edge Index: | allEdges[i] | Shared Faces: - - /* 0 (0-1) */ {4, 0}, // Y- (4) and Z- (0) - /* 1 (2-3) */ {5, 0}, // Y+ (5) and Z- (0) - /* 2 (4-5) */ {4, 1}, // Y- (4) and Z+ (1) - /* 3 (6-7) */ {5, 1}, // Y+ (5) and Z+ (1) - - /* 4 (0-2) */ 
{2, 0}, // X- (2) and Z- (0) - /* 5 (1-3) */ {3, 0}, // X+ (3) and Z- (0) - /* 6 (4-6) */ {2, 1}, // X- (2) and Z+ (1) - /* 7 (5-7) */ {3, 1}, // X+ (3) and Z+ (1) - - /* 8 (0-4) */ {2, 4}, // X- (2) and Y- (4) - /* 9 (1-5) */ {3, 4}, // X+ (3) and Y- (4) - /* 10 (2-6) */ {2, 5}, // X- (2) and Y+ (5) - /* 11 (3-7) */ {3, 5} // X+ (3) and Y+ (5) +// Maps face index (0-5) to its 4 corner indices in CCW order +static const uint32_t faceToCorners[6][4] = { + {0, 2, 3, 1}, // Face 0: Z- + {4, 5, 7, 6}, // Face 1: Z+ + {0, 4, 6, 2}, // Face 2: X- + {1, 3, 7, 5}, // Face 3: X+ + {0, 1, 5, 4}, // Face 4: Y- + {2, 6, 7, 3} // Face 5: Y+ }; -static float3 corners[8]; -static float3 faceCenters[6] = { - float3(0, 0, 0), float3(0, 0, 0), float3(0, 0, 0), - float3(0, 0, 0), float3(0, 0, 0), float3(0, 0, 0)}; - -static const float3 localNormals[6] = { - float3(0, 0, -1), // Face 0 (Z-) - float3(0, 0, 1), // Face 1 (Z+) - float3(-1, 0, 0), // Face 2 (X-) - float3(1, 0, 0), // Face 3 (X+) - float3(0, -1, 0), // Face 4 (Y-) - float3(0, 1, 0) // Face 5 (Y+) + +static float32_t3 corners[8]; +static float32_t3 faceCenters[6] = { + float32_t3(0, 0, 0), float32_t3(0, 0, 0), float32_t3(0, 0, 0), + float32_t3(0, 0, 0), float32_t3(0, 0, 0), float32_t3(0, 0, 0)}; + +static const float32_t3 localNormals[6] = { + float32_t3(0, 0, -1), // Face 0 (Z-) + float32_t3(0, 0, 1), // Face 1 (Z+) + float32_t3(-1, 0, 0), // Face 2 (X-) + float32_t3(1, 0, 0), // Face 3 (X+) + float32_t3(0, -1, 0), // Face 4 (Y-) + float32_t3(0, 1, 0) // Face 5 (Y+) }; // TODO: unused, remove later // Vertices are ordered CCW relative to the camera view. 
-static const int silhouettes[27][7] = { +static const int32_t silhouettes[27][7] = { {6, 1, 3, 2, 6, 4, 5}, // 0: Black {6, 2, 6, 4, 5, 7, 3}, // 1: White {6, 0, 4, 5, 7, 3, 2}, // 2: Gray @@ -130,22 +125,22 @@ static const uint32_t binSilhouettes[27] = { 0b11000000000000011010110100101001, }; -int getSilhouetteVertex(uint32_t packedSil, int index) +int32_t getSilhouetteVertex(uint32_t packedSil, int32_t index) { return (packedSil >> (3 * index)) & 0x7; } // Get silhouette size -int getSilhouetteSize(uint32_t sil) +int32_t getSilhouetteSize(uint32_t sil) { return (sil >> 29) & 0x7; } // Check if vertex has negative z -bool getVertexZNeg(int vertexIdx) +bool getVertexZNeg(int32_t vertexIdx) { #if FAST - float3 localPos = float3( + float32_t3 localPos = float32_t3( (vertexIdx & 1) ? 1.0f : -1.0f, (vertexIdx & 2) ? 1.0f : -1.0f, (vertexIdx & 4) ? 1.0f : -1.0f); @@ -157,7 +152,8 @@ bool getVertexZNeg(int vertexIdx) #endif } -float3 getVertex(int vertexIdx) +// Get world position of cube vertex +float32_t3 getVertex(int32_t vertexIdx) { #if FAST // Reconstruct local cube corner from index bits @@ -165,7 +161,7 @@ float3 getVertex(int vertexIdx) float sy = (vertexIdx & 2) ? 1.0f : -1.0f; float sz = (vertexIdx & 4) ? 
1.0f : -1.0f; - float4x3 model = transpose(pc.modelMatrix); + float32_t4x3 model = transpose(pc.modelMatrix); // Transform to world // Full position, not just Z like getVertexZNeg @@ -173,21 +169,22 @@ float3 getVertex(int vertexIdx) model[1].xyz * sy + model[2].xyz * sz + model[3].xyz; - // return mul(pc.modelMatrix, float4(sx, sy, sz, 1.0f)); + // return mul(pc.modelMatrix, float32_t4(sx, sy, sz, 1.0f)); #else return corners[vertexIdx]; #endif } #include "Drawing.hlsl" +#include "Sampling.hlsl" -void setDebugData(uint32_t sil, int3 region, int configIndex) +void setDebugData(uint32_t sil, int32_t3 region, int32_t configIndex) { #if DEBUG_DATA - DebugDataBuffer[0].silhouetteVertexCount = uint32_t(getSilhouetteSize(sil)); - DebugDataBuffer[0].region = uint3(region); + DebugDataBuffer[0].region = uint32_t3(region); DebugDataBuffer[0].silhouetteIndex = uint32_t(configIndex); - for (int i = 0; i < 6; i++) + DebugDataBuffer[0].silhouetteVertexCount = uint32_t(getSilhouetteSize(sil)); + for (int32_t i = 0; i < 6; i++) { DebugDataBuffer[0].vertices[i] = uint32_t(getSilhouetteVertex(sil, i)); } @@ -195,29 +192,29 @@ void setDebugData(uint32_t sil, int3 region, int configIndex) #endif } -float2 toCircleSpace(float2 uv) +float32_t2 toCircleSpace(float32_t2 uv) { - float2 p = uv * 2.0f - 1.0f; + float32_t2 p = uv * 2.0f - 1.0f; float aspect = pc.viewport.z / pc.viewport.w; p.x *= aspect; return p; } -uint32_t packSilhouette(const int s[7]) +uint32_t packSilhouette(const int32_t s[7]) { uint32_t packed = 0; - int size = s[0] & 0x7; // 3 bits for size + int32_t size = s[0] & 0x7; // 3 bits for size // Pack vertices LSB-first (vertex1 in lowest 3 bits above size) - for (int i = 1; i <= 6; ++i) + for (int32_t i = 1; i <= 6; ++i) { - int v = s[i]; + int32_t v = s[i]; if (v < 0) v = 0; // replace unused vertices with 0 packed |= (v & 0x7) << (3 * (i - 1)); // vertex i-1 shifted by 3*(i-1) } - // Put size in the MSB (bits 29-31 for a 32-bit uint, leaving 29 bits for vertices) + // 
Put size in the MSB (bits 29-31 for a 32-bit uint32_t, leaving 29 bits for vertices) packed |= (size & 0x7) << 29; return packed; @@ -225,211 +222,201 @@ uint32_t packSilhouette(const int s[7]) void computeCubeGeo() { - for (int i = 0; i < 8; i++) + for (int32_t i = 0; i < 8; i++) + corners[i] = mul(pc.modelMatrix, float32_t4(constCorners[i], 1.0f)).xyz; + + for (int32_t f = 0; f < 6; f++) { - float3 localPos = constCorners[i]; - float3 worldPos = mul(pc.modelMatrix, float4(localPos, 1.0f)).xyz; - corners[i] = worldPos.xyz; - faceCenters[i / 4] += worldPos / 4.0f; - faceCenters[2 + i % 2] += worldPos / 4.0f; - faceCenters[4 + (i / 2) % 2] += worldPos / 4.0f; + faceCenters[f] = float32_t3(0, 0, 0); + for (int32_t v = 0; v < 4; v++) + faceCenters[f] += corners[faceToCorners[f][v]]; + faceCenters[f] /= 4.0f; } } // Helper to draw an edge with proper color mapping -float4 drawEdge(int originalEdgeIdx, float3 pts[2], float3 spherePos, float aaWidth, float width = 0.01f) +float32_t4 drawEdge(int32_t originalEdgeIdx, float32_t3 pts[2], float32_t3 spherePos, float aaWidth, float width = 0.01f) { - float4 edgeContribution = drawGreatCircleArc(spherePos, pts, aaWidth, width); - return float4(colorLUT[originalEdgeIdx] * edgeContribution.a, edgeContribution.a); + float32_t4 edgeContribution = drawGreatCircleArc(spherePos, pts, aaWidth, width); + return float32_t4(colorLUT[originalEdgeIdx] * edgeContribution.a, edgeContribution.a); }; -float4 drawSilhouette(uint32_t vertexCount, uint32_t sil, float3 spherePos, float aaWidth) +float32_t4 computeSilhouette(uint32_t vertexCount, uint32_t sil, float32_t3 spherePos, float aaWidth, out ClippedSilhouette silhouette) { - float4 color = 0; + float32_t4 color = float32_t4(0, 0, 0, 0); + silhouette.count = 0; // Build clip mask (z < 0) - int clipMask = 0u; + int32_t clipMask = 0u; NBL_UNROLL - for (int i = 0; i < 4; i++) + for (int32_t i = 0; i < 4; i++) clipMask |= (getVertexZNeg(getSilhouetteVertex(sil, i)) ? 
1u : 0u) << i; if (vertexCount == 6) { NBL_UNROLL - for (int i = 4; i < 6; i++) + for (int32_t i = 4; i < 6; i++) clipMask |= (getVertexZNeg(getSilhouetteVertex(sil, i)) ? 1u : 0u) << i; } - int clipCount = countbits(clipMask); + int32_t clipCount = countbits(clipMask); +#if 0 // Early exit if fully clipped - // if (clipCount == vertexCount) - // return color; + if (clipCount == vertexCount) + return color; // No clipping needed - fast path - // if (clipCount == 0) - // { - // for (int i = 0; i < vertexCount; i++) - // { - // int i0 = i; - // int i1 = (i + 1) % vertexCount; - - // float3 v0 = getVertex(getSilhouetteVertex(sil, i0)); - // float3 v1 = getVertex(getSilhouetteVertex(sil, i1)); - // float3 pts[2] = {v0, v1}; - - // color += drawEdge(i1, pts, spherePos, aaWidth); - // } - // return color; - // } + if (clipCount == 0) + { + for (int32_t i = 0; i < vertexCount; i++) + { + int32_t i0 = i; + int32_t i1 = (i + 1) % vertexCount; + + float32_t3 v0 = getVertex(getSilhouetteVertex(sil, i0)); + float32_t3 v1 = getVertex(getSilhouetteVertex(sil, i1)); + float32_t3 pts[2] = {v0, v1}; + + color += drawEdge(i1, pts, spherePos, aaWidth); + } + return color; + } +#endif // Rotate clip mask so positives come first uint32_t invertedMask = ~clipMask & ((1u << vertexCount) - 1u); bool wrapAround = ((clipMask & 1u) != 0u) && ((clipMask & (1u << (vertexCount - 1))) != 0u); - int rotateAmount = wrapAround - ? firstbitlow(invertedMask) // -> First POSITIVE - : firstbithigh(clipMask) + 1; // -> First vertex AFTER last negative, + int32_t rotateAmount = wrapAround + ? 
firstbitlow(invertedMask) // -> First POSITIVE + : firstbithigh(clipMask) + 1; // -> First vertex AFTER last negative uint32_t rotatedClipMask = rotr(clipMask, rotateAmount, vertexCount); uint32_t rotatedSil = rotr(sil, rotateAmount * 3, vertexCount * 3); - int positiveCount = vertexCount - clipCount; + int32_t positiveCount = vertexCount - clipCount; // ALWAYS compute both clip points - int lastPosIdx = positiveCount - 1; - int firstNegIdx = positiveCount; - float3 vLastPos = getVertex(getSilhouetteVertex(rotatedSil, lastPosIdx)); - float3 vFirstNeg = getVertex(getSilhouetteVertex(rotatedSil, firstNegIdx)); + int32_t lastPosIdx = positiveCount - 1; + int32_t firstNegIdx = positiveCount; + float32_t3 vLastPos = getVertex(getSilhouetteVertex(rotatedSil, lastPosIdx)); + float32_t3 vFirstNeg = getVertex(getSilhouetteVertex(rotatedSil, firstNegIdx)); float t = vLastPos.z / (vLastPos.z - vFirstNeg.z); - float3 clipA = lerp(vLastPos, vFirstNeg, t); + float32_t3 clipA = lerp(vLastPos, vFirstNeg, t); - float3 vLastNeg = getVertex(getSilhouetteVertex(rotatedSil, vertexCount - 1)); - float3 vFirstPos = getVertex(getSilhouetteVertex(rotatedSil, 0)); + float32_t3 vLastNeg = getVertex(getSilhouetteVertex(rotatedSil, vertexCount - 1)); + float32_t3 vFirstPos = getVertex(getSilhouetteVertex(rotatedSil, 0)); t = vLastNeg.z / (vLastNeg.z - vFirstPos.z); - float3 clipB = lerp(vLastNeg, vFirstPos, t); + float32_t3 clipB = lerp(vLastNeg, vFirstPos, t); // Draw positive edges NBL_UNROLL - for (int i = 0; i < positiveCount; i++) + for (int32_t i = 0; i < positiveCount; i++) { - float3 v0 = getVertex(getSilhouetteVertex(rotatedSil, i)); + float32_t3 v0 = getVertex(getSilhouetteVertex(rotatedSil, i)); // ONLY use clipA if we are at the end of the positive run AND there's a clip bool isLastPositive = (i == positiveCount - 1); bool useClipA = (clipCount > 0) && isLastPositive; // If not using clipA, wrap around to the next vertex - float3 v1 = useClipA ? 
clipA : getVertex(getSilhouetteVertex(rotatedSil, (i + 1) % vertexCount)); + float32_t3 v1 = useClipA ? clipA : getVertex(getSilhouetteVertex(rotatedSil, (i + 1) % vertexCount)); - float3 pts[2] = {v0, v1}; + float32_t3 pts[2] = {v0, v1}; color += drawEdge((i + 1) % vertexCount, pts, spherePos, aaWidth); + + silhouette.vertices[silhouette.count++] = v0; } - // NP edge if (clipCount > 0 && clipCount < vertexCount) { - float3 vFirst = getVertex(getSilhouetteVertex(rotatedSil, 0)); - float3 npPts[2] = {clipB, vFirst}; + // NP edge + float32_t3 vFirst = getVertex(getSilhouetteVertex(rotatedSil, 0)); + float32_t3 npPts[2] = {clipB, vFirst}; color += drawEdge(0, npPts, spherePos, aaWidth); - } - // Horizon arc - if (clipCount > 0 && clipCount < vertexCount) - { - float3 arcPts[2] = {clipA, clipB}; + // Horizon arc + float32_t3 arcPts[2] = {clipA, clipB}; color += drawEdge(23, arcPts, spherePos, aaWidth, 0.6f); + + silhouette.vertices[silhouette.count++] = clipA; + silhouette.vertices[silhouette.count++] = clipB; } #if DEBUG_DATA DebugDataBuffer[0].clipMask = clipMask; DebugDataBuffer[0].clipCount = clipCount; - { - int transitions = 0; - for (int i = 0; i < vertexCount; i++) - { - bool a = (rotatedClipMask >> i) & 1u; - bool b = (rotatedClipMask >> ((i + 1) % vertexCount)) & 1u; - if (a != b) - transitions++; - } - // transitions must be 0 or 2 - DebugDataBuffer[0].MoreThanTwoBitTransitions = transitions > 2; - DebugDataBuffer[0].rotatedClipMask = rotatedClipMask; - DebugDataBuffer[0].rotateAmount = rotateAmount; - DebugDataBuffer[0].positiveVertCount = positiveCount; - DebugDataBuffer[0].wrapAround = (uint32_t)wrapAround; - DebugDataBuffer[0].rotatedSil = rotatedSil; - } + DebugDataBuffer[0].rotatedClipMask = rotatedClipMask; + DebugDataBuffer[0].rotateAmount = rotateAmount; + DebugDataBuffer[0].positiveVertCount = positiveCount; + DebugDataBuffer[0].wrapAround = (uint32_t)wrapAround; + DebugDataBuffer[0].rotatedSil = rotatedSil; + #endif return color; } 
[[vk::location(0)]] float32_t4 main(SVertexAttributes vx) : SV_Target0 { - float4 color = float4(0, 0, 0, 0); - for (int i = 0; i < 1; i++) + float32_t4 color = float32_t4(0, 0, 0, 0); + for (int32_t i = 0; i < 1; i++) { + float aaWidth = length(float32_t2(ddx(vx.uv.x), ddy(vx.uv.y))); + float32_t2 p = toCircleSpace(vx.uv); - float aaWidth = length(float2(ddx(vx.uv.x), ddy(vx.uv.y))); - float2 p = toCircleSpace(vx.uv); - - float2 normalized = p / CIRCLE_RADIUS; + float32_t2 normalized = p / CIRCLE_RADIUS; float r2 = dot(normalized, normalized); - float3 spherePos; + float32_t3 spherePos; if (r2 <= 1.0f) { - spherePos = float3(normalized.x, normalized.y, sqrt(1.0f - r2)); + spherePos = float32_t3(normalized.x, normalized.y, sqrt(1.0f - r2)); } else { float uv2Plus1 = r2 + 1.0f; - spherePos = float3(normalized.x * 2.0f, normalized.y * 2.0f, 1.0f - r2) / uv2Plus1; + spherePos = float32_t3(normalized.x * 2.0f, normalized.y * 2.0f, 1.0f - r2) / uv2Plus1; } spherePos = normalize(spherePos); computeCubeGeo(); - float4x3 columnModel = transpose(pc.modelMatrix); - - float3 obbCenter = columnModel[3].xyz; - - float3x3 upper3x3 = (float3x3)columnModel; - - float3 rcpSqScales = rcp(float3( + float32_t4x3 columnModel = transpose(pc.modelMatrix); + float32_t3 obbCenter = columnModel[3].xyz; + float32_t3x3 upper3x3 = (float32_t3x3)columnModel; + float32_t3 rcpSqScales = rcp(float32_t3( dot(upper3x3[0], upper3x3[0]), dot(upper3x3[1], upper3x3[1]), dot(upper3x3[2], upper3x3[2]))); + float32_t3 normalizedProj = mul(upper3x3, obbCenter) * rcpSqScales; - float3 normalizedProj = mul(upper3x3, obbCenter) * rcpSqScales; - - int3 region = int3( + int32_t3 region = int32_t3( normalizedProj.x < -1.0f ? 0 : (normalizedProj.x > 1.0f ? 2 : 1), normalizedProj.y < -1.0f ? 0 : (normalizedProj.y > 1.0f ? 2 : 1), normalizedProj.z < -1.0f ? 0 : (normalizedProj.z > 1.0f ? 
2 : 1)); - int configIndex = region.x + region.y * 3 + region.z * 9; + int32_t configIndex = region.x + region.y * 3 + region.z * 9; // uint32_t sil = packSilhouette(silhouettes[configIndex]); uint32_t sil = binSilhouettes[configIndex]; - int vertexCount = getSilhouetteSize(sil); - uint32_t silEdgeMask = 0; + int32_t vertexCount = getSilhouetteSize(sil); + uint32_t silEdgeMask = 0; // TODO: take from 'fast' computeSilhouette() #if DEBUG_DATA { - for (int i = 0; i < vertexCount; i++) + for (int32_t i = 0; i < vertexCount; i++) { - int vIdx = i % vertexCount; - int v1Idx = (i + 1) % vertexCount; + int32_t vIdx = i % vertexCount; + int32_t v1Idx = (i + 1) % vertexCount; - int v0Corner = getSilhouetteVertex(sil, vIdx); - int v1Corner = getSilhouetteVertex(sil, v1Idx); + int32_t v0Corner = getSilhouetteVertex(sil, vIdx); + int32_t v1Corner = getSilhouetteVertex(sil, v1Idx); // Mark edge as part of silhouette - for (int e = 0; e < 12; e++) + for (int32_t e = 0; e < 12; e++) { - int2 edge = allEdges[e]; + int32_t2 edge = allEdges[e]; if ((edge.x == v0Corner && edge.y == v1Corner) || (edge.x == v1Corner && edge.y == v0Corner)) { @@ -442,16 +429,36 @@ float4 drawSilhouette(uint32_t vertexCount, uint32_t sil, float3 spherePos, floa #endif uint32_t positiveCount = 0; - color += drawSilhouette(vertexCount, sil, spherePos, aaWidth); - setDebugData(sil, region, configIndex); - color += drawHiddenEdges(spherePos, silEdgeMask, aaWidth); - color += drawCorners(spherePos, p, aaWidth); + ClippedSilhouette silhouette; + color += computeSilhouette(vertexCount, sil, spherePos, aaWidth, silhouette); + // Draw clipped silhouette vertices + // color += drawClippedSilhouetteVertices(p, silhouette, aaWidth); + + SamplingData samplingData = buildSamplingDataFromSilhouette(silhouette, pc.samplingMode); + + uint32_t faceIndices[3]; + uint32_t visibleFaceCount = getVisibleFaces(region, faceIndices); + + // For debugging: Draw a small indicator of which faces are found + // color += 
drawVisibleFaceOverlay(spherePos, region, aaWidth); + + // color += drawFaces(spherePos, aaWidth); + + // Draw samples on sphere + color += visualizeSamples(vx.uv, spherePos, silhouette, pc.samplingMode, samplingData, 64); + + // Or draw 2D sample space (in a separate viewport) + // color += visualizePrimarySampleSpace(vx.uv, pc.samplingMode, 64, aaWidth); + + setDebugData(sil, region, configIndex); + // color += drawHiddenEdges(spherePos, silEdgeMask, aaWidth); + color += drawCorners(p, aaWidth); color += drawRing(p, aaWidth); - if (all(vx.uv >= float2(0.49f, 0.49f)) && all(vx.uv <= float2(0.51f, 0.51f))) + if (all(vx.uv >= float32_t2(0.49f, 0.49f)) && all(vx.uv <= float32_t2(0.51f, 0.51f))) { - return float4(colorLUT[configIndex], 1.0f); + return float32_t4(colorLUT[configIndex], 1.0f); } } diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/common.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/common.hlsl index c8532e796..dd0ab2d99 100644 --- a/73_SolidAngleVisualizer/app_resources/hlsl/common.hlsl +++ b/73_SolidAngleVisualizer/app_resources/hlsl/common.hlsl @@ -24,9 +24,11 @@ namespace nbl uint32_t clipCount; uint32_t rotatedSil; uint32_t wrapAround; + uint32_t rotatedClipMask; uint32_t rotateAmount; - uint32_t MoreThanTwoBitTransitions; + uint32_t maxTrianglesExcceded; + uint32_t vertices[6]; }; @@ -34,10 +36,15 @@ namespace nbl { float32_t3x4 modelMatrix; float32_t4 viewport; + uint32_t samplingMode; + uint32_t frameIndex; }; + // Sampling mode enum +#define SAMPLING_MODE_SOLID_ANGLE 0 +#define SAMPLING_MODE_PROJECTED_SOLID_ANGLE 1 static const float32_t3 colorLUT[27] = { - float32_t3(0, 0, 0), float32_t3(1, 1, 1), float32_t3(0.5, 0.5, 0.5), + float32_t3(0, 0, 0), float32_t3(0.5, 0.5, 0.5), float32_t3(1, 0, 0), float32_t3(0, 1, 0), float32_t3(0, 0, 1), float32_t3(1, 1, 0), float32_t3(1, 0, 1), float32_t3(0, 1, 1), float32_t3(1, 0.5, 0), float32_t3(1, 0.65, 0), float32_t3(0.8, 0.4, 0), @@ -45,14 +52,13 @@ namespace nbl float32_t3(0.5, 0, 0.5), 
float32_t3(0.6, 0.4, 0.8), float32_t3(0.3, 0, 0.5), float32_t3(0, 0.5, 0), float32_t3(0.5, 1, 0), float32_t3(0, 0.5, 0.25), float32_t3(0, 0, 0.5), float32_t3(0.3, 0.7, 1), float32_t3(0, 0.4, 0.6), - float32_t3(0.6, 0.4, 0.2), float32_t3(0.8, 0.7, 0.3), float32_t3(0.4, 0.3, 0.1)}; + float32_t3(0.6, 0.4, 0.2), float32_t3(0.8, 0.7, 0.3), float32_t3(0.4, 0.3, 0.1), float32_t3(1, 1, 1)}; #ifndef __HLSL_VERSION - static const char *colorNames[27] = {"Black", - "White", "Gray", "Red", "Green", "Blue", "Yellow", "Magenta", "Cyan", + static const char *colorNames[27] = {"Black", "Gray", "Red", "Green", "Blue", "Yellow", "Magenta", "Cyan", "Orange", "Light Orange", "Dark Orange", "Pink", "Light Pink", "Deep Rose", "Purple", "Light Purple", "Indigo", "Dark Green", "Lime", "Forest Green", "Navy", "Sky Blue", "Teal", "Brown", - "Tan/Beige", "Dark Brown"}; + "Tan/Beige", "Dark Brown", "White"}; #endif // __HLSL_VERSION } } diff --git a/73_SolidAngleVisualizer/include/transform.hpp b/73_SolidAngleVisualizer/include/transform.hpp index 538173223..e1ffcd764 100644 --- a/73_SolidAngleVisualizer/include/transform.hpp +++ b/73_SolidAngleVisualizer/include/transform.hpp @@ -168,18 +168,36 @@ TransformReturnInfo EditTransform(float* cameraView, const float* cameraProjecti // Decompose original matrix nbl::hlsl::float32_t3 translation, rotation, scale; ImGuizmo::DecomposeMatrixToComponents(matrix, &translation.x, &rotation.x, &scale.x); - - float temp[16]; + // Create rotation-only matrix + nbl::hlsl::float32_t4x4 temp; nbl::hlsl::float32_t3 baseTranslation(0.0f); nbl::hlsl::float32_t3 baseScale(1.0f); - ImGuizmo::RecomposeMatrixFromComponents(&baseTranslation.x, &rotation.x, &baseScale.x, temp); - // Manipulate rotation only - ImGuizmo::ViewManipulate(temp, 1.0f, ImVec2(viewManipulateRight - 128, viewManipulateTop), ImVec2(128, 128), 0x10101010); + ImGuizmo::RecomposeMatrixFromComponents(&baseTranslation.x, &rotation.x, &baseScale.x, &temp[0][0]); + temp = nbl::hlsl::transpose(temp); - 
// Extract rotation from manipulated temp - nbl::hlsl::float32_t3 newRot; - ImGuizmo::DecomposeMatrixToComponents(temp, &baseTranslation.x, &newRot.x, &baseScale.x); + // Invert to make it "view-like" + nbl::hlsl::float32_t4x4 tempInv = nbl::hlsl::inverse(temp); + + // Create flip matrix (flip X to fix left/right) + nbl::hlsl::float32_t4x4 flip(1.0f); + flip[0][0] = -1.0f; // Flip X axis + + // Apply flip to the inverted matrix + tempInv = nbl::hlsl::mul(nbl::hlsl::mul(flip, tempInv), flip); + // Manipulate + ImGuizmo::ViewManipulate(&tempInv[0][0], 1.0f, ImVec2(viewManipulateRight - 128, viewManipulateTop), ImVec2(128, 128), 0x10101010); + + // Undo flip (flip is its own inverse, so multiply by flip again) + tempInv = nbl::hlsl::mul(nbl::hlsl::mul(flip, tempInv), flip); + + // Invert back to model space + temp = nbl::hlsl::inverse(tempInv); + temp = nbl::hlsl::transpose(temp); + + // Extract rotation + nbl::hlsl::float32_t3 newRot; + ImGuizmo::DecomposeMatrixToComponents(&temp[0][0], &baseTranslation.x, &newRot.x, &baseScale.x); // Recompose original matrix with new rotation but keep translation & scale ImGuizmo::RecomposeMatrixFromComponents(&translation.x, &newRot.x, &scale.x, matrix); diff --git a/73_SolidAngleVisualizer/main.cpp b/73_SolidAngleVisualizer/main.cpp index 64f4cb100..401ab71b3 100644 --- a/73_SolidAngleVisualizer/main.cpp +++ b/73_SolidAngleVisualizer/main.cpp @@ -420,7 +420,9 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR { PushConstants pc{ .modelMatrix = hlsl::float32_t3x4(hlsl::transpose(interface.m_OBBModelMatrix)), - .viewport = { 0.f,0.f,static_cast(creationParams.width),static_cast(creationParams.height) } + .viewport = { 0.f,0.f,static_cast(creationParams.width),static_cast(creationParams.height) }, + .samplingMode = m_samplingMode, + .frameIndex = m_frameSeeding ? 
static_cast(m_realFrameIx) : 0u }; auto pipeline = m_visualizationPipeline; cb->bindGraphicsPipeline(pipeline.get()); @@ -794,6 +796,8 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR // we create the Descriptor Set with a few slots extra to spare, so we don't have to `waitIdle` the device whenever ImGUI virtual window resizes constexpr static inline auto MaxImGUITextures = 2u + MaxFramesInFlight; + static inline uint32_t m_samplingMode = SAMPLING_MODE_SOLID_ANGLE; + static inline bool m_frameSeeding = true; static inline ResultData m_GPUOutResulData; // smart_refctd_ptr m_scene; @@ -855,13 +859,20 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR ImGui::SetNextWindowSize(ImVec2(320, 340), ImGuiCond_Appearing); ImGui::Begin("Editor"); - //if (ImGui::RadioButton("Full view", !transformParams.useWindow)) - // transformParams.useWindow = false; + ImGui::Text("Sampling Mode: "); + ImGui::SameLine(); + + if (ImGui::RadioButton("Solid Angle", m_samplingMode == SAMPLING_MODE_SOLID_ANGLE)) // active state is tested against the same named constant that is assigned + m_samplingMode = SAMPLING_MODE_SOLID_ANGLE; + + ImGui::SameLine(); + + if (ImGui::RadioButton("Projected Solid Angle", m_samplingMode == SAMPLING_MODE_PROJECTED_SOLID_ANGLE)) + m_samplingMode = SAMPLING_MODE_PROJECTED_SOLID_ANGLE; - //ImGui::SameLine(); + ImGui::Checkbox("Frame seeding", &m_frameSeeding); - //if (ImGui::RadioButton("Window", transformParams.useWindow)) - // transformParams.useWindow = true; + ImGui::Separator(); ImGui::Text("Camera"); @@ -1108,7 +1119,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR ImGui::Text("silhouette Vertex Count: %u", m_GPUOutResulData.silhouetteVertexCount); ImGui::Text("silhouette Positive VertexCount: %u", m_GPUOutResulData.positiveVertCount); ImGui::Text("Silhouette Mismatch: %s", m_GPUOutResulData.edgeVisibilityMismatch ? "true" : "false"); - ImGui::Text("More Than Two Bit Transitions: %s", m_GPUOutResulData.MoreThanTwoBitTransitions ? 
"true" : "false"); + ImGui::Text("Max Triangles Exceeded: %s", m_GPUOutResulData.maxTrianglesExcceded ? "true" : "false"); { float32_t3 xAxis = m_OBBModelMatrix[0].xyz; @@ -1136,12 +1147,12 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR lastSilhouetteIndex = m_GPUOutResulData.silhouetteIndex; } - if (!m_GPUOutResulData.edgeVisibilityMismatch || !m_GPUOutResulData.MoreThanTwoBitTransitions) + if (!m_GPUOutResulData.edgeVisibilityMismatch && !m_GPUOutResulData.maxTrianglesExcceded) // re-arm only once EVERY warning condition has cleared, otherwise a single active flag re-opens the popup each frame { // Reset flag when mismatch is cleared modalShown = false; } - if ((m_GPUOutResulData.edgeVisibilityMismatch || m_GPUOutResulData.MoreThanTwoBitTransitions) && m_GPUOutResulData.silhouetteIndex != 13 && !modalShown) // 13 means we're inside the cube, so don't care + if ((m_GPUOutResulData.edgeVisibilityMismatch || m_GPUOutResulData.maxTrianglesExcceded) && m_GPUOutResulData.silhouetteIndex != 13 && !modalShown) // 13 means we're inside the cube, so don't care { // Open modal popup only once per configuration ImGui::OpenPopup("Edge Visibility Mismatch Warning"); From 15e4d5d044d0b682279fcce5486a841e1f3d3541 Mon Sep 17 00:00:00 2001 From: Karim Mohamed Date: Wed, 7 Jan 2026 00:20:02 +0300 Subject: [PATCH 15/26] added benchmark code for sampling, visualization of rays in 3D view, added NSC compile rules (for benchmark only for now) --- 73_SolidAngleVisualizer/CMakeLists.txt | 78 ++- .../app_resources/hlsl/Drawing.hlsl | 327 ++++++----- .../app_resources/hlsl/RayVis.frag.hlsl | 221 ++++++++ .../app_resources/hlsl/Sampling.hlsl | 193 +++++-- .../hlsl/SolidAngleVis.frag.hlsl | 465 +++------------- .../hlsl/benchmark/benchmark.comp.hlsl | 45 ++ .../app_resources/hlsl/benchmark/common.hlsl | 23 + .../app_resources/hlsl/common.hlsl | 29 +- .../app_resources/hlsl/gpu_common.hlsl | 168 ++++++ .../app_resources/hlsl/silhouette.hlsl | 164 ++++++ .../app_resources/hlsl/utils.hlsl | 19 + 73_SolidAngleVisualizer/main.cpp | 527 +++++++++++++++--- 12 files 
changed, 1603 insertions(+), 656 deletions(-) create mode 100644 73_SolidAngleVisualizer/app_resources/hlsl/RayVis.frag.hlsl create mode 100644 73_SolidAngleVisualizer/app_resources/hlsl/benchmark/benchmark.comp.hlsl create mode 100644 73_SolidAngleVisualizer/app_resources/hlsl/benchmark/common.hlsl create mode 100644 73_SolidAngleVisualizer/app_resources/hlsl/gpu_common.hlsl create mode 100644 73_SolidAngleVisualizer/app_resources/hlsl/silhouette.hlsl diff --git a/73_SolidAngleVisualizer/CMakeLists.txt b/73_SolidAngleVisualizer/CMakeLists.txt index 5d0021f61..f1701829f 100644 --- a/73_SolidAngleVisualizer/CMakeLists.txt +++ b/73_SolidAngleVisualizer/CMakeLists.txt @@ -7,14 +7,88 @@ if(NBL_BUILD_IMGUI) "${CMAKE_CURRENT_SOURCE_DIR}/include" ) - list(APPEND NBL_LIBRARIES + list(APPEND NBL_LIBRARIES imtestengine imguizmo "${NBL_EXT_IMGUI_UI_LIB}" ) - + + if(NBL_EMBED_BUILTIN_RESOURCES) + set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData) + set(RESOURCE_DIR "app_resources") + + get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE) + + file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*") + + foreach(RES_FILE ${BUILTIN_RESOURCE_FILES}) + LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}") + endforeach() + + ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}") + + LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_}) + endif() + # TODO; Arek I removed `NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET` from the last parameter here, doesn't this macro have 4 arguments anyway !? 
nbl_create_executable_project("${NBL_EXTRA_SOURCES}" "" "${NBL_INCLUDE_SERACH_DIRECTORIES}" "${NBL_LIBRARIES}") + # TODO: Arek temporarily disabled cause I haven't figured out how to make this target yet # LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} nblExamplesGeometrySpirvBRD) + set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/auto-gen") + set(DEPENDS + app_resources/hlsl/common.hlsl + app_resources/hlsl/gpu_common.hlsl + app_resources/hlsl/Drawing.hlsl + app_resources/hlsl/Sampling.hlsl + app_resources/hlsl/Sampling.hlsl + app_resources/hlsl/silhouette.hlsl + app_resources/hlsl/utils.hlsl + + # app_resources/hlsl/test.comp.hlsl + app_resources/hlsl/benchmark/benchmark.comp.hlsl + app_resources/hlsl/benchmark/common.hlsl + ) + target_sources(${EXECUTABLE_NAME} PRIVATE ${DEPENDS}) + set_source_files_properties(${DEPENDS} PROPERTIES HEADER_FILE_ONLY ON) + + set(SM 6_8) + set(JSON [=[ + [ + + { + "INPUT": "app_resources/hlsl/benchmark/benchmark.comp.hlsl", + "KEY": "benchmark", + }, + ] + ]=]) + string(CONFIGURE "${JSON}" JSON) + + set(COMPILE_OPTIONS + -I "${CMAKE_CURRENT_SOURCE_DIR}" + -T lib_${SM} + ) + + NBL_CREATE_NSC_COMPILE_RULES( + TARGET ${EXECUTABLE_NAME}SPIRV + LINK_TO ${EXECUTABLE_NAME} + DEPENDS ${DEPENDS} + BINARY_DIR ${OUTPUT_DIRECTORY} + MOUNT_POINT_DEFINE NBL_THIS_EXAMPLE_BUILD_MOUNT_POINT + COMMON_OPTIONS ${COMPILE_OPTIONS} + OUTPUT_VAR KEYS + INCLUDE nbl/this_example/builtin/build/spirv/keys.hpp + NAMESPACE nbl::this_example::builtin::build + INPUTS ${JSON} + ) + + NBL_CREATE_RESOURCE_ARCHIVE( + NAMESPACE nbl::this_example::builtin::build + TARGET ${EXECUTABLE_NAME}_builtinsBuild + LINK_TO ${EXECUTABLE_NAME} + BIND ${OUTPUT_DIRECTORY} + BUILTINS ${KEYS} + ) endif() \ No newline at end of file diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/Drawing.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/Drawing.hlsl index 89dfd4ae6..1a2962c78 100644 --- a/73_SolidAngleVisualizer/app_resources/hlsl/Drawing.hlsl +++ 
b/73_SolidAngleVisualizer/app_resources/hlsl/Drawing.hlsl @@ -1,72 +1,86 @@ #ifndef _DEBUG_HLSL_ #define _DEBUG_HLSL_ + #include "common.hlsl" +#include "gpu_common.hlsl" -float2 sphereToCircle(float3 spherePoint) +#if DEBUG_DATA +// Check if a face on the hemisphere is visible from camera at origin +bool isFaceVisible(float32_t3 faceCenter, float32_t3 faceNormal) +{ + float32_t3 viewVec = normalize(-faceCenter); // Vector from camera to face + return dot(faceNormal, viewVec) > 0.0f; +} +#endif // DEBUG_DATA + +#if VISUALIZE_SAMPLES + +// doesn't change Z coordinate +float32_t3 sphereToCircle(float32_t3 spherePoint) { if (spherePoint.z >= 0.0f) { - return spherePoint.xy * CIRCLE_RADIUS; + return float32_t3(spherePoint.xy * CIRCLE_RADIUS, spherePoint.z); } else { - float r2 = (1.0f - spherePoint.z) / (1.0f + spherePoint.z); - float uv2Plus1 = r2 + 1.0f; - return (spherePoint.xy * uv2Plus1 / 2.0f) * CIRCLE_RADIUS; + float32_t r2 = (1.0f - spherePoint.z) / (1.0f + spherePoint.z); + float32_t uv2Plus1 = r2 + 1.0f; + return float32_t3((spherePoint.xy * uv2Plus1 / 2.0f) * CIRCLE_RADIUS, spherePoint.z); } } -float drawGreatCircleArc(float3 fragPos, float3 points[2], float aaWidth, float width = 0.01f) +float32_t drawGreatCircleArc(float32_t3 fragPos, float32_t3 points[2], float32_t aaWidth, float32_t width = 0.01f) { - float3 v0 = normalize(points[0]); - float3 v1 = normalize(points[1]); - float3 p = normalize(fragPos); + float32_t3 v0 = normalize(points[0]); + float32_t3 v1 = normalize(points[1]); + float32_t3 ndc = normalize(fragPos); - float3 arcNormal = normalize(cross(v0, v1)); - float dist = abs(dot(p, arcNormal)); + float32_t3 arcNormal = normalize(cross(v0, v1)); + float32_t dist = abs(dot(ndc, arcNormal)); - float dotMid = dot(v0, v1); - bool onArc = (dot(p, v0) >= dotMid) && (dot(p, v1) >= dotMid); + float32_t dotMid = dot(v0, v1); + bool onArc = (dot(ndc, v0) >= dotMid) && (dot(ndc, v1) >= dotMid); if (!onArc) return 0.0f; - float avgDepth = (length(points[0]) 
+ length(points[1])) * 0.5f; - float depthScale = 3.0f / avgDepth; + float32_t avgDepth = (length(points[0]) + length(points[1])) * 0.5f; + float32_t depthScale = 3.0f / avgDepth; width = min(width * depthScale, 0.02f); - float alpha = 1.0f - smoothstep(width - aaWidth, width + aaWidth, dist); + float32_t alpha = 1.0f - smoothstep(width - aaWidth, width + aaWidth, dist); return alpha; } -float drawCross2D(float2 fragPos, float2 center, float size, float thickness) +float32_t drawCross2D(float32_t2 fragPos, float32_t2 center, float32_t size, float32_t thickness) { - float2 p = abs(fragPos - center); + float32_t2 ndc = abs(fragPos - center); // Check if point is inside the cross (horizontal or vertical bar) - bool inHorizontal = (p.x <= size && p.y <= thickness); - bool inVertical = (p.y <= size && p.x <= thickness); + bool inHorizontal = (ndc.x <= size && ndc.y <= thickness); + bool inVertical = (ndc.y <= size && ndc.x <= thickness); return (inHorizontal || inVertical) ? 1.0f : 0.0f; } -float4 drawHiddenEdges(float3 spherePos, uint32_t silEdgeMask, float aaWidth) +float32_t4 drawHiddenEdges(float32_t3x4 modelMatrix, float32_t3 spherePos, uint32_t silEdgeMask, float32_t aaWidth) { - float4 color = 0; - float3 hiddenEdgeColor = float3(0.1, 0.1, 0.1); + float32_t4 color = 0; + float32_t3 hiddenEdgeColor = float32_t3(0.1, 0.1, 0.1); NBL_UNROLL - for (int32_t i = 0; i < 12; i++) + for (uint32_t i = 0; i < 12; i++) { // skip silhouette edges if (silEdgeMask & (1u << i)) continue; - int2 edge = allEdges[i]; + uint32_t2 edge = allEdges[i]; - float3 v0 = normalize(getVertex(edge.x)); - float3 v1 = normalize(getVertex(edge.y)); + float32_t3 v0 = normalize(getVertex(modelMatrix, edge.x)); + float32_t3 v1 = normalize(getVertex(modelMatrix, edge.y)); bool neg0 = v0.z < 0.0f; bool neg1 = v1.z < 0.0f; @@ -75,107 +89,163 @@ float4 drawHiddenEdges(float3 spherePos, uint32_t silEdgeMask, float aaWidth) if (neg0 && neg1) continue; - float3 p0 = v0; - float3 p1 = v1; + float32_t3 p0 = 
v0; + float32_t3 p1 = v1; // clip if needed if (neg0 ^ neg1) { - float t = v0.z / (v0.z - v1.z); - float3 clip = normalize(lerp(v0, v1, t)); + float32_t t = v0.z / (v0.z - v1.z); + float32_t3 clip = normalize(lerp(v0, v1, t)); p0 = neg0 ? clip : v0; p1 = neg1 ? clip : v1; } - float3 pts[2] = {p0, p1}; - float4 c = drawGreatCircleArc(spherePos, pts, aaWidth, 0.005f); - color += float4(hiddenEdgeColor * c.a, c.a); + float32_t3 pts[2] = {p0, p1}; + float32_t4 c = drawGreatCircleArc(spherePos, pts, aaWidth, 0.005f); + color += float32_t4(hiddenEdgeColor * c.a, c.a); } return color; } -float4 drawCorners(float2 p, float aaWidth) +float32_t4 drawCorner(float32_t3 cornerNDCPos, float32_t2 ndc, float32_t aaWidth, float32_t dotSize, float32_t innerDotSize, float32_t3 dotColor) { - float4 color = 0; - - float dotSize = 0.02f; - float innerDotSize = dotSize * 0.5f; - - for (int32_t i = 0; i < 8; i++) - { - float3 corner3D = normalize(getVertex(i)); - float2 cornerPos = sphereToCircle(corner3D); + float32_t4 color = float32_t4(0, 0, 0, 0); + float32_t dist = length(ndc - cornerNDCPos.xy); - float dist = length(p - cornerPos); - - // outer dot - float outerAlpha = 1.0f - smoothstep(dotSize - aaWidth, + // outer dot + float32_t outerAlpha = 1.0f - smoothstep(dotSize - aaWidth, dotSize + aaWidth, dist); - if (outerAlpha <= 0.0f) - continue; + if (outerAlpha <= 0.0f) + return color; - float3 dotColor = colorLUT[i]; - color += float4(dotColor * outerAlpha, outerAlpha); + color += float32_t4(dotColor * outerAlpha, outerAlpha); - // ------------------------------------------------- - // inner black dot for hidden corners - // ------------------------------------------------- - if (corner3D.z < 0.0f) - { - float innerAlpha = 1.0f - smoothstep(innerDotSize - aaWidth, + // ------------------------------------------------- + // inner black dot for hidden corners + // ------------------------------------------------- + if (cornerNDCPos.z < 0.0f) + { + float32_t innerAlpha = 1.0f - 
smoothstep(innerDotSize - aaWidth, innerDotSize + aaWidth, dist); - // ensure it stays inside the outer dot - innerAlpha *= outerAlpha; + // ensure it stays inside the outer dot + innerAlpha *= outerAlpha; - float3 innerColor = float3(0.0, 0.0, 0.0); - color -= float4(innerAlpha.xxx, 0.0f); - } + color -= float32_t4(innerAlpha.xxx, 0.0f); } return color; } -float4 drawClippedSilhouetteVertices(float2 p, ClippedSilhouette silhouette, float aaWidth) +// Draw a line segment in NDC space +float32_t lineSegment(float32_t2 ndc, float32_t2 a, float32_t2 b, float32_t thickness) { - float4 color = 0; - float dotSize = 0.03f; + float32_t2 pa = ndc - a; + float32_t2 ba = b - a; + float32_t h = saturate(dot(pa, ba) / dot(ba, ba)); + float32_t dist = length(pa - ba * h); + return smoothstep(thickness, thickness * 0.5, dist); +} - for (uint i = 0; i < silhouette.count; i++) +// Draw an arrow head (triangle) in NDC space +float32_t arrowHead(float32_t2 ndc, float32_t2 tip, float32_t2 direction, float32_t size) +{ + // Create perpendicular vector + float32_t2 perp = float32_t2(-direction.y, direction.x); + + // Three points of the arrow head triangle + float32_t2 p1 = tip; + float32_t2 p2 = tip - direction * size + perp * size * 0.5; + float32_t2 p3 = tip - direction * size - perp * size * 0.5; + + // Check if point is inside triangle using barycentric coordinates + float32_t2 v0 = p3 - p1; + float32_t2 v1 = p2 - p1; + float32_t2 v2 = ndc - p1; + + float32_t dot00 = dot(v0, v0); + float32_t dot01 = dot(v0, v1); + float32_t dot02 = dot(v0, v2); + float32_t dot11 = dot(v1, v1); + float32_t dot12 = dot(v1, v2); + + float32_t invDenom = 1.0 / (dot00 * dot11 - dot01 * dot01); + float32_t u = (dot11 * dot02 - dot01 * dot12) * invDenom; + float32_t v = (dot00 * dot12 - dot01 * dot02) * invDenom; + + bool inside = (u >= 0.0) && (v >= 0.0) && (u + v <= 1.0); + + // Add some antialiasing + float32_t minDist = min(min( + length(ndc - p1), + length(ndc - p2)), + length(ndc - p3)); + + return 
inside ? 1.0 : smoothstep(0.02, 0.0, minDist); +} + +// Helper to draw an edge with proper color mapping +float32_t4 drawEdge(uint32_t originalEdgeIdx, float32_t3 pts[2], float32_t3 spherePos, float32_t aaWidth, float32_t width = 0.01f) +{ + float32_t4 edgeContribution = drawGreatCircleArc(spherePos, pts, aaWidth, width); + return float32_t4(colorLUT[originalEdgeIdx] * edgeContribution.a, edgeContribution.a); +}; + +float32_t4 drawCorners(float32_t3x4 modelMatrix, float32_t2 ndc, float32_t aaWidth) +{ + float32_t4 color = float32_t4(0, 0, 0, 0); + + float32_t dotSize = 0.02f; + float32_t innerDotSize = dotSize * 0.5f; + + for (uint32_t i = 0; i < 8; i++) { - float3 corner3D = normalize(silhouette.vertices[i]); - float2 cornerPos = sphereToCircle(corner3D); + float32_t3 cornerCirclePos = sphereToCircle(normalize(getVertex(modelMatrix, i))); + color += drawCorner(cornerCirclePos, ndc, aaWidth, dotSize, innerDotSize, colorLUT[i]); + } - float dist = length(p - cornerPos); + return color; +} + +float32_t4 drawClippedSilhouetteVertices(float32_t2 ndc, ClippedSilhouette silhouette, float32_t aaWidth) +{ + float32_t4 color = 0; + float32_t dotSize = 0.03f; + + for (uint i = 0; i < silhouette.count; i++) + { + float32_t3 cornerCirclePos = sphereToCircle(normalize(silhouette.vertices[i])); + float32_t dist = length(ndc - cornerCirclePos.xy); // Smooth circle for the vertex - float alpha = 1.0f - smoothstep(dotSize * 0.8f, dotSize, dist); + float32_t alpha = 1.0f - smoothstep(dotSize * 0.8f, dotSize, dist); if (alpha > 0.0f) { // Color gradient: Red (index 0) to Cyan (last index) // This helps verify the CCW winding order visually - float t = float(i) / float(max(1u, silhouette.count - 1)); - float3 vertexColor = lerp(float3(1, 0, 0), float3(0, 1, 1), t); + float32_t t = float32_t(i) / float32_t(max(1u, silhouette.count - 1)); + float32_t3 vertexColor = lerp(float32_t3(1, 0, 0), float32_t3(0, 1, 1), t); - color += float4(vertexColor * alpha, alpha); + color += 
float32_t4(vertexColor * alpha, alpha); } } return color; } -float4 drawRing(float2 p, float aaWidth) +float32_t4 drawRing(float32_t2 ndc, float32_t aaWidth) { - float positionLength = length(p); - float ringWidth = 0.003f; - float ringDistance = abs(positionLength - CIRCLE_RADIUS); - float ringAlpha = 1.0f - smoothstep(ringWidth - aaWidth, ringWidth + aaWidth, ringDistance); - return ringAlpha * float4(1, 1, 1, 1); + float32_t positionLength = length(ndc); + float32_t ringWidth = 0.003f; + float32_t ringDistance = abs(positionLength - CIRCLE_RADIUS); + float32_t ringAlpha = 1.0f - smoothstep(ringWidth - aaWidth, ringWidth + aaWidth, ringDistance); + return ringAlpha * float32_t4(1, 1, 1, 1); } // Returns the number of visible faces and populates the faceIndices array @@ -204,78 +274,72 @@ uint getVisibleFaces(int3 region, out uint faceIndices[3]) return count; } -float4 drawVisibleFaceOverlay(float3 spherePos, int3 region, float aaWidth) +float32_t4 drawVisibleFaceOverlay(float32_t3x4 modelMatrix, float32_t3 spherePos, int3 region, float32_t aaWidth) { uint faceIndices[3]; uint count = getVisibleFaces(region, faceIndices); - float4 color = 0; + + float32_t4 color = 0; for (uint i = 0; i < count; i++) { uint fIdx = faceIndices[i]; - float3 n = localNormals[fIdx]; + float32_t3 n = localNormals[fIdx]; // Transform normal to world space (using the same logic as your corners) - float3 worldNormal = -normalize(mul((float3x3)pc.modelMatrix, n)); + float32_t3 worldNormal = -normalize(mul((float3x3)modelMatrix, n)); worldNormal.z = -worldNormal.z; // Invert Z for correct orientation // Very basic visualization: highlight if the sphere position // is generally pointing towards that face's normal - float alignment = dot(spherePos, worldNormal); + float32_t alignment = dot(spherePos, worldNormal); if (alignment > 0.95f) { // Use different colors for different face indices - color += float4(colorLUT[fIdx % 24], 0.5f); + color += float32_t4(colorLUT[fIdx % 24], 0.5f); } } 
return color; } -// Check if a face on the hemisphere is visible from camera at origin -bool isFaceVisible(float3 faceCenter, float3 faceNormal) +float32_t4 drawFaces(float32_t3x4 modelMatrix, float32_t3 spherePos, float32_t aaWidth) { - float3 viewVec = normalize(-faceCenter); // Vector from camera to face - return dot(faceNormal, viewVec) > 0.0f; -} + float32_t4 color = 0.0f; + float32_t3 ndc = normalize(spherePos); -float4 drawFaces(float3 spherePos, float aaWidth) -{ - float4 color = 0.0f; - float3 p = normalize(spherePos); - - float3x3 rotMatrix = (float3x3)pc.modelMatrix; + float3x3 rotMatrix = (float3x3)modelMatrix; // Check each of the 6 faces - for (int32_t faceIdx = 0; faceIdx < 6; faceIdx++) + for (uint32_t faceIdx = 0; faceIdx < 6; faceIdx++) { - float3 n_world = mul(rotMatrix, localNormals[faceIdx]); + float32_t3 n_world = mul(rotMatrix, localNormals[faceIdx]); // Check if face is visible if (!isFaceVisible(faceCenters[faceIdx], n_world)) continue; // Get the 4 corners of this face - float3 faceVerts[4]; - for (int32_t i = 0; i < 4; i++) + float32_t3 faceVerts[4]; + for (uint32_t i = 0; i < 4; i++) { - int32_t cornerIdx = faceToCorners[faceIdx][i]; - faceVerts[i] = normalize(getVertex(cornerIdx)); + uint32_t cornerIdx = faceToCorners[faceIdx][i]; + faceVerts[i] = normalize(getVertex(modelMatrix, cornerIdx)); } // Compute face center for winding - float3 faceCenter = float3(0, 0, 0); - for (int32_t i = 0; i < 4; i++) + float32_t3 faceCenter = float32_t3(0, 0, 0); + for (uint32_t i = 0; i < 4; i++) faceCenter += faceVerts[i]; faceCenter = normalize(faceCenter); // Check if point is inside this face bool isInside = true; - float minDist = 1e10; + float32_t minDist = 1e10; - for (int32_t i = 0; i < 4; i++) + for (uint32_t i = 0; i < 4; i++) { - float3 v0 = faceVerts[i]; - float3 v1 = faceVerts[(i + 1) % 4]; + float32_t3 v0 = faceVerts[i]; + float32_t3 v1 = faceVerts[(i + 1) % 4]; // Skip edges behind camera if (v0.z < 0.0f && v1.z < 0.0f) @@ -285,13 
+349,13 @@ float4 drawFaces(float3 spherePos, float aaWidth) } // Great circle normal - float3 edgeNormal = normalize(cross(v0, v1)); + float32_t3 edgeNormal = normalize(cross(v0, v1)); // Ensure normal points inward if (dot(edgeNormal, faceCenter) < 0.0f) edgeNormal = -edgeNormal; - float d = dot(p, edgeNormal); + float32_t d = dot(ndc, edgeNormal); if (d < -1e-6f) { @@ -304,25 +368,29 @@ float4 drawFaces(float3 spherePos, float aaWidth) if (isInside) { - float alpha = smoothstep(0.0f, aaWidth * 2.0f, minDist); + float32_t alpha = smoothstep(0.0f, aaWidth * 2.0f, minDist); // Use colorLUT based on face index (0-5) - float3 faceColor = colorLUT[faceIdx]; + float32_t3 faceColor = colorLUT[faceIdx]; - float shading = saturate(p.z * 0.8f + 0.2f); - color += float4(faceColor * shading * alpha, alpha); + float32_t shading = saturate(ndc.z * 0.8f + 0.2f); + color += float32_t4(faceColor * shading * alpha, alpha); } } return color; } -int32_t getEdgeVisibility(int32_t edgeIdx) +#endif // VISUALIZE_SAMPLES + +#if DEBUG_DATA + +uint32_t getEdgeVisibility(float32_t3x4 modelMatrix, uint32_t edgeIdx) { // Adjacency of edges to faces // Corrected Adjacency of edges to faces - static const int2 edgeToFaces[12] = { + static const uint32_t2 edgeToFaces[12] = { // Edge Index: | allEdges[i] | Shared Faces: /* 0 (0-1) */ {4, 0}, // Y- (4) and Z- (0) @@ -341,12 +409,12 @@ int32_t getEdgeVisibility(int32_t edgeIdx) /* 11 (3-7) */ {3, 5} // X+ (3) and Y+ (5) }; - int2 faces = edgeToFaces[edgeIdx]; + uint32_t2 faces = edgeToFaces[edgeIdx]; // Transform normals to world space - float3x3 rotMatrix = (float3x3)pc.modelMatrix; - float3 n_world_f1 = mul(rotMatrix, localNormals[faces.x]); - float3 n_world_f2 = mul(rotMatrix, localNormals[faces.y]); + float3x3 rotMatrix = (float3x3)modelMatrix; + float32_t3 n_world_f1 = mul(rotMatrix, localNormals[faces.x]); + float32_t3 n_world_f2 = mul(rotMatrix, localNormals[faces.y]); bool visible1 = isFaceVisible(faceCenters[faces.x], n_world_f1); bool 
visible2 = isFaceVisible(faceCenters[faces.y], n_world_f2); @@ -363,15 +431,14 @@ int32_t getEdgeVisibility(int32_t edgeIdx) return 0; } -#if DEBUG_DATA -uint32_t computeGroundTruthEdgeMask() +uint32_t computeGroundTruthEdgeMask(float32_t3x4 modelMatrix) { uint32_t mask = 0u; NBL_UNROLL - for (int32_t j = 0; j < 12; j++) + for (uint32_t j = 0; j < 12; j++) { // getEdgeVisibility returns 1 for a silhouette edge based on 3D geometry - if (getEdgeVisibility(j) == 1) + if (getEdgeVisibility(modelMatrix, j) == 1) { mask |= (1u << j); } @@ -379,12 +446,12 @@ uint32_t computeGroundTruthEdgeMask() return mask; } -void validateEdgeVisibility(uint32_t sil, int32_t vertexCount, uint32_t generatedSilMask) +void validateEdgeVisibility(float32_t3x4 modelMatrix, uint32_t sil, uint32_t vertexCount, uint32_t generatedSilMask) { uint32_t mismatchAccumulator = 0; // The Ground Truth now represents the full 3D silhouette, clipped or not. - uint32_t groundTruthMask = computeGroundTruthEdgeMask(); + uint32_t groundTruthMask = computeGroundTruthEdgeMask(modelMatrix); // The comparison checks if the generated mask perfectly matches the full 3D ground truth. 
uint32_t mismatchMask = groundTruthMask ^ generatedSilMask; @@ -392,11 +459,11 @@ void validateEdgeVisibility(uint32_t sil, int32_t vertexCount, uint32_t generate if (mismatchMask != 0) { NBL_UNROLL - for (int32_t j = 0; j < 12; j++) + for (uint32_t j = 0; j < 12; j++) { if ((mismatchMask >> j) & 1u) { - int2 edge = allEdges[j]; + uint32_t2 edge = allEdges[j]; // Accumulate vertex indices where error occurred mismatchAccumulator |= (1u << edge.x) | (1u << edge.y); } @@ -406,6 +473,6 @@ void validateEdgeVisibility(uint32_t sil, int32_t vertexCount, uint32_t generate // Simple Write (assuming all fragments calculate the same result) InterlockedOr(DebugDataBuffer[0].edgeVisibilityMismatch, mismatchAccumulator); } -#endif +#endif // DEBUG_DATA #endif // _DEBUG_HLSL_ diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/RayVis.frag.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/RayVis.frag.hlsl new file mode 100644 index 000000000..2b4d7e3ef --- /dev/null +++ b/73_SolidAngleVisualizer/app_resources/hlsl/RayVis.frag.hlsl @@ -0,0 +1,221 @@ +#pragma wave shader_stage(fragment) + +#include "common.hlsl" +#include +#include "utils.hlsl" + +using namespace nbl::hlsl; +using namespace ext::FullScreenTriangle; + +[[vk::push_constant]] struct PushConstantRayVis pc; +[[vk::binding(0, 0)]] RWStructuredBuffer DebugDataBuffer; +#define VISUALIZE_SAMPLES 1 +#include "Drawing.hlsl" + +// Ray-AABB intersection in world space +// Returns the distance to the nearest intersection point, or -1 if no hit +float32_t rayAABBIntersection(float32_t3 rayOrigin, float32_t3 rayDir, float32_t3 aabbMin, float32_t3 aabbMax) +{ + float32_t3 invDir = 1.0 / rayDir; + float32_t3 t0 = (aabbMin - rayOrigin) * invDir; + float32_t3 t1 = (aabbMax - rayOrigin) * invDir; + + float32_t3 tmin = min(t0, t1); + float32_t3 tmax = max(t0, t1); + + float32_t tNear = max(max(tmin.x, tmin.y), tmin.z); + float32_t tFar = min(min(tmax.x, tmax.y), tmax.z); + + // Check if ray intersects AABB + if (tNear > tFar || tFar 
< 0.0) + return -1.0; + + // Return the nearest positive intersection + return tNear >= 0.0 ? tNear : tFar; +} + +// Project 3D point to NDC space +float32_t2 projectToNDC(float32_t3 worldPos, float32_t4x4 viewProj, float32_t aspect) +{ + float32_t4 clipPos = mul(viewProj, float32_t4(worldPos, 1.0)); + clipPos /= clipPos.w; + + // Apply aspect ratio correction + clipPos.x *= aspect; + + return clipPos.xy; +} + +// Visualizes a ray as an arrow from origin in NDC space +// Returns color (rgb), intensity (a), and depth (in extra component) +struct ArrowResult +{ + float32_t4 color : SV_Target0; + float32_t depth : SV_Depth; +}; + +ArrowResult visualizeRayAsArrow(float32_t3 rayOrigin, float32_t4 directionAndPdf, float32_t arrowLength, float32_t2 ndcPos, float32_t aspect) +{ + ArrowResult result; + result.color = float32_t4(0, 0, 0, 0); + result.depth = 0.0; + + float32_t3 rayDir = normalize(directionAndPdf.xyz); + float32_t pdf = directionAndPdf.w; + + float32_t3 rayEnd = rayOrigin + rayDir * arrowLength; + + // Project start and end points to NDC space + float32_t2 ndcStart = projectToNDC(rayOrigin, pc.viewProjMatrix, aspect); + float32_t2 ndcEnd = projectToNDC(rayEnd, pc.viewProjMatrix, aspect); + + // Get clip space positions + float32_t4 clipStart = mul(pc.viewProjMatrix, float32_t4(rayOrigin, 1.0)); + float32_t4 clipEnd = mul(pc.viewProjMatrix, float32_t4(rayEnd, 1.0)); + + // Calculate arrow properties in NDC space + float32_t arrowNDCLength = length(ndcEnd - ndcStart); + + // Skip if arrow is too small on screen (in NDC units) + if (arrowNDCLength < 0.01) + return result; + + // Calculate the parametric position along the arrow shaft IN NDC + float32_t2 pa = ndcPos - ndcStart; + float32_t2 ba = ndcEnd - ndcStart; + float32_t t_ndc = saturate(dot(pa, ba) / dot(ba, ba)); + + // Draw line shaft + float32_t lineThickness = 0.002; + float32_t lineIntensity = lineSegment(ndcPos, ndcStart, ndcEnd, lineThickness); + + // Calculate depth at this pixel's position along 
the arrow + if (lineIntensity > 0.0) + { + // Interpolate in CLIP space for perspective-correct depth + float32_t4 clipPos = lerp(clipStart, clipEnd, t_ndc); + float32_t depthNDC = clipPos.z / clipPos.w; + + // Convert to reversed depth [0,1] -> [1,0] + result.depth = 1.0 - depthNDC; + + // Clip against depth range (like hardware would) + // In reversed depth: near=1.0, far=0.0 + if (result.depth < 0.0 || result.depth > 1.0) + { + lineIntensity = 0.0; // Outside depth range, clip it + } + } + + // Modulate by PDF + float32_t pdfIntensity = saturate(pdf * 0.5); + + float32_t3 finalColor = pdfIntensity; + + result.color = float32_t4(finalColor, lineIntensity); + return result; +} + +// Transform a point by inverse of model matrix (world to local space) +float32_t3 worldToLocal(float32_t3 worldPos, float32_t3x4 modelMatrix) +{ + // Manually construct 4x4 from 3x4 + float32_t4x4 model4x4 = float32_t4x4( + modelMatrix[0], + modelMatrix[1], + modelMatrix[2], + float32_t4(0.0, 0.0, 0.0, 1.0)); + float32_t4x4 invModel = inverse(model4x4); + return mul(invModel, float32_t4(worldPos, 1.0)).xyz; +} + +// Transform a direction by inverse of model matrix (no translation) +float32_t3 worldToLocalDir(float32_t3 worldDir, float32_t3x4 modelMatrix) +{ + // Manually construct 4x4 from 3x4 + float32_t4x4 model4x4 = float32_t4x4( + modelMatrix[0], + modelMatrix[1], + modelMatrix[2], + float32_t4(0.0, 0.0, 0.0, 1.0)); + float32_t4x4 invModel = inverse(model4x4); + return mul(invModel, float32_t4(worldDir, 0.0)).xyz; +} +[[vk::location(0)]] ArrowResult main(SVertexAttributes vx) +{ + ArrowResult output; + output.color = float32_t4(0.0, 0.0, 0.0, 0.0); + output.depth = 0.0; // Default to far plane in reversed depth + float32_t maxDepth = 0.0; // Track the closest depth (maximum in reversed depth) + + // Convert to NDC space with aspect ratio correction + float32_t2 ndcPos = vx.uv * 2.0f - 1.0f; + float32_t aspect = pc.viewport.z / pc.viewport.w; + ndcPos.x *= aspect; + + // Draw clipped 
silhouett vertices using drawCorners() + for (uint32_t v = 0; v < DebugDataBuffer[0].clippedSilhouetteVertexCount; v++) + { + float32_t4 clipPos = mul(pc.viewProjMatrix, float32_t4(DebugDataBuffer[0].clippedSilhouetteVertices[v], 1.0)); + float32_t3 ndcPosVertex = clipPos.xyz / clipPos.w; // Perspective divide to get NDC + + float32_t4 intensity = drawCorner(ndcPosVertex, ndcPos, 0.005, 0.01, 0.01, float32_t3(1.0, 0.0, 0.0)); + + output.color += intensity; + output.depth = intensity > 0.0 ? 1.0 : output.depth; // Update depth + maxDepth = max(maxDepth, output.depth); + } + + int sampleCount = DebugDataBuffer[0].sampleCount; + + for (int i = 0; i < sampleCount; i++) + { + float32_t3 rayOrigin = float32_t3(0, 0, 0); + float32_t4 directionAndPdf = DebugDataBuffer[0].rayData[i]; + float32_t3 rayDir = normalize(directionAndPdf.xyz); + + // Define cube bounds in local space (unit cube from -0.5 to 0.5, adjust as needed) + float32_t3 cubeLocalMin = float32_t3(-0.5, -0.5, -0.5); + float32_t3 cubeLocalMax = float32_t3(0.5, 0.5, 0.5); + + // Transform ray to local space of the cube + float32_t3 localRayOrigin = worldToLocal(rayOrigin, pc.modelMatrix); + float32_t3 localRayDir = normalize(worldToLocalDir(rayDir, pc.modelMatrix)); + + // Perform intersection test in local space + float32_t hitDistance = rayAABBIntersection(localRayOrigin, localRayDir, cubeLocalMin, cubeLocalMax); + + float32_t arrowLength; + if (hitDistance > 0.0) + { + // Calculate world space hit distance + // We need to account for the scaling in the model matrix + float32_t3 localHitPoint = localRayOrigin + localRayDir * hitDistance; + float32_t3 worldHitPoint = mul(pc.modelMatrix, float32_t4(localHitPoint, 1.0)).xyz; + arrowLength = length(worldHitPoint - rayOrigin); + } + else + { + // No intersection, use fallback (e.g., fixed length or distance to cube center) + float32_t3 cubeCenter = mul(pc.modelMatrix, float32_t4(0, 0, 0, 1)).xyz; + arrowLength = length(cubeCenter - rayOrigin) + 2.0; + } + + 
ArrowResult arrow = visualizeRayAsArrow(rayOrigin, directionAndPdf, arrowLength, ndcPos, aspect); + maxDepth = max(maxDepth, arrow.depth); + + // Additive blending + output.color.rgb += hitDistance > 0.0 ? arrow.color.rgb : float32_t3(1.0, 0.0, 0.0); + output.color.a = max(output.color.a, arrow.color.a); + } + + // Clamp to prevent overflow + output.color = saturate(output.color); + output.color.a = 1.0; + + // Write the closest depth (maximum in reversed depth) + // ONLY write depth if we actually drew something + output.depth = output.color.a > 0.0 ? maxDepth : 0.0; + + return output; +} \ No newline at end of file diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/Sampling.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/Sampling.hlsl index d213d8b94..9caf83246 100644 --- a/73_SolidAngleVisualizer/app_resources/hlsl/Sampling.hlsl +++ b/73_SolidAngleVisualizer/app_resources/hlsl/Sampling.hlsl @@ -2,8 +2,10 @@ #define _SAMPLING_HLSL_ // Include the spherical triangle utilities +#include #include #include +#include #include "nbl/builtin/hlsl/random/pcg.hlsl" #include "nbl/builtin/hlsl/random/xoroshiro.hlsl" @@ -13,16 +15,19 @@ using namespace nbl::hlsl; #define SAMPLING_MODE_PROJECTED_SOLID_ANGLE 1 // Maximum number of triangles we can have after clipping -// Without clipping, max 3 faces can be visible at once -// With clipping, can be more. 7 - 2 = 5 max triangles because fanning from one vertex +// Without clipping, max 3 faces can be visible at once so 3 faces * 2 triangles = 6 edges, forming max 4 triangles +// With clipping, one more edge. 
7 - 2 = 5 max triangles because fanning from one vertex #define MAX_TRIANGLES 5 +// Minimal cached sampling data - only what's needed for selection struct SamplingData { - float32_t triangleWeights[MAX_TRIANGLES]; - uint32_t triangleIndices[MAX_TRIANGLES]; // Store the 'i' value for each valid triangle - uint32_t count; - float32_t totalWeight; + uint32_t count; // Number of valid triangles + uint32_t samplingMode; // Mode used during build + float32_t totalWeight; // Sum of all triangle weights + float32_t3 faceNormal; // Face normal (only used for projected mode) + float32_t triangleSolidAngles[MAX_TRIANGLES]; // Weight per triangle (for selection) + uint32_t triangleIndices[MAX_TRIANGLES]; // Vertex index i (forms triangle with v0, vi, vi+1) }; float32_t2 nextRandomUnorm2(inout nbl::hlsl::Xoroshiro64StarStar rnd) @@ -69,23 +74,35 @@ float32_t computeProjectedSolidAngleFallback(float32_t3 v0, float32_t3 v1, float // 4. Compute projected solid angle float32_t Gamma = 0.5f * (a * dot(n0, N) + b * dot(n1, N) + c * dot(n2, N)); - // Return the absolute value of the total (to handle CW/CCW triangles) + // Return the absolute value of the total return abs(Gamma); } -// Build sampling data - store weights and vertex indices -SamplingData buildSamplingDataFromSilhouette(ClippedSilhouette silhouette, int32_t samplingMode) +// Build sampling data once - cache only weights for triangle selection +SamplingData buildSamplingDataFromSilhouette(ClippedSilhouette silhouette, uint32_t samplingMode) { SamplingData data; data.count = 0; - data.totalWeight = 0; + data.totalWeight = 0.0f; + data.samplingMode = samplingMode; + data.faceNormal = float32_t3(0, 0, 0); if (silhouette.count < 3) return data; - float32_t3 v0 = silhouette.vertices[0]; - float32_t3 origin = float32_t3(0, 0, 0); + const float32_t3 v0 = silhouette.vertices[0]; + const float32_t3 origin = float32_t3(0, 0, 0); + // Compute face normal ONCE before the loop - silhouette is planar! 
+ if (samplingMode == SAMPLING_MODE_PROJECTED_SOLID_ANGLE) + { + float32_t3 v1 = silhouette.vertices[1]; + float32_t3 v2 = silhouette.vertices[2]; + data.faceNormal = normalize(cross(v1 - v0, v2 - v0)); + } + + // Build fan triangulation from v0 + NBL_UNROLL for (uint32_t i = 1; i < silhouette.count - 1; i++) { float32_t3 v1 = silhouette.vertices[i]; @@ -93,60 +110,84 @@ SamplingData buildSamplingDataFromSilhouette(ClippedSilhouette silhouette, int32 shapes::SphericalTriangle shapeTri = shapes::SphericalTriangle::create(v0, v1, v2, origin); + // Skip degenerate triangles if (shapeTri.pyramidAngles()) continue; - float32_t weight; + // Calculate triangle solid angle + float32_t solidAngle; if (samplingMode == SAMPLING_MODE_PROJECTED_SOLID_ANGLE) { - float32_t3 faceNormal = normalize(cross(v1 - v0, v2 - v0)); // TODO: precompute? - weight = computeProjectedSolidAngleFallback(normalize(v0), normalize(v1), normalize(v2), faceNormal); + // scalar_type projectedSolidAngleOfTriangle(const vector3_type receiverNormal, NBL_REF_ARG(vector3_type) cos_sides, NBL_REF_ARG(vector3_type) csc_sides, NBL_REF_ARG(vector3_type) cos_vertices) + float32_t3 cos_vertices = clamp( + (shapeTri.cos_sides - shapeTri.cos_sides.yzx * shapeTri.cos_sides.zxy) * + shapeTri.csc_sides.yzx * shapeTri.csc_sides.zxy, + float32_t3(-1.0f, -1.0f, -1.0f), + float32_t3(1.0f, 1.0f, 1.0f)); + solidAngle = shapeTri.projectedSolidAngleOfTriangle(data.faceNormal, shapeTri.cos_sides, shapeTri.csc_sides, cos_vertices); } else { - weight = shapeTri.solidAngleOfTriangle(); + solidAngle = shapeTri.solidAngleOfTriangle(); } - if (weight <= 0.0f) + if (solidAngle <= 0.0f) continue; - data.triangleWeights[data.count] = weight; - data.triangleIndices[data.count] = i; // Store the original vertex index, we need to account for skipped degenerate triangles. 
- data.totalWeight += weight; + // Store only what's needed for weighted selection + data.triangleSolidAngles[data.count] = solidAngle; + data.triangleIndices[data.count] = i; + data.totalWeight += solidAngle; data.count++; } #ifdef DEBUG_DATA - // Assert no edge has both vertices antipodal (lune case) + // Validate no antipodal edges exist (would create spherical lune) for (uint32_t i = 0; i < silhouette.count; i++) { uint32_t j = (i + 1) % silhouette.count; float32_t3 n1 = normalize(silhouette.vertices[i]); float32_t3 n2 = normalize(silhouette.vertices[j]); - // Check if vertices are antipodal - bool antipodal = dot(n1, n2) < -0.99f; + if (dot(n1, n2) < -0.99f) + { + DebugDataBuffer[0].sphericalLuneDetected = 1; + assert(false && "Spherical lune detected: antipodal silhouette edge"); + } + } + DebugDataBuffer[0].maxTrianglesExceeded = (data.count > MAX_TRIANGLES); + + DebugDataBuffer[0].clippedSilhouetteVertexCount = silhouette.count; + for (uint32_t v = 0; v < silhouette.count; v++) + { + DebugDataBuffer[0].clippedSilhouetteVertices[v] = silhouette.vertices[v]; + } - assert(false && "Spherical lune detected: antipodal silhouette edge"); + DebugDataBuffer[0].triangleCount = data.count; + DebugDataBuffer[0].totalSolidAngles = data.totalWeight; + for (uint32_t tri = 0; tri < data.count; tri++) + { + DebugDataBuffer[0].solidAngles[tri] = data.triangleSolidAngles[tri]; } #endif - DebugDataBuffer[0].maxTrianglesExcceded = data.count > MAX_TRIANGLES; return data; } +// Sample using cached selection weights, but recompute geometry on-demand float32_t3 sampleFromData(SamplingData data, ClippedSilhouette silhouette, float32_t2 xi, out float32_t pdf, out uint32_t selectedIdx) { + selectedIdx = 0; + + // Handle empty or invalid data if (data.count == 0 || data.totalWeight <= 0.0f) { - pdf = 0; - selectedIdx = 0; + pdf = 0.0f; return float32_t3(0, 0, 1); } - // Select triangle using uniform random sampling weighted by importance - float32_t toFind = xi.x * data.totalWeight; 
- uint32_t triIdx = 0; + // Select triangle using cached weighted random selection + float32_t targetWeight = xi.x * data.totalWeight; float32_t cumulativeWeight = 0.0f; float32_t prevCumulativeWeight = 0.0f; @@ -154,57 +195,104 @@ float32_t3 sampleFromData(SamplingData data, ClippedSilhouette silhouette, float for (uint32_t i = 0; i < data.count; i++) { prevCumulativeWeight = cumulativeWeight; - cumulativeWeight += data.triangleWeights[i]; - if (toFind <= cumulativeWeight) + cumulativeWeight += data.triangleSolidAngles[i]; + + if (targetWeight <= cumulativeWeight) { - triIdx = i; + selectedIdx = i; break; } } - selectedIdx = triIdx; - - // Remap xi.x to [0,1] within the selected triangle's weight range - float32_t triMin = prevCumulativeWeight; - float32_t triMax = cumulativeWeight; - float32_t triWeight = triMax - triMin; - float32_t u = (toFind - triMin) / max(triWeight, 1e-7f); + // Remap xi.x to [0,1] within selected triangle's solidAngle interval + float32_t triSolidAngle = data.triangleSolidAngles[selectedIdx]; + float32_t u = (targetWeight - prevCumulativeWeight) / max(triSolidAngle, 1e-7f); - // Reconstruct the triangle using the stored vertex index - uint32_t vertexIdx = data.triangleIndices[triIdx]; // We need to account for skipped degenerate triangles. 
+ // Reconstruct the selected triangle geometry + uint32_t vertexIdx = data.triangleIndices[selectedIdx]; float32_t3 v0 = silhouette.vertices[0]; float32_t3 v1 = silhouette.vertices[vertexIdx]; float32_t3 v2 = silhouette.vertices[vertexIdx + 1]; + + float32_t3 faceNormal = normalize(cross(v1 - v0, v2 - v0)); + float32_t3 origin = float32_t3(0, 0, 0); shapes::SphericalTriangle shapeTri = shapes::SphericalTriangle::create(v0, v1, v2, origin); - sampling::SphericalTriangle samplingTri = sampling::SphericalTriangle::create(shapeTri); - // Sample from the selected triangle using remapped u and original xi.y + // Compute vertex angles once + float32_t3 cos_vertices = clamp( + (shapeTri.cos_sides - shapeTri.cos_sides.yzx * shapeTri.cos_sides.zxy) * + shapeTri.csc_sides.yzx * shapeTri.csc_sides.zxy, + float32_t3(-1.0f, -1.0f, -1.0f), + float32_t3(1.0f, 1.0f, 1.0f)); + float32_t3 sin_vertices = sqrt(float32_t3(1.0f, 1.0f, 1.0f) - cos_vertices * cos_vertices); + + // Sample based on mode + float32_t3 direction; float32_t rcpPdf; - float32_t3 direction = samplingTri.generate(rcpPdf, float32_t2(u, xi.y)); - float32_t trianglePdf = 1.0f / rcpPdf; - pdf = trianglePdf * (data.triangleWeights[triIdx] / data.totalWeight); + if (data.samplingMode == SAMPLING_MODE_PROJECTED_SOLID_ANGLE) + { + sampling::ProjectedSphericalTriangle samplingTri = + sampling::ProjectedSphericalTriangle::create(shapeTri); + + direction = samplingTri.generate( + rcpPdf, + triSolidAngle, + cos_vertices, + sin_vertices, + shapeTri.cos_sides[0], + shapeTri.cos_sides[2], + shapeTri.csc_sides[1], + shapeTri.csc_sides[2], + faceNormal, + false, + float32_t2(u, xi.y)); + triSolidAngle = rcpPdf; // projected solid angle returned as rcpPdf + } + else + { + sampling::SphericalTriangle samplingTri = + sampling::SphericalTriangle::create(shapeTri); + + direction = samplingTri.generate( + triSolidAngle, + cos_vertices, + sin_vertices, + shapeTri.cos_sides[0], + shapeTri.cos_sides[2], + shapeTri.csc_sides[1], + 
shapeTri.csc_sides[2], + float32_t2(u, xi.y)); + } + + // Calculate PDF + float32_t trianglePdf = 1.0f / triSolidAngle; + float32_t selectionProb = triSolidAngle / data.totalWeight; + pdf = trianglePdf * selectionProb; return normalize(direction); } +#if VISUALIZE_SAMPLES + float32_t4 visualizeSamples(float32_t2 screenUV, float32_t3 spherePos, ClippedSilhouette silhouette, - int32_t samplingMode, SamplingData samplingData, int32_t numSamples) + uint32_t samplingMode, uint32_t frameIndex, SamplingData samplingData, uint32_t numSamples, inout RWStructuredBuffer DebugDataBuffer) { float32_t4 accumColor = 0; - if (samplingData.count == 0) + if (silhouette.count == 0) return 0; float32_t2 pssSize = float32_t2(0.3, 0.3); // 30% of screen float32_t2 pssPos = float32_t2(0.01, 0.01); // Offset from corner bool isInsidePSS = all(and(screenUV >= pssPos, screenUV <= (pssPos + pssSize))); - for (int32_t i = 0; i < numSamples; i++) + DebugDataBuffer[0].sampleCount = numSamples; + for (uint32_t i = 0; i < numSamples; i++) { - nbl::hlsl::random::PCG32 seedGen = nbl::hlsl::random::PCG32::construct(pc.frameIndex * 65536u + i); + nbl::hlsl::random::PCG32 seedGen = nbl::hlsl::random::PCG32::construct(frameIndex * 65536u + i); const uint32_t seed1 = seedGen(); const uint32_t seed2 = seedGen(); nbl::hlsl::Xoroshiro64StarStar rnd = nbl::hlsl::Xoroshiro64StarStar::construct(uint32_t2(seed1, seed2)); @@ -214,6 +302,8 @@ float32_t4 visualizeSamples(float32_t2 screenUV, float32_t3 spherePos, ClippedSi uint32_t triIdx; float32_t3 sampleDir = sampleFromData(samplingData, silhouette, xi, pdf, triIdx); + DebugDataBuffer[0].rayData[i] = float32_t4(sampleDir, pdf); + float32_t dist3D = distance(sampleDir, normalize(spherePos)); float32_t alpha3D = 1.0f - smoothstep(0.0f, 0.02f, dist3D); @@ -245,3 +335,4 @@ float32_t4 visualizeSamples(float32_t2 screenUV, float32_t3 spherePos, ClippedSi return accumColor; } #endif +#endif diff --git 
a/73_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl index 31cbe577a..79791af57 100644 --- a/73_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl +++ b/73_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl @@ -2,189 +2,27 @@ #include "common.hlsl" #include -#include "utils.hlsl" using namespace nbl::hlsl; using namespace ext::FullScreenTriangle; -[[vk::push_constant]] struct PushConstants pc; -[[vk::binding(0, 0)]] RWStructuredBuffer DebugDataBuffer; - -static const float CIRCLE_RADIUS = 0.5f; - -// --- Geometry Utils --- -struct ClippedSilhouette -{ - float32_t3 vertices[7]; - uint32_t count; -}; - -static const float32_t3 constCorners[8] = { - float32_t3(-1, -1, -1), float32_t3(1, -1, -1), float32_t3(-1, 1, -1), float32_t3(1, 1, -1), - float32_t3(-1, -1, 1), float32_t3(1, -1, 1), float32_t3(-1, 1, 1), float32_t3(1, 1, 1)}; - -static const int32_t2 allEdges[12] = { - {0, 1}, - {2, 3}, - {4, 5}, - {6, 7}, // X axis - {0, 2}, - {1, 3}, - {4, 6}, - {5, 7}, // Y axis - {0, 4}, - {1, 5}, - {2, 6}, - {3, 7}, // Z axis -}; - -// Maps face index (0-5) to its 4 corner indices in CCW order -static const uint32_t faceToCorners[6][4] = { - {0, 2, 3, 1}, // Face 0: Z- - {4, 5, 7, 6}, // Face 1: Z+ - {0, 4, 6, 2}, // Face 2: X- - {1, 3, 7, 5}, // Face 3: X+ - {0, 1, 5, 4}, // Face 4: Y- - {2, 6, 7, 3} // Face 5: Y+ -}; - -static float32_t3 corners[8]; -static float32_t3 faceCenters[6] = { - float32_t3(0, 0, 0), float32_t3(0, 0, 0), float32_t3(0, 0, 0), - float32_t3(0, 0, 0), float32_t3(0, 0, 0), float32_t3(0, 0, 0)}; - -static const float32_t3 localNormals[6] = { - float32_t3(0, 0, -1), // Face 0 (Z-) - float32_t3(0, 0, 1), // Face 1 (Z+) - float32_t3(-1, 0, 0), // Face 2 (X-) - float32_t3(1, 0, 0), // Face 3 (X+) - float32_t3(0, -1, 0), // Face 4 (Y-) - float32_t3(0, 1, 0) // Face 5 (Y+) -}; - -// TODO: unused, remove later -// Vertices are ordered CCW relative to 
the camera view. -static const int32_t silhouettes[27][7] = { - {6, 1, 3, 2, 6, 4, 5}, // 0: Black - {6, 2, 6, 4, 5, 7, 3}, // 1: White - {6, 0, 4, 5, 7, 3, 2}, // 2: Gray - {6, 1, 3, 7, 6, 4, 5}, // 3: Red - {4, 4, 5, 7, 6, -1, -1}, // 4: Green - {6, 0, 4, 5, 7, 6, 2}, // 5: Blue - {6, 0, 1, 3, 7, 6, 4}, // 6: Yellow - {6, 0, 1, 5, 7, 6, 4}, // 7: Magenta - {6, 0, 1, 5, 7, 6, 2}, // 8: Cyan - {6, 1, 3, 2, 6, 7, 5}, // 9: Orange - {4, 2, 6, 7, 3, -1, -1}, // 10: Light Orange - {6, 0, 4, 6, 7, 3, 2}, // 11: Dark Orange - {4, 1, 3, 7, 5, -1, -1}, // 12: Pink - {6, 0, 4, 6, 7, 3, 2}, // 13: Light Pink - {4, 0, 4, 6, 2, -1, -1}, // 14: Deep Rose - {6, 0, 1, 3, 7, 5, 4}, // 15: Purple - {4, 0, 1, 5, 4, -1, -1}, // 16: Light Purple - {6, 0, 1, 5, 4, 6, 2}, // 17: Indigo - {6, 0, 2, 6, 7, 5, 1}, // 18: Dark Green - {6, 0, 2, 6, 7, 3, 1}, // 19: Lime - {6, 0, 4, 6, 7, 3, 1}, // 20: Forest Green - {6, 0, 2, 3, 7, 5, 1}, // 21: Navy - {4, 0, 2, 3, 1, -1, -1}, // 22: Sky Blue - {6, 0, 4, 6, 2, 3, 1}, // 23: Teal - {6, 0, 2, 3, 7, 5, 4}, // 24: Brown - {6, 0, 2, 3, 1, 5, 4}, // 25: Tan/Beige - {6, 1, 5, 4, 6, 2, 3} // 26: Dark Brown -}; - -// Binary packed silhouettes -static const uint32_t binSilhouettes[27] = { - 0b11000000000000101100110010011001, - 0b11000000000000011111101100110010, - 0b11000000000000010011111101100000, - 0b11000000000000101100110111011001, - 0b10000000000000000000110111101100, - 0b11000000000000010110111101100000, - 0b11000000000000100110111011001000, - 0b11000000000000100110111101001000, - 0b11000000000000010110111101001000, - 0b11000000000000101111110010011001, - 0b10000000000000000000011111110010, - 0b11000000000000010011111110100000, - 0b10000000000000000000101111011001, - 0b11000000000000010011111110100000, - 0b10000000000000000000010110100000, - 0b11000000000000100101111011001000, - 0b10000000000000000000100101001000, - 0b11000000000000010110100101001000, - 0b11000000000000001101111110010000, - 0b11000000000000001011111110010000, - 
0b11000000000000001011111110100000, - 0b11000000000000001101111011010000, - 0b10000000000000000000001011010000, - 0b11000000000000001011010110100000, - 0b11000000000000100101111011010000, - 0b11000000000000100101001011010000, - 0b11000000000000011010110100101001, -}; - -int32_t getSilhouetteVertex(uint32_t packedSil, int32_t index) -{ - return (packedSil >> (3 * index)) & 0x7; -} - -// Get silhouette size -int32_t getSilhouetteSize(uint32_t sil) -{ - return (sil >> 29) & 0x7; -} - -// Check if vertex has negative z -bool getVertexZNeg(int32_t vertexIdx) -{ -#if FAST - float32_t3 localPos = float32_t3( - (vertexIdx & 1) ? 1.0f : -1.0f, - (vertexIdx & 2) ? 1.0f : -1.0f, - (vertexIdx & 4) ? 1.0f : -1.0f); - - float transformedZ = dot(pc.modelMatrix[2].xyz, localPos) + pc.modelMatrix[2].w; - return transformedZ < 0.0f; -#else - return corners[vertexIdx].z < 0.0f; -#endif -} - -// Get world position of cube vertex -float32_t3 getVertex(int32_t vertexIdx) -{ -#if FAST - // Reconstruct local cube corner from index bits - float sx = (vertexIdx & 1) ? 1.0f : -1.0f; - float sy = (vertexIdx & 2) ? 1.0f : -1.0f; - float sz = (vertexIdx & 4) ? 
1.0f : -1.0f; - - float32_t4x3 model = transpose(pc.modelMatrix); +[[vk::binding(0, 0)]] RWStructuredBuffer DebugDataBuffer; // TODO: move below other includes - // Transform to world - // Full position, not just Z like getVertexZNeg - return model[0].xyz * sx + - model[1].xyz * sy + - model[2].xyz * sz + - model[3].xyz; - // return mul(pc.modelMatrix, float32_t4(sx, sy, sz, 1.0f)); -#else - return corners[vertexIdx]; -#endif -} +#define VISUALIZE_SAMPLES 1 +#include "utils.hlsl" #include "Drawing.hlsl" #include "Sampling.hlsl" +#include "silhouette.hlsl" +[[vk::push_constant]] struct PushConstants pc; -void setDebugData(uint32_t sil, int32_t3 region, int32_t configIndex) +void setDebugData(uint32_t sil, uint32_t3 region, uint32_t configIndex) { #if DEBUG_DATA DebugDataBuffer[0].region = uint32_t3(region); DebugDataBuffer[0].silhouetteIndex = uint32_t(configIndex); DebugDataBuffer[0].silhouetteVertexCount = uint32_t(getSilhouetteSize(sil)); - for (int32_t i = 0; i < 6; i++) + for (uint32_t i = 0; i < 6; i++) { DebugDataBuffer[0].vertices[i] = uint32_t(getSilhouetteVertex(sil, i)); } @@ -192,274 +30,131 @@ void setDebugData(uint32_t sil, int32_t3 region, int32_t configIndex) #endif } -float32_t2 toCircleSpace(float32_t2 uv) -{ - float32_t2 p = uv * 2.0f - 1.0f; - float aspect = pc.viewport.z / pc.viewport.w; - p.x *= aspect; - return p; -} - -uint32_t packSilhouette(const int32_t s[7]) -{ - uint32_t packed = 0; - int32_t size = s[0] & 0x7; // 3 bits for size - - // Pack vertices LSB-first (vertex1 in lowest 3 bits above size) - for (int32_t i = 1; i <= 6; ++i) - { - int32_t v = s[i]; - if (v < 0) - v = 0; // replace unused vertices with 0 - packed |= (v & 0x7) << (3 * (i - 1)); // vertex i-1 shifted by 3*(i-1) - } - - // Put size in the MSB (bits 29-31 for a 32-bit uint32_t, leaving 29 bits for vertices) - packed |= (size & 0x7) << 29; - - return packed; -} - void computeCubeGeo() { - for (int32_t i = 0; i < 8; i++) + for (uint32_t i = 0; i < 8; i++) corners[i] = 
mul(pc.modelMatrix, float32_t4(constCorners[i], 1.0f)).xyz; - for (int32_t f = 0; f < 6; f++) + for (uint32_t f = 0; f < 6; f++) { faceCenters[f] = float32_t3(0, 0, 0); - for (int32_t v = 0; v < 4; v++) + for (uint32_t v = 0; v < 4; v++) faceCenters[f] += corners[faceToCorners[f][v]]; faceCenters[f] /= 4.0f; } } -// Helper to draw an edge with proper color mapping -float32_t4 drawEdge(int32_t originalEdgeIdx, float32_t3 pts[2], float32_t3 spherePos, float aaWidth, float width = 0.01f) -{ - float32_t4 edgeContribution = drawGreatCircleArc(spherePos, pts, aaWidth, width); - return float32_t4(colorLUT[originalEdgeIdx] * edgeContribution.a, edgeContribution.a); -}; - -float32_t4 computeSilhouette(uint32_t vertexCount, uint32_t sil, float32_t3 spherePos, float aaWidth, out ClippedSilhouette silhouette) +void validateSilhouetteEdges(uint32_t sil, uint32_t vertexCount, inout uint32_t silEdgeMask) { - float32_t4 color = float32_t4(0, 0, 0, 0); - silhouette.count = 0; - - // Build clip mask (z < 0) - int32_t clipMask = 0u; - NBL_UNROLL - for (int32_t i = 0; i < 4; i++) - clipMask |= (getVertexZNeg(getSilhouetteVertex(sil, i)) ? 1u : 0u) << i; - - if (vertexCount == 6) - { - NBL_UNROLL - for (int32_t i = 4; i < 6; i++) - clipMask |= (getVertexZNeg(getSilhouetteVertex(sil, i)) ? 
1u : 0u) << i; - } - - int32_t clipCount = countbits(clipMask); - -#if 0 - // Early exit if fully clipped - if (clipCount == vertexCount) - return color; - - // No clipping needed - fast path - if (clipCount == 0) +#if DEBUG_DATA { - for (int32_t i = 0; i < vertexCount; i++) + for (uint32_t i = 0; i < vertexCount; i++) { - int32_t i0 = i; - int32_t i1 = (i + 1) % vertexCount; + uint32_t vIdx = i % vertexCount; + uint32_t v1Idx = (i + 1) % vertexCount; - float32_t3 v0 = getVertex(getSilhouetteVertex(sil, i0)); - float32_t3 v1 = getVertex(getSilhouetteVertex(sil, i1)); - float32_t3 pts[2] = {v0, v1}; - - color += drawEdge(i1, pts, spherePos, aaWidth); + uint32_t v0Corner = getSilhouetteVertex(sil, vIdx); + uint32_t v1Corner = getSilhouetteVertex(sil, v1Idx); + // Mark edge as part of silhouette + for (uint32_t e = 0; e < 12; e++) + { + uint32_t2 edge = allEdges[e]; + if ((edge.x == v0Corner && edge.y == v1Corner) || + (edge.x == v1Corner && edge.y == v0Corner)) + { + silEdgeMask |= (1u << e); + } + } } - return color; + validateEdgeVisibility(pc.modelMatrix, sil, vertexCount, silEdgeMask); } #endif +} - // Rotate clip mask so positives come first - uint32_t invertedMask = ~clipMask & ((1u << vertexCount) - 1u); - bool wrapAround = ((clipMask & 1u) != 0u) && - ((clipMask & (1u << (vertexCount - 1))) != 0u); - int32_t rotateAmount = wrapAround - ? 
firstbitlow(invertedMask) // -> First POSITIVE - : firstbithigh(clipMask) + 1; // -> First vertex AFTER last negative - - uint32_t rotatedClipMask = rotr(clipMask, rotateAmount, vertexCount); - uint32_t rotatedSil = rotr(sil, rotateAmount * 3, vertexCount * 3); - - int32_t positiveCount = vertexCount - clipCount; - - // ALWAYS compute both clip points - int32_t lastPosIdx = positiveCount - 1; - int32_t firstNegIdx = positiveCount; - float32_t3 vLastPos = getVertex(getSilhouetteVertex(rotatedSil, lastPosIdx)); - float32_t3 vFirstNeg = getVertex(getSilhouetteVertex(rotatedSil, firstNegIdx)); - float t = vLastPos.z / (vLastPos.z - vFirstNeg.z); - float32_t3 clipA = lerp(vLastPos, vFirstNeg, t); +void computeSpherePos(SVertexAttributes vx, out float32_t2 ndc, out float32_t3 spherePos) +{ + ndc = vx.uv * 2.0f - 1.0f; + float32_t aspect = pc.viewport.z / pc.viewport.w; + ndc.x *= aspect; - float32_t3 vLastNeg = getVertex(getSilhouetteVertex(rotatedSil, vertexCount - 1)); - float32_t3 vFirstPos = getVertex(getSilhouetteVertex(rotatedSil, 0)); - t = vLastNeg.z / (vLastNeg.z - vFirstPos.z); - float32_t3 clipB = lerp(vLastNeg, vFirstPos, t); + float32_t2 normalized = ndc / CIRCLE_RADIUS; + float32_t r2 = dot(normalized, normalized); - // Draw positive edges - NBL_UNROLL - for (int32_t i = 0; i < positiveCount; i++) + if (r2 <= 1.0f) { - float32_t3 v0 = getVertex(getSilhouetteVertex(rotatedSil, i)); - - // ONLY use clipA if we are at the end of the positive run AND there's a clip - bool isLastPositive = (i == positiveCount - 1); - bool useClipA = (clipCount > 0) && isLastPositive; - - // If not using clipA, wrap around to the next vertex - float32_t3 v1 = useClipA ? 
clipA : getVertex(getSilhouetteVertex(rotatedSil, (i + 1) % vertexCount)); - - float32_t3 pts[2] = {v0, v1}; - color += drawEdge((i + 1) % vertexCount, pts, spherePos, aaWidth); - - silhouette.vertices[silhouette.count++] = v0; + spherePos = float32_t3(normalized.x, normalized.y, sqrt(1.0f - r2)); } - - if (clipCount > 0 && clipCount < vertexCount) + else { - // NP edge - float32_t3 vFirst = getVertex(getSilhouetteVertex(rotatedSil, 0)); - float32_t3 npPts[2] = {clipB, vFirst}; - color += drawEdge(0, npPts, spherePos, aaWidth); - - // Horizon arc - float32_t3 arcPts[2] = {clipA, clipB}; - color += drawEdge(23, arcPts, spherePos, aaWidth, 0.6f); - - silhouette.vertices[silhouette.count++] = clipA; - silhouette.vertices[silhouette.count++] = clipB; + float32_t uv2Plus1 = r2 + 1.0f; + spherePos = float32_t3(normalized.x * 2.0f, normalized.y * 2.0f, 1.0f - r2) / uv2Plus1; } - -#if DEBUG_DATA - DebugDataBuffer[0].clipMask = clipMask; - DebugDataBuffer[0].clipCount = clipCount; - DebugDataBuffer[0].rotatedClipMask = rotatedClipMask; - DebugDataBuffer[0].rotateAmount = rotateAmount; - DebugDataBuffer[0].positiveVertCount = positiveCount; - DebugDataBuffer[0].wrapAround = (uint32_t)wrapAround; - DebugDataBuffer[0].rotatedSil = rotatedSil; - -#endif - return color; + spherePos = normalize(spherePos); } [[vk::location(0)]] float32_t4 main(SVertexAttributes vx) : SV_Target0 { float32_t4 color = float32_t4(0, 0, 0, 0); - for (int32_t i = 0; i < 1; i++) + for (uint32_t i = 0; i < 1; i++) { - float aaWidth = length(float32_t2(ddx(vx.uv.x), ddy(vx.uv.y))); - float32_t2 p = toCircleSpace(vx.uv); - - float32_t2 normalized = p / CIRCLE_RADIUS; - float r2 = dot(normalized, normalized); - + float32_t aaWidth = length(float32_t2(ddx(vx.uv.x), ddy(vx.uv.y))); float32_t3 spherePos; - if (r2 <= 1.0f) - { - spherePos = float32_t3(normalized.x, normalized.y, sqrt(1.0f - r2)); - } - else - { - float uv2Plus1 = r2 + 1.0f; - spherePos = float32_t3(normalized.x * 2.0f, normalized.y * 2.0f, 1.0f 
- r2) / uv2Plus1; - } - spherePos = normalize(spherePos); - + float32_t2 ndc; + computeSpherePos(vx, ndc, spherePos); +#if !FAST || DEBUG_DATA computeCubeGeo(); - - float32_t4x3 columnModel = transpose(pc.modelMatrix); - float32_t3 obbCenter = columnModel[3].xyz; - float32_t3x3 upper3x3 = (float32_t3x3)columnModel; - float32_t3 rcpSqScales = rcp(float32_t3( - dot(upper3x3[0], upper3x3[0]), - dot(upper3x3[1], upper3x3[1]), - dot(upper3x3[2], upper3x3[2]))); - float32_t3 normalizedProj = mul(upper3x3, obbCenter) * rcpSqScales; - - int32_t3 region = int32_t3( - normalizedProj.x < -1.0f ? 0 : (normalizedProj.x > 1.0f ? 2 : 1), - normalizedProj.y < -1.0f ? 0 : (normalizedProj.y > 1.0f ? 2 : 1), - normalizedProj.z < -1.0f ? 0 : (normalizedProj.z > 1.0f ? 2 : 1)); - - int32_t configIndex = region.x + region.y * 3 + region.z * 9; - - // uint32_t sil = packSilhouette(silhouettes[configIndex]); - uint32_t sil = binSilhouettes[configIndex]; - - int32_t vertexCount = getSilhouetteSize(sil); +#endif + uint32_t3 region; + uint32_t configIndex; + uint32_t vertexCount; + uint32_t sil = computeRegionAndConfig(pc.modelMatrix, region, configIndex, vertexCount); uint32_t silEdgeMask = 0; // TODO: take from 'fast' computeSilhouette() #if DEBUG_DATA - { - for (int32_t i = 0; i < vertexCount; i++) - { - int32_t vIdx = i % vertexCount; - int32_t v1Idx = (i + 1) % vertexCount; - - int32_t v0Corner = getSilhouetteVertex(sil, vIdx); - int32_t v1Corner = getSilhouetteVertex(sil, v1Idx); - // Mark edge as part of silhouette - for (int32_t e = 0; e < 12; e++) - { - int32_t2 edge = allEdges[e]; - if ((edge.x == v0Corner && edge.y == v1Corner) || - (edge.x == v1Corner && edge.y == v0Corner)) - { - silEdgeMask |= (1u << e); - } - } - } - validateEdgeVisibility(sil, vertexCount, silEdgeMask); - } + validateSilhouetteEdges(sil, vertexCount, silEdgeMask); #endif - - uint32_t positiveCount = 0; - ClippedSilhouette silhouette; - color += computeSilhouette(vertexCount, sil, spherePos, aaWidth, 
silhouette); + +#if VISUALIZE_SAMPLES + color += computeSilhouette(pc.modelMatrix, vertexCount, sil, spherePos, aaWidth, silhouette); +#else + computeSilhouette(pc.modelMatrix, vertexCount, sil, silhouette); +#endif // Draw clipped silhouette vertices - // color += drawClippedSilhouetteVertices(p, silhouette, aaWidth); + // color += drawClippedSilhouetteVertices(ndc, silhouette, aaWidth); SamplingData samplingData = buildSamplingDataFromSilhouette(silhouette, pc.samplingMode); - - uint32_t faceIndices[3]; - uint32_t visibleFaceCount = getVisibleFaces(region, faceIndices); +#if VISUALIZE_SAMPLES // For debugging: Draw a small indicator of which faces are found - // color += drawVisibleFaceOverlay(spherePos, region, aaWidth); + // color += drawVisibleFaceOverlay(pc.modelMatrix, spherePos, region, aaWidth); - // color += drawFaces(spherePos, aaWidth); + // color += drawFaces(pc.modelMatrix, spherePos, aaWidth); // Draw samples on sphere - color += visualizeSamples(vx.uv, spherePos, silhouette, pc.samplingMode, samplingData, 64); + color += visualizeSamples(vx.uv, spherePos, silhouette, pc.samplingMode, pc.frameIndex, samplingData, 64, DebugDataBuffer); - // Or draw 2D sample space (in a separate viewport) - // color += visualizePrimarySampleSpace(vx.uv, pc.samplingMode, 64, aaWidth); - - setDebugData(sil, region, configIndex); - // color += drawHiddenEdges(spherePos, silEdgeMask, aaWidth); - color += drawCorners(p, aaWidth); - color += drawRing(p, aaWidth); + color += drawHiddenEdges(pc.modelMatrix, spherePos, silEdgeMask, aaWidth); + color += drawCorners(pc.modelMatrix, ndc, aaWidth); + color += drawRing(ndc, aaWidth); if (all(vx.uv >= float32_t2(0.49f, 0.49f)) && all(vx.uv <= float32_t2(0.51f, 0.51f))) { return float32_t4(colorLUT[configIndex], 1.0f); } +#else + nbl::hlsl::random::PCG32 seedGen = nbl::hlsl::random::PCG32::construct(65536u + i); + const uint32_t2 seeds = uint32_t2(seedGen(), seedGen()); + nbl::hlsl::Xoroshiro64StarStar rnd = 
// Benchmark kernel.
// Every invocation rebuilds the clipped silhouette from the push-constant model matrix,
// builds the sampling data for pc.samplingMode, draws SampleCount directions from it and
// writes one 32-bit checksum per invocation so the sampling work cannot be optimized away.
//
// invocationID.x selects both the RNG seed and the output slot (4 bytes per invocation).
[numthreads(BENCHMARK_WORKGROUP_DIMENSION_SIZE_X, 1, 1)]
[shader("compute")]
void main(uint3 invocationID : SV_DispatchThreadID)
{
	const uint32_t SampleCount = 64u;

	uint32_t3 region;
	uint32_t configIndex;
	uint32_t vertexCount;
	const uint32_t sil = computeRegionAndConfig(pc.modelMatrix, region, configIndex, vertexCount);

	ClippedSilhouette silhouette;
	computeSilhouette(pc.modelMatrix, vertexCount, sil, silhouette);

	SamplingData samplingData = buildSamplingDataFromSilhouette(silhouette, pc.samplingMode);

	// Seed once per invocation. The generator must live outside the loop: rebuilding it from
	// the same seeds on every iteration replays the same xi, so all iterations would evaluate
	// the identical sample and the loop body becomes invariant (nothing left to measure).
	// NOTE(review): relies on nextRandomUnorm2 advancing `rnd` in place - confirm against its declaration.
	nbl::hlsl::random::PCG32 seedGen = nbl::hlsl::random::PCG32::construct(65536u + invocationID.x);
	const uint32_t2 seeds = uint32_t2(seedGen(), seedGen());
	nbl::hlsl::Xoroshiro64StarStar rnd = nbl::hlsl::Xoroshiro64StarStar::construct(seeds);

	float32_t pdf = 0.0f;
	uint32_t triIdx = 0u;
	float32_t3 sampleDir = float32_t3(0.0, 0.0, 0.0);
	for (uint32_t i = 0; i < SampleCount; i++)
	{
		const float32_t2 xi = nextRandomUnorm2(rnd);
		// Accumulate so that every iteration contributes to the stored value.
		sampleDir += sampleFromData(samplingData, silhouette, xi, pdf, triIdx);
	}

	// Fold the results into a single word. The floats are reinterpreted (asuint) rather than
	// converted, which keeps the whole sum in unsigned integer arithmetic; adding the raw float
	// pdf would promote the expression to float and convert it back implicitly on Store.
	const uint32_t checksum = asuint(pdf) + triIdx + asuint(sampleDir.x) + asuint(sampleDir.y) + asuint(sampleDir.z);

	const uint32_t offset = sizeof(uint32_t) * invocationID.x;
	outputBuffer.Store(offset, checksum);
}
SAMPLING_MODE_PROJECTED_SOLID_ANGLE 1 + #define DEBUG_DATA 1 #define FAST 1 @@ -27,9 +31,21 @@ namespace nbl uint32_t rotatedClipMask; uint32_t rotateAmount; - uint32_t maxTrianglesExcceded; + uint32_t maxTrianglesExceeded; + uint32_t sphericalLuneDetected; uint32_t vertices[6]; + + uint32_t clippedSilhouetteVertexCount; + float32_t3 clippedSilhouetteVertices[7]; + + uint32_t triangleCount; + float32_t solidAngles[5]; + float32_t totalSolidAngles; + + // Sampling ray visualization data + uint32_t sampleCount; + float32_t4 rayData[64]; // xyz = direction, w = PDF }; struct PushConstants @@ -39,9 +55,14 @@ namespace nbl uint32_t samplingMode; uint32_t frameIndex; }; - // Sampling mode enum -#define SAMPLING_MODE_SOLID_ANGLE 0 -#define SAMPLING_MODE_PROJECTED_SOLID_ANGLE 1 + + struct PushConstantRayVis + { + float32_t4x4 viewProjMatrix; + float32_t3x4 modelMatrix; + float32_t4 viewport; + uint32_t frameIndex; + }; static const float32_t3 colorLUT[27] = { float32_t3(0, 0, 0), float32_t3(0.5, 0.5, 0.5), diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/gpu_common.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/gpu_common.hlsl new file mode 100644 index 000000000..d4ef71d07 --- /dev/null +++ b/73_SolidAngleVisualizer/app_resources/hlsl/gpu_common.hlsl @@ -0,0 +1,168 @@ +#ifndef GPU_COMMON_HLSL +#define GPU_COMMON_HLSL + +static const float32_t CIRCLE_RADIUS = 0.5f; + +// --- Geometry Utils --- +struct ClippedSilhouette +{ + float32_t3 vertices[7]; // Max 7 vertices after clipping, unnormalized + uint32_t count; +}; + +static const float32_t3 constCorners[8] = { + float32_t3(-0.5f, -0.5f, -0.5f), float32_t3(0.5f, -0.5f, -0.5f), float32_t3(-0.5f, 0.5f, -0.5f), float32_t3(0.5f, 0.5f, -0.5f), + float32_t3(-0.5f, -0.5f, 0.5f), float32_t3(0.5f, -0.5f, 0.5f), float32_t3(-0.5f, 0.5f, 0.5f), float32_t3(0.5f, 0.5f, 0.5f)}; + +static const uint32_t2 allEdges[12] = { + {0, 1}, + {2, 3}, + {4, 5}, + {6, 7}, // X axis + {0, 2}, + {1, 3}, + {4, 6}, + {5, 7}, // Y axis + {0, 4}, 
+ {1, 5}, + {2, 6}, + {3, 7}, // Z axis +}; + +// Maps face index (0-5) to its 4 corner indices in CCW order +static const uint32_t faceToCorners[6][4] = { + {0, 2, 3, 1}, // Face 0: Z- + {4, 5, 7, 6}, // Face 1: Z+ + {0, 4, 6, 2}, // Face 2: X- + {1, 3, 7, 5}, // Face 3: X+ + {0, 1, 5, 4}, // Face 4: Y- + {2, 6, 7, 3} // Face 5: Y+ +}; + +static float32_t3 corners[8]; +static float32_t3 faceCenters[6] = { + float32_t3(0, 0, 0), float32_t3(0, 0, 0), float32_t3(0, 0, 0), + float32_t3(0, 0, 0), float32_t3(0, 0, 0), float32_t3(0, 0, 0)}; + +static const float32_t3 localNormals[6] = { + float32_t3(0, 0, -1), // Face 0 (Z-) + float32_t3(0, 0, 1), // Face 1 (Z+) + float32_t3(-1, 0, 0), // Face 2 (X-) + float32_t3(1, 0, 0), // Face 3 (X+) + float32_t3(0, -1, 0), // Face 4 (Y-) + float32_t3(0, 1, 0) // Face 5 (Y+) +}; + +// TODO: unused, remove later +// Vertices are ordered CCW relative to the camera view. +static const uint32_t silhouettes[27][7] = { + {6, 1, 3, 2, 6, 4, 5}, // 0: Black + {6, 2, 6, 4, 5, 7, 3}, // 1: White + {6, 0, 4, 5, 7, 3, 2}, // 2: Gray + {6, 1, 3, 7, 6, 4, 5}, // 3: Red + {4, 4, 5, 7, 6, 0, 0}, // 4: Green + {6, 0, 4, 5, 7, 6, 2}, // 5: Blue + {6, 0, 1, 3, 7, 6, 4}, // 6: Yellow + {6, 0, 1, 5, 7, 6, 4}, // 7: Magenta + {6, 0, 1, 5, 7, 6, 2}, // 8: Cyan + {6, 1, 3, 2, 6, 7, 5}, // 9: Orange + {4, 2, 6, 7, 3, 0, 0}, // 10: Light Orange + {6, 0, 4, 6, 7, 3, 2}, // 11: Dark Orange + {4, 1, 3, 7, 5, 0, 0}, // 12: Pink + {6, 0, 4, 6, 7, 3, 2}, // 13: Light Pink + {4, 0, 4, 6, 2, 0, 0}, // 14: Deep Rose + {6, 0, 1, 3, 7, 5, 4}, // 15: Purple + {4, 0, 1, 5, 4, 0, 0}, // 16: Light Purple + {6, 0, 1, 5, 4, 6, 2}, // 17: Indigo + {6, 0, 2, 6, 7, 5, 1}, // 18: Dark Green + {6, 0, 2, 6, 7, 3, 1}, // 19: Lime + {6, 0, 4, 6, 7, 3, 1}, // 20: Forest Green + {6, 0, 2, 3, 7, 5, 1}, // 21: Navy + {4, 0, 2, 3, 1, 0, 0}, // 22: Sky Blue + {6, 0, 4, 6, 2, 3, 1}, // 23: Teal + {6, 0, 2, 3, 7, 5, 4}, // 24: Brown + {6, 0, 2, 3, 1, 5, 4}, // 25: Tan/Beige + {6, 1, 5, 4, 6, 
// Extract the 3-bit corner index stored in slot `index` of a packed silhouette word.
// Slot 0 occupies bits 0-2, slot 1 bits 3-5, and so on.
uint32_t getSilhouetteVertex(uint32_t packedSil, uint32_t index)
{
	const uint32_t shift = index * 3u;
	const uint32_t shifted = packedSil >> shift;
	return shifted & 0x7u;
}

// Read the silhouette vertex count kept in the top three bits (29-31) of a packed word.
uint32_t getSilhouetteSize(uint32_t sil)
{
	const uint32_t topBits = sil >> 29u;
	return topBits & 0x7u;
}
// Classify the model matrix into one of the 27 silhouette configurations.
//
// modelMatrix  3x4 model matrix of the box (rotation/scale in the left 3x3, translation in the last column).
// region       out: per-axis bucket, 0 / 1 / 2 for projection < -0.5, inside [-0.5, 0.5], > 0.5.
// configIndex  out: region.x + 3 * region.y + 9 * region.z, i.e. the row of the silhouette tables.
// vertexCount  out: silhouette size decoded from the packed word.
// returns      the packed silhouette word for configIndex (see getSilhouetteVertex / getSilhouetteSize).
uint32_t computeRegionAndConfig(float32_t3x4 modelMatrix, out uint32_t3 region, out uint32_t configIndex, out uint32_t vertexCount)
{
	// After the transpose each row is one column of modelMatrix:
	// rows 0-2 are the box axes, row 3 is the translation.
	float32_t4x3 columnModel = transpose(modelMatrix);
	float32_t3 obbCenter = columnModel[3].xyz;
	float32_t3x3 upper3x3 = (float32_t3x3)columnModel;

	// 1 / |axis|^2 per axis.
	// NOTE(review): a zero-length axis yields rcp(0) here - presumably never produced by the caller, confirm.
	float32_t3 rcpSqScales = rcp(float32_t3(
		dot(upper3x3[0], upper3x3[0]),
		dot(upper3x3[1], upper3x3[1]),
		dot(upper3x3[2], upper3x3[2])));

	// Component i is dot(axis_i, center) / |axis_i|^2: the translation expressed in units of each box axis.
	float32_t3 normalizedProj = mul(upper3x3, obbCenter) * rcpSqScales;

	// +-0.5 matches the half extent of the unit cube corners (+-0.5 per axis).
	region = uint32_t3(
		normalizedProj.x < -0.5f ? 0 : (normalizedProj.x > 0.5f ? 2 : 1),
		normalizedProj.y < -0.5f ? 0 : (normalizedProj.y > 0.5f ? 2 : 1),
		normalizedProj.z < -0.5f ? 0 : (normalizedProj.z > 0.5f ? 2 : 1));

	configIndex = region.x + region.y * 3u + region.z * 9u;

	// Use the precomputed table instead of re-packing the 7-entry row on every invocation.
	// binSilhouettes[i] holds the same word that packSilhouette(silhouettes[i]) produces;
	// switch back to the packSilhouette call only while editing the readable table.
	uint32_t sil = binSilhouettes[configIndex];
	vertexCount = getSilhouetteSize(sil);
	return sil;
}
firstbitlow(invertedMask) // -> First POSITIVE + : firstbithigh(clipMask) + 1; // -> First vertex AFTER last negative + + uint32_t rotatedClipMask = rotr(clipMask, rotateAmount, vertexCount); + uint32_t rotatedSil = rotr(sil, rotateAmount * 3, vertexCount * 3); + + uint32_t positiveCount = vertexCount - clipCount; + + // ALWAYS compute both clip points + uint32_t lastPosIdx = positiveCount - 1; + uint32_t firstNegIdx = positiveCount; + float32_t3 vLastPos = getVertex(modelMatrix, getSilhouetteVertex(rotatedSil, lastPosIdx)); + float32_t3 vFirstNeg = getVertex(modelMatrix, getSilhouetteVertex(rotatedSil, firstNegIdx)); + float32_t t = vLastPos.z / (vLastPos.z - vFirstNeg.z); + float32_t3 clipA = lerp(vLastPos, vFirstNeg, t); + + float32_t3 vLastNeg = getVertex(modelMatrix, getSilhouetteVertex(rotatedSil, vertexCount - 1)); + float32_t3 vFirstPos = getVertex(modelMatrix, getSilhouetteVertex(rotatedSil, 0)); + t = vLastNeg.z / (vLastNeg.z - vFirstPos.z); + float32_t3 clipB = lerp(vLastNeg, vFirstPos, t); + + NBL_UNROLL + for (uint32_t i = 0; i < positiveCount; i++) + { + // Get raw vertex + float32_t3 v0 = getVertex(modelMatrix, getSilhouetteVertex(rotatedSil, i)); + + bool isLastPositive = (i == positiveCount - 1); + bool useClipA = (clipCount > 0) && isLastPositive; + +#if VISUALIZE_SAMPLES + float32_t3 v1 = useClipA ? 
// Pack one row of the readable silhouette table into a single 32-bit word.
//
// s[0]     number of silhouette vertices (only the low 3 bits are kept)
// s[1..6]  corner indices (0-7) in silhouette order
//
// Layout of the result: vertex k (k = 0..5) occupies bits 3k .. 3k+2, the size sits in
// bits 29-31, bits 18-28 stay zero. Slots at or past the size are forced to 0 so that the
// packed word does not depend on whatever padding the table row carries.
uint32_t packSilhouette(const uint32_t s[7])
{
	const uint32_t size = s[0] & 0x7u;

	uint32_t packed = 0u;
	for (uint32_t i = 1u; i <= 6u; ++i)
	{
		// The previous `v < 0` test could never fire on an unsigned value;
		// unused slots are identified by their position instead.
		const uint32_t v = (i <= size) ? (s[i] & 0x7u) : 0u;
		packed |= v << (3u * (i - 1u));
	}

	packed |= size << 29u;
	return packed;
}
= "") -> smart_refctd_ptr + auto loadAndCompileHLSLShader = [&](const std::string& pathToShader, IShader::E_SHADER_STAGE stage, const std::string& defineMacro = "") -> smart_refctd_ptr { IAssetLoader::SAssetLoadParams lp = {}; lp.workingDirectory = localInputCWD; @@ -180,7 +184,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR auto compiler = make_smart_refctd_ptr(smart_refctd_ptr(m_system)); CHLSLCompiler::SOptions options = {}; - options.stage = IShader::E_SHADER_STAGE::ESS_FRAGMENT; + options.stage = stage; options.preprocessorOptions.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion; options.spirvOptimizer = nullptr; #ifndef _NBL_DEBUG @@ -216,21 +220,24 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR return logFail("Failed to create Full Screen Triangle protopipeline or load its vertex shader!"); // Load Fragment Shader - auto fragmentShader = loadAndCompileHLSLShader(SolidAngleVisShaderPath); - if (!fragmentShader) - return logFail("Failed to Load and Compile Fragment Shader: lumaMeterShader!"); + auto solidAngleVisFragShader = loadAndCompileHLSLShader(SolidAngleVisShaderPath, ESS_FRAGMENT); + if (!solidAngleVisFragShader) + return logFail("Failed to Load and Compile Fragment Shader: SolidAngleVis!"); - const IGPUPipelineBase::SShaderSpecInfo fragSpec = { - .shader = fragmentShader.get(), + const IGPUPipelineBase::SShaderSpecInfo solidAngleFragSpec = { + .shader = solidAngleVisFragShader.get(), .entryPoint = "main" }; - const asset::SPushConstantRange ranges[] = { { - .stageFlags = hlsl::ShaderStage::ESS_FRAGMENT, - .offset = 0, - .size = sizeof(PushConstants) - } }; + auto rayVisFragShader = loadAndCompileHLSLShader(RayVisShaderPath, ESS_FRAGMENT); + if (!rayVisFragShader) + return logFail("Failed to Load and Compile Fragment Shader: rayVis!"); + const IGPUPipelineBase::SShaderSpecInfo RayFragSpec = { + .shader = rayVisFragShader.get(), + .entryPoint = "main" + 
}; + smart_refctd_ptr solidAngleVisLayout, rayVisLayout; nbl::video::IGPUDescriptorSetLayout::SBinding bindings[1] = { { .binding = 0, @@ -241,21 +248,39 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR } }; smart_refctd_ptr dsLayout = m_device->createDescriptorSetLayout(bindings); + + const asset::SPushConstantRange saRanges[] = { { + .stageFlags = hlsl::ShaderStage::ESS_FRAGMENT, + .offset = 0, + .size = sizeof(PushConstants) + } }; + const asset::SPushConstantRange rayRanges[] = { { + .stageFlags = hlsl::ShaderStage::ESS_FRAGMENT, + .offset = 0, + .size = sizeof(PushConstantRayVis) + } }; + if (!dsLayout) logFail("Failed to create a Descriptor Layout!\n"); + solidAngleVisLayout = m_device->createPipelineLayout(saRanges, dsLayout); - auto visualizationLayout = m_device->createPipelineLayout(ranges -#if DEBUG_DATA - , dsLayout -#endif - ); - m_visualizationPipeline = fsTriProtoPPln.createPipeline(fragSpec, visualizationLayout.get(), m_solidAngleRenderpass.get()); - if (!m_visualizationPipeline) - return logFail("Could not create Graphics Pipeline!"); + rayVisLayout = m_device->createPipelineLayout(rayRanges, dsLayout); + + { + m_solidAngleVisPipeline = fsTriProtoPPln.createPipeline(solidAngleFragSpec, solidAngleVisLayout.get(), m_solidAngleRenderpass.get()); + if (!m_solidAngleVisPipeline) + return logFail("Could not create Graphics Pipeline!"); + + asset::SRasterizationParams rasterParams = ext::FullScreenTriangle::ProtoPipeline::DefaultRasterParams; + rasterParams.depthWriteEnable = true; + rasterParams.depthCompareOp = asset::E_COMPARE_OP::ECO_GREATER; + m_rayVisualizationPipeline = fsTriProtoPPln.createPipeline(RayFragSpec, rayVisLayout.get(), m_mainRenderpass.get(), 0, {}, rasterParams); + if (!m_rayVisualizationPipeline) + return logFail("Could not create Graphics Pipeline!"); + } // Allocate the memory -#if DEBUG_DATA { constexpr size_t BufferSize = sizeof(ResultData); @@ -297,7 +322,6 @@ class SolidAngleVisualizer final : 
public MonoWindowApplication, public BuiltinR const ILogicalDevice::MappedMemoryRange memoryRange(m_allocation.memory.get(), 0ull, m_allocation.memory->getAllocationSize()); if (!m_allocation.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) m_device->invalidateMappedMemoryRanges(1, &memoryRange); -#endif } // Create ImGUI @@ -391,7 +415,6 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR const IGPUCommandBuffer::SClearColorValue clearValue = { .float32 = {0.f,0.f,0.f,1.f} }; if (m_solidAngleViewFramebuffer) { -#if DEBUG_DATA asset::SBufferRange range { .offset = 0, @@ -399,40 +422,43 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR .buffer = m_outputStorageBuffer }; cb->fillBuffer(range, 0u); -#endif - auto creationParams = m_solidAngleViewFramebuffer->getCreationParameters(); - cb->beginDebugMarker("Draw Circle View Frame"); { - const IGPUCommandBuffer::SClearDepthStencilValue farValue = { .depth = 0.f }; - const IGPUCommandBuffer::SRenderpassBeginInfo renderpassInfo = + + const auto& creationParams = m_solidAngleViewFramebuffer->getCreationParameters(); + cb->beginDebugMarker("Draw Circle View Frame"); { - .framebuffer = m_solidAngleViewFramebuffer.get(), - .colorClearValues = &clearValue, - .depthStencilClearValues = &farValue, - .renderArea = { - .offset = {0,0}, - .extent = {creationParams.width, creationParams.height} - } - }; - beginRenderpass(cb, renderpassInfo); - } - // draw scene - { - PushConstants pc{ - .modelMatrix = hlsl::float32_t3x4(hlsl::transpose(interface.m_OBBModelMatrix)), - .viewport = { 0.f,0.f,static_cast(creationParams.width),static_cast(creationParams.height) }, - .samplingMode = m_samplingMode, - .frameIndex = m_frameSeeding ? 
static_cast(m_realFrameIx) : 0u - }; - auto pipeline = m_visualizationPipeline; - cb->bindGraphicsPipeline(pipeline.get()); - cb->pushConstants(pipeline->getLayout(), hlsl::ShaderStage::ESS_FRAGMENT, 0, sizeof(PushConstants), &pc); - cb->bindDescriptorSets(nbl::asset::EPBP_GRAPHICS, pipeline->getLayout(), 0, 1, &m_ds.get()); - ext::FullScreenTriangle::recordDrawCall(cb); + const IGPUCommandBuffer::SClearDepthStencilValue farValue = { .depth = 0.f }; + const IGPUCommandBuffer::SRenderpassBeginInfo renderpassInfo = + { + .framebuffer = m_solidAngleViewFramebuffer.get(), + .colorClearValues = &clearValue, + .depthStencilClearValues = &farValue, + .renderArea = { + .offset = {0,0}, + .extent = {creationParams.width, creationParams.height} + } + }; + beginRenderpass(cb, renderpassInfo); + } + // draw scene + { + static uint32_t lastFrameSeed = 0u; + lastFrameSeed = m_frameSeeding ? static_cast(m_realFrameIx) : lastFrameSeed; + PushConstants pc{ + .modelMatrix = hlsl::float32_t3x4(hlsl::transpose(interface.m_OBBModelMatrix)), + .viewport = { 0.f,0.f,static_cast(creationParams.width),static_cast(creationParams.height) }, + .samplingMode = m_samplingMode, + .frameIndex = lastFrameSeed + }; + auto pipeline = m_solidAngleVisPipeline; + cb->bindGraphicsPipeline(pipeline.get()); + cb->pushConstants(pipeline->getLayout(), hlsl::ShaderStage::ESS_FRAGMENT, 0, sizeof(pc), &pc); + cb->bindDescriptorSets(nbl::asset::EPBP_GRAPHICS, pipeline->getLayout(), 0, 1, &m_ds.get()); + ext::FullScreenTriangle::recordDrawCall(cb); + } + cb->endRenderPass(); + cb->endDebugMarker(); } - cb->endRenderPass(); - cb->endDebugMarker(); - #if DEBUG_DATA m_device->waitIdle(); std::memcpy(&m_GPUOutResulData, static_cast(m_allocation.memory->getMappedPointer()), sizeof(ResultData)); @@ -442,11 +468,11 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR // draw main view if (m_mainViewFramebuffer) { - cb->beginDebugMarker("Main Scene Frame"); { auto creationParams = 
m_mainViewFramebuffer->getCreationParameters(); const IGPUCommandBuffer::SClearDepthStencilValue farValue = { .depth = 0.f }; const IGPUCommandBuffer::SRenderpassBeginInfo renderpassInfo = + { .framebuffer = m_mainViewFramebuffer.get(), .colorClearValues = &clearValue, @@ -457,9 +483,33 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR } }; beginRenderpass(cb, renderpassInfo); + + } + { // draw rays visualization + auto creationParams = m_mainViewFramebuffer->getCreationParameters(); + + cb->beginDebugMarker("Draw Rays visualization"); + // draw scene + { + float32_t4x4 viewProj = *reinterpret_cast(&interface.camera.getConcatenatedMatrix()); + PushConstantRayVis pc{ + .viewProjMatrix = viewProj, + .modelMatrix = hlsl::float32_t3x4(hlsl::transpose(interface.m_OBBModelMatrix)), + .viewport = { 0.f,0.f,static_cast(creationParams.width),static_cast(creationParams.height) }, + .frameIndex = m_frameSeeding ? static_cast(m_realFrameIx) : 0u + }; + auto pipeline = m_rayVisualizationPipeline; + cb->bindGraphicsPipeline(pipeline.get()); + cb->pushConstants(pipeline->getLayout(), hlsl::ShaderStage::ESS_FRAGMENT, 0, sizeof(pc), &pc); + cb->bindDescriptorSets(nbl::asset::EPBP_GRAPHICS, pipeline->getLayout(), 0, 1, &m_ds.get()); + ext::FullScreenTriangle::recordDrawCall(cb); + } + cb->endDebugMarker(); } // draw scene { + cb->beginDebugMarker("Main Scene Frame"); + float32_t3x4 viewMatrix; float32_t4x4 viewProjMatrix; // TODO: get rid of legacy matrices @@ -472,8 +522,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR // tear down scene every frame auto& instance = m_renderer->m_instances[0]; - auto transposed = hlsl::transpose(interface.m_OBBModelMatrix); - memcpy(&instance.world, &transposed, sizeof(instance.world)); + instance.world = float32_t3x4(hlsl::transpose(interface.m_OBBModelMatrix)); instance.packedGeo = m_renderer->getGeometries().data(); // cube // +interface.gcIndex; m_renderer->render(cb, 
viewParams); // draw the cube/OBB @@ -481,9 +530,11 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR instance.packedGeo = m_renderer->getGeometries().data() + 2; // disk m_renderer->render(cb, viewParams); } - cb->endRenderPass(); + cb->endDebugMarker(); + cb->endRenderPass(); } + { cb->beginDebugMarker("SolidAngleVisualizer IMGUI Frame"); { @@ -781,12 +832,10 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR cb->setViewport(0u, 1u, &viewport); } -#if DEBUG_DATA ~SolidAngleVisualizer() override { m_allocation.memory->unmap(); } -#endif // Maximum frames which can be simultaneously submitted, used to cycle through our per-frame resources like command buffers constexpr static inline uint32_t MaxFramesInFlight = 3u; @@ -806,7 +855,8 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR smart_refctd_ptr m_renderer; smart_refctd_ptr m_solidAngleViewFramebuffer; smart_refctd_ptr m_mainViewFramebuffer; - smart_refctd_ptr m_visualizationPipeline; + smart_refctd_ptr m_solidAngleVisPipeline; + smart_refctd_ptr m_rayVisualizationPipeline; // nbl::video::IDeviceMemoryAllocator::SAllocation m_allocation = {}; smart_refctd_ptr m_outputStorageBuffer; @@ -859,6 +909,15 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR ImGui::SetNextWindowSize(ImVec2(320, 340), ImGuiCond_Appearing); ImGui::Begin("Editor"); + ImGui::Text("Benchmarking Solid Angle Visualizer"); + + if (ImGui::Button("Run Benchmark")) + { + SolidAngleVisualizer::SamplingBenchmark benchmark(*m_visualizer); + benchmark.run(); + } + ImGui::Separator(); + ImGui::Text("Sampling Mode: "); ImGui::SameLine(); @@ -1119,7 +1178,8 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR ImGui::Text("silhouette Vertex Count: %u", m_GPUOutResulData.silhouetteVertexCount); ImGui::Text("silhouette Positive VertexCount: %u", m_GPUOutResulData.positiveVertCount); 
ImGui::Text("Silhouette Mismatch: %s", m_GPUOutResulData.edgeVisibilityMismatch ? "true" : "false"); - ImGui::Text("More Than Two Bit Transitions: %s", m_GPUOutResulData.maxTrianglesExcceded ? "true" : "false"); + ImGui::Text("Max triangles exceeded: %s", m_GPUOutResulData.maxTrianglesExceeded ? "true" : "false"); + ImGui::Text("spherical lune detected: %s", m_GPUOutResulData.sphericalLuneDetected ? "true" : "false"); { float32_t3 xAxis = m_OBBModelMatrix[0].xyz; @@ -1138,23 +1198,27 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR } static bool modalShown = false; + static bool modalDismissed = false; static uint32_t lastSilhouetteIndex = ~0u; - // Reset modal flag if silhouette configuration changed + // Reset modal flags if silhouette configuration changed if (m_GPUOutResulData.silhouetteIndex != lastSilhouetteIndex) { modalShown = false; + modalDismissed = false; // Allow modal to show again for new configuration lastSilhouetteIndex = m_GPUOutResulData.silhouetteIndex; } - if (!m_GPUOutResulData.edgeVisibilityMismatch || !m_GPUOutResulData.maxTrianglesExcceded) + // Reset flags when mismatch is cleared + if (!m_GPUOutResulData.edgeVisibilityMismatch && !m_GPUOutResulData.maxTrianglesExceeded && !m_GPUOutResulData.sphericalLuneDetected) { - // Reset flag when mismatch is cleared modalShown = false; + modalDismissed = false; } - if ((m_GPUOutResulData.edgeVisibilityMismatch || m_GPUOutResulData.maxTrianglesExcceded) && m_GPUOutResulData.silhouetteIndex != 13 && !modalShown) // 13 means we're inside the cube, so don't care + + // Open modal only if not already shown/dismissed + if ((m_GPUOutResulData.edgeVisibilityMismatch || m_GPUOutResulData.maxTrianglesExceeded || m_GPUOutResulData.sphericalLuneDetected) && m_GPUOutResulData.silhouetteIndex != 13 && !modalShown && !modalDismissed) // Don't reopen if user dismissed it { - // Open modal popup only once per configuration ImGui::OpenPopup("Edge Visibility Mismatch Warning"); 
modalShown = true; } @@ -1164,19 +1228,13 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR { ImGui::TextColored(ImVec4(1.0f, 0.5f, 0.0f, 1.0f), "Warning: Edge Visibility Mismatch Detected!"); ImGui::Separator(); - ImGui::Text("The silhouette lookup table (LUT) does not match the computed edge visibility."); ImGui::Text("This indicates the pre-computed silhouette data may be incorrect."); ImGui::Spacing(); - - // Show configuration info ImGui::TextWrapped("Configuration Index: %u", m_GPUOutResulData.silhouetteIndex); ImGui::TextWrapped("Region: (%u, %u, %u)", m_GPUOutResulData.region.x, m_GPUOutResulData.region.y, m_GPUOutResulData.region.z); ImGui::Spacing(); - ImGui::Text("Mismatched Vertices (bitmask): 0x%08X", m_GPUOutResulData.edgeVisibilityMismatch); - - // Show which specific vertices are mismatched ImGui::Text("Vertices involved in mismatched edges:"); ImGui::Indent(); for (int i = 0; i < 8; i++) @@ -1188,12 +1246,12 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR } ImGui::Unindent(); ImGui::Spacing(); - if (ImGui::Button("OK", ImVec2(120, 0))) { ImGui::CloseCurrentPopup(); + modalShown = false; + modalDismissed = true; // Mark as dismissed to prevent reopening } - ImGui::EndPopup(); } @@ -1203,6 +1261,25 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR ImGui::Text("region: (%u, %u, %u)", m_GPUOutResulData.region.x, m_GPUOutResulData.region.y, m_GPUOutResulData.region.z); + // print solidAngles for each triangle (one table row per triangle, total printed below the table) + ImGui::Text("Solid Angles per Triangle:"); + if (ImGui::BeginTable("SolidAnglesTable", 2)) // EndTable() may only be called when BeginTable() returned true + { + ImGui::TableSetupColumn("Triangle Index"); + ImGui::TableSetupColumn("Solid Angle"); + ImGui::TableHeadersRow(); + for (uint32_t i = 0; i < m_GPUOutResulData.triangleCount; ++i) + { + ImGui::TableNextRow(); + ImGui::TableSetColumnIndex(0); + ImGui::Text("%u", i); + ImGui::TableSetColumnIndex(1); + ImGui::Text("%.6f", m_GPUOutResulData.solidAngles[i]); +
} + ImGui::EndTable(); + } + ImGui::Text("Total: %.6f", m_GPUOutResulData.totalSolidAngles); // submitted after the table so it is not placed inside a table cell + ImGui::Separator(); // Silhouette mask printed in binary @@ -1255,14 +1332,15 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR ImGui::Separator(); }; - static RandomSampler rng(69); // Initialize RNG with seed + static RandomSampler rng(0x45); // Initialize RNG with seed // Helper function to check if cube intersects unit sphere at origin - auto isCubeOutsideUnitSphere = [](const float32_t3& translation, const float32_t3& scale) -> bool { - float cubeRadius = glm::length(scale) * 0.5f; - float distanceToCenter = glm::length(translation); - return (distanceToCenter - cubeRadius) > 1.0f; - }; + auto isCubeOutsideUnitSphere = [](const float32_t3& translation, const float32_t3& scale) -> bool + { + float cubeRadius = glm::length(scale) * 0.5f; + float distanceToCenter = glm::length(translation); + return (distanceToCenter - cubeRadius) > 1.0f; + }; static TRS lastTRS = {}; if (ImGui::Button("Randomize Translation")) @@ -1404,7 +1482,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR // mutables struct TRS // Source of truth { - float32_t3 translation{ 0.0f, 0.0f, 3.0f }; + float32_t3 translation{ 0.0f, 0.0f, 1.5f }; float32_t3 rotation{ 0.0f }; // MUST stay orthonormal float32_t3 scale{ 1.0f }; } m_TRS; @@ -1415,7 +1493,6 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR TransformReturnInfo mainViewTransformReturnInfo; TransformReturnInfo solidAngleViewTransformReturnInfo; - const static inline core::vectorSIMDf cameraIntialPosition{ -3.0f, 6.0f, 3.0f }; const static inline core::vectorSIMDf cameraInitialTarget{ 0.f, 0.0f, 3.f }; const static inline core::vectorSIMDf cameraInitialUp{ 0.f, 0.f, 1.f }; @@ -1425,7 +1502,289 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR //uint16_t gcIndex = {}; // note: this is dirty however since I assume
only single object in scene I can leave it now, when this example is upgraded to support multiple objects this needs to be changed bool isPerspective = true, isLH = true, flipGizmoY = true, move = true; bool firstFrame = true; + + SolidAngleVisualizer* m_visualizer; } interface; + + class SamplingBenchmark final + { + public: + SamplingBenchmark(SolidAngleVisualizer& base) + : m_api(base.m_api), m_device(base.m_device), m_logger(base.m_logger), m_visualizer(&base) + { + + // setting up pipeline in the constructor + m_queueFamily = base.getComputeQueue()->getFamilyIndex(); + m_cmdpool = base.m_device->createCommandPool(m_queueFamily, IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + //core::smart_refctd_ptr* cmdBuffs[] = { &m_cmdbuf, &m_timestampBeforeCmdBuff, &m_timestampAfterCmdBuff }; + if (!m_cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_cmdbuf)) + base.logFail("Failed to create Command Buffers!\n"); + if (!m_cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_timestampBeforeCmdBuff)) + base.logFail("Failed to create Command Buffers!\n"); + if (!m_cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_timestampAfterCmdBuff)) + base.logFail("Failed to create Command Buffers!\n"); + + // Load shaders, set up pipeline + { + smart_refctd_ptr shader; + { + IAssetLoader::SAssetLoadParams lp = {}; + lp.logger = base.m_logger.get(); + lp.workingDirectory = "app_resources"; // virtual root + // this time we load a shader directly from a file + auto key = nbl::this_example::builtin::build::get_spirv_key<"benchmark">(m_device.get()); + auto assetBundle = base.m_assetMgr->getAsset(key.data(), lp); + const auto assets = assetBundle.getContents(); + if (assets.empty()) + { + base.logFail("Could not load shader!"); + assert(0); + } + + // It would be super weird if loading a shader from a file produced more than 1 asset + assert(assets.size() == 1); + shader = IAsset::castDown(assets[0]); + 
} + + if (!shader) + base.logFail("Failed to load precompiled \"benchmark\" shader!\n"); + + nbl::video::IGPUDescriptorSetLayout::SBinding bindings[1] = { + { + .binding = 0, + .type = nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, + .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = ShaderStage::ESS_COMPUTE, + .count = 1 + } + }; + smart_refctd_ptr dsLayout = base.m_device->createDescriptorSetLayout(bindings); + if (!dsLayout) + base.logFail("Failed to create a Descriptor Layout!\n"); + + SPushConstantRange pushConstantRanges[] = { + { + .stageFlags = ShaderStage::ESS_COMPUTE, + .offset = 0, + .size = sizeof(BenchmarkPushConstants) + } + }; + m_pplnLayout = base.m_device->createPipelineLayout(pushConstantRanges, smart_refctd_ptr(dsLayout)); + if (!m_pplnLayout) + base.logFail("Failed to create a Pipeline Layout!\n"); + + { + IGPUComputePipeline::SCreationParams params = {}; + params.layout = m_pplnLayout.get(); + params.shader.entryPoint = "main"; + params.shader.shader = shader.get(); + if (!base.m_device->createComputePipelines(nullptr, { &params,1 }, &m_pipeline)) // one-element span over the creation params + base.logFail("Failed to create pipelines (compile & link shaders)!\n"); + } + + // Allocate the memory + { + constexpr size_t BufferSize = BENCHMARK_WORKGROUP_COUNT * BENCHMARK_WORKGROUP_DIMENSION_SIZE_X * + BENCHMARK_WORKGROUP_DIMENSION_SIZE_Y * BENCHMARK_WORKGROUP_DIMENSION_SIZE_Z * sizeof(uint32_t); + + nbl::video::IGPUBuffer::SCreationParams params = {}; + params.size = BufferSize; + params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT; + smart_refctd_ptr dummyBuff = base.m_device->createBuffer(std::move(params)); + if (!dummyBuff) + base.logFail("Failed to create a GPU Buffer of size %d!\n", BufferSize); // params was moved-from above, so report the requested size directly + + dummyBuff->setObjectDebugName("benchmark buffer"); + + nbl::video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = dummyBuff->getMemoryReqs(); + + m_allocation = base.m_device->allocate(reqs, dummyBuff.get(),
nbl::video::IDeviceMemoryAllocation::EMAF_NONE); + if (!m_allocation.isValid()) + base.logFail("Failed to allocate Device Memory compatible with our GPU Buffer!\n"); + + assert(dummyBuff->getBoundMemory().memory == m_allocation.memory.get()); + smart_refctd_ptr pool = base.m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE, { &dsLayout.get(),1 }); + + m_ds = pool->createDescriptorSet(std::move(dsLayout)); + { + IGPUDescriptorSet::SDescriptorInfo info[1]; + info[0].desc = smart_refctd_ptr(dummyBuff); + info[0].info.buffer = { .offset = 0,.size = BufferSize }; + IGPUDescriptorSet::SWriteDescriptorSet writes[1] = { + {.dstSet = m_ds.get(),.binding = 0,.arrayElement = 0,.count = 1,.info = info} + }; + base.m_device->updateDescriptorSets(writes, {}); + } + } + } + + IQueryPool::SCreationParams queryPoolCreationParams{}; + queryPoolCreationParams.queryType = IQueryPool::TYPE::TIMESTAMP; + queryPoolCreationParams.queryCount = 2; + queryPoolCreationParams.pipelineStatisticsFlags = IQueryPool::PIPELINE_STATISTICS_FLAGS::NONE; + m_queryPool = m_device->createQueryPool(queryPoolCreationParams); + + m_computeQueue = m_device->getQueue(m_queueFamily, 0); + } + + void run() + { + m_logger->log("\n\nsampling benchmark result:", ILogger::ELL_PERFORMANCE); + m_logger->log("sampling benchmark, triangle solid angle result:", ILogger::ELL_PERFORMANCE); + performBenchmark(SAMPLING_BENCHMARK_MODE::TRIANGLE_SOLID_ANGLE, SAMPLING_MODE_SOLID_ANGLE); + + m_logger->log("sampling benchmark, triangle projected solid angle result:", ILogger::ELL_PERFORMANCE); + performBenchmark(SAMPLING_BENCHMARK_MODE::TRIANGLE_PROJECTED_SOLID_ANGLE, SAMPLING_MODE_PROJECTED_SOLID_ANGLE); + } + + private: + void performBenchmark(SAMPLING_BENCHMARK_MODE mode, uint32_t solidAngleMode) + { + m_device->waitIdle(); + + recordTimestampQueryCmdBuffers(); + + uint64_t semaphoreCounter = 0; + smart_refctd_ptr semaphore = m_device->createSemaphore(semaphoreCounter); + + 
IQueue::SSubmitInfo::SSemaphoreInfo signals[] = { {.semaphore = semaphore.get(), .value = 0u, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT} }; + IQueue::SSubmitInfo::SSemaphoreInfo waits[] = { {.semaphore = semaphore.get(), .value = 0u, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT } }; + + IQueue::SSubmitInfo beforeTimestapSubmitInfo[1] = {}; + const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufsBegin[] = { {.cmdbuf = m_timestampBeforeCmdBuff.get()} }; + beforeTimestapSubmitInfo[0].commandBuffers = cmdbufsBegin; + beforeTimestapSubmitInfo[0].signalSemaphores = signals; + beforeTimestapSubmitInfo[0].waitSemaphores = waits; + + IQueue::SSubmitInfo afterTimestapSubmitInfo[1] = {}; + const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufsEnd[] = { {.cmdbuf = m_timestampAfterCmdBuff.get()} }; + afterTimestapSubmitInfo[0].commandBuffers = cmdbufsEnd; + afterTimestapSubmitInfo[0].signalSemaphores = signals; + afterTimestapSubmitInfo[0].waitSemaphores = waits; + + IQueue::SSubmitInfo benchmarkSubmitInfos[1] = {}; + const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[] = { {.cmdbuf = m_cmdbuf.get()} }; + benchmarkSubmitInfos[0].commandBuffers = cmdbufs; + benchmarkSubmitInfos[0].signalSemaphores = signals; + benchmarkSubmitInfos[0].waitSemaphores = waits; + + + m_pushConstants.benchmarkMode = mode; + m_pushConstants.samplingMode = solidAngleMode; + m_pushConstants.modelMatrix = float32_t3x4(transpose(m_visualizer->interface.m_OBBModelMatrix)); + recordCmdBuff(); + + // warmup runs + for (int i = 0; i < WarmupIterations; ++i) + { + if (i == 0) + m_api->startCapture(); + waits[0].value = semaphoreCounter; + signals[0].value = ++semaphoreCounter; + m_computeQueue->submit(benchmarkSubmitInfos); + if (i == 0) + m_api->endCapture(); + } + + waits[0].value = semaphoreCounter; + signals[0].value = ++semaphoreCounter; + m_computeQueue->submit(beforeTimestapSubmitInfo); + + // actual benchmark runs + for (int i = 0; i < Iterations; ++i) + { + 
waits[0].value = semaphoreCounter; + signals[0].value = ++semaphoreCounter; + m_computeQueue->submit(benchmarkSubmitInfos); + } + + waits[0].value = semaphoreCounter; + signals[0].value = ++semaphoreCounter; + m_computeQueue->submit(afterTimestapSubmitInfo); + + m_device->waitIdle(); + + const uint64_t nativeBenchmarkTimeElapsedNanoseconds = calcTimeElapsed(); + const float nativeBenchmarkTimeElapsedSeconds = double(nativeBenchmarkTimeElapsedNanoseconds) / 1000000000.0; + + m_logger->log("%llu ns, %f s", ILogger::ELL_PERFORMANCE, nativeBenchmarkTimeElapsedNanoseconds, nativeBenchmarkTimeElapsedSeconds); + } + + void recordCmdBuff() + { + m_cmdbuf->begin(IGPUCommandBuffer::USAGE::SIMULTANEOUS_USE_BIT); + m_cmdbuf->beginDebugMarker("sampling compute dispatch", vectorSIMDf(0, 1, 0, 1)); + m_cmdbuf->bindComputePipeline(m_pipeline.get()); + m_cmdbuf->bindDescriptorSets(nbl::asset::EPBP_COMPUTE, m_pplnLayout.get(), 0, 1, &m_ds.get()); + m_cmdbuf->pushConstants(m_pplnLayout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(BenchmarkPushConstants), &m_pushConstants); + m_cmdbuf->dispatch(BENCHMARK_WORKGROUP_COUNT, 1, 1); + m_cmdbuf->endDebugMarker(); + m_cmdbuf->end(); + } + + void recordTimestampQueryCmdBuffers() // records the two one-shot command buffers that write timestamp queries 0 (before) and 1 (after) + { + static bool firstInvocation = true; + + if (!firstInvocation) + { + m_timestampBeforeCmdBuff->reset(IGPUCommandBuffer::RESET_FLAGS::NONE); + m_timestampAfterCmdBuff->reset(IGPUCommandBuffer::RESET_FLAGS::NONE); // both buffers are re-recorded below, so both must be reset + } + + m_timestampBeforeCmdBuff->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + m_timestampBeforeCmdBuff->resetQueryPool(m_queryPool.get(), 0, 2); + m_timestampBeforeCmdBuff->writeTimestamp(PIPELINE_STAGE_FLAGS::NONE, m_queryPool.get(), 0); + m_timestampBeforeCmdBuff->end(); + + m_timestampAfterCmdBuff->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + m_timestampAfterCmdBuff->writeTimestamp(PIPELINE_STAGE_FLAGS::NONE, m_queryPool.get(), 1); + m_timestampAfterCmdBuff->end(); + + firstInvocation = false; + } + + uint64_t
calcTimeElapsed() + { + uint64_t timestamps[2]; + const core::bitflag flags = core::bitflag(IQueryPool::RESULTS_FLAGS::_64_BIT) | core::bitflag(IQueryPool::RESULTS_FLAGS::WAIT_BIT); + m_device->getQueryPoolResults(m_queryPool.get(), 0, 2, &timestamps, sizeof(uint64_t), flags); // reads queries 0 and 1 as 64-bit values, waiting for availability + return timestamps[1] - timestamps[0]; // NOTE(review): raw query-value difference; the caller logs it as nanoseconds - confirm whether the device timestamp period must be applied + } + + private: + core::smart_refctd_ptr m_api; + smart_refctd_ptr m_device; + smart_refctd_ptr m_logger; + SolidAngleVisualizer* m_visualizer; + + nbl::video::IDeviceMemoryAllocator::SAllocation m_allocation = {}; + smart_refctd_ptr m_cmdpool = nullptr; + smart_refctd_ptr m_cmdbuf = nullptr; + smart_refctd_ptr m_ds = nullptr; + smart_refctd_ptr m_pplnLayout = nullptr; + BenchmarkPushConstants m_pushConstants; + smart_refctd_ptr m_pipeline; + + smart_refctd_ptr m_timestampBeforeCmdBuff = nullptr; + smart_refctd_ptr m_timestampAfterCmdBuff = nullptr; + smart_refctd_ptr m_queryPool = nullptr; + + uint32_t m_queueFamily; + IQueue* m_computeQueue; + static constexpr int WarmupIterations = 50; + static constexpr int Iterations = 1; + }; + + template + inline bool logFail(const char* msg, Args&&...
args) + { + m_logger->log(msg, ILogger::ELL_ERROR, std::forward(args)...); + return false; + } + + std::ofstream m_logFile; }; + NBL_MAIN_FUNC(SolidAngleVisualizer) \ No newline at end of file From 3e39f036cda70bc7a8e4dccdfe99d59a60b0a263 Mon Sep 17 00:00:00 2001 From: Karim Mohamed Date: Wed, 21 Jan 2026 04:55:25 +0300 Subject: [PATCH 16/26] Projected Parallelogram sampling --- 73_SolidAngleVisualizer/CMakeLists.txt | 2 +- .../app_resources/hlsl/Drawing.hlsl | 19 +- .../app_resources/hlsl/RayVis.frag.hlsl | 273 ++++++--- .../app_resources/hlsl/Sampling.hlsl | 87 +-- .../hlsl/SolidAngleVis.frag.hlsl | 72 ++- .../hlsl/benchmark/benchmark.comp.hlsl | 70 ++- .../app_resources/hlsl/benchmark/common.hlsl | 14 +- .../app_resources/hlsl/common.hlsl | 42 +- .../app_resources/hlsl/gpu_common.hlsl | 5 +- .../hlsl/parallelogram_sampling.hlsl | 535 ++++++++++++++++++ .../app_resources/hlsl/silhouette.hlsl | 55 +- .../app_resources/hlsl/utils.hlsl | 7 + 73_SolidAngleVisualizer/main.cpp | 524 ++++++++--------- 13 files changed, 1215 insertions(+), 490 deletions(-) create mode 100644 73_SolidAngleVisualizer/app_resources/hlsl/parallelogram_sampling.hlsl diff --git a/73_SolidAngleVisualizer/CMakeLists.txt b/73_SolidAngleVisualizer/CMakeLists.txt index f1701829f..6438c8e06 100644 --- a/73_SolidAngleVisualizer/CMakeLists.txt +++ b/73_SolidAngleVisualizer/CMakeLists.txt @@ -43,9 +43,9 @@ if(NBL_BUILD_IMGUI) app_resources/hlsl/gpu_common.hlsl app_resources/hlsl/Drawing.hlsl app_resources/hlsl/Sampling.hlsl - app_resources/hlsl/Sampling.hlsl app_resources/hlsl/silhouette.hlsl app_resources/hlsl/utils.hlsl + app_resources/hlsl/parallelogram_sampling.hlsl # app_resources/hlsl/test.comp.hlsl app_resources/hlsl/benchmark/benchmark.comp.hlsl diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/Drawing.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/Drawing.hlsl index 1a2962c78..fa2a93b45 100644 --- a/73_SolidAngleVisualizer/app_resources/hlsl/Drawing.hlsl +++ 
b/73_SolidAngleVisualizer/app_resources/hlsl/Drawing.hlsl @@ -4,16 +4,12 @@ #include "common.hlsl" #include "gpu_common.hlsl" -#if DEBUG_DATA // Check if a face on the hemisphere is visible from camera at origin bool isFaceVisible(float32_t3 faceCenter, float32_t3 faceNormal) { float32_t3 viewVec = normalize(-faceCenter); // Vector from camera to face return dot(faceNormal, viewVec) > 0.0f; } -#endif // DEBUG_DATA - -#if VISUALIZE_SAMPLES // doesn't change Z coordinate float32_t3 sphereToCircle(float32_t3 spherePoint) @@ -30,6 +26,8 @@ float32_t3 sphereToCircle(float32_t3 spherePoint) } } +#if VISUALIZE_SAMPLES + float32_t drawGreatCircleArc(float32_t3 fragPos, float32_t3 points[2], float32_t aaWidth, float32_t width = 0.01f) { float32_t3 v0 = normalize(points[0]); @@ -103,8 +101,8 @@ float32_t4 drawHiddenEdges(float32_t3x4 modelMatrix, float32_t3 spherePos, uint3 } float32_t3 pts[2] = {p0, p1}; - float32_t4 c = drawGreatCircleArc(spherePos, pts, aaWidth, 0.005f); - color += float32_t4(hiddenEdgeColor * c.a, c.a); + float32_t c = drawGreatCircleArc(spherePos, pts, aaWidth, 0.003f); + color += float32_t4(hiddenEdgeColor * c, c); } return color; @@ -128,7 +126,7 @@ float32_t4 drawCorner(float32_t3 cornerNDCPos, float32_t2 ndc, float32_t aaWidth // ------------------------------------------------- // inner black dot for hidden corners // ------------------------------------------------- - if (cornerNDCPos.z < 0.0f) + if (cornerNDCPos.z < 0.0f && innerDotSize > 0.0) { float32_t innerAlpha = 1.0f - smoothstep(innerDotSize - aaWidth, innerDotSize + aaWidth, @@ -191,23 +189,22 @@ float32_t arrowHead(float32_t2 ndc, float32_t2 tip, float32_t2 direction, float3 } // Helper to draw an edge with proper color mapping -float32_t4 drawEdge(uint32_t originalEdgeIdx, float32_t3 pts[2], float32_t3 spherePos, float32_t aaWidth, float32_t width = 0.01f) +float32_t4 drawEdge(uint32_t originalEdgeIdx, float32_t3 pts[2], float32_t3 spherePos, float32_t aaWidth, float32_t width = 0.003f) 
{ float32_t4 edgeContribution = drawGreatCircleArc(spherePos, pts, aaWidth, width); return float32_t4(colorLUT[originalEdgeIdx] * edgeContribution.a, edgeContribution.a); }; -float32_t4 drawCorners(float32_t3x4 modelMatrix, float32_t2 ndc, float32_t aaWidth) +float32_t4 drawCorners(float32_t3x4 modelMatrix, float32_t2 ndc, float32_t aaWidth, float32_t dotSize) { float32_t4 color = float32_t4(0, 0, 0, 0); - float32_t dotSize = 0.02f; float32_t innerDotSize = dotSize * 0.5f; for (uint32_t i = 0; i < 8; i++) { float32_t3 cornerCirclePos = sphereToCircle(normalize(getVertex(modelMatrix, i))); - color += drawCorner(cornerCirclePos, ndc, aaWidth, dotSize, innerDotSize, colorLUT[i]); + color += drawCorner(cornerCirclePos, ndc, aaWidth, dotSize, 0.0, colorLUT[i]); } return color; diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/RayVis.frag.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/RayVis.frag.hlsl index 2b4d7e3ef..a8a1ff52d 100644 --- a/73_SolidAngleVisualizer/app_resources/hlsl/RayVis.frag.hlsl +++ b/73_SolidAngleVisualizer/app_resources/hlsl/RayVis.frag.hlsl @@ -7,9 +7,20 @@ using namespace nbl::hlsl; using namespace ext::FullScreenTriangle; +// Visualizes a ray as an arrow from origin in NDC space +// Returns color (rgb), intensity (a), and depth (in extra component) +struct ArrowResult +{ + float32_t4 color : SV_Target0; + float32_t depth : SV_Depth; +}; + [[vk::push_constant]] struct PushConstantRayVis pc; +// #if DEBUG_DATA [[vk::binding(0, 0)]] RWStructuredBuffer DebugDataBuffer; -#define VISUALIZE_SAMPLES 1 +// #endif + +#if VISUALIZE_SAMPLES #include "Drawing.hlsl" // Ray-AABB intersection in world space @@ -46,71 +57,101 @@ float32_t2 projectToNDC(float32_t3 worldPos, float32_t4x4 viewProj, float32_t as return clipPos.xy; } -// Visualizes a ray as an arrow from origin in NDC space -// Returns color (rgb), intensity (a), and depth (in extra component) -struct ArrowResult -{ - float32_t4 color : SV_Target0; - float32_t depth : SV_Depth; -}; - 
ArrowResult visualizeRayAsArrow(float32_t3 rayOrigin, float32_t4 directionAndPdf, float32_t arrowLength, float32_t2 ndcPos, float32_t aspect) { ArrowResult result; result.color = float32_t4(0, 0, 0, 0); - result.depth = 0.0; + result.depth = 1.0; // Far plane in reversed-Z float32_t3 rayDir = normalize(directionAndPdf.xyz); float32_t pdf = directionAndPdf.w; - float32_t3 rayEnd = rayOrigin + rayDir * arrowLength; + // Define the 3D line segment + float32_t3 worldStart = rayOrigin; + float32_t3 worldEnd = rayOrigin + rayDir * arrowLength; + + // Transform to view space (camera space) for clipping + float32_t4x4 viewMatrix = pc.viewProjMatrix; // If you have view matrix separately, use that + // For now, we'll work in clip space and check w values + + float32_t4 clipStart = mul(pc.viewProjMatrix, float32_t4(worldStart, 1.0)); + float32_t4 clipEnd = mul(pc.viewProjMatrix, float32_t4(worldEnd, 1.0)); + + // Clip against near plane (w = 0 plane in clip space) + // If both points are behind camera, reject + if (clipStart.w <= 0.001 && clipEnd.w <= 0.001) + return result; + + // If line crosses the near plane, clip it + float32_t t0 = 0.0; + float32_t t1 = 1.0; + + if (clipStart.w <= 0.001) + { + // Start is behind camera, clip to near plane + float32_t t = (0.001 - clipStart.w) / (clipEnd.w - clipStart.w); + t0 = saturate(t); + clipStart = lerp(clipStart, clipEnd, t0); + worldStart = lerp(worldStart, worldEnd, t0); + } + + if (clipEnd.w <= 0.001) + { + // End is behind camera, clip to near plane + float32_t t = (0.001 - clipStart.w) / (clipEnd.w - clipStart.w); + t1 = saturate(t); + clipEnd = lerp(clipStart, clipEnd, t1); + worldEnd = lerp(worldStart, worldEnd, t1); + } + + // Now check if the clipped segment is valid + if (t0 >= t1) + return result; - // Project start and end points to NDC space - float32_t2 ndcStart = projectToNDC(rayOrigin, pc.viewProjMatrix, aspect); - float32_t2 ndcEnd = projectToNDC(rayEnd, pc.viewProjMatrix, aspect); + // Perspective divide to NDC 
+ float32_t2 ndcStart = clipStart.xy / clipStart.w; + float32_t2 ndcEnd = clipEnd.xy / clipEnd.w; - // Get clip space positions - float32_t4 clipStart = mul(pc.viewProjMatrix, float32_t4(rayOrigin, 1.0)); - float32_t4 clipEnd = mul(pc.viewProjMatrix, float32_t4(rayEnd, 1.0)); + // Apply aspect ratio correction + ndcStart.x *= aspect; + ndcEnd.x *= aspect; - // Calculate arrow properties in NDC space - float32_t arrowNDCLength = length(ndcEnd - ndcStart); + // Calculate arrow direction in NDC + float32_t2 arrowVec = ndcEnd - ndcStart; + float32_t arrowNDCLength = length(arrowVec); - // Skip if arrow is too small on screen (in NDC units) - if (arrowNDCLength < 0.01) + // Skip if arrow is too small on screen + if (arrowNDCLength < 0.005) return result; - // Calculate the parametric position along the arrow shaft IN NDC - float32_t2 pa = ndcPos - ndcStart; - float32_t2 ba = ndcEnd - ndcStart; - float32_t t_ndc = saturate(dot(pa, ba) / dot(ba, ba)); + // Calculate perpendicular distance to line segment in NDC space + float32_t2 toPixel = ndcPos - ndcStart; + float32_t t_ndc = saturate(dot(toPixel, arrowVec) / dot(arrowVec, arrowVec)); // Draw line shaft float32_t lineThickness = 0.002; float32_t lineIntensity = lineSegment(ndcPos, ndcStart, ndcEnd, lineThickness); - // Calculate depth at this pixel's position along the arrow + // Calculate perspective-correct depth if (lineIntensity > 0.0) { - // Interpolate in CLIP space for perspective-correct depth + // Interpolate in clip space float32_t4 clipPos = lerp(clipStart, clipEnd, t_ndc); - float32_t depthNDC = clipPos.z / clipPos.w; - // Convert to reversed depth [0,1] -> [1,0] - result.depth = 1.0 - depthNDC; + // Compute NDC depth for reversed-Z + float32_t depthNDC = clipPos.z / clipPos.w; + result.depth = depthNDC; - // Clip against depth range (like hardware would) - // In reversed depth: near=1.0, far=0.0 + // Clip against valid depth range if (result.depth < 0.0 || result.depth > 1.0) { - lineIntensity = 0.0; // 
Outside depth range, clip it + lineIntensity = 0.0; } } // Modulate by PDF float32_t pdfIntensity = saturate(pdf * 0.5); - - float32_t3 finalColor = pdfIntensity; + float32_t3 finalColor = float32_t3(pdfIntensity, pdfIntensity, pdfIntensity); result.color = float32_t4(finalColor, lineIntensity); return result; @@ -141,81 +182,137 @@ float32_t3 worldToLocalDir(float32_t3 worldDir, float32_t3x4 modelMatrix) float32_t4x4 invModel = inverse(model4x4); return mul(invModel, float32_t4(worldDir, 0.0)).xyz; } + +// Returns both tMin (entry) and tMax (exit) for ray-AABB intersection +struct AABBIntersection +{ + float32_t tMin; // Distance to front face (entry point) + float32_t tMax; // Distance to back face (exit point) + bool hit; // Whether ray intersects the AABB at all +}; + +AABBIntersection rayAABBIntersectionFull(float32_t3 origin, float32_t3 dir, float32_t3 boxMin, float32_t3 boxMax) +{ + AABBIntersection result; + result.hit = false; + result.tMin = 0.0f; + result.tMax = 0.0f; + + float32_t3 invDir = 1.0f / dir; + float32_t3 t0 = (boxMin - origin) * invDir; + float32_t3 t1 = (boxMax - origin) * invDir; + + float32_t3 tmin = min(t0, t1); + float32_t3 tmax = max(t0, t1); + + result.tMin = max(max(tmin.x, tmin.y), tmin.z); + result.tMax = min(min(tmax.x, tmax.y), tmax.z); + + // Ray intersects if tMax >= tMin and tMax > 0 + result.hit = (result.tMax >= result.tMin) && (result.tMax > 0.0f); + + // If we're inside the box, tMin will be negative + // In that case, we want to use tMax (exit point) + if (result.tMin < 0.0f) + result.tMin = 0.0f; + + return result; +} +#endif // VISUALIZE_SAMPLES + [[vk::location(0)]] ArrowResult main(SVertexAttributes vx) { ArrowResult output; +#if VISUALIZE_SAMPLES output.color = float32_t4(0.0, 0.0, 0.0, 0.0); - output.depth = 0.0; // Default to far plane in reversed depth - float32_t maxDepth = 0.0; // Track the closest depth (maximum in reversed depth) + output.depth = 0.0; // Far plane in reversed-Z (near=0, far=1) + float32_t 
maxDepth = 0.0; // Track closest depth (minimum in reversed-Z) + float32_t aaWidth = length(float32_t2(ddx(vx.uv.x), ddy(vx.uv.y))); // Convert to NDC space with aspect ratio correction float32_t2 ndcPos = vx.uv * 2.0f - 1.0f; float32_t aspect = pc.viewport.z / pc.viewport.w; ndcPos.x *= aspect; - // Draw clipped silhouett vertices using drawCorners() for (uint32_t v = 0; v < DebugDataBuffer[0].clippedSilhouetteVertexCount; v++) { float32_t4 clipPos = mul(pc.viewProjMatrix, float32_t4(DebugDataBuffer[0].clippedSilhouetteVertices[v], 1.0)); - float32_t3 ndcPosVertex = clipPos.xyz / clipPos.w; // Perspective divide to get NDC - - float32_t4 intensity = drawCorner(ndcPosVertex, ndcPos, 0.005, 0.01, 0.01, float32_t3(1.0, 0.0, 0.0)); - - output.color += intensity; - output.depth = intensity > 0.0 ? 1.0 : output.depth; // Update depth - maxDepth = max(maxDepth, output.depth); - } - - int sampleCount = DebugDataBuffer[0].sampleCount; - - for (int i = 0; i < sampleCount; i++) - { - float32_t3 rayOrigin = float32_t3(0, 0, 0); - float32_t4 directionAndPdf = DebugDataBuffer[0].rayData[i]; - float32_t3 rayDir = normalize(directionAndPdf.xyz); + float32_t3 ndcPosVertex = clipPos.xyz / clipPos.w; + if (ndcPosVertex.z < maxDepth) + continue; - // Define cube bounds in local space (unit cube from -0.5 to 0.5, adjust as needed) - float32_t3 cubeLocalMin = float32_t3(-0.5, -0.5, -0.5); - float32_t3 cubeLocalMax = float32_t3(0.5, 0.5, 0.5); + float32_t4 intensity = drawCorner(ndcPosVertex, ndcPos, aaWidth, 0.03, 0.0, colorLUT[DebugDataBuffer[0].clippedSilhouetteVerticesIndices[v]]); - // Transform ray to local space of the cube - float32_t3 localRayOrigin = worldToLocal(rayOrigin, pc.modelMatrix); - float32_t3 localRayDir = normalize(worldToLocalDir(rayDir, pc.modelMatrix)); - - // Perform intersection test in local space - float32_t hitDistance = rayAABBIntersection(localRayOrigin, localRayDir, cubeLocalMin, cubeLocalMax); - - float32_t arrowLength; - if (hitDistance > 0.0) - { - // 
Calculate world space hit distance - // We need to account for the scaling in the model matrix - float32_t3 localHitPoint = localRayOrigin + localRayDir * hitDistance; - float32_t3 worldHitPoint = mul(pc.modelMatrix, float32_t4(localHitPoint, 1.0)).xyz; - arrowLength = length(worldHitPoint - rayOrigin); - } - else + // Update depth only where we drew something + if (any(intensity.rgb > 0.0)) { - // No intersection, use fallback (e.g., fixed length or distance to cube center) - float32_t3 cubeCenter = mul(pc.modelMatrix, float32_t4(0, 0, 0, 1)).xyz; - arrowLength = length(cubeCenter - rayOrigin) + 2.0; + output.color.rgb += intensity.rgb; + maxDepth = max(maxDepth, 1.0f - ndcPosVertex.z); } - - ArrowResult arrow = visualizeRayAsArrow(rayOrigin, directionAndPdf, arrowLength, ndcPos, aspect); - maxDepth = max(maxDepth, arrow.depth); - - // Additive blending - output.color.rgb += hitDistance > 0.0 ? arrow.color.rgb : float32_t3(1.0, 0.0, 0.0); - output.color.a = max(output.color.a, arrow.color.a); } + uint32_t sampleCount = DebugDataBuffer[0].sampleCount; + + // for (uint32_t i = 0; i < sampleCount; i++) + // { + // float32_t3 rayOrigin = float32_t3(0, 0, 0); + // float32_t4 directionAndPdf = DebugDataBuffer[0].rayData[i]; + // float32_t3 rayDir = normalize(directionAndPdf.xyz); + + // // Define cube bounds in local space + // float32_t3 cubeLocalMin = float32_t3(-0.5, -0.5, -0.5); + // float32_t3 cubeLocalMax = float32_t3(0.5, 0.5, 0.5); + + // // Transform ray to local space of the cube + // float32_t3 localRayOrigin = worldToLocal(rayOrigin, pc.modelMatrix); + // float32_t3 localRayDir = normalize(worldToLocalDir(rayDir, pc.modelMatrix)); + + // // Get both entry and exit distances + // AABBIntersection intersection = rayAABBIntersectionFull( + // localRayOrigin, + // localRayDir, + // cubeLocalMin, + // cubeLocalMax); + + // float32_t arrowLength; + // float32_t3 arrowColor; + + // if (intersection.hit) + // { + // // Use tMax (exit point at back face) instead of 
tMin (entry point at front face) + // float32_t3 localExitPoint = localRayOrigin + localRayDir * intersection.tMax; + // float32_t3 worldExitPoint = mul(pc.modelMatrix, float32_t4(localExitPoint, 1.0)).xyz; + // arrowLength = length(worldExitPoint - rayOrigin); + // arrowColor = float32_t3(0.0, 1.0, 0.0); // Green for valid samples + // } + // else + // { + // // Ray doesn't intersect - THIS SHOULD NEVER HAPPEN with correct sampling! + // float32_t3 cubeCenter = mul(pc.modelMatrix, float32_t4(0, 0, 0, 1)).xyz; + // arrowLength = length(cubeCenter - rayOrigin) + 2.0; + // arrowColor = float32_t3(1.0, 0.0, 0.0); // Red for BROKEN samples + // } + + // ArrowResult arrow = visualizeRayAsArrow(rayOrigin, directionAndPdf, arrowLength, ndcPos, aspect); + + // // Only update depth if arrow was actually drawn + // if (arrow.color.a > 0.0) + // { + // maxDepth = max(maxDepth, arrow.depth); + // } + + // // Modulate arrow color by its alpha (only add where arrow is visible) + // output.color.rgb += arrowColor * arrow.color.a; + // output.color.a = max(output.color.a, arrow.color.a); + // } + // Clamp to prevent overflow output.color = saturate(output.color); output.color.a = 1.0; - // Write the closest depth (maximum in reversed depth) - // ONLY write depth if we actually drew something - output.depth = output.color.a > 0.0 ? 
maxDepth : 0.0; + // Write the closest depth (maximum in reversed-Z) + output.depth = maxDepth; +#endif return output; -} \ No newline at end of file +} diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/Sampling.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/Sampling.hlsl index 9caf83246..cefa65267 100644 --- a/73_SolidAngleVisualizer/app_resources/hlsl/Sampling.hlsl +++ b/73_SolidAngleVisualizer/app_resources/hlsl/Sampling.hlsl @@ -2,17 +2,15 @@ #define _SAMPLING_HLSL_ // Include the spherical triangle utilities -#include +#include "gpu_common.hlsl" +#include "parallelogram_sampling.hlsl" #include #include #include -#include "nbl/builtin/hlsl/random/pcg.hlsl" -#include "nbl/builtin/hlsl/random/xoroshiro.hlsl" +#include +#include using namespace nbl::hlsl; -// Sampling mode enum -#define SAMPLING_MODE_SOLID_ANGLE 0 -#define SAMPLING_MODE_PROJECTED_SOLID_ANGLE 1 // Maximum number of triangles we can have after clipping // Without clipping, max 3 faces can be visible at once so 3 faces * 2 triangles = 6 edges, forming max 4 triangles @@ -59,9 +57,9 @@ float32_t computeProjectedSolidAngleFallback(float32_t3 v0, float32_t3 v1, float n2 /= l2; // 3. Get arc lengths (angles in radians) - float32_t a = asin(clamp(l0, -1.0, 1.0)); // side v0-v1 - float32_t b = asin(clamp(l1, -1.0, 1.0)); // side v1-v2 - float32_t c = asin(clamp(l2, -1.0, 1.0)); // side v2-v0 + float32_t a = asin(clamp(l0, -1.0f, 1.0f)); // side v0-v1 + float32_t b = asin(clamp(l1, -1.0f, 1.0f)); // side v1-v2 + float32_t c = asin(clamp(l2, -1.0f, 1.0f)); // side v2-v0 // Handle acos/asin quadrant if dot product is negative if (dot(v0, v1) < 0) @@ -94,7 +92,7 @@ SamplingData buildSamplingDataFromSilhouette(ClippedSilhouette silhouette, uint3 const float32_t3 origin = float32_t3(0, 0, 0); // Compute face normal ONCE before the loop - silhouette is planar! 
- if (samplingMode == SAMPLING_MODE_PROJECTED_SOLID_ANGLE) + if (samplingMode == SAMPLING_MODE::TRIANGLE_PROJECTED_SOLID_ANGLE) { float32_t3 v1 = silhouette.vertices[1]; float32_t3 v2 = silhouette.vertices[2]; @@ -116,7 +114,7 @@ SamplingData buildSamplingDataFromSilhouette(ClippedSilhouette silhouette, uint3 // Calculate triangle solid angle float32_t solidAngle; - if (samplingMode == SAMPLING_MODE_PROJECTED_SOLID_ANGLE) + if (samplingMode == SAMPLING_MODE::TRIANGLE_PROJECTED_SOLID_ANGLE) { // scalar_type projectedSolidAngleOfTriangle(const vector3_type receiverNormal, NBL_REF_ARG(vector3_type) cos_sides, NBL_REF_ARG(vector3_type) csc_sides, NBL_REF_ARG(vector3_type) cos_vertices) float32_t3 cos_vertices = clamp( @@ -141,7 +139,7 @@ SamplingData buildSamplingDataFromSilhouette(ClippedSilhouette silhouette, uint3 data.count++; } -#ifdef DEBUG_DATA +#if DEBUG_DATA // Validate no antipodal edges exist (would create spherical lune) for (uint32_t i = 0; i < silhouette.count; i++) { @@ -156,13 +154,6 @@ SamplingData buildSamplingDataFromSilhouette(ClippedSilhouette silhouette, uint3 } } DebugDataBuffer[0].maxTrianglesExceeded = (data.count > MAX_TRIANGLES); - - DebugDataBuffer[0].clippedSilhouetteVertexCount = silhouette.count; - for (uint32_t v = 0; v < silhouette.count; v++) - { - DebugDataBuffer[0].clippedSilhouetteVertices[v] = silhouette.vertices[v]; - } - DebugDataBuffer[0].triangleCount = data.count; DebugDataBuffer[0].totalSolidAngles = data.totalWeight; for (uint32_t tri = 0; tri < data.count; tri++) @@ -214,7 +205,7 @@ float32_t3 sampleFromData(SamplingData data, ClippedSilhouette silhouette, float float32_t3 v1 = silhouette.vertices[vertexIdx]; float32_t3 v2 = silhouette.vertices[vertexIdx + 1]; - float32_t3 faceNormal = normalize(cross(v1 - v0, v2 - v0)); + float32_t3 faceNormal = normalize(cross(v1 - v0, v2 - v0)); float32_t3 origin = float32_t3(0, 0, 0); @@ -232,7 +223,7 @@ float32_t3 sampleFromData(SamplingData data, ClippedSilhouette silhouette, float 
float32_t3 direction; float32_t rcpPdf; - if (data.samplingMode == SAMPLING_MODE_PROJECTED_SOLID_ANGLE) + if (data.samplingMode == SAMPLING_MODE::TRIANGLE_PROJECTED_SOLID_ANGLE) { sampling::ProjectedSphericalTriangle samplingTri = sampling::ProjectedSphericalTriangle::create(shapeTri); @@ -277,8 +268,12 @@ float32_t3 sampleFromData(SamplingData data, ClippedSilhouette silhouette, float #if VISUALIZE_SAMPLES -float32_t4 visualizeSamples(float32_t2 screenUV, float32_t3 spherePos, ClippedSilhouette silhouette, - uint32_t samplingMode, uint32_t frameIndex, SamplingData samplingData, uint32_t numSamples, inout RWStructuredBuffer DebugDataBuffer) +float32_t4 visualizeSamples(float32_t2 screenUV, float32_t3 spherePos, float32_t2 ndc, float32_t aaWidth, ClippedSilhouette silhouette, SAMPLING_MODE samplingMode, uint32_t frameIndex, SamplingData samplingData, uint32_t numSamples +#if DEBUG_DATA + , + inout RWStructuredBuffer DebugDataBuffer +#endif +) { float32_t4 accumColor = 0; @@ -289,27 +284,49 @@ float32_t4 visualizeSamples(float32_t2 screenUV, float32_t3 spherePos, ClippedSi float32_t2 pssPos = float32_t2(0.01, 0.01); // Offset from corner bool isInsidePSS = all(and(screenUV >= pssPos, screenUV <= (pssPos + pssSize))); + ParallelogramSilhouette paraSilhouette = buildParallelogram(silhouette, ndc, spherePos, aaWidth, accumColor); + +#if DEBUG_DATA DebugDataBuffer[0].sampleCount = numSamples; +#endif for (uint32_t i = 0; i < numSamples; i++) { - nbl::hlsl::random::PCG32 seedGen = nbl::hlsl::random::PCG32::construct(frameIndex * 65536u + i); - const uint32_t seed1 = seedGen(); - const uint32_t seed2 = seedGen(); - nbl::hlsl::Xoroshiro64StarStar rnd = nbl::hlsl::Xoroshiro64StarStar::construct(uint32_t2(seed1, seed2)); - float32_t2 xi = nextRandomUnorm2(rnd); - float32_t pdf; - uint32_t triIdx; - float32_t3 sampleDir = sampleFromData(samplingData, silhouette, xi, pdf, triIdx); + // Hash the invocation to offset the grid + uint32_t offset = i * 747796405u + 2891336453u; + 
uint32_t idx = (offset) & 63u; // Keep within 64 samples + float32_t2 xi = float32_t2( + (float32_t(idx & 7u) + 0.5) / 8.0f, + (float32_t(idx >> 3u) + 0.5) / 8.0f); + float32_t pdf; + uint32_t index = 0; + float32_t3 sampleDir; + if (samplingMode == SAMPLING_MODE::TRIANGLE_SOLID_ANGLE || + samplingMode == SAMPLING_MODE::TRIANGLE_PROJECTED_SOLID_ANGLE) + { + sampleDir = sampleFromData(samplingData, silhouette, xi, pdf, index); + } + else if (samplingMode == SAMPLING_MODE::PROJECTED_PARALLELOGRAM_SOLID_ANGLE) + { + bool valid; + sampleDir = sampleFromParallelogram(paraSilhouette, xi, pdf, valid); + if (!valid) + { + pdf = 0.0f; + sampleDir = float32_t3(0, 0, 1); + } + } +#if DEBUG_DATA DebugDataBuffer[0].rayData[i] = float32_t4(sampleDir, pdf); +#endif float32_t dist3D = distance(sampleDir, normalize(spherePos)); float32_t alpha3D = 1.0f - smoothstep(0.0f, 0.02f, dist3D); if (alpha3D > 0.0f && !isInsidePSS) { - float32_t3 sampleColor = colorLUT[triIdx].rgb; + float32_t3 sampleColor = colorLUT[index].rgb; accumColor += float32_t4(sampleColor * alpha3D, alpha3D); } @@ -322,7 +339,7 @@ float32_t4 visualizeSamples(float32_t2 screenUV, float32_t3 spherePos, ClippedSi float32_t alpha2D = drawCross2D(screenUV, xiPixelPos, 0.005f, 0.001f); if (alpha2D > 0.0f) { - float32_t3 sampleColor = colorLUT[triIdx].rgb; + float32_t3 sampleColor = colorLUT[index].rgb; accumColor += float32_t4(sampleColor * alpha2D, alpha2D); } } @@ -334,5 +351,5 @@ float32_t4 visualizeSamples(float32_t2 screenUV, float32_t3 spherePos, ClippedSi return accumColor; } -#endif -#endif +#endif // VISUALIZE_SAMPLES +#endif // _SAMPLING_HLSL_ diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl index 79791af57..bd9312733 100644 --- a/73_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl +++ b/73_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl @@ -6,9 +6,9 @@ using namespace nbl::hlsl; 
using namespace ext::FullScreenTriangle; +#if DEBUG_DATA [[vk::binding(0, 0)]] RWStructuredBuffer DebugDataBuffer; // TODO: move below other includes - -#define VISUALIZE_SAMPLES 1 +#endif #include "utils.hlsl" #include "Drawing.hlsl" @@ -120,10 +120,25 @@ void computeSpherePos(SVertexAttributes vx, out float32_t2 ndc, out float32_t3 s #else computeSilhouette(pc.modelMatrix, vertexCount, sil, silhouette); #endif - // Draw clipped silhouette vertices - // color += drawClippedSilhouetteVertices(ndc, silhouette, aaWidth); - SamplingData samplingData = buildSamplingDataFromSilhouette(silhouette, pc.samplingMode); + SamplingData samplingData; + ParallelogramSilhouette paraSilhouette; + if (pc.samplingMode == SAMPLING_MODE::TRIANGLE_SOLID_ANGLE || + pc.samplingMode == SAMPLING_MODE::TRIANGLE_PROJECTED_SOLID_ANGLE) + { + samplingData = buildSamplingDataFromSilhouette(silhouette, pc.samplingMode); + } + else + { + + paraSilhouette = buildParallelogram(silhouette +#if VISUALIZE_SAMPLES + , + ndc, spherePos, aaWidth, color +#endif + ); + } + #if VISUALIZE_SAMPLES // For debugging: Draw a small indicator of which faces are found @@ -131,27 +146,50 @@ void computeSpherePos(SVertexAttributes vx, out float32_t2 ndc, out float32_t3 s // color += drawFaces(pc.modelMatrix, spherePos, aaWidth); - // Draw samples on sphere - color += visualizeSamples(vx.uv, spherePos, silhouette, pc.samplingMode, pc.frameIndex, samplingData, 64, DebugDataBuffer); - + // Draw clipped silhouette vertices + // color += drawClippedSilhouetteVertices(ndc, silhouette, aaWidth); color += drawHiddenEdges(pc.modelMatrix, spherePos, silEdgeMask, aaWidth); - color += drawCorners(pc.modelMatrix, ndc, aaWidth); + // color += drawCorners(pc.modelMatrix, ndc, aaWidth, 0.05f); color += drawRing(ndc, aaWidth); - if (all(vx.uv >= float32_t2(0.49f, 0.49f)) && all(vx.uv <= float32_t2(0.51f, 0.51f))) + // Draw samples on sphere + color += visualizeSamples(vx.uv, spherePos, ndc, aaWidth, silhouette, pc.samplingMode, 
pc.frameIndex, samplingData, pc.sampleCount +#if DEBUG_DATA + , + DebugDataBuffer +#endif + ); + + if (all(vx.uv >= float32_t2(0.f, 0.97f)) && all(vx.uv <= float32_t2(0.03f, 1.0f))) { return float32_t4(colorLUT[configIndex], 1.0f); } #else - nbl::hlsl::random::PCG32 seedGen = nbl::hlsl::random::PCG32::construct(65536u + i); - const uint32_t2 seeds = uint32_t2(seedGen(), seedGen()); - nbl::hlsl::Xoroshiro64StarStar rnd = nbl::hlsl::Xoroshiro64StarStar::construct(seeds); - float32_t2 xi = nextRandomUnorm2(rnd); + // Hash the invocation to offset the grid + uint32_t offset = 747796405u + 2891336453u; + uint32_t idx = (offset) & 63u; // Keep within 64 samples + float32_t2 xi = float32_t2( + (float32_t(idx & 7u) + 0.5) / 8.0f, + (float32_t(idx >> 3u) + 0.5) / 8.0f); float32_t pdf; - uint32_t triIdx; - float32_t3 sampleDir = sampleFromData(samplingData, silhouette, xi, pdf, triIdx); - + uint32_t index = 0; + float32_t3 sampleDir; + if (pc.samplingMode == SAMPLING_MODE::TRIANGLE_SOLID_ANGLE || + pc.samplingMode == SAMPLING_MODE::TRIANGLE_PROJECTED_SOLID_ANGLE) + { + sampleDir = sampleFromData(samplingData, silhouette, xi, pdf, index); + } + else if (pc.samplingMode == SAMPLING_MODE::PROJECTED_PARALLELOGRAM_SOLID_ANGLE) + { + bool valid; + sampleDir = sampleFromParallelogram(paraSilhouette, xi, pdf, valid); + if (!valid) + { + pdf = 0.0f; + sampleDir = float32_t3(0, 0, 1); + } + } color += float4(sampleDir * 0.02f / pdf, 1.0f); #endif // VISUALIZE_SAMPLES setDebugData(sil, region, configIndex); diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/benchmark/benchmark.comp.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/benchmark/benchmark.comp.hlsl index 6d04538a5..0ea7c2afb 100644 --- a/73_SolidAngleVisualizer/app_resources/hlsl/benchmark/benchmark.comp.hlsl +++ b/73_SolidAngleVisualizer/app_resources/hlsl/benchmark/benchmark.comp.hlsl @@ -2,10 +2,30 @@ //// This file is part of the "Nabla Engine". 
//// For conditions of distribution and use, see copyright notice in nabla.h #pragma shader_stage(compute) -#define DEBUG_DATA 0 + +#include "app_resources/hlsl/common.hlsl" +// doesn't change Z coordinate +float32_t3 sphereToCircle(float32_t3 spherePoint) +{ + if (spherePoint.z >= 0.0f) + { + return float32_t3(spherePoint.xy, spherePoint.z); + } + else + { + float32_t r2 = (1.0f - spherePoint.z) / (1.0f + spherePoint.z); + float32_t uv2Plus1 = r2 + 1.0f; + return float32_t3((spherePoint.xy * uv2Plus1 / 2.0f), spherePoint.z); + } +} + +#undef DEBUG_DATA // Avoid conflict with DebugDataBuffer in this file +#undef VISUALIZE_SAMPLES + #include "app_resources/hlsl/benchmark/common.hlsl" #include "app_resources/hlsl/silhouette.hlsl" #include "app_resources/hlsl/Sampling.hlsl" +#include "app_resources/hlsl/parallelogram_sampling.hlsl" using namespace nbl::hlsl; @@ -14,30 +34,50 @@ using namespace nbl::hlsl; [numthreads(BENCHMARK_WORKGROUP_DIMENSION_SIZE_X, 1, 1)] [shader("compute")] void - main(uint3 invocationID : SV_DispatchThreadID) + main(uint32_t3 invocationID : SV_DispatchThreadID) { + // Perturb model matrix slightly per sample group + float32_t3x4 perturbedMatrix = pc.modelMatrix; + perturbedMatrix[0][3] += float32_t(invocationID.x) * 1e-6f; + uint32_t3 region; uint32_t configIndex; uint32_t vertexCount; - uint32_t sil = computeRegionAndConfig(pc.modelMatrix, region, configIndex, vertexCount); + uint32_t sil = computeRegionAndConfig(perturbedMatrix, region, configIndex, vertexCount); ClippedSilhouette silhouette; - computeSilhouette(pc.modelMatrix, vertexCount, sil, silhouette); - - SamplingData samplingData; - samplingData = buildSamplingDataFromSilhouette(silhouette, pc.samplingMode); - - nbl::hlsl::random::PCG32 seedGen = nbl::hlsl::random::PCG32::construct(65536u + invocationID.x); - const uint32_t2 seeds = uint32_t2(seedGen(), seedGen()); - + computeSilhouette(perturbedMatrix, vertexCount, sil, silhouette); float32_t pdf; uint32_t triIdx; float32_t3 
sampleDir = float32_t3(0.0, 0.0, 0.0); - for (uint32_t i = 0; i < 64; i++) + if (pc.benchmarkMode == SAMPLING_MODE::TRIANGLE_SOLID_ANGLE || + pc.benchmarkMode == SAMPLING_MODE::TRIANGLE_PROJECTED_SOLID_ANGLE) { - nbl::hlsl::Xoroshiro64StarStar rnd = nbl::hlsl::Xoroshiro64StarStar::construct(seeds); - float32_t2 xi = nextRandomUnorm2(rnd); - sampleDir += sampleFromData(samplingData, silhouette, xi, pdf, triIdx); + SamplingData samplingData; + samplingData = buildSamplingDataFromSilhouette(silhouette, pc.benchmarkMode); + + for (uint32_t i = 0; i < 64; i++) + { + float32_t2 xi = float32_t2( + (float32_t(i & 7u) + 0.5f) / 8.0f, + (float32_t(i >> 3u) + 0.5f) / 8.0f); + + sampleDir += sampleFromData(samplingData, silhouette, xi, pdf, triIdx); + } + } + else if (pc.benchmarkMode == SAMPLING_MODE::PROJECTED_PARALLELOGRAM_SOLID_ANGLE) + { + // Precompute parallelogram for sampling + ParallelogramSilhouette paraSilhouette = buildParallelogram(silhouette); + for (uint32_t i = 0; i < 64; i++) + { + float32_t2 xi = float32_t2( + (float32_t(i & 7u) + 0.5f) / 8.0f, + (float32_t(i >> 3u) + 0.5f) / 8.0f); + + bool valid; + sampleDir += sampleFromParallelogram(paraSilhouette, xi, pdf, valid); + } } const uint32_t offset = sizeof(uint32_t) * invocationID.x; diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/benchmark/common.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/benchmark/common.hlsl index d54ee8a36..3091bc793 100644 --- a/73_SolidAngleVisualizer/app_resources/hlsl/benchmark/common.hlsl +++ b/73_SolidAngleVisualizer/app_resources/hlsl/benchmark/common.hlsl @@ -7,17 +7,5 @@ NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR uint32_t BENCHMARK_WORKGROUP_DIMENSION_SIZE_X = 64u; NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR uint32_t BENCHMARK_WORKGROUP_DIMENSION_SIZE_Y = 1u; NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR uint32_t BENCHMARK_WORKGROUP_DIMENSION_SIZE_Z = 1u; -NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR uint32_t BENCHMARK_WORKGROUP_COUNT = 1920u * 1080u / BENCHMARK_WORKGROUP_DIMENSION_SIZE_X; 
+NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR uint32_t BENCHMARK_WORKGROUP_COUNT = 1000000u; -enum SAMPLING_BENCHMARK_MODE -{ - TRIANGLE_SOLID_ANGLE, - TRIANGLE_PROJECTED_SOLID_ANGLE, -}; - -struct BenchmarkPushConstants -{ - float32_t3x4 modelMatrix; - uint32_t samplingMode; - SAMPLING_BENCHMARK_MODE benchmarkMode; -}; \ No newline at end of file diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/common.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/common.hlsl index db2f328b5..9e4954ebc 100644 --- a/73_SolidAngleVisualizer/app_resources/hlsl/common.hlsl +++ b/73_SolidAngleVisualizer/app_resources/hlsl/common.hlsl @@ -1,21 +1,40 @@ #ifndef _SOLID_ANGLE_VIS_COMMON_HLSL_ #define _SOLID_ANGLE_VIS_COMMON_HLSL_ -#include "nbl/builtin/hlsl/cpp_compat.hlsl" -// Sampling mode enum -#define SAMPLING_MODE_SOLID_ANGLE 0 -#define SAMPLING_MODE_PROJECTED_SOLID_ANGLE 1 +#include "nbl/builtin/hlsl/cpp_compat.hlsl" +#define DEBUG_DATA 01 +#define VISUALIZE_SAMPLES 01 -#define DEBUG_DATA 1 #define FAST 1 namespace nbl { namespace hlsl { + // Sampling mode enum + enum SAMPLING_MODE : uint32_t + { + TRIANGLE_SOLID_ANGLE, + TRIANGLE_PROJECTED_SOLID_ANGLE, + PROJECTED_PARALLELOGRAM_SOLID_ANGLE + }; struct ResultData { + uint32_t parallelogramDoesNotBound; + float32_t parallelogramArea; + uint32_t failedVertexIndex; + uint32_t edgeIsConvex[4]; + + uint32_t parallelogramVerticesInside; + uint32_t parallelogramEdgesInside; + uint32_t failedEdgeIndex; + float32_t2 failedVertexUV; + float32_t3 failedPoint; + uint32_t failedEdgeSample; + float32_t2 failedEdgeUV; + float32_t2 parallelogramCorners[4]; + uint32_t3 region; uint32_t silhouetteIndex; @@ -38,11 +57,14 @@ namespace nbl uint32_t clippedSilhouetteVertexCount; float32_t3 clippedSilhouetteVertices[7]; + uint32_t clippedSilhouetteVerticesIndices[7]; uint32_t triangleCount; float32_t solidAngles[5]; float32_t totalSolidAngles; + uint32_t sampleOutsideSilhouette; + // Sampling ray visualization data uint32_t sampleCount; float32_t4 
rayData[64]; // xyz = direction, w = PDF @@ -52,18 +74,26 @@ namespace nbl { float32_t3x4 modelMatrix; float32_t4 viewport; - uint32_t samplingMode; + SAMPLING_MODE samplingMode; + uint32_t sampleCount; uint32_t frameIndex; }; struct PushConstantRayVis { float32_t4x4 viewProjMatrix; + float32_t3x4 viewMatrix; float32_t3x4 modelMatrix; float32_t4 viewport; uint32_t frameIndex; }; + struct BenchmarkPushConstants + { + float32_t3x4 modelMatrix; + SAMPLING_MODE benchmarkMode; + }; + static const float32_t3 colorLUT[27] = { float32_t3(0, 0, 0), float32_t3(0.5, 0.5, 0.5), float32_t3(1, 0, 0), float32_t3(0, 1, 0), float32_t3(0, 0, 1), diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/gpu_common.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/gpu_common.hlsl index d4ef71d07..040883956 100644 --- a/73_SolidAngleVisualizer/app_resources/hlsl/gpu_common.hlsl +++ b/73_SolidAngleVisualizer/app_resources/hlsl/gpu_common.hlsl @@ -1,7 +1,8 @@ #ifndef GPU_COMMON_HLSL #define GPU_COMMON_HLSL -static const float32_t CIRCLE_RADIUS = 0.5f; +static const float32_t CIRCLE_RADIUS = 1.0f; +static const float32_t INV_CIRCLE_RADIUS = 1.0f / CIRCLE_RADIUS; // --- Geometry Utils --- struct ClippedSilhouette @@ -136,7 +137,7 @@ bool getVertexZNeg(float32_t3x4 modelMatrix, uint32_t vertexIdx) (vertexIdx & 2) ? 0.5f : -0.5f, (vertexIdx & 4) ? 
0.5f : -0.5f); - float32_t transformedZ = dot(modelMatrix[2].xyz, localPos) + modelMatrix[2].w; + float32_t transformedZ = nbl::hlsl::dot(modelMatrix[2].xyz, localPos) + modelMatrix[2].w; return transformedZ < 0.0f; #else return corners[vertexIdx].z < 0.0f; diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/parallelogram_sampling.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/parallelogram_sampling.hlsl new file mode 100644 index 000000000..ea9bebcb3 --- /dev/null +++ b/73_SolidAngleVisualizer/app_resources/hlsl/parallelogram_sampling.hlsl @@ -0,0 +1,535 @@ +#ifndef _PARALLELOGRAM_SAMPLING_HLSL_ +#define _PARALLELOGRAM_SAMPLING_HLSL_ + +#include +#include + +#define MAX_SILHOUETTE_VERTICES 7 +#define MAX_CURVE_APEXES 2 +#define GET_PROJ_VERT(i) vertices[i].xy *CIRCLE_RADIUS + +// ============================================================================ +// Core structures +// ============================================================================ + +struct Parallelogram +{ + float16_t2 corner; + float16_t2 axisDir; + float16_t width; + float16_t height; +}; + +struct PrecomputedSilhouette +{ + float16_t3 edgeNormals[MAX_SILHOUETTE_VERTICES]; // 10.5 floats instead of 21 + uint32_t count; +}; + +struct ParallelogramSilhouette +{ + Parallelogram para; + PrecomputedSilhouette silhouette; +}; + +// ============================================================================ +// Silhouette helpers +// ============================================================================ + +PrecomputedSilhouette precomputeSilhouette(NBL_CONST_REF_ARG(ClippedSilhouette) sil) +{ + PrecomputedSilhouette result; + result.count = sil.count; + + float32_t3 v0 = sil.vertices[0]; + float32_t3 v1 = sil.vertices[1]; + float32_t3 v2 = sil.vertices[2]; + + result.edgeNormals[0] = float16_t3(cross(v0, v1)); + result.edgeNormals[1] = float16_t3(cross(v1, v2)); + + if (sil.count > 3) + { + float32_t3 v3 = sil.vertices[3]; + result.edgeNormals[2] = float16_t3(cross(v2, v3)); + + 
if (sil.count > 4) + { + float32_t3 v4 = sil.vertices[4]; + result.edgeNormals[3] = float16_t3(cross(v3, v4)); + + if (sil.count > 5) + { + float32_t3 v5 = sil.vertices[5]; + result.edgeNormals[4] = float16_t3(cross(v4, v5)); + + if (sil.count > 6) + { + float32_t3 v6 = sil.vertices[6]; + result.edgeNormals[5] = float16_t3(cross(v5, v6)); + result.edgeNormals[6] = float16_t3(cross(v6, v0)); + } + else + { + result.edgeNormals[5] = float16_t3(cross(v5, v0)); + result.edgeNormals[6] = float16_t3(0.0f, 0.0f, 0.0f); + } + } + else + { + result.edgeNormals[4] = float16_t3(cross(v4, v0)); + result.edgeNormals[5] = float16_t3(0.0f, 0.0f, 0.0f); + result.edgeNormals[6] = float16_t3(0.0f, 0.0f, 0.0f); + } + } + else + { + result.edgeNormals[3] = float16_t3(cross(v3, v0)); + result.edgeNormals[4] = float16_t3(0.0f, 0.0f, 0.0f); + result.edgeNormals[5] = float16_t3(0.0f, 0.0f, 0.0f); + result.edgeNormals[6] = float16_t3(0.0f, 0.0f, 0.0f); + } + } + else + { + result.edgeNormals[2] = float16_t3(cross(v2, v0)); + result.edgeNormals[3] = float16_t3(0.0f, 0.0f, 0.0f); + result.edgeNormals[4] = float16_t3(0.0f, 0.0f, 0.0f); + result.edgeNormals[5] = float16_t3(0.0f, 0.0f, 0.0f); + result.edgeNormals[6] = float16_t3(0.0f, 0.0f, 0.0f); + } + + return result; +} + +bool isInsideSilhouetteFast(float32_t3 dir, NBL_CONST_REF_ARG(PrecomputedSilhouette) sil) +{ + float16_t3 d = float16_t3(dir); + half maxDot = dot(d, sil.edgeNormals[0]); + maxDot = max(maxDot, dot(d, sil.edgeNormals[1])); + maxDot = max(maxDot, dot(d, sil.edgeNormals[2])); + maxDot = max(maxDot, dot(d, sil.edgeNormals[3])); + maxDot = max(maxDot, dot(d, sil.edgeNormals[4])); + maxDot = max(maxDot, dot(d, sil.edgeNormals[5])); + maxDot = max(maxDot, dot(d, sil.edgeNormals[6])); + return maxDot <= half(0.0f); +} +float32_t3 circleToSphere(float32_t2 circlePoint) +{ + float32_t2 xy = circlePoint / CIRCLE_RADIUS; + float32_t xy_len_sq = dot(xy, xy); + + // if (xy_len_sq >= 1.0f) + // return float32_t3(0, 0, 0); + + return 
float32_t3(xy, sqrt(1.0f - xy_len_sq)); +} + +bool isEdgeConvex(float32_t3 S, float32_t3 E) +{ + return nbl::hlsl::cross2D(S.xy, E.xy) < -1e-6f; +} + +// ============================================================================ +// Curve evaluation helpers +// ============================================================================ + +// Evaluate curve point at t using rsqrt +float32_t2 evalCurvePoint(float32_t3 S, float32_t3 E, float32_t t) +{ + float32_t3 v = S + t * (E - S); + float32_t invLen = rsqrt(dot(v, v)); + return v.xy * (invLen * CIRCLE_RADIUS); +} + +// Evaluate tangent at arbitrary t +float32_t2 evalCurveTangent(float32_t3 S, float32_t3 E, float32_t t) +{ + float32_t3 v = S + t * (E - S); + float32_t vLenSq = dot(v, v); + + if (vLenSq < 1e-12f) + return normalize(E.xy - S.xy); + + float32_t3 p = v * rsqrt(vLenSq); + float32_t3 vPrime = E - S; + float32_t2 tangent2D = (vPrime - p * dot(p, vPrime)).xy; + + float32_t len = length(tangent2D); + return (len > 1e-7f) ? tangent2D / len : normalize(E.xy - S.xy); +} + +// Get both endpoint tangents efficiently (shares SdotE computation) +void getProjectedTangents(float32_t3 S, float32_t3 E, out float32_t2 t0, out float32_t2 t1) +{ + float32_t SdotE = dot(S, E); + + float32_t2 tangent0_2D = (E - S * SdotE).xy; + float32_t2 tangent1_2D = (E * SdotE - S).xy; + + float32_t len0Sq = dot(tangent0_2D, tangent0_2D); + float32_t len1Sq = dot(tangent1_2D, tangent1_2D); + + const float32_t eps = 1e-14f; + + if (len0Sq > eps && len1Sq > eps) + { + t0 = tangent0_2D * rsqrt(len0Sq); + t1 = tangent1_2D * rsqrt(len1Sq); + return; + } + + // Rare fallback path + float32_t2 diff = E.xy - S.xy; + float32_t diffLenSq = dot(diff, diff); + float32_t2 fallback = diffLenSq > eps ? diff * rsqrt(diffLenSq) : float32_t2(1.0f, 0.0f); + + t0 = len0Sq > eps ? tangent0_2D * rsqrt(len0Sq) : fallback; + t1 = len1Sq > eps ? 
tangent1_2D * rsqrt(len1Sq) : fallback; +} + +// Compute apex with clamping to prevent apex explosion +void computeApexClamped(float32_t2 p0, float32_t2 p1, float32_t2 t0, float32_t2 t1, out float32_t2 apex) +{ + float32_t denom = t0.x * t1.y - t0.y * t1.x; + float32_t2 center = (p0 + p1) * 0.5f; + + if (abs(denom) < 1e-6f) + { + apex = center; + return; + } + + float32_t2 dp = p1 - p0; + float32_t s = (dp.x * t1.y - dp.y * t1.x) / denom; + apex = p0 + s * t0; + + float32_t2 toApex = apex - center; + float32_t distSq = dot(toApex, toApex); + float32_t maxDistSq = CIRCLE_RADIUS * CIRCLE_RADIUS * 4.0f; + + if (distSq > maxDistSq) + { + apex = center + toApex * (CIRCLE_RADIUS * 2.0f * rsqrt(distSq)); + } +} + +void testPoint(inout float32_t minAlong, inout float32_t maxAlong, inout float32_t minPerp, inout float32_t maxPerp, float32_t2 pt, float32_t2 axisDir, float32_t2 perpDir) +{ + float32_t projAlong = dot(pt, axisDir); + float32_t projPerp = dot(pt, perpDir); + + minAlong = min(minAlong, projAlong); + maxAlong = max(maxAlong, projAlong); + minPerp = min(minPerp, projPerp); + maxPerp = max(maxPerp, projPerp); +} + +template +void testEdgeForAxisFast(inout float32_t minAlong, inout float32_t maxAlong, inout float32_t minPerp, inout float32_t maxPerp, + uint32_t count, uint32_t n3Mask, float32_t2 axisDir, float32_t2 perpDir, + const float32_t3 vertices[MAX_SILHOUETTE_VERTICES]) +{ + const uint32_t nextIdx = (I + 1 < count) ? 
I + 1 : 0; + + testPoint(minAlong, maxAlong, minPerp, maxPerp, GET_PROJ_VERT(I), axisDir, perpDir); + + if (n3Mask & (1u << I)) + { + float32_t2 midPoint = evalCurvePoint(vertices[I], vertices[nextIdx], 0.5f); + testPoint(minAlong, maxAlong, minPerp, maxPerp, midPoint, axisDir, perpDir); + } +} + +float32_t computeBoundingBoxAreaForAxisFast(NBL_CONST_REF_ARG(float32_t3) vertices[MAX_SILHOUETTE_VERTICES], uint32_t n3Mask, uint32_t count, float32_t2 axisDir) +{ + float32_t2 perpDir = float32_t2(-axisDir.y, axisDir.x); + + float32_t minAlong = 1e10f; + float32_t maxAlong = -1e10f; + float32_t minPerp = 1e10f; + float32_t maxPerp = -1e10f; + + testEdgeForAxisFast<0>(minAlong, maxAlong, minPerp, maxPerp, count, n3Mask, axisDir, perpDir, vertices); + testEdgeForAxisFast<1>(minAlong, maxAlong, minPerp, maxPerp, count, n3Mask, axisDir, perpDir, vertices); + testEdgeForAxisFast<2>(minAlong, maxAlong, minPerp, maxPerp, count, n3Mask, axisDir, perpDir, vertices); + if (count > 3) + { + testEdgeForAxisFast<3>(minAlong, maxAlong, minPerp, maxPerp, count, n3Mask, axisDir, perpDir, vertices); + if (count > 4) + { + testEdgeForAxisFast<4>(minAlong, maxAlong, minPerp, maxPerp, count, n3Mask, axisDir, perpDir, vertices); + if (count > 5) + { + testEdgeForAxisFast<5>(minAlong, maxAlong, minPerp, maxPerp, count, n3Mask, axisDir, perpDir, vertices); + if (count > 6) + { + testEdgeForAxisFast<6>(minAlong, maxAlong, minPerp, maxPerp, count, n3Mask, axisDir, perpDir, vertices); + } + } + } + } + + return (maxAlong - minAlong) * (maxPerp - minPerp); +} + +void tryCaliperDir(inout float32_t bestArea, inout float32_t2 bestDir, const float32_t2 dir, const float32_t3 vertices[MAX_SILHOUETTE_VERTICES], uint32_t n3Mask, uint32_t count) +{ + float32_t area = computeBoundingBoxAreaForAxisFast(vertices, n3Mask, count, dir); + + if (area < bestArea) + { + bestArea = area; + bestDir = dir; + } +} + +template +inline void processEdge(inout float32_t bestArea, inout float32_t2 bestDir, inout uint32_t 
convexMask, inout uint32_t n3Mask, uint32_t count, const float32_t3 vertices[MAX_SILHOUETTE_VERTICES]) +{ + const uint32_t nextIdx = (I + 1 < count) ? I + 1 : 0; + float32_t3 S = vertices[I]; + float32_t3 E = vertices[nextIdx]; + + float32_t2 t0, t1; + getProjectedTangents(S, E, t0, t1); + + tryCaliperDir(bestArea, bestDir, t0, vertices, n3Mask, count); + + if (isEdgeConvex(S, E)) + { + convexMask |= (1u << I); + tryCaliperDir(bestArea, bestDir, t1, vertices, n3Mask, count); + + if (dot(t0, t1) < 0.5f) + { + n3Mask |= (1u << I); + float32_t2 tangentAtMid = evalCurveTangent(S, E, 0.5f); + tryCaliperDir(bestArea, bestDir, tangentAtMid, vertices, n3Mask, count); + } + } +} + +template +inline void testEdgeForAxisAccurate(inout float32_t minAlong, inout float32_t maxAlong, inout float32_t minPerp, inout float32_t maxPerp, uint32_t count, uint32_t convexMask, uint32_t n3Mask, + float32_t2 axisDir, float32_t2 perpDir, const float32_t3 vertices[MAX_SILHOUETTE_VERTICES]) +{ + const uint32_t nextIdx = (I + 1 < count) ? 
I + 1 : 0; + float32_t2 projectedVertex = vertices[I].xy * CIRCLE_RADIUS; + + testPoint(minAlong, maxAlong, minPerp, maxPerp, projectedVertex, axisDir, perpDir); + + bool isN3 = (n3Mask & (1u << I)) != 0; + bool isConvex = (convexMask & (1u << I)) != 0; + + if (!isN3 && !isConvex) + return; + + float32_t3 S = vertices[I]; + float32_t3 E = vertices[nextIdx]; + float32_t2 midPoint = evalCurvePoint(S, E, 0.5f); + + if (isN3) + { + testPoint(minAlong, maxAlong, minPerp, maxPerp, midPoint, axisDir, perpDir); + } + + if (isConvex) + { + float32_t2 t0, endTangent; + getProjectedTangents(S, E, t0, endTangent); + + if (dot(t0, perpDir) > 0.0f) + { + float32_t2 apex0; + if (isN3) + { + float32_t2 tangentAtMid = evalCurveTangent(S, E, 0.5f); + computeApexClamped(projectedVertex, midPoint, t0, tangentAtMid, apex0); + testPoint(minAlong, maxAlong, minPerp, maxPerp, apex0, axisDir, perpDir); + + if (dot(tangentAtMid, perpDir) > 0.0f) + { + float32_t2 apex1; + computeApexClamped(midPoint, E.xy * CIRCLE_RADIUS, tangentAtMid, endTangent, apex1); + testPoint(minAlong, maxAlong, minPerp, maxPerp, apex1, axisDir, perpDir); + } + } + else + { + computeApexClamped(projectedVertex, E.xy * CIRCLE_RADIUS, t0, endTangent, apex0); + testPoint(minAlong, maxAlong, minPerp, maxPerp, apex0, axisDir, perpDir); + } + } + } +} + +Parallelogram buildParallelogramForAxisAccurate(const float32_t3 vertices[MAX_SILHOUETTE_VERTICES], uint32_t convexMask, uint32_t n3Mask, uint32_t count, float32_t2 axisDir) +{ + float32_t2 perpDir = float32_t2(-axisDir.y, axisDir.x); + + float32_t minAlong = 1e10f; + float32_t maxAlong = -1e10f; + float32_t minPerp = 1e10f; + float32_t maxPerp = -1e10f; + + testEdgeForAxisAccurate<0>(minAlong, maxAlong, minPerp, maxPerp, count, convexMask, n3Mask, axisDir, perpDir, vertices); + testEdgeForAxisAccurate<1>(minAlong, maxAlong, minPerp, maxPerp, count, convexMask, n3Mask, axisDir, perpDir, vertices); + testEdgeForAxisAccurate<2>(minAlong, maxAlong, minPerp, maxPerp, count, 
convexMask, n3Mask, axisDir, perpDir, vertices); + if (count > 3) + { + testEdgeForAxisAccurate<3>(minAlong, maxAlong, minPerp, maxPerp, count, convexMask, n3Mask, axisDir, perpDir, vertices); + if (count > 4) + { + testEdgeForAxisAccurate<4>(minAlong, maxAlong, minPerp, maxPerp, count, convexMask, n3Mask, axisDir, perpDir, vertices); + if (count > 5) + { + testEdgeForAxisAccurate<5>(minAlong, maxAlong, minPerp, maxPerp, count, convexMask, n3Mask, axisDir, perpDir, vertices); + if (count > 6) + { + testEdgeForAxisAccurate<6>(minAlong, maxAlong, minPerp, maxPerp, count, convexMask, n3Mask, axisDir, perpDir, vertices); + } + } + } + } + + Parallelogram result; + result.width = float16_t(maxAlong - minAlong); + result.height = float16_t(maxPerp - minPerp); + result.axisDir = float16_t2(axisDir); + result.corner = float16_t2(minAlong * axisDir + minPerp * float16_t2(-axisDir.y, axisDir.x)); + + return result; +} + +Parallelogram findMinimumBoundingBoxCurved(const float32_t3 vertices[MAX_SILHOUETTE_VERTICES], uint32_t count +#if VISUALIZE_SAMPLES + , + float32_t2 ndc, float32_t3 spherePos, float32_t aaWidth, + inout float32_t4 color +#endif +) +{ + uint32_t convexMask = 0; + uint32_t n3Mask = 0; + float32_t bestArea = 1e10f; + float32_t2 bestDir = float32_t2(1.0f, 0.0f); + + processEdge<0>(bestArea, bestDir, convexMask, n3Mask, count, vertices); + processEdge<1>(bestArea, bestDir, convexMask, n3Mask, count, vertices); + processEdge<2>(bestArea, bestDir, convexMask, n3Mask, count, vertices); + if (count > 3) + { + processEdge<3>(bestArea, bestDir, convexMask, n3Mask, count, vertices); + if (count > 4) + { + processEdge<4>(bestArea, bestDir, convexMask, n3Mask, count, vertices); + if (count > 5) + { + processEdge<5>(bestArea, bestDir, convexMask, n3Mask, count, vertices); + if (count > 6) + { + processEdge<6>(bestArea, bestDir, convexMask, n3Mask, count, vertices); + } + } + } + } + + tryCaliperDir(bestArea, bestDir, float32_t2(1.0f, 0.0f), vertices, n3Mask, count); + 
tryCaliperDir(bestArea, bestDir, float32_t2(0.0f, 1.0f), vertices, n3Mask, count); + + Parallelogram best = buildParallelogramForAxisAccurate(vertices, convexMask, n3Mask, count, bestDir); + +#if VISUALIZE_SAMPLES + for (uint32_t i = 0; i < count; i++) + { + if (convexMask & (1u << i)) + { + uint32_t nextIdx = (i + 1) % count; + float32_t2 p0 = vertices[i].xy * CIRCLE_RADIUS; + float32_t2 p1 = vertices[nextIdx].xy * CIRCLE_RADIUS; + + float32_t2 t0, endTangent; + getProjectedTangents(vertices[i], vertices[nextIdx], t0, endTangent); + + if (n3Mask & (1u << i)) + { + float32_t2 tangentAtMid = evalCurveTangent(vertices[i], vertices[nextIdx], 0.5f); + float32_t2 midPoint = evalCurvePoint(vertices[i], vertices[nextIdx], 0.5f); + + float32_t2 apex0, apex1; + computeApexClamped(p0, midPoint, t0, tangentAtMid, apex0); + computeApexClamped(midPoint, p1, tangentAtMid, endTangent, apex1); + + color += drawCorner(float32_t3(apex0, 0.0f), ndc, aaWidth, 0.03, 0.0f, float32_t3(1, 0, 1)); + color += drawCorner(float32_t3(midPoint, 0.0f), ndc, aaWidth, 0.02, 0.0f, float32_t3(0, 1, 0)); + color += drawCorner(float32_t3(apex1, 0.0f), ndc, aaWidth, 0.03, 0.0f, float32_t3(1, 0.5, 0)); + } + else + { + float32_t2 apex; + computeApexClamped(p0, p1, t0, endTangent, apex); + color += drawCorner(float32_t3(apex, 0.0f), ndc, aaWidth, 0.03, 0.0f, float32_t3(1, 0, 1)); + } + } + } +#endif + + return best; +} +// ============================================================================ +// Main entry points +// ============================================================================ + +ParallelogramSilhouette buildParallelogram(NBL_CONST_REF_ARG(ClippedSilhouette) silhouette +#if VISUALIZE_SAMPLES + , + float32_t2 ndc, float32_t3 spherePos, float32_t aaWidth, + inout float32_t4 color +#endif +) +{ + ParallelogramSilhouette result; + + // if (silhouette.count < 3) + // { + // result.para.corner = float32_t2(0, 0); + // result.para.edge0 = float32_t2(1, 0); + // result.para.edge1 = 
float32_t2(0, 1); + // result.para.area = 1.0f; + // return result; + // } + + result.para = findMinimumBoundingBoxCurved(silhouette.vertices, silhouette.count +#if VISUALIZE_SAMPLES + , + ndc, spherePos, aaWidth, color +#endif + ); + +#if DEBUG_DATA + DebugDataBuffer[0].parallelogramArea = result.para.width * result.para.height; +#endif + result.silhouette = precomputeSilhouette(silhouette); + + return result; +} + +float32_t3 sampleFromParallelogram(NBL_CONST_REF_ARG(ParallelogramSilhouette) paraSilhouette, float32_t2 xi, out float32_t pdf, out bool valid) +{ + float16_t2 axisDir = paraSilhouette.para.axisDir; + float16_t2 perpDir = float16_t2(-axisDir.y, axisDir.x); + + float16_t2 circleXY = paraSilhouette.para.corner + + float16_t(xi.x) * paraSilhouette.para.width * axisDir + + float16_t(xi.y) * paraSilhouette.para.height * perpDir; + + float32_t3 direction = circleToSphere(circleXY); + + valid = (direction.z > 0.0f) && isInsideSilhouetteFast(direction, paraSilhouette.silhouette); + pdf = valid ? 
(1.0f / (paraSilhouette.para.width * paraSilhouette.para.height)) : 0.0f; + + return direction; +} + +#endif // _PARALLELOGRAM_SAMPLING_HLSL_ diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/silhouette.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/silhouette.hlsl index 05d913e01..504db2db9 100644 --- a/73_SolidAngleVisualizer/app_resources/hlsl/silhouette.hlsl +++ b/73_SolidAngleVisualizer/app_resources/hlsl/silhouette.hlsl @@ -1,19 +1,25 @@ #ifndef _SILHOUETTE_HLSL_ #define _SILHOUETTE_HLSL_ -#include "gpu_common.hlsl" +#include "gpu_common.hlsl" #include "utils.hlsl" +// Special index values for clip points +static const uint32_t CLIP_POINT_A = 23; // Clip point between last positive and first negative +static const uint32_t CLIP_POINT_B = 24; // Clip point between last negative and first positive + // Compute region and configuration index from model matrix uint32_t computeRegionAndConfig(float32_t3x4 modelMatrix, out uint32_t3 region, out uint32_t configIndex, out uint32_t vertexCount) { float32_t4x3 columnModel = transpose(modelMatrix); float32_t3 obbCenter = columnModel[3].xyz; float32_t3x3 upper3x3 = (float32_t3x3)columnModel; + float32_t3 rcpSqScales = rcp(float32_t3( dot(upper3x3[0], upper3x3[0]), dot(upper3x3[1], upper3x3[1]), dot(upper3x3[2], upper3x3[2]))); + float32_t3 normalizedProj = mul(upper3x3, obbCenter) * rcpSqScales; region = uint32_t3( @@ -23,9 +29,10 @@ uint32_t computeRegionAndConfig(float32_t3x4 modelMatrix, out uint32_t3 region, configIndex = region.x + region.y * 3u + region.z * 9u; - uint32_t sil = packSilhouette(silhouettes[configIndex]); - // uint32_t sil = binSilhouettes[configIndex]; + // uint32_t sil = packSilhouette(silhouettes[configIndex]); + uint32_t sil = binSilhouettes[configIndex]; vertexCount = getSilhouetteSize(sil); + return sil; } @@ -45,6 +52,7 @@ computeSilhouette(float32_t3x4 modelMatrix, uint32_t vertexCount, uint32_t sil #if VISUALIZE_SAMPLES float32_t4 color = float32_t4(0, 0, 0, 0); #endif + 
silhouette.count = 0; // Build clip mask (z < 0) @@ -74,9 +82,10 @@ computeSilhouette(float32_t3x4 modelMatrix, uint32_t vertexCount, uint32_t sil { uint32_t i0 = i; uint32_t i1 = (i + 1) % vertexCount; - float32_t3 v0 = getVertex(modelMatrix, getSilhouetteVertex(sil, i0)); - silhouette.vertices[silhouette.count++] = v0; + silhouette.vertices[silhouette.count] = v0; + silhouette.indices[silhouette.count++] = i0; // Original index (no rotation) + #if VISUALIZE_SAMPLES float32_t3 v1 = getVertex(modelMatrix, getSilhouetteVertex(sil, i1)); float32_t3 pts[2] = {v0, v1}; @@ -89,20 +98,19 @@ computeSilhouette(float32_t3x4 modelMatrix, uint32_t vertexCount, uint32_t sil // Rotate clip mask so positives come first uint32_t invertedMask = ~clipMask & ((1u << vertexCount) - 1u); - bool wrapAround = ((clipMask & 1u) != 0u) && - ((clipMask & (1u << (vertexCount - 1))) != 0u); + bool wrapAround = ((clipMask & 1u) != 0u) && ((clipMask & (1u << (vertexCount - 1))) != 0u); uint32_t rotateAmount = wrapAround ? 
firstbitlow(invertedMask) // -> First POSITIVE : firstbithigh(clipMask) + 1; // -> First vertex AFTER last negative uint32_t rotatedClipMask = rotr(clipMask, rotateAmount, vertexCount); uint32_t rotatedSil = rotr(sil, rotateAmount * 3, vertexCount * 3); - uint32_t positiveCount = vertexCount - clipCount; // ALWAYS compute both clip points uint32_t lastPosIdx = positiveCount - 1; uint32_t firstNegIdx = positiveCount; + float32_t3 vLastPos = getVertex(modelMatrix, getSilhouetteVertex(rotatedSil, lastPosIdx)); float32_t3 vFirstNeg = getVertex(modelMatrix, getSilhouetteVertex(rotatedSil, firstNegIdx)); float32_t t = vLastPos.z / (vLastPos.z - vFirstNeg.z); @@ -118,18 +126,23 @@ computeSilhouette(float32_t3x4 modelMatrix, uint32_t vertexCount, uint32_t sil { // Get raw vertex float32_t3 v0 = getVertex(modelMatrix, getSilhouetteVertex(rotatedSil, i)); - bool isLastPositive = (i == positiveCount - 1); bool useClipA = (clipCount > 0) && isLastPositive; -#if VISUALIZE_SAMPLES - float32_t3 v1 = useClipA ? clipA - : getVertex(modelMatrix, getSilhouetteVertex(rotatedSil, (i + 1) % vertexCount)); + // Compute original index before rotation + uint32_t originalIndex = (i + rotateAmount) % vertexCount; +#if VISUALIZE_SAMPLES + float32_t3 v1 = useClipA ? 
clipA : getVertex(modelMatrix, getSilhouetteVertex(rotatedSil, (i + 1) % vertexCount)); float32_t3 pts[2] = {normalize(v0), normalize(v1)}; color += drawEdge((i + 1) % vertexCount, pts, spherePos, aaWidth); #endif - silhouette.vertices[silhouette.count++] = v0; + +#if DEBUG_DATA + DebugDataBuffer[0].clippedSilhouetteVertices[silhouette.count] = v0; + DebugDataBuffer[0].clippedSilhouetteVerticesIndices[silhouette.count] = originalIndex; +#endif + silhouette.vertices[silhouette.count++] = normalize(v0); } if (clipCount > 0 && clipCount < vertexCount) @@ -143,11 +156,22 @@ computeSilhouette(float32_t3x4 modelMatrix, uint32_t vertexCount, uint32_t sil float32_t3 arcPts[2] = {normalize(clipA), normalize(clipB)}; color += drawEdge(23, arcPts, spherePos, aaWidth, 0.6f); #endif - silhouette.vertices[silhouette.count++] = clipA; - silhouette.vertices[silhouette.count++] = clipB; + +#if DEBUG_DATA + DebugDataBuffer[0].clippedSilhouetteVertices[silhouette.count] = clipA; + DebugDataBuffer[0].clippedSilhouetteVerticesIndices[silhouette.count] = CLIP_POINT_A; +#endif + silhouette.vertices[silhouette.count++] = normalize(clipA); + +#if DEBUG_DATA + DebugDataBuffer[0].clippedSilhouetteVertices[silhouette.count] = clipB; + DebugDataBuffer[0].clippedSilhouetteVerticesIndices[silhouette.count] = CLIP_POINT_B; +#endif + silhouette.vertices[silhouette.count++] = normalize(clipB); } #if DEBUG_DATA + DebugDataBuffer[0].clippedSilhouetteVertexCount = silhouette.count; DebugDataBuffer[0].clipMask = clipMask; DebugDataBuffer[0].clipCount = clipCount; DebugDataBuffer[0].rotatedClipMask = rotatedClipMask; @@ -156,6 +180,7 @@ computeSilhouette(float32_t3x4 modelMatrix, uint32_t vertexCount, uint32_t sil DebugDataBuffer[0].wrapAround = (uint32_t)wrapAround; DebugDataBuffer[0].rotatedSil = rotatedSil; #endif + #if VISUALIZE_SAMPLES return color; #endif diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/utils.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/utils.hlsl index 
f01667bf0..e4bf804cb 100644 --- a/73_SolidAngleVisualizer/app_resources/hlsl/utils.hlsl +++ b/73_SolidAngleVisualizer/app_resources/hlsl/utils.hlsl @@ -39,4 +39,11 @@ uint32_t packSilhouette(const uint32_t s[7]) return packed; } +float32_t2 hammersleySample(uint32_t i, uint32_t numSamples) +{ + return float32_t2( + float32_t(i) / float32_t(numSamples), + float32_t(reversebits(i)) / 4294967295.0f); +} + #endif // _UTILS_HLSL_ diff --git a/73_SolidAngleVisualizer/main.cpp b/73_SolidAngleVisualizer/main.cpp index 4c32069ff..9d9941da3 100644 --- a/73_SolidAngleVisualizer/main.cpp +++ b/73_SolidAngleVisualizer/main.cpp @@ -20,10 +20,12 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR inline static std::string SolidAngleVisShaderPath = "app_resources/hlsl/SolidAngleVis.frag.hlsl"; inline static std::string RayVisShaderPath = "app_resources/hlsl/RayVis.frag.hlsl"; + public: inline SolidAngleVisualizer(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD), - device_base_t({ 2048,1024 }, EF_UNKNOWN, _localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) { + device_base_t({ 2048, 1024 }, EF_UNKNOWN, _localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) + { } inline bool onAppInitialized(smart_refctd_ptr&& system) override @@ -44,60 +46,48 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR { if (!pool) return logFail("Couldn't create Command Pool!"); - if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { m_cmdBufs.data() + i,1 })) + if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { m_cmdBufs.data() + i, 1 })) return logFail("Couldn't create Command Buffer!"); } const uint32_t addtionalBufferOwnershipFamilies[] = { getGraphicsQueue()->getFamilyIndex() }; m_scene = 
CGeometryCreatorScene::create( - { - .transferQueue = getTransferUpQueue(), - .utilities = m_utils.get(), - .logger = m_logger.get(), - .addtionalBufferOwnershipFamilies = addtionalBufferOwnershipFamilies - }, - CSimpleDebugRenderer::DefaultPolygonGeometryPatch - ); + { .transferQueue = getTransferUpQueue(), + .utilities = m_utils.get(), + .logger = m_logger.get(), + .addtionalBufferOwnershipFamilies = addtionalBufferOwnershipFamilies }, + CSimpleDebugRenderer::DefaultPolygonGeometryPatch); // for the scene drawing pass { IGPURenderpass::SCreationParams params = {}; const IGPURenderpass::SCreationParams::SDepthStencilAttachmentDescription depthAttachments[] = { - {{ - { - .format = sceneRenderDepthFormat, - .samples = IGPUImage::ESCF_1_BIT, - .mayAlias = false - }, - /*.loadOp =*/ {IGPURenderpass::LOAD_OP::CLEAR}, - /*.storeOp =*/ {IGPURenderpass::STORE_OP::STORE}, - /*.initialLayout =*/ {IGPUImage::LAYOUT::UNDEFINED}, - /*.finalLayout =*/ {IGPUImage::LAYOUT::ATTACHMENT_OPTIMAL} - }}, - IGPURenderpass::SCreationParams::DepthStencilAttachmentsEnd - }; + {{{.format = sceneRenderDepthFormat, + .samples = IGPUImage::ESCF_1_BIT, + .mayAlias = false}, + /*.loadOp =*/{IGPURenderpass::LOAD_OP::CLEAR}, + /*.storeOp =*/{IGPURenderpass::STORE_OP::STORE}, + /*.initialLayout =*/{IGPUImage::LAYOUT::UNDEFINED}, + /*.finalLayout =*/{IGPUImage::LAYOUT::ATTACHMENT_OPTIMAL}}}, + IGPURenderpass::SCreationParams::DepthStencilAttachmentsEnd }; params.depthStencilAttachments = depthAttachments; const IGPURenderpass::SCreationParams::SColorAttachmentDescription colorAttachments[] = { {{ - { - .format = finalSceneRenderFormat, - .samples = IGPUImage::E_SAMPLE_COUNT_FLAGS::ESCF_1_BIT, - .mayAlias = false - }, - /*.loadOp =*/ IGPURenderpass::LOAD_OP::CLEAR, - /*.storeOp =*/ IGPURenderpass::STORE_OP::STORE, - /*.initialLayout =*/ IGPUImage::LAYOUT::UNDEFINED, - /*.finalLayout =*/ IGPUImage::LAYOUT::READ_ONLY_OPTIMAL // ImGUI shall read - }}, - 
IGPURenderpass::SCreationParams::ColorAttachmentsEnd - }; + {.format = finalSceneRenderFormat, + .samples = IGPUImage::E_SAMPLE_COUNT_FLAGS::ESCF_1_BIT, + .mayAlias = false}, + /*.loadOp =*/IGPURenderpass::LOAD_OP::CLEAR, + /*.storeOp =*/IGPURenderpass::STORE_OP::STORE, + /*.initialLayout =*/IGPUImage::LAYOUT::UNDEFINED, + /*.finalLayout =*/IGPUImage::LAYOUT::READ_ONLY_OPTIMAL // ImGUI shall read + }}, + IGPURenderpass::SCreationParams::ColorAttachmentsEnd }; params.colorAttachments = colorAttachments; IGPURenderpass::SCreationParams::SSubpassDescription subpasses[] = { {}, - IGPURenderpass::SCreationParams::SubpassesEnd - }; - subpasses[0].depthStencilAttachment = { {.render = {.attachmentIndex = 0,.layout = IGPUImage::LAYOUT::ATTACHMENT_OPTIMAL}} }; - subpasses[0].colorAttachments[0] = { .render = {.attachmentIndex = 0,.layout = IGPUImage::LAYOUT::ATTACHMENT_OPTIMAL} }; + IGPURenderpass::SCreationParams::SubpassesEnd }; + subpasses[0].depthStencilAttachment = { {.render = {.attachmentIndex = 0, .layout = IGPUImage::LAYOUT::ATTACHMENT_OPTIMAL}} }; + subpasses[0].colorAttachments[0] = { .render = {.attachmentIndex = 0, .layout = IGPUImage::LAYOUT::ATTACHMENT_OPTIMAL} }; params.subpasses = subpasses; const static IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] = { @@ -115,27 +105,21 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR // TODO: `COLOR_ATTACHMENT_OUTPUT_BIT` shouldn't be needed, because its a logically later stage, see TODO in `ECommonEnums.h` .dstStageMask = PIPELINE_STAGE_FLAGS::EARLY_FRAGMENT_TESTS_BIT | PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, // because depth and color get cleared first no read mask - .dstAccessMask = ACCESS_FLAGS::DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT - } - // leave view offsets and flags default - }, - { - .srcSubpass = 0, - .dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, - .memoryBarrier = { - // 
last place where the color can get modified, depth is implicitly earlier - .srcStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, - // only write ops, reads can't be made available, also won't be using depth so don't care about it being visible to anyone else - .srcAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT, - // the ImGUI will sample the color, then next frame we overwrite both attachments - .dstStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT | PIPELINE_STAGE_FLAGS::EARLY_FRAGMENT_TESTS_BIT, - // but we only care about the availability-visibility chain between renderpass and imgui - .dstAccessMask = ACCESS_FLAGS::SAMPLED_READ_BIT - } - // leave view offsets and flags default - }, - IGPURenderpass::SCreationParams::DependenciesEnd - }; + .dstAccessMask = ACCESS_FLAGS::DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT} + // leave view offsets and flags default + }, + { + .srcSubpass = 0, .dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, .memoryBarrier = {// last place where the color can get modified, depth is implicitly earlier + .srcStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, + // only write ops, reads can't be made available, also won't be using depth so don't care about it being visible to anyone else + .srcAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT, + // the ImGUI will sample the color, then next frame we overwrite both attachments + .dstStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT | PIPELINE_STAGE_FLAGS::EARLY_FRAGMENT_TESTS_BIT, + // but we only care about the availability-visibility chain between renderpass and imgui + .dstAccessMask = ACCESS_FLAGS::SAMPLED_READ_BIT} + // leave view offsets and flags default + }, + IGPURenderpass::SCreationParams::DependenciesEnd }; params.dependencies = dependencies; auto solidAngleRenderpassParams = params; m_mainRenderpass = m_device->createRenderpass(std::move(params)); @@ -145,11 +129,10 @@ class 
SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR m_solidAngleRenderpass = m_device->createRenderpass(std::move(solidAngleRenderpassParams)); if (!m_solidAngleRenderpass) return logFail("Failed to create Solid Angle Renderpass!"); - } const auto& geometries = m_scene->getInitParams().geometries; - m_renderer = CSimpleDebugRenderer::create(m_assetMgr.get(), m_solidAngleRenderpass.get(), 0, { &geometries.front().get(),geometries.size() }); + m_renderer = CSimpleDebugRenderer::create(m_assetMgr.get(), m_solidAngleRenderpass.get(), 0, { &geometries.front().get(), geometries.size() }); // special case { const auto& pipelines = m_renderer->getInitParams().pipelines; @@ -192,7 +175,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR auto opt = make_smart_refctd_ptr(std::span(&optPasses, 1)); options.spirvOptimizer = opt.get(); #endif - options.debugInfoFlags |= IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_LINE_BIT; + options.debugInfoFlags |= IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_LINE_BIT;// | IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_FILE_BIT | IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_SOURCE_BIT; options.preprocessorOptions.sourceIdentifier = source->getFilepathHint(); options.preprocessorOptions.logger = m_logger.get(); options.preprocessorOptions.includeFinder = compiler->getDefaultIncludeFinder(); @@ -226,39 +209,30 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR const IGPUPipelineBase::SShaderSpecInfo solidAngleFragSpec = { .shader = solidAngleVisFragShader.get(), - .entryPoint = "main" - }; + .entryPoint = "main" }; auto rayVisFragShader = loadAndCompileHLSLShader(RayVisShaderPath, ESS_FRAGMENT); if (!rayVisFragShader) return logFail("Failed to Load and Compile Fragment Shader: rayVis!"); const IGPUPipelineBase::SShaderSpecInfo RayFragSpec = { .shader = rayVisFragShader.get(), - .entryPoint = "main" - }; + .entryPoint = "main" }; smart_refctd_ptr solidAngleVisLayout, 
rayVisLayout; nbl::video::IGPUDescriptorSetLayout::SBinding bindings[1] = { - { - .binding = 0, - .type = nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, - .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, - .stageFlags = ShaderStage::ESS_FRAGMENT, - .count = 1 - } - }; + {.binding = 0, + .type = nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, + .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = ShaderStage::ESS_FRAGMENT, + .count = 1} }; smart_refctd_ptr dsLayout = m_device->createDescriptorSetLayout(bindings); - const asset::SPushConstantRange saRanges[] = { { - .stageFlags = hlsl::ShaderStage::ESS_FRAGMENT, - .offset = 0, - .size = sizeof(PushConstants) - } }; - const asset::SPushConstantRange rayRanges[] = { { - .stageFlags = hlsl::ShaderStage::ESS_FRAGMENT, - .offset = 0, - .size = sizeof(PushConstantRayVis) - } }; + const asset::SPushConstantRange saRanges[] = { {.stageFlags = hlsl::ShaderStage::ESS_FRAGMENT, + .offset = 0, + .size = sizeof(PushConstants)} }; + const asset::SPushConstantRange rayRanges[] = { {.stageFlags = hlsl::ShaderStage::ESS_FRAGMENT, + .offset = 0, + .size = sizeof(PushConstantRayVis)} }; if (!dsLayout) logFail("Failed to create a Descriptor Layout!\n"); @@ -301,21 +275,20 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR logFail("Failed to allocate Device Memory compatible with our GPU Buffer!\n"); assert(m_outputStorageBuffer->getBoundMemory().memory == m_allocation.memory.get()); - smart_refctd_ptr pool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE, { &dsLayout.get(),1 }); + smart_refctd_ptr pool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE, { &dsLayout.get(), 1 }); m_ds = pool->createDescriptorSet(std::move(dsLayout)); { IGPUDescriptorSet::SDescriptorInfo info[1]; info[0].desc = smart_refctd_ptr(m_outputStorageBuffer); - info[0].info.buffer = { .offset = 0,.size = BufferSize 
}; + info[0].info.buffer = { .offset = 0, .size = BufferSize }; IGPUDescriptorSet::SWriteDescriptorSet writes[1] = { - {.dstSet = m_ds.get(),.binding = 0,.arrayElement = 0,.count = 1,.info = info} - }; + {.dstSet = m_ds.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = info} }; m_device->updateDescriptorSets(writes, {}); } } - if (!m_allocation.memory->map({ 0ull,m_allocation.memory->getAllocationSize() }, IDeviceMemoryAllocation::EMCAF_READ)) + if (!m_allocation.memory->map({ 0ull, m_allocation.memory->getAllocationSize() }, IDeviceMemoryAllocation::EMCAF_READ)) logFail("Failed to map the Device Memory!\n"); // if the mapping is not coherent the range needs to be invalidated to pull in new data for the CPU's caches @@ -328,8 +301,8 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR { auto scRes = static_cast(m_surface->getSwapchainResources()); ext::imgui::UI::SCreationParameters params = {}; - params.resources.texturesInfo = { .setIx = 0u,.bindingIx = TexturesImGUIBindingIndex }; - params.resources.samplersInfo = { .setIx = 0u,.bindingIx = 1u }; + params.resources.texturesInfo = { .setIx = 0u, .bindingIx = TexturesImGUIBindingIndex }; + params.resources.samplersInfo = { .setIx = 0u, .bindingIx = 1u }; params.utilities = m_utils; params.transfer = getTransferUpQueue(); params.pipelineLayout = ext::imgui::UI::createDefaultPipelineLayout(m_utils->getLogicalDevice(), params.resources.texturesInfo, params.resources.samplersInfo, MaxImGUITextures); @@ -349,7 +322,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR { // note that we use default layout provided by our extension, but you are free to create your own by filling ext::imgui::UI::S_CREATION_PARAMETERS::resources const auto* layout = interface.imGUI->getPipeline()->getLayout()->getDescriptorSetLayout(0u); - auto pool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT, { &layout,1 }); 
+ auto pool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT, { &layout, 1 }); auto ds = pool->createDescriptorSet(smart_refctd_ptr(layout)); interface.subAllocDS = make_smart_refctd_ptr(std::move(ds)); if (!interface.subAllocDS) @@ -369,12 +342,12 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR .binding = TexturesImGUIBindingIndex, .arrayElement = ext::imgui::UI::FontAtlasTexId, .count = 1, - .info = &info - }; - if (!m_device->updateDescriptorSets({ &write,1 }, {})) + .info = &info }; + if (!m_device->updateDescriptorSets({ &write, 1 }, {})) return logFail("Failed to write the descriptor set"); } - imgui->registerListener([this]() {interface(); }); + imgui->registerListener([this]() + { interface(); }); } interface.camera.mapKeysToWASD(); @@ -411,16 +384,13 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR auto* const cb = m_cmdBufs.data()[resourceIx].get(); cb->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); cb->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - // clear to black for both things - const IGPUCommandBuffer::SClearColorValue clearValue = { .float32 = {0.f,0.f,0.f,1.f} }; + if (m_solidAngleViewFramebuffer) { - asset::SBufferRange range - { + asset::SBufferRange range{ .offset = 0, .size = m_outputStorageBuffer->getSize(), - .buffer = m_outputStorageBuffer - }; + .buffer = m_outputStorageBuffer }; cb->fillBuffer(range, 0u); { @@ -428,16 +398,15 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR cb->beginDebugMarker("Draw Circle View Frame"); { const IGPUCommandBuffer::SClearDepthStencilValue farValue = { .depth = 0.f }; + const IGPUCommandBuffer::SClearColorValue clearValue = { .float32 = {0.f, 0.f, 0.f, 1.f} }; const IGPUCommandBuffer::SRenderpassBeginInfo renderpassInfo = { .framebuffer = m_solidAngleViewFramebuffer.get(), .colorClearValues = &clearValue, 
.depthStencilClearValues = &farValue, .renderArea = { - .offset = {0,0}, - .extent = {creationParams.width, creationParams.height} - } - }; + .offset = {0, 0}, + .extent = {creationParams.width, creationParams.height}} }; beginRenderpass(cb, renderpassInfo); } // draw scene @@ -446,10 +415,10 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR lastFrameSeed = m_frameSeeding ? static_cast(m_realFrameIx) : lastFrameSeed; PushConstants pc{ .modelMatrix = hlsl::float32_t3x4(hlsl::transpose(interface.m_OBBModelMatrix)), - .viewport = { 0.f,0.f,static_cast(creationParams.width),static_cast(creationParams.height) }, + .viewport = {0.f, 0.f, static_cast(creationParams.width), static_cast(creationParams.height)}, .samplingMode = m_samplingMode, - .frameIndex = lastFrameSeed - }; + .sampleCount = static_cast(m_SampleCount), + .frameIndex = lastFrameSeed }; auto pipeline = m_solidAngleVisPipeline; cb->bindGraphicsPipeline(pipeline.get()); cb->pushConstants(pipeline->getLayout(), hlsl::ShaderStage::ESS_FRAGMENT, 0, sizeof(pc), &pc); @@ -471,19 +440,16 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR { auto creationParams = m_mainViewFramebuffer->getCreationParameters(); const IGPUCommandBuffer::SClearDepthStencilValue farValue = { .depth = 0.f }; + const IGPUCommandBuffer::SClearColorValue clearValue = { .float32 = {0.1f, 0.1f, 0.1f, 1.f} }; const IGPUCommandBuffer::SRenderpassBeginInfo renderpassInfo = - { .framebuffer = m_mainViewFramebuffer.get(), .colorClearValues = &clearValue, .depthStencilClearValues = &farValue, .renderArea = { - .offset = {0,0}, - .extent = {creationParams.width, creationParams.height} - } - }; + .offset = {0, 0}, + .extent = {creationParams.width, creationParams.height}} }; beginRenderpass(cb, renderpassInfo); - } { // draw rays visualization auto creationParams = m_mainViewFramebuffer->getCreationParameters(); @@ -492,12 +458,13 @@ class SolidAngleVisualizer final : public 
MonoWindowApplication, public BuiltinR // draw scene { float32_t4x4 viewProj = *reinterpret_cast(&interface.camera.getConcatenatedMatrix()); + float32_t3x4 view = *reinterpret_cast(&interface.camera.getViewMatrix()); PushConstantRayVis pc{ .viewProjMatrix = viewProj, + .viewMatrix = view, .modelMatrix = hlsl::float32_t3x4(hlsl::transpose(interface.m_OBBModelMatrix)), - .viewport = { 0.f,0.f,static_cast(creationParams.width),static_cast(creationParams.height) }, - .frameIndex = m_frameSeeding ? static_cast(m_realFrameIx) : 0u - }; + .viewport = {0.f, 0.f, static_cast(creationParams.width), static_cast(creationParams.height)}, + .frameIndex = m_frameSeeding ? static_cast(m_realFrameIx) : 0u }; auto pipeline = m_rayVisualizationPipeline; cb->bindGraphicsPipeline(pipeline.get()); cb->pushConstants(pipeline->getLayout(), hlsl::ShaderStage::ESS_FRAGMENT, 0, sizeof(pc), &pc); @@ -524,7 +491,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR auto& instance = m_renderer->m_instances[0]; instance.world = float32_t3x4(hlsl::transpose(interface.m_OBBModelMatrix)); instance.packedGeo = m_renderer->getGeometries().data(); // cube // +interface.gcIndex; - m_renderer->render(cb, viewParams); // draw the cube/OBB + m_renderer->render(cb, viewParams); // draw the cube/OBB instance.world = float32_t3x4(1.0f); instance.packedGeo = m_renderer->getGeometries().data() + 2; // disk @@ -539,16 +506,15 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR cb->beginDebugMarker("SolidAngleVisualizer IMGUI Frame"); { auto scRes = static_cast(m_surface->getSwapchainResources()); + const IGPUCommandBuffer::SClearColorValue clearValue = { .float32 = {0.f, 0.f, 0.f, 1.f} }; const IGPUCommandBuffer::SRenderpassBeginInfo renderpassInfo = { .framebuffer = scRes->getFramebuffer(device_base_t::getCurrentAcquire().imageIndex), .colorClearValues = &clearValue, .depthStencilClearValues = nullptr, .renderArea = { - .offset = {0,0}, - .extent 
= {m_window->getWidth(),m_window->getHeight()} - } - }; + .offset = {0, 0}, + .extent = {m_window->getWidth(), m_window->getHeight()}} }; beginRenderpass(cb, renderpassInfo); } // draw ImGUI @@ -560,7 +526,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR const auto* ds = interface.subAllocDS->getDescriptorSet(); cb->bindDescriptorSets(EPBP_GRAPHICS, pipeline->getLayout(), imgui->getCreationParameters().resources.texturesInfo.setIx, 1u, &ds); // a timepoint in the future to release streaming resources for geometry - const ISemaphore::SWaitInfo drawFinished = { .semaphore = m_semaphore.get(),.value = m_realFrameIx + 1u }; + const ISemaphore::SWaitInfo drawFinished = { .semaphore = m_semaphore.get(), .value = m_realFrameIx + 1u }; if (!imgui->render(cb, drawFinished)) { m_logger->log("TODO: need to present acquired image before bailing because its already acquired.", ILogger::ELL_ERROR); @@ -576,27 +542,19 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR { .semaphore = m_semaphore.get(), .value = ++m_realFrameIx, - .stageMask = PIPELINE_STAGE_FLAGS::ALL_GRAPHICS_BITS - }; + .stageMask = PIPELINE_STAGE_FLAGS::ALL_GRAPHICS_BITS }; const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[] = { - {.cmdbuf = cb } - }; + {.cmdbuf = cb} }; const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = { - { - .semaphore = device_base_t::getCurrentAcquire().semaphore, - .value = device_base_t::getCurrentAcquire().acquireCount, - .stageMask = PIPELINE_STAGE_FLAGS::NONE - } - }; + {.semaphore = device_base_t::getCurrentAcquire().semaphore, + .value = device_base_t::getCurrentAcquire().acquireCount, + .stageMask = PIPELINE_STAGE_FLAGS::NONE} }; const IQueue::SSubmitInfo infos[] = { - { - .waitSemaphores = acquired, - .commandBuffers = commandBuffers, - .signalSemaphores = {&retval,1} - } - }; + {.waitSemaphores = acquired, + .commandBuffers = commandBuffers, + .signalSemaphores = {&retval, 1}} }; if 
(getGraphicsQueue()->submit(infos) != IQueue::RESULT::SUCCESS) { @@ -604,7 +562,6 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR m_realFrameIx--; } - m_window->setCaption("[Nabla Engine] UI App Test Demo"); return retval; } @@ -619,19 +576,16 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR .srcSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, .dstSubpass = 0, .memoryBarrier = { - .srcStageMask = PIPELINE_STAGE_FLAGS::NONE, // should sync against the semaphore wait anyway + .srcStageMask = PIPELINE_STAGE_FLAGS::NONE, // should sync against the semaphore wait anyway .srcAccessMask = ACCESS_FLAGS::NONE, // layout transition needs to finish before the color write .dstStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, - .dstAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT - } - // leave view offsets and flags default - }, + .dstAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT} + // leave view offsets and flags default + }, // want layout transition to begin after all color output is done { - .srcSubpass = 0, - .dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, - .memoryBarrier = { + .srcSubpass = 0, .dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, .memoryBarrier = { // last place where the color can get modified, depth is implicitly earlier .srcStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, // only write ops, reads can't be made available @@ -640,8 +594,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR } // leave view offsets and flags default }, - IGPURenderpass::SCreationParams::DependenciesEnd - }; + IGPURenderpass::SCreationParams::DependenciesEnd }; return dependencies; } @@ -667,7 +620,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR // I think begin/end should always be called on camera, just events 
shouldn't be fed, why? // If you stop begin/end, whatever keys were up/down get their up/down values frozen leading to // `perActionDt` becoming obnoxiously large the first time the even processing resumes due to - // `timeDiff` being computed since `lastVirtualUpTimeStamp` + // `timeDiff` being computed since `lastVirtualUpTimeStamp` camera.beginInputProcessing(nextPresentationTimestamp); { mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void @@ -690,10 +643,8 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR // interface.gcIndex += int16_t(core::sign(e.scrollEvent.verticalScroll)); // interface.gcIndex = core::clamp(interface.gcIndex, 0ull, m_renderer->getGeometries().size() - 1); //} - } - }, - m_logger.get() - ); + } }, + m_logger.get()); keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void { if (interface.move) @@ -706,10 +657,8 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR previousEventTimestamp = e.timeStamp; uiEvents.keyboard.emplace_back(e); - } - }, - m_logger.get() - ); + } }, + m_logger.get()); } camera.endInputProcessing(nextPresentationTimestamp); @@ -717,37 +666,33 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR ext::imgui::UI::SUpdateParameters params = { - .mousePosition = float32_t2(cursorPosition.x,cursorPosition.y) - float32_t2(m_window->getX(),m_window->getY()), - .displaySize = {m_window->getWidth(),m_window->getHeight()}, + .mousePosition = float32_t2(cursorPosition.x, cursorPosition.y) - float32_t2(m_window->getX(), m_window->getY()), + .displaySize = {m_window->getWidth(), m_window->getHeight()}, .mouseEvents = uiEvents.mouse, - .keyboardEvents = uiEvents.keyboard - }; + .keyboardEvents = uiEvents.keyboard }; - //interface.objectName = m_scene->getInitParams().geometryNames[interface.gcIndex]; + // interface.objectName = m_scene->getInitParams().geometryNames[interface.gcIndex]; 
interface.imGUI->update(params); } void recreateFramebuffers() { - auto createImageAndView = [&](const uint16_t2 resolution, E_FORMAT format)->smart_refctd_ptr + auto createImageAndView = [&](const uint16_t2 resolution, E_FORMAT format) -> smart_refctd_ptr { - auto image = m_device->createImage({ { - .type = IGPUImage::ET_2D, - .samples = IGPUImage::ESCF_1_BIT, - .format = format, - .extent = {resolution.x,resolution.y,1}, - .mipLevels = 1, - .arrayLayers = 1, - .usage = IGPUImage::EUF_RENDER_ATTACHMENT_BIT | IGPUImage::EUF_SAMPLED_BIT - } }); + auto image = m_device->createImage({ {.type = IGPUImage::ET_2D, + .samples = IGPUImage::ESCF_1_BIT, + .format = format, + .extent = {resolution.x, resolution.y, 1}, + .mipLevels = 1, + .arrayLayers = 1, + .usage = IGPUImage::EUF_RENDER_ATTACHMENT_BIT | IGPUImage::EUF_SAMPLED_BIT} }); if (!m_device->allocate(image->getMemoryReqs(), image.get()).isValid()) return nullptr; IGPUImageView::SCreationParams params = { .image = std::move(image), .viewType = IGPUImageView::ET_2D, - .format = format - }; + .format = format }; params.subresourceRange.aspectMask = isDepthOrStencilFormat(format) ? 
IGPUImage::EAF_DEPTH_BIT : IGPUImage::EAF_COLOR_BIT; return m_device->createImageView(std::move(params)); }; @@ -763,23 +708,19 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR { solidAngleView = createImageAndView(solidAngleViewRes, finalSceneRenderFormat); auto solidAngleDepthView = createImageAndView(solidAngleViewRes, sceneRenderDepthFormat); - m_solidAngleViewFramebuffer = m_device->createFramebuffer({ { - .renderpass = m_solidAngleRenderpass, - .depthStencilAttachments = &solidAngleDepthView.get(), - .colorAttachments = &solidAngleView.get(), - .width = solidAngleViewRes.x, - .height = solidAngleViewRes.y - } }); + m_solidAngleViewFramebuffer = m_device->createFramebuffer({ {.renderpass = m_solidAngleRenderpass, + .depthStencilAttachments = &solidAngleDepthView.get(), + .colorAttachments = &solidAngleView.get(), + .width = solidAngleViewRes.x, + .height = solidAngleViewRes.y} }); mainView = createImageAndView(mainViewRes, finalSceneRenderFormat); auto mainDepthView = createImageAndView(mainViewRes, sceneRenderDepthFormat); - m_mainViewFramebuffer = m_device->createFramebuffer({ { - .renderpass = m_mainRenderpass, - .depthStencilAttachments = &mainDepthView.get(), - .colorAttachments = &mainView.get(), - .width = mainViewRes.x, - .height = mainViewRes.y - } }); + m_mainViewFramebuffer = m_device->createFramebuffer({ {.renderpass = m_mainRenderpass, + .depthStencilAttachments = &mainDepthView.get(), + .colorAttachments = &mainView.get(), + .width = mainViewRes.x, + .height = mainViewRes.y} }); } else { @@ -788,7 +729,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR } // release previous slot and its image - interface.subAllocDS->multi_deallocate(0, static_cast(CInterface::Count), interface.renderColorViewDescIndices, { .semaphore = m_semaphore.get(),.value = m_realFrameIx + 1 }); + interface.subAllocDS->multi_deallocate(0, static_cast(CInterface::Count), 
interface.renderColorViewDescIndices, { .semaphore = m_semaphore.get(), .value = m_realFrameIx + 1 }); // if (solidAngleView && mainView) { @@ -801,19 +742,15 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR infos[1].info.image.imageLayout = IGPUImage::LAYOUT::READ_ONLY_OPTIMAL; const IGPUDescriptorSet::SWriteDescriptorSet write[static_cast(CInterface::Count)] = { {.dstSet = interface.subAllocDS->getDescriptorSet(), - .binding = TexturesImGUIBindingIndex, - .arrayElement = interface.renderColorViewDescIndices[static_cast(CInterface::ERV_MAIN_VIEW)], - .count = 1, - .info = &infos[static_cast(CInterface::ERV_MAIN_VIEW)] - }, - { - .dstSet = interface.subAllocDS->getDescriptorSet(), - .binding = TexturesImGUIBindingIndex, - .arrayElement = interface.renderColorViewDescIndices[static_cast(CInterface::ERV_SOLID_ANGLE_VIEW)], - .count = 1, - .info = &infos[static_cast(CInterface::ERV_SOLID_ANGLE_VIEW)] - } - }; + .binding = TexturesImGUIBindingIndex, + .arrayElement = interface.renderColorViewDescIndices[static_cast(CInterface::ERV_MAIN_VIEW)], + .count = 1, + .info = &infos[static_cast(CInterface::ERV_MAIN_VIEW)]}, + {.dstSet = interface.subAllocDS->getDescriptorSet(), + .binding = TexturesImGUIBindingIndex, + .arrayElement = interface.renderColorViewDescIndices[static_cast(CInterface::ERV_SOLID_ANGLE_VIEW)], + .count = 1, + .info = &infos[static_cast(CInterface::ERV_SOLID_ANGLE_VIEW)]} }; m_device->updateDescriptorSets({ write, static_cast(CInterface::Count) }, {}); } interface.transformParams.sceneTexDescIx = interface.renderColorViewDescIndices[CInterface::ERV_MAIN_VIEW]; @@ -827,8 +764,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR .x = 0, .y = 0, .width = static_cast(info.renderArea.extent.width), - .height = static_cast(info.renderArea.extent.height) - }; + .height = static_cast(info.renderArea.extent.height) }; cb->setViewport(0u, 1u, &viewport); } @@ -845,7 +781,8 @@ class 
SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR // we create the Descriptor Set with a few slots extra to spare, so we don't have to `waitIdle` the device whenever ImGUI virtual window resizes constexpr static inline auto MaxImGUITextures = 2u + MaxFramesInFlight; - static inline uint32_t m_samplingMode = SAMPLING_MODE_SOLID_ANGLE; + static inline SAMPLING_MODE m_samplingMode = SAMPLING_MODE::PROJECTED_PARALLELOGRAM_SOLID_ANGLE; + static inline int m_SampleCount = 64; static inline bool m_frameSeeding = true; static inline ResultData m_GPUOutResulData; // @@ -895,8 +832,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR projection = matrix4SIMD::buildProjectionMatrixOrthoRH(viewWidth, viewHeight, zNear, zFar); } - return projection; - }()); + return projection; }()); ImGuizmo::SetOrthographic(!isPerspective); ImGuizmo::BeginFrame(); @@ -918,19 +854,29 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR } ImGui::Separator(); - ImGui::Text("Sampling Mode: "); + ImGui::Text("Sampling Mode:"); ImGui::SameLine(); - if (ImGui::RadioButton("Solid Angle", m_samplingMode == 0)) - m_samplingMode = SAMPLING_MODE_SOLID_ANGLE; + const char* samplingModes[] = + { + "Triangle Solid Angle", + "Triangle Projected Solid Angle", + "Parallelogram Projected Solid Angle" + }; + + int currentMode = static_cast(m_samplingMode); + + if (ImGui::Combo("##SamplingMode", ¤tMode, samplingModes, IM_ARRAYSIZE(samplingModes))) + { + m_samplingMode = static_cast(currentMode); + } - ImGui::SameLine(); - if (ImGui::RadioButton("Projected Solid Angle", m_samplingMode == 1)) - m_samplingMode = SAMPLING_MODE_PROJECTED_SOLID_ANGLE; ImGui::Checkbox("Frame seeding", &m_frameSeeding); + ImGui::SliderInt("Sample Count", &m_SampleCount, 0, 512); + ImGui::Separator(); ImGui::Text("Camera"); @@ -952,7 +898,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR isPerspective = false; 
ImGui::Checkbox("Enable \"view manipulate\"", &transformParams.enableViewManipulate); - //ImGui::Checkbox("Enable camera movement", &move); + // ImGui::Checkbox("Enable camera movement", &move); ImGui::SliderFloat("Move speed", &moveSpeed, 0.1f, 10.f); ImGui::SliderFloat("Rotate speed", &rotateSpeed, 0.1f, 10.f); @@ -966,7 +912,6 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR ImGui::SliderFloat("zNear", &zNear, 0.1f, 100.f); ImGui::SliderFloat("zFar", &zFar, 110.f, 10000.f); - if (firstFrame) { camera.setPosition(cameraIntialPosition); @@ -1057,16 +1002,16 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR ImGuizmo::SetID(0u); - // TODO: camera will return hlsl::float32_tMxN + // TODO: camera will return hlsl::float32_tMxN auto view = *reinterpret_cast(camera.getViewMatrix().pointer()); imguizmoM16InOut.view = hlsl::transpose(getMatrix3x4As4x4(view)); - // TODO: camera will return hlsl::float32_tMxN + // TODO: camera will return hlsl::float32_tMxN imguizmoM16InOut.projection = hlsl::transpose(*reinterpret_cast(camera.getProjectionMatrix().pointer())); ImGuizmo::RecomposeMatrixFromComponents(&m_TRS.translation.x, &m_TRS.rotation.x, &m_TRS.scale.x, &imguizmoM16InOut.model[0][0]); - if (flipGizmoY) // note we allow to flip gizmo just to match our coordinates - imguizmoM16InOut.projection[1][1] *= -1.f; // https://johannesugb.github.io/gpu-programming/why-do-opengl-proj-matrices-fail-in-vulkan/ + if (flipGizmoY) // note we allow to flip gizmo just to match our coordinates + imguizmoM16InOut.projection[1][1] *= -1.f; // https://johannesugb.github.io/gpu-programming/why-do-opengl-proj-matrices-fail-in-vulkan/ transformParams.editTransformDecomposition = true; mainViewTransformReturnInfo = EditTransform(&imguizmoM16InOut.view[0][0], &imguizmoM16InOut.projection[0][0], &imguizmoM16InOut.model[0][0], transformParams); @@ -1121,8 +1066,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, 
public BuiltinR fieldName, ImVec4(c.r, c.g, c.b, 1.0f), 0, - ImVec2(20, 20) - ); + ImVec2(20, 20)); ImGui::SameLine(); ImGui::Text("%s", colorNames[index]); @@ -1140,9 +1084,8 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR drawColorField(":", m_GPUOutResulData.vertices[i]); ImGui::SameLine(); static const float32_t3 constCorners[8] = { - float32_t3(-1, -1, -1), float32_t3(1, -1, -1), float32_t3(-1, 1, -1), float32_t3(1, 1, -1), - float32_t3(-1, -1, 1), float32_t3(1, -1, 1), float32_t3(-1, 1, 1), float32_t3(1, 1, 1) - }; + float32_t3(-1, -1, -1), float32_t3(1, -1, -1), float32_t3(-1, 1, -1), float32_t3(1, 1, -1), + float32_t3(-1, -1, 1), float32_t3(1, -1, 1), float32_t3(-1, 1, 1), float32_t3(1, 1, 1) }; float32_t3 vertexLocation = constCorners[m_GPUOutResulData.vertices[i]]; ImGui::Text(" : (%.3f, %.3f, %.3f", vertexLocation.x, vertexLocation.y, vertexLocation.z); } @@ -1154,13 +1097,10 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR "", ImVec4(0.0f, 0.0f, 0.0f, 0.0f), 0, - ImVec2(20, 20) - ); + ImVec2(20, 20)); ImGui::SameLine(); ImGui::Text(""); - } - } } @@ -1178,8 +1118,24 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR ImGui::Text("silhouette Vertex Count: %u", m_GPUOutResulData.silhouetteVertexCount); ImGui::Text("silhouette Positive VertexCount: %u", m_GPUOutResulData.positiveVertCount); ImGui::Text("Silhouette Mismatch: %s", m_GPUOutResulData.edgeVisibilityMismatch ? "true" : "false"); + ImGui::Separator(); ImGui::Text("Max triangles exceeded: %s", m_GPUOutResulData.maxTrianglesExceeded ? "true" : "false"); ImGui::Text("spherical lune detected: %s", m_GPUOutResulData.sphericalLuneDetected ? "true" : "false"); + ImGui::Separator(); + //ImGui::Text("Sampling outside the silhouette: %s", m_GPUOutResulData.sampleOutsideSilhouette ? "true" : "false"); + ImGui::Text("Parallelogram does not bound: %s", m_GPUOutResulData.parallelogramDoesNotBound ? 
"true" : "false"); + ImGui::Text("Parallelogram vertices inside: %s", m_GPUOutResulData.parallelogramVerticesInside ? "true" : "false"); + ImGui::Text("Parallelogram edges inside: %s", m_GPUOutResulData.parallelogramEdgesInside ? "true" : "false"); + ImGui::Text("Parallelogram area: %.3f", m_GPUOutResulData.parallelogramArea); + ImGui::Text("Failed vertex index: %u", m_GPUOutResulData.failedVertexIndex); + ImGui::Text("Failed vertex UV: (%.3f, %.3f)", m_GPUOutResulData.failedVertexUV.x, m_GPUOutResulData.failedVertexUV.y); + ImGui::Text("Failed edge index: %u", m_GPUOutResulData.failedEdgeIndex); + ImGui::Text("Failed edge sample: %u", m_GPUOutResulData.failedEdgeSample); + ImGui::Text("Failed edge UV: (%.3f, %.3f)", m_GPUOutResulData.failedEdgeUV.x, m_GPUOutResulData.failedEdgeUV.y); + ImGui::Text("Failed point 3D: (%.3f, %.3f, %.3f)", m_GPUOutResulData.failedPoint.x, m_GPUOutResulData.failedPoint.y, m_GPUOutResulData.failedPoint.z); + for (uint32_t i = 0; i < 8; i++) + ImGui::Text("edge is convex: %s", m_GPUOutResulData.edgeIsConvex[i] ? 
"true" : "false"); + ImGui::Separator(); { float32_t3 xAxis = m_OBBModelMatrix[0].xyz; @@ -1205,7 +1161,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR if (m_GPUOutResulData.silhouetteIndex != lastSilhouetteIndex) { modalShown = false; - modalDismissed = false; // Allow modal to show again for new configuration + modalDismissed = false; // Allow modal to show again for new configuration lastSilhouetteIndex = m_GPUOutResulData.silhouetteIndex; } @@ -1217,7 +1173,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR } // Open modal only if not already shown/dismissed - if ((m_GPUOutResulData.edgeVisibilityMismatch || m_GPUOutResulData.maxTrianglesExceeded || m_GPUOutResulData.sphericalLuneDetected) && m_GPUOutResulData.silhouetteIndex != 13 && !modalShown && !modalDismissed) // Don't reopen if user dismissed it + if ((m_GPUOutResulData.edgeVisibilityMismatch || m_GPUOutResulData.maxTrianglesExceeded || m_GPUOutResulData.sphericalLuneDetected) && m_GPUOutResulData.silhouetteIndex != 13 && !modalShown && !modalDismissed) // Don't reopen if user dismissed it { ImGui::OpenPopup("Edge Visibility Mismatch Warning"); modalShown = true; @@ -1250,7 +1206,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR { ImGui::CloseCurrentPopup(); modalShown = false; - modalDismissed = true; // Mark as dismissed to prevent reopening + modalDismissed = true; // Mark as dismissed to prevent reopening } ImGui::EndPopup(); } @@ -1284,7 +1240,6 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR // Silhouette mask printed in binary - auto printBin = [](uint32_t bin, const char* name) { char buf[33]; @@ -1347,7 +1302,8 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR { lastTRS = m_TRS; // Backup before randomizing int attempts = 0; - do { + do + { m_TRS.translation = float32_t3(rng.nextFloat(-3.f, 3.f), rng.nextFloat(-3.f, 
3.f), rng.nextFloat(-1.f, 3.f)); attempts++; } while (!isCubeOutsideUnitSphere(m_TRS.translation, m_TRS.scale) && attempts < 100); @@ -1363,17 +1319,19 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR { lastTRS = m_TRS; // Backup before randomizing int attempts = 0; - do { + do + { m_TRS.scale = float32_t3(rng.nextFloat(0.5f, 2.0f), rng.nextFloat(0.5f, 2.0f), rng.nextFloat(0.5f, 2.0f)); attempts++; } while (!isCubeOutsideUnitSphere(m_TRS.translation, m_TRS.scale) && attempts < 100); } - //ImGui::SameLine(); + // ImGui::SameLine(); if (ImGui::Button("Randomize All")) { lastTRS = m_TRS; // Backup before randomizing int attempts = 0; - do { + do + { m_TRS.translation = float32_t3(rng.nextFloat(-3.f, 3.f), rng.nextFloat(-3.f, 3.f), rng.nextFloat(-1.f, 3.f)); m_TRS.rotation = float32_t3(rng.nextFloat(-180.f, 180.f), rng.nextFloat(-180.f, 180.f), rng.nextFloat(-180.f, 180.f)); m_TRS.scale = float32_t3(rng.nextFloat(0.5f, 2.0f), rng.nextFloat(0.5f, 2.0f), rng.nextFloat(0.5f, 2.0f)); @@ -1399,9 +1357,9 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR { auto* streaminingBuffer = imGUI->getStreamingBuffer(); - const size_t total = streaminingBuffer->get_total_size(); // total memory range size for which allocation can be requested - const size_t freeSize = streaminingBuffer->getAddressAllocator().get_free_size(); // max total free bloock memory size we can still allocate from total memory available - const size_t consumedMemory = total - freeSize; // memory currently consumed by streaming buffer + const size_t total = streaminingBuffer->get_total_size(); // total memory range size for which allocation can be requested + const size_t freeSize = streaminingBuffer->getAddressAllocator().get_free_size(); // max total free bloock memory size we can still allocate from total memory available + const size_t consumedMemory = total - freeSize; // memory currently consumed by streaming buffer float freePercentage = 
100.0f * (float)(freeSize) / (float)total; float allocatedPercentage = (float)(consumedMemory) / (float)total; @@ -1420,11 +1378,11 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR ImGui::SetCursorPosX(windowPadding); if (freePercentage > 70.0f) - ImGui::PushStyleColor(ImGuiCol_PlotHistogram, ImVec4(0.0f, 1.0f, 0.0f, 0.4f)); // Green + ImGui::PushStyleColor(ImGuiCol_PlotHistogram, ImVec4(0.0f, 1.0f, 0.0f, 0.4f)); // Green else if (freePercentage > 30.0f) - ImGui::PushStyleColor(ImGuiCol_PlotHistogram, ImVec4(1.0f, 1.0f, 0.0f, 0.4f)); // Yellow + ImGui::PushStyleColor(ImGuiCol_PlotHistogram, ImVec4(1.0f, 1.0f, 0.0f, 0.4f)); // Yellow else - ImGui::PushStyleColor(ImGuiCol_PlotHistogram, ImVec4(1.0f, 0.0f, 0.0f, 0.4f)); // Red + ImGui::PushStyleColor(ImGuiCol_PlotHistogram, ImVec4(1.0f, 0.0f, 0.0f, 0.4f)); // Red ImGui::ProgressBar(allocatedPercentage, barSize, ""); @@ -1440,19 +1398,15 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR snprintf(textBuffer, sizeof(textBuffer), text, freePercentage); ImVec2 textSize = ImGui::CalcTextSize(textBuffer); - ImVec2 textPos = ImVec2 - ( + ImVec2 textPos = ImVec2( progressBarPos.x + (progressBarSize.x - textSize.x) * 0.5f, - progressBarPos.y + (progressBarSize.y - textSize.y) * 0.5f - ); + progressBarPos.y + (progressBarSize.y - textSize.y) * 0.5f); ImVec4 bgColor = ImGui::GetStyleColorVec4(ImGuiCol_WindowBg); - drawList->AddRectFilled - ( + drawList->AddRectFilled( ImVec2(textPos.x - 5, textPos.y - 2), ImVec2(textPos.x + textSize.x + 5, textPos.y + textSize.y + 2), - ImGui::GetColorU32(bgColor) - ); + ImGui::GetColorU32(bgColor)); ImGui::SetCursorScreenPos(textPos); ImGui::Text("%s", textBuffer); @@ -1483,12 +1437,12 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR struct TRS // Source of truth { float32_t3 translation{ 0.0f, 0.0f, 1.5f }; - float32_t3 rotation{ 0.0f }; // MUST stay orthonormal + float32_t3 rotation{ 0.0f }; 
// MUST stay orthonormal float32_t3 scale{ 1.0f }; } m_TRS; float32_t4x4 m_OBBModelMatrix; // always overwritten from TRS - //std::string_view objectName; + // std::string_view objectName; TransformRequestParams transformParams; TransformReturnInfo mainViewTransformReturnInfo; TransformReturnInfo solidAngleViewTransformReturnInfo; @@ -1499,7 +1453,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR float fov = 90.f, zNear = 0.1f, zFar = 10000.f, moveSpeed = 1.f, rotateSpeed = 1.f; float viewWidth = 10.f; - //uint16_t gcIndex = {}; // note: this is dirty however since I assume only single object in scene I can leave it now, when this example is upgraded to support multiple objects this needs to be changed + // uint16_t gcIndex = {}; // note: this is dirty however since I assume only single object in scene I can leave it now, when this example is upgraded to support multiple objects this needs to be changed bool isPerspective = true, isLH = true, flipGizmoY = true, move = true; bool firstFrame = true; @@ -1516,7 +1470,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR // setting up pipeline in the constructor m_queueFamily = base.getComputeQueue()->getFamilyIndex(); m_cmdpool = base.m_device->createCommandPool(m_queueFamily, IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); - //core::smart_refctd_ptr* cmdBuffs[] = { &m_cmdbuf, &m_timestampBeforeCmdBuff, &m_timestampAfterCmdBuff }; + // core::smart_refctd_ptr* cmdBuffs[] = { &m_cmdbuf, &m_timestampBeforeCmdBuff, &m_timestampAfterCmdBuff }; if (!m_cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_cmdbuf)) base.logFail("Failed to create Command Buffers!\n"); if (!m_cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_timestampBeforeCmdBuff)) @@ -1550,25 +1504,19 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR base.logFail("Failed to load precompiled 
\"benchmark\" shader!\n"); nbl::video::IGPUDescriptorSetLayout::SBinding bindings[1] = { - { - .binding = 0, - .type = nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, - .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, - .stageFlags = ShaderStage::ESS_COMPUTE, - .count = 1 - } - }; + {.binding = 0, + .type = nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, + .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = ShaderStage::ESS_COMPUTE, + .count = 1} }; smart_refctd_ptr dsLayout = base.m_device->createDescriptorSetLayout(bindings); if (!dsLayout) base.logFail("Failed to create a Descriptor Layout!\n"); SPushConstantRange pushConstantRanges[] = { - { - .stageFlags = ShaderStage::ESS_COMPUTE, - .offset = 0, - .size = sizeof(BenchmarkPushConstants) - } - }; + {.stageFlags = ShaderStage::ESS_COMPUTE, + .offset = 0, + .size = sizeof(BenchmarkPushConstants)} }; m_pplnLayout = base.m_device->createPipelineLayout(pushConstantRanges, smart_refctd_ptr(dsLayout)); if (!m_pplnLayout) base.logFail("Failed to create a Pipeline Layout!\n"); @@ -1578,7 +1526,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR params.layout = m_pplnLayout.get(); params.shader.entryPoint = "main"; params.shader.shader = shader.get(); - if (!base.m_device->createComputePipelines(nullptr, { ¶ms,1 }, &m_pipeline)) + if (!base.m_device->createComputePipelines(nullptr, { ¶ms, 1 }, &m_pipeline)) base.logFail("Failed to create pipelines (compile & link shaders)!\n"); } @@ -1603,16 +1551,15 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR base.logFail("Failed to allocate Device Memory compatible with our GPU Buffer!\n"); assert(dummyBuff->getBoundMemory().memory == m_allocation.memory.get()); - smart_refctd_ptr pool = base.m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE, { &dsLayout.get(),1 }); + smart_refctd_ptr pool = 
base.m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE, { &dsLayout.get(), 1 }); m_ds = pool->createDescriptorSet(std::move(dsLayout)); { IGPUDescriptorSet::SDescriptorInfo info[1]; info[0].desc = smart_refctd_ptr(dummyBuff); - info[0].info.buffer = { .offset = 0,.size = BufferSize }; + info[0].info.buffer = { .offset = 0, .size = BufferSize }; IGPUDescriptorSet::SWriteDescriptorSet writes[1] = { - {.dstSet = m_ds.get(),.binding = 0,.arrayElement = 0,.count = 1,.info = info} - }; + {.dstSet = m_ds.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = info} }; base.m_device->updateDescriptorSets(writes, {}); } } @@ -1630,15 +1577,20 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR void run() { m_logger->log("\n\nsampling benchmark result:", ILogger::ELL_PERFORMANCE); + + m_logger->log("sampling benchmark, parallelogram projected solid angle result:", ILogger::ELL_PERFORMANCE); + performBenchmark(SAMPLING_MODE::PROJECTED_PARALLELOGRAM_SOLID_ANGLE); + m_logger->log("sampling benchmark, triangle solid angle result:", ILogger::ELL_PERFORMANCE); - performBenchmark(SAMPLING_BENCHMARK_MODE::TRIANGLE_SOLID_ANGLE, SAMPLING_MODE_SOLID_ANGLE); + performBenchmark(SAMPLING_MODE::TRIANGLE_SOLID_ANGLE); + + //m_logger->log("sampling benchmark, triangle projected solid angle result:", ILogger::ELL_PERFORMANCE); + //performBenchmark(SAMPLING_MODE::TRIANGLE_PROJECTED_SOLID_ANGLE); - m_logger->log("sampling benchmark, triangle projected solid angle result:", ILogger::ELL_PERFORMANCE); - performBenchmark(SAMPLING_BENCHMARK_MODE::TRIANGLE_PROJECTED_SOLID_ANGLE, SAMPLING_MODE_PROJECTED_SOLID_ANGLE); } private: - void performBenchmark(SAMPLING_BENCHMARK_MODE mode, uint32_t solidAngleMode) + void performBenchmark(SAMPLING_MODE mode) { m_device->waitIdle(); @@ -1648,7 +1600,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR smart_refctd_ptr semaphore = m_device->createSemaphore(semaphoreCounter); 
IQueue::SSubmitInfo::SSemaphoreInfo signals[] = { {.semaphore = semaphore.get(), .value = 0u, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT} }; - IQueue::SSubmitInfo::SSemaphoreInfo waits[] = { {.semaphore = semaphore.get(), .value = 0u, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT } }; + IQueue::SSubmitInfo::SSemaphoreInfo waits[] = { {.semaphore = semaphore.get(), .value = 0u, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT} }; IQueue::SSubmitInfo beforeTimestapSubmitInfo[1] = {}; const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufsBegin[] = { {.cmdbuf = m_timestampBeforeCmdBuff.get()} }; @@ -1668,15 +1620,14 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR benchmarkSubmitInfos[0].signalSemaphores = signals; benchmarkSubmitInfos[0].waitSemaphores = waits; - m_pushConstants.benchmarkMode = mode; - m_pushConstants.samplingMode = solidAngleMode; m_pushConstants.modelMatrix = float32_t3x4(transpose(m_visualizer->interface.m_OBBModelMatrix)); recordCmdBuff(); // warmup runs for (int i = 0; i < WarmupIterations; ++i) { + if (i == 0) m_api->startCapture(); waits[0].value = semaphoreCounter; @@ -1776,8 +1727,8 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR static constexpr int Iterations = 1; }; - template - inline bool logFail(const char* msg, Args&&... 
args) + template + inline bool logFail(const char* msg, Args &&...args) { m_logger->log(msg, ILogger::ELL_ERROR, std::forward(args)...); return false; @@ -1786,5 +1737,4 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR std::ofstream m_logFile; }; - NBL_MAIN_FUNC(SolidAngleVisualizer) \ No newline at end of file From 2b034eb4a796e043d882e9e6335070466e7a871f Mon Sep 17 00:00:00 2001 From: Karim Mohamed Date: Wed, 18 Feb 2026 02:41:47 +0300 Subject: [PATCH 17/26] huge shader refactor, more debug UI, also: - Added bilinear and biquadratic samplers - Added a modified version of Urena 2003 with better pre-computation - Fixes after merge from master - Shaders precompiled with permutations for runtime changing of sampling modes without register overhead etc. - removed a lot of code duplications --- .../app_resources/hlsl/Drawing.hlsl | 125 ++- .../app_resources/hlsl/Sampling.hlsl | 355 ------- .../hlsl/SolidAngleVis.frag.hlsl | 199 ---- .../hlsl/benchmark/benchmark.comp.hlsl | 113 ++- .../app_resources/hlsl/common.hlsl | 84 +- .../app_resources/hlsl/gpu_common.hlsl | 26 +- .../hlsl/parallelogram_sampling.hlsl | 727 ++++++-------- .../app_resources/hlsl/pyramid_sampling.hlsl | 568 +++++++++++ .../hlsl/pyramid_sampling/bilinear.hlsl | 86 ++ .../hlsl/pyramid_sampling/biquadratic.hlsl | 158 +++ .../hlsl/pyramid_sampling/urena.hlsl | 87 ++ .../{RayVis.frag.hlsl => ray_vis.frag.hlsl} | 141 ++- .../app_resources/hlsl/silhouette.hlsl | 355 ++++--- .../hlsl/solid_angle_vis.frag.hlsl | 305 ++++++ .../app_resources/hlsl/triangle_sampling.hlsl | 241 +++++ .../app_resources/hlsl/utils.hlsl | 33 +- 73_SolidAngleVisualizer/include/common.hpp | 1 - 73_SolidAngleVisualizer/main.cpp | 925 +++++++++--------- .../include/nbl/examples/cameras/CCamera.hpp | 139 +-- 19 files changed, 2863 insertions(+), 1805 deletions(-) delete mode 100644 73_SolidAngleVisualizer/app_resources/hlsl/Sampling.hlsl delete mode 100644 
73_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl create mode 100644 73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling.hlsl create mode 100644 73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling/bilinear.hlsl create mode 100644 73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling/biquadratic.hlsl create mode 100644 73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling/urena.hlsl rename 73_SolidAngleVisualizer/app_resources/hlsl/{RayVis.frag.hlsl => ray_vis.frag.hlsl} (68%) create mode 100644 73_SolidAngleVisualizer/app_resources/hlsl/solid_angle_vis.frag.hlsl create mode 100644 73_SolidAngleVisualizer/app_resources/hlsl/triangle_sampling.hlsl diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/Drawing.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/Drawing.hlsl index fa2a93b45..4338bd958 100644 --- a/73_SolidAngleVisualizer/app_resources/hlsl/Drawing.hlsl +++ b/73_SolidAngleVisualizer/app_resources/hlsl/Drawing.hlsl @@ -1,5 +1,8 @@ -#ifndef _DEBUG_HLSL_ -#define _DEBUG_HLSL_ +//// Copyright (C) 2026-2026 - DevSH Graphics Programming Sp. z O.O. +//// This file is part of the "Nabla Engine". 
+//// For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _SOLID_ANGLE_VIS_EXAMPLE_DRAWING_HLSL_INCLUDED_
+#define _SOLID_ANGLE_VIS_EXAMPLE_DRAWING_HLSL_INCLUDED_

 #include "common.hlsl"
 #include "gpu_common.hlsl"
@@ -210,6 +213,7 @@ float32_t4 drawCorners(float32_t3x4 modelMatrix, float32_t2 ndc, float32_t aaWid
     return color;
 }

+#ifdef _SOLID_ANGLE_VIS_EXAMPLE_SILHOUETTE_HLSL_INCLUDED_
 float32_t4 drawClippedSilhouetteVertices(float32_t2 ndc, ClippedSilhouette silhouette, float32_t aaWidth)
 {
     float32_t4 color = 0;
@@ -235,6 +239,7 @@ float32_t4 drawClippedSilhouetteVertices(float32_t2 ndc, ClippedSilhouette silho
     }
     return color;
 }
+#endif // _SOLID_ANGLE_VIS_EXAMPLE_SILHOUETTE_HLSL_INCLUDED_

 float32_t4 drawRing(float32_t2 ndc, float32_t aaWidth)
 {
@@ -378,6 +383,120 @@ float32_t4 drawFaces(float32_t3x4 modelMatrix, float32_t3 spherePos, float32_t a
     return color;
 }

+// ============================================================================
+// Spherical geometry drawing helpers (for pyramid visualization)
+// ============================================================================
+
+// Draw the great circle where dot(p, axis) = 0 as an anti-aliased band; returns premultiplied (color * alpha, alpha)
+// Used to visualize caliper planes; `width` is the band half-thickness measured in |dot(p, axis)| units
+float32_t4 drawGreatCirclePlane(
+    float32_t3 axis,
+    float32_t3 spherePos,
+    float32_t aaWidth,
+    float32_t3 color,
+    float32_t width = 0.005f)
+{
+    float32_t3 fragDir = normalize(spherePos);
+
+    // Only draw on front hemisphere
+    if (fragDir.z < 0.0f)
+        return float32_t4(0, 0, 0, 0); // fully transparent on the back (z < 0) hemisphere
+
+    // Distance from the great circle plane
+    float32_t distFromPlane = abs(dot(fragDir, axis)); // axis is not normalized here - presumably callers pass a unit vector, confirm
+
+    float32_t alpha = 1.0f - smoothstep(width - aaWidth, width + aaWidth, distFromPlane); // 1 inside the band, fading to 0 across [width - aaWidth, width + aaWidth]
+
+    return float32_t4(color * alpha, alpha);
+}
+
+// Draw lune boundaries - two small circles at dot(p, axis) = offset ± halfWidth
+// halfWidth and offset are in sin-space (not radians); lineWidth is the half-thickness of each boundary line, compared in the same dot-product units
+float32_t4 drawLuneBoundary(float32_t3 axis, float32_t halfWidth, float32_t offset, float32_t3 spherePos, float32_t aaWidth, float32_t3 color, float32_t lineWidth = 0.004f)
+{
+    float32_t3 fragDir = normalize(spherePos);
+
+    // Only draw on front hemisphere
+    if (fragDir.z < 0.0f)
+        return float32_t4(0, 0, 0, 0);
+
+    // The lune boundaries are where dot(p, axis) = offset ± halfWidth
+    float32_t dotWithAxis = dot(fragDir, axis);
+
+    // Draw both boundaries of the lune (accounting for offset)
+    float32_t upperBound = offset + halfWidth;
+    float32_t lowerBound = offset - halfWidth;
+    float32_t distFromUpperBoundary = abs(dotWithAxis - upperBound);
+    float32_t distFromLowerBoundary = abs(dotWithAxis - lowerBound);
+
+    float32_t alphaUpper = 1.0f - smoothstep(lineWidth - aaWidth, lineWidth + aaWidth, distFromUpperBoundary);
+    float32_t alphaLower = 1.0f - smoothstep(lineWidth - aaWidth, lineWidth + aaWidth, distFromLowerBoundary);
+
+    float32_t alpha = max(alphaUpper, alphaLower); // no blending between the two lines: keep whichever covers this fragment more
+
+    return float32_t4(color * alpha, alpha);
+}
+
+// Draw axis direction markers (dots at +/- axis from center); each endpoint center +/- axis * extent is normalized back onto the sphere before sphereToCircle
+float32_t4 drawAxisMarkers(
+    float32_t3 axis,
+    float32_t3 center,
+    float32_t2 ndc,
+    float32_t aaWidth,
+    float32_t3 color,
+    float32_t extent = 0.25f)
+{
+    float32_t4 result = float32_t4(0, 0, 0, 0);
+
+    // Positive axis endpoint
+    float32_t3 axisEndPos = normalize(center + axis * extent);
+    float32_t3 axisEndPosCircle = sphereToCircle(axisEndPos);
+    result += drawCorner(axisEndPosCircle, ndc, aaWidth, 0.025f, 0.0f, color); // drawCorner is defined outside this hunk; 0.025f is presumably the dot size - confirm
+
+    // Negative axis endpoint (smaller, dimmer)
+    float32_t3 axisEndNeg = normalize(center - axis * extent);
+    float32_t3 axisEndNegCircle = sphereToCircle(axisEndNeg);
+    result += drawCorner(axisEndNegCircle, ndc, aaWidth, 0.015f, 0.0f, color * 0.5f); // half-intensity color, smaller size argument
+
+    return result;
+}
+
+// ============================================================================
+// Visualization
+// ============================================================================
+
+// Draw half of a great circle (the visible half of a lune boundary); spherePos is used as given (not normalized here, unlike the helpers above)
+float32_t4 drawGreatCircleHalf(float32_t3 normal, float32_t3 spherePos, float32_t3 axis3, float32_t aaWidth, float32_t3 color, float32_t thickness)
+{
+    // Point is on great circle if dot(point, normal) ≈ 0
+    // Only draw the half where dot(point, axis3) > 0 (toward silhouette)
+    float32_t dist = abs(dot(spherePos, normal));
+    float32_t sideFade = smoothstep(-0.1f, 0.1f, dot(spherePos, axis3)); // soft mask: 0 below -0.1, 1 above +0.1, so the half-circle fades out rather than ending abruptly
+    float32_t alpha = (1.0f - smoothstep(thickness - aaWidth, thickness + aaWidth, dist)) * sideFade;
+    return float32_t4(color * alpha, alpha);
+}
+
+// Visualize the best caliper edge (the edge that determined axis1); returns transparent black when bestEdgeIdx >= count
+float32_t4 visualizeBestCaliperEdge(const float32_t3 vertices[MAX_SILHOUETTE_VERTICES], uint32_t bestEdgeIdx, uint32_t count, float32_t3 spherePos, float32_t aaWidth)
+{
+    float32_t4 result = float32_t4(0, 0, 0, 0);
+
+    if (bestEdgeIdx >= count)
+        return result;
+
+    uint32_t nextIdx = (bestEdgeIdx + 1 < count) ? bestEdgeIdx + 1 : 0; // wrap so the last vertex connects back to vertex 0
+    float32_t3 v0 = vertices[bestEdgeIdx];
+    float32_t3 v1 = vertices[nextIdx];
+
+    // Draw the best caliper edge with a thicker, gold line
+    float32_t3 pts[2] = {v0, v1};
+    float32_t3 highlightColor = float32_t3(1.0f, 0.8f, 0.0f);
+    float32_t alpha = drawGreatCircleArc(spherePos, pts, aaWidth, 0.008f); // 0.008f is presumably the arc line width (drawGreatCircleArc is defined elsewhere) - confirm
+    result += float32_t4(highlightColor * alpha, alpha);
+
+    return result;
+}
+
 #endif // VISUALIZE_SAMPLES

 #if DEBUG_DATA
@@ -472,4 +591,4 @@ void validateEdgeVisibility(float32_t3x4 modelMatrix, uint32_t sil, uint32_t ver
 }
 #endif // DEBUG_DATA

-#endif // _DEBUG_HLSL_
+#endif // _SOLID_ANGLE_VIS_EXAMPLE_DRAWING_HLSL_INCLUDED_
diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/Sampling.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/Sampling.hlsl
deleted file mode 100644
index cefa65267..000000000
--- a/73_SolidAngleVisualizer/app_resources/hlsl/Sampling.hlsl
+++ /dev/null
@@ -1,355 +0,0 @@
-#ifndef _SAMPLING_HLSL_
-#define _SAMPLING_HLSL_
-
-// Include the spherical triangle utilities
-#include "gpu_common.hlsl"
-#include "parallelogram_sampling.hlsl"
-#include
-#include -#include -#include -#include - -using namespace nbl::hlsl; - -// Maximum number of triangles we can have after clipping -// Without clipping, max 3 faces can be visible at once so 3 faces * 2 triangles = 6 edges, forming max 4 triangles -// With clipping, one more edge. 7 - 2 = 5 max triangles because fanning from one vertex -#define MAX_TRIANGLES 5 - -// Minimal cached sampling data - only what's needed for selection -struct SamplingData -{ - uint32_t count; // Number of valid triangles - uint32_t samplingMode; // Mode used during build - float32_t totalWeight; // Sum of all triangle weights - float32_t3 faceNormal; // Face normal (only used for projected mode) - float32_t triangleSolidAngles[MAX_TRIANGLES]; // Weight per triangle (for selection) - uint32_t triangleIndices[MAX_TRIANGLES]; // Vertex index i (forms triangle with v0, vi, vi+1) -}; - -float32_t2 nextRandomUnorm2(inout nbl::hlsl::Xoroshiro64StarStar rnd) -{ - return float32_t2( - float32_t(rnd()) * 2.3283064365386963e-10, - float32_t(rnd()) * 2.3283064365386963e-10); -} - -float32_t computeProjectedSolidAngleFallback(float32_t3 v0, float32_t3 v1, float32_t3 v2, float32_t3 N) -{ - // 1. Get edge normals (unit vectors) - // We use the cross product of the vertices (unit vectors on sphere) - float32_t3 n0 = cross(v0, v1); - float32_t3 n1 = cross(v1, v2); - float32_t3 n2 = cross(v2, v0); - - // 2. Normalize edge normals (magnitude is sin of the arc length) - float32_t l0 = length(n0); - float32_t l1 = length(n1); - float32_t l2 = length(n2); - - // Guard against degenerate triangles - if (l0 < 1e-7 || l1 < 1e-7 || l2 < 1e-7) - return 0.0f; - - n0 /= l0; - n1 /= l1; - n2 /= l2; - - // 3. 
Get arc lengths (angles in radians) - float32_t a = asin(clamp(l0, -1.0f, 1.0f)); // side v0-v1 - float32_t b = asin(clamp(l1, -1.0f, 1.0f)); // side v1-v2 - float32_t c = asin(clamp(l2, -1.0f, 1.0f)); // side v2-v0 - - // Handle acos/asin quadrant if dot product is negative - if (dot(v0, v1) < 0) - a = 3.14159265 - a; - if (dot(v1, v2) < 0) - b = 3.14159265 - b; - if (dot(v2, v0) < 0) - c = 3.14159265 - c; - - // 4. Compute projected solid angle - float32_t Gamma = 0.5f * (a * dot(n0, N) + b * dot(n1, N) + c * dot(n2, N)); - - // Return the absolute value of the total - return abs(Gamma); -} - -// Build sampling data once - cache only weights for triangle selection -SamplingData buildSamplingDataFromSilhouette(ClippedSilhouette silhouette, uint32_t samplingMode) -{ - SamplingData data; - data.count = 0; - data.totalWeight = 0.0f; - data.samplingMode = samplingMode; - data.faceNormal = float32_t3(0, 0, 0); - - if (silhouette.count < 3) - return data; - - const float32_t3 v0 = silhouette.vertices[0]; - const float32_t3 origin = float32_t3(0, 0, 0); - - // Compute face normal ONCE before the loop - silhouette is planar! 
- if (samplingMode == SAMPLING_MODE::TRIANGLE_PROJECTED_SOLID_ANGLE) - { - float32_t3 v1 = silhouette.vertices[1]; - float32_t3 v2 = silhouette.vertices[2]; - data.faceNormal = normalize(cross(v1 - v0, v2 - v0)); - } - - // Build fan triangulation from v0 - NBL_UNROLL - for (uint32_t i = 1; i < silhouette.count - 1; i++) - { - float32_t3 v1 = silhouette.vertices[i]; - float32_t3 v2 = silhouette.vertices[i + 1]; - - shapes::SphericalTriangle shapeTri = shapes::SphericalTriangle::create(v0, v1, v2, origin); - - // Skip degenerate triangles - if (shapeTri.pyramidAngles()) - continue; - - // Calculate triangle solid angle - float32_t solidAngle; - if (samplingMode == SAMPLING_MODE::TRIANGLE_PROJECTED_SOLID_ANGLE) - { - // scalar_type projectedSolidAngleOfTriangle(const vector3_type receiverNormal, NBL_REF_ARG(vector3_type) cos_sides, NBL_REF_ARG(vector3_type) csc_sides, NBL_REF_ARG(vector3_type) cos_vertices) - float32_t3 cos_vertices = clamp( - (shapeTri.cos_sides - shapeTri.cos_sides.yzx * shapeTri.cos_sides.zxy) * - shapeTri.csc_sides.yzx * shapeTri.csc_sides.zxy, - float32_t3(-1.0f, -1.0f, -1.0f), - float32_t3(1.0f, 1.0f, 1.0f)); - solidAngle = shapeTri.projectedSolidAngleOfTriangle(data.faceNormal, shapeTri.cos_sides, shapeTri.csc_sides, cos_vertices); - } - else - { - solidAngle = shapeTri.solidAngleOfTriangle(); - } - - if (solidAngle <= 0.0f) - continue; - - // Store only what's needed for weighted selection - data.triangleSolidAngles[data.count] = solidAngle; - data.triangleIndices[data.count] = i; - data.totalWeight += solidAngle; - data.count++; - } - -#if DEBUG_DATA - // Validate no antipodal edges exist (would create spherical lune) - for (uint32_t i = 0; i < silhouette.count; i++) - { - uint32_t j = (i + 1) % silhouette.count; - float32_t3 n1 = normalize(silhouette.vertices[i]); - float32_t3 n2 = normalize(silhouette.vertices[j]); - - if (dot(n1, n2) < -0.99f) - { - DebugDataBuffer[0].sphericalLuneDetected = 1; - assert(false && "Spherical lune detected: 
antipodal silhouette edge"); - } - } - DebugDataBuffer[0].maxTrianglesExceeded = (data.count > MAX_TRIANGLES); - DebugDataBuffer[0].triangleCount = data.count; - DebugDataBuffer[0].totalSolidAngles = data.totalWeight; - for (uint32_t tri = 0; tri < data.count; tri++) - { - DebugDataBuffer[0].solidAngles[tri] = data.triangleSolidAngles[tri]; - } -#endif - - return data; -} - -// Sample using cached selection weights, but recompute geometry on-demand -float32_t3 sampleFromData(SamplingData data, ClippedSilhouette silhouette, float32_t2 xi, out float32_t pdf, out uint32_t selectedIdx) -{ - selectedIdx = 0; - - // Handle empty or invalid data - if (data.count == 0 || data.totalWeight <= 0.0f) - { - pdf = 0.0f; - return float32_t3(0, 0, 1); - } - - // Select triangle using cached weighted random selection - float32_t targetWeight = xi.x * data.totalWeight; - float32_t cumulativeWeight = 0.0f; - float32_t prevCumulativeWeight = 0.0f; - - NBL_UNROLL - for (uint32_t i = 0; i < data.count; i++) - { - prevCumulativeWeight = cumulativeWeight; - cumulativeWeight += data.triangleSolidAngles[i]; - - if (targetWeight <= cumulativeWeight) - { - selectedIdx = i; - break; - } - } - - // Remap xi.x to [0,1] within selected triangle's solidAngle interval - float32_t triSolidAngle = data.triangleSolidAngles[selectedIdx]; - float32_t u = (targetWeight - prevCumulativeWeight) / max(triSolidAngle, 1e-7f); - - // Reconstruct the selected triangle geometry - uint32_t vertexIdx = data.triangleIndices[selectedIdx]; - float32_t3 v0 = silhouette.vertices[0]; - float32_t3 v1 = silhouette.vertices[vertexIdx]; - float32_t3 v2 = silhouette.vertices[vertexIdx + 1]; - - float32_t3 faceNormal = normalize(cross(v1 - v0, v2 - v0)); - - float32_t3 origin = float32_t3(0, 0, 0); - - shapes::SphericalTriangle shapeTri = shapes::SphericalTriangle::create(v0, v1, v2, origin); - - // Compute vertex angles once - float32_t3 cos_vertices = clamp( - (shapeTri.cos_sides - shapeTri.cos_sides.yzx * 
shapeTri.cos_sides.zxy) * - shapeTri.csc_sides.yzx * shapeTri.csc_sides.zxy, - float32_t3(-1.0f, -1.0f, -1.0f), - float32_t3(1.0f, 1.0f, 1.0f)); - float32_t3 sin_vertices = sqrt(float32_t3(1.0f, 1.0f, 1.0f) - cos_vertices * cos_vertices); - - // Sample based on mode - float32_t3 direction; - float32_t rcpPdf; - - if (data.samplingMode == SAMPLING_MODE::TRIANGLE_PROJECTED_SOLID_ANGLE) - { - sampling::ProjectedSphericalTriangle samplingTri = - sampling::ProjectedSphericalTriangle::create(shapeTri); - - direction = samplingTri.generate( - rcpPdf, - triSolidAngle, - cos_vertices, - sin_vertices, - shapeTri.cos_sides[0], - shapeTri.cos_sides[2], - shapeTri.csc_sides[1], - shapeTri.csc_sides[2], - faceNormal, - false, - float32_t2(u, xi.y)); - triSolidAngle = rcpPdf; // projected solid angle returned as rcpPdf - } - else - { - sampling::SphericalTriangle samplingTri = - sampling::SphericalTriangle::create(shapeTri); - - direction = samplingTri.generate( - triSolidAngle, - cos_vertices, - sin_vertices, - shapeTri.cos_sides[0], - shapeTri.cos_sides[2], - shapeTri.csc_sides[1], - shapeTri.csc_sides[2], - float32_t2(u, xi.y)); - } - - // Calculate PDF - float32_t trianglePdf = 1.0f / triSolidAngle; - float32_t selectionProb = triSolidAngle / data.totalWeight; - pdf = trianglePdf * selectionProb; - - return normalize(direction); -} - -#if VISUALIZE_SAMPLES - -float32_t4 visualizeSamples(float32_t2 screenUV, float32_t3 spherePos, float32_t2 ndc, float32_t aaWidth, ClippedSilhouette silhouette, SAMPLING_MODE samplingMode, uint32_t frameIndex, SamplingData samplingData, uint32_t numSamples -#if DEBUG_DATA - , - inout RWStructuredBuffer DebugDataBuffer -#endif -) -{ - float32_t4 accumColor = 0; - - if (silhouette.count == 0) - return 0; - - float32_t2 pssSize = float32_t2(0.3, 0.3); // 30% of screen - float32_t2 pssPos = float32_t2(0.01, 0.01); // Offset from corner - bool isInsidePSS = all(and(screenUV >= pssPos, screenUV <= (pssPos + pssSize))); - - ParallelogramSilhouette 
paraSilhouette = buildParallelogram(silhouette, ndc, spherePos, aaWidth, accumColor); - -#if DEBUG_DATA - DebugDataBuffer[0].sampleCount = numSamples; -#endif - for (uint32_t i = 0; i < numSamples; i++) - { - - // Hash the invocation to offset the grid - uint32_t offset = i * 747796405u + 2891336453u; - uint32_t idx = (offset) & 63u; // Keep within 64 samples - float32_t2 xi = float32_t2( - (float32_t(idx & 7u) + 0.5) / 8.0f, - (float32_t(idx >> 3u) + 0.5) / 8.0f); - - float32_t pdf; - uint32_t index = 0; - float32_t3 sampleDir; - if (samplingMode == SAMPLING_MODE::TRIANGLE_SOLID_ANGLE || - samplingMode == SAMPLING_MODE::TRIANGLE_PROJECTED_SOLID_ANGLE) - { - sampleDir = sampleFromData(samplingData, silhouette, xi, pdf, index); - } - else if (samplingMode == SAMPLING_MODE::PROJECTED_PARALLELOGRAM_SOLID_ANGLE) - { - bool valid; - sampleDir = sampleFromParallelogram(paraSilhouette, xi, pdf, valid); - if (!valid) - { - pdf = 0.0f; - sampleDir = float32_t3(0, 0, 1); - } - } -#if DEBUG_DATA - DebugDataBuffer[0].rayData[i] = float32_t4(sampleDir, pdf); -#endif - - float32_t dist3D = distance(sampleDir, normalize(spherePos)); - float32_t alpha3D = 1.0f - smoothstep(0.0f, 0.02f, dist3D); - - if (alpha3D > 0.0f && !isInsidePSS) - { - float32_t3 sampleColor = colorLUT[index].rgb; - accumColor += float32_t4(sampleColor * alpha3D, alpha3D); - } - - if (isInsidePSS) - { - // Map the raw xi to the PSS square dimensions - float32_t2 xiPixelPos = pssPos + xi * pssSize; - float32_t dist2D = distance(screenUV, xiPixelPos); - - float32_t alpha2D = drawCross2D(screenUV, xiPixelPos, 0.005f, 0.001f); - if (alpha2D > 0.0f) - { - float32_t3 sampleColor = colorLUT[index].rgb; - accumColor += float32_t4(sampleColor * alpha2D, alpha2D); - } - } - } - - // just the outline of the PSS - if (isInsidePSS && accumColor.a < 0.1) - accumColor = float32_t4(0.1, 0.1, 0.1, 1.0); - - return accumColor; -} -#endif // VISUALIZE_SAMPLES -#endif // _SAMPLING_HLSL_ diff --git 
a/73_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl deleted file mode 100644 index bd9312733..000000000 --- a/73_SolidAngleVisualizer/app_resources/hlsl/SolidAngleVis.frag.hlsl +++ /dev/null @@ -1,199 +0,0 @@ -#pragma wave shader_stage(fragment) - -#include "common.hlsl" -#include - -using namespace nbl::hlsl; -using namespace ext::FullScreenTriangle; - -#if DEBUG_DATA -[[vk::binding(0, 0)]] RWStructuredBuffer DebugDataBuffer; // TODO: move below other includes -#endif - -#include "utils.hlsl" -#include "Drawing.hlsl" -#include "Sampling.hlsl" -#include "silhouette.hlsl" -[[vk::push_constant]] struct PushConstants pc; - -void setDebugData(uint32_t sil, uint32_t3 region, uint32_t configIndex) -{ -#if DEBUG_DATA - DebugDataBuffer[0].region = uint32_t3(region); - DebugDataBuffer[0].silhouetteIndex = uint32_t(configIndex); - DebugDataBuffer[0].silhouetteVertexCount = uint32_t(getSilhouetteSize(sil)); - for (uint32_t i = 0; i < 6; i++) - { - DebugDataBuffer[0].vertices[i] = uint32_t(getSilhouetteVertex(sil, i)); - } - DebugDataBuffer[0].silhouette = sil; -#endif -} - -void computeCubeGeo() -{ - for (uint32_t i = 0; i < 8; i++) - corners[i] = mul(pc.modelMatrix, float32_t4(constCorners[i], 1.0f)).xyz; - - for (uint32_t f = 0; f < 6; f++) - { - faceCenters[f] = float32_t3(0, 0, 0); - for (uint32_t v = 0; v < 4; v++) - faceCenters[f] += corners[faceToCorners[f][v]]; - faceCenters[f] /= 4.0f; - } -} - -void validateSilhouetteEdges(uint32_t sil, uint32_t vertexCount, inout uint32_t silEdgeMask) -{ -#if DEBUG_DATA - { - for (uint32_t i = 0; i < vertexCount; i++) - { - uint32_t vIdx = i % vertexCount; - uint32_t v1Idx = (i + 1) % vertexCount; - - uint32_t v0Corner = getSilhouetteVertex(sil, vIdx); - uint32_t v1Corner = getSilhouetteVertex(sil, v1Idx); - // Mark edge as part of silhouette - for (uint32_t e = 0; e < 12; e++) - { - uint32_t2 edge = allEdges[e]; - if ((edge.x == v0Corner && 
edge.y == v1Corner) || - (edge.x == v1Corner && edge.y == v0Corner)) - { - silEdgeMask |= (1u << e); - } - } - } - validateEdgeVisibility(pc.modelMatrix, sil, vertexCount, silEdgeMask); - } -#endif -} - -void computeSpherePos(SVertexAttributes vx, out float32_t2 ndc, out float32_t3 spherePos) -{ - ndc = vx.uv * 2.0f - 1.0f; - float32_t aspect = pc.viewport.z / pc.viewport.w; - ndc.x *= aspect; - - float32_t2 normalized = ndc / CIRCLE_RADIUS; - float32_t r2 = dot(normalized, normalized); - - if (r2 <= 1.0f) - { - spherePos = float32_t3(normalized.x, normalized.y, sqrt(1.0f - r2)); - } - else - { - float32_t uv2Plus1 = r2 + 1.0f; - spherePos = float32_t3(normalized.x * 2.0f, normalized.y * 2.0f, 1.0f - r2) / uv2Plus1; - } - spherePos = normalize(spherePos); -} - -[[vk::location(0)]] float32_t4 main(SVertexAttributes vx) : SV_Target0 -{ - float32_t4 color = float32_t4(0, 0, 0, 0); - for (uint32_t i = 0; i < 1; i++) - { - float32_t aaWidth = length(float32_t2(ddx(vx.uv.x), ddy(vx.uv.y))); - float32_t3 spherePos; - float32_t2 ndc; - computeSpherePos(vx, ndc, spherePos); -#if !FAST || DEBUG_DATA - computeCubeGeo(); -#endif - uint32_t3 region; - uint32_t configIndex; - uint32_t vertexCount; - uint32_t sil = computeRegionAndConfig(pc.modelMatrix, region, configIndex, vertexCount); - - uint32_t silEdgeMask = 0; // TODO: take from 'fast' computeSilhouette() -#if DEBUG_DATA - validateSilhouetteEdges(sil, vertexCount, silEdgeMask); -#endif - ClippedSilhouette silhouette; - -#if VISUALIZE_SAMPLES - color += computeSilhouette(pc.modelMatrix, vertexCount, sil, spherePos, aaWidth, silhouette); -#else - computeSilhouette(pc.modelMatrix, vertexCount, sil, silhouette); -#endif - - SamplingData samplingData; - ParallelogramSilhouette paraSilhouette; - if (pc.samplingMode == SAMPLING_MODE::TRIANGLE_SOLID_ANGLE || - pc.samplingMode == SAMPLING_MODE::TRIANGLE_PROJECTED_SOLID_ANGLE) - { - samplingData = buildSamplingDataFromSilhouette(silhouette, pc.samplingMode); - } - else - { - - 
paraSilhouette = buildParallelogram(silhouette -#if VISUALIZE_SAMPLES - , - ndc, spherePos, aaWidth, color -#endif - ); - } - -#if VISUALIZE_SAMPLES - - // For debugging: Draw a small indicator of which faces are found - // color += drawVisibleFaceOverlay(pc.modelMatrix, spherePos, region, aaWidth); - - // color += drawFaces(pc.modelMatrix, spherePos, aaWidth); - - // Draw clipped silhouette vertices - // color += drawClippedSilhouetteVertices(ndc, silhouette, aaWidth); - color += drawHiddenEdges(pc.modelMatrix, spherePos, silEdgeMask, aaWidth); - // color += drawCorners(pc.modelMatrix, ndc, aaWidth, 0.05f); - color += drawRing(ndc, aaWidth); - - // Draw samples on sphere - color += visualizeSamples(vx.uv, spherePos, ndc, aaWidth, silhouette, pc.samplingMode, pc.frameIndex, samplingData, pc.sampleCount -#if DEBUG_DATA - , - DebugDataBuffer -#endif - ); - - if (all(vx.uv >= float32_t2(0.f, 0.97f)) && all(vx.uv <= float32_t2(0.03f, 1.0f))) - { - return float32_t4(colorLUT[configIndex], 1.0f); - } -#else - // Hash the invocation to offset the grid - uint32_t offset = 747796405u + 2891336453u; - uint32_t idx = (offset) & 63u; // Keep within 64 samples - float32_t2 xi = float32_t2( - (float32_t(idx & 7u) + 0.5) / 8.0f, - (float32_t(idx >> 3u) + 0.5) / 8.0f); - - float32_t pdf; - uint32_t index = 0; - float32_t3 sampleDir; - if (pc.samplingMode == SAMPLING_MODE::TRIANGLE_SOLID_ANGLE || - pc.samplingMode == SAMPLING_MODE::TRIANGLE_PROJECTED_SOLID_ANGLE) - { - sampleDir = sampleFromData(samplingData, silhouette, xi, pdf, index); - } - else if (pc.samplingMode == SAMPLING_MODE::PROJECTED_PARALLELOGRAM_SOLID_ANGLE) - { - bool valid; - sampleDir = sampleFromParallelogram(paraSilhouette, xi, pdf, valid); - if (!valid) - { - pdf = 0.0f; - sampleDir = float32_t3(0, 0, 1); - } - } - color += float4(sampleDir * 0.02f / pdf, 1.0f); -#endif // VISUALIZE_SAMPLES - setDebugData(sil, region, configIndex); - } - - return color; -} \ No newline at end of file diff --git 
a/73_SolidAngleVisualizer/app_resources/hlsl/benchmark/benchmark.comp.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/benchmark/benchmark.comp.hlsl
index 0ea7c2afb..3b49d17ca 100644
--- a/73_SolidAngleVisualizer/app_resources/hlsl/benchmark/benchmark.comp.hlsl
+++ b/73_SolidAngleVisualizer/app_resources/hlsl/benchmark/benchmark.comp.hlsl
@@ -1,37 +1,22 @@
-//// Copyright (C) 2023-2024 - DevSH Graphics Programming Sp. z O.O.
+//// Copyright (C) 2026-2026 - DevSH Graphics Programming Sp. z O.O.
 //// This file is part of the "Nabla Engine".
 //// For conditions of distribution and use, see copyright notice in nabla.h
 #pragma shader_stage(compute)
 #include "app_resources/hlsl/common.hlsl"

-// doesn't change Z coordinate
-float32_t3 sphereToCircle(float32_t3 spherePoint)
-{
-    if (spherePoint.z >= 0.0f)
-    {
-        return float32_t3(spherePoint.xy, spherePoint.z);
-    }
-    else
-    {
-        float32_t r2 = (1.0f - spherePoint.z) / (1.0f + spherePoint.z);
-        float32_t uv2Plus1 = r2 + 1.0f;
-        return float32_t3((spherePoint.xy * uv2Plus1 / 2.0f), spherePoint.z);
-    }
-}
-
-#undef DEBUG_DATA // Avoid conflict with DebugDataBuffer in this file
-#undef VISUALIZE_SAMPLES
-
 #include "app_resources/hlsl/benchmark/common.hlsl"
 #include "app_resources/hlsl/silhouette.hlsl"
-#include "app_resources/hlsl/Sampling.hlsl"
 #include "app_resources/hlsl/parallelogram_sampling.hlsl"
+#include "app_resources/hlsl/pyramid_sampling.hlsl"
+#include "app_resources/hlsl/triangle_sampling.hlsl"

 using namespace nbl::hlsl;

 [[vk::binding(0, 0)]] RWByteAddressBuffer outputBuffer;
 [[vk::push_constant]] BenchmarkPushConstants pc;

+static const SAMPLING_MODE benchmarkMode = (SAMPLING_MODE)SAMPLING_MODE_CONST; // mode is fixed per compiled shader; SAMPLING_MODE_CONST is not defined in this file - presumably injected by the host build, confirm
+
 [numthreads(BENCHMARK_WORKGROUP_DIMENSION_SIZE_X, 1, 1)]
 [shader("compute")]
 void main(uint32_t3 invocationID : SV_DispatchThreadID)
@@ -43,43 +28,101 @@ using namespace nbl::hlsl;
     uint32_t3 region;
     uint32_t configIndex;
     uint32_t vertexCount;
-    uint32_t sil = computeRegionAndConfig(perturbedMatrix, region, configIndex, vertexCount);
+    uint32_t sil = ClippedSilhouette::computeRegionAndConfig(perturbedMatrix, region, configIndex, vertexCount);
+
+    ClippedSilhouette silhouette = (ClippedSilhouette)0;
+    silhouette.compute(perturbedMatrix, vertexCount, sil);

-    ClippedSilhouette silhouette;
-    computeSilhouette(perturbedMatrix, vertexCount, sil, silhouette);
-    float32_t pdf;
-    uint32_t triIdx;
+    float32_t pdf = 0.0f; // stays defined when pc.sampleCount == 0 or no mode branch matches; it is read by the final Store
+    uint32_t triIdx = 0u; // only the triangle-fan branch writes this; every other mode previously fed an undefined value into the Store
+    uint32_t validSampleCount = 0; // samples reported valid by the sampler (the triangle-fan branch counts every sample)
     float32_t3 sampleDir = float32_t3(0.0, 0.0, 0.0);
-    if (pc.benchmarkMode == SAMPLING_MODE::TRIANGLE_SOLID_ANGLE ||
-        pc.benchmarkMode == SAMPLING_MODE::TRIANGLE_PROJECTED_SOLID_ANGLE)
+
+    bool sampleValid; // out-argument of the non-triangle samplers; always written by sample() before it is read
+    if (benchmarkMode == SAMPLING_MODE::TRIANGLE_SOLID_ANGLE ||
+        benchmarkMode == SAMPLING_MODE::TRIANGLE_PROJECTED_SOLID_ANGLE)
     {
-        SamplingData samplingData;
-        samplingData = buildSamplingDataFromSilhouette(silhouette, pc.benchmarkMode);
+        TriangleFanSampler samplingData;
+        samplingData = TriangleFanSampler::create(silhouette, benchmarkMode);

-        for (uint32_t i = 0; i < 64; i++)
+        for (uint32_t i = 0; i < pc.sampleCount; i++)
         {
             float32_t2 xi = float32_t2(
                 (float32_t(i & 7u) + 0.5f) / 8.0f,
-                (float32_t(i >> 3u) + 0.5f) / 8.0f);
+                (float32_t((i >> 3u) & 7u) + 0.5f) / 8.0f); // wrap the row index: the 8x8 grid repeats for i >= 64 so xi stays inside [0,1)^2

-            sampleDir += sampleFromData(samplingData, silhouette, xi, pdf, triIdx);
+            sampleDir += samplingData.sample(silhouette, xi, pdf, triIdx);
+            validSampleCount++;
         }
     }
-    else if (pc.benchmarkMode == SAMPLING_MODE::PROJECTED_PARALLELOGRAM_SOLID_ANGLE)
+    else if (benchmarkMode == SAMPLING_MODE::PROJECTED_PARALLELOGRAM_SOLID_ANGLE)
     {
         // Precompute parallelogram for sampling
-        ParallelogramSilhouette paraSilhouette = buildParallelogram(silhouette);
-        for (uint32_t i = 0; i < 64; i++)
+        silhouette.normalize();
+        SilEdgeNormals silEdgeNormals;
+        Parallelogram parallelogram = Parallelogram::create(silhouette, silEdgeNormals);
+        for (uint32_t i = 0; i < pc.sampleCount; i++)
+        {
+            float32_t2 xi = float32_t2(
+                (float32_t(i & 7u) + 0.5f) / 8.0f,
+                (float32_t((i >> 3u) & 7u) + 0.5f) / 8.0f); // row index wrapped to the 8x8 grid (see triangle branch)
+
+            sampleDir += parallelogram.sample(silEdgeNormals, xi, pdf, sampleValid);
+            validSampleCount += sampleValid ? 1u : 0u;
+        }
+    }
+    else if (benchmarkMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_RECTANGLE)
+    {
+        // Precompute spherical pyramid and Urena sampler once (edge normals fused)
+        SilEdgeNormals silEdgeNormals;
+        SphericalPyramid pyramid = SphericalPyramid::create(silhouette, silEdgeNormals);
+        UrenaSampler urena = UrenaSampler::create(pyramid);
+
+        for (uint32_t i = 0; i < pc.sampleCount; i++)
+        {
+            float32_t2 xi = float32_t2(
+                (float32_t(i & 7u) + 0.5f) / 8.0f,
+                (float32_t((i >> 3u) & 7u) + 0.5f) / 8.0f); // row index wrapped to the 8x8 grid (see triangle branch)
+
+            sampleDir += urena.sample(pyramid, silEdgeNormals, xi, pdf, sampleValid);
+            validSampleCount += sampleValid ? 1u : 0u;
+        }
+    }
+    else if (benchmarkMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_BIQUADRATIC)
+    {
+        // Precompute spherical pyramid and biquadratic sampler once (edge normals fused)
+        SilEdgeNormals silEdgeNormals;
+        SphericalPyramid pyramid = SphericalPyramid::create(silhouette, silEdgeNormals);
+        BiquadraticSampler biquad = BiquadraticSampler::create(pyramid);
+
+        for (uint32_t i = 0; i < pc.sampleCount; i++)
+        {
+            float32_t2 xi = float32_t2(
+                (float32_t(i & 7u) + 0.5f) / 8.0f,
+                (float32_t((i >> 3u) & 7u) + 0.5f) / 8.0f); // row index wrapped to the 8x8 grid (see triangle branch)
+
+            sampleDir += biquad.sample(pyramid, silEdgeNormals, xi, pdf, sampleValid);
+            validSampleCount += sampleValid ? 1u : 0u;
+        }
+    }
+    else if (benchmarkMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_BILINEAR)
+    {
+        // Precompute spherical pyramid and bilinear sampler once (edge normals fused)
+        SilEdgeNormals silEdgeNormals;
+        SphericalPyramid pyramid = SphericalPyramid::create(silhouette, silEdgeNormals);
+        BilinearSampler bilin = BilinearSampler::create(pyramid);
+
+        for (uint32_t i = 0; i < pc.sampleCount; i++)
         {
             float32_t2 xi = float32_t2(
                 (float32_t(i & 7u) + 0.5f) / 8.0f,
-                (float32_t(i >> 3u) + 0.5f) / 8.0f);
+                (float32_t((i >> 3u) & 7u) + 0.5f) / 8.0f); // row index wrapped to the 8x8 grid (see triangle branch)

-            bool valid;
-            sampleDir += sampleFromParallelogram(paraSilhouette, xi, pdf, valid);
+            sampleDir += bilin.sample(pyramid, silEdgeNormals, xi, pdf, sampleValid);
+            validSampleCount += sampleValid ? 1u : 0u;
         }
     }

     const uint32_t offset = sizeof(uint32_t) * invocationID.x;
-    outputBuffer.Store(offset, pdf + triIdx + asuint(sampleDir.x) + asuint(sampleDir.y) + asuint(sampleDir.z));
+    outputBuffer.Store(offset, pdf + validSampleCount + triIdx + asuint(sampleDir.x) + asuint(sampleDir.y) + asuint(sampleDir.z)); // sink only: folds every sampler output into one stored word so the sampling work stays observable
 }
diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/common.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/common.hlsl
index 9e4954ebc..d63ec3c6a 100644
--- a/73_SolidAngleVisualizer/app_resources/hlsl/common.hlsl
+++ b/73_SolidAngleVisualizer/app_resources/hlsl/common.hlsl
@@ -1,9 +1,10 @@
-#ifndef _SOLID_ANGLE_VIS_COMMON_HLSL_
-#define _SOLID_ANGLE_VIS_COMMON_HLSL_
+//// Copyright (C) 2026-2026 - DevSH Graphics Programming Sp. z O.O.
+//// This file is part of the "Nabla Engine".
+//// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _SOLID_ANGLE_VIS_EXAMPLE_COMMON_HLSL_INCLUDED_ +#define _SOLID_ANGLE_VIS_EXAMPLE_COMMON_HLSL_INCLUDED_ #include "nbl/builtin/hlsl/cpp_compat.hlsl" -#define DEBUG_DATA 01 -#define VISUALIZE_SAMPLES 01 #define FAST 1 @@ -16,65 +17,83 @@ namespace nbl { TRIANGLE_SOLID_ANGLE, TRIANGLE_PROJECTED_SOLID_ANGLE, - PROJECTED_PARALLELOGRAM_SOLID_ANGLE + PROJECTED_PARALLELOGRAM_SOLID_ANGLE, + SYMMETRIC_PYRAMID_SOLID_ANGLE_RECTANGLE, + SYMMETRIC_PYRAMID_SOLID_ANGLE_BIQUADRATIC, + SYMMETRIC_PYRAMID_SOLID_ANGLE_BILINEAR, + Count }; struct ResultData { - uint32_t parallelogramDoesNotBound; - float32_t parallelogramArea; - uint32_t failedVertexIndex; - uint32_t edgeIsConvex[4]; - - uint32_t parallelogramVerticesInside; - uint32_t parallelogramEdgesInside; - uint32_t failedEdgeIndex; - float32_t2 failedVertexUV; - float32_t3 failedPoint; - uint32_t failedEdgeSample; - float32_t2 failedEdgeUV; - float32_t2 parallelogramCorners[4]; - + // Silhouette uint32_t3 region; uint32_t silhouetteIndex; - uint32_t silhouetteVertexCount; uint32_t silhouette; uint32_t positiveVertCount; uint32_t edgeVisibilityMismatch; - uint32_t clipMask; uint32_t clipCount; uint32_t rotatedSil; uint32_t wrapAround; - uint32_t rotatedClipMask; uint32_t rotateAmount; - uint32_t maxTrianglesExceeded; - uint32_t sphericalLuneDetected; - uint32_t vertices[6]; - uint32_t clippedSilhouetteVertexCount; float32_t3 clippedSilhouetteVertices[7]; uint32_t clippedSilhouetteVerticesIndices[7]; + // Parallelogram + uint32_t parallelogramDoesNotBound; + float32_t parallelogramArea; + uint32_t failedVertexIndex; + uint32_t edgeIsConvex[4]; + uint32_t parallelogramVerticesInside; + uint32_t parallelogramEdgesInside; + float32_t2 parallelogramCorners[4]; + + // spherical triangle + uint32_t maxTrianglesExceeded; + uint32_t sphericalLuneDetected; uint32_t triangleCount; float32_t solidAngles[5]; float32_t totalSolidAngles; - uint32_t 
sampleOutsideSilhouette; - // Sampling ray visualization data uint32_t sampleCount; - float32_t4 rayData[64]; // xyz = direction, w = PDF + float32_t4 rayData[512]; // xyz = direction, w = PDF + + // Pyramid sampling debug data + float32_t3 pyramidAxis1; // First caliper axis direction + float32_t3 pyramidAxis2; // Second caliper axis direction + float32_t3 pyramidCenter; // Silhouette center direction + float32_t pyramidHalfWidth1; // Half-width along axis1 (sin-space) + float32_t pyramidHalfWidth2; // Half-width along axis2 (sin-space) + float32_t pyramidOffset1; // Center offset along axis1 + float32_t pyramidOffset2; // Center offset along axis2 + float32_t pyramidSolidAngle; // Bounding region solid angle + uint32_t pyramidBestEdge; // Which edge produced best caliper + uint32_t pyramidSpansHemisphere; // Warning: silhouette >= hemisphere + float32_t pyramidMin1; // Min dot product along axis1 + float32_t pyramidMax1; // Max dot product along axis1 + float32_t pyramidMin2; // Min dot product along axis2 + float32_t pyramidMax2; // Max dot product along axis2 + uint32_t axis2BiggerThanAxis1; + + // Sampling stats + uint32_t validSampleCount; + uint32_t threadCount; // Used as a hack for fragment shader, as dividend for validSampleCount }; +#ifdef __HLSL_VERSION + [[vk::binding(0, 0)]] RWStructuredBuffer DebugDataBuffer; +#endif + struct PushConstants { float32_t3x4 modelMatrix; float32_t4 viewport; - SAMPLING_MODE samplingMode; uint32_t sampleCount; uint32_t frameIndex; }; @@ -84,6 +103,7 @@ namespace nbl float32_t4x4 viewProjMatrix; float32_t3x4 viewMatrix; float32_t3x4 modelMatrix; + float32_t3x4 invModelMatrix; float32_t4 viewport; uint32_t frameIndex; }; @@ -91,7 +111,7 @@ namespace nbl struct BenchmarkPushConstants { float32_t3x4 modelMatrix; - SAMPLING_MODE benchmarkMode; + uint32_t sampleCount; }; static const float32_t3 colorLUT[27] = { @@ -113,4 +133,4 @@ namespace nbl #endif // __HLSL_VERSION } } -#endif // _SOLID_ANGLE_VIS_COMMON_HLSL_ +#endif // 
_SOLID_ANGLE_VIS_EXAMPLE_COMMON_HLSL_INCLUDED_
diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/gpu_common.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/gpu_common.hlsl
index 040883956..142471493 100644
--- a/73_SolidAngleVisualizer/app_resources/hlsl/gpu_common.hlsl
+++ b/73_SolidAngleVisualizer/app_resources/hlsl/gpu_common.hlsl
@@ -1,15 +1,20 @@
-#ifndef GPU_COMMON_HLSL
-#define GPU_COMMON_HLSL
+//// Copyright (C) 2026-2026 - DevSH Graphics Programming Sp. z O.O.
+//// This file is part of the "Nabla Engine".
+//// For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _SOLID_ANGLE_VIS_EXAMPLE_GPU_COMMON_HLSL_INCLUDED_
+#define _SOLID_ANGLE_VIS_EXAMPLE_GPU_COMMON_HLSL_INCLUDED_

-static const float32_t CIRCLE_RADIUS = 1.0f;
+#include "utils.hlsl"
+
+static const float32_t CIRCLE_RADIUS = 0.5f; // halved from 1.0f; INV_CIRCLE_RADIUS below is derived from it
 static const float32_t INV_CIRCLE_RADIUS = 1.0f / CIRCLE_RADIUS;

 // --- Geometry Utils ---
-struct ClippedSilhouette
-{
-    float32_t3 vertices[7]; // Max 7 vertices after clipping, unnormalized
-    uint32_t count;
-};
+#define MAX_SILHOUETTE_VERTICES 7 // same bound as the removed ClippedSilhouette::vertices[7] (max vertices after clipping)
+
+// Special index values for clip points (NOTE(review): presumably chosen to lie outside the 0..7 cube-corner index range - confirm against the silhouette code)
+static const uint32_t CLIP_POINT_A = 23; // Clip point between last positive and first negative
+static const uint32_t CLIP_POINT_B = 24; // Clip point between last negative and first positive

 static const float32_t3 constCorners[8] = {
     float32_t3(-0.5f, -0.5f, -0.5f), float32_t3(0.5f, -0.5f, -0.5f), float32_t3(-0.5f, 0.5f, -0.5f), float32_t3(0.5f, 0.5f, -0.5f),
@@ -70,7 +75,7 @@ static const uint32_t silhouettes[27][7] = {
     {4, 2, 6, 7, 3, 0, 0}, // 10: Light Orange
     {6, 0, 4, 6, 7, 3, 2}, // 11: Dark Orange
     {4, 1, 3, 7, 5, 0, 0}, // 12: Pink
-    {6, 0, 4, 6, 7, 3, 2}, // 13: Light Pink
+    {4, 0, 4, 6, 7, 3, 2}, // 13: Light Pink - NOTE(review): the leading value reads as the vertex count in neighbouring rows (4-vertex rows pad with zeros), yet six indices still follow here; confirm this row is intentional
     {4, 0, 4, 6, 2, 0, 0}, // 14: Deep Rose
     {6, 0, 1, 3, 7, 5, 4}, // 15: Purple
     {4, 0, 1, 5, 4, 0, 0}, // 16: Light Purple
@@ -166,4 +171,5 @@ float32_t3 getVertex(float32_t3x4 modelMatrix, uint32_t vertexIdx)
     return
corners[vertexIdx]; #endif } -#endif // GPU_COMMON_HLSL + +#endif // _SOLID_ANGLE_VIS_EXAMPLE_GPU_COMMON_HLSL_INCLUDED_ diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/parallelogram_sampling.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/parallelogram_sampling.hlsl index ea9bebcb3..cd02171af 100644 --- a/73_SolidAngleVisualizer/app_resources/hlsl/parallelogram_sampling.hlsl +++ b/73_SolidAngleVisualizer/app_resources/hlsl/parallelogram_sampling.hlsl @@ -1,535 +1,418 @@ -#ifndef _PARALLELOGRAM_SAMPLING_HLSL_ -#define _PARALLELOGRAM_SAMPLING_HLSL_ +//// Copyright (C) 2026-2026 - DevSH Graphics Programming Sp. z O.O. +//// This file is part of the "Nabla Engine". +//// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _SOLID_ANGLE_VIS_EXAMPLE_PARALLELOGRAM_SAMPLING_HLSL_INCLUDED_ +#define _SOLID_ANGLE_VIS_EXAMPLE_PARALLELOGRAM_SAMPLING_HLSL_INCLUDED_ #include #include +#include "silhouette.hlsl" +#include "drawing.hlsl" -#define MAX_SILHOUETTE_VERTICES 7 #define MAX_CURVE_APEXES 2 -#define GET_PROJ_VERT(i) vertices[i].xy *CIRCLE_RADIUS +#define GET_PROJ_VERT(i) silhouette.vertices[i].xy *CIRCLE_RADIUS // ============================================================================ -// Core structures +// Minimum bounding rectangle on projected sphere // ============================================================================ - struct Parallelogram { float16_t2 corner; float16_t2 axisDir; float16_t width; float16_t height; -}; - -struct PrecomputedSilhouette -{ - float16_t3 edgeNormals[MAX_SILHOUETTE_VERTICES]; // 10.5 floats instead of 21 - uint32_t count; -}; - -struct ParallelogramSilhouette -{ - Parallelogram para; - PrecomputedSilhouette silhouette; -}; - -// ============================================================================ -// Silhouette helpers -// ============================================================================ - -PrecomputedSilhouette precomputeSilhouette(NBL_CONST_REF_ARG(ClippedSilhouette) 
sil) -{ - PrecomputedSilhouette result; - result.count = sil.count; - - float32_t3 v0 = sil.vertices[0]; - float32_t3 v1 = sil.vertices[1]; - float32_t3 v2 = sil.vertices[2]; - result.edgeNormals[0] = float16_t3(cross(v0, v1)); - result.edgeNormals[1] = float16_t3(cross(v1, v2)); + // ======================================================================== + // Projection helpers + // ======================================================================== - if (sil.count > 3) + static float32_t3 circleToSphere(float32_t2 circlePoint) { - float32_t3 v3 = sil.vertices[3]; - result.edgeNormals[2] = float16_t3(cross(v2, v3)); - - if (sil.count > 4) - { - float32_t3 v4 = sil.vertices[4]; - result.edgeNormals[3] = float16_t3(cross(v3, v4)); + float32_t2 xy = circlePoint / CIRCLE_RADIUS; + float32_t xy_len_sq = dot(xy, xy); + return float32_t3(xy, sqrt(1.0f - xy_len_sq)); + } - if (sil.count > 5) - { - float32_t3 v5 = sil.vertices[5]; - result.edgeNormals[4] = float16_t3(cross(v4, v5)); + // ======================================================================== + // Curve evaluation helpers + // ======================================================================== - if (sil.count > 6) - { - float32_t3 v6 = sil.vertices[6]; - result.edgeNormals[5] = float16_t3(cross(v5, v6)); - result.edgeNormals[6] = float16_t3(cross(v6, v0)); - } - else - { - result.edgeNormals[5] = float16_t3(cross(v5, v0)); - result.edgeNormals[6] = float16_t3(0.0f, 0.0f, 0.0f); - } - } - else - { - result.edgeNormals[4] = float16_t3(cross(v4, v0)); - result.edgeNormals[5] = float16_t3(0.0f, 0.0f, 0.0f); - result.edgeNormals[6] = float16_t3(0.0f, 0.0f, 0.0f); - } - } - else - { - result.edgeNormals[3] = float16_t3(cross(v3, v0)); - result.edgeNormals[4] = float16_t3(0.0f, 0.0f, 0.0f); - result.edgeNormals[5] = float16_t3(0.0f, 0.0f, 0.0f); - result.edgeNormals[6] = float16_t3(0.0f, 0.0f, 0.0f); - } - } - else + static float32_t2 evalCurvePoint(float32_t3 S, float32_t3 E, float32_t t) { - 
result.edgeNormals[2] = float16_t3(cross(v2, v0)); - result.edgeNormals[3] = float16_t3(0.0f, 0.0f, 0.0f); - result.edgeNormals[4] = float16_t3(0.0f, 0.0f, 0.0f); - result.edgeNormals[5] = float16_t3(0.0f, 0.0f, 0.0f); - result.edgeNormals[6] = float16_t3(0.0f, 0.0f, 0.0f); + float32_t3 v = S + t * (E - S); + float32_t invLen = rsqrt(dot(v, v)); + return v.xy * (invLen * CIRCLE_RADIUS); } - return result; -} + static float32_t2 evalCurveTangent(float32_t3 S, float32_t3 E, float32_t t) + { + float32_t3 v = S + t * (E - S); + float32_t vLenSq = dot(v, v); -bool isInsideSilhouetteFast(float32_t3 dir, NBL_CONST_REF_ARG(PrecomputedSilhouette) sil) -{ - float16_t3 d = float16_t3(dir); - half maxDot = dot(d, sil.edgeNormals[0]); - maxDot = max(maxDot, dot(d, sil.edgeNormals[1])); - maxDot = max(maxDot, dot(d, sil.edgeNormals[2])); - maxDot = max(maxDot, dot(d, sil.edgeNormals[3])); - maxDot = max(maxDot, dot(d, sil.edgeNormals[4])); - maxDot = max(maxDot, dot(d, sil.edgeNormals[5])); - maxDot = max(maxDot, dot(d, sil.edgeNormals[6])); - return maxDot <= half(0.0f); -} -float32_t3 circleToSphere(float32_t2 circlePoint) -{ - float32_t2 xy = circlePoint / CIRCLE_RADIUS; - float32_t xy_len_sq = dot(xy, xy); + if (vLenSq < 1e-12f) + return normalize(E.xy - S.xy); - // if (xy_len_sq >= 1.0f) - // return float32_t3(0, 0, 0); + float32_t3 p = v * rsqrt(vLenSq); + float32_t3 vPrime = E - S; + float32_t2 tangent2D = (vPrime - p * dot(p, vPrime)).xy; - return float32_t3(xy, sqrt(1.0f - xy_len_sq)); -} + float32_t len = length(tangent2D); + return (len > 1e-7f) ? 
tangent2D / len : normalize(E.xy - S.xy); + } -bool isEdgeConvex(float32_t3 S, float32_t3 E) -{ - return nbl::hlsl::cross2D(S.xy, E.xy) < -1e-6f; -} + // Get both endpoint tangents (shares SdotE computation) + static void getProjectedTangents(float32_t3 S, float32_t3 E, out float32_t2 t0, out float32_t2 t1) + { + float32_t SdotE = dot(S, E); -// ============================================================================ -// Curve evaluation helpers -// ============================================================================ + float32_t2 tangent0_2D = (E - S * SdotE).xy; + float32_t2 tangent1_2D = (E * SdotE - S).xy; -// Evaluate curve point at t using rsqrt -float32_t2 evalCurvePoint(float32_t3 S, float32_t3 E, float32_t t) -{ - float32_t3 v = S + t * (E - S); - float32_t invLen = rsqrt(dot(v, v)); - return v.xy * (invLen * CIRCLE_RADIUS); -} + float32_t len0Sq = dot(tangent0_2D, tangent0_2D); + float32_t len1Sq = dot(tangent1_2D, tangent1_2D); -// Evaluate tangent at arbitrary t -float32_t2 evalCurveTangent(float32_t3 S, float32_t3 E, float32_t t) -{ - float32_t3 v = S + t * (E - S); - float32_t vLenSq = dot(v, v); + const float32_t eps = 1e-14f; - if (vLenSq < 1e-12f) - return normalize(E.xy - S.xy); + if (len0Sq > eps && len1Sq > eps) + { + t0 = tangent0_2D * rsqrt(len0Sq); + t1 = tangent1_2D * rsqrt(len1Sq); + return; + } - float32_t3 p = v * rsqrt(vLenSq); - float32_t3 vPrime = E - S; - float32_t2 tangent2D = (vPrime - p * dot(p, vPrime)).xy; + // Rare fallback path + float32_t2 diff = E.xy - S.xy; + float32_t diffLenSq = dot(diff, diff); + float32_t2 fallback = diffLenSq > eps ? diff * rsqrt(diffLenSq) : float32_t2(1.0f, 0.0f); - float32_t len = length(tangent2D); - return (len > 1e-7f) ? tangent2D / len : normalize(E.xy - S.xy); -} + t0 = len0Sq > eps ? tangent0_2D * rsqrt(len0Sq) : fallback; + t1 = len1Sq > eps ? 
tangent1_2D * rsqrt(len1Sq) : fallback; + } -// Get both endpoint tangents efficiently (shares SdotE computation) -void getProjectedTangents(float32_t3 S, float32_t3 E, out float32_t2 t0, out float32_t2 t1) -{ - float32_t SdotE = dot(S, E); + // Compute apex with clamping to prevent apex explosion + static void computeApexClamped(float32_t2 p0, float32_t2 p1, float32_t2 t0, float32_t2 t1, out float32_t2 apex) + { + float32_t denom = t0.x * t1.y - t0.y * t1.x; + float32_t2 center = (p0 + p1) * 0.5f; - float32_t2 tangent0_2D = (E - S * SdotE).xy; - float32_t2 tangent1_2D = (E * SdotE - S).xy; + if (abs(denom) < 1e-6f) + { + apex = center; + return; + } - float32_t len0Sq = dot(tangent0_2D, tangent0_2D); - float32_t len1Sq = dot(tangent1_2D, tangent1_2D); + float32_t2 dp = p1 - p0; + float32_t s = (dp.x * t1.y - dp.y * t1.x) / denom; + apex = p0 + s * t0; - const float32_t eps = 1e-14f; + float32_t2 toApex = apex - center; + float32_t distSq = dot(toApex, toApex); + float32_t maxDistSq = CIRCLE_RADIUS * CIRCLE_RADIUS * 4.0f; - if (len0Sq > eps && len1Sq > eps) - { - t0 = tangent0_2D * rsqrt(len0Sq); - t1 = tangent1_2D * rsqrt(len1Sq); - return; + if (distSq > maxDistSq) + { + apex = center + toApex * (CIRCLE_RADIUS * 2.0f * rsqrt(distSq)); + } } - // Rare fallback path - float32_t2 diff = E.xy - S.xy; - float32_t diffLenSq = dot(diff, diff); - float32_t2 fallback = diffLenSq > eps ? diff * rsqrt(diffLenSq) : float32_t2(1.0f, 0.0f); - - t0 = len0Sq > eps ? tangent0_2D * rsqrt(len0Sq) : fallback; - t1 = len1Sq > eps ? tangent1_2D * rsqrt(len1Sq) : fallback; -} + // ======================================================================== + // Bounding box computation (rotating calipers) + // + // testEdgeForAxis and computeBoundsForAxis are + // templated on a bool to select between two precision levels: + // + // Accurate=false (used by tryCaliperDir, O(N^2) total calls): + // Tests vertices + edge midpoints only. 
Cheap (just dot products) and + // sufficient for *ranking* candidate axes, even though it may + // underestimate the true extent of convex edges. + // + // Accurate=true (used by buildForAxis, called once): + // Also computes tangent-line apex intersections for convex edges to + // find the true extremum. Great circle arcs that project as convex + // curves can bulge beyond their endpoints; the apex (tangent + // evaluation + line intersection + clamping) captures this but is + // ~4x more expensive per edge. + // + // The fast path gives the same relative ranking of axes (the + // approximation error is consistent across candidates), so the + // cheapest axis found by Fast is also the cheapest under Accurate. + // ======================================================================== + + static void testPoint(inout float32_t minAlong, inout float32_t maxAlong, inout float32_t minPerp, inout float32_t maxPerp, float32_t2 pt, float32_t2 dir, float32_t2 perpDir) + { + float32_t projAlong = dot(pt, dir); + float32_t projPerp = dot(pt, perpDir); -// Compute apex with clamping to prevent apex explosion -void computeApexClamped(float32_t2 p0, float32_t2 p1, float32_t2 t0, float32_t2 t1, out float32_t2 apex) -{ - float32_t denom = t0.x * t1.y - t0.y * t1.x; - float32_t2 center = (p0 + p1) * 0.5f; + minAlong = min(minAlong, projAlong); + maxAlong = max(maxAlong, projAlong); + minPerp = min(minPerp, projPerp); + maxPerp = max(maxPerp, projPerp); + } - if (abs(denom) < 1e-6f) + // Accurate=false (Fast): tests vertex + midpoint only. Used O(N^2) times for axis ranking. + // Accurate=true: also computes tangent-line apex for convex edges. Used once for final rect. 
+ template + static void testEdgeForAxis(inout float32_t minAlong, inout float32_t maxAlong, inout float32_t minPerp, inout float32_t maxPerp, const ClippedSilhouette silhouette, uint32_t convexMask, uint32_t n3Mask, float32_t2 dir, float32_t2 perpDir) { - apex = center; - return; - } + const uint32_t nextIdx = (I + 1 < silhouette.count) ? I + 1 : 0; + const float32_t2 projectedVertex = GET_PROJ_VERT(I); - float32_t2 dp = p1 - p0; - float32_t s = (dp.x * t1.y - dp.y * t1.x) / denom; - apex = p0 + s * t0; + testPoint(minAlong, maxAlong, minPerp, maxPerp, projectedVertex, dir, perpDir); - float32_t2 toApex = apex - center; - float32_t distSq = dot(toApex, toApex); - float32_t maxDistSq = CIRCLE_RADIUS * CIRCLE_RADIUS * 4.0f; + bool isN3 = (n3Mask & (1u << I)) != 0; - if (distSq > maxDistSq) - { - apex = center + toApex * (CIRCLE_RADIUS * 2.0f * rsqrt(distSq)); - } -} + if (Accurate) + { + bool isConvex = (convexMask & (1u << I)) != 0; -void testPoint(inout float32_t minAlong, inout float32_t maxAlong, inout float32_t minPerp, inout float32_t maxPerp, float32_t2 pt, float32_t2 axisDir, float32_t2 perpDir) -{ - float32_t projAlong = dot(pt, axisDir); - float32_t projPerp = dot(pt, perpDir); - - minAlong = min(minAlong, projAlong); - maxAlong = max(maxAlong, projAlong); - minPerp = min(minPerp, projPerp); - maxPerp = max(maxPerp, projPerp); -} - -template -void testEdgeForAxisFast(inout float32_t minAlong, inout float32_t maxAlong, inout float32_t minPerp, inout float32_t maxPerp, - uint32_t count, uint32_t n3Mask, float32_t2 axisDir, float32_t2 perpDir, - const float32_t3 vertices[MAX_SILHOUETTE_VERTICES]) -{ - const uint32_t nextIdx = (I + 1 < count) ? 
I + 1 : 0; + if (!isN3 && !isConvex) + return; - testPoint(minAlong, maxAlong, minPerp, maxPerp, GET_PROJ_VERT(I), axisDir, perpDir); + float32_t3 S = silhouette.vertices[I]; + float32_t3 E = silhouette.vertices[nextIdx]; + float32_t2 midPoint = evalCurvePoint(S, E, 0.5f); - if (n3Mask & (1u << I)) - { - float32_t2 midPoint = evalCurvePoint(vertices[I], vertices[nextIdx], 0.5f); - testPoint(minAlong, maxAlong, minPerp, maxPerp, midPoint, axisDir, perpDir); - } -} + if (isN3) + { + testPoint(minAlong, maxAlong, minPerp, maxPerp, midPoint, dir, perpDir); + } -float32_t computeBoundingBoxAreaForAxisFast(NBL_CONST_REF_ARG(float32_t3) vertices[MAX_SILHOUETTE_VERTICES], uint32_t n3Mask, uint32_t count, float32_t2 axisDir) -{ - float32_t2 perpDir = float32_t2(-axisDir.y, axisDir.x); + if (isConvex) + { + float32_t2 t0, endTangent; + getProjectedTangents(S, E, t0, endTangent); - float32_t minAlong = 1e10f; - float32_t maxAlong = -1e10f; - float32_t minPerp = 1e10f; - float32_t maxPerp = -1e10f; + if (dot(t0, perpDir) > 0.0f) + { + float32_t2 apex0; + if (isN3) + { + float32_t2 tangentAtMid = evalCurveTangent(S, E, 0.5f); + computeApexClamped(projectedVertex, midPoint, t0, tangentAtMid, apex0); + testPoint(minAlong, maxAlong, minPerp, maxPerp, apex0, dir, perpDir); + + if (dot(tangentAtMid, perpDir) > 0.0f) + { + float32_t2 apex1; + computeApexClamped(midPoint, E.xy * CIRCLE_RADIUS, tangentAtMid, endTangent, apex1); + testPoint(minAlong, maxAlong, minPerp, maxPerp, apex1, dir, perpDir); + } + } + else + { + computeApexClamped(projectedVertex, E.xy * CIRCLE_RADIUS, t0, endTangent, apex0); + testPoint(minAlong, maxAlong, minPerp, maxPerp, apex0, dir, perpDir); + } + } + } + } + else + { + if (isN3) + { + float32_t2 midPoint = evalCurvePoint(silhouette.vertices[I], silhouette.vertices[nextIdx], 0.5f); + testPoint(minAlong, maxAlong, minPerp, maxPerp, midPoint, dir, perpDir); + } + } + } - testEdgeForAxisFast<0>(minAlong, maxAlong, minPerp, maxPerp, count, n3Mask, axisDir, 
perpDir, vertices); - testEdgeForAxisFast<1>(minAlong, maxAlong, minPerp, maxPerp, count, n3Mask, axisDir, perpDir, vertices); - testEdgeForAxisFast<2>(minAlong, maxAlong, minPerp, maxPerp, count, n3Mask, axisDir, perpDir, vertices); - if (count > 3) + // Unrolled bounding box computation for a given axis direction. + // Accurate=false: fast path for axis ranking during candidate selection. + // Accurate=true: tight bounds with apex computation for the final rectangle. + template + static void computeBoundsForAxis(inout float32_t minAlong, inout float32_t maxAlong, inout float32_t minPerp, inout float32_t maxPerp, const ClippedSilhouette silhouette, uint32_t convexMask, uint32_t n3Mask, float32_t2 dir, float32_t2 perpDir) { - testEdgeForAxisFast<3>(minAlong, maxAlong, minPerp, maxPerp, count, n3Mask, axisDir, perpDir, vertices); - if (count > 4) + testEdgeForAxis<0, Accurate>(minAlong, maxAlong, minPerp, maxPerp, silhouette, convexMask, n3Mask, dir, perpDir); + testEdgeForAxis<1, Accurate>(minAlong, maxAlong, minPerp, maxPerp, silhouette, convexMask, n3Mask, dir, perpDir); + testEdgeForAxis<2, Accurate>(minAlong, maxAlong, minPerp, maxPerp, silhouette, convexMask, n3Mask, dir, perpDir); + if (silhouette.count > 3) { - testEdgeForAxisFast<4>(minAlong, maxAlong, minPerp, maxPerp, count, n3Mask, axisDir, perpDir, vertices); - if (count > 5) + testEdgeForAxis<3, Accurate>(minAlong, maxAlong, minPerp, maxPerp, silhouette, convexMask, n3Mask, dir, perpDir); + if (silhouette.count > 4) { - testEdgeForAxisFast<5>(minAlong, maxAlong, minPerp, maxPerp, count, n3Mask, axisDir, perpDir, vertices); - if (count > 6) + testEdgeForAxis<4, Accurate>(minAlong, maxAlong, minPerp, maxPerp, silhouette, convexMask, n3Mask, dir, perpDir); + if (silhouette.count > 5) { - testEdgeForAxisFast<6>(minAlong, maxAlong, minPerp, maxPerp, count, n3Mask, axisDir, perpDir, vertices); + testEdgeForAxis<5, Accurate>(minAlong, maxAlong, minPerp, maxPerp, silhouette, convexMask, n3Mask, dir, perpDir); 
+ if (silhouette.count > 6) + { + testEdgeForAxis<6, Accurate>(minAlong, maxAlong, minPerp, maxPerp, silhouette, convexMask, n3Mask, dir, perpDir); + } } } } } - return (maxAlong - minAlong) * (maxPerp - minPerp); -} - -void tryCaliperDir(inout float32_t bestArea, inout float32_t2 bestDir, const float32_t2 dir, const float32_t3 vertices[MAX_SILHOUETTE_VERTICES], uint32_t n3Mask, uint32_t count) -{ - float32_t area = computeBoundingBoxAreaForAxisFast(vertices, n3Mask, count, dir); - - if (area < bestArea) + static void tryCaliperDir(inout float32_t bestArea, inout float32_t2 bestDir, const float32_t2 dir, const ClippedSilhouette silhouette, uint32_t n3Mask) { - bestArea = area; - bestDir = dir; - } -} + float32_t2 perpDir = float32_t2(-dir.y, dir.x); -template -inline void processEdge(inout float32_t bestArea, inout float32_t2 bestDir, inout uint32_t convexMask, inout uint32_t n3Mask, uint32_t count, const float32_t3 vertices[MAX_SILHOUETTE_VERTICES]) -{ - const uint32_t nextIdx = (I + 1 < count) ? 
I + 1 : 0; - float32_t3 S = vertices[I]; - float32_t3 E = vertices[nextIdx]; + float32_t minAlong = 1e10f; + float32_t maxAlong = -1e10f; + float32_t minPerp = 1e10f; + float32_t maxPerp = -1e10f; - float32_t2 t0, t1; - getProjectedTangents(S, E, t0, t1); + computeBoundsForAxis(minAlong, maxAlong, minPerp, maxPerp, silhouette, 0, n3Mask, dir, perpDir); - tryCaliperDir(bestArea, bestDir, t0, vertices, n3Mask, count); - - if (isEdgeConvex(S, E)) - { - convexMask |= (1u << I); - tryCaliperDir(bestArea, bestDir, t1, vertices, n3Mask, count); - - if (dot(t0, t1) < 0.5f) + float32_t area = (maxAlong - minAlong) * (maxPerp - minPerp); + if (area < bestArea) { - n3Mask |= (1u << I); - float32_t2 tangentAtMid = evalCurveTangent(S, E, 0.5f); - tryCaliperDir(bestArea, bestDir, tangentAtMid, vertices, n3Mask, count); + bestArea = area; + bestDir = dir; } } -} - -template -inline void testEdgeForAxisAccurate(inout float32_t minAlong, inout float32_t maxAlong, inout float32_t minPerp, inout float32_t maxPerp, uint32_t count, uint32_t convexMask, uint32_t n3Mask, - float32_t2 axisDir, float32_t2 perpDir, const float32_t3 vertices[MAX_SILHOUETTE_VERTICES]) -{ - const uint32_t nextIdx = (I + 1 < count) ? I + 1 : 0; - float32_t2 projectedVertex = vertices[I].xy * CIRCLE_RADIUS; - - testPoint(minAlong, maxAlong, minPerp, maxPerp, projectedVertex, axisDir, perpDir); - bool isN3 = (n3Mask & (1u << I)) != 0; - bool isConvex = (convexMask & (1u << I)) != 0; - - if (!isN3 && !isConvex) - return; - - float32_t3 S = vertices[I]; - float32_t3 E = vertices[nextIdx]; - float32_t2 midPoint = evalCurvePoint(S, E, 0.5f); - - if (isN3) + template + static void processEdge(inout float32_t bestArea, inout float32_t2 bestDir, inout uint32_t convexMask, inout uint32_t n3Mask, const ClippedSilhouette silhouette, inout SilEdgeNormals precompSil) { - testPoint(minAlong, maxAlong, minPerp, maxPerp, midPoint, axisDir, perpDir); - } + const uint32_t nextIdx = (I + 1 < silhouette.count) ? 
I + 1 : 0; + float32_t3 S = silhouette.vertices[I]; + float32_t3 E = silhouette.vertices[nextIdx]; + precompSil.edgeNormals[I] = float16_t3(cross(S, E)); - if (isConvex) - { - float32_t2 t0, endTangent; - getProjectedTangents(S, E, t0, endTangent); + float32_t2 t0, t1; + getProjectedTangents(S, E, t0, t1); - if (dot(t0, perpDir) > 0.0f) + tryCaliperDir(bestArea, bestDir, t0, silhouette, n3Mask); + + if (nbl::hlsl::cross2D(S.xy, E.xy) < -1e-6f) { - float32_t2 apex0; - if (isN3) - { - float32_t2 tangentAtMid = evalCurveTangent(S, E, 0.5f); - computeApexClamped(projectedVertex, midPoint, t0, tangentAtMid, apex0); - testPoint(minAlong, maxAlong, minPerp, maxPerp, apex0, axisDir, perpDir); + convexMask |= (1u << I); + tryCaliperDir(bestArea, bestDir, t1, silhouette, n3Mask); - if (dot(tangentAtMid, perpDir) > 0.0f) - { - float32_t2 apex1; - computeApexClamped(midPoint, E.xy * CIRCLE_RADIUS, tangentAtMid, endTangent, apex1); - testPoint(minAlong, maxAlong, minPerp, maxPerp, apex1, axisDir, perpDir); - } - } - else + if (dot(t0, t1) < 0.5f) { - computeApexClamped(projectedVertex, E.xy * CIRCLE_RADIUS, t0, endTangent, apex0); - testPoint(minAlong, maxAlong, minPerp, maxPerp, apex0, axisDir, perpDir); + n3Mask |= (1u << I); + float32_t2 tangentAtMid = evalCurveTangent(S, E, 0.5f); + tryCaliperDir(bestArea, bestDir, tangentAtMid, silhouette, n3Mask); } } } -} -Parallelogram buildParallelogramForAxisAccurate(const float32_t3 vertices[MAX_SILHOUETTE_VERTICES], uint32_t convexMask, uint32_t n3Mask, uint32_t count, float32_t2 axisDir) -{ - float32_t2 perpDir = float32_t2(-axisDir.y, axisDir.x); + // ======================================================================== + // Factory methods + // ======================================================================== - float32_t minAlong = 1e10f; - float32_t maxAlong = -1e10f; - float32_t minPerp = 1e10f; - float32_t maxPerp = -1e10f; - - testEdgeForAxisAccurate<0>(minAlong, maxAlong, minPerp, maxPerp, count, convexMask, 
n3Mask, axisDir, perpDir, vertices); - testEdgeForAxisAccurate<1>(minAlong, maxAlong, minPerp, maxPerp, count, convexMask, n3Mask, axisDir, perpDir, vertices); - testEdgeForAxisAccurate<2>(minAlong, maxAlong, minPerp, maxPerp, count, convexMask, n3Mask, axisDir, perpDir, vertices); - if (count > 3) + static Parallelogram buildForAxis(const ClippedSilhouette silhouette, uint32_t convexMask, uint32_t n3Mask, float32_t2 dir) { - testEdgeForAxisAccurate<3>(minAlong, maxAlong, minPerp, maxPerp, count, convexMask, n3Mask, axisDir, perpDir, vertices); - if (count > 4) - { - testEdgeForAxisAccurate<4>(minAlong, maxAlong, minPerp, maxPerp, count, convexMask, n3Mask, axisDir, perpDir, vertices); - if (count > 5) - { - testEdgeForAxisAccurate<5>(minAlong, maxAlong, minPerp, maxPerp, count, convexMask, n3Mask, axisDir, perpDir, vertices); - if (count > 6) - { - testEdgeForAxisAccurate<6>(minAlong, maxAlong, minPerp, maxPerp, count, convexMask, n3Mask, axisDir, perpDir, vertices); - } - } - } - } + float32_t2 perpDir = float32_t2(-dir.y, dir.x); + + float32_t minAlong = 1e10f; + float32_t maxAlong = -1e10f; + float32_t minPerp = 1e10f; + float32_t maxPerp = -1e10f; - Parallelogram result; - result.width = float16_t(maxAlong - minAlong); - result.height = float16_t(maxPerp - minPerp); - result.axisDir = float16_t2(axisDir); - result.corner = float16_t2(minAlong * axisDir + minPerp * float16_t2(-axisDir.y, axisDir.x)); + computeBoundsForAxis(minAlong, maxAlong, minPerp, maxPerp, silhouette, convexMask, n3Mask, dir, perpDir); - return result; -} + Parallelogram result; + result.width = float16_t(maxAlong - minAlong); + result.height = float16_t(maxPerp - minPerp); + result.axisDir = float16_t2(dir); + result.corner = float16_t2(minAlong * dir + minPerp * float16_t2(-dir.y, dir.x)); -Parallelogram findMinimumBoundingBoxCurved(const float32_t3 vertices[MAX_SILHOUETTE_VERTICES], uint32_t count + return result; + } + + // Silhouette vertices must be normalized before calling create() 
+ static Parallelogram create(const ClippedSilhouette silhouette, out SilEdgeNormals precompSil #if VISUALIZE_SAMPLES - , - float32_t2 ndc, float32_t3 spherePos, float32_t aaWidth, - inout float32_t4 color + , + float32_t2 ndc, float32_t3 spherePos, float32_t aaWidth, + inout float32_t4 color #endif -) -{ - uint32_t convexMask = 0; - uint32_t n3Mask = 0; - float32_t bestArea = 1e10f; - float32_t2 bestDir = float32_t2(1.0f, 0.0f); - - processEdge<0>(bestArea, bestDir, convexMask, n3Mask, count, vertices); - processEdge<1>(bestArea, bestDir, convexMask, n3Mask, count, vertices); - processEdge<2>(bestArea, bestDir, convexMask, n3Mask, count, vertices); - if (count > 3) + ) { - processEdge<3>(bestArea, bestDir, convexMask, n3Mask, count, vertices); - if (count > 4) + precompSil = (SilEdgeNormals)0; + precompSil.count = silhouette.count; + + uint32_t convexMask = 0; + uint32_t n3Mask = 0; + float32_t bestArea = 1e10f; + float32_t2 bestDir = float32_t2(1.0f, 0.0f); + + processEdge<0>(bestArea, bestDir, convexMask, n3Mask, silhouette, precompSil); + processEdge<1>(bestArea, bestDir, convexMask, n3Mask, silhouette, precompSil); + processEdge<2>(bestArea, bestDir, convexMask, n3Mask, silhouette, precompSil); + if (silhouette.count > 3) { - processEdge<4>(bestArea, bestDir, convexMask, n3Mask, count, vertices); - if (count > 5) + processEdge<3>(bestArea, bestDir, convexMask, n3Mask, silhouette, precompSil); + if (silhouette.count > 4) { - processEdge<5>(bestArea, bestDir, convexMask, n3Mask, count, vertices); - if (count > 6) + processEdge<4>(bestArea, bestDir, convexMask, n3Mask, silhouette, precompSil); + if (silhouette.count > 5) { - processEdge<6>(bestArea, bestDir, convexMask, n3Mask, count, vertices); + processEdge<5>(bestArea, bestDir, convexMask, n3Mask, silhouette, precompSil); + if (silhouette.count > 6) + { + processEdge<6>(bestArea, bestDir, convexMask, n3Mask, silhouette, precompSil); + } } } } - } - tryCaliperDir(bestArea, bestDir, float32_t2(1.0f, 0.0f), 
vertices, n3Mask, count); - tryCaliperDir(bestArea, bestDir, float32_t2(0.0f, 1.0f), vertices, n3Mask, count); + tryCaliperDir(bestArea, bestDir, float32_t2(1.0f, 0.0f), silhouette, n3Mask); + tryCaliperDir(bestArea, bestDir, float32_t2(0.0f, 1.0f), silhouette, n3Mask); - Parallelogram best = buildParallelogramForAxisAccurate(vertices, convexMask, n3Mask, count, bestDir); + Parallelogram best = buildForAxis(silhouette, convexMask, n3Mask, bestDir); #if VISUALIZE_SAMPLES - for (uint32_t i = 0; i < count; i++) - { - if (convexMask & (1u << i)) + for (uint32_t i = 0; i < silhouette.count; i++) { - uint32_t nextIdx = (i + 1) % count; - float32_t2 p0 = vertices[i].xy * CIRCLE_RADIUS; - float32_t2 p1 = vertices[nextIdx].xy * CIRCLE_RADIUS; + if (convexMask & (1u << i)) + { + uint32_t nextIdx = (i + 1) % silhouette.count; + float32_t2 p0 = GET_PROJ_VERT(i); + float32_t2 p1 = GET_PROJ_VERT(nextIdx); - float32_t2 t0, endTangent; - getProjectedTangents(vertices[i], vertices[nextIdx], t0, endTangent); + float32_t2 t0, endTangent; + getProjectedTangents(silhouette.vertices[i], silhouette.vertices[nextIdx], t0, endTangent); - if (n3Mask & (1u << i)) - { - float32_t2 tangentAtMid = evalCurveTangent(vertices[i], vertices[nextIdx], 0.5f); - float32_t2 midPoint = evalCurvePoint(vertices[i], vertices[nextIdx], 0.5f); + if (n3Mask & (1u << i)) + { + float32_t2 tangentAtMid = evalCurveTangent(silhouette.vertices[i], silhouette.vertices[nextIdx], 0.5f); + float32_t2 midPoint = evalCurvePoint(silhouette.vertices[i], silhouette.vertices[nextIdx], 0.5f); - float32_t2 apex0, apex1; - computeApexClamped(p0, midPoint, t0, tangentAtMid, apex0); - computeApexClamped(midPoint, p1, tangentAtMid, endTangent, apex1); + float32_t2 apex0, apex1; + computeApexClamped(p0, midPoint, t0, tangentAtMid, apex0); + computeApexClamped(midPoint, p1, tangentAtMid, endTangent, apex1); - color += drawCorner(float32_t3(apex0, 0.0f), ndc, aaWidth, 0.03, 0.0f, float32_t3(1, 0, 1)); - color += 
drawCorner(float32_t3(midPoint, 0.0f), ndc, aaWidth, 0.02, 0.0f, float32_t3(0, 1, 0)); - color += drawCorner(float32_t3(apex1, 0.0f), ndc, aaWidth, 0.03, 0.0f, float32_t3(1, 0.5, 0)); - } - else - { - float32_t2 apex; - computeApexClamped(p0, p1, t0, endTangent, apex); - color += drawCorner(float32_t3(apex, 0.0f), ndc, aaWidth, 0.03, 0.0f, float32_t3(1, 0, 1)); + color += drawCorner(float32_t3(apex0, 0.0f), ndc, aaWidth, 0.03, 0.0f, float32_t3(1, 0, 1)); + color += drawCorner(float32_t3(midPoint, 0.0f), ndc, aaWidth, 0.02, 0.0f, float32_t3(0, 1, 0)); + color += drawCorner(float32_t3(apex1, 0.0f), ndc, aaWidth, 0.03, 0.0f, float32_t3(1, 0.5, 0)); + } + else + { + float32_t2 apex; + computeApexClamped(p0, p1, t0, endTangent, apex); + color += drawCorner(float32_t3(apex, 0.0f), ndc, aaWidth, 0.03, 0.0f, float32_t3(1, 0, 1)); + } } } - } #endif - - return best; -} -// ============================================================================ -// Main entry points -// ============================================================================ - -ParallelogramSilhouette buildParallelogram(NBL_CONST_REF_ARG(ClippedSilhouette) silhouette -#if VISUALIZE_SAMPLES - , - float32_t2 ndc, float32_t3 spherePos, float32_t aaWidth, - inout float32_t4 color -#endif -) -{ - ParallelogramSilhouette result; - - // if (silhouette.count < 3) - // { - // result.para.corner = float32_t2(0, 0); - // result.para.edge0 = float32_t2(1, 0); - // result.para.edge1 = float32_t2(0, 1); - // result.para.area = 1.0f; - // return result; - // } - - result.para = findMinimumBoundingBoxCurved(silhouette.vertices, silhouette.count -#if VISUALIZE_SAMPLES - , - ndc, spherePos, aaWidth, color -#endif - ); - #if DEBUG_DATA - DebugDataBuffer[0].parallelogramArea = result.para.width * result.para.height; + DebugDataBuffer[0].parallelogramArea = best.width * best.height; #endif - result.silhouette = precomputeSilhouette(silhouette); - return result; -} + return best; + } -float32_t3 
sampleFromParallelogram(NBL_CONST_REF_ARG(ParallelogramSilhouette) paraSilhouette, float32_t2 xi, out float32_t pdf, out bool valid) -{ - float16_t2 axisDir = paraSilhouette.para.axisDir; - float16_t2 perpDir = float16_t2(-axisDir.y, axisDir.x); + float32_t3 sample(NBL_CONST_REF_ARG(SilEdgeNormals) silhouette, float32_t2 xi, out float32_t pdf, out bool valid) + { + float16_t2 perpDir = float16_t2(-axisDir.y, axisDir.x); - float16_t2 circleXY = paraSilhouette.para.corner + - float16_t(xi.x) * paraSilhouette.para.width * axisDir + - float16_t(xi.y) * paraSilhouette.para.height * perpDir; + float16_t2 circleXY = corner + + float16_t(xi.x) * width * axisDir + + float16_t(xi.y) * height * perpDir; - float32_t3 direction = circleToSphere(circleXY); + float32_t3 direction = circleToSphere(circleXY); - valid = (direction.z > 0.0f) && isInsideSilhouetteFast(direction, paraSilhouette.silhouette); - pdf = valid ? (1.0f / (paraSilhouette.para.width * paraSilhouette.para.height)) : 0.0f; + valid = direction.z > 0.0f && silhouette.isInside(direction); + // PDF in solid angle measure: the rectangle is in circle-space (scaled by CIRCLE_RADIUS), + // and the orthographic projection Jacobian is dA_circle/dω = CIRCLE_RADIUS^2 * z + pdf = valid ? (CIRCLE_RADIUS * CIRCLE_RADIUS * direction.z / (float32_t(width) * float32_t(height))) : 0.0f; - return direction; -} + return direction; + } +}; -#endif // _PARALLELOGRAM_SAMPLING_HLSL_ +#endif // _SOLID_ANGLE_VIS_EXAMPLE_PARALLELOGRAM_SAMPLING_HLSL_INCLUDED_ diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling.hlsl new file mode 100644 index 000000000..fab111b3e --- /dev/null +++ b/73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling.hlsl @@ -0,0 +1,568 @@ +//// Copyright (C) 2026-2026 - DevSH Graphics Programming Sp. z O.O. +//// This file is part of the "Nabla Engine". 
+//// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _SOLID_ANGLE_VIS_EXAMPLE_PYRAMID_SAMPLING_HLSL_INCLUDED_ +#define _SOLID_ANGLE_VIS_EXAMPLE_PYRAMID_SAMPLING_HLSL_INCLUDED_ + +#include "gpu_common.hlsl" + +#include +#include +#include +#include + +#include "silhouette.hlsl" +#include "drawing.hlsl" + +// ============================================================================ +// Spherical Rectangle Bound via Rotating Calipers +// +// Bounds the silhouette with a spherical rectangle (intersection of two +// orthogonal lunes). Each lune is defined by two great circles (planes +// through the origin). The rectangle is parameterized for downstream +// samplers (Urena, bilinear, biquadratic) in pyramid_sampling/*.hlsl. +// +// Algorithm: +// 1. Rotating Calipers: Find the edge that minimizes the lune-width proxy +// dot(cross(A, B), C) = sin(edge_len) * sin(angular_dist) +// No per-edge normalization needed, scalar triple product suffices. +// +// 2. Build orthonormal frame from the minimum-width edge: +// - axis1 = normalize(cross(A, B)), pole of the primary lune +// - axis2, axis3 complete the frame via edge-based candidate search +// (tryPrimaryFrameCandidate), oriented toward silhouette center +// +// 3. Project vertices onto the frame as (x/z, y/z) +// to find the bounding rectangle extents (rectR0, rectExtents) +// +// 4. Fallback: if the primary frame leaves vertices near the z=0 plane, +// fix axis3 = camera forward (0,0,1) and search axis1/axis2 via +// tryFallbackFrameCandidate +// +// Key property: If all vertices are inside a great circle half-space, +// then all edges (geodesic arcs) are also inside. No edge extremum +// checking needed (unlike parallelogram_sampling which works in +// projected 2D space where arcs can bulge beyond vertices). 
+// ============================================================================
+// Spherical rectangle bound: stores the orthonormal frame and gnomonic
+// projection extents. Consumed by UrenaSampler, BilinearSampler, BiquadraticSampler.
+struct SphericalPyramid
+{
+    // Orthonormal frame for the bounding region
+    float32_t3 axis1; // Primary axis (from minimum-width edge's great circle normal)
+    float32_t3 axis2; // Secondary axis (perpendicular to axis1)
+    float32_t3 axis3; // Forward axis, toward silhouette (primary) or camera forward (fallback)
+
+    // SphericalRectangle parameters (in the local frame where axis3 is Z)
+    float32_t3 rectR0; // Corner position in local frame
+    float32_t2 rectExtents; // Width (along axis1) and height (along axis2)
+    float32_t solidAngle; // Solid angle of the bounding region (steradians), filled in by create()
+
+    // ========================================================================
+    // Rotating Calipers - Minimum Width Edge Finding (Scalar Triple Product)
+    // ========================================================================
+
+    // Simplified metric: dot(cross(A, B), C) = sin(edge_len) * sin(angular_dist)
+    // This is a lune-area proxy, no per-edge normalization needed for comparison.
+    // Per-vertex cost: one dot product with precomputed edge normal.
+    // Per-edge cost: one cross product (replaces addition + rsqrt).
+    //
+    // Triangular column-major traversal (rotating calipers pattern):
+    // Vertex V_j checks against edges 0..j-2.
+    // V2 -> edge 0; V3 -> edges 0,1; V4 -> edges 0,1,2; etc.
+    // Total checks: (N-2)(N-1)/2 instead of N(N-2).
+    //
+    // Endpoints: dot(cross(A,B), A) = dot(cross(A,B), B) = 0, never affect max.
+    //
+    // NOTE(review): with this traversal edge i (1 <= i <= count-3) is only tested
+    // against the vertices after it (V_{i+2}..V_{count-1}); the vertices before it
+    // are never tested. Only edge 0 and the last two edges see every other vertex.
+    // Confirm this is an intended approximation of the per-edge maximum.
+    //
+    // Outputs:
+    //   bestEdge      index of the selected edge (edge i runs vertices[i] -> vertices[i+1], wrapping)
+    //   bestV0/bestV1 endpoints of the selected edge
+    //   bestWidth     the selected edge's max dot (un-normalized width proxy)
+    //   precompSil    vertex count plus the float16 edge normals cross(v[i], v[i+1]) of every edge
+    // Reads silhouette.vertices[0..count-1]; count is expected to be in [3, 7].
+    static void findMinimumWidthEdge(const ClippedSilhouette silhouette, out uint32_t bestEdge, out float32_t3 bestV0, out float32_t3 bestV1, out float32_t bestWidth, out SilEdgeNormals precompSil)
+    {
+        precompSil = (SilEdgeNormals)0;
+        precompSil.count = silhouette.count;
+
+        // Edge normals: cross(v[i], v[i+1]), inward-facing for CCW-from-origin winding
+        float32_t3 en0 = cross(silhouette.vertices[0], silhouette.vertices[1]);
+        precompSil.edgeNormals[0] = float16_t3(en0);
+        float32_t3 en1 = cross(silhouette.vertices[1], silhouette.vertices[2]);
+        precompSil.edgeNormals[1] = float16_t3(en1);
+
+        // Per-edge max(dot(en_i, v_j)), positive = inside, maximum = widest vertex
+        float32_t maxDot0 = dot(silhouette.vertices[2], en0); // V2 vs edge 0
+
+        // 1e10 marks "edge not measured"; such edges are also skipped by the count checks below
+        float32_t maxDot1 = 1e10f;
+        float32_t maxDot2 = 1e10f;
+        float32_t maxDot3 = 1e10f;
+        float32_t maxDot4 = 1e10f;
+
+        if (silhouette.count > 3)
+        {
+            float32_t3 en2 = cross(silhouette.vertices[2], silhouette.vertices[3]);
+            precompSil.edgeNormals[2] = float16_t3(en2);
+
+            // V3 vs edges 0, 1
+            float32_t3 v3 = silhouette.vertices[3];
+            maxDot0 = max(maxDot0, dot(v3, en0));
+            maxDot1 = dot(v3, en1);
+
+            if (silhouette.count > 4)
+            {
+                float32_t3 en3 = cross(silhouette.vertices[3], silhouette.vertices[4]);
+                precompSil.edgeNormals[3] = float16_t3(en3);
+
+                // V4 vs edges 0, 1, 2
+                float32_t3 v4 = silhouette.vertices[4];
+                maxDot0 = max(maxDot0, dot(v4, en0));
+                maxDot1 = max(maxDot1, dot(v4, en1));
+                maxDot2 = dot(v4, en2);
+
+                if (silhouette.count > 5)
+                {
+                    float32_t3 en4 = cross(silhouette.vertices[4], silhouette.vertices[5]);
+                    precompSil.edgeNormals[4] = float16_t3(en4);
+
+                    // V5 vs edges 0, 1, 2, 3
+                    float32_t3 v5 = silhouette.vertices[5];
+                    maxDot0 = max(maxDot0, dot(v5, en0));
+                    maxDot1 = max(maxDot1, dot(v5, en1));
+                    maxDot2 = max(maxDot2, dot(v5, en2));
+                    maxDot3 = dot(v5, en3);
+
+                    if (silhouette.count > 6)
+                    {
+                        // V6 vs edges 0, 1, 2, 3, 4
+                        float32_t3 v6 = silhouette.vertices[6];
+                        maxDot0 = max(maxDot0, dot(v6, en0));
+                        maxDot1 = max(maxDot1, dot(v6, en1));
+                        maxDot2 = max(maxDot2, dot(v6, en2));
+                        maxDot3 = max(maxDot3, dot(v6, en3));
+                        maxDot4 = dot(v6, en4);
+                    }
+                }
+            }
+        }
+
+        // Best edge: minimum maxDot, no per-edge normalization needed.
+        // Relative epsilon prevents tie-breaking flicker when two edges have
+        // nearly identical widths — the current winner is "sticky" unless a
+        // new edge is meaningfully better (0.1% narrower).
+        const float32_t EDGE_SELECT_EPS = 1e-3f;
+
+        bestWidth = maxDot0;
+        bestEdge = 0;
+        bestV0 = silhouette.vertices[0];
+        bestV1 = silhouette.vertices[1];
+
+        if (silhouette.count > 3)
+        {
+            bool better = maxDot1 < bestWidth * (1.0f - EDGE_SELECT_EPS);
+            bestWidth = better ? maxDot1 : bestWidth;
+            bestEdge = better ? 1 : bestEdge;
+            bestV0 = better ? silhouette.vertices[1] : bestV0;
+            bestV1 = better ? silhouette.vertices[2] : bestV1;
+
+            if (silhouette.count > 4)
+            {
+                better = maxDot2 < bestWidth * (1.0f - EDGE_SELECT_EPS);
+                bestWidth = better ? maxDot2 : bestWidth;
+                bestEdge = better ? 2 : bestEdge;
+                bestV0 = better ? silhouette.vertices[2] : bestV0;
+                bestV1 = better ? silhouette.vertices[3] : bestV1;
+
+                if (silhouette.count > 5)
+                {
+                    better = maxDot3 < bestWidth * (1.0f - EDGE_SELECT_EPS);
+                    bestWidth = better ? maxDot3 : bestWidth;
+                    bestEdge = better ? 3 : bestEdge;
+                    bestV0 = better ? silhouette.vertices[3] : bestV0;
+                    bestV1 = better ? silhouette.vertices[4] : bestV1;
+
+                    if (silhouette.count > 6)
+                    {
+                        better = maxDot4 < bestWidth * (1.0f - EDGE_SELECT_EPS);
+                        bestWidth = better ? maxDot4 : bestWidth;
+                        bestEdge = better ? 4 : bestEdge;
+                        bestV0 = better ? silhouette.vertices[4] : bestV0;
+                        bestV1 = better ? silhouette.vertices[5] : bestV1;
+                    }
+                }
+            }
+        }
+
+        // Check the last 2 edges missed by the triangular traversal:
+        // Edge count-2: vertices[count-2] -> vertices[count-1], check V0..V[count-3]
+        // Edge count-1: vertices[count-1] -> vertices[0], check V1..V[count-2]
+        // Explicit per-count unrolling avoids the generic loop with runtime index comparisons.
+        {
+            // Penultimate edge: vertices[count-2] -> vertices[count-1]
+            const uint32_t penIdx = silhouette.count - 2;
+            float32_t3 enPen = cross(silhouette.vertices[penIdx], silhouette.vertices[penIdx + 1]);
+            precompSil.edgeNormals[penIdx] = float16_t3(enPen);
+            float32_t maxDotPen = dot(silhouette.vertices[0], enPen);
+            if (silhouette.count > 3)
+            {
+                maxDotPen = max(maxDotPen, dot(silhouette.vertices[1], enPen));
+                if (silhouette.count > 4)
+                {
+                    maxDotPen = max(maxDotPen, dot(silhouette.vertices[2], enPen));
+                    if (silhouette.count > 5)
+                    {
+                        maxDotPen = max(maxDotPen, dot(silhouette.vertices[3], enPen));
+                        if (silhouette.count > 6)
+                        {
+                            maxDotPen = max(maxDotPen, dot(silhouette.vertices[4], enPen));
+                        }
+                    }
+                }
+            }
+
+            bool betterPen = maxDotPen < bestWidth * (1.0f - EDGE_SELECT_EPS);
+            bestWidth = betterPen ? maxDotPen : bestWidth;
+            bestEdge = betterPen ? penIdx : bestEdge;
+            bestV0 = betterPen ? silhouette.vertices[penIdx] : bestV0;
+            bestV1 = betterPen ? silhouette.vertices[penIdx + 1] : bestV1;
+
+            // Last edge: vertices[count-1] -> vertices[0] (wrap-around)
+            const uint32_t lastIdx = silhouette.count - 1;
+            float32_t3 enLast = cross(silhouette.vertices[lastIdx], silhouette.vertices[0]);
+            precompSil.edgeNormals[lastIdx] = float16_t3(enLast);
+            float32_t maxDotLast = dot(silhouette.vertices[1], enLast);
+            if (silhouette.count > 3)
+            {
+                maxDotLast = max(maxDotLast, dot(silhouette.vertices[2], enLast));
+                if (silhouette.count > 4)
+                {
+                    maxDotLast = max(maxDotLast, dot(silhouette.vertices[3], enLast));
+                    if (silhouette.count > 5)
+                    {
+                        maxDotLast = max(maxDotLast, dot(silhouette.vertices[4], enLast));
+                        if (silhouette.count > 6)
+                        {
+                            maxDotLast = max(maxDotLast, dot(silhouette.vertices[5], enLast));
+                        }
+                    }
+                }
+            }
+
+            bool betterLast = maxDotLast < bestWidth * (1.0f - EDGE_SELECT_EPS);
+            bestWidth = betterLast ? maxDotLast : bestWidth;
+            bestEdge = betterLast ? lastIdx : bestEdge;
+            bestV0 = betterLast ? silhouette.vertices[lastIdx] : bestV0;
+            bestV1 = betterLast ? silhouette.vertices[0] : bestV1;
+        }
+    }
+
+    // ========================================================================
+    // Template-Unrolled Projection Helpers
+    // ========================================================================
+
+    // Project a single vertex onto candidate axes, updating bounds and minZ in one fused pass.
+    // bound is (minX, minY, maxX, maxY) of the projected (x/z, y/z) coordinates.
+    // No guard on z: a vertex with z == 0 yields an infinite/NaN projection, so callers
+    // must reject the candidate via minZ or guarantee z > 0 themselves.
+    template<uint32_t I>
+    static void projectAndBound(const float32_t3 vertices[MAX_SILHOUETTE_VERTICES], float32_t3 projAxis1, float32_t3 projAxis2, float32_t3 projAxis3, NBL_REF_ARG(float32_t4) bound, NBL_REF_ARG(float32_t) minZ)
+    {
+        float32_t3 v = vertices[I];
+        float32_t x = dot(v, projAxis1);
+        float32_t y = dot(v, projAxis2);
+        float32_t z = dot(v, projAxis3);
+        minZ = min(minZ, z);
+        float32_t rcpZ = rcp(z);
+        float32_t projX = x * rcpZ;
+        float32_t projY = y * rcpZ;
+        bound.x = min(bound.x, projX);
+        bound.y = min(bound.y, projY);
+        bound.z = max(bound.z, projX);
+        bound.w = max(bound.w, projY);
+    }
+
+    // Project all silhouette vertices (template-unrolled, fused bounds + minZ).
+    // Resets bound/minZ first, then folds in vertices[0..count-1].
+    static void projectAllVertices(const ClippedSilhouette silhouette, float32_t3 projAxis1, float32_t3 projAxis2, float32_t3 projAxis3, NBL_REF_ARG(float32_t4) bound, NBL_REF_ARG(float32_t) minZ)
+    {
+        bound = float32_t4(1e10f, 1e10f, -1e10f, -1e10f);
+        minZ = 1e10f;
+        projectAndBound<0>(silhouette.vertices, projAxis1, projAxis2, projAxis3, bound, minZ);
+        projectAndBound<1>(silhouette.vertices, projAxis1, projAxis2, projAxis3, bound, minZ);
+        projectAndBound<2>(silhouette.vertices, projAxis1, projAxis2, projAxis3, bound, minZ);
+        if (silhouette.count > 3)
+        {
+            projectAndBound<3>(silhouette.vertices, projAxis1, projAxis2, projAxis3, bound, minZ);
+            if (silhouette.count > 4)
+            {
+                projectAndBound<4>(silhouette.vertices, projAxis1, projAxis2, projAxis3, bound, minZ);
+                if (silhouette.count > 5)
+                {
+                    projectAndBound<5>(silhouette.vertices, projAxis1, projAxis2, projAxis3, bound, minZ);
+                    if (silhouette.count > 6)
+                    {
+                        projectAndBound<6>(silhouette.vertices, projAxis1, projAxis2, projAxis3, bound, minZ);
+                    }
+                }
+            }
+        }
+    }
+
+    // ========================================================================
+    // Template-Unrolled Frame Candidate Selection
+    // ========================================================================
+
+    // Try an edge as frame candidate for the primary path (axis1 fixed, find best axis2/axis3).
+    // Edge I runs vertices[I] -> vertices[I+1]; with CheckCount the end index wraps to 0 when
+    // I+1 reaches silhouette.count, so CheckCount must be set for any I that can be the last edge.
+    // The best* outputs are only overwritten when the candidate keeps every vertex at
+    // z > 1e-6 and yields a smaller projected rectangle area than bestArea.
+    template<uint32_t I, bool CheckCount = false>
+    static void tryPrimaryFrameCandidate(NBL_CONST_REF_ARG(ClippedSilhouette) silhouette, float32_t3 fixedAxis1, float32_t3 axis3Ref,
+        NBL_REF_ARG(float32_t) bestArea, NBL_REF_ARG(float32_t3) bestAxis2,
+        NBL_REF_ARG(float32_t3) bestAxis3, NBL_REF_ARG(bool) found,
+        NBL_REF_ARG(float32_t) bestMinZ, NBL_REF_ARG(float32_t4) bestBound)
+    {
+        const uint32_t j = CheckCount ? ((I + 1 < silhouette.count) ? I + 1 : 0) : I + 1;
+        float32_t3 edge = silhouette.vertices[j] - silhouette.vertices[I];
+
+        // Candidate axis2: perpendicular to edge, in plane perpendicular to axis1
+        float32_t3 axis2Cand = cross(fixedAxis1, edge);
+        float32_t lenSq = dot(axis2Cand, axis2Cand);
+        if (lenSq < 1e-14f)
+            return;
+        axis2Cand *= rsqrt(lenSq);
+
+        // Candidate axis3: completes the frame
+        float32_t3 axis3Cand = cross(fixedAxis1, axis2Cand);
+
+        // Ensure axis3 points toward center (same hemisphere as reference)
+        if (dot(axis3Cand, axis3Ref) < 0.0f)
+        {
+            axis2Cand = -axis2Cand;
+            axis3Cand = -axis3Cand;
+        }
+
+        // Fused: check all vertices have positive z AND compute bounding rect in one pass
+        float32_t4 bound;
+        float32_t minZ;
+        projectAllVertices(silhouette, fixedAxis1, axis2Cand, axis3Cand, bound, minZ);
+
+        // Skip if any vertex would have z <= 0
+        if (minZ <= 1e-6f)
+            return;
+
+        float32_t rectArea = (bound.z - bound.x) * (bound.w - bound.y);
+        if (rectArea < bestArea)
+        {
+            bestArea = rectArea;
+            bestAxis2 = axis2Cand;
+            bestAxis3 = axis3Cand;
+            bestMinZ = minZ;
+            bestBound = bound;
+            found = true;
+        }
+    }
+
+    // Try an edge as frame candidate for the fallback path (axis3 fixed, find best axis1/axis2).
+    // The edge is projected into the plane perpendicular to fixedAxis3 and becomes axis1.
+    // Same CheckCount wrap rule as tryPrimaryFrameCandidate. There is no minZ rejection here:
+    // the caller relies on every vertex having a positive component along fixedAxis3.
+    template<uint32_t I, bool CheckCount = false>
+    static void tryFallbackFrameCandidate(NBL_CONST_REF_ARG(ClippedSilhouette) silhouette, float32_t3 fixedAxis3, NBL_REF_ARG(float32_t) bestArea, NBL_REF_ARG(float32_t3) bestAxis1, NBL_REF_ARG(float32_t3) bestAxis2, NBL_REF_ARG(uint32_t) bestEdge, NBL_REF_ARG(float32_t4) bestBound)
+    {
+        const uint32_t j = CheckCount ? ((I + 1 < silhouette.count) ? I + 1 : 0) : I + 1;
+        float32_t3 edge = silhouette.vertices[j] - silhouette.vertices[I];
+
+        float32_t3 edgeInPlane = edge - fixedAxis3 * dot(edge, fixedAxis3);
+        float32_t lenSq = dot(edgeInPlane, edgeInPlane);
+        if (lenSq < 1e-14f)
+            return;
+
+        float32_t3 axis1Cand = edgeInPlane * rsqrt(lenSq);
+        float32_t3 axis2Cand = cross(fixedAxis3, axis1Cand);
+
+        float32_t4 bound;
+        float32_t minZ;
+        projectAllVertices(silhouette, axis1Cand, axis2Cand, fixedAxis3, bound, minZ);
+
+        float32_t rectArea = (bound.z - bound.x) * (bound.w - bound.y);
+        if (rectArea < bestArea)
+        {
+            bestArea = rectArea;
+            bestAxis1 = axis1Cand;
+            bestAxis2 = axis2Cand;
+            bestBound = bound;
+            bestEdge = I;
+        }
+    }
+
+    // ========================================================================
+    // Visualization
+    // ========================================================================
+
+#if VISUALIZE_SAMPLES
+    // Debug overlay: the four bounding great circles, the rectangle center and the three
+    // frame axes. Returns the accumulated overlay color for this pixel.
+    float32_t4 visualize(float32_t3 spherePos, float32_t2 ndc, float32_t aaWidth)
+    {
+        float32_t4 color = float32_t4(0, 0, 0, 0);
+
+        // Colors for visualization
+        float32_t3 boundColor1 = float32_t3(1.0f, 0.5f, 0.5f); // Light red for axis1 bounds
+        float32_t3 boundColor2 = float32_t3(0.5f, 0.5f, 1.0f); // Light blue for axis2 bounds
+        float32_t3 centerColor = float32_t3(1.0f, 1.0f, 0.0f); // Yellow for center
+
+        float32_t x0 = rectR0.x;
+        float32_t x1 = rectR0.x + rectExtents.x;
+        float32_t y0 = rectR0.y;
+        float32_t y1 = rectR0.y + rectExtents.y;
+        float32_t z = rectR0.z;
+
+        // Great circle normals for the 4 edges (in local frame, then transform to world)
+        float32_t3 bottomNormalLocal = normalize(float32_t3(0, -z, y0));
+        float32_t3 topNormalLocal = normalize(float32_t3(0, z, -y1));
+        float32_t3 leftNormalLocal = normalize(float32_t3(-z, 0, x0));
+        float32_t3 rightNormalLocal = normalize(float32_t3(z, 0, -x1));
+
+        // Transform to world space
+        float32_t3 bottomNormal = bottomNormalLocal.x * axis1 + bottomNormalLocal.y * axis2 + bottomNormalLocal.z * axis3;
+        float32_t3 topNormal = topNormalLocal.x * axis1 + topNormalLocal.y * axis2 + topNormalLocal.z * axis3;
+        float32_t3 leftNormal = leftNormalLocal.x * axis1 + leftNormalLocal.y * axis2 + leftNormalLocal.z * axis3;
+        float32_t3 rightNormal = rightNormalLocal.x * axis1 + rightNormalLocal.y * axis2 + rightNormalLocal.z * axis3;
+
+        // Draw the 4 bounding great circles
+        color += drawGreatCircleHalf(bottomNormal, spherePos, axis3, aaWidth, boundColor2, 0.004f);
+        color += drawGreatCircleHalf(topNormal, spherePos, axis3, aaWidth, boundColor2, 0.004f);
+        color += drawGreatCircleHalf(leftNormal, spherePos, axis3, aaWidth, boundColor1, 0.004f);
+        color += drawGreatCircleHalf(rightNormal, spherePos, axis3, aaWidth, boundColor1, 0.004f);
+
+        // Draw center point (center of the rectangle projected onto sphere).
+        // Same local -> world mapping as the normals above (all three axes added);
+        // subtracting the axis2 term would mirror the marker across axis2.
+        float32_t centerX = (x0 + x1) * 0.5f;
+        float32_t centerY = (y0 + y1) * 0.5f;
+        float32_t3 centerLocal = normalize(float32_t3(centerX, centerY, z));
+        float32_t3 centerWorld = centerLocal.x * axis1 + centerLocal.y * axis2 + centerLocal.z * axis3;
+
+        float32_t3 centerCircle = sphereToCircle(centerWorld);
+        color += drawCorner(centerCircle, ndc, aaWidth, 0.025f, 0.0f, centerColor);
+
+        color += drawCorner(axis1, ndc, aaWidth, 0.025f, 0.0f, float32_t3(1.0f, 0.0f, 0.0f));
+        color += drawCorner(axis2, ndc, aaWidth, 0.025f, 0.0f, float32_t3(0.0f, 1.0f, 0.0f));
+        color += drawCorner(axis3, ndc, aaWidth, 0.025f, 0.0f, float32_t3(0.0f, 0.0f, 1.0f));
+
+        return color;
+    }
+#endif // VISUALIZE_SAMPLES
+
+    // ========================================================================
+    // Factory
+    // ========================================================================
+
+    // Builds the bounding pyramid for a silhouette and fills silEdgeNormals with the
+    // per-edge normals computed along the way. Every field of the result is assigned,
+    // including solidAngle. With VISUALIZE_SAMPLES the debug overlay is added to `color`.
+    static SphericalPyramid create(NBL_CONST_REF_ARG(ClippedSilhouette) silhouette, NBL_REF_ARG(SilEdgeNormals) silEdgeNormals
+#if VISUALIZE_SAMPLES
+        ,
+        float32_t2 ndc, float32_t3 spherePos, float32_t aaWidth, inout float32_t4 color
+#endif
+    )
+    {
+        SphericalPyramid self;
+
+        // Step 1: Find minimum-width edge using rotating calipers with lune metric
+        uint32_t bestEdge;
+        float32_t3 bestV0, bestV1;
+        float32_t minWidth;
+        findMinimumWidthEdge(silhouette, bestEdge, bestV0, bestV1, minWidth, silEdgeNormals);
+
+        // Step 2: Build orthonormal frame from best edge
+        // axis1 = perpendicular to the best edge's great circle (primary caliper direction)
+        self.axis1 = normalize(cross(bestV0, bestV1));
+
+        // Compute centroid for reference direction
+        float32_t3 center = silhouette.getCenter();
+        float32_t3 centerInPlane = center - self.axis1 * dot(center, self.axis1);
+        float32_t3 axis3Ref = normalize(centerInPlane);
+
+        // Step 2b: Try each edge-aligned rotation around axis1 to find the axis2/axis3
+        // orientation that keeps all vertices in the positive half-space with minimum
+        // bounding rectangle area
+        float32_t bestRectArea = 1e20f;
+        float32_t3 bestAxis2 = cross(axis3Ref, self.axis1);
+        float32_t3 bestAxis3 = axis3Ref;
+        bool foundValidFrame = false;
+        float32_t bestMinZ = 0.0f;
+        float32_t4 bounds = float32_t4(-0.1f, -0.1f, 0.1f, 0.1f);
+
+        // Edge 2 is the wrap-around edge of a 3-vertex silhouette, so it needs the count check too
+        tryPrimaryFrameCandidate<0>(silhouette, self.axis1, axis3Ref, bestRectArea, bestAxis2, bestAxis3, foundValidFrame, bestMinZ, bounds);
+        tryPrimaryFrameCandidate<1>(silhouette, self.axis1, axis3Ref, bestRectArea, bestAxis2, bestAxis3, foundValidFrame, bestMinZ, bounds);
+        tryPrimaryFrameCandidate<2, true>(silhouette, self.axis1, axis3Ref, bestRectArea, bestAxis2, bestAxis3, foundValidFrame, bestMinZ, bounds);
+        if (silhouette.count > 3)
+        {
+            tryPrimaryFrameCandidate<3, true>(silhouette, self.axis1, axis3Ref, bestRectArea, bestAxis2, bestAxis3, foundValidFrame, bestMinZ, bounds);
+            if (silhouette.count > 4)
+            {
+                tryPrimaryFrameCandidate<4, true>(silhouette, self.axis1, axis3Ref, bestRectArea, bestAxis2, bestAxis3, foundValidFrame, bestMinZ, bounds);
+                if (silhouette.count > 5)
+                {
+                    tryPrimaryFrameCandidate<5, true>(silhouette, self.axis1, axis3Ref, bestRectArea, bestAxis2, bestAxis3, foundValidFrame, bestMinZ, bounds);
+                    if (silhouette.count > 6)
+                    {
+                        tryPrimaryFrameCandidate<6, true>(silhouette, self.axis1, axis3Ref, bestRectArea, bestAxis2, bestAxis3, foundValidFrame, bestMinZ, bounds);
+                    }
+                }
+            }
+        }
+
+        self.axis2 = bestAxis2;
+        self.axis3 = bestAxis3;
+
+        // Fallback: if the primary path failed (no valid frame found, or axis3 leaves
+        // vertices too close to the z=0 singularity), fix axis3 = camera forward and
+        // search for the best axis1/axis2 rotation around it.
+        if (!foundValidFrame || bestMinZ < 0.15f)
+        {
+            // Use camera forward as axis3 (all silhouette vertices have z > 0 by construction)
+            self.axis3 = float32_t3(0.0f, 0.0f, 1.0f);
+
+            // Find optimal axis1/axis2 rotation around axis3 by trying each edge
+            float32_t bestFallbackArea = 1e20f;
+            // axis3 = (0,0,1), so cross((0,0,1), (1,0,0)) = (0,1,0), cross((0,0,1), (0,1,0)) = (-1,0,0)
+            self.axis1 = float32_t3(0.0f, 1.0f, 0.0f);
+            self.axis2 = float32_t3(-1.0f, 0.0f, 0.0f);
+            // Bounds measured in the rejected primary frame do not describe this frame;
+            // start from the default so they cannot survive if every candidate is skipped
+            bounds = float32_t4(-0.1f, -0.1f, 0.1f, 0.1f);
+
+            tryFallbackFrameCandidate<0>(silhouette, self.axis3, bestFallbackArea, self.axis1, self.axis2, bestEdge, bounds);
+            tryFallbackFrameCandidate<1>(silhouette, self.axis3, bestFallbackArea, self.axis1, self.axis2, bestEdge, bounds);
+            tryFallbackFrameCandidate<2, true>(silhouette, self.axis3, bestFallbackArea, self.axis1, self.axis2, bestEdge, bounds);
+            if (silhouette.count > 3)
+            {
+                tryFallbackFrameCandidate<3, true>(silhouette, self.axis3, bestFallbackArea, self.axis1, self.axis2, bestEdge, bounds);
+                if (silhouette.count > 4)
+                {
+                    tryFallbackFrameCandidate<4, true>(silhouette, self.axis3, bestFallbackArea, self.axis1, self.axis2, bestEdge, bounds);
+                    if (silhouette.count > 5)
+                    {
+                        tryFallbackFrameCandidate<5, true>(silhouette, self.axis3, bestFallbackArea, self.axis1, self.axis2, bestEdge, bounds);
+                        if (silhouette.count > 6)
+                        {
+                            tryFallbackFrameCandidate<6, true>(silhouette, self.axis3, bestFallbackArea, self.axis1, self.axis2, bestEdge, bounds);
+                        }
+                    }
+                }
+            }
+        }
+
+        // Degenerate bounds check (single computation, after primary/fallback decision)
+        if (bounds.x >= bounds.z || bounds.y >= bounds.w)
+            bounds = float32_t4(-0.1f, -0.1f, 0.1f, 0.1f);
+
+        self.rectR0 = float32_t3(bounds.xy, 1.0f);
+        self.rectExtents = float32_t2(bounds.zw - bounds.xy);
+
+        // Solid angle of the planar rectangle [x0,x1] x [y0,y1] at z = 1 seen from the origin:
+        // with F(x,y) = atan(x*y / sqrt(x^2 + y^2 + 1)), omega = F(x1,y1) - F(x0,y1) - F(x1,y0) + F(x0,y0).
+        // Corner order below: (x0y0, x1y0, x0y1, x1y1)
+        const float32_t4 cornerX = bounds.xzxz;
+        const float32_t4 cornerY = bounds.yyww;
+        const float32_t4 cornerTerm = atan(cornerX * cornerY * rsqrt(cornerX * cornerX + cornerY * cornerY + 1.0f));
+        self.solidAngle = cornerTerm.x - cornerTerm.y - cornerTerm.z + cornerTerm.w;
+
+#if VISUALIZE_SAMPLES
+        color += drawCorner(center, ndc, aaWidth, 0.05f, 0.0f, float32_t3(1.0f, 0.0f, 1.0f));
+        color += visualizeBestCaliperEdge(silhouette.vertices, bestEdge, silhouette.count, spherePos, aaWidth);
+        color += self.visualize(spherePos, ndc, aaWidth);
+#endif
+
+#if DEBUG_DATA
+        DebugDataBuffer[0].pyramidAxis1 = self.axis1;
+        DebugDataBuffer[0].pyramidAxis2 = self.axis2;
+        DebugDataBuffer[0].pyramidCenter = center;
+        DebugDataBuffer[0].pyramidHalfWidth1 = (atan(bounds.z) - atan(bounds.x)) * 0.5f;
+        DebugDataBuffer[0].pyramidHalfWidth2 = (atan(bounds.w) - atan(bounds.y)) * 0.5f;
+        DebugDataBuffer[0].pyramidSolidAngle = self.solidAngle;
+        DebugDataBuffer[0].pyramidBestEdge = bestEdge;
+        DebugDataBuffer[0].pyramidMin1 = bounds.x;
+        DebugDataBuffer[0].pyramidMin2 = bounds.y;
+        DebugDataBuffer[0].pyramidMax1 = bounds.z;
+        DebugDataBuffer[0].pyramidMax2 = bounds.w;
+#endif
+
+        return self;
+    }
+};
+
+#include "pyramid_sampling/urena.hlsl"
+#include "pyramid_sampling/bilinear.hlsl"
+#include "pyramid_sampling/biquadratic.hlsl"
+
+#endif // _SOLID_ANGLE_VIS_EXAMPLE_PYRAMID_SAMPLING_HLSL_INCLUDED_
diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling/bilinear.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling/bilinear.hlsl
new file mode 100644
index 000000000..7d3319a7c
--- /dev/null
+++
b/73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling/bilinear.hlsl
@@ -0,0 +1,86 @@
+//// Copyright (C) 2026-2026 - DevSH Graphics Programming Sp. z O.O.
+//// This file is part of the "Nabla Engine".
+//// For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _SOLID_ANGLE_VIS_EXAMPLE_SAMPLING_BILINEAR_HLSL_INCLUDED_
+#define _SOLID_ANGLE_VIS_EXAMPLE_SAMPLING_BILINEAR_HLSL_INCLUDED_
+#include
+
+// ============================================================================
+// Bilinear Approximation Sampling (closed-form, faster than biquadratic)
+// ============================================================================
+//
+struct BilinearSampler
+{
+    nbl::hlsl::sampling::Bilinear sampler;
+
+    float32_t rcpTotalIntegral;
+    float32_t rectArea;
+
+    // Builds the sampler from the solid-angle density at the four corners of the
+    // pyramid's bounding rectangle (z = 1 plane of the pyramid's local frame).
+    static BilinearSampler create(NBL_CONST_REF_ARG(SphericalPyramid) pyramid)
+    {
+        // Opposite corners of the rectangle and their per-axis squares
+        const float32_t2 minCorner = pyramid.rectR0.xy;
+        const float32_t2 maxCorner = minCorner + pyramid.rectExtents;
+        const float32_t2 minSq = minCorner * minCorner;
+        const float32_t2 maxSq = maxCorner * maxCorner;
+
+        // x^2 + y^2 + 1 at each corner, already in Bilinear layout order:
+        // (x0y0, x0y1, x1y0, x1y1)
+        const float32_t4 cornerDist2 = float32_t4(minSq.x, minSq.x, maxSq.x, maxSq.x) +
+            float32_t4(minSq.y, maxSq.y, minSq.y, maxSq.y) +
+            1.0f;
+
+        // dSA(x,y) = 1 / (x^2 + y^2 + 1)^(3/2) [z = 1.0 in local frame]
+        const float32_t4 cornerDensity = rsqrt(cornerDist2) / cornerDist2;
+
+        BilinearSampler self;
+        self.sampler = nbl::hlsl::sampling::Bilinear::create(cornerDensity);
+
+        // A bilinear patch integrates over the unit square to the mean of its corners.
+        // The corners are summed as x0y0, x1y0, x0y1, x1y1.
+        const float32_t meanDensity =
+            (cornerDensity.x + cornerDensity.z + cornerDensity.y + cornerDensity.w) * 0.25f;
+        self.rcpTotalIntegral = 1.0f / max(meanDensity, 1e-20f);
+        self.rectArea = pyramid.rectExtents.x * pyramid.rectExtents.y;
+
+        return self;
+    }
+
+    // Draws a direction inside the pyramid's bounding rectangle with density following
+    // the bilinear fit. Returns the direction built from the pyramid axes; `pdf` is
+    // expressed per unit solid angle and `valid` is set when the direction has z > 0
+    // and passes silhouette.isInside().
+    float32_t3 sample(NBL_CONST_REF_ARG(SphericalPyramid) pyramid, NBL_CONST_REF_ARG(SilEdgeNormals) silhouette, float32_t2 xi, out float32_t pdf, out bool valid)
+    {
+        // Closed-form bilinear warp of xi; uvRcpPdf receives 1 / pdf in UV space
+        float32_t uvRcpPdf;
+        const float32_t2 warped = sampler.generate(uvRcpPdf, xi);
+
+        // Bilinear sampler convention: warped.y = first-sampled axis (X),
+        // warped.x = second-sampled axis (Y)
+        const float32_t rectX = pyramid.rectR0.x + warped.y * pyramid.rectExtents.x;
+        const float32_t rectY = pyramid.rectR0.y + warped.x * pyramid.rectExtents.y;
+
+        // Squared length of (rectX, rectY, 1) and its reciprocal root, shared by the
+        // normalization and the dSA term below
+        const float32_t lenSq = rectX * rectX + rectY * rectY + 1.0f;
+        const float32_t invLen = rsqrt(lenSq);
+        const float32_t3 unnormalized = rectX * pyramid.axis1 +
+            rectY * pyramid.axis2 +
+            pyramid.axis3;
+        const float32_t3 direction = unnormalized * invLen;
+
+        valid = direction.z > 0.0f && silhouette.isInside(direction);
+
+        // Solid-angle pdf = 1 / (uvRcpPdf * dSA * rectArea); the denominator is floored
+        // at 1e-7 before the division
+        const float32_t dsa = invLen / lenSq;
+        pdf = 1.0f / max(uvRcpPdf * dsa * rectArea, 1e-7f);
+
+        return direction;
+    }
+};
+
+#endif // _SOLID_ANGLE_VIS_EXAMPLE_SAMPLING_BILINEAR_HLSL_INCLUDED_
diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling/biquadratic.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling/biquadratic.hlsl
new file mode 100644
index 000000000..e75c89595
--- /dev/null
+++ b/73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling/biquadratic.hlsl
@@ -0,0 +1,158 @@
+//// Copyright (C) 2026-2026 - DevSH Graphics Programming Sp. z O.O.
+//// This file is part of the "Nabla Engine".
+//// For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _SOLID_ANGLE_VIS_EXAMPLE_SAMPLING_BIQUADRATIC_HLSL_INCLUDED_
+#define _SOLID_ANGLE_VIS_EXAMPLE_SAMPLING_BIQUADRATIC_HLSL_INCLUDED_
+
+// ============================================================================
+// Biquadratic Approximation Sampling (Hart et al. 2020)
+// ============================================================================
+//
+// Precomputed biquadratic sampler for importance sampling solid angle density.
+// Build once from a SphericalPyramid, then call sample() per random pair.
+
+struct BiquadraticSampler
+{
+    // Column-major: cols[i] = (row0[i], row1[i], row2[i]) for fast sliceAtY via dot
+    float32_t3x3 cols;
+
+    // Precomputed marginal (Y) polynomial: f(y) = c0 + y*(c1 + y*c2)
+    float32_t margC0, margC1, margC2, margIntegral;
+
+    float32_t rcpTotalIntegral;
+    float32_t rcpIntegralTimesRcpArea; // rcpTotalIntegral / rectArea (fused for PDF computation)
+
+    // Newton-Raphson CDF inversion for a quadratic PDF (2 iterations)
+    // Solves: c0*t + (c1/2)*t^2 + (c2/3)*t^3 = u * integral
+    // Returns the sampled t in [0, 1]; lastPdfVal receives the (un-normalized) PDF
+    // c0 + t*(c1 + t*c2) evaluated AT THE RETURNED t, so callers can use it directly
+    // as the density of the sample they got back.
+    // 2 iterations give ~4 decimal digits, should be sufficient for importance sampling with rejection?
+    static float32_t sampleQuadraticCDF(float32_t u, float32_t c0, float32_t c1, float32_t c2, float32_t integral, out float32_t lastPdfVal)
+    {
+        // Floor for the Newton slope: keeps the step finite (and 0/0 out of the picture)
+        // if the interpolated PDF touches zero
+        const float32_t MIN_NEWTON_SLOPE = 1e-20f;
+
+        const float32_t target = u * integral;
+        const float32_t c1half = c1 * 0.5f;
+        const float32_t c2third = c2 * (1.0f / 3.0f);
+        float32_t t = u;
+
+        // Iteration 1
+        float32_t cdfVal = t * (c0 + t * (c1half + t * c2third));
+        float32_t slope = c0 + t * (c1 + t * c2);
+        t = clamp(t - (cdfVal - target) / max(slope, MIN_NEWTON_SLOPE), 0.0f, 1.0f);
+
+        // Iteration 2
+        cdfVal = t * (c0 + t * (c1half + t * c2third));
+        slope = c0 + t * (c1 + t * c2);
+        t = clamp(t - (cdfVal - target) / max(slope, MIN_NEWTON_SLOPE), 0.0f, 1.0f);
+
+        // The slope above belongs to t BEFORE the final step; re-evaluate so the reported
+        // PDF matches the point that is actually returned
+        lastPdfVal = c0 + t * (c1 + t * c2);
+
+        return t;
+    }
+
+    // Precompute biquadratic sampler from pyramid (call ONCE, reuse for all samples)
+    static BiquadraticSampler create(NBL_CONST_REF_ARG(SphericalPyramid) pyramid)
+    {
+        BiquadraticSampler self;
+
+        // 3x3 grid positions on the rectangle
+        const float32_t x0 = pyramid.rectR0.x;
+        const float32_t x1 = x0 + 0.5f * pyramid.rectExtents.x;
+        const float32_t x2 = x0 + pyramid.rectExtents.x;
+        const float32_t y0 = pyramid.rectR0.y;
+        const float32_t y1 = y0 + 0.5f * pyramid.rectExtents.y;
+        const float32_t y2 = y0 + pyramid.rectExtents.y;
+
+        // dSA(x,y) = rsqrt(x^2+y^2+1) / (x^2+y^2+1) [z = rectR0.z = 1.0]
+        const float32_t xx0 = x0 * x0, xx1 = x1 * x1, xx2 = x2 * x2;
+        const float32_t yy0 = y0 * y0, yy1 = y1 * y1, yy2 = y2 * y2;
+
+        float32_t3 row0, row1, row2;
+        float32_t d;
+
+        d = xx0 + yy0 + 1.0f;
+        row0.x = rsqrt(d) / d;
+        d = xx1 + yy0 + 1.0f;
+        row0.y = rsqrt(d) / d;
+        d = xx2 + yy0 + 1.0f;
+        row0.z = rsqrt(d) / d;
+
+        d = xx0 + yy1 + 1.0f;
+        row1.x = rsqrt(d) / d;
+        d = xx1 + yy1 + 1.0f;
+        row1.y = rsqrt(d) / d;
+        d = xx2 + yy1 + 1.0f;
+        row1.z = rsqrt(d) / d;
+
+        d = xx0 + yy2 + 1.0f;
+        row2.x = rsqrt(d) / d;
+        d = xx1 + yy2 + 1.0f;
+        row2.y = rsqrt(d) / d;
+        d = xx2 + yy2 + 1.0f;
+        row2.z = rsqrt(d) / d;
+
+        // Store column-major for sliceAtY: cols[i] = (row0[i], row1[i], row2[i])
+        self.cols[0] = float32_t3(row0.x, row1.x, row2.x);
+        self.cols[1] = float32_t3(row0.y, row1.y, row2.y);
+        self.cols[2] = float32_t3(row0.z, row1.z, row2.z);
+
+        // Marginal along Y: Simpson's rule integral of each row
+        const float32_t3 marginal = float32_t3(
+            (row0.x + 4.0f * row0.y + row0.z) / 6.0f,
+            (row1.x + 4.0f * row1.y + row1.z) / 6.0f,
+            (row2.x + 4.0f * row2.y + row2.z) / 6.0f);
+
+        // Precompute marginal polynomial: f(y) = c0 + y*(c1 + y*c2)
+        // (quadratic through marginal[0..2] at y = 0, 0.5, 1)
+        self.margC0 = marginal[0];
+        self.margC1 = -3.0f * marginal[0] + 4.0f * marginal[1] - marginal[2];
+        self.margC2 = 2.0f * (marginal[0] - 2.0f * marginal[1] + marginal[2]);
+        self.margIntegral = (marginal[0] + 4.0f * marginal[1] + marginal[2]) / 6.0f;
+
+        self.rcpTotalIntegral = 1.0f / max(self.margIntegral, 1e-20f);
+        const float32_t rectArea = pyramid.rectExtents.x * pyramid.rectExtents.y;
+        self.rcpIntegralTimesRcpArea = self.rcpTotalIntegral / max(rectArea, 1e-20f);
+
+        return self;
+    }
+
+    // Sample a direction on the spherical pyramid using biquadratic importance sampling.
+    // Returns the world-space direction; outputs pdf in solid-angle space and validity flag.
+    float32_t3 sample(NBL_CONST_REF_ARG(SphericalPyramid) pyramid, NBL_CONST_REF_ARG(SilEdgeNormals) silhouette, float32_t2 xi, out float32_t pdf, out bool valid)
+    {
+        // Step 1: Sample Y from precomputed marginal polynomial
+        float32_t margPdfAtY;
+        const float32_t y = sampleQuadraticCDF(xi.y, margC0, margC1, margC2, margIntegral, margPdfAtY);
+
+        // Step 2: Compute conditional X slice at sampled Y via Lagrange basis
+        const float32_t y2 = y * y;
+        const float32_t3 Ly = float32_t3(2.0f * y2 - 3.0f * y + 1.0f, -4.0f * y2 + 4.0f * y, 2.0f * y2 - y);
+        const float32_t3 slice = float32_t3(dot(cols[0], Ly), dot(cols[1], Ly), dot(cols[2], Ly));
+
+        // Step 3: Build conditional polynomial and sample X
+        const float32_t condC0 = slice[0];
+        const float32_t condC1 = -3.0f * slice[0] + 4.0f * slice[1] - slice[2];
+        const float32_t condC2 = 2.0f * (slice[0] - 2.0f * slice[1] + slice[2]);
+        const float32_t condIntegral = (slice[0] + 4.0f * slice[1] + slice[2]) / 6.0f;
+        float32_t condPdfAtX;
+        const float32_t x = sampleQuadraticCDF(xi.x, condC0, condC1, condC2, condIntegral, condPdfAtX);
+
+        // Step 4: UV to direction
+        const float32_t localX = pyramid.rectR0.x + x * pyramid.rectExtents.x;
+        const float32_t localY = pyramid.rectR0.y + y * pyramid.rectExtents.y;
+
+        // Compute dist2 and rcpLen once, reuse for both normalization and dSA
+        const float32_t dist2 = localX * localX + localY * localY + 1.0f;
+        const float32_t rcpLen = rsqrt(dist2);
+        float32_t3 direction = (localX * pyramid.axis1 +
+            localY * pyramid.axis2 +
+            pyramid.axis3) *
+            rcpLen;
+
+        valid = direction.z > 0.0f && silhouette.isInside(direction);
+
+        // Step 5: PDF in solid angle space = condPdfAtX / (totalIntegral * dSA * rectArea)
+        // The marginal PDF at y equals the conditional slice's integral, so the two cancel
+        // and only the conditional polynomial value at the returned x remains.
+        const float32_t dsa = rcpLen / dist2;
+        pdf = condPdfAtX * rcpIntegralTimesRcpArea / max(dsa, 1e-7f);
+
+        return direction;
+    }
+};
+
+#endif // _SOLID_ANGLE_VIS_EXAMPLE_SAMPLING_BIQUADRATIC_HLSL_INCLUDED_
diff --git
a/73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling/urena.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling/urena.hlsl new file mode 100644 index 000000000..6709bf7da --- /dev/null +++ b/73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling/urena.hlsl @@ -0,0 +1,87 @@ +//// Copyright (C) 2026-2026 - DevSH Graphics Programming Sp. z O.O. +//// This file is part of the "Nabla Engine". +//// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _SOLID_ANGLE_VIS_EXAMPLE_SAMPLING_URENA_HLSL_INCLUDED_ +#define _SOLID_ANGLE_VIS_EXAMPLE_SAMPLING_URENA_HLSL_INCLUDED_ + +// ============================================================================ +// Sampling using Urena 2003 (SphericalRectangle) -- NOTE(review): the spherical-rectangle parametrization paper by Urena et al. appears to be dated 2013; confirm the citation +// ============================================================================ + +struct UrenaSampler +{ + float32_t solidAngle; // Solid angle of the bounding region (steradians) + float32_t samplerK; // = 2*pi - q (angle offset for horizontal sampling) + float32_t samplerB0; // = n_z[0] (normalized edge parameter) + float32_t samplerB1; // = n_z[2] (normalized edge parameter) + + // Precompute solid angle AND sampler intermediates in one pass + // (solidAngleOfRectangle and generate() both compute n_z/cosGamma -- fuse them) + static UrenaSampler create(NBL_CONST_REF_ARG(SphericalPyramid) pyramid) + { + UrenaSampler self; + + const float32_t4 denorm_n_z = float32_t4(-pyramid.rectR0.y, pyramid.rectR0.x + pyramid.rectExtents.x, pyramid.rectR0.y + pyramid.rectExtents.y, -pyramid.rectR0.x); + const float32_t4 n_z = denorm_n_z / sqrt((float32_t4)(pyramid.rectR0.z * pyramid.rectR0.z) + denorm_n_z * denorm_n_z); + const float32_t4 cosGamma = float32_t4(-n_z[0] * n_z[1], -n_z[1] * n_z[2], + -n_z[2] * n_z[3], -n_z[3] * n_z[0]); + + nbl::hlsl::math::sincos_accumulator adder = nbl::hlsl::math::sincos_accumulator::create(cosGamma[0]); + adder.addCosine(cosGamma[1]); + const float32_t p = adder.getSumofArccos(); + adder = 
nbl::hlsl::math::sincos_accumulator::create(cosGamma[2]); + adder.addCosine(cosGamma[3]); + const float32_t q = adder.getSumofArccos(); + + self.solidAngle = p + q - 2.0f * nbl::hlsl::numbers::pi; + self.samplerK = 2.0f * nbl::hlsl::numbers::pi - q; + self.samplerB0 = n_z[0]; + self.samplerB1 = n_z[2]; + + return self; + } + + float32_t3 sample(NBL_CONST_REF_ARG(SphericalPyramid) pyramid, NBL_CONST_REF_ARG(SilEdgeNormals) silhouette, float32_t2 xi, out float32_t pdf, out bool valid) + { + // Inlined Urena 2003 with algebraic simplifications. NOTE(review): the math below hard-codes a rectangle plane at z = 1 (d_2 = xu^2 + 1) while create() reads pyramid.rectR0.z -- confirm rectR0.z is always 1 + const float32_t r1x = pyramid.rectR0.x + pyramid.rectExtents.x; + const float32_t r1y = pyramid.rectR0.y + pyramid.rectExtents.y; + + // Horizontal CDF inversion + const float32_t au = xi.x * solidAngle + samplerK; + float32_t sinAu, cosAu; + sincos(au, sinAu, cosAu); + const float32_t fu = (cosAu * samplerB0 - samplerB1) / sinAu; // NOTE(review): unguarded divide; sinAu == 0 gives inf/NaN -- confirm au cannot land on a multiple of pi + + // cu = sign(fu)/sqrt(cu_2), xu = cu/sqrt(1-cu^2) + // Fused: xu = sign(fu)/sqrt(cu_2 - 1) [eliminates 2 sqrt + 2 div -> 1 rsqrt] + const float32_t cu_2 = max(fu * fu + samplerB0 * samplerB0, 1.0f); + const float32_t xu = clamp( + (fu >= 0.0f ? 
1.0f : -1.0f) * rsqrt(max(cu_2 - 1.0f, 1e-10f)), + pyramid.rectR0.x, r1x); + const float32_t d_2 = xu * xu + 1.0f; + + // Vertical sampling in h-space (div -> rsqrt + mul): hv is interpolated linearly between the values h0 and h1 taken at the rectangle's two y edges + const float32_t h0 = pyramid.rectR0.y * rsqrt(d_2 + pyramid.rectR0.y * pyramid.rectR0.y); + const float32_t h1 = r1y * rsqrt(d_2 + r1y * r1y); + const float32_t hv = h0 + xi.y * (h1 - h0); + + // Normalized direction via ||(xu,yv,1)||^2 = d_2/(1-hv^2): + // localDir.y = yv/||v|| = hv (exact cancellation) + // localDir.xz = (xu, 1) * t where t = sqrt(1-hv^2)/sqrt(d_2) + // Eliminates: sqrt(d_2), yv computation, and normalize() + const float32_t t = sqrt(max(1.0f - hv * hv, 0.0f)) * rsqrt(d_2); + const float32_t3 localDir = float32_t3(xu * t, hv, t); + + float32_t3 direction = localDir.x * pyramid.axis1 + + localDir.y * pyramid.axis2 + + localDir.z * pyramid.axis3; + + valid = direction.z > 0.0f && silhouette.isInside(direction); // false for non-positive z or when the silhouette test fails; the direction is returned either way + pdf = 1.0f / max(solidAngle, 1e-7f); // constant over the whole rectangle; not renormalised for samples flagged invalid above + + return direction; + } +}; + +#endif // _SOLID_ANGLE_VIS_EXAMPLE_SAMPLING_URENA_HLSL_INCLUDED_ diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/RayVis.frag.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/ray_vis.frag.hlsl similarity index 68% rename from 73_SolidAngleVisualizer/app_resources/hlsl/RayVis.frag.hlsl rename to 73_SolidAngleVisualizer/app_resources/hlsl/ray_vis.frag.hlsl index a8a1ff52d..d01b3a07f 100644 --- a/73_SolidAngleVisualizer/app_resources/hlsl/RayVis.frag.hlsl +++ b/73_SolidAngleVisualizer/app_resources/hlsl/ray_vis.frag.hlsl @@ -1,3 +1,6 @@ +//// Copyright (C) 2026-2026 - DevSH Graphics Programming Sp. z O.O. +//// This file is part of the "Nabla Engine". 
+//// For conditions of distribution and use, see copyright notice in nabla.h #pragma wave shader_stage(fragment) #include "common.hlsl" @@ -16,18 +19,15 @@ struct ArrowResult }; [[vk::push_constant]] struct PushConstantRayVis pc; -// #if DEBUG_DATA -[[vk::binding(0, 0)]] RWStructuredBuffer DebugDataBuffer; -// #endif #if VISUALIZE_SAMPLES -#include "Drawing.hlsl" +#include "drawing.hlsl" // Ray-AABB intersection in world space // Returns the distance to the nearest intersection point, or -1 if no hit float32_t rayAABBIntersection(float32_t3 rayOrigin, float32_t3 rayDir, float32_t3 aabbMin, float32_t3 aabbMax) { - float32_t3 invDir = 1.0 / rayDir; + float32_t3 invDir = 1.0f / rayDir; float32_t3 t0 = (aabbMin - rayOrigin) * invDir; float32_t3 t1 = (aabbMax - rayOrigin) * invDir; @@ -61,7 +61,7 @@ ArrowResult visualizeRayAsArrow(float32_t3 rayOrigin, float32_t4 directionAndPdf { ArrowResult result; result.color = float32_t4(0, 0, 0, 0); - result.depth = 1.0; // Far plane in reversed-Z + result.depth = 0.0; // Far plane in reversed-Z float32_t3 rayDir = normalize(directionAndPdf.xyz); float32_t pdf = directionAndPdf.w; @@ -140,7 +140,7 @@ ArrowResult visualizeRayAsArrow(float32_t3 rayOrigin, float32_t4 directionAndPdf // Compute NDC depth for reversed-Z float32_t depthNDC = clipPos.z / clipPos.w; - result.depth = depthNDC; + result.depth = 1.0f - depthNDC; // Clip against valid depth range if (result.depth < 0.0 || result.depth > 1.0) @@ -157,32 +157,6 @@ ArrowResult visualizeRayAsArrow(float32_t3 rayOrigin, float32_t4 directionAndPdf return result; } -// Transform a point by inverse of model matrix (world to local space) -float32_t3 worldToLocal(float32_t3 worldPos, float32_t3x4 modelMatrix) -{ - // Manually construct 4x4 from 3x4 - float32_t4x4 model4x4 = float32_t4x4( - modelMatrix[0], - modelMatrix[1], - modelMatrix[2], - float32_t4(0.0, 0.0, 0.0, 1.0)); - float32_t4x4 invModel = inverse(model4x4); - return mul(invModel, float32_t4(worldPos, 1.0)).xyz; -} - -// 
Transform a direction by inverse of model matrix (no translation) -float32_t3 worldToLocalDir(float32_t3 worldDir, float32_t3x4 modelMatrix) -{ - // Manually construct 4x4 from 3x4 - float32_t4x4 model4x4 = float32_t4x4( - modelMatrix[0], - modelMatrix[1], - modelMatrix[2], - float32_t4(0.0, 0.0, 0.0, 1.0)); - float32_t4x4 invModel = inverse(model4x4); - return mul(invModel, float32_t4(worldDir, 0.0)).xyz; -} - // Returns both tMin (entry) and tMax (exit) for ray-AABB intersection struct AABBIntersection { @@ -220,6 +194,7 @@ AABBIntersection rayAABBIntersectionFull(float32_t3 origin, float32_t3 dir, floa } #endif // VISUALIZE_SAMPLES +// [shader("pixel")] [[vk::location(0)]] ArrowResult main(SVertexAttributes vx) { ArrowResult output; @@ -253,58 +228,54 @@ AABBIntersection rayAABBIntersectionFull(float32_t3 origin, float32_t3 dir, floa uint32_t sampleCount = DebugDataBuffer[0].sampleCount; - // for (uint32_t i = 0; i < sampleCount; i++) - // { - // float32_t3 rayOrigin = float32_t3(0, 0, 0); - // float32_t4 directionAndPdf = DebugDataBuffer[0].rayData[i]; - // float32_t3 rayDir = normalize(directionAndPdf.xyz); - - // // Define cube bounds in local space - // float32_t3 cubeLocalMin = float32_t3(-0.5, -0.5, -0.5); - // float32_t3 cubeLocalMax = float32_t3(0.5, 0.5, 0.5); - - // // Transform ray to local space of the cube - // float32_t3 localRayOrigin = worldToLocal(rayOrigin, pc.modelMatrix); - // float32_t3 localRayDir = normalize(worldToLocalDir(rayDir, pc.modelMatrix)); - - // // Get both entry and exit distances - // AABBIntersection intersection = rayAABBIntersectionFull( - // localRayOrigin, - // localRayDir, - // cubeLocalMin, - // cubeLocalMax); - - // float32_t arrowLength; - // float32_t3 arrowColor; - - // if (intersection.hit) - // { - // // Use tMax (exit point at back face) instead of tMin (entry point at front face) - // float32_t3 localExitPoint = localRayOrigin + localRayDir * intersection.tMax; - // float32_t3 worldExitPoint = 
mul(pc.modelMatrix, float32_t4(localExitPoint, 1.0)).xyz; - // arrowLength = length(worldExitPoint - rayOrigin); - // arrowColor = float32_t3(0.0, 1.0, 0.0); // Green for valid samples - // } - // else - // { - // // Ray doesn't intersect - THIS SHOULD NEVER HAPPEN with correct sampling! - // float32_t3 cubeCenter = mul(pc.modelMatrix, float32_t4(0, 0, 0, 1)).xyz; - // arrowLength = length(cubeCenter - rayOrigin) + 2.0; - // arrowColor = float32_t3(1.0, 0.0, 0.0); // Red for BROKEN samples - // } - - // ArrowResult arrow = visualizeRayAsArrow(rayOrigin, directionAndPdf, arrowLength, ndcPos, aspect); - - // // Only update depth if arrow was actually drawn - // if (arrow.color.a > 0.0) - // { - // maxDepth = max(maxDepth, arrow.depth); - // } - - // // Modulate arrow color by its alpha (only add where arrow is visible) - // output.color.rgb += arrowColor * arrow.color.a; - // output.color.a = max(output.color.a, arrow.color.a); - // } + for (uint32_t i = 0; i < sampleCount; i++) + { + float32_t3 rayOrigin = float32_t3(0, 0, 0); + float32_t4 directionAndPdf = DebugDataBuffer[0].rayData[i]; + float32_t3 rayDir = normalize(directionAndPdf.xyz); + + // Define cube bounds in local space + float32_t3 cubeLocalMin = float32_t3(-0.5, -0.5, -0.5); + float32_t3 cubeLocalMax = float32_t3(0.5, 0.5, 0.5); + + // Transform ray to local space of the cube (using precomputed inverse) + float32_t3 localRayOrigin = mul(pc.invModelMatrix, float32_t4(rayOrigin, 1.0)).xyz; + float32_t3 localRayDir = normalize(mul(pc.invModelMatrix, float32_t4(rayDir, 0.0)).xyz); + + // Get both entry and exit distances + AABBIntersection intersection = rayAABBIntersectionFull(localRayOrigin, localRayDir, cubeLocalMin, cubeLocalMax); + + float32_t arrowLength; + float32_t3 arrowColor; + + if (intersection.hit) + { + // Use tMax (exit point at back face) instead of tMin (entry point at front face) + float32_t3 localExitPoint = localRayOrigin + localRayDir * intersection.tMax; + float32_t3 worldExitPoint = 
mul(pc.modelMatrix, float32_t4(localExitPoint, 1.0)).xyz; + arrowLength = length(worldExitPoint - rayOrigin); + arrowColor = float32_t3(0.0, 1.0, 0.0); // Green for valid samples + } + else + { + // Ray doesn't intersect - THIS SHOULD NEVER HAPPEN with correct sampling! + float32_t3 cubeCenter = mul(pc.modelMatrix, float32_t4(0, 0, 0, 1)).xyz; + arrowLength = length(cubeCenter - rayOrigin) + 2.0; + arrowColor = float32_t3(1.0, 0.0, 0.0); // Red for BROKEN samples + } + + ArrowResult arrow = visualizeRayAsArrow(rayOrigin, directionAndPdf, arrowLength, ndcPos, aspect); + + // Only update depth if arrow was actually drawn + if (arrow.color.a > 0.0) + { + maxDepth = max(maxDepth, arrow.depth); + } + + // Modulate arrow color by its alpha (only add where arrow is visible) + output.color.rgb += arrowColor * arrow.color.a; + output.color.a = max(output.color.a, arrow.color.a); + } // Clamp to prevent overflow output.color = saturate(output.color); diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/silhouette.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/silhouette.hlsl index 504db2db9..8213c17fc 100644 --- a/73_SolidAngleVisualizer/app_resources/hlsl/silhouette.hlsl +++ b/73_SolidAngleVisualizer/app_resources/hlsl/silhouette.hlsl @@ -1,189 +1,244 @@ -#ifndef _SILHOUETTE_HLSL_ -#define _SILHOUETTE_HLSL_ +//// Copyright (C) 2026-2026 - DevSH Graphics Programming Sp. z O.O. +//// This file is part of the "Nabla Engine". 
+//// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _SOLID_ANGLE_VIS_EXAMPLE_SILHOUETTE_HLSL_INCLUDED_ +#define _SOLID_ANGLE_VIS_EXAMPLE_SILHOUETTE_HLSL_INCLUDED_ #include "gpu_common.hlsl" -#include "utils.hlsl" -// Special index values for clip points -static const uint32_t CLIP_POINT_A = 23; // Clip point between last positive and first negative -static const uint32_t CLIP_POINT_B = 24; // Clip point between last negative and first positive - -// Compute region and configuration index from model matrix -uint32_t computeRegionAndConfig(float32_t3x4 modelMatrix, out uint32_t3 region, out uint32_t configIndex, out uint32_t vertexCount) +struct ClippedSilhouette { - float32_t4x3 columnModel = transpose(modelMatrix); - float32_t3 obbCenter = columnModel[3].xyz; - float32_t3x3 upper3x3 = (float32_t3x3)columnModel; + float32_t3 vertices[MAX_SILHOUETTE_VERTICES]; // Max 7 vertices after clipping, unnormalized + uint32_t count; - float32_t3 rcpSqScales = rcp(float32_t3( - dot(upper3x3[0], upper3x3[0]), - dot(upper3x3[1], upper3x3[1]), - dot(upper3x3[2], upper3x3[2]))); + void normalize() + { + vertices[0] = nbl::hlsl::normalize(vertices[0]); + vertices[1] = nbl::hlsl::normalize(vertices[1]); + vertices[2] = nbl::hlsl::normalize(vertices[2]); + if (count > 3) + { + vertices[3] = nbl::hlsl::normalize(vertices[3]); + if (count > 4) + { + vertices[4] = nbl::hlsl::normalize(vertices[4]); + if (count > 5) + { + vertices[5] = nbl::hlsl::normalize(vertices[5]); + if (count > 6) + { + vertices[6] = nbl::hlsl::normalize(vertices[6]); + } + } + } + } + } - float32_t3 normalizedProj = mul(upper3x3, obbCenter) * rcpSqScales; + // Compute the silhouette centroid (average direction) + float32_t3 getCenter() + { + float32_t3 sum = float32_t3(0, 0, 0); - region = uint32_t3( - normalizedProj.x < -0.5f ? 0 : (normalizedProj.x > 0.5f ? 2 : 1), - normalizedProj.y < -0.5f ? 0 : (normalizedProj.y > 0.5f ? 2 : 1), - normalizedProj.z < -0.5f ? 
0 : (normalizedProj.z > 0.5f ? 2 : 1)); + NBL_UNROLL + for (uint32_t i = 0; i < MAX_SILHOUETTE_VERTICES; i++) + { + if (i < count) + sum += vertices[i]; + } - configIndex = region.x + region.y * 3u + region.z * 9u; + return nbl::hlsl::normalize(sum); + } - // uint32_t sil = packSilhouette(silhouettes[configIndex]); - uint32_t sil = binSilhouettes[configIndex]; - vertexCount = getSilhouetteSize(sil); + static uint32_t computeRegionAndConfig(float32_t3x4 modelMatrix, out uint32_t3 region, out uint32_t configIndex, out uint32_t vertexCount) + { + float32_t4x3 columnModel = transpose(modelMatrix); + float32_t3 obbCenter = columnModel[3].xyz; + float32_t3x3 upper3x3 = (float32_t3x3)columnModel; - return sil; -} + float32_t3 rcpSqScales = rcp(float32_t3( + dot(upper3x3[0], upper3x3[0]), + dot(upper3x3[1], upper3x3[1]), + dot(upper3x3[2], upper3x3[2]))); -#if VISUALIZE_SAMPLES -float32_t4 -#else -void -#endif -computeSilhouette(float32_t3x4 modelMatrix, uint32_t vertexCount, uint32_t sil -#if VISUALIZE_SAMPLES - , - float32_t3 spherePos, float32_t aaWidth -#endif - , - NBL_REF_ARG(ClippedSilhouette) silhouette) -{ -#if VISUALIZE_SAMPLES - float32_t4 color = float32_t4(0, 0, 0, 0); -#endif + float32_t3 normalizedProj = mul(upper3x3, obbCenter) * rcpSqScales; - silhouette.count = 0; + region = uint32_t3( + normalizedProj.x < -0.5f ? 0 : (normalizedProj.x > 0.5f ? 2 : 1), + normalizedProj.y < -0.5f ? 0 : (normalizedProj.y > 0.5f ? 2 : 1), + normalizedProj.z < -0.5f ? 0 : (normalizedProj.z > 0.5f ? 2 : 1)); - // Build clip mask (z < 0) - uint32_t clipMask = 0u; - NBL_UNROLL - for (uint32_t i = 0; i < 4; i++) - clipMask |= (getVertexZNeg(modelMatrix, getSilhouetteVertex(sil, i)) ? 
1u : 0u) << i; + configIndex = region.x + region.y * 3u + region.z * 9u; - if (vertexCount == 6) + uint32_t sil = binSilhouettes[configIndex]; + vertexCount = getSilhouetteSize(sil); + + return sil; + } + + void compute(float32_t3x4 modelMatrix, uint32_t vertexCount, uint32_t sil) { + count = 0; + + // Build clip mask (z < 0) + uint32_t clipMask = 0u; NBL_UNROLL - for (uint32_t i = 4; i < 6; i++) + for (uint32_t i = 0; i < 4; i++) clipMask |= (getVertexZNeg(modelMatrix, getSilhouetteVertex(sil, i)) ? 1u : 0u) << i; - } - uint32_t clipCount = countbits(clipMask); + if (vertexCount == 6) + { + NBL_UNROLL + for (uint32_t i = 4; i < 6; i++) + clipMask |= (getVertexZNeg(modelMatrix, getSilhouetteVertex(sil, i)) ? 1u : 0u) << i; + } + + uint32_t clipCount = countbits(clipMask); -#if 0 - // Early exit if fully clipped - if (clipCount == vertexCount) - return color; + // Invert clip mask to find first positive vertex + uint32_t invertedMask = ~clipMask & ((1u << vertexCount) - 1u); - // No clipping needed - fast path - if (clipCount == 0) - { - for (uint32_t i = 0; i < vertexCount; i++) + // Check if wrap-around is needed (first and last bits negative) + bool wrapAround = ((clipMask & 1u) != 0u) && ((clipMask & (1u << (vertexCount - 1))) != 0u); + + // Compute rotation amount + uint32_t rotateAmount = wrapAround + ? 
firstbitlow(invertedMask) // first positive + : firstbithigh(clipMask) + 1; // first vertex after last negative + + // Rotate masks + uint32_t rotatedClipMask = rotr(clipMask, rotateAmount, vertexCount); + uint32_t rotatedSil = rotr(sil, rotateAmount * 3, vertexCount * 3); + uint32_t positiveCount = vertexCount - clipCount; + + // ALWAYS compute both clip points + uint32_t lastPosIdx = positiveCount - 1; + uint32_t firstNegIdx = positiveCount; + + float32_t3 vLastPos = getVertex(modelMatrix, getSilhouetteVertex(rotatedSil, lastPosIdx)); + float32_t3 vFirstNeg = getVertex(modelMatrix, getSilhouetteVertex(rotatedSil, firstNegIdx)); + float32_t t = vLastPos.z / (vLastPos.z - vFirstNeg.z); + float32_t3 clipA = lerp(vLastPos, vFirstNeg, t); + + float32_t3 vLastNeg = getVertex(modelMatrix, getSilhouetteVertex(rotatedSil, vertexCount - 1)); + float32_t3 vFirstPos = getVertex(modelMatrix, getSilhouetteVertex(rotatedSil, 0)); + t = vLastNeg.z / (vLastNeg.z - vFirstPos.z); + float32_t3 clipB = lerp(vLastNeg, vFirstPos, t); + + NBL_UNROLL + for (uint32_t i = 0; i < positiveCount; i++) { - uint32_t i0 = i; - uint32_t i1 = (i + 1) % vertexCount; - float32_t3 v0 = getVertex(modelMatrix, getSilhouetteVertex(sil, i0)); - silhouette.vertices[silhouette.count] = v0; - silhouette.indices[silhouette.count++] = i0; // Original index (no rotation) - -#if VISUALIZE_SAMPLES - float32_t3 v1 = getVertex(modelMatrix, getSilhouetteVertex(sil, i1)); - float32_t3 pts[2] = {v0, v1}; - color += drawEdge(i1, pts, spherePos, aaWidth); + float32_t3 v0 = getVertex(modelMatrix, getSilhouetteVertex(rotatedSil, i)); + +#if DEBUG_DATA + uint32_t originalIndex = (i + rotateAmount) % vertexCount; + DebugDataBuffer[0].clippedSilhouetteVertices[count] = v0; + DebugDataBuffer[0].clippedSilhouetteVerticesIndices[count] = originalIndex; #endif + vertices[count++] = v0; } - return color; - } + + if (clipCount > 0 && clipCount < vertexCount) + { +#if DEBUG_DATA + 
DebugDataBuffer[0].clippedSilhouetteVertices[count] = clipA; + DebugDataBuffer[0].clippedSilhouetteVerticesIndices[count] = CLIP_POINT_A; #endif + vertices[count++] = clipA; - // Rotate clip mask so positives come first - uint32_t invertedMask = ~clipMask & ((1u << vertexCount) - 1u); - bool wrapAround = ((clipMask & 1u) != 0u) && ((clipMask & (1u << (vertexCount - 1))) != 0u); - uint32_t rotateAmount = wrapAround - ? firstbitlow(invertedMask) // -> First POSITIVE - : firstbithigh(clipMask) + 1; // -> First vertex AFTER last negative - - uint32_t rotatedClipMask = rotr(clipMask, rotateAmount, vertexCount); - uint32_t rotatedSil = rotr(sil, rotateAmount * 3, vertexCount * 3); - uint32_t positiveCount = vertexCount - clipCount; - - // ALWAYS compute both clip points - uint32_t lastPosIdx = positiveCount - 1; - uint32_t firstNegIdx = positiveCount; - - float32_t3 vLastPos = getVertex(modelMatrix, getSilhouetteVertex(rotatedSil, lastPosIdx)); - float32_t3 vFirstNeg = getVertex(modelMatrix, getSilhouetteVertex(rotatedSil, firstNegIdx)); - float32_t t = vLastPos.z / (vLastPos.z - vFirstNeg.z); - float32_t3 clipA = lerp(vLastPos, vFirstNeg, t); - - float32_t3 vLastNeg = getVertex(modelMatrix, getSilhouetteVertex(rotatedSil, vertexCount - 1)); - float32_t3 vFirstPos = getVertex(modelMatrix, getSilhouetteVertex(rotatedSil, 0)); - t = vLastNeg.z / (vLastNeg.z - vFirstPos.z); - float32_t3 clipB = lerp(vLastNeg, vFirstPos, t); - - NBL_UNROLL - for (uint32_t i = 0; i < positiveCount; i++) - { - // Get raw vertex - float32_t3 v0 = getVertex(modelMatrix, getSilhouetteVertex(rotatedSil, i)); - bool isLastPositive = (i == positiveCount - 1); - bool useClipA = (clipCount > 0) && isLastPositive; - - // Compute original index before rotation - uint32_t originalIndex = (i + rotateAmount) % vertexCount; - -#if VISUALIZE_SAMPLES - float32_t3 v1 = useClipA ? 
clipA : getVertex(modelMatrix, getSilhouetteVertex(rotatedSil, (i + 1) % vertexCount)); - float32_t3 pts[2] = {normalize(v0), normalize(v1)}; - color += drawEdge((i + 1) % vertexCount, pts, spherePos, aaWidth); +#if DEBUG_DATA + DebugDataBuffer[0].clippedSilhouetteVertices[count] = clipB; + DebugDataBuffer[0].clippedSilhouetteVerticesIndices[count] = CLIP_POINT_B; #endif + vertices[count++] = clipB; + } #if DEBUG_DATA - DebugDataBuffer[0].clippedSilhouetteVertices[silhouette.count] = v0; - DebugDataBuffer[0].clippedSilhouetteVerticesIndices[silhouette.count] = originalIndex; + DebugDataBuffer[0].clippedSilhouetteVertexCount = count; + DebugDataBuffer[0].clipMask = clipMask; + DebugDataBuffer[0].clipCount = clipCount; + DebugDataBuffer[0].rotatedClipMask = rotatedClipMask; + DebugDataBuffer[0].rotateAmount = rotateAmount; + DebugDataBuffer[0].positiveVertCount = positiveCount; + DebugDataBuffer[0].wrapAround = (uint32_t)wrapAround; + DebugDataBuffer[0].rotatedSil = rotatedSil; #endif - silhouette.vertices[silhouette.count++] = normalize(v0); } +}; - if (clipCount > 0 && clipCount < vertexCount) +struct SilEdgeNormals +{ + float16_t3 edgeNormals[MAX_SILHOUETTE_VERTICES]; // 10.5 floats instead of 21 + uint32_t count; + + // Better not use and calculate it while creating the sampler + static SilEdgeNormals create(NBL_CONST_REF_ARG(ClippedSilhouette) sil) { - float32_t3 vFirst = getVertex(modelMatrix, getSilhouetteVertex(rotatedSil, 0)); + SilEdgeNormals result = (SilEdgeNormals)0; + result.count = sil.count; -#if VISUALIZE_SAMPLES - float32_t3 npPts[2] = {normalize(clipB), normalize(vFirst)}; - color += drawEdge(0, npPts, spherePos, aaWidth); + float32_t3 v0 = sil.vertices[0]; + float32_t3 v1 = sil.vertices[1]; + float32_t3 v2 = sil.vertices[2]; - float32_t3 arcPts[2] = {normalize(clipA), normalize(clipB)}; - color += drawEdge(23, arcPts, spherePos, aaWidth, 0.6f); -#endif + result.edgeNormals[0] = float16_t3(cross(v0, v1)); + result.edgeNormals[1] = 
float16_t3(cross(v1, v2)); -#if DEBUG_DATA - DebugDataBuffer[0].clippedSilhouetteVertices[silhouette.count] = clipA; - DebugDataBuffer[0].clippedSilhouetteVerticesIndices[silhouette.count] = CLIP_POINT_A; -#endif - silhouette.vertices[silhouette.count++] = normalize(clipA); + if (sil.count > 3) + { + float32_t3 v3 = sil.vertices[3]; + result.edgeNormals[2] = float16_t3(cross(v2, v3)); + + if (sil.count > 4) + { + float32_t3 v4 = sil.vertices[4]; + result.edgeNormals[3] = float16_t3(cross(v3, v4)); + + if (sil.count > 5) + { + float32_t3 v5 = sil.vertices[5]; + result.edgeNormals[4] = float16_t3(cross(v4, v5)); + + if (sil.count > 6) + { + float32_t3 v6 = sil.vertices[6]; + result.edgeNormals[5] = float16_t3(cross(v5, v6)); + result.edgeNormals[6] = float16_t3(cross(v6, v0)); + } + else + { + result.edgeNormals[5] = float16_t3(cross(v5, v0)); + } + } + else + { + result.edgeNormals[4] = float16_t3(cross(v4, v0)); + } + } + else + { + result.edgeNormals[3] = float16_t3(cross(v3, v0)); + } + } + else + { + result.edgeNormals[2] = float16_t3(cross(v2, v0)); + } -#if DEBUG_DATA - DebugDataBuffer[0].clippedSilhouetteVertices[silhouette.count] = clipB; - DebugDataBuffer[0].clippedSilhouetteVerticesIndices[silhouette.count] = CLIP_POINT_B; -#endif - silhouette.vertices[silhouette.count++] = normalize(clipB); + return result; } -#if DEBUG_DATA - DebugDataBuffer[0].clippedSilhouetteVertexCount = silhouette.count; - DebugDataBuffer[0].clipMask = clipMask; - DebugDataBuffer[0].clipCount = clipCount; - DebugDataBuffer[0].rotatedClipMask = rotatedClipMask; - DebugDataBuffer[0].rotateAmount = rotateAmount; - DebugDataBuffer[0].positiveVertCount = positiveCount; - DebugDataBuffer[0].wrapAround = (uint32_t)wrapAround; - DebugDataBuffer[0].rotatedSil = rotatedSil; -#endif - -#if VISUALIZE_SAMPLES - return color; -#endif -} + bool isInside(float32_t3 dir) + { + float16_t3 d = float16_t3(dir); + half maxDot = dot(d, edgeNormals[0]); + maxDot = max(maxDot, dot(d, edgeNormals[1])); + 
maxDot = max(maxDot, dot(d, edgeNormals[2])); + maxDot = max(maxDot, dot(d, edgeNormals[3])); + maxDot = max(maxDot, dot(d, edgeNormals[4])); + maxDot = max(maxDot, dot(d, edgeNormals[5])); + maxDot = max(maxDot, dot(d, edgeNormals[6])); + return maxDot <= float16_t(0.0f); + } +}; -#endif // _SILHOUETTE_HLSL_ +#endif // _SOLID_ANGLE_VIS_EXAMPLE_SILHOUETTE_HLSL_INCLUDED_ diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/solid_angle_vis.frag.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/solid_angle_vis.frag.hlsl new file mode 100644 index 000000000..bba9aba75 --- /dev/null +++ b/73_SolidAngleVisualizer/app_resources/hlsl/solid_angle_vis.frag.hlsl @@ -0,0 +1,305 @@ +//// Copyright (C) 2026-2026 - DevSH Graphics Programming Sp. z O.O. +//// This file is part of the "Nabla Engine". +//// For conditions of distribution and use, see copyright notice in nabla.h +#pragma wave shader_stage(fragment) + +#include "common.hlsl" +#include + +using namespace nbl::hlsl; +using namespace ext::FullScreenTriangle; + +#include "drawing.hlsl" +#include "utils.hlsl" +#include "silhouette.hlsl" +#include "triangle_sampling.hlsl" +#include "pyramid_sampling.hlsl" +#include "parallelogram_sampling.hlsl" + +[[vk::push_constant]] struct PushConstants pc; + +static const SAMPLING_MODE samplingMode = (SAMPLING_MODE)SAMPLING_MODE_CONST; + +void computeCubeGeo() +{ + for (uint32_t i = 0; i < 8; i++) + corners[i] = mul(pc.modelMatrix, float32_t4(constCorners[i], 1.0f)).xyz; + + for (uint32_t f = 0; f < 6; f++) + { + faceCenters[f] = float32_t3(0, 0, 0); + for (uint32_t v = 0; v < 4; v++) + faceCenters[f] += corners[faceToCorners[f][v]]; + faceCenters[f] /= 4.0f; + } +} + +void validateSilhouetteEdges(uint32_t sil, uint32_t vertexCount, inout uint32_t silEdgeMask) +{ +#if DEBUG_DATA + { + for (uint32_t i = 0; i < vertexCount; i++) + { + uint32_t vIdx = i % vertexCount; + uint32_t v1Idx = (i + 1) % vertexCount; + + uint32_t v0Corner = getSilhouetteVertex(sil, vIdx); + uint32_t v1Corner = 
getSilhouetteVertex(sil, v1Idx); + // Mark edge as part of silhouette + for (uint32_t e = 0; e < 12; e++) + { + uint32_t2 edge = allEdges[e]; + if ((edge.x == v0Corner && edge.y == v1Corner) || + (edge.x == v1Corner && edge.y == v0Corner)) + { + silEdgeMask |= (1u << e); + } + } + } + validateEdgeVisibility(pc.modelMatrix, sil, vertexCount, silEdgeMask); + } +#endif +} + +void computeSpherePos(SVertexAttributes vx, out float32_t2 ndc, out float32_t3 spherePos) +{ + ndc = vx.uv * 2.0f - 1.0f; + float32_t aspect = pc.viewport.z / pc.viewport.w; + ndc.x *= aspect; + + float32_t2 normalized = ndc / CIRCLE_RADIUS; + float32_t r2 = dot(normalized, normalized); + + if (r2 <= 1.0f) + { + spherePos = float32_t3(normalized.x, normalized.y, sqrt(1.0f - r2)); + } + else + { + float32_t uv2Plus1 = r2 + 1.0f; + spherePos = float32_t3(normalized.x * 2.0f, normalized.y * 2.0f, 1.0f - r2) / uv2Plus1; + } + spherePos = normalize(spherePos); +} + +#if VISUALIZE_SAMPLES +float32_t4 visualizeSample(float32_t3 sampleDir, float32_t2 xi, uint32_t index, float32_t2 screenUV, float32_t3 spherePos, float32_t2 ndc, float32_t aaWidth +#if DEBUG_DATA + , + inout RWStructuredBuffer DebugDataBuffer +#endif +) +{ + float32_t4 accumColor = 0; + + float32_t2 pssSize = float32_t2(0.3, 0.3); // 30% of screen + float32_t2 pssPos = float32_t2(0.01, 0.01); // Offset from corner + bool isInsidePSS = all(and(screenUV >= pssPos, screenUV <= (pssPos + pssSize))); + + float32_t dist3D = distance(sampleDir, normalize(spherePos)); + float32_t alpha3D = 1.0f - smoothstep(0.0f, 0.02f, dist3D); + + if (alpha3D > 0.0f /* && !isInsidePSS*/) + { + float32_t3 sampleColor = colorLUT[index].rgb; + accumColor += float32_t4(sampleColor * alpha3D, alpha3D); + } + + // if (isInsidePSS) + // { + // // Map the raw xi to the PSS square dimensions + // float32_t2 xiPixelPos = pssPos + xi * pssSize; + // float32_t dist2D = distance(screenUV, xiPixelPos); + + // float32_t alpha2D = drawCross2D(screenUV, xiPixelPos, 0.005f, 
0.001f); + // if (alpha2D > 0.0f) + // { + // float32_t3 sampleColor = colorLUT[index].rgb; + // accumColor += float32_t4(sampleColor * alpha2D, alpha2D); + // } + // } + + // // just the outline of the PSS + // if (isInsidePSS && accumColor.a < 0.1) + // accumColor = float32_t4(0.1, 0.1, 0.1, 1.0); + + return accumColor; +} +#endif // VISUALIZE_SAMPLES + +// [shader("pixel")] +[[vk::location(0)]] float32_t4 main(SVertexAttributes vx) : SV_Target0 +{ + float32_t4 color = float32_t4(0, 0, 0, 0); + for (uint32_t i = 0; i < 1; i++) + { + float32_t aaWidth = length(float32_t2(ddx(vx.uv.x), ddy(vx.uv.y))); + float32_t3 spherePos; + float32_t2 ndc; + computeSpherePos(vx, ndc, spherePos); +#if !FAST || DEBUG_DATA + computeCubeGeo(); +#endif + uint32_t3 region; + uint32_t configIndex; + uint32_t vertexCount; + uint32_t sil = ClippedSilhouette::computeRegionAndConfig(pc.modelMatrix, region, configIndex, vertexCount); + + uint32_t silEdgeMask = 0; // TODO: take from 'fast' compute() +#if DEBUG_DATA + validateSilhouetteEdges(sil, vertexCount, silEdgeMask); +#endif + ClippedSilhouette silhouette; + silhouette.compute(pc.modelMatrix, vertexCount, sil); + +#if VISUALIZE_SAMPLES + // Draw silhouette edges on the sphere + for (uint32_t ei = 0; ei < silhouette.count; ei++) + { + float32_t3 v0 = normalize(silhouette.vertices[ei]); + float32_t3 v1 = normalize(silhouette.vertices[(ei + 1) % silhouette.count]); + float32_t3 pts[2] = {v0, v1}; + color += drawEdge(0, pts, spherePos, aaWidth); + } +#endif + + TriangleFanSampler samplingData; + Parallelogram parallelogram; + SphericalPyramid pyramid; + UrenaSampler urena; + BiquadraticSampler biquad; + BilinearSampler bilin; + + SilEdgeNormals silEdgeNormals; + //===================================================================== + // Building + //===================================================================== + if (samplingMode == SAMPLING_MODE::TRIANGLE_SOLID_ANGLE || + samplingMode == 
SAMPLING_MODE::TRIANGLE_PROJECTED_SOLID_ANGLE) + { + samplingData = TriangleFanSampler::create(silhouette, samplingMode); + } + else if (samplingMode == SAMPLING_MODE::PROJECTED_PARALLELOGRAM_SOLID_ANGLE) + { + silhouette.normalize(); + parallelogram = Parallelogram::create(silhouette, silEdgeNormals +#if VISUALIZE_SAMPLES + , + ndc, spherePos, aaWidth, color +#endif + ); + } + else if (samplingMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_RECTANGLE || + samplingMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_BIQUADRATIC || + samplingMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_BILINEAR) + { + pyramid = SphericalPyramid::create(silhouette, silEdgeNormals +#if VISUALIZE_SAMPLES + , + ndc, spherePos, aaWidth, color +#endif + ); + + if (samplingMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_RECTANGLE) + urena = UrenaSampler::create(pyramid); + else if (samplingMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_BIQUADRATIC) + biquad = BiquadraticSampler::create(pyramid); + else if (samplingMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_BILINEAR) + bilin = BilinearSampler::create(pyramid); + } + +#if DEBUG_DATA + uint32_t validSampleCount = 0u; + DebugDataBuffer[0].sampleCount = pc.sampleCount; +#endif + //===================================================================== + // Sampling + //===================================================================== + for (uint32_t i = 0; i < pc.sampleCount; i++) + { + // Fixed 8x8 grid of cell centres, identical for every invocation (no hashing yet); NOTE: xi.y reaches 1.0 or more once i >= 64 + float32_t2 xi = float32_t2( + (float32_t(i & 7u) + 0.5) / 8.0f, + (float32_t(i >> 3u) + 0.5) / 8.0f); + + float32_t pdf; + uint32_t index = 0; + float32_t3 sampleDir; + bool valid = true; // the triangle-fan sample() call below has no 'valid' out-param, so default to accepted instead of reading an undefined value in the !valid check + + if (samplingMode == SAMPLING_MODE::TRIANGLE_SOLID_ANGLE || samplingMode == SAMPLING_MODE::TRIANGLE_PROJECTED_SOLID_ANGLE) + sampleDir = samplingData.sample(silhouette, xi, pdf, index); + else if (samplingMode == SAMPLING_MODE::PROJECTED_PARALLELOGRAM_SOLID_ANGLE) + sampleDir = 
parallelogram.sample(silEdgeNormals, xi, pdf, valid); + else if (samplingMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_RECTANGLE) + sampleDir = urena.sample(pyramid, silEdgeNormals, xi, pdf, valid); + else if (samplingMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_BIQUADRATIC) + sampleDir = biquad.sample(pyramid, silEdgeNormals, xi, pdf, valid); + else if (samplingMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_BILINEAR) + sampleDir = bilin.sample(pyramid, silEdgeNormals, xi, pdf, valid); + + if (!valid) + { + pdf = 0.0f; + // sampleDir = float32_t3(0, 0, 1); + } +#if DEBUG_DATA + else + { + validSampleCount++; + } + + DebugDataBuffer[0].rayData[i] = float32_t4(sampleDir, pdf); +#endif + +#if VISUALIZE_SAMPLES + // Draw samples on sphere + color += visualizeSample(sampleDir, xi, index, vx.uv, spherePos, ndc, aaWidth +#if DEBUG_DATA + , + DebugDataBuffer +#endif + ); +#else + if (pdf > 0.0f) + color += float4(sampleDir * 0.02f / pdf, 1.0f); +#endif // VISUALIZE_SAMPLES + } + +#if VISUALIZE_SAMPLES + + // For debugging: Draw a small indicator of which faces are found + // color += drawVisibleFaceOverlay(pc.modelMatrix, spherePos, region, aaWidth); + + // color += drawFaces(pc.modelMatrix, spherePos, aaWidth); + + // Draw clipped silhouette vertices + // color += drawClippedSilhouetteVertices(ndc, silhouette, aaWidth); + // color += drawHiddenEdges(pc.modelMatrix, spherePos, silEdgeMask, aaWidth); + // color += drawCorners(pc.modelMatrix, ndc, aaWidth, 0.05f); + color += drawRing(ndc, aaWidth); + + if (all(vx.uv >= float32_t2(0.f, 0.97f)) && all(vx.uv <= float32_t2(0.03f, 1.0f))) + { + return float32_t4(colorLUT[configIndex], 1.0f); + } +#else +#endif // VISUALIZE_SAMPLES + +#if DEBUG_DATA + InterlockedAdd(DebugDataBuffer[0].validSampleCount, validSampleCount); + InterlockedAdd(DebugDataBuffer[0].threadCount, 1u); + DebugDataBuffer[0].region = uint32_t3(region); + DebugDataBuffer[0].silhouetteIndex = uint32_t(configIndex); + 
DebugDataBuffer[0].silhouetteVertexCount = uint32_t(getSilhouetteSize(sil)); + for (uint32_t i = 0; i < 6; i++) + { + DebugDataBuffer[0].vertices[i] = uint32_t(getSilhouetteVertex(sil, i)); + } + DebugDataBuffer[0].silhouette = sil; + +#endif + } + + return color; +} diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/triangle_sampling.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/triangle_sampling.hlsl new file mode 100644 index 000000000..46277ca27 --- /dev/null +++ b/73_SolidAngleVisualizer/app_resources/hlsl/triangle_sampling.hlsl @@ -0,0 +1,241 @@ +//// Copyright (C) 2026-2026 - DevSH Graphics Programming Sp. z O.O. +//// This file is part of the "Nabla Engine". +//// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _SOLID_ANGLE_VIS_EXAMPLE_TRIANGLE_SAMPLING_HLSL_INCLUDED_ +#define _SOLID_ANGLE_VIS_EXAMPLE_TRIANGLE_SAMPLING_HLSL_INCLUDED_ + +// Include the spherical triangle utilities +#include "gpu_common.hlsl" +#include +#include +#include +#include +#include +#include "silhouette.hlsl" + +using namespace nbl::hlsl; + +// Maximum number of triangles we can have after clipping +// Without clipping, max 3 faces can be visible at once so 3 faces * 2 triangles = 6 edges, forming max 4 triangles +// With clipping, one more edge. 7 - 2 = 5 max triangles because fanning from one vertex +#define MAX_TRIANGLES 5 + +struct TriangleFanSampler +{ + uint32_t count; // Number of valid triangles + uint32_t samplingMode; // Mode used during build + float32_t totalWeight; // Sum of all triangle weights + float32_t3 faceNormal; // Face normal (only used for projected mode) + float32_t triangleSolidAngles[MAX_TRIANGLES]; // Weight per triangle (for selection) + uint32_t triangleIndices[MAX_TRIANGLES]; // Vertex index i (forms triangle with v0, vi, vi+1) + + float32_t computeProjectedSolidAngleFallback(float32_t3 v0, float32_t3 v1, float32_t3 v2, float32_t3 N) + { + // 1. 
Get edge normals (unit vectors) + // We use the cross product of the vertices (unit vectors on sphere) + float32_t3 n0 = cross(v0, v1); + float32_t3 n1 = cross(v1, v2); + float32_t3 n2 = cross(v2, v0); + + // 2. Normalize edge normals (magnitude is sin of the arc length) + float32_t l0 = length(n0); + float32_t l1 = length(n1); + float32_t l2 = length(n2); + + // Guard against degenerate triangles + if (l0 < 1e-7 || l1 < 1e-7 || l2 < 1e-7) + return 0.0f; + + n0 /= l0; + n1 /= l1; + n2 /= l2; + + // 3. Get arc lengths (angles in radians) + float32_t a = asin(clamp(l0, -1.0f, 1.0f)); // side v0-v1 + float32_t b = asin(clamp(l1, -1.0f, 1.0f)); // side v1-v2 + float32_t c = asin(clamp(l2, -1.0f, 1.0f)); // side v2-v0 + + // Handle acos/asin quadrant if dot product is negative + if (dot(v0, v1) < 0) + a = 3.14159265 - a; + if (dot(v1, v2) < 0) + b = 3.14159265 - b; + if (dot(v2, v0) < 0) + c = 3.14159265 - c; + + // 4. Compute projected solid angle + float32_t Gamma = 0.5f * (a * dot(n0, N) + b * dot(n1, N) + c * dot(n2, N)); + + // Return the absolute value of the total + return abs(Gamma); + } + + // Build fan triangulation, cache weights for triangle selection + static TriangleFanSampler create(ClippedSilhouette silhouette, uint32_t mode) + { + TriangleFanSampler self; + self.count = 0; + self.totalWeight = 0.0f; + self.samplingMode = mode; + self.faceNormal = float32_t3(0, 0, 0); + + if (silhouette.count < 3) + return self; + + const float32_t3 v0 = silhouette.vertices[0]; + const float32_t3 origin = float32_t3(0, 0, 0); + + // Compute face normal ONCE before the loop - silhouette is planar! 
+ if (mode == SAMPLING_MODE::TRIANGLE_PROJECTED_SOLID_ANGLE) + { + float32_t3 v1 = silhouette.vertices[1]; + float32_t3 v2 = silhouette.vertices[2]; + self.faceNormal = normalize(cross(v1 - v0, v2 - v0)); + } + + // Build fan triangulation from v0 + NBL_UNROLL + for (uint32_t i = 1; i < silhouette.count - 1; i++) + { + float32_t3 v1 = silhouette.vertices[i]; + float32_t3 v2 = silhouette.vertices[i + 1]; + + shapes::SphericalTriangle shapeTri = shapes::SphericalTriangle::create(v0, v1, v2, origin); + + // Skip degenerate triangles + if (shapeTri.pyramidAngles()) + continue; + + // Calculate triangle solid angle + float32_t solidAngle; + if (mode == SAMPLING_MODE::TRIANGLE_PROJECTED_SOLID_ANGLE) + { + float32_t3 cos_vertices = clamp( + (shapeTri.cos_sides - shapeTri.cos_sides.yzx * shapeTri.cos_sides.zxy) * + shapeTri.csc_sides.yzx * shapeTri.csc_sides.zxy, + float32_t3(-1.0f, -1.0f, -1.0f), + float32_t3(1.0f, 1.0f, 1.0f)); + solidAngle = shapeTri.projectedSolidAngleOfTriangle(self.faceNormal, shapeTri.cos_sides, shapeTri.csc_sides, cos_vertices); + } + else + { + solidAngle = shapeTri.solidAngleOfTriangle(); + } + + if (solidAngle <= 0.0f) + continue; + + // Store only what's needed for weighted selection + self.triangleSolidAngles[self.count] = solidAngle; + self.triangleIndices[self.count] = i; + self.totalWeight += solidAngle; + self.count++; + } + +#if DEBUG_DATA + // Validate no antipodal edges exist (would create spherical lune) + for (uint32_t i = 0; i < silhouette.count; i++) + { + uint32_t j = (i + 1) % silhouette.count; + float32_t3 n1 = normalize(silhouette.vertices[i]); + float32_t3 n2 = normalize(silhouette.vertices[j]); + + if (dot(n1, n2) < -0.99f) + { + DebugDataBuffer[0].sphericalLuneDetected = 1; + assert(false && "Spherical lune detected: antipodal silhouette edge"); + } + } + DebugDataBuffer[0].maxTrianglesExceeded = (self.count > MAX_TRIANGLES); + DebugDataBuffer[0].triangleCount = self.count; + DebugDataBuffer[0].totalSolidAngles = 
self.totalWeight; + for (uint32_t tri = 0; tri < self.count; tri++) + { + DebugDataBuffer[0].solidAngles[tri] = self.triangleSolidAngles[tri]; + } +#endif + + return self; + } + + // Sample using cached selection weights, recompute geometry on-demand + float32_t3 sample(ClippedSilhouette silhouette, float32_t2 xi, out float32_t pdf, out uint32_t selectedIdx) + { + selectedIdx = 0; + + // Handle empty or invalid data + if (count == 0 || totalWeight <= 0.0f) + { + pdf = 0.0f; + return float32_t3(0, 0, 1); + } + + // Select triangle using cached weighted random selection + float32_t targetWeight = xi.x * totalWeight; + float32_t cumulativeWeight = 0.0f; + float32_t prevCumulativeWeight = 0.0f; + + NBL_UNROLL + for (uint32_t i = 0; i < count; i++) + { + prevCumulativeWeight = cumulativeWeight; + cumulativeWeight += triangleSolidAngles[i]; + + if (targetWeight <= cumulativeWeight) + { + selectedIdx = i; + break; + } + } + + // Remap xi.x to [0,1] within selected triangle's solidAngle interval + float32_t triSolidAngle = triangleSolidAngles[selectedIdx]; + float32_t u = (targetWeight - prevCumulativeWeight) / max(triSolidAngle, 1e-7f); + + // Reconstruct the selected triangle geometry + uint32_t vertexIdx = triangleIndices[selectedIdx]; + float32_t3 v0 = silhouette.vertices[0]; + float32_t3 v1 = silhouette.vertices[vertexIdx]; + float32_t3 v2 = silhouette.vertices[vertexIdx + 1]; + + float32_t3 fn = normalize(cross(v1 - v0, v2 - v0)); + + float32_t3 origin = float32_t3(0, 0, 0); + + shapes::SphericalTriangle shapeTri = shapes::SphericalTriangle::create(v0, v1, v2, origin); + + // Compute vertex angles once + float32_t3 cos_vertices = clamp( + (shapeTri.cos_sides - shapeTri.cos_sides.yzx * shapeTri.cos_sides.zxy) * + shapeTri.csc_sides.yzx * shapeTri.csc_sides.zxy, + float32_t3(-1.0f, -1.0f, -1.0f), + float32_t3(1.0f, 1.0f, 1.0f)); + float32_t3 sin_vertices = sqrt(float32_t3(1.0f, 1.0f, 1.0f) - cos_vertices * cos_vertices); + + // Sample based on mode + float32_t3 
direction; + float32_t rcpPdf; + + if (samplingMode == SAMPLING_MODE::TRIANGLE_PROJECTED_SOLID_ANGLE) + { + sampling::ProjectedSphericalTriangle samplingTri = sampling::ProjectedSphericalTriangle::create(shapeTri); + + direction = samplingTri.generate(rcpPdf, triSolidAngle, cos_vertices, sin_vertices, shapeTri.cos_sides[0], shapeTri.cos_sides[2], shapeTri.csc_sides[1], shapeTri.csc_sides[2], fn, false, float32_t2(u, xi.y)); + triSolidAngle = rcpPdf; // projected solid angle returned as rcpPdf + } + else + { + sampling::SphericalTriangle samplingTri = sampling::SphericalTriangle::create(shapeTri); + direction = samplingTri.generate(triSolidAngle, cos_vertices, sin_vertices, shapeTri.cos_sides[0], shapeTri.cos_sides[2], shapeTri.csc_sides[1], shapeTri.csc_sides[2], float32_t2(u, xi.y)); + } + + // Calculate PDF + float32_t trianglePdf = 1.0f / triSolidAngle; + float32_t selectionProb = triangleSolidAngles[selectedIdx] / totalWeight; // use the cached weight that drove the selection; `triSolidAngle` may have been overwritten with rcpPdf above + pdf = trianglePdf * selectionProb; + + return normalize(direction); + } +}; + +#endif // _SOLID_ANGLE_VIS_EXAMPLE_TRIANGLE_SAMPLING_HLSL_INCLUDED_ diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/utils.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/utils.hlsl index e4bf804cb..832204cf2 100644 --- a/73_SolidAngleVisualizer/app_resources/hlsl/utils.hlsl +++ b/73_SolidAngleVisualizer/app_resources/hlsl/utils.hlsl @@ -1,21 +1,33 @@ -#ifndef _UTILS_HLSL_ -#define _UTILS_HLSL_ +//// Copyright (C) 2026-2026 - DevSH Graphics Programming Sp. z O.O. +//// This file is part of the "Nabla Engine". +//// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _SOLID_ANGLE_VIS_EXAMPLE_UTILS_HLSL_INCLUDED_ +#define _SOLID_ANGLE_VIS_EXAMPLE_UTILS_HLSL_INCLUDED_ +#include +#include // TODO: implemented somewhere else? // Bit rotation helpers uint32_t rotl(uint32_t value, uint32_t bits, uint32_t width) { - bits = bits % width; - uint32_t mask = (1u << width) - 1u; + // mask for the width + uint32_t mask = (width == 32) ? 
0xFFFFFFFFu : ((1u << width) - 1u); value &= mask; + + // Map bits==width -> 0 + bits &= -(bits < width); + return ((value << bits) | (value >> (width - bits))) & mask; } uint32_t rotr(uint32_t value, uint32_t bits, uint32_t width) { - bits = bits % width; - uint32_t mask = (1u << width) - 1u; + uint32_t mask = (width == 32) ? 0xFFFFFFFFu : ((1u << width) - 1u); value &= mask; + + // Map bits==width -> 0 + bits &= -(bits < width); + return ((value >> bits) | (value << (width - bits))) & mask; } @@ -46,4 +58,11 @@ float32_t2 hammersleySample(uint32_t i, uint32_t numSamples) float32_t(reversebits(i)) / 4294967295.0f); } -#endif // _UTILS_HLSL_ +float32_t2 nextRandomUnorm2(inout nbl::hlsl::Xoroshiro64StarStar rnd) +{ + return float32_t2( + float32_t(rnd()) * 2.3283064365386963e-10, + float32_t(rnd()) * 2.3283064365386963e-10); +} + +#endif // _SOLID_ANGLE_VIS_EXAMPLE_UTILS_HLSL_INCLUDED_ diff --git a/73_SolidAngleVisualizer/include/common.hpp b/73_SolidAngleVisualizer/include/common.hpp index 2e8e985dd..fe7d086dd 100644 --- a/73_SolidAngleVisualizer/include/common.hpp +++ b/73_SolidAngleVisualizer/include/common.hpp @@ -6,7 +6,6 @@ // the example's headers #include "transform.hpp" -#include "nbl/builtin/hlsl/matrix_utils/transformation_matrix_utils.hlsl" using namespace nbl; using namespace nbl::core; diff --git a/73_SolidAngleVisualizer/main.cpp b/73_SolidAngleVisualizer/main.cpp index 9d9941da3..c60952394 100644 --- a/73_SolidAngleVisualizer/main.cpp +++ b/73_SolidAngleVisualizer/main.cpp @@ -4,6 +4,8 @@ #include "nbl/this_example/builtin/build/spirv/keys.hpp" #include "common.hpp" +#include +#include #include "app_resources/hlsl/common.hlsl" #include "app_resources/hlsl/benchmark/common.hlsl" #include "nbl/ext/FullScreenTriangle/FullScreenTriangle.h" @@ -18,17 +20,14 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR using device_base_t = MonoWindowApplication; using asset_base_t = BuiltinResourcesApplication; - inline static std::string SolidAngleVisShaderPath 
= "app_resources/hlsl/SolidAngleVis.frag.hlsl"; - inline static std::string RayVisShaderPath = "app_resources/hlsl/RayVis.frag.hlsl"; - public: - inline SolidAngleVisualizer(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) + inline SolidAngleVisualizer(const path &_localInputCWD, const path &_localOutputCWD, const path &_sharedInputCWD, const path &_sharedOutputCWD) : IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD), - device_base_t({ 2048, 1024 }, EF_UNKNOWN, _localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) + device_base_t({2048, 1024}, EF_UNKNOWN, _localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) { } - inline bool onAppInitialized(smart_refctd_ptr&& system) override + inline bool onAppInitialized(smart_refctd_ptr &&system) override { if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system))) return false; @@ -46,16 +45,16 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR { if (!pool) return logFail("Couldn't create Command Pool!"); - if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { m_cmdBufs.data() + i, 1 })) + if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, {m_cmdBufs.data() + i, 1})) return logFail("Couldn't create Command Buffer!"); } - const uint32_t addtionalBufferOwnershipFamilies[] = { getGraphicsQueue()->getFamilyIndex() }; + const uint32_t addtionalBufferOwnershipFamilies[] = {getGraphicsQueue()->getFamilyIndex()}; m_scene = CGeometryCreatorScene::create( - { .transferQueue = getTransferUpQueue(), + {.transferQueue = getTransferUpQueue(), .utilities = m_utils.get(), .logger = m_logger.get(), - .addtionalBufferOwnershipFamilies = addtionalBufferOwnershipFamilies }, + .addtionalBufferOwnershipFamilies = addtionalBufferOwnershipFamilies}, CSimpleDebugRenderer::DefaultPolygonGeometryPatch); // for the scene drawing pass @@ -65,29 
+64,29 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR {{{.format = sceneRenderDepthFormat, .samples = IGPUImage::ESCF_1_BIT, .mayAlias = false}, - /*.loadOp =*/{IGPURenderpass::LOAD_OP::CLEAR}, - /*.storeOp =*/{IGPURenderpass::STORE_OP::STORE}, - /*.initialLayout =*/{IGPUImage::LAYOUT::UNDEFINED}, - /*.finalLayout =*/{IGPUImage::LAYOUT::ATTACHMENT_OPTIMAL}}}, - IGPURenderpass::SCreationParams::DepthStencilAttachmentsEnd }; + /*.loadOp =*/{IGPURenderpass::LOAD_OP::CLEAR}, + /*.storeOp =*/{IGPURenderpass::STORE_OP::STORE}, + /*.initialLayout =*/{IGPUImage::LAYOUT::UNDEFINED}, + /*.finalLayout =*/{IGPUImage::LAYOUT::ATTACHMENT_OPTIMAL}}}, + IGPURenderpass::SCreationParams::DepthStencilAttachmentsEnd}; params.depthStencilAttachments = depthAttachments; const IGPURenderpass::SCreationParams::SColorAttachmentDescription colorAttachments[] = { {{ {.format = finalSceneRenderFormat, .samples = IGPUImage::E_SAMPLE_COUNT_FLAGS::ESCF_1_BIT, .mayAlias = false}, - /*.loadOp =*/IGPURenderpass::LOAD_OP::CLEAR, - /*.storeOp =*/IGPURenderpass::STORE_OP::STORE, - /*.initialLayout =*/IGPUImage::LAYOUT::UNDEFINED, - /*.finalLayout =*/IGPUImage::LAYOUT::READ_ONLY_OPTIMAL // ImGUI shall read - }}, - IGPURenderpass::SCreationParams::ColorAttachmentsEnd }; + /*.loadOp =*/IGPURenderpass::LOAD_OP::CLEAR, + /*.storeOp =*/IGPURenderpass::STORE_OP::STORE, + /*.initialLayout =*/IGPUImage::LAYOUT::UNDEFINED, + /*.finalLayout =*/IGPUImage::LAYOUT::READ_ONLY_OPTIMAL // ImGUI shall read + }}, + IGPURenderpass::SCreationParams::ColorAttachmentsEnd}; params.colorAttachments = colorAttachments; IGPURenderpass::SCreationParams::SSubpassDescription subpasses[] = { {}, - IGPURenderpass::SCreationParams::SubpassesEnd }; - subpasses[0].depthStencilAttachment = { {.render = {.attachmentIndex = 0, .layout = IGPUImage::LAYOUT::ATTACHMENT_OPTIMAL}} }; - subpasses[0].colorAttachments[0] = { .render = {.attachmentIndex = 0, .layout = IGPUImage::LAYOUT::ATTACHMENT_OPTIMAL} }; + 
IGPURenderpass::SCreationParams::SubpassesEnd}; + subpasses[0].depthStencilAttachment = {{.render = {.attachmentIndex = 0, .layout = IGPUImage::LAYOUT::ATTACHMENT_OPTIMAL}}}; + subpasses[0].colorAttachments[0] = {.render = {.attachmentIndex = 0, .layout = IGPUImage::LAYOUT::ATTACHMENT_OPTIMAL}}; params.subpasses = subpasses; const static IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] = { @@ -96,16 +95,16 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR .srcSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, .dstSubpass = 0, .memoryBarrier = { - // last place where the depth can get modified in previous frame, `COLOR_ATTACHMENT_OUTPUT_BIT` is implicitly later - // while color is sampled by ImGUI - .srcStageMask = PIPELINE_STAGE_FLAGS::LATE_FRAGMENT_TESTS_BIT | PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT, - // don't want any writes to be available, as we are clearing both attachments - .srcAccessMask = ACCESS_FLAGS::NONE, - // destination needs to wait as early as possible - // TODO: `COLOR_ATTACHMENT_OUTPUT_BIT` shouldn't be needed, because its a logically later stage, see TODO in `ECommonEnums.h` - .dstStageMask = PIPELINE_STAGE_FLAGS::EARLY_FRAGMENT_TESTS_BIT | PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, - // because depth and color get cleared first no read mask - .dstAccessMask = ACCESS_FLAGS::DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT} + // last place where the depth can get modified in previous frame, `COLOR_ATTACHMENT_OUTPUT_BIT` is implicitly later + // while color is sampled by ImGUI + .srcStageMask = PIPELINE_STAGE_FLAGS::LATE_FRAGMENT_TESTS_BIT | PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT, + // don't want any writes to be available, as we are clearing both attachments + .srcAccessMask = ACCESS_FLAGS::NONE, + // destination needs to wait as early as possible + // TODO: `COLOR_ATTACHMENT_OUTPUT_BIT` shouldn't be needed, because its a logically 
later stage, see TODO in `ECommonEnums.h` + .dstStageMask = PIPELINE_STAGE_FLAGS::EARLY_FRAGMENT_TESTS_BIT | PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, + // because depth and color get cleared first no read mask + .dstAccessMask = ACCESS_FLAGS::DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT} // leave view offsets and flags default }, { @@ -117,9 +116,9 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR .dstStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT | PIPELINE_STAGE_FLAGS::EARLY_FRAGMENT_TESTS_BIT, // but we only care about the availability-visibility chain between renderpass and imgui .dstAccessMask = ACCESS_FLAGS::SAMPLED_READ_BIT} - // leave view offsets and flags default - }, - IGPURenderpass::SCreationParams::DependenciesEnd }; + // leave view offsets and flags default + }, + IGPURenderpass::SCreationParams::DependenciesEnd}; params.dependencies = dependencies; auto solidAngleRenderpassParams = params; m_mainRenderpass = m_device->createRenderpass(std::move(params)); @@ -131,13 +130,13 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR return logFail("Failed to create Solid Angle Renderpass!"); } - const auto& geometries = m_scene->getInitParams().geometries; - m_renderer = CSimpleDebugRenderer::create(m_assetMgr.get(), m_solidAngleRenderpass.get(), 0, { &geometries.front().get(), geometries.size() }); + const auto &geometries = m_scene->getInitParams().geometries; + m_renderer = CSimpleDebugRenderer::create(m_assetMgr.get(), m_solidAngleRenderpass.get(), 0, {&geometries.front().get(), geometries.size()}); // special case { - const auto& pipelines = m_renderer->getInitParams().pipelines; + const auto &pipelines = m_renderer->getInitParams().pipelines; auto ix = 0u; - for (const auto& name : m_scene->getInitParams().geometryNames) + for (const auto &name : m_scene->getInitParams().geometryNames) { if (name == "Cone") 
m_renderer->getGeometry(ix).pipeline = pipelines[CSimpleDebugRenderer::SInitParams::PipelineType::Cone]; @@ -149,90 +148,65 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR // Create graphics pipeline { - auto loadAndCompileHLSLShader = [&](const std::string& pathToShader, IShader::E_SHADER_STAGE stage, const std::string& defineMacro = "") -> smart_refctd_ptr + auto loadPrecompiledShader = [&](auto key) -> smart_refctd_ptr + { + IAssetLoader::SAssetLoadParams lp = {}; + lp.logger = m_logger.get(); + lp.workingDirectory = "app_resources"; + auto assetBundle = m_assetMgr->getAsset(key.data(), lp); + const auto assets = assetBundle.getContents(); + if (assets.empty()) { - IAssetLoader::SAssetLoadParams lp = {}; - lp.workingDirectory = localInputCWD; - auto assetBundle = m_assetMgr->getAsset(pathToShader, lp); - const auto assets = assetBundle.getContents(); - if (assets.empty()) - { - m_logger->log("Could not load shader: ", ILogger::ELL_ERROR, pathToShader); - std::exit(-1); - } - - auto source = smart_refctd_ptr_static_cast(assets[0]); - // The down-cast should not fail! 
- assert(source); - - auto compiler = make_smart_refctd_ptr(smart_refctd_ptr(m_system)); - CHLSLCompiler::SOptions options = {}; - options.stage = stage; - options.preprocessorOptions.targetSpirvVersion = m_device->getPhysicalDevice()->getLimits().spirvVersion; - options.spirvOptimizer = nullptr; -#ifndef _NBL_DEBUG - ISPIRVOptimizer::E_OPTIMIZER_PASS optPasses = ISPIRVOptimizer::EOP_STRIP_DEBUG_INFO; - auto opt = make_smart_refctd_ptr(std::span(&optPasses, 1)); - options.spirvOptimizer = opt.get(); -#endif - options.debugInfoFlags |= IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_LINE_BIT;// | IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_FILE_BIT | IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_SOURCE_BIT; - options.preprocessorOptions.sourceIdentifier = source->getFilepathHint(); - options.preprocessorOptions.logger = m_logger.get(); - options.preprocessorOptions.includeFinder = compiler->getDefaultIncludeFinder(); - - core::vector defines; - if (!defineMacro.empty()) - defines.push_back({ defineMacro, "" }); - - options.preprocessorOptions.extraDefines = defines; - - source = compiler->compileToSPIRV((const char*)source->getContent()->getPointer(), options); - - auto shader = m_device->compileShader({ source.get(), nullptr, nullptr, nullptr }); - if (!shader) - { - m_logger->log("HLSL shader creationed failed: %s!", ILogger::ELL_ERROR, pathToShader); - std::exit(-1); - } - - return shader; - }; + m_logger->log("Could not load precompiled shader!", ILogger::ELL_ERROR); + std::exit(-1); + } + assert(assets.size() == 1); + auto shader = IAsset::castDown(assets[0]); + if (!shader) + { + m_logger->log("Failed to load precompiled shader!", ILogger::ELL_ERROR); + std::exit(-1); + } + return shader; + }; ext::FullScreenTriangle::ProtoPipeline fsTriProtoPPln(m_assetMgr.get(), m_device.get(), m_logger.get()); if (!fsTriProtoPPln) return logFail("Failed to create Full Screen Triangle protopipeline or load its vertex shader!"); - // Load Fragment Shader - auto solidAngleVisFragShader = 
loadAndCompileHLSLShader(SolidAngleVisShaderPath, ESS_FRAGMENT); - if (!solidAngleVisFragShader) - return logFail("Failed to Load and Compile Fragment Shader: SolidAngleVis!"); - - const IGPUPipelineBase::SShaderSpecInfo solidAngleFragSpec = { - .shader = solidAngleVisFragShader.get(), - .entryPoint = "main" }; - - auto rayVisFragShader = loadAndCompileHLSLShader(RayVisShaderPath, ESS_FRAGMENT); - if (!rayVisFragShader) - return logFail("Failed to Load and Compile Fragment Shader: rayVis!"); - const IGPUPipelineBase::SShaderSpecInfo RayFragSpec = { - .shader = rayVisFragShader.get(), - .entryPoint = "main" }; + // Load pre-compiled fragment shaders (6 modes x 2 debug = 12 SolidAngleVis + 2 RayVis) + // Can't use string literal template args in a loop, so unroll manually + // Index: mode * 2 + debugFlag (0=release, 1=debug) + smart_refctd_ptr saVisShaders[SAMPLING_MODE::Count * DebugPermutations]; + saVisShaders[0] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_tri_sa">(m_device.get())); + saVisShaders[1] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_tri_sa_dbg">(m_device.get())); + saVisShaders[2] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_tri_psa">(m_device.get())); + saVisShaders[3] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_tri_psa_dbg">(m_device.get())); + saVisShaders[4] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_para">(m_device.get())); + saVisShaders[5] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_para_dbg">(m_device.get())); + saVisShaders[6] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_rectangle">(m_device.get())); + saVisShaders[7] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_rectangle_dbg">(m_device.get())); + saVisShaders[8] = 
loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_biquad">(m_device.get())); + saVisShaders[9] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_biquad_dbg">(m_device.get())); + saVisShaders[10] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_bilinear">(m_device.get())); + saVisShaders[11] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_bilinear_dbg">(m_device.get())); + + smart_refctd_ptr rayVisShaders[DebugPermutations]; + rayVisShaders[0] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"ray_vis">(m_device.get())); + rayVisShaders[1] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"ray_vis_dbg">(m_device.get())); smart_refctd_ptr solidAngleVisLayout, rayVisLayout; - nbl::video::IGPUDescriptorSetLayout::SBinding bindings[1] = { - {.binding = 0, - .type = nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, - .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, - .stageFlags = ShaderStage::ESS_FRAGMENT, - .count = 1} }; + nbl::video::IGPUDescriptorSetLayout::SBinding bindings[1] = + { + {.binding = 0, + .type = nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, + .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = ShaderStage::ESS_FRAGMENT, + .count = 1}}; smart_refctd_ptr dsLayout = m_device->createDescriptorSetLayout(bindings); - const asset::SPushConstantRange saRanges[] = { {.stageFlags = hlsl::ShaderStage::ESS_FRAGMENT, - .offset = 0, - .size = sizeof(PushConstants)} }; - const asset::SPushConstantRange rayRanges[] = { {.stageFlags = hlsl::ShaderStage::ESS_FRAGMENT, - .offset = 0, - .size = sizeof(PushConstantRayVis)} }; + const asset::SPushConstantRange saRanges[] = {{.stageFlags = hlsl::ShaderStage::ESS_FRAGMENT, .offset = 0, .size = sizeof(PushConstants)}}; + const asset::SPushConstantRange rayRanges[] = {{.stageFlags = 
hlsl::ShaderStage::ESS_FRAGMENT, .offset = 0, .size = sizeof(PushConstantRayVis)}}; if (!dsLayout) logFail("Failed to create a Descriptor Layout!\n"); @@ -242,17 +216,31 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR rayVisLayout = m_device->createPipelineLayout(rayRanges, dsLayout); { - m_solidAngleVisPipeline = fsTriProtoPPln.createPipeline(solidAngleFragSpec, solidAngleVisLayout.get(), m_solidAngleRenderpass.get()); - if (!m_solidAngleVisPipeline) - return logFail("Could not create Graphics Pipeline!"); + // Create all SolidAngleVis pipeline variants + for (uint32_t i = 0; i < SAMPLING_MODE::Count * DebugPermutations; i++) + { + const IGPUPipelineBase::SShaderSpecInfo fragSpec = { + .shader = saVisShaders[i].get(), + .entryPoint = "main"}; + m_solidAngleVisPipelines[i] = fsTriProtoPPln.createPipeline(fragSpec, solidAngleVisLayout.get(), m_solidAngleRenderpass.get()); + if (!m_solidAngleVisPipelines[i]) + return logFail("Could not create SolidAngleVis Graphics Pipeline variant %d!", i); + } asset::SRasterizationParams rasterParams = ext::FullScreenTriangle::ProtoPipeline::DefaultRasterParams; rasterParams.depthWriteEnable = true; rasterParams.depthCompareOp = asset::E_COMPARE_OP::ECO_GREATER; - m_rayVisualizationPipeline = fsTriProtoPPln.createPipeline(RayFragSpec, rayVisLayout.get(), m_mainRenderpass.get(), 0, {}, rasterParams); - if (!m_rayVisualizationPipeline) - return logFail("Could not create Graphics Pipeline!"); + // Create all RayVis pipeline variants + for (uint32_t i = 0; i < DebugPermutations; i++) + { + const IGPUPipelineBase::SShaderSpecInfo fragSpec = { + .shader = rayVisShaders[i].get(), + .entryPoint = "main"}; + m_rayVisPipelines[i] = fsTriProtoPPln.createPipeline(fragSpec, rayVisLayout.get(), m_mainRenderpass.get(), 0, {}, rasterParams); + if (!m_rayVisPipelines[i]) + return logFail("Could not create RayVis Graphics Pipeline variant %d!", i); + } } // Allocate the memory { @@ -275,20 +263,20 @@ class 
SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR logFail("Failed to allocate Device Memory compatible with our GPU Buffer!\n"); assert(m_outputStorageBuffer->getBoundMemory().memory == m_allocation.memory.get()); - smart_refctd_ptr pool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE, { &dsLayout.get(), 1 }); + smart_refctd_ptr pool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE, {&dsLayout.get(), 1}); m_ds = pool->createDescriptorSet(std::move(dsLayout)); { IGPUDescriptorSet::SDescriptorInfo info[1]; info[0].desc = smart_refctd_ptr(m_outputStorageBuffer); - info[0].info.buffer = { .offset = 0, .size = BufferSize }; + info[0].info.buffer = {.offset = 0, .size = BufferSize}; IGPUDescriptorSet::SWriteDescriptorSet writes[1] = { - {.dstSet = m_ds.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = info} }; + {.dstSet = m_ds.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = info}}; m_device->updateDescriptorSets(writes, {}); } } - if (!m_allocation.memory->map({ 0ull, m_allocation.memory->getAllocationSize() }, IDeviceMemoryAllocation::EMCAF_READ)) + if (!m_allocation.memory->map({0ull, m_allocation.memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ)) logFail("Failed to map the Device Memory!\n"); // if the mapping is not coherent the range needs to be invalidated to pull in new data for the CPU's caches @@ -299,10 +287,10 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR // Create ImGUI { - auto scRes = static_cast(m_surface->getSwapchainResources()); + auto scRes = static_cast(m_surface->getSwapchainResources()); ext::imgui::UI::SCreationParameters params = {}; - params.resources.texturesInfo = { .setIx = 0u, .bindingIx = TexturesImGUIBindingIndex }; - params.resources.samplersInfo = { .setIx = 0u, .bindingIx = 1u }; + params.resources.texturesInfo = {.setIx = 0u, .bindingIx = TexturesImGUIBindingIndex}; + 
params.resources.samplersInfo = {.setIx = 0u, .bindingIx = 1u}; params.utilities = m_utils; params.transfer = getTransferUpQueue(); params.pipelineLayout = ext::imgui::UI::createDefaultPipelineLayout(m_utils->getLogicalDevice(), params.resources.texturesInfo, params.resources.samplersInfo, MaxImGUITextures); @@ -317,12 +305,12 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR // create rest of User Interface { - auto* imgui = interface.imGUI.get(); + auto *imgui = interface.imGUI.get(); // create the suballocated descriptor set { // note that we use default layout provided by our extension, but you are free to create your own by filling ext::imgui::UI::S_CREATION_PARAMETERS::resources - const auto* layout = interface.imGUI->getPipeline()->getLayout()->getDescriptorSetLayout(0u); - auto pool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT, { &layout, 1 }); + const auto *layout = interface.imGUI->getPipeline()->getLayout()->getDescriptorSetLayout(0u); + auto pool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT, {&layout, 1}); auto ds = pool->createDescriptorSet(smart_refctd_ptr(layout)); interface.subAllocDS = make_smart_refctd_ptr(std::move(ds)); if (!interface.subAllocDS) @@ -342,12 +330,12 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR .binding = TexturesImGUIBindingIndex, .arrayElement = ext::imgui::UI::FontAtlasTexId, .count = 1, - .info = &info }; - if (!m_device->updateDescriptorSets({ &write, 1 }, {})) + .info = &info}; + if (!m_device->updateDescriptorSets({&write, 1}, {})) return logFail("Failed to write the descriptor set"); } imgui->registerListener([this]() - { interface(); }); + { interface(); }); } interface.camera.mapKeysToWASD(); @@ -371,8 +359,8 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR update(nextPresentationTimestamp); { - 
const auto& virtualSolidAngleWindowRes = interface.solidAngleViewTransformReturnInfo.sceneResolution; - const auto& virtualMainWindowRes = interface.mainViewTransformReturnInfo.sceneResolution; + const auto &virtualSolidAngleWindowRes = interface.solidAngleViewTransformReturnInfo.sceneResolution; + const auto &virtualMainWindowRes = interface.mainViewTransformReturnInfo.sceneResolution; if (!m_solidAngleViewFramebuffer || m_solidAngleViewFramebuffer->getCreationParameters().width != virtualSolidAngleWindowRes[0] || m_solidAngleViewFramebuffer->getCreationParameters().height != virtualSolidAngleWindowRes[1] || !m_mainViewFramebuffer || m_mainViewFramebuffer->getCreationParameters().width != virtualMainWindowRes[0] || m_mainViewFramebuffer->getCreationParameters().height != virtualMainWindowRes[1]) recreateFramebuffers(); @@ -381,7 +369,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR // const auto resourceIx = m_realFrameIx % MaxFramesInFlight; - auto* const cb = m_cmdBufs.data()[resourceIx].get(); + auto *const cb = m_cmdBufs.data()[resourceIx].get(); cb->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); cb->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); @@ -390,23 +378,23 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR asset::SBufferRange range{ .offset = 0, .size = m_outputStorageBuffer->getSize(), - .buffer = m_outputStorageBuffer }; + .buffer = m_outputStorageBuffer}; cb->fillBuffer(range, 0u); { - const auto& creationParams = m_solidAngleViewFramebuffer->getCreationParameters(); + const auto &creationParams = m_solidAngleViewFramebuffer->getCreationParameters(); cb->beginDebugMarker("Draw Circle View Frame"); { - const IGPUCommandBuffer::SClearDepthStencilValue farValue = { .depth = 0.f }; - const IGPUCommandBuffer::SClearColorValue clearValue = { .float32 = {0.f, 0.f, 0.f, 1.f} }; + const IGPUCommandBuffer::SClearDepthStencilValue farValue = {.depth = 0.f}; + const 
IGPUCommandBuffer::SClearColorValue clearValue = {.float32 = {0.f, 0.f, 0.f, 1.f}}; const IGPUCommandBuffer::SRenderpassBeginInfo renderpassInfo = - { - .framebuffer = m_solidAngleViewFramebuffer.get(), - .colorClearValues = &clearValue, - .depthStencilClearValues = &farValue, - .renderArea = { - .offset = {0, 0}, - .extent = {creationParams.width, creationParams.height}} }; + { + .framebuffer = m_solidAngleViewFramebuffer.get(), + .colorClearValues = &clearValue, + .depthStencilClearValues = &farValue, + .renderArea = { + .offset = {0, 0}, + .extent = {creationParams.width, creationParams.height}}}; beginRenderpass(cb, renderpassInfo); } // draw scene @@ -416,10 +404,10 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR PushConstants pc{ .modelMatrix = hlsl::float32_t3x4(hlsl::transpose(interface.m_OBBModelMatrix)), .viewport = {0.f, 0.f, static_cast(creationParams.width), static_cast(creationParams.height)}, - .samplingMode = m_samplingMode, .sampleCount = static_cast(m_SampleCount), - .frameIndex = lastFrameSeed }; - auto pipeline = m_solidAngleVisPipeline; + .frameIndex = lastFrameSeed}; + const uint32_t debugIdx = m_debugVisualization ? 
1u : 0u; + auto pipeline = m_solidAngleVisPipelines[m_samplingMode * DebugPermutations + debugIdx]; cb->bindGraphicsPipeline(pipeline.get()); cb->pushConstants(pipeline->getLayout(), hlsl::ShaderStage::ESS_FRAGMENT, 0, sizeof(pc), &pc); cb->bindDescriptorSets(nbl::asset::EPBP_GRAPHICS, pipeline->getLayout(), 0, 1, &m_ds.get()); @@ -428,27 +416,29 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR cb->endRenderPass(); cb->endDebugMarker(); } -#if DEBUG_DATA - m_device->waitIdle(); - std::memcpy(&m_GPUOutResulData, static_cast(m_allocation.memory->getMappedPointer()), sizeof(ResultData)); - m_device->waitIdle(); -#endif + + if (m_debugVisualization) + { + m_device->waitIdle(); + std::memcpy(&m_GPUOutResulData, static_cast(m_allocation.memory->getMappedPointer()), sizeof(ResultData)); + m_device->waitIdle(); + } } // draw main view if (m_mainViewFramebuffer) { { auto creationParams = m_mainViewFramebuffer->getCreationParameters(); - const IGPUCommandBuffer::SClearDepthStencilValue farValue = { .depth = 0.f }; - const IGPUCommandBuffer::SClearColorValue clearValue = { .float32 = {0.1f, 0.1f, 0.1f, 1.f} }; + const IGPUCommandBuffer::SClearDepthStencilValue farValue = {.depth = 0.f}; + const IGPUCommandBuffer::SClearColorValue clearValue = {.float32 = {0.1f, 0.1f, 0.1f, 1.f}}; const IGPUCommandBuffer::SRenderpassBeginInfo renderpassInfo = - { - .framebuffer = m_mainViewFramebuffer.get(), - .colorClearValues = &clearValue, - .depthStencilClearValues = &farValue, - .renderArea = { - .offset = {0, 0}, - .extent = {creationParams.width, creationParams.height}} }; + { + .framebuffer = m_mainViewFramebuffer.get(), + .colorClearValues = &clearValue, + .depthStencilClearValues = &farValue, + .renderArea = { + .offset = {0, 0}, + .extent = {creationParams.width, creationParams.height}}}; beginRenderpass(cb, renderpassInfo); } { // draw rays visualization @@ -457,15 +447,16 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public 
BuiltinR cb->beginDebugMarker("Draw Rays visualization"); // draw scene { - float32_t4x4 viewProj = *reinterpret_cast(&interface.camera.getConcatenatedMatrix()); - float32_t3x4 view = *reinterpret_cast(&interface.camera.getViewMatrix()); + float32_t4x4 viewProj = *reinterpret_cast(&interface.camera.getConcatenatedMatrix()); + float32_t3x4 view = *reinterpret_cast(&interface.camera.getViewMatrix()); PushConstantRayVis pc{ .viewProjMatrix = viewProj, .viewMatrix = view, .modelMatrix = hlsl::float32_t3x4(hlsl::transpose(interface.m_OBBModelMatrix)), + .invModelMatrix = hlsl::float32_t3x4(hlsl::transpose(hlsl::inverse(interface.m_OBBModelMatrix))), .viewport = {0.f, 0.f, static_cast(creationParams.width), static_cast(creationParams.height)}, - .frameIndex = m_frameSeeding ? static_cast(m_realFrameIx) : 0u }; - auto pipeline = m_rayVisualizationPipeline; + .frameIndex = m_frameSeeding ? static_cast(m_realFrameIx) : 0u}; + auto pipeline = m_rayVisPipelines[m_debugVisualization ? 1u : 0u]; cb->bindGraphicsPipeline(pipeline.get()); cb->pushConstants(pipeline->getLayout(), hlsl::ShaderStage::ESS_FRAGMENT, 0, sizeof(pc), &pc); cb->bindDescriptorSets(nbl::asset::EPBP_GRAPHICS, pipeline->getLayout(), 0, 1, &m_ds.get()); @@ -481,14 +472,14 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR float32_t4x4 viewProjMatrix; // TODO: get rid of legacy matrices { - const auto& camera = interface.camera; - memcpy(&viewMatrix, camera.getViewMatrix().pointer(), sizeof(viewMatrix)); - memcpy(&viewProjMatrix, camera.getConcatenatedMatrix().pointer(), sizeof(viewProjMatrix)); + const auto &camera = interface.camera; + memcpy(&viewMatrix, &camera.getViewMatrix(), sizeof(viewMatrix)); + memcpy(&viewProjMatrix, &camera.getConcatenatedMatrix(), sizeof(viewProjMatrix)); } const auto viewParams = CSimpleDebugRenderer::SViewParams(viewMatrix, viewProjMatrix); // tear down scene every frame - auto& instance = m_renderer->m_instances[0]; + auto &instance = 
m_renderer->m_instances[0]; instance.world = float32_t3x4(hlsl::transpose(interface.m_OBBModelMatrix)); instance.packedGeo = m_renderer->getGeometries().data(); // cube // +interface.gcIndex; m_renderer->render(cb, viewParams); // draw the cube/OBB @@ -505,28 +496,28 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR { cb->beginDebugMarker("SolidAngleVisualizer IMGUI Frame"); { - auto scRes = static_cast(m_surface->getSwapchainResources()); - const IGPUCommandBuffer::SClearColorValue clearValue = { .float32 = {0.f, 0.f, 0.f, 1.f} }; + auto scRes = static_cast(m_surface->getSwapchainResources()); + const IGPUCommandBuffer::SClearColorValue clearValue = {.float32 = {0.f, 0.f, 0.f, 1.f}}; const IGPUCommandBuffer::SRenderpassBeginInfo renderpassInfo = - { - .framebuffer = scRes->getFramebuffer(device_base_t::getCurrentAcquire().imageIndex), - .colorClearValues = &clearValue, - .depthStencilClearValues = nullptr, - .renderArea = { - .offset = {0, 0}, - .extent = {m_window->getWidth(), m_window->getHeight()}} }; + { + .framebuffer = scRes->getFramebuffer(device_base_t::getCurrentAcquire().imageIndex), + .colorClearValues = &clearValue, + .depthStencilClearValues = nullptr, + .renderArea = { + .offset = {0, 0}, + .extent = {m_window->getWidth(), m_window->getHeight()}}}; beginRenderpass(cb, renderpassInfo); } // draw ImGUI { - auto* imgui = interface.imGUI.get(); - auto* pipeline = imgui->getPipeline(); + auto *imgui = interface.imGUI.get(); + auto *pipeline = imgui->getPipeline(); cb->bindGraphicsPipeline(pipeline); // note that we use default UI pipeline layout where uiParams.resources.textures.setIx == uiParams.resources.samplers.setIx - const auto* ds = interface.subAllocDS->getDescriptorSet(); + const auto *ds = interface.subAllocDS->getDescriptorSet(); cb->bindDescriptorSets(EPBP_GRAPHICS, pipeline->getLayout(), imgui->getCreationParameters().resources.texturesInfo.setIx, 1u, &ds); // a timepoint in the future to release streaming 
resources for geometry - const ISemaphore::SWaitInfo drawFinished = { .semaphore = m_semaphore.get(), .value = m_realFrameIx + 1u }; + const ISemaphore::SWaitInfo drawFinished = {.semaphore = m_semaphore.get(), .value = m_realFrameIx + 1u}; if (!imgui->render(cb, drawFinished)) { m_logger->log("TODO: need to present acquired image before bailing because its already acquired.", ILogger::ELL_ERROR); @@ -539,22 +530,22 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR cb->end(); IQueue::SSubmitInfo::SSemaphoreInfo retval = - { - .semaphore = m_semaphore.get(), - .value = ++m_realFrameIx, - .stageMask = PIPELINE_STAGE_FLAGS::ALL_GRAPHICS_BITS }; + { + .semaphore = m_semaphore.get(), + .value = ++m_realFrameIx, + .stageMask = PIPELINE_STAGE_FLAGS::ALL_GRAPHICS_BITS}; const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[] = - { - {.cmdbuf = cb} }; + { + {.cmdbuf = cb}}; const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = { {.semaphore = device_base_t::getCurrentAcquire().semaphore, .value = device_base_t::getCurrentAcquire().acquireCount, - .stageMask = PIPELINE_STAGE_FLAGS::NONE} }; + .stageMask = PIPELINE_STAGE_FLAGS::NONE}}; const IQueue::SSubmitInfo infos[] = - { - {.waitSemaphores = acquired, - .commandBuffers = commandBuffers, - .signalSemaphores = {&retval, 1}} }; + { + {.waitSemaphores = acquired, + .commandBuffers = commandBuffers, + .signalSemaphores = {&retval, 1}}}; if (getGraphicsQueue()->submit(infos) != IQueue::RESULT::SUCCESS) { @@ -567,7 +558,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR } protected: - const video::IGPURenderpass::SCreationParams::SSubpassDependency* getDefaultSubpassDependencies() const override + const video::IGPURenderpass::SCreationParams::SSubpassDependency *getDefaultSubpassDependencies() const override { // Subsequent submits don't wait for each other, but they wait for acquire and get waited on by present const static 
IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] = { @@ -581,27 +572,27 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR // layout transition needs to finish before the color write .dstStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, .dstAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT} - // leave view offsets and flags default - }, + // leave view offsets and flags default + }, // want layout transition to begin after all color output is done { .srcSubpass = 0, .dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, .memoryBarrier = { - // last place where the color can get modified, depth is implicitly earlier - .srcStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, - // only write ops, reads can't be made available - .srcAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT - // spec says nothing is needed when presentation is the destination - } - // leave view offsets and flags default - }, - IGPURenderpass::SCreationParams::DependenciesEnd }; + // last place where the color can get modified, depth is implicitly earlier + .srcStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, + // only write ops, reads can't be made available + .srcAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT + // spec says nothing is needed when presentation is the destination + } + // leave view offsets and flags default + }, + IGPURenderpass::SCreationParams::DependenciesEnd}; return dependencies; } private: inline void update(const std::chrono::microseconds nextPresentationTimestamp) { - auto& camera = interface.camera; + auto &camera = interface.camera; camera.setMoveSpeed(interface.moveSpeed); camera.setRotateSpeed(interface.rotateSpeed); @@ -623,8 +614,8 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR // `timeDiff` being computed since `lastVirtualUpTimeStamp` camera.beginInputProcessing(nextPresentationTimestamp); { - 
mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void - { + mouse.consumeEvents([&](const IMouseEventChannel::range_t &events) -> void + { if (interface.move) camera.mouseProcess(events); // don't capture the events, only let camera handle them with its impl else @@ -644,9 +635,9 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR // interface.gcIndex = core::clamp(interface.gcIndex, 0ull, m_renderer->getGeometries().size() - 1); //} } }, - m_logger.get()); - keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void - { + m_logger.get()); + keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t &events) -> void + { if (interface.move) camera.keyboardProcess(events); // don't capture the events, only let camera handle them with its impl @@ -658,18 +649,18 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR previousEventTimestamp = e.timeStamp; uiEvents.keyboard.emplace_back(e); } }, - m_logger.get()); + m_logger.get()); } camera.endInputProcessing(nextPresentationTimestamp); const auto cursorPosition = m_window->getCursorControl()->getPosition(); ext::imgui::UI::SUpdateParameters params = - { - .mousePosition = float32_t2(cursorPosition.x, cursorPosition.y) - float32_t2(m_window->getX(), m_window->getY()), - .displaySize = {m_window->getWidth(), m_window->getHeight()}, - .mouseEvents = uiEvents.mouse, - .keyboardEvents = uiEvents.keyboard }; + { + .mousePosition = float32_t2(cursorPosition.x, cursorPosition.y) - float32_t2(m_window->getX(), m_window->getY()), + .displaySize = {m_window->getWidth(), m_window->getHeight()}, + .mouseEvents = uiEvents.mouse, + .keyboardEvents = uiEvents.keyboard}; // interface.objectName = m_scene->getInitParams().geometryNames[interface.gcIndex]; interface.imGUI->update(params); @@ -679,23 +670,23 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR { auto createImageAndView = [&](const 
uint16_t2 resolution, E_FORMAT format) -> smart_refctd_ptr - { - auto image = m_device->createImage({ {.type = IGPUImage::ET_2D, - .samples = IGPUImage::ESCF_1_BIT, - .format = format, - .extent = {resolution.x, resolution.y, 1}, - .mipLevels = 1, - .arrayLayers = 1, - .usage = IGPUImage::EUF_RENDER_ATTACHMENT_BIT | IGPUImage::EUF_SAMPLED_BIT} }); - if (!m_device->allocate(image->getMemoryReqs(), image.get()).isValid()) - return nullptr; - IGPUImageView::SCreationParams params = { - .image = std::move(image), - .viewType = IGPUImageView::ET_2D, - .format = format }; - params.subresourceRange.aspectMask = isDepthOrStencilFormat(format) ? IGPUImage::EAF_DEPTH_BIT : IGPUImage::EAF_COLOR_BIT; - return m_device->createImageView(std::move(params)); - }; + { + auto image = m_device->createImage({{.type = IGPUImage::ET_2D, + .samples = IGPUImage::ESCF_1_BIT, + .format = format, + .extent = {resolution.x, resolution.y, 1}, + .mipLevels = 1, + .arrayLayers = 1, + .usage = IGPUImage::EUF_RENDER_ATTACHMENT_BIT | IGPUImage::EUF_SAMPLED_BIT}}); + if (!m_device->allocate(image->getMemoryReqs(), image.get()).isValid()) + return nullptr; + IGPUImageView::SCreationParams params = { + .image = std::move(image), + .viewType = IGPUImageView::ET_2D, + .format = format}; + params.subresourceRange.aspectMask = isDepthOrStencilFormat(format) ? 
IGPUImage::EAF_DEPTH_BIT : IGPUImage::EAF_COLOR_BIT; + return m_device->createImageView(std::move(params)); + }; smart_refctd_ptr solidAngleView; smart_refctd_ptr mainView; @@ -708,19 +699,19 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR { solidAngleView = createImageAndView(solidAngleViewRes, finalSceneRenderFormat); auto solidAngleDepthView = createImageAndView(solidAngleViewRes, sceneRenderDepthFormat); - m_solidAngleViewFramebuffer = m_device->createFramebuffer({ {.renderpass = m_solidAngleRenderpass, + m_solidAngleViewFramebuffer = m_device->createFramebuffer({{.renderpass = m_solidAngleRenderpass, .depthStencilAttachments = &solidAngleDepthView.get(), .colorAttachments = &solidAngleView.get(), .width = solidAngleViewRes.x, - .height = solidAngleViewRes.y} }); + .height = solidAngleViewRes.y}}); mainView = createImageAndView(mainViewRes, finalSceneRenderFormat); auto mainDepthView = createImageAndView(mainViewRes, sceneRenderDepthFormat); - m_mainViewFramebuffer = m_device->createFramebuffer({ {.renderpass = m_mainRenderpass, + m_mainViewFramebuffer = m_device->createFramebuffer({{.renderpass = m_mainRenderpass, .depthStencilAttachments = &mainDepthView.get(), .colorAttachments = &mainView.get(), .width = mainViewRes.x, - .height = mainViewRes.y} }); + .height = mainViewRes.y}}); } else { @@ -729,7 +720,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR } // release previous slot and its image - interface.subAllocDS->multi_deallocate(0, static_cast(CInterface::Count), interface.renderColorViewDescIndices, { .semaphore = m_semaphore.get(), .value = m_realFrameIx + 1 }); + interface.subAllocDS->multi_deallocate(0, static_cast(CInterface::Count), interface.renderColorViewDescIndices, {.semaphore = m_semaphore.get(), .value = m_realFrameIx + 1}); // if (solidAngleView && mainView) { @@ -750,13 +741,13 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR .binding 
= TexturesImGUIBindingIndex, .arrayElement = interface.renderColorViewDescIndices[static_cast(CInterface::ERV_SOLID_ANGLE_VIEW)], .count = 1, - .info = &infos[static_cast(CInterface::ERV_SOLID_ANGLE_VIEW)]} }; - m_device->updateDescriptorSets({ write, static_cast(CInterface::Count) }, {}); + .info = &infos[static_cast(CInterface::ERV_SOLID_ANGLE_VIEW)]}}; + m_device->updateDescriptorSets({write, static_cast(CInterface::Count)}, {}); } interface.transformParams.sceneTexDescIx = interface.renderColorViewDescIndices[CInterface::ERV_MAIN_VIEW]; } - inline void beginRenderpass(IGPUCommandBuffer* cb, const IGPUCommandBuffer::SRenderpassBeginInfo& info) + inline void beginRenderpass(IGPUCommandBuffer *cb, const IGPUCommandBuffer::SRenderpassBeginInfo &info) { cb->beginRenderPass(info, IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE); cb->setScissor(0, 1, &info.renderArea); @@ -764,7 +755,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR .x = 0, .y = 0, .width = static_cast(info.renderArea.extent.width), - .height = static_cast(info.renderArea.extent.height) }; + .height = static_cast(info.renderArea.extent.height)}; cb->setViewport(0u, 1u, &viewport); } @@ -781,7 +772,8 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR // we create the Descriptor Set with a few slots extra to spare, so we don't have to `waitIdle` the device whenever ImGUI virtual window resizes constexpr static inline auto MaxImGUITextures = 2u + MaxFramesInFlight; - static inline SAMPLING_MODE m_samplingMode = SAMPLING_MODE::PROJECTED_PARALLELOGRAM_SOLID_ANGLE; + static inline SAMPLING_MODE m_samplingMode = SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_RECTANGLE; + static inline bool m_debugVisualization = true; static inline int m_SampleCount = 64; static inline bool m_frameSeeding = true; static inline ResultData m_GPUOutResulData; @@ -792,8 +784,10 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR 
smart_refctd_ptr m_renderer; smart_refctd_ptr m_solidAngleViewFramebuffer; smart_refctd_ptr m_mainViewFramebuffer; - smart_refctd_ptr m_solidAngleVisPipeline; - smart_refctd_ptr m_rayVisualizationPipeline; + // Pipeline variants: SolidAngleVis indexed by [mode * 2 + debugFlag], RayVis by [debugFlag] + static constexpr uint32_t DebugPermutations = 2; + smart_refctd_ptr m_solidAngleVisPipelines[SAMPLING_MODE::Count * DebugPermutations]; + smart_refctd_ptr m_rayVisPipelines[DebugPermutations]; // nbl::video::IDeviceMemoryAllocator::SAllocation m_allocation = {}; smart_refctd_ptr m_outputStorageBuffer; @@ -809,27 +803,26 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR { void operator()() { - ImGuiIO& io = ImGui::GetIO(); + ImGuiIO &io = ImGui::GetIO(); // TODO: why is this a lambda and not just an assignment in a scope ? camera.setProjectionMatrix([&]() - { - const auto& sceneRes = float16_t2(mainViewTransformReturnInfo.sceneResolution); + { + hlsl::float32_t4x4 projection; - matrix4SIMD projection; if (isPerspective) if (isLH) - projection = matrix4SIMD::buildProjectionMatrixPerspectiveFovLH(core::radians(fov), sceneRes.x / sceneRes.y, zNear, zFar); + projection = hlsl::math::thin_lens::lhPerspectiveFovMatrix(core::radians(fov), io.DisplaySize.x / io.DisplaySize.y * 0.5f, zNear, zFar); // TODO: why do I need to divide aspect ratio by 2? 
else - projection = matrix4SIMD::buildProjectionMatrixPerspectiveFovRH(core::radians(fov), sceneRes.x / sceneRes.y, zNear, zFar); + projection = hlsl::math::thin_lens::rhPerspectiveFovMatrix(core::radians(fov), io.DisplaySize.x / io.DisplaySize.y * 0.5f, zNear, zFar); else { - float viewHeight = viewWidth * sceneRes.y / sceneRes.x; + float viewHeight = viewWidth * io.DisplaySize.y / io.DisplaySize.x; if (isLH) - projection = matrix4SIMD::buildProjectionMatrixOrthoLH(viewWidth, viewHeight, zNear, zFar); + projection = hlsl::math::thin_lens::lhPerspectiveFovMatrix(viewWidth, viewHeight, zNear, zFar); else - projection = matrix4SIMD::buildProjectionMatrixOrthoRH(viewWidth, viewHeight, zNear, zFar); + projection = hlsl::math::thin_lens::rhPerspectiveFovMatrix(viewWidth, viewHeight, zNear, zFar); } return projection; }()); @@ -857,12 +850,14 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR ImGui::Text("Sampling Mode:"); ImGui::SameLine(); - const char* samplingModes[] = - { - "Triangle Solid Angle", - "Triangle Projected Solid Angle", - "Parallelogram Projected Solid Angle" - }; + const char *samplingModes[] = + { + "Triangle Solid Angle", + "Triangle Projected Solid Angle", + "Parallelogram Projected Solid Angle", + "Rectangle Pyramid Solid Angle", + "Biquadratic pyramid solid angle", + "Bilinear pyramid solid angle"}; int currentMode = static_cast(m_samplingMode); @@ -871,8 +866,10 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR m_samplingMode = static_cast(currentMode); } - - + ImGui::Checkbox("Debug Visualization", &m_debugVisualization); + ImGui::Text("Pipeline idx: SA=%d, Ray=%d", + static_cast(m_samplingMode) * DebugPermutations + (m_debugVisualization ? 1 : 0), + m_debugVisualization ? 
1 : 0); ImGui::Checkbox("Frame seeding", &m_frameSeeding); ImGui::SliderInt("Sample Count", &m_SampleCount, 0, 512); @@ -983,12 +980,6 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR * note it also modifies input view matrix but projection matrix is immutable */ - // No need because camera already has this functionality - // if (ImGui::IsKeyPressed(ImGuiKey_Home)) - // { - // cameraToHome(); - // } - if (ImGui::IsKeyPressed(ImGuiKey_End)) { m_TRS = TRS{}; @@ -1003,11 +994,11 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR ImGuizmo::SetID(0u); // TODO: camera will return hlsl::float32_tMxN - auto view = *reinterpret_cast(camera.getViewMatrix().pointer()); - imguizmoM16InOut.view = hlsl::transpose(getMatrix3x4As4x4(view)); + auto view = camera.getViewMatrix(); + imguizmoM16InOut.view = hlsl::transpose(hlsl::math::linalg::promote_affine<4, 4>(view)); // TODO: camera will return hlsl::float32_tMxN - imguizmoM16InOut.projection = hlsl::transpose(*reinterpret_cast(camera.getProjectionMatrix().pointer())); + imguizmoM16InOut.projection = hlsl::transpose(camera.getProjectionMatrix()); ImGuizmo::RecomposeMatrixFromComponents(&m_TRS.translation.x, &m_TRS.rotation.x, &m_TRS.scale.x, &imguizmoM16InOut.model[0][0]); if (flipGizmoY) // note we allow to flip gizmo just to match our coordinates @@ -1037,40 +1028,40 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR ImVec2 contentRegionSize = ImGui::GetContentRegionAvail(); solidAngleViewTransformReturnInfo.sceneResolution = uint16_t2(static_cast(contentRegionSize.x), static_cast(contentRegionSize.y)); solidAngleViewTransformReturnInfo.allowCameraMovement = false; // not used in this view - ImGui::Image({ renderColorViewDescIndices[ERV_SOLID_ANGLE_VIEW] }, contentRegionSize); + ImGui::Image({renderColorViewDescIndices[ERV_SOLID_ANGLE_VIEW]}, contentRegionSize); ImGui::End(); } // Show data coming from GPU -#if DEBUG_DATA + if 
(m_debugVisualization) { if (ImGui::Begin("Result Data")) { - auto drawColorField = [&](const char* fieldName, uint32_t index) - { - ImGui::Text("%s: %u", fieldName, index); + auto drawColorField = [&](const char *fieldName, uint32_t index) + { + ImGui::Text("%s: %u", fieldName, index); - if (index >= 27) - { - ImGui::SameLine(); - ImGui::Text(""); - return; - } + if (index >= 27) + { + ImGui::SameLine(); + ImGui::Text(""); + return; + } - const auto& c = colorLUT[index]; // uses the combined LUT we made earlier + const auto &c = colorLUT[index]; // uses the combined LUT we made earlier - ImGui::SameLine(); + ImGui::SameLine(); - // Color preview button - ImGui::ColorButton( - fieldName, - ImVec4(c.r, c.g, c.b, 1.0f), - 0, - ImVec2(20, 20)); + // Color preview button + ImGui::ColorButton( + fieldName, + ImVec4(c.r, c.g, c.b, 1.0f), + 0, + ImVec2(20, 20)); - ImGui::SameLine(); - ImGui::Text("%s", colorNames[index]); - }; + ImGui::SameLine(); + ImGui::Text("%s", colorNames[index]); + }; // Vertices if (ImGui::CollapsingHeader("Vertices", ImGuiTreeNodeFlags_DefaultOpen)) @@ -1085,7 +1076,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR ImGui::SameLine(); static const float32_t3 constCorners[8] = { float32_t3(-1, -1, -1), float32_t3(1, -1, -1), float32_t3(-1, 1, -1), float32_t3(1, 1, -1), - float32_t3(-1, -1, 1), float32_t3(1, -1, 1), float32_t3(-1, 1, 1), float32_t3(1, 1, 1) }; + float32_t3(-1, -1, 1), float32_t3(1, -1, 1), float32_t3(-1, 1, 1), float32_t3(1, 1, 1)}; float32_t3 vertexLocation = constCorners[m_GPUOutResulData.vertices[i]]; ImGui::Text(" : (%.3f, %.3f, %.3f", vertexLocation.x, vertexLocation.y, vertexLocation.z); } @@ -1110,32 +1101,112 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR drawColorField(" ", i); } + ImGui::Separator(); + ImGui::Text("Valid Samples: %u / %u", m_GPUOutResulData.validSampleCount / hlsl::max(m_GPUOutResulData.threadCount, 1u), 
m_GPUOutResulData.sampleCount); + ImGui::ProgressBar(static_cast(m_GPUOutResulData.validSampleCount / hlsl::max(m_GPUOutResulData.threadCount, 1u)) / static_cast(m_GPUOutResulData.sampleCount)); ImGui::Separator(); - // Silhouette info - drawColorField("silhouetteIndex", m_GPUOutResulData.silhouetteIndex); + // Silhouette + if (ImGui::CollapsingHeader("Silhouette")) + { + drawColorField("silhouetteIndex", m_GPUOutResulData.silhouetteIndex); + ImGui::Text("Region: (%u, %u, %u)", m_GPUOutResulData.region.x, m_GPUOutResulData.region.y, m_GPUOutResulData.region.z); + ImGui::Text("Silhouette Vertex Count: %u", m_GPUOutResulData.silhouetteVertexCount); + ImGui::Text("Positive Vertex Count: %u", m_GPUOutResulData.positiveVertCount); + ImGui::Text("Edge Visibility Mismatch: %s", m_GPUOutResulData.edgeVisibilityMismatch ? "true" : "false"); + ImGui::Text("Max Triangles Exceeded: %s", m_GPUOutResulData.maxTrianglesExceeded ? "true" : "false"); + for (uint32_t i = 0; i < 6; i++) + ImGui::Text("Vertex[%u]: %u", i, m_GPUOutResulData.vertices[i]); + ImGui::Text("Clipped Silhouette Vertex Count: %u", m_GPUOutResulData.clippedSilhouetteVertexCount); + for (uint32_t i = 0; i < 7; i++) + ImGui::Text("Clipped Vertex[%u]: (%.3f, %.3f, %.3f) Index: %u", i, + m_GPUOutResulData.clippedSilhouetteVertices[i].x, + m_GPUOutResulData.clippedSilhouetteVertices[i].y, + m_GPUOutResulData.clippedSilhouetteVertices[i].z, + m_GPUOutResulData.clippedSilhouetteVerticesIndices[i]); + + // Silhouette mask printed in binary + auto printBin = [](uint32_t bin, const char *name) + { + char buf[33]; + for (int i = 0; i < 32; i++) + buf[i] = (bin & (1u << (31 - i))) ? 
'1' : '0'; + buf[32] = '\0'; + ImGui::Text("%s: 0x%08X", name, bin); + ImGui::Text("binary: 0b%s", buf); + ImGui::Separator(); + }; + printBin(m_GPUOutResulData.silhouette, "Silhouette"); + printBin(m_GPUOutResulData.rotatedSil, "rotatedSilhouette"); + + printBin(m_GPUOutResulData.clipCount, "clipCount"); + printBin(m_GPUOutResulData.clipMask, "clipMask"); + printBin(m_GPUOutResulData.rotatedClipMask, "rotatedClipMask"); + printBin(m_GPUOutResulData.rotateAmount, "rotateAmount"); + printBin(m_GPUOutResulData.wrapAround, "wrapAround"); + } - ImGui::Text("silhouette Vertex Count: %u", m_GPUOutResulData.silhouetteVertexCount); - ImGui::Text("silhouette Positive VertexCount: %u", m_GPUOutResulData.positiveVertCount); - ImGui::Text("Silhouette Mismatch: %s", m_GPUOutResulData.edgeVisibilityMismatch ? "true" : "false"); - ImGui::Separator(); - ImGui::Text("Max triangles exceeded: %s", m_GPUOutResulData.maxTrianglesExceeded ? "true" : "false"); - ImGui::Text("spherical lune detected: %s", m_GPUOutResulData.sphericalLuneDetected ? "true" : "false"); - ImGui::Separator(); - //ImGui::Text("Sampling outside the silhouette: %s", m_GPUOutResulData.sampleOutsideSilhouette ? "true" : "false"); - ImGui::Text("Parallelogram does not bound: %s", m_GPUOutResulData.parallelogramDoesNotBound ? "true" : "false"); - ImGui::Text("Parallelogram vertices inside: %s", m_GPUOutResulData.parallelogramVerticesInside ? "true" : "false"); - ImGui::Text("Parallelogram edges inside: %s", m_GPUOutResulData.parallelogramEdgesInside ? 
"true" : "false"); - ImGui::Text("Parallelogram area: %.3f", m_GPUOutResulData.parallelogramArea); - ImGui::Text("Failed vertex index: %u", m_GPUOutResulData.failedVertexIndex); - ImGui::Text("Failed vertex UV: (%.3f, %.3f)", m_GPUOutResulData.failedVertexUV.x, m_GPUOutResulData.failedVertexUV.y); - ImGui::Text("Failed edge index: %u", m_GPUOutResulData.failedEdgeIndex); - ImGui::Text("Failed edge sample: %u", m_GPUOutResulData.failedEdgeSample); - ImGui::Text("Failed edge UV: (%.3f, %.3f)", m_GPUOutResulData.failedEdgeUV.x, m_GPUOutResulData.failedEdgeUV.y); - ImGui::Text("Failed point 3D: (%.3f, %.3f, %.3f)", m_GPUOutResulData.failedPoint.x, m_GPUOutResulData.failedPoint.y, m_GPUOutResulData.failedPoint.z); - for (uint32_t i = 0; i < 8; i++) - ImGui::Text("edge is convex: %s", m_GPUOutResulData.edgeIsConvex[i] ? "true" : "false"); - ImGui::Separator(); + // Parallelogram + if (m_samplingMode == PROJECTED_PARALLELOGRAM_SOLID_ANGLE && ImGui::CollapsingHeader("Projected Parallelogram", ImGuiTreeNodeFlags_DefaultOpen)) + { + ImGui::Text("Does Not Bound: %s", m_GPUOutResulData.parallelogramDoesNotBound ? "true" : "false"); + ImGui::Text("Area: %.3f", m_GPUOutResulData.parallelogramArea); + ImGui::Text("Failed Vertex Index: %u", m_GPUOutResulData.failedVertexIndex); + for (uint32_t i = 0; i < 4; i++) + ImGui::Text("Edge Is Convex[%u]: %s", i, m_GPUOutResulData.edgeIsConvex[i] ? "true" : "false"); + ImGui::Text("Vertices Inside: %s", m_GPUOutResulData.parallelogramVerticesInside ? "true" : "false"); + ImGui::Text("Edges Inside: %s", m_GPUOutResulData.parallelogramEdgesInside ? 
"true" : "false"); + for (uint32_t i = 0; i < 4; i++) + ImGui::Text("Corner[%u]: (%.3f, %.3f)", i, m_GPUOutResulData.parallelogramCorners[i].x, m_GPUOutResulData.parallelogramCorners[i].y); + } + else if ((m_samplingMode == SYMMETRIC_PYRAMID_SOLID_ANGLE_RECTANGLE || m_samplingMode == SYMMETRIC_PYRAMID_SOLID_ANGLE_BIQUADRATIC ||m_samplingMode == SYMMETRIC_PYRAMID_SOLID_ANGLE_BILINEAR) && ImGui::CollapsingHeader("Spherical Pyramid", ImGuiTreeNodeFlags_DefaultOpen)) + { + ImGui::Text("Spans Hemisphere: %s", m_GPUOutResulData.pyramidSpansHemisphere ? "YES (warning)" : "no"); + ImGui::Text("Best Caliper Edge: %u", m_GPUOutResulData.pyramidBestEdge); + ImGui::Separator(); + + ImGui::Text("Axis 1: (%.4f, %.4f, %.4f)", + m_GPUOutResulData.pyramidAxis1.x, m_GPUOutResulData.pyramidAxis1.y, m_GPUOutResulData.pyramidAxis1.z); + ImGui::Text(" Half-Width: %.4f Offset: %.4f", + m_GPUOutResulData.pyramidHalfWidth1, m_GPUOutResulData.pyramidOffset1); + ImGui::Text(" Bounds: [%.4f, %.4f]", + m_GPUOutResulData.pyramidMin1, m_GPUOutResulData.pyramidMax1); + + ImGui::Text("Axis 2: (%.4f, %.4f, %.4f)", + m_GPUOutResulData.pyramidAxis2.x, m_GPUOutResulData.pyramidAxis2.y, m_GPUOutResulData.pyramidAxis2.z); + ImGui::Text(" Half-Width: %.4f Offset: %.4f", + m_GPUOutResulData.pyramidHalfWidth2, m_GPUOutResulData.pyramidOffset2); + ImGui::Text(" Bounds: [%.4f, %.4f]", + m_GPUOutResulData.pyramidMin2, m_GPUOutResulData.pyramidMax2); + + ImGui::Separator(); + ImGui::Text("Center: (%.4f, %.4f, %.4f)", + m_GPUOutResulData.pyramidCenter.x, m_GPUOutResulData.pyramidCenter.y, m_GPUOutResulData.pyramidCenter.z); + ImGui::Text("Solid Angle (bound): %.6f sr", m_GPUOutResulData.pyramidSolidAngle); + } + else if (m_samplingMode == TRIANGLE_SOLID_ANGLE || m_samplingMode == TRIANGLE_PROJECTED_SOLID_ANGLE && ImGui::CollapsingHeader("Spherical Triangle", ImGuiTreeNodeFlags_DefaultOpen)) + { + ImGui::Text("Spherical Lune Detected: %s", m_GPUOutResulData.sphericalLuneDetected ? 
"true" : "false"); + ImGui::Text("Triangle Count: %u", m_GPUOutResulData.triangleCount); + // print solidAngles for each triangle + { + ImGui::Text("Solid Angles per Triangle:"); + ImGui::BeginTable("SolidAnglesTable", 2); + ImGui::TableSetupColumn("Triangle Index"); + ImGui::TableSetupColumn("Solid Angle"); + ImGui::TableHeadersRow(); + for (uint32_t i = 0; i < m_GPUOutResulData.triangleCount; ++i) + { + ImGui::TableNextRow(); + ImGui::TableSetColumnIndex(0); + ImGui::Text("%u", i); + ImGui::TableSetColumnIndex(1); + ImGui::Text("%.6f", m_GPUOutResulData.solidAngles[i]); + } + ImGui::Text("Total: %.6f", m_GPUOutResulData.totalSolidAngles); + ImGui::EndTable(); + } + } { float32_t3 xAxis = m_OBBModelMatrix[0].xyz; @@ -1150,6 +1221,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR bool hasSkew = false; if (abs(dot(nx, ny)) > epsilon || abs(dot(nx, nz)) > epsilon || abs(dot(ny, nz)) > epsilon) hasSkew = true; + ImGui::Separator(); ImGui::Text("Matrix Has Skew: %s", hasSkew ? 
"true" : "false"); } @@ -1210,92 +1282,44 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR } ImGui::EndPopup(); } - - ImGui::Separator(); - - // Region (uint32_t3) - ImGui::Text("region: (%u, %u, %u)", - m_GPUOutResulData.region.x, m_GPUOutResulData.region.y, m_GPUOutResulData.region.z); - - // print solidAngles for each triangle - { - ImGui::Text("Solid Angles per Triangle:"); - ImGui::BeginTable("SolidAnglesTable", 2); - ImGui::TableSetupColumn("Triangle Index"); - ImGui::TableSetupColumn("Solid Angle"); - ImGui::TableHeadersRow(); - for (uint32_t i = 0; i < m_GPUOutResulData.triangleCount; ++i) - { - ImGui::TableNextRow(); - ImGui::TableSetColumnIndex(0); - ImGui::Text("%u", i); - ImGui::TableSetColumnIndex(1); - ImGui::Text("%.6f", m_GPUOutResulData.solidAngles[i]); - } - ImGui::Text("Total: %.6f", m_GPUOutResulData.totalSolidAngles); - ImGui::EndTable(); - } - - ImGui::Separator(); - - // Silhouette mask printed in binary - - auto printBin = [](uint32_t bin, const char* name) - { - char buf[33]; - for (int i = 0; i < 32; i++) - buf[i] = (bin & (1u << (31 - i))) ? 
'1' : '0'; - buf[32] = '\0'; - ImGui::Text("%s: 0x%08X", name, bin); - ImGui::Text("binary: 0b%s", buf); - ImGui::Separator(); - }; - printBin(m_GPUOutResulData.silhouette, "Silhouette"); - printBin(m_GPUOutResulData.rotatedSil, "rotatedSilhouette"); - - printBin(m_GPUOutResulData.clipCount, "clipCount"); - printBin(m_GPUOutResulData.clipMask, "clipMask"); - printBin(m_GPUOutResulData.rotatedClipMask, "rotatedClipMask"); - printBin(m_GPUOutResulData.rotateAmount, "rotateAmount"); - printBin(m_GPUOutResulData.wrapAround, "wrapAround"); } ImGui::End(); } -#endif + // view matrices editor { ImGui::Begin("Matrices"); - auto addMatrixTable = [&](const char* topText, const char* tableName, const int rows, const int columns, const float* pointer, const bool withSeparator = true) + auto addMatrixTable = [&](const char *topText, const char *tableName, const int rows, const int columns, const float *pointer, const bool withSeparator = true) + { + ImGui::Text(topText); + if (ImGui::BeginTable(tableName, columns)) { - ImGui::Text(topText); - if (ImGui::BeginTable(tableName, columns)) + for (int y = 0; y < rows; ++y) { - for (int y = 0; y < rows; ++y) + ImGui::TableNextRow(); + for (int x = 0; x < columns; ++x) { - ImGui::TableNextRow(); - for (int x = 0; x < columns; ++x) - { - ImGui::TableSetColumnIndex(x); - ImGui::Text("%.3f", *(pointer + (y * columns) + x)); - } + ImGui::TableSetColumnIndex(x); + ImGui::Text("%.3f", *(pointer + (y * columns) + x)); } - ImGui::EndTable(); } + ImGui::EndTable(); + } - if (withSeparator) - ImGui::Separator(); - }; + if (withSeparator) + ImGui::Separator(); + }; static RandomSampler rng(0x45); // Initialize RNG with seed // Helper function to check if cube intersects unit sphere at origin - auto isCubeOutsideUnitSphere = [](const float32_t3& translation, const float32_t3& scale) -> bool - { - float cubeRadius = glm::length(scale) * 0.5f; - float distanceToCenter = glm::length(translation); - return (distanceToCenter - cubeRadius) > 1.0f; - }; 
+ auto isCubeOutsideUnitSphere = [](const float32_t3 &translation, const float32_t3 &scale) -> bool + { + float cubeRadius = glm::length(scale) * 0.5f; + float distanceToCenter = glm::length(translation); + return (distanceToCenter - cubeRadius) > 1.0f; + }; static TRS lastTRS = {}; if (ImGui::Button("Randomize Translation")) @@ -1345,8 +1369,8 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR } addMatrixTable("Model Matrix", "ModelMatrixTable", 4, 4, &m_OBBModelMatrix[0][0]); - addMatrixTable("Camera View Matrix", "ViewMatrixTable", 3, 4, camera.getViewMatrix().pointer()); - addMatrixTable("Camera View Projection Matrix", "ViewProjectionMatrixTable", 4, 4, camera.getProjectionMatrix().pointer(), false); + addMatrixTable("Camera View Matrix", "ViewMatrixTable", 3, 4, &camera.getViewMatrix()[0].x); + addMatrixTable("Camera View Projection Matrix", "ViewProjectionMatrixTable", 4, 4, &camera.getProjectionMatrix()[0].x, false); ImGui::End(); } @@ -1355,7 +1379,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR // To be 100% accurate and not overly conservative we'd have to explicitly `cull_frees` and defragment each time, // so unless you do that, don't use this basic info to optimize the size of your IMGUI buffer. 
{ - auto* streaminingBuffer = imGUI->getStreamingBuffer(); + auto *streaminingBuffer = imGUI->getStreamingBuffer(); const size_t total = streaminingBuffer->get_total_size(); // total memory range size for which allocation can be requested const size_t freeSize = streaminingBuffer->getAddressAllocator().get_free_size(); // max total free bloock memory size we can still allocate from total memory available @@ -1388,12 +1412,12 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR ImGui::PopStyleColor(); - ImDrawList* drawList = ImGui::GetWindowDrawList(); + ImDrawList *drawList = ImGui::GetWindowDrawList(); ImVec2 progressBarPos = ImGui::GetItemRectMin(); ImVec2 progressBarSize = ImGui::GetItemRectSize(); - const char* text = "%.2f%% free"; + const char *text = "%.2f%% free"; char textBuffer[64]; snprintf(textBuffer, sizeof(textBuffer), text, freePercentage); @@ -1430,15 +1454,15 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR ERV_SOLID_ANGLE_VIEW, Count }; - SubAllocatedDescriptorSet::value_type renderColorViewDescIndices[E_RENDER_VIEWS::Count] = { SubAllocatedDescriptorSet::invalid_value, SubAllocatedDescriptorSet::invalid_value }; + SubAllocatedDescriptorSet::value_type renderColorViewDescIndices[E_RENDER_VIEWS::Count] = {SubAllocatedDescriptorSet::invalid_value, SubAllocatedDescriptorSet::invalid_value}; // - Camera camera = Camera(cameraIntialPosition, cameraInitialTarget, core::matrix4SIMD(), 1, 1, nbl::core::vectorSIMDf(0.0f, 0.0f, 1.0f)); + Camera camera = Camera(cameraIntialPosition, cameraInitialTarget, {}, 1, 1, nbl::core::vectorSIMDf(0.0f, 0.0f, 1.0f)); // mutables struct TRS // Source of truth { - float32_t3 translation{ 0.0f, 0.0f, 1.5f }; - float32_t3 rotation{ 0.0f }; // MUST stay orthonormal - float32_t3 scale{ 1.0f }; + float32_t3 translation{0.0f, 0.0f, 1.5f}; + float32_t3 rotation{0.0f}; // MUST stay orthonormal + float32_t3 scale{1.0f}; } m_TRS; float32_t4x4 m_OBBModelMatrix; // 
always overwritten from TRS @@ -1447,9 +1471,9 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR TransformReturnInfo mainViewTransformReturnInfo; TransformReturnInfo solidAngleViewTransformReturnInfo; - const static inline core::vectorSIMDf cameraIntialPosition{ -3.0f, 6.0f, 3.0f }; - const static inline core::vectorSIMDf cameraInitialTarget{ 0.f, 0.0f, 3.f }; - const static inline core::vectorSIMDf cameraInitialUp{ 0.f, 0.f, 1.f }; + const static inline core::vectorSIMDf cameraIntialPosition{-3.0f, 6.0f, 3.0f}; + const static inline core::vectorSIMDf cameraInitialTarget{0.f, 0.0f, 3.f}; + const static inline core::vectorSIMDf cameraInitialUp{0.f, 0.f, 1.f}; float fov = 90.f, zNear = 0.1f, zFar = 10000.f, moveSpeed = 1.f, rotateSpeed = 1.f; float viewWidth = 10.f; @@ -1457,13 +1481,13 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR bool isPerspective = true, isLH = true, flipGizmoY = true, move = true; bool firstFrame = true; - SolidAngleVisualizer* m_visualizer; + SolidAngleVisualizer *m_visualizer; } interface; class SamplingBenchmark final { public: - SamplingBenchmark(SolidAngleVisualizer& base) + SamplingBenchmark(SolidAngleVisualizer &base) : m_api(base.m_api), m_device(base.m_device), m_logger(base.m_logger), m_visualizer(&base) { @@ -1478,15 +1502,13 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR if (!m_cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_timestampAfterCmdBuff)) base.logFail("Failed to create Command Buffers!\n"); - // Load shaders, set up pipeline + // Load shaders, set up pipelines (one per sampling mode) { - smart_refctd_ptr shader; + auto loadShader = [&](auto key) -> smart_refctd_ptr { IAssetLoader::SAssetLoadParams lp = {}; lp.logger = base.m_logger.get(); - lp.workingDirectory = "app_resources"; // virtual root - // this time we load a shader directly from a file - auto key = 
nbl::this_example::builtin::build::get_spirv_key<"benchmark">(m_device.get()); + lp.workingDirectory = "app_resources"; auto assetBundle = base.m_assetMgr->getAsset(key.data(), lp); const auto assets = assetBundle.getContents(); if (assets.empty()) @@ -1494,21 +1516,28 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR base.logFail("Could not load shader!"); assert(0); } - - // It would be super weird if loading a shader from a file produced more than 1 asset assert(assets.size() == 1); - shader = IAsset::castDown(assets[0]); - } + auto shader = IAsset::castDown(assets[0]); + if (!shader) + base.logFail("Failed to load precompiled benchmark shader!\n"); + return shader; + }; - if (!shader) - base.logFail("Failed to load precompiled \"benchmark\" shader!\n"); + smart_refctd_ptr shaders[SAMPLING_MODE::Count] = { + loadShader(nbl::this_example::builtin::build::get_spirv_key<"benchmark_tri_sa">(m_device.get())), + loadShader(nbl::this_example::builtin::build::get_spirv_key<"benchmark_tri_psa">(m_device.get())), + loadShader(nbl::this_example::builtin::build::get_spirv_key<"benchmark_para">(m_device.get())), + loadShader(nbl::this_example::builtin::build::get_spirv_key<"benchmark_rectangle">(m_device.get())), + loadShader(nbl::this_example::builtin::build::get_spirv_key<"benchmark_biquad">(m_device.get())), + loadShader(nbl::this_example::builtin::build::get_spirv_key<"benchmark_bilinear">(m_device.get())), + }; nbl::video::IGPUDescriptorSetLayout::SBinding bindings[1] = { {.binding = 0, .type = nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, .stageFlags = ShaderStage::ESS_COMPUTE, - .count = 1} }; + .count = 1}}; smart_refctd_ptr dsLayout = base.m_device->createDescriptorSetLayout(bindings); if (!dsLayout) base.logFail("Failed to create a Descriptor Layout!\n"); @@ -1516,24 +1545,25 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public 
BuiltinR SPushConstantRange pushConstantRanges[] = { {.stageFlags = ShaderStage::ESS_COMPUTE, .offset = 0, - .size = sizeof(BenchmarkPushConstants)} }; + .size = sizeof(BenchmarkPushConstants)}}; m_pplnLayout = base.m_device->createPipelineLayout(pushConstantRanges, smart_refctd_ptr(dsLayout)); if (!m_pplnLayout) base.logFail("Failed to create a Pipeline Layout!\n"); + for (uint32_t i = 0; i < SAMPLING_MODE::Count; i++) { IGPUComputePipeline::SCreationParams params = {}; params.layout = m_pplnLayout.get(); params.shader.entryPoint = "main"; - params.shader.shader = shader.get(); - if (!base.m_device->createComputePipelines(nullptr, { ¶ms, 1 }, &m_pipeline)) + params.shader.shader = shaders[i].get(); + if (!base.m_device->createComputePipelines(nullptr, {¶ms, 1}, &m_pipelines[i])) base.logFail("Failed to create pipelines (compile & link shaders)!\n"); } // Allocate the memory { constexpr size_t BufferSize = BENCHMARK_WORKGROUP_COUNT * BENCHMARK_WORKGROUP_DIMENSION_SIZE_X * - BENCHMARK_WORKGROUP_DIMENSION_SIZE_Y * BENCHMARK_WORKGROUP_DIMENSION_SIZE_Z * sizeof(uint32_t); + BENCHMARK_WORKGROUP_DIMENSION_SIZE_Y * BENCHMARK_WORKGROUP_DIMENSION_SIZE_Z * sizeof(uint32_t); nbl::video::IGPUBuffer::SCreationParams params = {}; params.size = BufferSize; @@ -1551,15 +1581,15 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR base.logFail("Failed to allocate Device Memory compatible with our GPU Buffer!\n"); assert(dummyBuff->getBoundMemory().memory == m_allocation.memory.get()); - smart_refctd_ptr pool = base.m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE, { &dsLayout.get(), 1 }); + smart_refctd_ptr pool = base.m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE, {&dsLayout.get(), 1}); m_ds = pool->createDescriptorSet(std::move(dsLayout)); { IGPUDescriptorSet::SDescriptorInfo info[1]; info[0].desc = smart_refctd_ptr(dummyBuff); - info[0].info.buffer = { .offset = 0, .size = BufferSize }; + 
info[0].info.buffer = {.offset = 0, .size = BufferSize}; IGPUDescriptorSet::SWriteDescriptorSet writes[1] = { - {.dstSet = m_ds.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = info} }; + {.dstSet = m_ds.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = info}}; base.m_device->updateDescriptorSets(writes, {}); } } @@ -1578,15 +1608,23 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR { m_logger->log("\n\nsampling benchmark result:", ILogger::ELL_PERFORMANCE); - m_logger->log("sampling benchmark, parallelogram projected solid angle result:", ILogger::ELL_PERFORMANCE); + m_logger->log("sampling benchmark, SYMMETRIC_PYRAMID_SOLID_ANGLE_RECTANGLE result:", ILogger::ELL_PERFORMANCE); + performBenchmark(SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_RECTANGLE); + + m_logger->log("sampling benchmark, SYMMETRIC_PYRAMID_SOLID_ANGLE_BIQUADRATIC result:", ILogger::ELL_PERFORMANCE); + performBenchmark(SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_BIQUADRATIC); + + m_logger->log("sampling benchmark, SYMMETRIC_PYRAMID_SOLID_ANGLE_BILINEAR result:", ILogger::ELL_PERFORMANCE); + performBenchmark(SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_BILINEAR); + + m_logger->log("sampling benchmark, PROJECTED_PARALLELOGRAM_SOLID_ANGLE result:", ILogger::ELL_PERFORMANCE); performBenchmark(SAMPLING_MODE::PROJECTED_PARALLELOGRAM_SOLID_ANGLE); - m_logger->log("sampling benchmark, triangle solid angle result:", ILogger::ELL_PERFORMANCE); + m_logger->log("sampling benchmark, TRIANGLE_SOLID_ANGLE result:", ILogger::ELL_PERFORMANCE); performBenchmark(SAMPLING_MODE::TRIANGLE_SOLID_ANGLE); - //m_logger->log("sampling benchmark, triangle projected solid angle result:", ILogger::ELL_PERFORMANCE); - //performBenchmark(SAMPLING_MODE::TRIANGLE_PROJECTED_SOLID_ANGLE); - + // m_logger->log("sampling benchmark, triangle projected solid angle result:", ILogger::ELL_PERFORMANCE); + // performBenchmark(SAMPLING_MODE::TRIANGLE_PROJECTED_SOLID_ANGLE); } private: 
@@ -1599,35 +1637,34 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR uint64_t semaphoreCounter = 0; smart_refctd_ptr semaphore = m_device->createSemaphore(semaphoreCounter); - IQueue::SSubmitInfo::SSemaphoreInfo signals[] = { {.semaphore = semaphore.get(), .value = 0u, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT} }; - IQueue::SSubmitInfo::SSemaphoreInfo waits[] = { {.semaphore = semaphore.get(), .value = 0u, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT} }; + IQueue::SSubmitInfo::SSemaphoreInfo signals[] = {{.semaphore = semaphore.get(), .value = 0u, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT}}; + IQueue::SSubmitInfo::SSemaphoreInfo waits[] = {{.semaphore = semaphore.get(), .value = 0u, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT}}; IQueue::SSubmitInfo beforeTimestapSubmitInfo[1] = {}; - const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufsBegin[] = { {.cmdbuf = m_timestampBeforeCmdBuff.get()} }; + const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufsBegin[] = {{.cmdbuf = m_timestampBeforeCmdBuff.get()}}; beforeTimestapSubmitInfo[0].commandBuffers = cmdbufsBegin; beforeTimestapSubmitInfo[0].signalSemaphores = signals; beforeTimestapSubmitInfo[0].waitSemaphores = waits; IQueue::SSubmitInfo afterTimestapSubmitInfo[1] = {}; - const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufsEnd[] = { {.cmdbuf = m_timestampAfterCmdBuff.get()} }; + const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufsEnd[] = {{.cmdbuf = m_timestampAfterCmdBuff.get()}}; afterTimestapSubmitInfo[0].commandBuffers = cmdbufsEnd; afterTimestapSubmitInfo[0].signalSemaphores = signals; afterTimestapSubmitInfo[0].waitSemaphores = waits; IQueue::SSubmitInfo benchmarkSubmitInfos[1] = {}; - const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[] = { {.cmdbuf = m_cmdbuf.get()} }; + const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[] = {{.cmdbuf = m_cmdbuf.get()}}; benchmarkSubmitInfos[0].commandBuffers = 
cmdbufs; benchmarkSubmitInfos[0].signalSemaphores = signals; benchmarkSubmitInfos[0].waitSemaphores = waits; - m_pushConstants.benchmarkMode = mode; m_pushConstants.modelMatrix = float32_t3x4(transpose(m_visualizer->interface.m_OBBModelMatrix)); - recordCmdBuff(); + m_pushConstants.sampleCount = m_SampleCount; + recordCmdBuff(mode); // warmup runs for (int i = 0; i < WarmupIterations; ++i) { - if (i == 0) m_api->startCapture(); waits[0].value = semaphoreCounter; @@ -1661,11 +1698,11 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR m_logger->log("%llu ns, %f s", ILogger::ELL_PERFORMANCE, nativeBenchmarkTimeElapsedNanoseconds, nativeBenchmarkTimeElapsedSeconds); } - void recordCmdBuff() + void recordCmdBuff(SAMPLING_MODE mode) { m_cmdbuf->begin(IGPUCommandBuffer::USAGE::SIMULTANEOUS_USE_BIT); m_cmdbuf->beginDebugMarker("sampling compute dispatch", vectorSIMDf(0, 1, 0, 1)); - m_cmdbuf->bindComputePipeline(m_pipeline.get()); + m_cmdbuf->bindComputePipeline(m_pipelines[mode].get()); m_cmdbuf->bindDescriptorSets(nbl::asset::EPBP_COMPUTE, m_pplnLayout.get(), 0, 1, &m_ds.get()); m_cmdbuf->pushConstants(m_pplnLayout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(BenchmarkPushConstants), &m_pushConstants); m_cmdbuf->dispatch(BENCHMARK_WORKGROUP_COUNT, 1, 1); @@ -1707,7 +1744,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR core::smart_refctd_ptr m_api; smart_refctd_ptr m_device; smart_refctd_ptr m_logger; - SolidAngleVisualizer* m_visualizer; + SolidAngleVisualizer *m_visualizer; nbl::video::IDeviceMemoryAllocator::SAllocation m_allocation = {}; smart_refctd_ptr m_cmdpool = nullptr; @@ -1715,20 +1752,20 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR smart_refctd_ptr m_ds = nullptr; smart_refctd_ptr m_pplnLayout = nullptr; BenchmarkPushConstants m_pushConstants; - smart_refctd_ptr m_pipeline; + smart_refctd_ptr m_pipelines[SAMPLING_MODE::Count]; 
smart_refctd_ptr m_timestampBeforeCmdBuff = nullptr; smart_refctd_ptr m_timestampAfterCmdBuff = nullptr; smart_refctd_ptr m_queryPool = nullptr; uint32_t m_queueFamily; - IQueue* m_computeQueue; + IQueue *m_computeQueue; static constexpr int WarmupIterations = 50; static constexpr int Iterations = 1; }; template - inline bool logFail(const char* msg, Args &&...args) + inline bool logFail(const char *msg, Args &&...args) { m_logger->log(msg, ILogger::ELL_ERROR, std::forward(args)...); return false; diff --git a/common/include/nbl/examples/cameras/CCamera.hpp b/common/include/nbl/examples/cameras/CCamera.hpp index 782c8b624..8fadbd866 100644 --- a/common/include/nbl/examples/cameras/CCamera.hpp +++ b/common/include/nbl/examples/cameras/CCamera.hpp @@ -16,8 +16,8 @@ #include #include -class Camera -{ +class Camera +{ public: Camera() = default; Camera(const nbl::core::vectorSIMDf& position, const nbl::core::vectorSIMDf& lookat, const nbl::hlsl::float32_t4x4& projection, float moveSpeed = 1.0f, float rotateSpeed = 1.0f, const nbl::core::vectorSIMDf& upVec = nbl::core::vectorSIMDf(0.0f, 1.0f, 0.0f), const nbl::core::vectorSIMDf& backupUpVec = nbl::core::vectorSIMDf(0.5f, 1.0f, 0.0f)) @@ -72,7 +72,7 @@ class Camera inline void mapKeysCustom(std::array& map) { keysMap = map; } inline const nbl::hlsl::float32_t4x4& getProjectionMatrix() const { return projMatrix; } - inline const nbl::hlsl::float32_t3x4& getViewMatrix() const { return viewMatrix; } + inline const nbl::hlsl::float32_t3x4& getViewMatrix() const { return viewMatrix; } inline const nbl::hlsl::float32_t4x4& getConcatenatedMatrix() const { return concatMatrix; } inline void setProjectionMatrix(const nbl::hlsl::float32_t4x4& projection) @@ -81,16 +81,16 @@ class Camera leftHanded = nbl::hlsl::determinant(projMatrix) < 0.f; concatMatrix = nbl::hlsl::math::linalg::promoted_mul(projMatrix, viewMatrix); } - + inline void setPosition(const nbl::core::vectorSIMDf& pos) { position.set(pos); recomputeViewMatrix(); } - + 
inline const nbl::core::vectorSIMDf& getPosition() const { return position; } - inline void setTarget(const nbl::core::vectorSIMDf& pos) + inline void setTarget(const nbl::core::vectorSIMDf& pos) { target.set(pos); recomputeViewMatrix(); @@ -99,11 +99,11 @@ class Camera inline const nbl::core::vectorSIMDf& getTarget() const { return target; } inline void setUpVector(const nbl::core::vectorSIMDf& up) { upVector = up; } - + inline void setBackupUpVector(const nbl::core::vectorSIMDf& up) { backupUpVector = up; } inline const nbl::core::vectorSIMDf& getUpVector() const { return upVector; } - + inline const nbl::core::vectorSIMDf& getBackupUpVector() const { return backupUpVector; } inline const float getMoveSpeed() const { return moveSpeed; } @@ -114,7 +114,7 @@ class Camera inline void setRotateSpeed(const float _rotateSpeed) { rotateSpeed = _rotateSpeed; } - inline void recomputeViewMatrix() + inline void recomputeViewMatrix() { nbl::hlsl::float32_t3 pos = nbl::core::convertToHLSLVector(position).xyz; nbl::hlsl::float32_t3 localTarget = nbl::hlsl::normalize(nbl::core::convertToHLSLVector(target).xyz - pos); @@ -144,64 +144,78 @@ class Camera void mouseProcess(const nbl::ui::IMouseEventChannel::range_t& events) { - for (auto eventIt=events.begin(); eventIt!=events.end(); eventIt++) + for (auto eventIt = events.begin(); eventIt != events.end(); eventIt++) { auto ev = *eventIt; - if(ev.type == nbl::ui::SMouseEvent::EET_CLICK && ev.clickEvent.mouseButton == nbl::ui::EMB_LEFT_BUTTON) - if(ev.clickEvent.action == nbl::ui::SMouseEvent::SClickEvent::EA_PRESSED) + if (ev.type == nbl::ui::SMouseEvent::EET_CLICK && ev.clickEvent.mouseButton == nbl::ui::EMB_LEFT_BUTTON) + if (ev.clickEvent.action == nbl::ui::SMouseEvent::SClickEvent::EA_PRESSED) mouseDown = true; else if (ev.clickEvent.action == nbl::ui::SMouseEvent::SClickEvent::EA_RELEASED) mouseDown = false; - if(ev.type == nbl::ui::SMouseEvent::EET_MOVEMENT && mouseDown) + if (ev.type == nbl::ui::SMouseEvent::EET_MOVEMENT && 
mouseDown) { - nbl::hlsl::float32_t4 pos = nbl::core::convertToHLSLVector(getPosition()); - nbl::hlsl::float32_t4 localTarget = nbl::core::convertToHLSLVector(getTarget()) - pos; - - // Get Relative Rotation for localTarget in Radians - float relativeRotationX, relativeRotationY; - relativeRotationY = atan2(localTarget.x, localTarget.z); - const double z1 = nbl::core::sqrt(localTarget.x*localTarget.x + localTarget.z*localTarget.z); - relativeRotationX = atan2(z1, localTarget.y) - nbl::core::PI()/2; - - constexpr float RotateSpeedScale = 0.003f; - relativeRotationX -= ev.movementEvent.relativeMovementY * rotateSpeed * RotateSpeedScale * -1.0f; - float tmpYRot = ev.movementEvent.relativeMovementX * rotateSpeed * RotateSpeedScale * -1.0f; - + // --- corrected camera rotation update --- + nbl::hlsl::float32_t3 pos = nbl::core::convertToHLSLVector(getPosition()).xyz; + nbl::hlsl::float32_t3 targetVec = nbl::core::convertToHLSLVector(getTarget()).xyz - pos; // original vector to target + + // preserve distance so we don't collapse to unit length + float targetDistance = nbl::hlsl::length(targetVec); + if (targetDistance < 1e-6f) targetDistance = 1.0f; // avoid div-by-zero + + nbl::hlsl::float32_t3 forward = nbl::hlsl::normalize(targetVec); + nbl::hlsl::float32_t3 upVector = nbl::core::convertToHLSLVector(getUpVector()).xyz; + nbl::hlsl::float32_t3 right = nbl::hlsl::normalize(nbl::hlsl::cross(upVector, forward)); + nbl::hlsl::float32_t3 correctedForward = nbl::hlsl::normalize(nbl::hlsl::cross(right, upVector)); + + // horizontal yaw (angle from correctedForward towards right) + float rightDot = nbl::hlsl::dot(targetVec, right); + float forwardDot = nbl::hlsl::dot(targetVec, correctedForward); + float relativeRotationY = atan2(rightDot, forwardDot); + + // pitch: angle above/below horizontal + float upDot = nbl::hlsl::dot(targetVec, upVector); + nbl::hlsl::float32_t3 horizontalComponent = targetVec - upVector * upDot; + float horizontalLength = 
nbl::hlsl::length(horizontalComponent); + float relativeRotationX = atan2(upDot, horizontalLength); + + // apply mouse/controller deltas (signs simplified) + constexpr float RotateSpeedScale = 0.003f; + relativeRotationX -= ev.movementEvent.relativeMovementY * rotateSpeed * RotateSpeedScale; + float tmpYRot = ev.movementEvent.relativeMovementX * rotateSpeed * RotateSpeedScale; if (leftHanded) - yawDelta = -yawDelta; + relativeRotationY += tmpYRot; + else + relativeRotationY -= tmpYRot; - // Clamp pitch BEFORE applying rotation + // clamp pitch const float MaxVerticalAngle = nbl::core::radians(88.0f); - float currentPitch = asin(nbl::core::dot(forward, upVector).X); - float newPitch = nbl::core::clamp(currentPitch + pitchDelta, -MaxVerticalAngle, MaxVerticalAngle); - pitchDelta = newPitch - currentPitch; - - // Create rotation quaternions using axis-angle method - nbl::core::quaternion pitchRot = nbl::core::quaternion::fromAngleAxis(pitchDelta, right); - nbl::core::quaternion yawRot = nbl::core::quaternion::fromAngleAxis(yawDelta, upVector); - nbl::core::quaternion combinedRot = yawRot * pitchRot; - - pos.w = 0; - localTarget = nbl::hlsl::float32_t4(0, 0, nbl::core::max(1.f, nbl::hlsl::length(pos)), 1.0f); - - const nbl::hlsl::math::quaternion quat = nbl::hlsl::math::quaternion::create(relativeRotationX, relativeRotationY, 0.0f); - nbl::hlsl::float32_t3x4 mat = nbl::hlsl::math::linalg::promote_affine<3, 4, 3, 3>(quat.__constructMatrix()); + if (relativeRotationX > MaxVerticalAngle) relativeRotationX = MaxVerticalAngle; + if (relativeRotationX < -MaxVerticalAngle) relativeRotationX = -MaxVerticalAngle; + // build final direction by first yaw-rotating in the horizontal plane, then pitching + float cosYaw = cos(relativeRotationY); + float sinYaw = sin(relativeRotationY); + nbl::hlsl::float32_t3 yawForward = correctedForward * cosYaw + right * sinYaw; + yawForward = nbl::hlsl::normalize(yawForward); - localTarget = nbl::hlsl::float32_t4(nbl::hlsl::mul(mat, localTarget), 
1.0f); + float cosPitch = cos(relativeRotationX); + float sinPitch = sin(relativeRotationX); + nbl::hlsl::float32_t3 finalDir = nbl::hlsl::normalize(yawForward * cosPitch + upVector * sinPitch); - nbl::core::vectorSIMDf finalTarget = nbl::core::constructVecorSIMDFromHLSLVector(localTarget + pos); + // restore original distance and set target + nbl::core::vectorSIMDf finalTarget = nbl::core::constructVecorSIMDFromHLSLVector(pos + finalDir * targetDistance); finalTarget.w = 1.0f; setTarget(finalTarget); + } } } void keyboardProcess(const nbl::ui::IKeyboardEventChannel::range_t& events) { - for(uint32_t k = 0; k < E_CAMERA_MOVE_KEYS::ECMK_COUNT; ++k) + for (uint32_t k = 0; k < E_CAMERA_MOVE_KEYS::ECMK_COUNT; ++k) perActionDt[k] = 0.0; /* @@ -210,8 +224,8 @@ class Camera * And If an UP event was sent It will get subtracted it from this value. (Currently Disabled Because we Need better Oracle) */ - for(uint32_t k = 0; k < E_CAMERA_MOVE_KEYS::ECMK_COUNT; ++k) - if(keysDown[k]) + for (uint32_t k = 0; k < E_CAMERA_MOVE_KEYS::ECMK_COUNT; ++k) + if (keysDown[k]) { auto timeDiff = std::chrono::duration_cast(nextPresentationTimeStamp - lastVirtualUpTimeStamp).count(); if (timeDiff < 0) @@ -219,10 +233,10 @@ class Camera perActionDt[k] += timeDiff; } - for (auto eventIt=events.begin(); eventIt!=events.end(); eventIt++) + for (auto eventIt = events.begin(); eventIt != events.end(); eventIt++) { const auto ev = *eventIt; - + // accumulate the periods for which a key was down auto timeDiff = std::chrono::duration_cast(nextPresentationTimeStamp - ev.timeStamp).count(); if (timeDiff < 0) @@ -235,12 +249,12 @@ class Camera if (ev.keyCode == code) { - if (ev.action == nbl::ui::SKeyboardEvent::ECA_PRESSED && !keysDown[logicalKey]) + if (ev.action == nbl::ui::SKeyboardEvent::ECA_PRESSED && !keysDown[logicalKey]) { perActionDt[logicalKey] += timeDiff; keysDown[logicalKey] = true; } - else if (ev.action == nbl::ui::SKeyboardEvent::ECA_RELEASED) + else if (ev.action == 
nbl::ui::SKeyboardEvent::ECA_RELEASED) { // perActionDt[logicalKey] -= timeDiff; keysDown[logicalKey] = false; @@ -264,7 +278,7 @@ class Camera nextPresentationTimeStamp = _nextPresentationTimeStamp; return; } - + void endInputProcessing(std::chrono::microseconds _nextPresentationTimeStamp) { nbl::core::vectorSIMDf pos = getPosition(); @@ -276,13 +290,12 @@ class Camera movedir.makeSafe3D(); movedir = nbl::core::normalize(movedir); - constexpr float MoveSpeedScale = 0.02f; + constexpr float MoveSpeedScale = 0.02f; pos += movedir * perActionDt[E_CAMERA_MOVE_KEYS::ECMK_MOVE_FORWARD] * moveSpeed * MoveSpeedScale; pos -= movedir * perActionDt[E_CAMERA_MOVE_KEYS::ECMK_MOVE_BACKWARD] * moveSpeed * MoveSpeedScale; - // strafing - + // if upvector and vector to the target are the same, we have a // problem. so solve this problem: nbl::core::vectorSIMDf up = nbl::core::normalize(upVector); @@ -293,9 +306,11 @@ class Camera up = nbl::core::normalize(backupUpVector); } - pos += up * perActionDt[E_CAMERA_MOVE_KEYS::ECMK_MOVE_UP] * moveSpeed * MoveSpeedScale; - pos -= up * perActionDt[E_CAMERA_MOVE_KEYS::ECMK_MOVE_DOWN] * moveSpeed * MoveSpeedScale; + nbl::core::vectorSIMDf currentUp = nbl::core::normalize(nbl::core::cross(localTarget, nbl::core::cross(up, localTarget))); + pos += currentUp * perActionDt[E_CAMERA_MOVE_KEYS::ECMK_MOVE_UP] * moveSpeed * MoveSpeedScale; + pos -= currentUp * perActionDt[E_CAMERA_MOVE_KEYS::ECMK_MOVE_DOWN] * moveSpeed * MoveSpeedScale; + // strafing nbl::core::vectorSIMDf strafevect = localTarget; if (leftHanded) strafevect = nbl::core::cross(strafevect, up); @@ -311,7 +326,7 @@ class Camera firstUpdate = false; setPosition(pos); - setTarget(localTarget+pos); + setTarget(localTarget + pos); lastVirtualUpTimeStamp = nextPresentationTimeStamp; } @@ -324,10 +339,10 @@ class Camera private: inline void initDefaultKeysMap() { mapKeysToWASD(); } - - inline void allKeysUp() + + inline void allKeysUp() { - for (uint32_t i=0; i< 
E_CAMERA_MOVE_KEYS::ECMK_COUNT; ++i) + for (uint32_t i = 0; i < E_CAMERA_MOVE_KEYS::ECMK_COUNT; ++i) keysDown[i] = false; mouseDown = false; @@ -340,7 +355,7 @@ class Camera float moveSpeed, rotateSpeed; bool leftHanded, firstUpdate = true, mouseDown = false; - + std::array keysMap = { {nbl::ui::EKC_NONE} }; // map camera E_CAMERA_MOVE_KEYS to corresponding Nabla key codes, by default camera uses WSAD to move // TODO: make them use std::array bool keysDown[E_CAMERA_MOVE_KEYS::ECMK_COUNT] = {}; From 5eeb47351e29cb6ee02f8d8319f131a2c012b5a2 Mon Sep 17 00:00:00 2001 From: devshgraphicsprogramming Date: Thu, 16 Apr 2026 14:58:47 +0200 Subject: [PATCH 18/26] make NEE work in ex 31 with Global L solid angle sampling of spherical rect --- .../hlsl/next_event_estimator.hlsl | 53 +++++++++++-------- 1 file changed, 31 insertions(+), 22 deletions(-) diff --git a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl index c8bee786c..29aca1824 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl @@ -246,6 +246,7 @@ template struct ShapeSampling { using scalar_type = T; + using vector2_type = vector; using vector3_type = vector; static ShapeSampling create(NBL_CONST_REF_ARG(Shape) rect) @@ -262,48 +263,56 @@ struct ShapeSampling matrix rectNormalBasis; vector rectExtents; rect.getNormalBasis(rectNormalBasis, rectExtents); + shapes::SphericalRectangle sphR0; sphR0.origin = rect.offset; sphR0.extents = rectExtents; sphR0.basis = rectNormalBasis; - scalar_type solidAngle = sphR0.solidAngle(ray.origin).value; - if (solidAngle > numeric_limits::min) - pdf = 1.f / solidAngle; - else - pdf = bit_cast(numeric_limits::infinity); - return pdf; + + // 1.f/0.f gives infinity no special checks needed + return 1.f / sphR0.solidAngle(ray.origin).value; } template vector3_type generate_and_pdf(NBL_REF_ARG(scalar_type) pdf, 
NBL_REF_ARG(scalar_type) newRayMaxT, NBL_CONST_REF_ARG(vector3_type) origin, NBL_CONST_REF_ARG(Aniso) interaction, NBL_CONST_REF_ARG(vector3_type) xi) { - const vector3_type N = rect.getNormalTimesArea(); - const vector3_type origin2origin = rect.offset - origin; - matrix rectNormalBasis; vector rectExtents; rect.getNormalBasis(rectNormalBasis, rectExtents); + shapes::SphericalRectangle sphR0; sphR0.origin = rect.offset; sphR0.extents = rectExtents; sphR0.basis = rectNormalBasis; - vector3_type L = hlsl::promote(0.0); + // sampling::SphericalRectangle ssph = sampling::SphericalRectangle::create(sphR0, origin); - if ( ssph.solidAngle > numeric_limits::min) + typename sampling::SphericalRectangle::cache_type cache; + + const vector3_type origin2origin = rect.offset - origin; + vector3_type L = hlsl::promote(0.0); + const bool FastVersion = true; + if (FastVersion) { - typename sampling::SphericalRectangle::cache_type cache; - const vector3_type localDir = ssph.generate(xi.xy, cache); - // not sure if generate() can produce NaN/inf when solidAngle > min - assert(!hlsl::any(hlsl::isinf(localDir) || hlsl::isnan(localDir))); - // transform local direction to world space - L = localDir.x * rectNormalBasis[0] + localDir.y * rectNormalBasis[1] + localDir.z * rectNormalBasis[2]; - pdf = ssph.forwardPdf(xi.xy, cache); + // actually the slowest + //L = ssph.generate(xi.xy, cache); + //newRayMaxT = ssph.computeHitT(L); + + // fastest + const vector3_type localL = ssph.generateNormalizedLocal(xi.xy,cache,newRayMaxT); + L = hlsl::mul(hlsl::transpose(ssph.basis),localL); } else - pdf = bit_cast(numeric_limits::infinity); + { + L = ssph.generateUnnormalized(xi.xy,cache); + const scalar_type rcpLen = hlsl::rsqrt(hlsl::dot(L,L)); + newRayMaxT = 1.f / rcpLen; + L *= rcpLen; + } + // prevent self intersections against the emitter + newRayMaxT -= 0.0001f; - newRayMaxT = hlsl::dot(N, origin2origin) / hlsl::dot(N, L); + pdf = ssph.forwardPdf(xi.xy,cache); return L; } @@ -322,7 +331,7 @@ 
struct EffectivePolygonMethod NBL_CONSTEXPR_STATIC_INLINE NEEPolygonMethod value = PPM_SOLID_ANGLE; }; - +#if 0 // Projected solid angle NEE for rectangles using "Practical Warps": // bilinear warp over 4-corner NdotL + spherical rectangle sampling. // Same grazing-angle limitations as the triangle variant -- see comments @@ -398,7 +407,7 @@ struct ShapeSampling Shape rect; }; - +#endif template struct NextEventEstimator From 89ecce14443c216b30ff84b837b899045bb5513f Mon Sep 17 00:00:00 2001 From: devshgraphicsprogramming Date: Fri, 17 Apr 2026 03:26:32 +0200 Subject: [PATCH 19/26] prep for rendering with PSA rectangle --- .../hlsl/next_event_estimator.hlsl | 54 +++++++++---------- 1 file changed, 26 insertions(+), 28 deletions(-) diff --git a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl index 29aca1824..91d2a2d5e 100644 --- a/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl +++ b/31_HLSLPathTracer/app_resources/hlsl/next_event_estimator.hlsl @@ -177,9 +177,7 @@ struct ShapeSampling const vector3_type tri_vertices[3] = {tri.vertex0, tri.vertex1, tri.vertex2}; shapes::SphericalTriangle st = shapes::SphericalTriangle::create(tri_vertices, ray.origin); sampling::ProjectedSphericalTriangle pst = sampling::ProjectedSphericalTriangle::create(st, ray.normalAtOrigin, ray.wasBSDFAtOrigin); - const scalar_type pdf = pst.backwardPdf(L); - // if `pdf` is NAN then the triangle's projected solid angle was close to 0.0, if its close to INF then the triangle was very small - return pdf < numeric_limits::max ? pdf : numeric_limits::max; + return pst.backwardWeight(L); } template @@ -331,7 +329,6 @@ struct EffectivePolygonMethod NBL_CONSTEXPR_STATIC_INLINE NEEPolygonMethod value = PPM_SOLID_ANGLE; }; -#if 0 // Projected solid angle NEE for rectangles using "Practical Warps": // bilinear warp over 4-corner NdotL + spherical rectangle sampling. 
// Same grazing-angle limitations as the triangle variant -- see comments @@ -361,21 +358,12 @@ struct ShapeSampling sphR0.extents = rectExtents; sphR0.basis = rectNormalBasis; sampling::ProjectedSphericalRectangle psr = sampling::ProjectedSphericalRectangle::create(sphR0, ray.origin, ray.normalAtOrigin, ray.wasBSDFAtOrigin); - // Reconstruct normalized [0,1]^2 position on the rectangle from the ray direction - const vector3_type N = rect.getNormalTimesArea(); - const scalar_type t = hlsl::dot(N, rect.offset - ray.origin) / hlsl::dot(N, ray.direction); - const vector3_type hitPoint = ray.origin + ray.direction * t; - const vector3_type localHit = hitPoint - rect.offset; - const vector p = vector(hlsl::dot(localHit, rectNormalBasis[0]) / rectExtents.x, hlsl::dot(localHit, rectNormalBasis[1]) / rectExtents.y); - const scalar_type pdf = psr.backwardPdf(p); - return pdf < numeric_limits::max ? pdf : numeric_limits::max; + return psr.backwardWeight(ray.direction); } template vector3_type generate_and_pdf(NBL_REF_ARG(scalar_type) pdf, NBL_REF_ARG(scalar_type) newRayMaxT, NBL_CONST_REF_ARG(vector3_type) origin, NBL_CONST_REF_ARG(Aniso) interaction, NBL_CONST_REF_ARG(vector3_type) xi) { - const vector3_type N = rect.getNormalTimesArea(); - const vector3_type origin2origin = rect.offset - origin; matrix rectNormalBasis; vector rectExtents; @@ -384,30 +372,40 @@ struct ShapeSampling sphR0.origin = rect.offset; sphR0.extents = rectExtents; sphR0.basis = rectNormalBasis; - vector3_type L = hlsl::promote(0.0); sampling::ProjectedSphericalRectangle psr = sampling::ProjectedSphericalRectangle::create(sphR0, origin, interaction.getN(), interaction.isMaterialBSDF()); - const scalar_type solidAngle = psr.sphrect.solidAngle; - if (solidAngle > numeric_limits::min) + typename sampling::ProjectedSphericalRectangle::cache_type cache; + + const vector3_type origin2origin = rect.offset - origin; + vector3_type L = hlsl::promote(0.0); + const bool FastVersion = true; + if (FastVersion) { - 
typename sampling::ProjectedSphericalRectangle::cache_type cache; - const vector3_type localDir = psr.generate(xi.xy, cache); - // not sure if generate() can produce NaN/inf when solidAngle > min - assert(!hlsl::any(hlsl::isinf(localDir) || hlsl::isnan(localDir))); - // transform local direction to world space - L = localDir.x * rectNormalBasis[0] + localDir.y * rectNormalBasis[1] + localDir.z * rectNormalBasis[2]; - pdf = psr.forwardPdf(xi.xy, cache); + // actually the slowest + //L = psr.generate(xi.xy, cache); + //newRayMaxT = psr.sphrect.computeHitT(L); + + // fastest + const vector3_type localL = psr.generateNormalizedLocal(xi.xy,cache,newRayMaxT); + // hopefully CSE kicks in for the `UsePdfAsWeight==true` + L = hlsl::mul(hlsl::transpose(psr.sphrect.basis),localL); } else - pdf = bit_cast(numeric_limits::infinity); - - newRayMaxT = hlsl::dot(N, origin2origin) / hlsl::dot(N, L); + { + L = psr.generateUnnormalized(xi.xy,cache); + const scalar_type rcpLen = hlsl::rsqrt(hlsl::dot(L,L)); + newRayMaxT = 1.f / rcpLen; + L *= rcpLen; + } + // prevent self intersections against the emitter + newRayMaxT -= 0.0001f; + + pdf = psr.forwardPdf(xi.xy,cache); return L; } Shape rect; }; -#endif template struct NextEventEstimator From fb5cfa2bcaa0a92aafb429f3d390658d28d1ca02 Mon Sep 17 00:00:00 2001 From: Karim Mohamed Date: Wed, 22 Apr 2026 01:16:12 +0300 Subject: [PATCH 20/26] jacobian tests, better benchmarks, addressed comments --- 37_HLSLSamplingTests/CMakeLists.txt | 172 ++++++- .../app_resources/common/alias_table.hlsl | 2 + .../app_resources/common/array_accessor.hlsl | 1 - .../app_resources/common/bilinear.hlsl | 6 + .../common/box_muller_transform.hlsl | 3 + .../common/concentric_mapping.hlsl | 12 +- .../common/cumulative_probability.hlsl | 2 + .../common/discrete_sampler_bench.hlsl | 3 - .../app_resources/common/jacobian_test.hlsl | 264 ++++++++++ .../app_resources/common/linear.hlsl | 3 + .../app_resources/common/polar_mapping.hlsl | 20 +- 
.../common/projected_hemisphere.hlsl | 8 +- .../common/projected_sphere.hlsl | 3 + .../common/projected_spherical_rectangle.hlsl | 44 +- .../common/projected_spherical_triangle.hlsl | 21 +- .../common/spherical_rectangle.hlsl | 16 +- .../common/spherical_triangle.hlsl | 17 +- .../common/uniform_hemisphere.hlsl | 8 +- .../app_resources/common/uniform_sphere.hlsl | 9 +- .../shaders/alias_table_test.comp.hlsl | 11 +- .../shaders/bilinear_test.comp.hlsl | 28 +- .../box_muller_transform_test.comp.hlsl | 28 +- .../shaders/concentric_mapping_test.comp.hlsl | 20 +- .../cumulative_probability_test.comp.hlsl | 6 +- .../shaders/linear_test.comp.hlsl | 28 +- .../shaders/polar_mapping_test.comp.hlsl | 20 +- .../projected_hemisphere_test.comp.hlsl | 20 +- .../shaders/projected_sphere_test.comp.hlsl | 20 +- ...ojected_spherical_rectangle_test.comp.hlsl | 54 +- ...rojected_spherical_triangle_test.comp.hlsl | 41 +- .../spherical_rectangle_test.comp.hlsl | 102 +++- .../shaders/spherical_triangle.comp.hlsl | 40 +- .../shaders/test_compile.comp.hlsl | 78 ++- .../shaders/uniform_hemisphere_test.comp.hlsl | 20 +- .../shaders/uniform_sphere_test.comp.hlsl | 20 +- .../benchmarks/CDiscreteSamplerBenchmark.h | 391 +++++++------- .../benchmarks/CSamplerBenchmark.h | 6 +- 37_HLSLSamplingTests/main.cpp | 206 +++++--- .../tests/CAliasTableGPUTester.h | 1 + 37_HLSLSamplingTests/tests/CBilinearTester.h | 5 +- .../tests/CBoxMullerTransformTester.h | 1 + .../tests/CConcentricMappingTester.h | 3 +- .../tests/CCumulativeProbabilityGPUTester.h | 1 + 37_HLSLSamplingTests/tests/CLinearTester.h | 9 +- .../tests/CPolarMappingTester.h | 3 +- .../tests/CProjectedHemisphereTester.h | 7 +- .../tests/CProjectedSphereTester.h | 5 +- .../CProjectedSphericalRectangleTester.h | 81 ++- .../tests/CProjectedSphericalTriangleTester.h | 30 +- .../tests/CSphericalRectangleTester.h | 33 +- .../tests/CSphericalTriangleTester.h | 13 +- .../tests/CUniformHemisphereTester.h | 3 +- .../tests/CUniformSphereTester.h | 3 +- 
.../tests/SamplerTestHelpers.h | 482 ++++++++++++------ .../tests/property/CSamplerPropertyTester.h | 220 +++++--- 55 files changed, 1869 insertions(+), 784 deletions(-) create mode 100644 37_HLSLSamplingTests/app_resources/common/jacobian_test.hlsl diff --git a/37_HLSLSamplingTests/CMakeLists.txt b/37_HLSLSamplingTests/CMakeLists.txt index 2ac238c33..12cbb5bb1 100644 --- a/37_HLSLSamplingTests/CMakeLists.txt +++ b/37_HLSLSamplingTests/CMakeLists.txt @@ -113,8 +113,13 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/linear_test.comp.hlsl\", - \"KEY\": \"linear_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": \"linear_bench_1_1\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/linear_test.comp.hlsl\", + \"KEY\": \"linear_bench_1_16\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] }, { \"INPUT\": \"app_resources/shaders/uniform_hemisphere_test.comp.hlsl\", @@ -122,8 +127,13 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/uniform_hemisphere_test.comp.hlsl\", - \"KEY\": \"uniform_hemisphere_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": \"uniform_hemisphere_bench_1_1\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/uniform_hemisphere_test.comp.hlsl\", + \"KEY\": \"uniform_hemisphere_bench_1_16\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] }, { \"INPUT\": \"app_resources/shaders/uniform_sphere_test.comp.hlsl\", @@ -131,8 +141,13 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/uniform_sphere_test.comp.hlsl\", - \"KEY\": \"uniform_sphere_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": \"uniform_sphere_bench_1_1\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/uniform_sphere_test.comp.hlsl\", + \"KEY\": \"uniform_sphere_bench_1_16\", + 
\"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] }, { \"INPUT\": \"app_resources/shaders/projected_hemisphere_test.comp.hlsl\", @@ -140,8 +155,13 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/projected_hemisphere_test.comp.hlsl\", - \"KEY\": \"projected_hemisphere_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": \"projected_hemisphere_bench_1_1\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/projected_hemisphere_test.comp.hlsl\", + \"KEY\": \"projected_hemisphere_bench_1_16\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] }, { \"INPUT\": \"app_resources/shaders/projected_sphere_test.comp.hlsl\", @@ -149,8 +169,13 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/projected_sphere_test.comp.hlsl\", - \"KEY\": \"projected_sphere_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": \"projected_sphere_bench_1_1\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/projected_sphere_test.comp.hlsl\", + \"KEY\": \"projected_sphere_bench_1_16\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] }, { \"INPUT\": \"app_resources/shaders/spherical_triangle.comp.hlsl\", @@ -158,8 +183,18 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/spherical_triangle.comp.hlsl\", - \"KEY\": \"spherical_triangle_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": \"spherical_triangle_bench_1_1\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/spherical_triangle.comp.hlsl\", + \"KEY\": \"spherical_triangle_bench_1_16\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] + }, + { + \"INPUT\": \"app_resources/shaders/spherical_triangle.comp.hlsl\", + \"KEY\": \"spherical_triangle_bench_create_only\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, 
\"-DBENCH_CREATE_ONLY\"] }, { \"INPUT\": \"app_resources/shaders/concentric_mapping_test.comp.hlsl\", @@ -167,8 +202,13 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/concentric_mapping_test.comp.hlsl\", - \"KEY\": \"concentric_mapping_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": \"concentric_mapping_bench_1_1\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/concentric_mapping_test.comp.hlsl\", + \"KEY\": \"concentric_mapping_bench_1_16\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] }, { \"INPUT\": \"app_resources/shaders/polar_mapping_test.comp.hlsl\", @@ -176,8 +216,13 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/polar_mapping_test.comp.hlsl\", - \"KEY\": \"polar_mapping_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": \"polar_mapping_bench_1_1\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/polar_mapping_test.comp.hlsl\", + \"KEY\": \"polar_mapping_bench_1_16\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] }, { \"INPUT\": \"app_resources/shaders/bilinear_test.comp.hlsl\", @@ -185,8 +230,13 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/bilinear_test.comp.hlsl\", - \"KEY\": \"bilinear_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": \"bilinear_bench_1_1\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/bilinear_test.comp.hlsl\", + \"KEY\": \"bilinear_bench_1_16\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] }, { \"INPUT\": \"app_resources/shaders/box_muller_transform_test.comp.hlsl\", @@ -194,8 +244,13 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/box_muller_transform_test.comp.hlsl\", - \"KEY\": \"box_muller_transform_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": 
\"box_muller_transform_bench_1_1\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/box_muller_transform_test.comp.hlsl\", + \"KEY\": \"box_muller_transform_bench_1_16\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] }, { \"INPUT\": \"app_resources/shaders/projected_spherical_triangle_test.comp.hlsl\", @@ -203,8 +258,18 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/projected_spherical_triangle_test.comp.hlsl\", - \"KEY\": \"projected_spherical_triangle_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": \"projected_spherical_triangle_bench_1_1\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/projected_spherical_triangle_test.comp.hlsl\", + \"KEY\": \"projected_spherical_triangle_bench_1_16\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] + }, + { + \"INPUT\": \"app_resources/shaders/projected_spherical_triangle_test.comp.hlsl\", + \"KEY\": \"projected_spherical_triangle_bench_create_only\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_CREATE_ONLY\"] }, { \"INPUT\": \"app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl\", @@ -212,8 +277,18 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl\", - \"KEY\": \"projected_spherical_rectangle_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": \"projected_spherical_rectangle_bench_1_1\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl\", + \"KEY\": \"projected_spherical_rectangle_bench_1_16\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] + }, + { + \"INPUT\": \"app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl\", + \"KEY\": \"projected_spherical_rectangle_bench_create_only\", + 
\"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_CREATE_ONLY\"] }, { \"INPUT\": \"app_resources/shaders/spherical_rectangle_test.comp.hlsl\", @@ -221,8 +296,48 @@ set(JSON " }, { \"INPUT\": \"app_resources/shaders/spherical_rectangle_test.comp.hlsl\", - \"KEY\": \"spherical_rectangle_bench\", - \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + \"KEY\": \"spherical_rectangle_bench_1_1_shape_observer\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\"] + }, + { + \"INPUT\": \"app_resources/shaders/spherical_rectangle_test.comp.hlsl\", + \"KEY\": \"spherical_rectangle_bench_1_1_sa_extents\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\", \"-DBENCH_VARIANT_SA_EXTENTS\"] + }, + { + \"INPUT\": \"app_resources/shaders/spherical_rectangle_test.comp.hlsl\", + \"KEY\": \"spherical_rectangle_bench_1_1_r0_extents\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=1\", \"-DBENCH_VARIANT_R0_EXTENTS\"] + }, + { + \"INPUT\": \"app_resources/shaders/spherical_rectangle_test.comp.hlsl\", + \"KEY\": \"spherical_rectangle_bench_1_16_shape_observer\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\"] + }, + { + \"INPUT\": \"app_resources/shaders/spherical_rectangle_test.comp.hlsl\", + \"KEY\": \"spherical_rectangle_bench_1_16_sa_extents\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\", \"-DBENCH_VARIANT_SA_EXTENTS\"] + }, + { + \"INPUT\": \"app_resources/shaders/spherical_rectangle_test.comp.hlsl\", + \"KEY\": \"spherical_rectangle_bench_1_16_r0_extents\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_SAMPLES_PER_CREATE=16\", \"-DBENCH_VARIANT_R0_EXTENTS\"] + }, + { + \"INPUT\": \"app_resources/shaders/spherical_rectangle_test.comp.hlsl\", + \"KEY\": \"spherical_rectangle_bench_create_only_shape_observer\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_CREATE_ONLY\"] + }, + { + \"INPUT\": \"app_resources/shaders/spherical_rectangle_test.comp.hlsl\", + \"KEY\": 
\"spherical_rectangle_bench_create_only_sa_extents\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_CREATE_ONLY\", \"-DBENCH_VARIANT_SA_EXTENTS\"] + }, + { + \"INPUT\": \"app_resources/shaders/spherical_rectangle_test.comp.hlsl\", + \"KEY\": \"spherical_rectangle_bench_create_only_r0_extents\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_CREATE_ONLY\", \"-DBENCH_VARIANT_R0_EXTENTS\"] }, { \"INPUT\": \"app_resources/shaders/alias_table_test.comp.hlsl\", @@ -241,6 +356,11 @@ set(JSON " \"INPUT\": \"app_resources/shaders/cumulative_probability_test.comp.hlsl\", \"KEY\": \"cumulative_probability_bench\", \"COMPILE_OPTIONS\": [${BENCH_OPTS}] + }, + { + \"INPUT\": \"app_resources/shaders/cumulative_probability_test.comp.hlsl\", + \"KEY\": \"cumulative_probability_yolo_bench\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DNBL_CUMPROB_YOLO_READS\"] } ] ") diff --git a/37_HLSLSamplingTests/app_resources/common/alias_table.hlsl b/37_HLSLSamplingTests/app_resources/common/alias_table.hlsl index da7048a1f..bb1ed54ef 100644 --- a/37_HLSLSamplingTests/app_resources/common/alias_table.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/alias_table.hlsl @@ -27,6 +27,7 @@ struct AliasTableTestResults float32_t backwardPdf; float32_t forwardWeight; float32_t backwardWeight; + float32_t jacobianProduct; }; // Pre-computed alias table for weights {1, 2, 3, 4}: @@ -63,6 +64,7 @@ struct AliasTableTestExecutor output.backwardPdf = sampler.backwardPdf(output.generatedIndex); output.forwardWeight = sampler.forwardWeight(input.u, cache); output.backwardWeight = sampler.backwardWeight(output.generatedIndex); + output.jacobianProduct = (float32_t(1.0) / output.forwardPdf) * output.backwardPdf; } }; diff --git a/37_HLSLSamplingTests/app_resources/common/array_accessor.hlsl b/37_HLSLSamplingTests/app_resources/common/array_accessor.hlsl index 1f0a68195..5e679c98a 100644 --- a/37_HLSLSamplingTests/app_resources/common/array_accessor.hlsl +++ 
b/37_HLSLSamplingTests/app_resources/common/array_accessor.hlsl @@ -12,7 +12,6 @@ struct ArrayAccessor using value_type = T; template void get(I i, NBL_REF_ARG(V) val) NBL_CONST_MEMBER_FUNC { val = V(data[i]); } - T operator[](uint32_t i) NBL_CONST_MEMBER_FUNC { return data[i]; } T data[N]; }; diff --git a/37_HLSLSamplingTests/app_resources/common/bilinear.hlsl b/37_HLSLSamplingTests/app_resources/common/bilinear.hlsl index 64a13d3e1..752e547ce 100644 --- a/37_HLSLSamplingTests/app_resources/common/bilinear.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/bilinear.hlsl @@ -3,6 +3,7 @@ #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -19,6 +20,7 @@ struct BilinearTestResults float32_t forwardPdf; float32_t forwardWeight; float32_t backwardWeight; + float32_t jacobianProduct; }; struct BilinearTestExecutor @@ -37,6 +39,10 @@ struct BilinearTestExecutor output.backwardPdf = sampler.backwardPdf(output.generated); output.backwardWeight = sampler.backwardWeight(output.generated); } + // marginFactor = 3: same reasoning as Linear; Bilinear is two Linear stages, so the skewed- + // coefficient inverse-CDF d^2/du^2 divergence near [0,1]^2 boundary applies on both axes. 
+ output.jacobianProduct = computeJacobianProduct(sampler, input.u, 1e-3f, 3.0f); + } }; diff --git a/37_HLSLSamplingTests/app_resources/common/box_muller_transform.hlsl b/37_HLSLSamplingTests/app_resources/common/box_muller_transform.hlsl index e8247e259..2b86e8560 100644 --- a/37_HLSLSamplingTests/app_resources/common/box_muller_transform.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/box_muller_transform.hlsl @@ -3,6 +3,7 @@ #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -21,6 +22,7 @@ struct BoxMullerTransformTestResults float32_t forwardWeight; float32_t backwardWeight; float32_t2 separateBackwardPdf; + float32_t jacobianProduct; }; struct BoxMullerTransformTestExecutor @@ -40,6 +42,7 @@ struct BoxMullerTransformTestExecutor output.backwardPdf = sampler.backwardPdf(output.generated); output.backwardWeight = sampler.backwardWeight(output.generated); output.separateBackwardPdf = sampler.separateBackwardPdf(output.generated); + output.jacobianProduct = computeJacobianProduct(sampler, input.u, 1e-3f, 10.0f); } }; diff --git a/37_HLSLSamplingTests/app_resources/common/concentric_mapping.hlsl b/37_HLSLSamplingTests/app_resources/common/concentric_mapping.hlsl index 67d8e5869..e0c6a570c 100644 --- a/37_HLSLSamplingTests/app_resources/common/concentric_mapping.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/concentric_mapping.hlsl @@ -3,6 +3,7 @@ #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -20,6 +21,7 @@ struct ConcentricMappingTestResults float32_t forwardWeight; float32_t backwardWeight; float32_t jacobianProduct; + float32_t inverseJacobianPdf; float32_t2 roundtripError; }; @@ -39,7 +41,15 @@ struct ConcentricMappingTestExecutor output.backwardWeight = sampling::ConcentricMapping::backwardWeight(input.u); } output.roundtripError = nbl::hlsl::abs(input.u - output.inverted); - output.jacobianProduct = float32_t(1.0 / output.backwardPdf) * output.forwardPdf; + { + sampling::ConcentricMapping 
sampler; + output.jacobianProduct = computeJacobianProduct(sampler, input.u, 1e-3f, 1.0f); + // Disk-center singularity: concentric atan2 blows up as r->0. + const float32_t diskRadius = nbl::hlsl::length(output.mapped); + output.inverseJacobianPdf = diskRadius < 0.1f + ? JACOBIAN_SKIP_CODOMAIN_SINGULARITY + : computeInverseJacobianPdf(sampler, output.mapped, output.backwardPdf, 0.0f, 1e30f); + } } }; diff --git a/37_HLSLSamplingTests/app_resources/common/cumulative_probability.hlsl b/37_HLSLSamplingTests/app_resources/common/cumulative_probability.hlsl index f58a22741..e66cb44fe 100644 --- a/37_HLSLSamplingTests/app_resources/common/cumulative_probability.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/cumulative_probability.hlsl @@ -24,6 +24,7 @@ struct CumProbTestResults float32_t backwardPdf; float32_t forwardWeight; float32_t backwardWeight; + float32_t jacobianProduct; }; // Pre-computed CDF table for weights {1, 2, 3, 4}: @@ -46,6 +47,7 @@ struct CumProbTestExecutor output.backwardPdf = sampler.backwardPdf(output.generatedIndex); output.forwardWeight = sampler.forwardWeight(input.u, cache); output.backwardWeight = sampler.backwardWeight(output.generatedIndex); + output.jacobianProduct = (float32_t(1.0) / output.forwardPdf) * output.backwardPdf; } }; diff --git a/37_HLSLSamplingTests/app_resources/common/discrete_sampler_bench.hlsl b/37_HLSLSamplingTests/app_resources/common/discrete_sampler_bench.hlsl index 9f1fec422..d5c1d313c 100644 --- a/37_HLSLSamplingTests/app_resources/common/discrete_sampler_bench.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/discrete_sampler_bench.hlsl @@ -5,9 +5,6 @@ using namespace nbl::hlsl; -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 -#endif NBL_CONSTEXPR uint32_t WorkgroupSize = WORKGROUP_SIZE; struct AliasTablePushConstants diff --git a/37_HLSLSamplingTests/app_resources/common/jacobian_test.hlsl b/37_HLSLSamplingTests/app_resources/common/jacobian_test.hlsl new file mode 100644 index 000000000..f949f5b86 --- 
/dev/null +++ b/37_HLSLSamplingTests/app_resources/common/jacobian_test.hlsl @@ -0,0 +1,264 @@ +#ifndef _NBL_EXAMPLES_TESTS_37_SAMPLING_COMMON_JACOBIAN_TEST_INCLUDED_ +#define _NBL_EXAMPLES_TESTS_37_SAMPLING_COMMON_JACOBIAN_TEST_INCLUDED_ + +#include +#include + +using namespace nbl::hlsl; + +// Negative sentinels signal "skipped" to the host verifier; the value encodes the reason. +static const float32_t JACOBIAN_SKIP_U_DOMAIN = -1.0f; +static const float32_t JACOBIAN_SKIP_CREASE = -2.0f; +static const float32_t JACOBIAN_SKIP_HEMI_BOUNDARY = -3.0f; +static const float32_t JACOBIAN_SKIP_BWD_PDF_RANGE = -4.0f; +static const float32_t JACOBIAN_SKIP_CODOMAIN_SINGULARITY = -5.0f; + + +template +struct ForwardJacobianMeasure; + +// Signed step that stays inside [0,1]: flip direction when u is in the upper half so u +/- eps +// never overshoots the domain. Magnitude is what matters (the stencil results take abs/length). +template +T signedEps(T u, T eps) +{ + return u > T(0.5) ? -eps : eps; +} + +template +struct ForwardJacobianMeasure +{ + using scalar_type = typename Sampler::scalar_type; + using domain_type = typename Sampler::domain_type; + using codomain_type = typename Sampler::codomain_type; + using cache_type = typename Sampler::cache_type; + + static scalar_type compute(Sampler _sampler, domain_type u, scalar_type eps, codomain_type L) + { + cache_type c; + const codomain_type L_x = _sampler.generate(u + signedEps(u, eps), c); + return nbl::hlsl::abs(L_x - L) / eps; + } +}; + +template +struct ForwardJacobianMeasure +{ + using scalar_type = typename Sampler::scalar_type; + using domain_type = typename Sampler::domain_type; + using codomain_type = typename Sampler::codomain_type; + using cache_type = typename Sampler::cache_type; + + static scalar_type compute(Sampler _sampler, domain_type u, scalar_type eps, codomain_type L) + { + domain_type u_x = u; + u_x[0] += signedEps(u[0], eps); + domain_type u_y = u; + u_y[1] += signedEps(u[1], eps); + cache_type c; + 
const codomain_type L_x = _sampler.generate(u_x, c); + const codomain_type L_y = _sampler.generate(u_y, c); + using matrix2_type = matrix; + const scalar_type det = nbl::hlsl::determinant(matrix2_type(L_x - L, L_y - L)); + return nbl::hlsl::abs(det) / (eps * eps); + } +}; + +template +struct ForwardJacobianMeasure +{ + using scalar_type = typename Sampler::scalar_type; + using domain_type = typename Sampler::domain_type; + using codomain_type = typename Sampler::codomain_type; + using cache_type = typename Sampler::cache_type; + + static scalar_type compute(Sampler _sampler, domain_type u, scalar_type eps, codomain_type L) + { + domain_type u_x = u; + u_x[0] += signedEps(u[0], eps); + domain_type u_y = u; + u_y[1] += signedEps(u[1], eps); + cache_type c; + const codomain_type L_x = _sampler.generate(u_x, c); + const codomain_type L_y = _sampler.generate(u_y, c); + return nbl::hlsl::length(nbl::hlsl::cross(L_x - L, L_y - L)) / (eps * eps); + } +}; + +// 3D domain: stencil perturbs u[0] and u[1] only, so the (2,3) body applies unchanged. +template +struct ForwardJacobianMeasure : ForwardJacobianMeasure +{ +}; + + +template +struct DomainMarginCheck; + +template +struct DomainMarginCheck +{ + using scalar_type = typename Sampler::scalar_type; + using domain_type = typename Sampler::domain_type; + static bool outsideMargin(domain_type u, scalar_type margin) + { + return u < margin || u > scalar_type(1) - margin; + } +}; + +template +struct DomainMarginCheck +{ + using scalar_type = typename Sampler::scalar_type; + using domain_type = typename Sampler::domain_type; + static bool outsideMargin(domain_type u, scalar_type margin) + { + return u[0] < margin || u[0] > scalar_type(1) - margin || u[1] < margin || u[1] > scalar_type(1) - margin; + } +}; + +// 3D domain: forward stencil only perturbs u[0] and u[1], so u[2] is irrelevant and (2) applies. 
+template +struct DomainMarginCheck : DomainMarginCheck +{ +}; + +enum JacobianMode : uint32_t +{ + JACOBIAN_PLAIN = 0, + JACOBIAN_CONCENTRIC = 1, // + concentric crease skip + JACOBIAN_CONCENTRIC_UXFOLD = 2 // + crease + u.x=0.5 hemi-boundary skip +}; + +// marginFactor scales the u-domain skip to marginFactor * eps. Use > 1 only for samplers whose +// stencil bias extends past a single eps-step (e.g. Arvo spherical triangle: sinZ ~ sqrt(u.y) +// gives O(h/u.y) forward-diff bias, so u.y in [0, k*eps] must be skipped). +template +float32_t computeJacobianProduct(Sampler _sampler, typename Sampler::domain_type u, float32_t eps, float32_t marginFactor) +{ + using scalar_type = typename Sampler::scalar_type; + using domain_type = typename Sampler::domain_type; + using codomain_type = typename Sampler::codomain_type; + using cache_type = typename Sampler::cache_type; + + NBL_IF_CONSTEXPR(Mode != JACOBIAN_PLAIN) + { + // Cast via float32_t2 so this block typechecks for scalar / vec2 / vec3 domains alike + // (HLSL splats scalars, identity on vec2, .xy on vec3). 1D samplers never reach here. + const float32_t2 uxy = (float32_t2)u; + const float32_t ux = uxy.x; + const float32_t uy = uxy.y; + + NBL_IF_CONSTEXPR(Mode == JACOBIAN_CONCENTRIC_UXFOLD) + { + if (nbl::hlsl::abs(ux - float32_t(0.5)) <= float32_t(2e-3)) + return JACOBIAN_SKIP_HEMI_BOUNDARY; + } + + const bool uxFold = (Mode == JACOBIAN_CONCENTRIC_UXFOLD); + // Empirical: the concentric C0 crease's stencil bias spreads wider than the 2*eps geometric + // straddle band. Non-uxFold 6e-3 covers the disk-center residual for Projected samplers; + // uxFold 1e-2 accounts for the doubled local_ux rate when u.x is folded. + const float32_t creaseBand = uxFold ? float32_t(1e-2) : float32_t(6e-3); + const float32_t local_ux = uxFold ? 
nbl::hlsl::abs(float32_t(2) * ux - float32_t(1)) : ux; + const float32_t a = float32_t(2) * local_ux - float32_t(1); + const float32_t b = float32_t(2) * uy - float32_t(1); + if (nbl::hlsl::abs(nbl::hlsl::abs(a) - nbl::hlsl::abs(b)) <= creaseBand) + return JACOBIAN_SKIP_CREASE; + } + + using margin_check_type = DomainMarginCheck::Dimension>; + if (margin_check_type::outsideMargin(u, scalar_type(eps * marginFactor))) + return JACOBIAN_SKIP_U_DOMAIN; + + // Generate on a copy: some samplers mutate u through NBL_REF_ARG (e.g. ProjectedSphere + // consumes u.z for hemisphere selection), and the perturbations below need the original u. + cache_type cache; + domain_type uGen = u; + const codomain_type L = _sampler.generate(uGen, cache); + const scalar_type pdf = _sampler.forwardPdf(uGen, cache); + + using measure_type = ForwardJacobianMeasure::Dimension, vector_traits::Dimension>; + const scalar_type measure = measure_type::compute(_sampler, u, scalar_type(eps), L); + + return pdf * measure; +} + + +template +struct InverseJacobianMeasure; + +template +struct InverseJacobianMeasure +{ + using scalar_type = typename Sampler::scalar_type; + using domain_type = typename Sampler::domain_type; + using codomain_type = typename Sampler::codomain_type; + + static scalar_type compute(Sampler _sampler, codomain_type x, scalar_type eps) + { + const scalar_type twoEps = scalar_type(2) * eps; + codomain_type x0_lo = x; + x0_lo[0] -= eps; + codomain_type x0_hi = x; + x0_hi[0] += eps; + codomain_type x1_lo = x; + x1_lo[1] -= eps; + codomain_type x1_hi = x; + x1_hi[1] += eps; + domain_type u0_lo = _sampler.generateInverse(x0_lo); + domain_type u0_hi = _sampler.generateInverse(x0_hi); + domain_type u1_lo = _sampler.generateInverse(x1_lo); + domain_type u1_hi = _sampler.generateInverse(x1_hi); + const domain_type dudx0 = (u0_hi - u0_lo) / twoEps; + const domain_type dudx1 = (u1_hi - u1_lo) / twoEps; + using matrix2_type = matrix; + const scalar_type det = 
nbl::hlsl::determinant(matrix2_type(dudx0, dudx1)); + return nbl::hlsl::abs(det); + } +}; + +template +struct InverseJacobianMeasure +{ + using scalar_type = typename Sampler::scalar_type; + using domain_type = typename Sampler::domain_type; + using codomain_type = typename Sampler::codomain_type; + + static scalar_type compute(Sampler _sampler, codomain_type x, scalar_type eps) + { + const scalar_type twoEps = scalar_type(2) * eps; + codomain_type t1, t2; + const codomain_type up = nbl::hlsl::abs(x[2]) < scalar_type(0.999) + ? codomain_type(scalar_type(0), scalar_type(0), scalar_type(1)) + : codomain_type(scalar_type(1), scalar_type(0), scalar_type(0)); + t1 = nbl::hlsl::normalize(nbl::hlsl::cross(up, x)); + t2 = nbl::hlsl::cross(x, t1); + domain_type u_t1_lo = _sampler.generateInverse(nbl::hlsl::normalize(x - t1 * eps)); + domain_type u_t1_hi = _sampler.generateInverse(nbl::hlsl::normalize(x + t1 * eps)); + domain_type u_t2_lo = _sampler.generateInverse(nbl::hlsl::normalize(x - t2 * eps)); + domain_type u_t2_hi = _sampler.generateInverse(nbl::hlsl::normalize(x + t2 * eps)); + const domain_type dudt1 = (u_t1_hi - u_t1_lo) / twoEps; + const domain_type dudt2 = (u_t2_hi - u_t2_lo) / twoEps; + using matrix2_type = matrix; + const scalar_type det = nbl::hlsl::determinant(matrix2_type(dudt1, dudt2)); + return nbl::hlsl::abs(det); + } +}; + +template +float32_t computeInverseJacobianPdf(Sampler _sampler, typename Sampler::codomain_type sample, float32_t backwardPdf, float32_t pdfMin, float32_t pdfMax) +{ + using scalar_type = typename Sampler::scalar_type; + using domain_type = typename Sampler::domain_type; + using codomain_type = typename Sampler::codomain_type; + + if (backwardPdf < scalar_type(pdfMin) || backwardPdf > scalar_type(pdfMax)) + return JACOBIAN_SKIP_BWD_PDF_RANGE; + + using measure_type = InverseJacobianMeasure::Dimension, vector_traits::Dimension>; + const scalar_type eps = scalar_type(1e-3); + return measure_type::compute(_sampler, sample, eps); +} + 
+#endif diff --git a/37_HLSLSamplingTests/app_resources/common/linear.hlsl b/37_HLSLSamplingTests/app_resources/common/linear.hlsl index b27d88e5b..af269ad2f 100644 --- a/37_HLSLSamplingTests/app_resources/common/linear.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/linear.hlsl @@ -3,6 +3,7 @@ #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -19,6 +20,7 @@ struct LinearTestResults float32_t backwardPdf; float32_t forwardWeight; float32_t backwardWeight; + float32_t jacobianProduct; }; struct LinearTestExecutor @@ -37,6 +39,7 @@ struct LinearTestExecutor output.backwardPdf = _sampler.backwardPdf(output.generated); output.backwardWeight = _sampler.backwardWeight(output.generated); } + output.jacobianProduct = computeJacobianProduct(_sampler, input.u, 1e-3f, 3.0f); } }; diff --git a/37_HLSLSamplingTests/app_resources/common/polar_mapping.hlsl b/37_HLSLSamplingTests/app_resources/common/polar_mapping.hlsl index 82e020fdc..e4b8ffabb 100644 --- a/37_HLSLSamplingTests/app_resources/common/polar_mapping.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/polar_mapping.hlsl @@ -3,6 +3,7 @@ #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -20,6 +21,7 @@ struct PolarMappingTestResults float32_t forwardWeight; float32_t backwardWeight; float32_t jacobianProduct; + float32_t inverseJacobianPdf; float32_t2 roundtripError; }; @@ -39,7 +41,23 @@ struct PolarMappingTestExecutor output.backwardWeight = sampling::PolarMapping::backwardWeight(input.u); } output.roundtripError = nbl::hlsl::abs(input.u - output.inverted); - output.jacobianProduct = float32_t(1.0 / output.backwardPdf) * output.forwardPdf; + + { + sampling::PolarMapping sampler; + // marginFactor = 3: r = sqrt(u.x) gives O(h/u.x) forward-diff bias near u.x=0, so skip + // u.x within 3*eps of the domain boundary (same reasoning as Linear's skewed-density case). 
+ output.jacobianProduct = computeJacobianProduct(sampler, input.u, 1e-3f, 3.0f); + // Two inverse singularities: + // - disk center: atan2 diverges as r -> 0 + // - atan2 branch cut at y=0, x>0: the stencil's +/-eps in y straddles the 2*pi wrap, + // producing du.y/eps ~ 1/eps spikes (seen as test values ~305-862 with eps=1e-3). + const float32_t polarRadius = nbl::hlsl::length(output.mapped); + const bool onCutBand = nbl::hlsl::abs(output.mapped.y) < 5e-3f && output.mapped.x > 0.0f; + output.inverseJacobianPdf = (polarRadius < 0.1f || onCutBand) + ? JACOBIAN_SKIP_CODOMAIN_SINGULARITY + : computeInverseJacobianPdf(sampler, output.mapped, output.backwardPdf, 0.0f, 1e30f); + } + } }; diff --git a/37_HLSLSamplingTests/app_resources/common/projected_hemisphere.hlsl b/37_HLSLSamplingTests/app_resources/common/projected_hemisphere.hlsl index 9697cf0df..c48697b03 100644 --- a/37_HLSLSamplingTests/app_resources/common/projected_hemisphere.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/projected_hemisphere.hlsl @@ -3,6 +3,7 @@ #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -22,6 +23,7 @@ struct ProjectedHemisphereTestResults float32_t backwardWeight; float32_t2 roundtripError; float32_t jacobianProduct; + float32_t inverseJacobianPdf; }; struct ProjectedHemisphereTestExecutor @@ -43,7 +45,11 @@ struct ProjectedHemisphereTestExecutor output.backwardWeight = sampler.backwardWeight(output.generated); } output.roundtripError = nbl::hlsl::abs(input.u - output.inverted); - output.jacobianProduct = (float32_t(1.0) / output.forwardPdf) * output.backwardPdf; + output.jacobianProduct = computeJacobianProduct(sampler, input.u, 1e-3f, 5.0f); + const float32_t phDiskR = nbl::hlsl::length((float32_t2)output.generated); + output.inverseJacobianPdf = phDiskR < 0.1f + ? 
JACOBIAN_SKIP_CODOMAIN_SINGULARITY + : computeInverseJacobianPdf(sampler, output.generated, output.backwardPdf, 1e-3f, 1e30f); } }; diff --git a/37_HLSLSamplingTests/app_resources/common/projected_sphere.hlsl b/37_HLSLSamplingTests/app_resources/common/projected_sphere.hlsl index e9886b61d..a78a937f6 100644 --- a/37_HLSLSamplingTests/app_resources/common/projected_sphere.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/projected_sphere.hlsl @@ -3,6 +3,7 @@ #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -20,6 +21,7 @@ struct ProjectedSphereTestResults float32_t backwardPdf; float32_t forwardWeight; float32_t backwardWeight; + float32_t jacobianProduct; }; struct ProjectedSphereTestExecutor @@ -38,6 +40,7 @@ struct ProjectedSphereTestExecutor } output.backwardPdf = sampler.backwardPdf(output.generated); output.backwardWeight = sampler.backwardWeight(output.generated); + output.jacobianProduct = computeJacobianProduct(sampler, input.u, 1e-3f, 5.0f); } }; diff --git a/37_HLSLSamplingTests/app_resources/common/projected_spherical_rectangle.hlsl b/37_HLSLSamplingTests/app_resources/common/projected_spherical_rectangle.hlsl index 8370952ca..4aed7d9c3 100644 --- a/37_HLSLSamplingTests/app_resources/common/projected_spherical_rectangle.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/projected_spherical_rectangle.hlsl @@ -4,6 +4,7 @@ #include #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -24,12 +25,10 @@ struct ProjectedSphericalRectangleTestResults float32_t2 surfaceOffset; float32_t3 referenceDirection; float32_t forwardPdf; - float32_t backwardPdf; float32_t forwardWeight; float32_t backwardWeight; - float32_t backwardPdfAtGenerated; - float32_t backwardWeightAtGenerated; float32_t2 extents; + float32_t jacobianProduct; }; struct ProjectedSphericalRectangleTestExecutor @@ -46,30 +45,29 @@ struct ProjectedSphericalRectangleTestExecutor output.extents = rect.extents; 
sampling::ProjectedSphericalRectangle::cache_type cache; + output.generated = sampler.generate(input.u, cache); + output.forwardPdf = sampler.forwardPdf(input.u, cache); + output.forwardWeight = sampler.forwardWeight(input.u, cache); + // backwardWeight now takes a 3D direction; evaluate at generated L. + output.backwardWeight = sampler.backwardWeight(output.generated); + + float32_t2 absXY; { - output.generated = sampler.generate(input.u, cache); - output.forwardPdf = sampler.forwardPdf(input.u, cache); - output.forwardWeight = sampler.forwardWeight(input.u, cache); - } - { - sampling::ProjectedSphericalRectangle::cache_type offsetCache; - output.surfaceOffset = sampler.generateSurfaceOffset(input.u, offsetCache); + typename sampling::Bilinear::cache_type bc; + const float32_t2 warped = sampler.bilinearPatch.generate(input.u, bc); + typename sampling::SphericalRectangle::cache_type sphrectCache; + absXY = sampler.sphrect.generateLocalBasisXY(warped, sphrectCache); + output.surfaceOffset = absXY - float32_t2(sampler.sphrect.r0.x, sampler.sphrect.r0.y); } - // reference direction: reconstruct local 3D point from surfaceOffset and normalize { - const float32_t3 localPoint = sampler.sphrect.r0 + float32_t3(output.surfaceOffset.x, output.surfaceOffset.y, float32_t(0)); - output.referenceDirection = nbl::hlsl::normalize(localPoint); + const float32_t3 localPoint = float32_t3(absXY.x, absXY.y, sampler.sphrect.r0.z); + const float32_t3 localDir = nbl::hlsl::normalize(localPoint); + output.referenceDirection = sampler.sphrect.basis[0] * localDir[0] + + sampler.sphrect.basis[1] * localDir[1] + + sampler.sphrect.basis[2] * localDir[2]; } - // Test backwardPdf/Weight at the rect center: a deterministic interior point - // that avoids amplifying generate's FP errors through backward evaluation. 
- const float32_t2 center = float32_t2(0.5, 0.5); - output.backwardPdf = sampler.backwardPdf(center); - output.backwardWeight = sampler.backwardWeight(center); - // Use cache.warped (the [0,1]^2 input to the spherical rect warp) for consistency - // checks, NOT generated/extents (the nonlinear warp output). The bilinear in - // forwardPdf evaluates at cache.warped, so backwardPdf must too. - output.backwardPdfAtGenerated = sampler.backwardPdf(cache.warped); - output.backwardWeightAtGenerated = sampler.backwardWeight(cache.warped); + + output.jacobianProduct = computeJacobianProduct(sampler, input.u, 1e-3f, 10.0f); } }; diff --git a/37_HLSLSamplingTests/app_resources/common/projected_spherical_triangle.hlsl b/37_HLSLSamplingTests/app_resources/common/projected_spherical_triangle.hlsl index 5c81e53e0..0c424590b 100644 --- a/37_HLSLSamplingTests/app_resources/common/projected_spherical_triangle.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/projected_spherical_triangle.hlsl @@ -4,6 +4,7 @@ #include #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -21,11 +22,10 @@ struct ProjectedSphericalTriangleTestResults { float32_t3 generated; float32_t forwardPdf; - float32_t backwardPdf; - float32_t backwardPdfAtGenerated; float32_t forwardWeight; float32_t backwardWeight; float32_t backwardWeightAtGenerated; + float32_t jacobianProduct; }; struct ProjectedSphericalTriangleTestExecutor @@ -43,15 +43,20 @@ struct ProjectedSphericalTriangleTestExecutor output.forwardPdf = sampler.forwardPdf(input.u, cache); output.forwardWeight = sampler.forwardWeight(input.u, cache); } - // Test backwardPdf/Weight at the triangle centroid: a deterministic interior point computed - // from only basic arithmetic + sqrt (IEEE 754 exact), so CPU and GPU agree bit-exactly. - // Using output.generated would amplify generate's transcendental FP errors through - // generateInverse's acos, producing CPU/GPU divergence. 
const float32_t3 center = nbl::hlsl::normalize(input.vertex0 + input.vertex1 + input.vertex2); - output.backwardPdf = sampler.backwardPdf(center); output.backwardWeight = sampler.backwardWeight(center); - output.backwardPdfAtGenerated = sampler.backwardPdf(output.generated); output.backwardWeightAtGenerated = sampler.backwardWeight(output.generated); + // Check the bilinear-warped (inner) u directly: for skinny triangles with a strongly biased + // receiver normal, outer u well inside [0,1] can still warp to inner u <~ 0.02 where Arvo's + // sqrt(sinZ) noise dominates. Pre-skip on the inner u instead of padding an outer marginFactor. + sampling::Bilinear::cache_type bc; + const float32_t2 innerU = sampler.bilinearPatch.generate(input.u, bc); + const float32_t innerMargin = 0.02f; + const bool innerNearEdge = innerU.x < innerMargin || innerU.x > (1.0f - innerMargin) + || innerU.y < innerMargin || innerU.y > (1.0f - innerMargin); + output.jacobianProduct = innerNearEdge + ? JACOBIAN_SKIP_U_DOMAIN + : computeJacobianProduct(sampler, input.u, 1e-3f, 1.0f); } }; diff --git a/37_HLSLSamplingTests/app_resources/common/spherical_rectangle.hlsl b/37_HLSLSamplingTests/app_resources/common/spherical_rectangle.hlsl index 9ae4df256..4f8d20964 100644 --- a/37_HLSLSamplingTests/app_resources/common/spherical_rectangle.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/spherical_rectangle.hlsl @@ -4,6 +4,7 @@ #include #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -26,6 +27,7 @@ struct SphericalRectangleTestResults float32_t forwardWeight; float32_t backwardWeight; float32_t2 extents; + float32_t jacobianProduct; }; struct SphericalRectangleTestExecutor @@ -47,17 +49,23 @@ struct SphericalRectangleTestExecutor output.forwardPdf = sampler.forwardPdf(input.u, cache); output.forwardWeight = sampler.forwardWeight(input.u, cache); } + float32_t2 absXY; { sampling::SphericalRectangle::cache_type cache; - output.surfaceOffset = 
sampler.generateSurfaceOffset(input.u, cache); + absXY = sampler.generateLocalBasisXY(input.u, cache); + output.surfaceOffset = absXY - float32_t2(sampler.r0.x, sampler.r0.y); } - // reference direction: reconstruct local 3D point from surfaceOffset and normalize { - const float32_t3 localPoint = sampler.r0 + float32_t3(output.surfaceOffset.x, output.surfaceOffset.y, float32_t(0)); - output.referenceDirection = nbl::hlsl::normalize(localPoint); + const float32_t3 localDir = nbl::hlsl::normalize(float32_t3(absXY.x, absXY.y, sampler.r0.z)); + output.referenceDirection = sampler.basis[0] * localDir[0] + + sampler.basis[1] * localDir[1] + + sampler.basis[2] * localDir[2]; } output.backwardPdf = sampler.backwardPdf(output.generated); output.backwardWeight = sampler.backwardWeight(output.generated); + // marginFactor = 3: __generate's sin_au denominator goes through catastrophic cancellation + // for u.x within ~2*eps of 0 or 1 (au near n*pi), leaving ~0.5% residual at factor 3. + output.jacobianProduct = computeJacobianProduct(sampler, input.u, 1e-3f, 3.0f); } }; diff --git a/37_HLSLSamplingTests/app_resources/common/spherical_triangle.hlsl b/37_HLSLSamplingTests/app_resources/common/spherical_triangle.hlsl index 291661629..1828139d4 100644 --- a/37_HLSLSamplingTests/app_resources/common/spherical_triangle.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/spherical_triangle.hlsl @@ -3,6 +3,7 @@ #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -24,6 +25,7 @@ struct SphericalTriangleTestResults float32_t backwardWeight; float32_t2 roundtripError; float32_t jacobianProduct; + float32_t inverseJacobianPdf; // Minimum signed distance to a triangle edge (sin of angular distance to nearest great circle). // Positive = inside, negative = outside. Allows tolerance at boundaries. 
float32_t generatedInside; @@ -39,7 +41,7 @@ struct SphericalTriangleTestExecutor const float32_t3 verts[3] = { input.vertex0, input.vertex1, input.vertex2 }; shapes::SphericalTriangle shape = shapes::SphericalTriangle::createFromUnitSphereVertices(verts); - sampling::SphericalTriangle sampler = sampling::SphericalTriangle::create(shape); + sampling::SphericalTriangle sampler = sampling::SphericalTriangle::create(shape); // Forward: u -> v { @@ -57,10 +59,8 @@ struct SphericalTriangleTestExecutor output.backwardWeight = sampler.backwardWeight(output.generated); } // Roundtrip error: ||u - u'|| - output.roundtripError = nbl::hlsl::abs(input.u - output.inverted); - - // Jacobian product: (1/forwardPdf) * backwardPdf should equal 1 for bijective samplers - output.jacobianProduct = (float32_t(1.0) / output.forwardPdf) * output.backwardPdf; + output.roundtripError = nbl::hlsl::abs(input.u - output.inverted); + output.jacobianProduct = computeJacobianProduct(sampler, input.u, 1e-3f, 20.0f); // Domain preservation: // A point is inside the spherical triangle iff it is on the "inside" half-plane @@ -79,6 +79,13 @@ struct SphericalTriangleTestExecutor float32_t2 u = output.inverted; output.invertedInDomain = nbl::hlsl::min(nbl::hlsl::min(u.x, float32_t(1.0) - u.x), nbl::hlsl::min(u.y, float32_t(1.0) - u.y)); + + const float32_t uMargin = 1e-2f; + const bool nearUBoundary = output.inverted.x < uMargin || output.inverted.x > (1.0f - uMargin) + || output.inverted.y < uMargin || output.inverted.y > (1.0f - uMargin); + output.inverseJacobianPdf = nearUBoundary + ? 
JACOBIAN_SKIP_CODOMAIN_SINGULARITY + : computeInverseJacobianPdf(sampler, output.generated, output.backwardPdf, 0.1f, 10.0f); } }; diff --git a/37_HLSLSamplingTests/app_resources/common/uniform_hemisphere.hlsl b/37_HLSLSamplingTests/app_resources/common/uniform_hemisphere.hlsl index 76a724774..fb51838c7 100644 --- a/37_HLSLSamplingTests/app_resources/common/uniform_hemisphere.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/uniform_hemisphere.hlsl @@ -3,6 +3,7 @@ #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -22,6 +23,7 @@ struct UniformHemisphereTestResults float32_t backwardWeight; float32_t2 roundtripError; float32_t jacobianProduct; + float32_t inverseJacobianPdf; }; struct UniformHemisphereTestExecutor @@ -42,7 +44,11 @@ struct UniformHemisphereTestExecutor output.backwardWeight = sampler.backwardWeight(output.generated); } output.roundtripError = nbl::hlsl::abs(input.u - output.inverted); - output.jacobianProduct = (float32_t(1.0) / output.forwardPdf) * output.backwardPdf; + output.jacobianProduct = computeJacobianProduct(sampler, input.u, 1e-3f, 1.0f); + const float32_t uhDiskR = nbl::hlsl::length((float32_t2)output.generated); + output.inverseJacobianPdf = uhDiskR < 0.1f + ? 
JACOBIAN_SKIP_CODOMAIN_SINGULARITY + : computeInverseJacobianPdf(sampler, output.generated, output.backwardPdf, 0.0f, 1e30f); } }; diff --git a/37_HLSLSamplingTests/app_resources/common/uniform_sphere.hlsl b/37_HLSLSamplingTests/app_resources/common/uniform_sphere.hlsl index 3780b82ef..3737f4575 100644 --- a/37_HLSLSamplingTests/app_resources/common/uniform_sphere.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/uniform_sphere.hlsl @@ -3,6 +3,7 @@ #include #include +#include "jacobian_test.hlsl" using namespace nbl::hlsl; @@ -22,6 +23,7 @@ struct UniformSphereTestResults float32_t backwardWeight; float32_t2 roundtripError; float32_t jacobianProduct; + float32_t inverseJacobianPdf; }; struct UniformSphereTestExecutor @@ -43,7 +45,12 @@ struct UniformSphereTestExecutor output.backwardWeight = sampler.backwardWeight(output.generated); } output.roundtripError = nbl::hlsl::abs(input.u - output.inverted); - output.jacobianProduct = (float32_t(1.0) / output.forwardPdf) * output.backwardPdf; + output.jacobianProduct = computeJacobianProduct(sampler, input.u, 1e-3f, 1.0f); + const float32_t usDiskR = nbl::hlsl::length((float32_t2)output.generated); + const float32_t absZ = nbl::hlsl::abs(output.generated.z); + output.inverseJacobianPdf = (absZ < 0.1f || usDiskR < 0.1f) + ? 
JACOBIAN_SKIP_CODOMAIN_SINGULARITY + : computeInverseJacobianPdf(sampler, output.generated, output.backwardPdf, 0.0f, 1e30f); } }; diff --git a/37_HLSLSamplingTests/app_resources/shaders/alias_table_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/alias_table_test.comp.hlsl index 72c4f1977..67047f997 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/alias_table_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/alias_table_test.comp.hlsl @@ -58,18 +58,15 @@ void main() float32_t xi = float32_t(nbl::hlsl::glsl::bitfieldReverse(invID)) / float32_t(~0u); NBL_CONSTEXPR float32_t goldenRatio = 0.6180339887498949f; uint32_t acc = 0u; - uint32_t accPdf = 0u; - for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) { - float32_t u = frac(xi + float32_t(i) * goldenRatio); + xi = frac(xi + goldenRatio); BenchAliasTable::cache_type cache; - uint32_t generated = sampler.generate(u, cache); - acc ^= generated; - accPdf ^= asuint(sampler.forwardPdf(u, cache)); + uint32_t generated = sampler.generate(xi, cache); + acc ^= generated ^ asuint(sampler.forwardPdf(xi, cache)); } - vk::RawBufferStore(pc.outputAddress + uint64_t(sizeof(uint32_t)) * uint64_t(invID), acc + accPdf); + vk::RawBufferStore(pc.outputAddress + uint64_t(sizeof(uint32_t)) * uint64_t(invID), acc); #else AliasTableTestExecutor executor; executor(inputTestValues[invID], outputTestValues[invID]); diff --git a/37_HLSLSamplingTests/app_resources/shaders/bilinear_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/bilinear_test.comp.hlsl index 06aad4fdc..03ac7b36a 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/bilinear_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/bilinear_test.comp.hlsl @@ -11,6 +11,10 @@ [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif +#if !defined(BENCH_SAMPLES_PER_CREATE) && defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) +#endif + #ifndef WORKGROUP_SIZE #define WORKGROUP_SIZE 64 #endif @@ -20,20 
+24,24 @@ void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; #ifdef BENCH_ITERS - // Perturb coefficients by invID so the sampler is non-uniform across threads. - const float32_t perturbation = float32_t(invID) * 1.0e-7f; - const float32_t4 coeffs = float32_t4(0.25f, 0.5f, 0.75f, 1.0f) + perturbation; - sampling::Bilinear sampler = sampling::Bilinear::create(coeffs); + const float32_t perturbationBase = float32_t(invID) * 1.0e-7f; nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; - for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) + const uint32_t outerIters = uint32_t(BENCH_ITERS) / uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) { - float32_t2 u = float32_t2(rng(), rng()) * toFloat; - sampling::Bilinear::cache_type cache; - float32_t2 generated = sampler.generate(u, cache); - acc ^= asuint(generated.x) ^ asuint(generated.y); - acc ^= asuint(sampler.forwardPdf(u, cache)); + const float32_t perturbation = perturbationBase + float32_t(j) * 1.0e-9f; + const float32_t4 coeffs = float32_t4(0.25f, 0.5f, 0.75f, 1.0f) + perturbation; + sampling::Bilinear sampler = sampling::Bilinear::create(coeffs); + for (uint32_t k = 0u; k < uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t2 u = float32_t2(rng(), rng()) * toFloat; + sampling::Bilinear::cache_type cache; + float32_t2 generated = sampler.generate(u, cache); + acc ^= asuint(generated.x) ^ asuint(generated.y); + acc ^= asuint(sampler.forwardPdf(u, cache)); + } } benchOutput.Store(invID * 4u, acc); #else diff --git a/37_HLSLSamplingTests/app_resources/shaders/box_muller_transform_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/box_muller_transform_test.comp.hlsl index cf0f4065a..6189d4658 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/box_muller_transform_test.comp.hlsl +++ 
b/37_HLSLSamplingTests/app_resources/shaders/box_muller_transform_test.comp.hlsl @@ -11,6 +11,10 @@ [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif +#if !defined(BENCH_SAMPLES_PER_CREATE) && defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) +#endif + #ifndef WORKGROUP_SIZE #define WORKGROUP_SIZE 64 #endif @@ -20,20 +24,24 @@ void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; #ifdef BENCH_ITERS - // Perturb stddev by invID so the sampler is non-uniform across threads. - const float32_t perturbation = float32_t(invID) * 1.0e-7f; - sampling::BoxMullerTransform sampler = sampling::BoxMullerTransform::create(1.0f + perturbation); + const float32_t perturbationBase = float32_t(invID) * 1.0e-7f; nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; - for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) + const uint32_t outerIters = uint32_t(BENCH_ITERS) / uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) { - float32_t2 u = float32_t2(rng(), rng()) * toFloat; - u.x = max(u.x, 1e-7f); - sampling::BoxMullerTransform::cache_type cache; - float32_t2 generated = sampler.generate(u, cache); - acc ^= asuint(generated.x) ^ asuint(generated.y); - acc ^= asuint(sampler.forwardPdf(u, cache)); + const float32_t perturbation = perturbationBase + float32_t(j) * 1.0e-9f; + sampling::BoxMullerTransform sampler = sampling::BoxMullerTransform::create(1.0f + perturbation); + for (uint32_t k = 0u; k < uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t2 u = float32_t2(rng(), rng()) * toFloat; + u.x = max(u.x, 1e-7f); + sampling::BoxMullerTransform::cache_type cache; + float32_t2 generated = sampler.generate(u, cache); + acc ^= asuint(generated.x) ^ asuint(generated.y); + acc ^= asuint(sampler.forwardPdf(u, cache)); + } } benchOutput.Store(invID * 4u, acc); #else diff --git 
a/37_HLSLSamplingTests/app_resources/shaders/concentric_mapping_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/concentric_mapping_test.comp.hlsl index 973aba4fe..649c323b2 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/concentric_mapping_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/concentric_mapping_test.comp.hlsl @@ -11,6 +11,10 @@ [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif +#if !defined(BENCH_SAMPLES_PER_CREATE) && defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) +#endif + #ifndef WORKGROUP_SIZE #define WORKGROUP_SIZE 64 #endif @@ -23,13 +27,17 @@ void main() nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; - for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) + const uint32_t outerIters = uint32_t(BENCH_ITERS) / uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) { - float32_t2 u = float32_t2(rng(), rng()) * toFloat; - sampling::ConcentricMapping::cache_type cache; - float32_t2 generated = sampling::ConcentricMapping::generate(u, cache); - acc ^= asuint(generated.x) ^ asuint(generated.y); - acc ^= asuint(sampling::ConcentricMapping::forwardPdf(generated, cache)); + for (uint32_t k = 0u; k < uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t2 u = float32_t2(rng(), rng()) * toFloat; + sampling::ConcentricMapping::cache_type cache; + float32_t2 generated = sampling::ConcentricMapping::generate(u, cache); + acc ^= asuint(generated.x) ^ asuint(generated.y); + acc ^= asuint(sampling::ConcentricMapping::forwardPdf(generated, cache)); + } } benchOutput.Store(invID * 4u, acc); #else diff --git a/37_HLSLSamplingTests/app_resources/shaders/cumulative_probability_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/cumulative_probability_test.comp.hlsl index 2e48adc4a..1091ee447 100644 --- 
a/37_HLSLSamplingTests/app_resources/shaders/cumulative_probability_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/cumulative_probability_test.comp.hlsl @@ -46,10 +46,10 @@ void main() for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) { - float32_t u = frac(xi + float32_t(i) * goldenRatio); + xi = frac(xi + goldenRatio); BenchCumProbSampler::cache_type cache; - uint32_t generated = sampler.generate(u, cache); - acc ^= generated ^ asuint(sampler.forwardPdf(u, cache)); + uint32_t generated = sampler.generate(xi, cache); + acc ^= generated ^ asuint(sampler.forwardPdf(xi, cache)); } vk::RawBufferStore(pc.outputAddress + uint64_t(sizeof(uint32_t)) * uint64_t(invID), acc); diff --git a/37_HLSLSamplingTests/app_resources/shaders/linear_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/linear_test.comp.hlsl index 614f339b4..17cf83ac5 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/linear_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/linear_test.comp.hlsl @@ -11,6 +11,10 @@ [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif +#if !defined(BENCH_SAMPLES_PER_CREATE) && defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) +#endif + #ifndef WORKGROUP_SIZE #define WORKGROUP_SIZE 64 #endif @@ -20,20 +24,24 @@ void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; #ifdef BENCH_ITERS - // Perturb coefficients by invID so the sampler is non-uniform across threads. 
- const float32_t perturbation = float32_t(invID) * 1.0e-7f; - const float32_t2 coeffs = float32_t2(0.2f, 0.8f) + perturbation; - sampling::Linear sampler = sampling::Linear::create(coeffs); + const float32_t perturbationBase = float32_t(invID) * 1.0e-7f; nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; - for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) + const uint32_t outerIters = uint32_t(BENCH_ITERS) / uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) { - float32_t u = float32_t(rng()) * toFloat; - sampling::Linear::cache_type cache; - float32_t generated = sampler.generate(u, cache); - acc ^= asuint(generated); - acc ^= asuint(sampler.forwardPdf(u, cache)); + const float32_t perturbation = perturbationBase + float32_t(j) * 1.0e-9f; + const float32_t2 coeffs = float32_t2(0.2f, 0.8f) + perturbation; + sampling::Linear sampler = sampling::Linear::create(coeffs); + for (uint32_t k = 0u; k < uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t u = float32_t(rng()) * toFloat; + sampling::Linear::cache_type cache; + float32_t generated = sampler.generate(u, cache); + acc ^= asuint(generated); + acc ^= asuint(sampler.forwardPdf(u, cache)); + } } benchOutput.Store(invID * 4u, acc); #else diff --git a/37_HLSLSamplingTests/app_resources/shaders/polar_mapping_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/polar_mapping_test.comp.hlsl index db7488acd..e0cf7aea0 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/polar_mapping_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/polar_mapping_test.comp.hlsl @@ -11,6 +11,10 @@ [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif +#if !defined(BENCH_SAMPLES_PER_CREATE) && defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) +#endif + #ifndef WORKGROUP_SIZE #define WORKGROUP_SIZE 64 #endif @@ -23,13 +27,17 @@ void main() 
nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; - for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) + const uint32_t outerIters = uint32_t(BENCH_ITERS) / uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) { - float32_t2 u = float32_t2(rng(), rng()) * toFloat; - sampling::PolarMapping::cache_type cache; - float32_t2 generated = sampling::PolarMapping::generate(u, cache); - acc ^= asuint(generated.x) ^ asuint(generated.y); - acc ^= asuint(sampling::PolarMapping::forwardPdf(generated, cache)); + for (uint32_t k = 0u; k < uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t2 u = float32_t2(rng(), rng()) * toFloat; + sampling::PolarMapping::cache_type cache; + float32_t2 generated = sampling::PolarMapping::generate(u, cache); + acc ^= asuint(generated.x) ^ asuint(generated.y); + acc ^= asuint(sampling::PolarMapping::forwardPdf(generated, cache)); + } } benchOutput.Store(invID * 4u, acc); #else diff --git a/37_HLSLSamplingTests/app_resources/shaders/projected_hemisphere_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/projected_hemisphere_test.comp.hlsl index 871444955..d1ef313e5 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/projected_hemisphere_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/projected_hemisphere_test.comp.hlsl @@ -11,6 +11,10 @@ [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif +#if !defined(BENCH_SAMPLES_PER_CREATE) && defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) +#endif + #ifndef WORKGROUP_SIZE #define WORKGROUP_SIZE 64 #endif @@ -23,14 +27,18 @@ void main() nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; - for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) + const uint32_t outerIters = uint32_t(BENCH_ITERS) / 
uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) { - float32_t2 u = float32_t2(rng(), rng()) * toFloat; sampling::ProjectedHemisphere sampler; - sampling::ProjectedHemisphere::cache_type cache; - float32_t3 generated = sampler.generate(u, cache); - acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); - acc ^= asuint(sampler.forwardPdf(u, cache)); + for (uint32_t k = 0u; k < uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t2 u = float32_t2(rng(), rng()) * toFloat; + sampling::ProjectedHemisphere::cache_type cache; + float32_t3 generated = sampler.generate(u, cache); + acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); + acc ^= asuint(sampler.forwardPdf(u, cache)); + } } benchOutput.Store(invID * 4u, acc); #else diff --git a/37_HLSLSamplingTests/app_resources/shaders/projected_sphere_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/projected_sphere_test.comp.hlsl index 67a3fa662..9b8c234c4 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/projected_sphere_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/projected_sphere_test.comp.hlsl @@ -11,6 +11,10 @@ [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif +#if !defined(BENCH_SAMPLES_PER_CREATE) && defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) +#endif + #ifndef WORKGROUP_SIZE #define WORKGROUP_SIZE 64 #endif @@ -23,14 +27,18 @@ void main() nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; - for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) + const uint32_t outerIters = uint32_t(BENCH_ITERS) / uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) { - float32_t3 u = float32_t3(rng(), rng(), rng()) * toFloat; sampling::ProjectedSphere sampler; - sampling::ProjectedSphere::cache_type cache; - float32_t3 generated = sampler.generate(u, 
cache); - acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); - acc ^= asuint(sampler.forwardPdf(u, cache)); + for (uint32_t k = 0u; k < uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t3 u = float32_t3(rng(), rng(), rng()) * toFloat; + sampling::ProjectedSphere::cache_type cache; + float32_t3 generated = sampler.generate(u, cache); + acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); + acc ^= asuint(sampler.forwardPdf(u, cache)); + } } benchOutput.Store(invID * 4u, acc); #else diff --git a/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl index 903075804..ca9b4d43e 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl @@ -11,6 +11,12 @@ [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif +// Number of generate() calls per create(). Default = BENCH_ITERS (persistent: 1 create total). +// Set to 1 for 1:1, 16 for 1:16 multisampling, etc. Must divide BENCH_ITERS. +#if !defined(BENCH_SAMPLES_PER_CREATE) && defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) +#endif + #ifndef WORKGROUP_SIZE #define WORKGROUP_SIZE 64 #endif @@ -21,25 +27,49 @@ main() const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; #ifdef BENCH_ITERS // Perturb rectangle origin by invID so the sampler is non-uniform across threads. 
- const float32_t perturbation = float32_t(invID) * 1.0e-7f; - shapes::CompressedSphericalRectangle compressed; - compressed.origin = float32_t3(perturbation, perturbation, -2.0f); - compressed.right = float32_t3(1.0f, 0.0f, 0.0f); - compressed.up = float32_t3(0.0f, 1.0f, 0.0f); - shapes::SphericalRectangle rect = shapes::SphericalRectangle::create(compressed); - sampling::ProjectedSphericalRectangle sampler = sampling::ProjectedSphericalRectangle::create(rect, float32_t3(perturbation, 0.0f, 0.0f), float32_t3(0.0f, 0.0f, perturbation + 0.5), false); + const float32_t perturbationBase = float32_t(invID) * 1.0e-7f; nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; +#ifdef BENCH_CREATE_ONLY for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) { - float32_t2 u = float32_t2(rng(), rng()) * toFloat; - sampling::ProjectedSphericalRectangle::cache_type cache; - float32_t3 generated = sampler.generate(u, cache); - acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); - acc ^= asuint(sampler.forwardPdf(u, cache)); + // Depend on i so the compiler can't hoist create() out of the loop. + const float32_t perturbation = perturbationBase + float32_t(i) * 1.0e-9f; + shapes::CompressedSphericalRectangle compressed; + compressed.origin = float32_t3(perturbation, perturbation, -2.0f); + compressed.right = float32_t3(1.0f, 0.0f, 0.0f); + compressed.up = float32_t3(0.0f, 1.0f, 0.0f); + shapes::SphericalRectangle rect = shapes::SphericalRectangle::create(compressed); + sampling::ProjectedSphericalRectangle sampler = sampling::ProjectedSphericalRectangle::create(rect, float32_t3(0.0f, 0.0f, 0.0f), float32_t3(0.0f, 0.0f, perturbation + 0.5), false); + // Read a cheap function of sampler state so create() can't be elided. 
+ sampling::ProjectedSphericalRectangle::cache_type pdfCache; + sampler.generate(float32_t2(0.5f, 0.5f), pdfCache); + acc ^= asuint(sampler.forwardPdf(float32_t2(0.5f, 0.5f), pdfCache)); + } +#else + // Unified create:generate loop — one create per BENCH_SAMPLES_PER_CREATE generates. + const uint32_t outerIters = uint32_t(BENCH_ITERS) / uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) + { + const float32_t perturbation = perturbationBase + float32_t(j) * 1.0e-9f; + shapes::CompressedSphericalRectangle compressed; + compressed.origin = float32_t3(perturbation, perturbation, -2.0f); + compressed.right = float32_t3(1.0f, 0.0f, 0.0f); + compressed.up = float32_t3(0.0f, 1.0f, 0.0f); + shapes::SphericalRectangle rect = shapes::SphericalRectangle::create(compressed); + sampling::ProjectedSphericalRectangle sampler = sampling::ProjectedSphericalRectangle::create(rect, float32_t3(0.0f, 0.0f, 0.0f), float32_t3(0.0f, 0.0f, perturbation + 0.5), false); + for (uint32_t k = 0u; k < uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t2 u = float32_t2(rng(), rng()) * toFloat; + sampling::ProjectedSphericalRectangle::cache_type cache; + float32_t3 generated = sampler.generate(u, cache); + acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); + acc ^= asuint(sampler.forwardPdf(u, cache)); + } } +#endif benchOutput.Store(invID * 4u, acc); #else ProjectedSphericalRectangleTestExecutor executor; diff --git a/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_triangle_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_triangle_test.comp.hlsl index 83e47b3e1..3d8ec8961 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_triangle_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_triangle_test.comp.hlsl @@ -11,6 +11,10 @@ [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif +#if !defined(BENCH_SAMPLES_PER_CREATE) && 
defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) +#endif + #ifndef WORKGROUP_SIZE #define WORKGROUP_SIZE 64 #endif @@ -20,23 +24,40 @@ void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; #ifdef BENCH_ITERS - // Perturb vertices and normal by invID so the sampler is non-uniform across threads. - const float32_t perturbation = float32_t(invID) * 1.0e-7f; - const float32_t3 verts[3] = { normalize(float32_t3(1.0f, perturbation, 0.0f)), normalize(float32_t3(0.0f, 1.0f, perturbation)), normalize(float32_t3(perturbation, 0.0f, 1.0f)) }; - shapes::SphericalTriangle shape = shapes::SphericalTriangle::createFromUnitSphereVertices(verts); - sampling::ProjectedSphericalTriangle sampler = sampling::ProjectedSphericalTriangle::create(shape, normalize(float32_t3(perturbation, perturbation, 1.0f)), false); + const float32_t perturbationBase = float32_t(invID) * 1.0e-7f; nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; +#ifdef BENCH_CREATE_ONLY for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) { - float32_t2 u = float32_t2(rng(), rng()) * toFloat; - sampling::ProjectedSphericalTriangle::cache_type cache; - float32_t3 generated = sampler.generate(u, cache); - acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); - acc ^= asuint(sampler.forwardPdf(u, cache)); + const float32_t perturbation = perturbationBase + float32_t(i) * 1.0e-9f; + const float32_t3 verts[3] = { normalize(float32_t3(1.0f, perturbation, 0.0f)), normalize(float32_t3(0.0f, 1.0f, perturbation)), normalize(float32_t3(perturbation, 0.0f, 1.0f)) }; + shapes::SphericalTriangle shape = shapes::SphericalTriangle::createFromUnitSphereVertices(verts); + sampling::ProjectedSphericalTriangle sampler = sampling::ProjectedSphericalTriangle::create(shape, normalize(float32_t3(perturbation, perturbation, 1.0f)), false); + 
sampling::ProjectedSphericalTriangle::cache_type pdfCache; + sampler.generate(float32_t2(0.5f, 0.5f), pdfCache); + acc ^= asuint(sampler.forwardPdf(float32_t2(0.5f, 0.5f), pdfCache)); + } +#else + const uint32_t outerIters = uint32_t(BENCH_ITERS) / uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) + { + const float32_t perturbation = perturbationBase + float32_t(j) * 1.0e-9f; + const float32_t3 verts[3] = { normalize(float32_t3(1.0f, perturbation, 0.0f)), normalize(float32_t3(0.0f, 1.0f, perturbation)), normalize(float32_t3(perturbation, 0.0f, 1.0f)) }; + shapes::SphericalTriangle shape = shapes::SphericalTriangle::createFromUnitSphereVertices(verts); + sampling::ProjectedSphericalTriangle sampler = sampling::ProjectedSphericalTriangle::create(shape, normalize(float32_t3(perturbation, perturbation, 1.0f)), false); + for (uint32_t k = 0u; k < uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t2 u = float32_t2(rng(), rng()) * toFloat; + sampling::ProjectedSphericalTriangle::cache_type cache; + float32_t3 generated = sampler.generate(u, cache); + acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); + acc ^= asuint(sampler.forwardPdf(u, cache)); + } } +#endif benchOutput.Store(invID * 4u, acc); #else ProjectedSphericalTriangleTestExecutor executor; diff --git a/37_HLSLSamplingTests/app_resources/shaders/spherical_rectangle_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/spherical_rectangle_test.comp.hlsl index 3e9a6fcae..b9766d5ff 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/spherical_rectangle_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/spherical_rectangle_test.comp.hlsl @@ -11,6 +11,12 @@ [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif +// Number of generate() calls per create(). Default = BENCH_ITERS (persistent: 1 create total). +// Set to 1 for 1:1 (create+generate per iter), 16 for 1:16 multisampling, etc. Must divide BENCH_ITERS. 
+#if !defined(BENCH_SAMPLES_PER_CREATE) && defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) +#endif + #ifndef WORKGROUP_SIZE #define WORKGROUP_SIZE 64 #endif @@ -20,26 +26,96 @@ main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; #ifdef BENCH_ITERS - // Perturb rectangle origin by invID so the sampler is non-uniform across threads. - const float32_t perturbation = float32_t(invID) * 1.0e-7f; - shapes::CompressedSphericalRectangle compressed; - compressed.origin = float32_t3(perturbation, perturbation, -2.0f); - compressed.right = float32_t3(1.0f, 0.0f, 0.0f); - compressed.up = float32_t3(0.0f, 1.0f, 0.0f); - shapes::SphericalRectangle rect = shapes::SphericalRectangle::create(compressed); - sampling::SphericalRectangle sampler = sampling::SphericalRectangle::create(rect, float32_t3(perturbation, 0.0f, 0.0f)); + // Observer at origin so origin - observer = (p, p, -2) has no zero components: + // keeps all 4 denorm_n_z components perturbation-dependent (no constant-folding). + const float32_t perturbationBase = float32_t(invID) * 1.0e-7f; + +#if (defined(BENCH_VARIANT_SA_EXTENTS) || defined(BENCH_VARIANT_R0_EXTENTS)) && !defined(BENCH_CREATE_ONLY) + // variants 2/3 pre-build: produce a rect (for its basis, sa, extents) once per thread. 
+ shapes::CompressedSphericalRectangle compressedBase; + compressedBase.origin = float32_t3(perturbationBase, perturbationBase, -2.0f); + compressedBase.right = float32_t3(1.0f, 0.0f, 0.0f); + compressedBase.up = float32_t3(0.0f, 1.0f, 0.0f); + const shapes::SphericalRectangle rectBase = shapes::SphericalRectangle::create(compressedBase); + const typename shapes::SphericalRectangle::solid_angle_type saBase = rectBase.solidAngle(float32_t3(0.0f, 0.0f, 0.0f)); + const float32_t2 extentsBase = rectBase.extents; + const matrix basisBase = rectBase.basis; +#endif nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; +#ifdef BENCH_CREATE_ONLY for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) { - float32_t2 u = float32_t2(rng(), rng()) * toFloat; - sampling::SphericalRectangle::cache_type cache; - float32_t3 generated = sampler.generate(u, cache); - acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); - acc ^= asuint(sampler.forwardPdf(u, cache)); + // Depend on i so the compiler can't hoist create() out of the loop. + const float32_t perturbation = perturbationBase + float32_t(i) * 1.0e-9f; + sampling::SphericalRectangle sampler; + #if defined(BENCH_VARIANT_SA_EXTENTS) + shapes::CompressedSphericalRectangle compressed; + compressed.origin = float32_t3(perturbation, perturbation, -2.0f); + compressed.right = float32_t3(1.0f, 0.0f, 0.0f); + compressed.up = float32_t3(0.0f, 1.0f, 0.0f); + shapes::SphericalRectangle rect = shapes::SphericalRectangle::create(compressed); + typename shapes::SphericalRectangle::solid_angle_type sa = rect.solidAngle(float32_t3(0.0f, 0.0f, 0.0f)); + sampler = sampling::SphericalRectangle::create(rect.basis, sa, rect.extents); + #elif defined(BENCH_VARIANT_R0_EXTENTS) + // Build a basis from the same rect geometry so create(basis, r0, extents) has the right frame. 
+ shapes::CompressedSphericalRectangle compressedR0; + compressedR0.origin = float32_t3(perturbation, perturbation, -2.0f); + compressedR0.right = float32_t3(1.0f, 0.0f, 0.0f); + compressedR0.up = float32_t3(0.0f, 1.0f, 0.0f); + const shapes::SphericalRectangle rectR0 = shapes::SphericalRectangle::create(compressedR0); + const float32_t3 r0 = float32_t3(perturbation, perturbation, -2.0f); + const float32_t2 extents = float32_t2(1.0f, 1.0f); + sampler = sampling::SphericalRectangle::create(rectR0.basis, r0, extents); + #else + shapes::CompressedSphericalRectangle compressed; + compressed.origin = float32_t3(perturbation, perturbation, -2.0f); + compressed.right = float32_t3(1.0f, 0.0f, 0.0f); + compressed.up = float32_t3(0.0f, 1.0f, 0.0f); + shapes::SphericalRectangle rect = shapes::SphericalRectangle::create(compressed); + sampler = sampling::SphericalRectangle::create(rect, float32_t3(0.0f, 0.0f, 0.0f)); + #endif + // Read a cheap function of sampler state so create() can't be elided. + acc ^= asuint(sampler.backwardPdf(float32_t3(0.0f, 0.0f, 1.0f))); } +#else + // Unified create:generate loop - one create per BENCH_SAMPLES_PER_CREATE generates. + const uint32_t outerIters = uint32_t(BENCH_ITERS) / uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) + { + const float32_t perturbation = perturbationBase + float32_t(j) * 1.0e-9f; + sampling::SphericalRectangle sampler; + #if defined(BENCH_VARIANT_SA_EXTENTS) + // variant 2: create(basis, sa, extents). Poison one cosGamma so the sincos_accumulator can't be hoisted. + typename shapes::SphericalRectangle::solid_angle_type sa = saBase; + sa.cosGamma[2] += perturbation; + sampler = sampling::SphericalRectangle::create(basisBase, sa, extentsBase); + #elif defined(BENCH_VARIANT_R0_EXTENTS) + // variant 3: create(basis, r0, extents). r0 matches what variant 1 produces. 
+ const float32_t3 r0 = float32_t3(perturbation, perturbation, -2.0f); + const float32_t2 extents = float32_t2(1.0f, 1.0f); + sampler = sampling::SphericalRectangle::create(basisBase, r0, extents); + #else + // variant 1 (default): create(shape, observer). + shapes::CompressedSphericalRectangle compressed; + compressed.origin = float32_t3(perturbation, perturbation, -2.0f); + compressed.right = float32_t3(1.0f, 0.0f, 0.0f); + compressed.up = float32_t3(0.0f, 1.0f, 0.0f); + shapes::SphericalRectangle rect = shapes::SphericalRectangle::create(compressed); + sampler = sampling::SphericalRectangle::create(rect, float32_t3(0.0f, 0.0f, 0.0f)); + #endif + for (uint32_t k = 0u; k < uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t2 u = float32_t2(rng(), rng()) * toFloat; + sampling::SphericalRectangle::cache_type cache; + float32_t3 generated = sampler.generate(u, cache); + acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); + acc ^= asuint(sampler.forwardPdf(u, cache)); + } + } +#endif benchOutput.Store(invID * 4u, acc); #else SphericalRectangleTestExecutor executor; diff --git a/37_HLSLSamplingTests/app_resources/shaders/spherical_triangle.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/spherical_triangle.comp.hlsl index 55991bcb3..3595ac86a 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/spherical_triangle.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/spherical_triangle.comp.hlsl @@ -11,32 +11,50 @@ [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif +#if !defined(BENCH_SAMPLES_PER_CREATE) && defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) +#endif + #ifndef WORKGROUP_SIZE #define WORKGROUP_SIZE 64 #endif [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; #ifdef BENCH_ITERS - // Perturb vertices by invID so the sampler is non-uniform across threads. 
- const float32_t perturbation = float32_t(invID) * 1.0e-7f; - const float32_t3 verts[3] = { normalize(float32_t3(1.0f, perturbation, 0.0f)), normalize(float32_t3(0.0f, 1.0f, perturbation)), normalize(float32_t3(perturbation, 0.0f, 1.0f)) }; - shapes::SphericalTriangle shape = shapes::SphericalTriangle::createFromUnitSphereVertices(verts); - sampling::SphericalTriangle sampler = sampling::SphericalTriangle::create(shape); + const float32_t perturbationBase = float32_t(invID) * 1.0e-7f; nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; +#ifdef BENCH_CREATE_ONLY for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) { - float32_t2 u = float32_t2(rng(), rng()) * toFloat; - sampling::SphericalTriangle::cache_type cache; - float32_t3 generated = sampler.generate(u, cache); - acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); - acc ^= asuint(sampler.forwardPdf(u, cache)); + const float32_t perturbation = perturbationBase + float32_t(i) * 1.0e-9f; + const float32_t3 verts[3] = { normalize(float32_t3(1.0f, perturbation, 0.0f)), normalize(float32_t3(0.0f, 1.0f, perturbation)), normalize(float32_t3(perturbation, 0.0f, 1.0f)) }; + shapes::SphericalTriangle shape = shapes::SphericalTriangle::createFromUnitSphereVertices(verts); + sampling::SphericalTriangle sampler = sampling::SphericalTriangle::create(shape); + acc ^= asuint(sampler.backwardPdf(float32_t3(0.0f, 0.0f, 1.0f))); + } +#else + const uint32_t outerIters = uint32_t(BENCH_ITERS) / uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) + { + const float32_t perturbation = perturbationBase + float32_t(j) * 1.0e-9f; + const float32_t3 verts[3] = { normalize(float32_t3(1.0f, perturbation, 0.0f)), normalize(float32_t3(0.0f, 1.0f, perturbation)), normalize(float32_t3(perturbation, 0.0f, 1.0f)) }; + shapes::SphericalTriangle shape = 
shapes::SphericalTriangle::createFromUnitSphereVertices(verts); + sampling::SphericalTriangle sampler = sampling::SphericalTriangle::create(shape); + for (uint32_t k = 0u; k < uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t2 u = float32_t2(rng(), rng()) * toFloat; + sampling::SphericalTriangle::cache_type cache; + float32_t3 generated = sampler.generate(u, cache); + acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); + acc ^= asuint(sampler.forwardPdf(u, cache)); + } } +#endif benchOutput.Store(invID * 4u, acc); #else SphericalTriangleTestExecutor executor; diff --git a/37_HLSLSamplingTests/app_resources/shaders/test_compile.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/test_compile.comp.hlsl index 908520243..cd43c630e 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/test_compile.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/test_compile.comp.hlsl @@ -1,4 +1,8 @@ +#pragma shader_stage(compute) + // Compile test: instantiate all sampling types and their concept-required methods to verify DXC compilation +#include +#include #include #include #include @@ -9,12 +13,15 @@ #include #include #include +#include +#include +#include +#include "../common/array_accessor.hlsl" using namespace nbl::hlsl; [[vk::binding(0, 0)]] RWStructuredBuffer output; [numthreads(1, 1, 1)] -[shader("compute")] void main() { float32_t2 u2 = float32_t2(0.5, 0.5); @@ -119,7 +126,7 @@ void main() // Octant triangle: all dot products between vertices are 0, so cos_sides=0, csc_sides=1 const float32_t3 triVerts[3] = {float32_t3(1, 0, 0), float32_t3(0, 1, 0), float32_t3(0, 0, 1)}; shapes::SphericalTriangle shapeTri = shapes::SphericalTriangle::createFromUnitSphereVertices(triVerts); - sampling::SphericalTriangle sphTri = sampling::SphericalTriangle::create(shapeTri); + sampling::SphericalTriangle sphTri = sampling::SphericalTriangle::create(shapeTri); sampling::SphericalTriangle::cache_type sphTriCache; float32_t3 stSample = sphTri.generate(u2, 
sphTriCache); acc.xyz += stSample; @@ -129,7 +136,7 @@ void main() acc.x += sphTri.backwardPdf(stSample); acc.x += sphTri.backwardWeight(stSample); - // SphericalRectangle — generate, forwardPdf, backwardPdf, forwardWeight, backwardWeight + // SphericalRectangle — generate, generateSurfaceOffset, forwardPdf, backwardPdf, forwardWeight, backwardWeight shapes::CompressedSphericalRectangle csr; csr.origin = float32_t3(0.0, 0.0, -1.0); csr.right = float32_t3(1.0, 0.0, 0.0); @@ -140,20 +147,81 @@ void main() sampling::SphericalRectangle::cache_type sphRectCache; float32_t3 srSample = sphRect.generate(u2, sphRectCache); acc.xyz += srSample; + acc.xy += sphRect.generateLocalBasisXY(u2, sphRectCache); acc.x += sphRect.forwardPdf(u2, sphRectCache); acc.x += sphRect.forwardWeight(u2, sphRectCache); acc.x += sphRect.backwardPdf(srSample); acc.x += sphRect.backwardWeight(srSample); - // ProjectedSphericalTriangle — generate, forwardPdf, backwardPdf, forwardWeight, backwardWeight + // ProjectedSphericalTriangle — generate, forwardPdf, forwardWeight, backwardWeight(L) sampling::ProjectedSphericalTriangle projTri = sampling::ProjectedSphericalTriangle::create(shapeTri, float32_t3(0.0, 0.0, 1.0), false); sampling::ProjectedSphericalTriangle::cache_type projTriCache; float32_t3 ptSample = projTri.generate(u2, projTriCache); acc.xyz += ptSample; acc.x += projTri.forwardPdf(u2, projTriCache); acc.x += projTri.forwardWeight(u2, projTriCache); - acc.x += projTri.backwardPdf(ptSample); acc.x += projTri.backwardWeight(ptSample); + // ProjectedSphericalRectangle (UsePdfAsWeight=true) — generate, forwardPdf, forwardWeight, backwardWeight(L) + const float32_t3 psrNormal = float32_t3(0.0, 0.0, 1.0); + sampling::ProjectedSphericalRectangle projRectPdf = + sampling::ProjectedSphericalRectangle::create(shapeRect, srObserver, psrNormal, false); + sampling::ProjectedSphericalRectangle::cache_type projRectPdfCache; + float32_t3 prPdfSample = projRectPdf.generate(u2, projRectPdfCache); + acc.xyz += 
prPdfSample; + acc.x += projRectPdf.forwardPdf(u2, projRectPdfCache); + acc.x += projRectPdf.forwardWeight(u2, projRectPdfCache); + acc.x += projRectPdf.backwardWeight(prPdfSample); + + // ProjectedSphericalRectangle (UsePdfAsWeight=false) — exercise the MIS-weight path + sampling::ProjectedSphericalRectangle projRectMis = + sampling::ProjectedSphericalRectangle::create(shapeRect, srObserver, psrNormal, true); + sampling::ProjectedSphericalRectangle::cache_type projRectMisCache; + float32_t3 prMisSample = projRectMis.generate(u2, projRectMisCache); + acc.xyz += prMisSample; + acc.x += projRectMis.forwardPdf(u2, projRectMisCache); + acc.x += projRectMis.forwardWeight(u2, projRectMisCache); + acc.x += projRectMis.backwardWeight(prMisSample); + + // AliasTable — generate (with/without cache), forwardPdf, backwardPdf, forwardWeight, backwardWeight + ArrayAccessor aliasProb; + aliasProb.data[0] = 0.25; aliasProb.data[1] = 0.5; aliasProb.data[2] = 0.75; aliasProb.data[3] = 1.0; + ArrayAccessor aliasIdx; + aliasIdx.data[0] = 1u; aliasIdx.data[1] = 2u; aliasIdx.data[2] = 3u; aliasIdx.data[3] = 0u; + ArrayAccessor aliasPdf; + aliasPdf.data[0] = 0.25; aliasPdf.data[1] = 0.25; aliasPdf.data[2] = 0.25; aliasPdf.data[3] = 0.25; + sampling::AliasTable, ArrayAccessor, ArrayAccessor > aliasTable = + sampling::AliasTable, ArrayAccessor, ArrayAccessor >::create(aliasProb, aliasIdx, aliasPdf, 4u); + sampling::AliasTable, ArrayAccessor, ArrayAccessor >::cache_type aliasCache; + uint32_t aliasBin0 = aliasTable.generate(0.3); + uint32_t aliasBin = aliasTable.generate(0.3, aliasCache); + acc.x += float32_t(aliasBin0 + aliasBin); + acc.x += aliasTable.forwardPdf(0.3, aliasCache); + acc.x += aliasTable.forwardWeight(0.3, aliasCache); + acc.x += aliasTable.backwardPdf(aliasBin); + acc.x += aliasTable.backwardWeight(aliasBin); + + // CumulativeProbabilitySampler — generate (with/without cache), forwardPdf, backwardPdf, forwardWeight, backwardWeight + ArrayAccessor cumProb; + cumProb.data[0] 
= 0.25; cumProb.data[1] = 0.5; cumProb.data[2] = 0.75; + sampling::CumulativeProbabilitySampler > cumSampler = + sampling::CumulativeProbabilitySampler >::create(cumProb, 4u); + sampling::CumulativeProbabilitySampler >::cache_type cumCache; + uint32_t cumBin0 = cumSampler.generate(0.6); + uint32_t cumBin = cumSampler.generate(0.6, cumCache); + acc.x += float32_t(cumBin0 + cumBin); + acc.x += cumSampler.forwardPdf(0.6, cumCache); + acc.x += cumSampler.forwardWeight(0.6, cumCache); + acc.x += cumSampler.backwardPdf(cumBin); + acc.x += cumSampler.backwardWeight(cumBin); + + // PartitionRandVariable — operator() partitions u into a left/right branch + sampling::PartitionRandVariable partition; + partition.leftProb = 0.25; + float32_t partXi = 0.5; + float32_t partRcp; + bool partRight = partition(partXi, partRcp); + acc.x += partXi + partRcp + float32_t(partRight ? 1 : 0); + output[0] = acc; } diff --git a/37_HLSLSamplingTests/app_resources/shaders/uniform_hemisphere_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/uniform_hemisphere_test.comp.hlsl index d0990ef43..3c43ee119 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/uniform_hemisphere_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/uniform_hemisphere_test.comp.hlsl @@ -11,6 +11,10 @@ [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif +#if !defined(BENCH_SAMPLES_PER_CREATE) && defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) +#endif + #ifndef WORKGROUP_SIZE #define WORKGROUP_SIZE 64 #endif @@ -23,14 +27,18 @@ void main() nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; - for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) + const uint32_t outerIters = uint32_t(BENCH_ITERS) / uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) { - float32_t2 u = float32_t2(rng(), rng()) * toFloat; 
sampling::UniformHemisphere sampler; - sampling::UniformHemisphere::cache_type cache; - float32_t3 generated = sampler.generate(u, cache); - acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); - acc ^= asuint(sampler.forwardPdf(u, cache)); + for (uint32_t k = 0u; k < uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t2 u = float32_t2(rng(), rng()) * toFloat; + sampling::UniformHemisphere::cache_type cache; + float32_t3 generated = sampler.generate(u, cache); + acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); + acc ^= asuint(sampler.forwardPdf(u, cache)); + } } benchOutput.Store(invID * 4u, acc); #else diff --git a/37_HLSLSamplingTests/app_resources/shaders/uniform_sphere_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/uniform_sphere_test.comp.hlsl index 0d33f5c11..5879e28bb 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/uniform_sphere_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/uniform_sphere_test.comp.hlsl @@ -11,6 +11,10 @@ [[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; #endif +#if !defined(BENCH_SAMPLES_PER_CREATE) && defined(BENCH_ITERS) +#define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) +#endif + #ifndef WORKGROUP_SIZE #define WORKGROUP_SIZE 64 #endif @@ -23,14 +27,18 @@ void main() nbl::hlsl::Xoroshiro64Star rng = nbl::hlsl::Xoroshiro64Star::construct(uint32_t2(invID, 0u)); const float32_t toFloat = asfloat(0x2f800004u); uint32_t acc = 0u; - for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) + const uint32_t outerIters = uint32_t(BENCH_ITERS) / uint32_t(BENCH_SAMPLES_PER_CREATE); + for (uint32_t j = 0u; j < outerIters; j++) { - float32_t2 u = float32_t2(rng(), rng()) * toFloat; sampling::UniformSphere sampler; - sampling::UniformSphere::cache_type cache; - float32_t3 generated = sampler.generate(u, cache); - acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); - acc ^= asuint(sampler.forwardPdf(u, cache)); + for (uint32_t k = 0u; k < 
uint32_t(BENCH_SAMPLES_PER_CREATE); k++) + { + float32_t2 u = float32_t2(rng(), rng()) * toFloat; + sampling::UniformSphere::cache_type cache; + float32_t3 generated = sampler.generate(u, cache); + acc ^= asuint(generated.x) ^ asuint(generated.y) ^ asuint(generated.z); + acc ^= asuint(sampler.forwardPdf(u, cache)); + } } benchOutput.Store(invID * 4u, acc); #else diff --git a/37_HLSLSamplingTests/benchmarks/CDiscreteSamplerBenchmark.h b/37_HLSLSamplingTests/benchmarks/CDiscreteSamplerBenchmark.h index 8f85545b3..02fbf58d2 100644 --- a/37_HLSLSamplingTests/benchmarks/CDiscreteSamplerBenchmark.h +++ b/37_HLSLSamplingTests/benchmarks/CDiscreteSamplerBenchmark.h @@ -12,8 +12,11 @@ using namespace nbl; // Benchmarks alias table vs cumulative probability sampler on the GPU using BDA. -// Builds both tables from the same weight distribution, uploads via BDA buffers, -// and measures GPU throughput using timestamp queries. +// Builds pipelines once, then sweeps a list of table sizes. For each N it builds +// both tables from the same weight distribution, uploads via BDA buffers, and +// measures GPU throughput using timestamp queries. The cumulative probability +// sampler is run in two variants: the stateful-comparator cache population +// (default) and the "YOLO re-read" variant (cumulative_probability.hlsl). 
class CDiscreteSamplerBenchmark { public: @@ -26,17 +29,17 @@ class CDiscreteSamplerBenchmark video::IPhysicalDevice* physicalDevice; std::string aliasShaderKey; std::string cumProbShaderKey; + std::string cumProbYoloShaderKey; uint32_t computeFamilyIndex; uint32_t dispatchGroupCount; - uint32_t tableSize; }; void setup(const SetupData& data) { m_device = data.device; m_logger = data.logger; + m_assetMgr = data.assetMgr; m_dispatchGroupCount = data.dispatchGroupCount; - m_tableSize = data.tableSize; m_physicalDevice = data.physicalDevice; m_queue = m_device->getQueue(data.computeFamilyIndex, 0); @@ -44,8 +47,6 @@ class CDiscreteSamplerBenchmark // Command pool + buffers m_cmdpool = m_device->createCommandPool(data.computeFamilyIndex, video::IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); m_cmdpool->createCommandBuffers(video::IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_benchCmdbuf); - m_cmdpool->createCommandBuffers(video::IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_timestampBeforeCmdbuf); - m_cmdpool->createCommandBuffers(video::IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_timestampAfterCmdbuf); // Timestamp query pool { @@ -56,61 +57,9 @@ class CDiscreteSamplerBenchmark m_queryPool = m_device->createQueryPool(qp); } - // Generate random weights - const uint32_t N = m_tableSize; - std::vector weights(N); - std::mt19937 rng(42); - std::uniform_real_distribution dist(0.001f, 100.0f); - for (uint32_t i = 0; i < N; i++) - weights[i] = dist(rng); - - // Build alias table - std::vector aliasProb(N); - std::vector aliasIdx(N); - std::vector aliasPdf(N); - std::vector workspace(N); - nbl::hlsl::sampling::AliasTableBuilder::build({weights}, aliasProb.data(), aliasIdx.data(), aliasPdf.data(), workspace.data()); - - // Build cumulative probability table - std::vector cumProb(N - 1); - nbl::hlsl::sampling::computeNormalizedCumulativeHistogram({weights}, cumProb.data()); - - // Create BDA buffers and upload data - auto createBdaBuffer = [&](const void* 
srcData, size_t bytes) -> core::smart_refctd_ptr - { - video::IGPUBuffer::SCreationParams bp = {}; - bp.size = bytes; - bp.usage = core::bitflag(video::IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | - video::IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - auto buf = m_device->createBuffer(std::move(bp)); - - video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = buf->getMemoryReqs(); - reqs.memoryTypeBits &= data.physicalDevice->getHostVisibleMemoryTypeBits(); - auto alloc = m_device->allocate(reqs, buf.get(), video::IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); - - const auto allocSize = alloc.memory->getAllocationSize(); - if (alloc.memory->map({0ull, allocSize}, video::IDeviceMemoryAllocation::EMCAF_WRITE)) - { - std::memcpy(alloc.memory->getMappedPointer(), srcData, bytes); - // Flush so GPU can see the written data - video::ILogicalDevice::MappedMemoryRange flushRange(alloc.memory.get(), 0ull, allocSize); - m_device->flushMappedMemoryRanges(1u, &flushRange); - alloc.memory->unmap(); - } - return buf; - }; - const uint32_t totalThreads = m_dispatchGroupCount * WORKGROUP_SIZE; - // Alias table buffers - m_aliasProbBuf = createBdaBuffer(aliasProb.data(), N * sizeof(float)); - m_aliasIdxBuf = createBdaBuffer(aliasIdx.data(), N * sizeof(uint32_t)); - m_aliasPdfBuf = createBdaBuffer(aliasPdf.data(), N * sizeof(float)); - - // CDF buffer - m_cumProbBuf = createBdaBuffer(cumProb.data(), (N - 1) * sizeof(float)); - - // Shared output buffer + // Shared output buffer (size only depends on thread count) { video::IGPUBuffer::SCreationParams bp = {}; bp.size = totalThreads * sizeof(uint32_t); @@ -122,163 +71,218 @@ class CDiscreteSamplerBenchmark m_device->allocate(reqs, m_outputBuf.get(), video::IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); } - // Create pipelines (push constants only, no descriptor sets) - auto loadShader = [&](const std::string& key) + // Pipelines (N-independent; only push constants change per run) + m_aliasPipeline = 
createPipeline(data.aliasShaderKey, m_aliasPplnLayout, "alias"); + m_cumProbPipeline = createPipeline(data.cumProbShaderKey, m_cumProbPplnLayout, "cumprob-comparator"); + m_cumProbYoloPipeline = createPipeline(data.cumProbYoloShaderKey, m_cumProbYoloPplnLayout, "cumprob-yolo"); + } + + // DispatchScheduler: callable taking uint32_t N and returning DispatchCounts {warmup, bench}. + // Lets the caller trade wall-clock for statistical stability per size: + // big-N runs are DRAM-bound and need fewer dispatches to hit the same total sample count. + struct DispatchCounts { uint32_t warmup; uint32_t bench; }; + + // Sweep a list of table sizes. For each N: build tables from a fresh weight + // distribution (deterministic seed = 42 + N so different N's get distinct + // distributions but runs are reproducible), upload via BDA, then run all + // three samplers with the dispatch counts chosen by `scheduler`. + template + void runSweep(const std::vector& tableSizes, DispatchScheduler scheduler) + { + const uint32_t totalThreads = m_dispatchGroupCount * WORKGROUP_SIZE; + m_logger->log("=== GPU Discrete Sampler Benchmark sweep (%u threads * %u iters/thread; wg=%u; dispatches chosen per-N) ===", + system::ILogger::ELL_PERFORMANCE, totalThreads, BENCH_ITERS, WORKGROUP_SIZE); + m_logger->log("%12s | %-28s | %12s | %12s | %12s | %10s", + system::ILogger::ELL_PERFORMANCE, "N", "Sampler", "ps/sample", "GSamples/s", "ms total", "dispatches"); + + for (uint32_t N : tableSizes) + { + const DispatchCounts dc = scheduler(N); + buildAndUpload(N); + runSingle(N, "AliasTable", m_aliasPipeline, m_aliasPplnLayout, SamplerKind::Alias, dc.warmup, dc.bench); + runSingle(N, "CumulativeProbability", m_cumProbPipeline, m_cumProbPplnLayout, SamplerKind::CumProbCompare, dc.warmup, dc.bench); + runSingle(N, "CumulativeProbability (YOLO)", m_cumProbYoloPipeline, m_cumProbYoloPplnLayout, SamplerKind::CumProbYolo, dc.warmup, dc.bench); + releaseTables(); + } + } + + // Convenience: sweep with fixed dispatch counts for every size. 
+ void runSweep(const std::vector& tableSizes, uint32_t warmupIterations = 500, uint32_t benchmarkIterations = 5000) + { + runSweep(tableSizes, [warmupIterations, benchmarkIterations](uint32_t) -> DispatchCounts { + return {warmupIterations, benchmarkIterations}; + }); + } + + private: + enum class SamplerKind { Alias, CumProbCompare, CumProbYolo }; + + template + core::smart_refctd_ptr createPipeline(const std::string& shaderKey, core::smart_refctd_ptr& outLayout, const char* tag) + { + const asset::SPushConstantRange pcRange = { + .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, + .offset = 0, + .size = sizeof(PushConstantT)}; + auto layout = m_device->createPipelineLayout({&pcRange, 1}); + if (!layout) + m_logger->log("CDiscreteSamplerBenchmark: failed to create %s pipeline layout", system::ILogger::ELL_ERROR, tag); + + asset::IAssetLoader::SAssetLoadParams lp = {}; + lp.logger = m_logger.get(); + lp.workingDirectory = "app_resources"; + auto bundle = m_assetMgr->getAsset(shaderKey, lp); + auto source = asset::IAsset::castDown(bundle.getContents()[0]); + auto shader = m_device->compileShader({.source = source.get()}); + if (!shader) + m_logger->log("CDiscreteSamplerBenchmark: failed to load %s shader", system::ILogger::ELL_ERROR, tag); + + video::IGPUComputePipeline::SCreationParams pp = {}; + pp.layout = layout.get(); + pp.shader.shader = shader.get(); + pp.shader.entryPoint = "main"; + if (m_device->getEnabledFeatures().pipelineExecutableInfo) { - asset::IAssetLoader::SAssetLoadParams lp = {}; - lp.logger = m_logger.get(); - lp.workingDirectory = "app_resources"; - auto bundle = data.assetMgr->getAsset(key, lp); - auto source = asset::IAsset::castDown(bundle.getContents()[0]); - return m_device->compileShader({.source = source.get()}); - }; - - // Alias table pipeline + pp.flags |= video::IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_STATISTICS | video::IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_INTERNAL_REPRESENTATIONS; + } + + 
core::smart_refctd_ptr pipeline; + if (!m_device->createComputePipelines(nullptr, {&pp, 1}, &pipeline)) + m_logger->log("CDiscreteSamplerBenchmark: failed to create %s compute pipeline", system::ILogger::ELL_ERROR, tag); + + if (m_device->getEnabledFeatures().pipelineExecutableInfo) { - const asset::SPushConstantRange pcRange = { - .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, - .offset = 0, - .size = sizeof(AliasTablePushConstants)}; - auto layout = m_device->createPipelineLayout({&pcRange, 1}); - if (!layout) - m_logger->log("CDiscreteSamplerBenchmark: failed to create alias pipeline layout", system::ILogger::ELL_ERROR); - video::IGPUComputePipeline::SCreationParams pp = {}; - pp.layout = layout.get(); - auto shader = loadShader(data.aliasShaderKey); - if (!shader) - m_logger->log("CDiscreteSamplerBenchmark: failed to load alias shader", system::ILogger::ELL_ERROR); - pp.shader.shader = shader.get(); - pp.shader.entryPoint = "main"; - - if (m_device->getEnabledFeatures().pipelineExecutableInfo) - { - pp.flags |= video::IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_STATISTICS | video::IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_INTERNAL_REPRESENTATIONS; - } - - if (!m_device->createComputePipelines(nullptr, {&pp, 1}, &m_aliasPipeline)) - m_logger->log("CDiscreteSamplerBenchmark: failed to create alias compute pipeline", system::ILogger::ELL_ERROR); - - if (m_device->getEnabledFeatures().pipelineExecutableInfo) - { - auto report = system::to_string(m_aliasPipeline->getExecutableInfo()); - m_logger->log("Alias Table Sampling Pipeline Executable Report:\n%s", system::ILogger::ELL_PERFORMANCE, report.c_str()); - } - m_aliasPplnLayout = std::move(layout); + auto report = system::to_string(pipeline->getExecutableInfo()); + m_logger->log("%s Sampling Pipeline Executable Report:\n%s", system::ILogger::ELL_PERFORMANCE, tag, report.c_str()); } + outLayout = std::move(layout); + return pipeline; + } - // CDF pipeline + core::smart_refctd_ptr 
createBdaBuffer(const void* srcData, size_t bytes) + { + video::IGPUBuffer::SCreationParams bp = {}; + bp.size = bytes; + bp.usage = core::bitflag(video::IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | + video::IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + auto buf = m_device->createBuffer(std::move(bp)); + + video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = buf->getMemoryReqs(); + reqs.memoryTypeBits &= m_physicalDevice->getHostVisibleMemoryTypeBits(); + auto alloc = m_device->allocate(reqs, buf.get(), video::IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); + + const auto allocSize = alloc.memory->getAllocationSize(); + if (alloc.memory->map({0ull, allocSize}, video::IDeviceMemoryAllocation::EMCAF_WRITE)) { - const asset::SPushConstantRange pcRange = { - .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, - .offset = 0, - .size = sizeof(CumProbPushConstants)}; - auto layout = m_device->createPipelineLayout({&pcRange, 1}); - if (!layout) - m_logger->log("CDiscreteSamplerBenchmark: failed to create cumprob pipeline layout", system::ILogger::ELL_ERROR); - video::IGPUComputePipeline::SCreationParams pp = {}; - pp.layout = layout.get(); - auto shader = loadShader(data.cumProbShaderKey); - if (!shader) - m_logger->log("CDiscreteSamplerBenchmark: failed to load cumprob shader", system::ILogger::ELL_ERROR); - pp.shader.shader = shader.get(); - pp.shader.entryPoint = "main"; - if (m_device->getEnabledFeatures().pipelineExecutableInfo) - { - pp.flags |= video::IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_STATISTICS | video::IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_INTERNAL_REPRESENTATIONS; - } - if (!m_device->createComputePipelines(nullptr, {&pp, 1}, &m_cumProbPipeline)) - m_logger->log("CDiscreteSamplerBenchmark: failed to create cumprob compute pipeline", system::ILogger::ELL_ERROR); - if (m_device->getEnabledFeatures().pipelineExecutableInfo) - { - auto report = system::to_string(m_cumProbPipeline->getExecutableInfo()); - 
m_logger->log("Cumulative Probability Sampling Pipeline Executable Report:\n%s", system::ILogger::ELL_PERFORMANCE, report.c_str()); - } - m_cumProbPplnLayout = std::move(layout); + std::memcpy(alloc.memory->getMappedPointer(), srcData, bytes); + video::ILogicalDevice::MappedMemoryRange flushRange(alloc.memory.get(), 0ull, allocSize); + m_device->flushMappedMemoryRanges(1u, &flushRange); + alloc.memory->unmap(); } + return buf; } - void run(uint32_t warmupIterations = 500, uint32_t benchmarkIterations = 5000) + void buildAndUpload(uint32_t N) { - constexpr uint32_t benchWorkgroupSize = WORKGROUP_SIZE; - const uint32_t totalThreads = m_dispatchGroupCount * benchWorkgroupSize; - m_logger->log("=== GPU Discrete Sampler Benchmark (N=%u, %u dispatches, %u threads/dispatch, %u iters/thread, ps/sample is per all GPU threads) ===", - system::ILogger::ELL_PERFORMANCE, m_tableSize, benchmarkIterations, totalThreads, BENCH_ITERS); + m_currentN = N; + + std::vector weights(N); + std::mt19937 rng(42u + N); + std::uniform_real_distribution dist(0.001f, 100.0f); + for (uint32_t i = 0; i < N; i++) + weights[i] = dist(rng); + + // Alias table + std::vector aliasProb(N); + std::vector aliasIdx(N); + std::vector aliasPdf(N); + std::vector workspace(N); + nbl::hlsl::sampling::AliasTableBuilder::build({weights}, aliasProb.data(), aliasIdx.data(), aliasPdf.data(), workspace.data()); + + // Cumulative probability (N-1 entries, last bucket implicitly 1.0) + std::vector cumProb(N > 0 ? 
N - 1 : 0); + nbl::hlsl::sampling::computeNormalizedCumulativeHistogram({weights}, cumProb.data()); - runSingle("AliasTable", m_aliasPipeline, m_aliasPplnLayout, true, warmupIterations, benchmarkIterations); - runSingle("CumulativeProbability", m_cumProbPipeline, m_cumProbPplnLayout, false, warmupIterations, benchmarkIterations); + m_aliasProbBuf = createBdaBuffer(aliasProb.data(), N * sizeof(float)); + m_aliasIdxBuf = createBdaBuffer(aliasIdx.data(), N * sizeof(uint32_t)); + m_aliasPdfBuf = createBdaBuffer(aliasPdf.data(), N * sizeof(float)); + const size_t cumProbBytes = (N > 0 ? (N - 1) : 0) * sizeof(float); + m_cumProbBuf = cumProbBytes ? createBdaBuffer(cumProb.data(), cumProbBytes) : nullptr; } - private: - void runSingle(const char* name, const core::smart_refctd_ptr& pipeline, const core::smart_refctd_ptr& layout, bool isAlias, uint32_t warmupIterations, uint32_t benchmarkIterations) + void releaseTables() + { + m_aliasProbBuf = nullptr; + m_aliasIdxBuf = nullptr; + m_aliasPdfBuf = nullptr; + m_cumProbBuf = nullptr; + } + + void runSingle( + uint32_t N, + const char* name, + const core::smart_refctd_ptr& pipeline, + const core::smart_refctd_ptr& layout, + SamplerKind kind, + uint32_t warmupIterations, + uint32_t benchmarkIterations) { m_device->waitIdle(); - // Record benchmark command buffer + // Everything (warmup, timestamped bench, cooldown) goes into ONE cmdbuf and ONE + // submit. Serial submissions with semaphore waits between them would add sync cost + // to every dispatch and prevent the driver from overlapping adjacent dispatches. + // With a single cmdbuf the driver pipelines freely, and GPU memory latency is + // hidden by warp hyperthreading rather than by cross-submit overlap. + // + // Layout: [warmup dispatches] [ts 0] [bench dispatches] [ts 1] [cooldown dispatches] + // Warmup brings clocks + caches to steady state before ts 0. 
Cooldown keeps the + // same steady-state context alive across ts 1 so the trailing bench dispatches + // don't measure a tail where the GPU is already winding down. + const uint32_t cooldownIterations = warmupIterations; + m_benchCmdbuf->reset(video::IGPUCommandBuffer::RESET_FLAGS::NONE); - m_benchCmdbuf->begin(video::IGPUCommandBuffer::USAGE::SIMULTANEOUS_USE_BIT); + m_benchCmdbuf->begin(video::IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + m_benchCmdbuf->resetQueryPool(m_queryPool.get(), 0, 2); m_benchCmdbuf->bindComputePipeline(pipeline.get()); - if (isAlias) + if (kind == SamplerKind::Alias) { AliasTablePushConstants pc = {}; - pc.probAddress = m_aliasProbBuf->getDeviceAddress(); + pc.probAddress = m_aliasProbBuf->getDeviceAddress(); pc.aliasAddress = m_aliasIdxBuf->getDeviceAddress(); - pc.pdfAddress = m_aliasPdfBuf->getDeviceAddress(); + pc.pdfAddress = m_aliasPdfBuf->getDeviceAddress(); pc.outputAddress = m_outputBuf->getDeviceAddress(); - pc.tableSize = m_tableSize; + pc.tableSize = N; m_benchCmdbuf->pushConstants(layout.get(), asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc); } else { CumProbPushConstants pc = {}; - pc.cumProbAddress = m_cumProbBuf->getDeviceAddress(); - pc.outputAddress = m_outputBuf->getDeviceAddress(); - pc.tableSize = m_tableSize; + pc.cumProbAddress = m_cumProbBuf ? 
m_cumProbBuf->getDeviceAddress() : 0ull; + pc.outputAddress = m_outputBuf->getDeviceAddress(); + pc.tableSize = N; m_benchCmdbuf->pushConstants(layout.get(), asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc); } - m_benchCmdbuf->dispatch(m_dispatchGroupCount, 1, 1); + for (uint32_t i = 0u; i < warmupIterations; ++i) + m_benchCmdbuf->dispatch(m_dispatchGroupCount, 1, 1); + m_benchCmdbuf->writeTimestamp(asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 0); + for (uint32_t i = 0u; i < benchmarkIterations; ++i) + m_benchCmdbuf->dispatch(m_dispatchGroupCount, 1, 1); + m_benchCmdbuf->writeTimestamp(asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 1); + for (uint32_t i = 0u; i < cooldownIterations; ++i) + m_benchCmdbuf->dispatch(m_dispatchGroupCount, 1, 1); m_benchCmdbuf->end(); - // Record timestamp command buffers - m_timestampBeforeCmdbuf->reset(video::IGPUCommandBuffer::RESET_FLAGS::NONE); - m_timestampBeforeCmdbuf->begin(video::IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - m_timestampBeforeCmdbuf->resetQueryPool(m_queryPool.get(), 0, 2); - m_timestampBeforeCmdbuf->writeTimestamp(asset::PIPELINE_STAGE_FLAGS::NONE, m_queryPool.get(), 0); - m_timestampBeforeCmdbuf->end(); - - m_timestampAfterCmdbuf->reset(video::IGPUCommandBuffer::RESET_FLAGS::NONE); - m_timestampAfterCmdbuf->begin(video::IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - m_timestampAfterCmdbuf->writeTimestamp(asset::PIPELINE_STAGE_FLAGS::NONE, m_queryPool.get(), 1); - m_timestampAfterCmdbuf->end(); - auto semaphore = m_device->createSemaphore(0u); - uint64_t semCounter = 0u; - const video::IQueue::SSubmitInfo::SCommandBufferInfo benchCmds[] = {{.cmdbuf = m_benchCmdbuf.get()}}; - const video::IQueue::SSubmitInfo::SCommandBufferInfo beforeCmds[] = {{.cmdbuf = m_timestampBeforeCmdbuf.get()}}; - const video::IQueue::SSubmitInfo::SCommandBufferInfo afterCmds[] = {{.cmdbuf = m_timestampAfterCmdbuf.get()}}; - - auto submitSerial = [&](const 
video::IQueue::SSubmitInfo::SCommandBufferInfo* cmds, uint32_t count) - { - const video::IQueue::SSubmitInfo::SSemaphoreInfo waitSem[] = { - {.semaphore = semaphore.get(), .value = semCounter, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT}}; - const video::IQueue::SSubmitInfo::SSemaphoreInfo signalSem[] = { - {.semaphore = semaphore.get(), .value = ++semCounter, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT}}; - video::IQueue::SSubmitInfo submit = {}; - submit.commandBuffers = {cmds, count}; - submit.waitSemaphores = waitSem; - submit.signalSemaphores = signalSem; - m_queue->submit({&submit, 1u}); - }; - - for (uint32_t i = 0u; i < warmupIterations; ++i) - submitSerial(benchCmds, 1u); - - submitSerial(beforeCmds, 1u); - for (uint32_t i = 0u; i < benchmarkIterations; ++i) - submitSerial(benchCmds, 1u); - submitSerial(afterCmds, 1u); + const video::IQueue::SSubmitInfo::SSemaphoreInfo signalSem[] = { + {.semaphore = semaphore.get(), .value = 1u, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT}}; + video::IQueue::SSubmitInfo submit = {}; + submit.commandBuffers = benchCmds; + submit.signalSemaphores = signalSem; + m_queue->submit({&submit, 1u}); m_device->waitIdle(); @@ -288,36 +292,37 @@ class CDiscreteSamplerBenchmark m_device->getQueryPoolResults(m_queryPool.get(), 0, 2, timestamps, sizeof(uint64_t), flags); constexpr uint32_t benchIters = BENCH_ITERS; - constexpr uint32_t benchWorkgroupSize = WORKGROUP_SIZE; const float64_t timestampPeriod = float64_t(m_physicalDevice->getLimits().timestampPeriodInNanoSeconds); const float64_t elapsed_ns = float64_t(timestamps[1] - timestamps[0]) * timestampPeriod; - const uint64_t totalThreads = uint64_t(m_dispatchGroupCount) * uint64_t(benchWorkgroupSize); + const uint64_t totalThreads = uint64_t(m_dispatchGroupCount) * uint64_t(WORKGROUP_SIZE); const uint64_t totalSamples = uint64_t(benchmarkIterations) * totalThreads * uint64_t(benchIters); const float64_t ps_per_sample = elapsed_ns * 
1e3 / float64_t(totalSamples); const float64_t gsamples_per_s = float64_t(totalSamples) / elapsed_ns; const float64_t elapsed_ms = elapsed_ns * 1e-6; - m_logger->log("[Benchmark] %-28s: %9.3f ps/sample | %10.3f GSamples/s | %10.3f ms total", system::ILogger::ELL_PERFORMANCE, name, ps_per_sample, gsamples_per_s, elapsed_ms); + m_logger->log("%12u | %-28s | %12.3f | %12.3f | %12.3f | %10u", + system::ILogger::ELL_PERFORMANCE, N, name, ps_per_sample, gsamples_per_s, elapsed_ms, benchmarkIterations); } core::smart_refctd_ptr m_device; core::smart_refctd_ptr m_logger; + core::smart_refctd_ptr m_assetMgr; core::smart_refctd_ptr m_cmdpool; core::smart_refctd_ptr m_benchCmdbuf; - core::smart_refctd_ptr m_timestampBeforeCmdbuf; - core::smart_refctd_ptr m_timestampAfterCmdbuf; core::smart_refctd_ptr m_queryPool; - // Alias table + // Pipelines (set up once) core::smart_refctd_ptr m_aliasPplnLayout; core::smart_refctd_ptr m_aliasPipeline; + core::smart_refctd_ptr m_cumProbPplnLayout; + core::smart_refctd_ptr m_cumProbPipeline; + core::smart_refctd_ptr m_cumProbYoloPplnLayout; + core::smart_refctd_ptr m_cumProbYoloPipeline; + + // Per-N data buffers (rebuilt each sweep step) core::smart_refctd_ptr m_aliasProbBuf; core::smart_refctd_ptr m_aliasIdxBuf; core::smart_refctd_ptr m_aliasPdfBuf; - - // Cumulative probability - core::smart_refctd_ptr m_cumProbPplnLayout; - core::smart_refctd_ptr m_cumProbPipeline; core::smart_refctd_ptr m_cumProbBuf; // Shared @@ -325,7 +330,7 @@ class CDiscreteSamplerBenchmark video::IQueue* m_queue = nullptr; video::IPhysicalDevice* m_physicalDevice = nullptr; uint32_t m_dispatchGroupCount = 0; - uint32_t m_tableSize = 0; + uint32_t m_currentN = 0; }; #endif diff --git a/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h b/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h index 3e2092670..9f9854ac5 100644 --- a/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h +++ b/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h @@ -162,7 +162,7 @@ class 
CSamplerBenchmark } // Runs warmupIterations submits (unclocked), then benchmarkIterations submits under GPU timestamps. - void run(const std::string& samplerName, uint32_t warmupIterations = 500, uint32_t benchmarkIterations = 5000) + void run(const std::string& samplerName, const std::string& mode, uint32_t warmupIterations = 500, uint32_t benchmarkIterations = 5000) { m_device->waitIdle(); recordBenchmarkCmdBuf(); @@ -213,9 +213,9 @@ class CSamplerBenchmark const float64_t gsamples_per_s = float64_t(total_samples) / elapsed_ns; const float64_t elapsed_ms = elapsed_ns * 1e-6; - m_logger->log("[Benchmark] %-28s: %9.3f ps/sample | %10.3f GSamples/s | %10.3f ms total", + m_logger->log("[Benchmark] %-28s | %-38s | %12.3f | %12.3f | %12.3f", system::ILogger::ELL_PERFORMANCE, - samplerName.c_str(), ps_per_sample, gsamples_per_s, elapsed_ms); + samplerName.c_str(), mode.c_str(), ps_per_sample, gsamples_per_s, elapsed_ms); } private: diff --git a/37_HLSLSamplingTests/main.cpp b/37_HLSLSamplingTests/main.cpp index 98ea127cc..470132aba 100644 --- a/37_HLSLSamplingTests/main.cpp +++ b/37_HLSLSamplingTests/main.cpp @@ -51,12 +51,11 @@ using namespace nbl::examples; #include "benchmarks/CDiscreteSamplerBenchmark.h" #include "tests/property/CSamplerPropertyTester.h" -constexpr bool DoBenchmark = true; class HLSLSamplingTests final : public application_templates::MonoDeviceApplication, public BuiltinResourcesApplication { using device_base_t = application_templates::MonoDeviceApplication; - using asset_base_t = BuiltinResourcesApplication; + using asset_base_t = BuiltinResourcesApplication; public: HLSLSamplingTests(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) @@ -64,7 +63,7 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat virtual SPhysicalDeviceFeatures getPreferredDeviceFeatures() const override { - auto retval = device_base_t::getPreferredDeviceFeatures(); + auto retval 
= device_base_t::getPreferredDeviceFeatures(); retval.pipelineExecutableInfo = true; return retval; } @@ -80,10 +79,10 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat // test compile with dxc { IAssetLoader::SAssetLoadParams lp = {}; - lp.logger = m_logger.get(); - lp.workingDirectory = "app_resources"; - auto key = nbl::this_example::builtin::build::get_spirv_key<"shader">(m_device.get()); - auto bundle = m_assetMgr->getAsset(key.c_str(), lp); + lp.logger = m_logger.get(); + lp.workingDirectory = "app_resources"; + auto key = nbl::this_example::builtin::build::get_spirv_key<"shader">(m_device.get()); + auto bundle = m_assetMgr->getAsset(key.c_str(), lp); const auto assets = bundle.getContents(); if (assets.empty()) @@ -155,8 +154,8 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat static_assert(sampling::concepts::BackwardTractableSampler>); static_assert(sampling::concepts::BackwardTractableSampler>); static_assert(sampling::concepts::BackwardTractableSampler>); - static_assert(sampling::concepts::BackwardTractableSampler>); - static_assert(sampling::concepts::BackwardTractableSampler>); + //static_assert(sampling::concepts::BackwardTractableSampler>); // no backwardPdf + //static_assert(sampling::concepts::BackwardTractableSampler>); // no backwardPdf static_assert(sampling::concepts::BackwardTractableSampler>); static_assert(sampling::concepts::BackwardTractableSampler>); static_assert(sampling::concepts::BackwardTractableSampler>); @@ -166,7 +165,7 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat static_assert(sampling::concepts::BijectiveSampler>); static_assert(sampling::concepts::BijectiveSampler>); static_assert(sampling::concepts::BijectiveSampler>); - static_assert(sampling::concepts::BijectiveSampler>); + static_assert(sampling::concepts::BijectiveSampler>); static_assert(sampling::concepts::BijectiveSampler>); 
static_assert(sampling::concepts::BijectiveSampler>); @@ -180,89 +179,162 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat // ====================================================================== // GPU throughput benchmarks // ====================================================================== - const uint32_t testBatchCount = 1024; + // 4096 workgroups * WORKGROUP_SIZE(64) = 256k invocations per dispatch — enough + // to saturate a 3080 (68 SMs * ~1536 resident invocations) so memory latency is + // hidden by hyperthreading rather than by cross-dispatch overlap. + constexpr uint32_t testBatchCount = 4096; + constexpr bool DoBenchmark = true; if constexpr (DoBenchmark) { - constexpr uint32_t benchWorkgroupSize = WORKGROUP_SIZE; + constexpr uint32_t benchWorkgroupSize = WORKGROUP_SIZE; constexpr uint32_t totalThreadsPerDispatch = testBatchCount * benchWorkgroupSize; - constexpr uint32_t iterationsPerThread = BENCH_ITERS; + constexpr uint32_t iterationsPerThread = BENCH_ITERS; constexpr uint32_t benchSamplesPerDispatch = totalThreadsPerDispatch * iterationsPerThread; struct BenchEntry { CSamplerBenchmark bench; - std::string name; + std::string sampler; + std::string mode; }; std::vector benchmarks; - auto addBench = [&](const char* name, const std::string& shaderKey, size_t inputSize, size_t outputSize) + auto addBench = [&](const char* sampler, const char* mode, const std::string& shaderKey, size_t inputSize, size_t outputSize) { - auto& entry = benchmarks.emplace_back(); - entry.name = name; + auto& entry = benchmarks.emplace_back(); + entry.sampler = sampler; + entry.mode = mode; CSamplerBenchmark::SetupData data; - data.device = m_device; - data.api = m_api; - data.assetMgr = m_assetMgr; - data.logger = m_logger; - data.physicalDevice = m_physicalDevice; + data.device = m_device; + data.api = m_api; + data.assetMgr = m_assetMgr; + data.logger = m_logger; + data.physicalDevice = m_physicalDevice; data.computeFamilyIndex = 
getComputeQueue()->getFamilyIndex(); - data.shaderKey = shaderKey; + data.shaderKey = shaderKey; data.dispatchGroupCount = testBatchCount; data.samplesPerDispatch = benchSamplesPerDispatch; - data.inputBufferBytes = inputSize; - data.outputBufferBytes = outputSize; + data.inputBufferBytes = inputSize; + data.outputBufferBytes = outputSize; entry.bench.setup(data); }; // Bench shaders don't read input (hardcoded values) and write a single uint32_t per thread via RWByteAddressBuffer - constexpr size_t benchInputBytes = sizeof(uint32_t); // unused but binding must exist, didn't bother removing because some samplers need more complex inputs and it's easier to have a consistent buffer setup for all benchmarks + constexpr size_t benchInputBytes = sizeof(uint32_t); // unused but binding must exist, didn't bother removing because some samplers need more complex inputs and it's easier to have a consistent buffer setup for all benchmarks constexpr size_t benchOutputBytes = sizeof(uint32_t) * totalThreadsPerDispatch; - addBench("Linear", nbl::this_example::builtin::build::get_spirv_key<"linear_bench">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("Bilinear", nbl::this_example::builtin::build::get_spirv_key<"bilinear_bench">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("BoxMullerTransform", nbl::this_example::builtin::build::get_spirv_key<"box_muller_transform_bench">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("UniformHemisphere", nbl::this_example::builtin::build::get_spirv_key<"uniform_hemisphere_bench">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("UniformSphere", nbl::this_example::builtin::build::get_spirv_key<"uniform_sphere_bench">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("ConcentricMapping", nbl::this_example::builtin::build::get_spirv_key<"concentric_mapping_bench">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("PolarMapping", 
nbl::this_example::builtin::build::get_spirv_key<"polar_mapping_bench">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("ProjectedHemisphere", nbl::this_example::builtin::build::get_spirv_key<"projected_hemisphere_bench">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("ProjectedSphere", nbl::this_example::builtin::build::get_spirv_key<"projected_sphere_bench">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("SphericalRectangle", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("ProjectedSphericalRectangle", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_rectangle_bench">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("SphericalTriangle", nbl::this_example::builtin::build::get_spirv_key<"spherical_triangle_bench">(m_device.get()), benchInputBytes, benchOutputBytes); - addBench("ProjectedSphericalTriangle", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_triangle_bench">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("Linear", "1:1", nbl::this_example::builtin::build::get_spirv_key<"linear_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("Linear", "1:16", nbl::this_example::builtin::build::get_spirv_key<"linear_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("Bilinear", "1:1", nbl::this_example::builtin::build::get_spirv_key<"bilinear_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("Bilinear", "1:16", nbl::this_example::builtin::build::get_spirv_key<"bilinear_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("BoxMullerTransform", "1:1", nbl::this_example::builtin::build::get_spirv_key<"box_muller_transform_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("BoxMullerTransform", "1:16", 
nbl::this_example::builtin::build::get_spirv_key<"box_muller_transform_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("UniformHemisphere", "1:1", nbl::this_example::builtin::build::get_spirv_key<"uniform_hemisphere_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("UniformHemisphere", "1:16", nbl::this_example::builtin::build::get_spirv_key<"uniform_hemisphere_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("UniformSphere", "1:1", nbl::this_example::builtin::build::get_spirv_key<"uniform_sphere_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("UniformSphere", "1:16", nbl::this_example::builtin::build::get_spirv_key<"uniform_sphere_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("ConcentricMapping", "1:1", nbl::this_example::builtin::build::get_spirv_key<"concentric_mapping_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("ConcentricMapping", "1:16", nbl::this_example::builtin::build::get_spirv_key<"concentric_mapping_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("PolarMapping", "1:1", nbl::this_example::builtin::build::get_spirv_key<"polar_mapping_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("PolarMapping", "1:16", nbl::this_example::builtin::build::get_spirv_key<"polar_mapping_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("ProjectedHemisphere", "1:1", nbl::this_example::builtin::build::get_spirv_key<"projected_hemisphere_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("ProjectedHemisphere", "1:16", nbl::this_example::builtin::build::get_spirv_key<"projected_hemisphere_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("ProjectedSphere", "1:1", 
nbl::this_example::builtin::build::get_spirv_key<"projected_sphere_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("ProjectedSphere", "1:16", nbl::this_example::builtin::build::get_spirv_key<"projected_sphere_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("SphericalRectangle", "1:1 (shape,observer)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_1_shape_observer">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("SphericalRectangle", "1:16 (shape,observer)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_16_shape_observer">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("SphericalRectangle", "1:1 (sa,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_1_sa_extents">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("SphericalRectangle", "1:16 (sa,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_16_sa_extents">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("SphericalRectangle", "1:1 (r0,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_1_r0_extents">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("SphericalRectangle", "1:16 (r0,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_16_r0_extents">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("SphericalRectangle", "create-only (shape,observer)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_create_only_shape_observer">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("SphericalRectangle", "create-only (sa,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_create_only_sa_extents">(m_device.get()), benchInputBytes, benchOutputBytes); + 
//addBench("SphericalRectangle", "create-only (r0,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_create_only_r0_extents">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("ProjectedSphericalRectangle", "1:1", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_rectangle_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("ProjectedSphericalRectangle", "1:16", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_rectangle_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("ProjectedSphericalRectangle", "create-only", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_rectangle_bench_create_only">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("SphericalTriangle", "1:1", nbl::this_example::builtin::build::get_spirv_key<"spherical_triangle_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("SphericalTriangle", "1:16", nbl::this_example::builtin::build::get_spirv_key<"spherical_triangle_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("SphericalTriangle", "create-only", nbl::this_example::builtin::build::get_spirv_key<"spherical_triangle_bench_create_only">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("ProjectedSphericalTriangle", "1:1", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_triangle_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("ProjectedSphericalTriangle", "1:16", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_triangle_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + //addBench("ProjectedSphericalTriangle", "create-only", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_triangle_bench_create_only">(m_device.get()), benchInputBytes, benchOutputBytes); // Print all pipeline reports first 
for (auto& entry : benchmarks) - entry.bench.logPipelineReport(entry.name); + entry.bench.logPipelineReport(entry.sampler + " (" + entry.mode + ")"); // Discrete sampler benchmark: alias table vs cumulative probability (BDA) { CDiscreteSamplerBenchmark::SetupData dsData; - dsData.device = m_device; - dsData.api = m_api; - dsData.assetMgr = m_assetMgr; - dsData.logger = m_logger; - dsData.physicalDevice = m_physicalDevice; - dsData.computeFamilyIndex = getComputeQueue()->getFamilyIndex(); - dsData.aliasShaderKey = nbl::this_example::builtin::build::get_spirv_key<"alias_table_bench">(m_device.get()); - dsData.cumProbShaderKey = nbl::this_example::builtin::build::get_spirv_key<"cumulative_probability_bench">(m_device.get()); - dsData.dispatchGroupCount = testBatchCount; - dsData.tableSize = 1024; + dsData.device = m_device; + dsData.api = m_api; + dsData.assetMgr = m_assetMgr; + dsData.logger = m_logger; + dsData.physicalDevice = m_physicalDevice; + dsData.computeFamilyIndex = getComputeQueue()->getFamilyIndex(); + dsData.aliasShaderKey = nbl::this_example::builtin::build::get_spirv_key<"alias_table_bench">(m_device.get()); + dsData.cumProbShaderKey = nbl::this_example::builtin::build::get_spirv_key<"cumulative_probability_bench">(m_device.get()); + dsData.cumProbYoloShaderKey = nbl::this_example::builtin::build::get_spirv_key<"cumulative_probability_yolo_bench">(m_device.get()); + dsData.dispatchGroupCount = testBatchCount; CDiscreteSamplerBenchmark discreteBench; discreteBench.setup(dsData); // Then run all benchmarks here so the reports are at the top of the log, followed by timings - constexpr uint32_t warmupDispatches = 500; - constexpr uint32_t benchDispatches = 5000; - m_logger->log("=== GPU Sampler Benchmarks (%u dispatches, %u threads/dispatch, %u iters/thread, ps/sample is per all GPU threads) ===", - ILogger::ELL_PERFORMANCE, benchDispatches, totalThreadsPerDispatch, iterationsPerThread); - for (auto& entry : benchmarks) - entry.bench.run(entry.name, 
warmupDispatches, benchDispatches); - - discreteBench.run(warmupDispatches, benchDispatches); + { + constexpr uint32_t warmupDispatches = 300; + constexpr uint32_t benchDispatches = 1000; + m_logger->log("=== GPU Sampler Benchmarks (%u dispatches, %u threads/dispatch, %u iters/thread, ps/sample is per all GPU threads) ===", + ILogger::ELL_PERFORMANCE, benchDispatches, totalThreadsPerDispatch, iterationsPerThread); + m_logger->log(" %-28s | %-38s | %12s | %12s | %12s", + ILogger::ELL_PERFORMANCE, "Sampler", "Mode", "ps/sample", "GSamples/s", "ms total"); + for (auto& entry : benchmarks) + entry.bench.run(entry.sampler, entry.mode, warmupDispatches, benchDispatches); + } + + { + // Sweep covers both the YOLO-vs-Comparator comparison (explicit points at + // N=100, 10k, 1M for wg=WORKGROUP_SIZE) and an alias-vs-CDF ramp from + // N=4 up to 32M in a roughly-power-of-8 progression. + const std::vector discreteSizes = { + 4u, + 16u, + 32u, + 100u, + 128u, + 512u, + 8192u, + 10000u, + 131072u, + 1000000u, + 2097152u, + 16777216u, + 33554432u, + }; + + // Adaptive dispatch scheduler: pick dispatch counts so total wall-clock + // per sampler-per-N stays near 1.5 s. Cost model comes from the prior + // sweep (order-of-magnitude ps/sample vs N). 
+ auto dispatchScheduler = [](uint32_t N) -> CDiscreteSamplerBenchmark::DispatchCounts + { + double ps_per_sample; + if (N < 1000u) ps_per_sample = 15.0; // L1-resident + else if (N < 100000u) ps_per_sample = 100.0; // L1/L2 + else if (N < 2000000u) ps_per_sample = 1000.0; // L2-edge + else ps_per_sample = 8000.0; // DRAM-bound + + constexpr double targetNs = 1.5e9; // ~1.5 s per bench + constexpr uint64_t samplesPerDispatch = uint64_t(WORKGROUP_SIZE) * uint64_t(testBatchCount) * uint64_t(BENCH_ITERS); + const uint64_t targetSamples = uint64_t((targetNs * 1000.0) / ps_per_sample); + const uint32_t bench = std::max(10u, uint32_t(targetSamples / samplesPerDispatch)); + const uint32_t warmup = std::max(20u, bench / 10u); + return {warmup, bench}; + }; + + discreteBench.runSweep(discreteSizes, dispatchScheduler); + } } } @@ -270,21 +342,20 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat // Runtime CPU/GPU comparison tests using ITester harness // ================================================================ bool pass = true; - const uint32_t workgroupSize = WORKGROUP_SIZE; // generic lambda to run a GPU sampler test auto runSamplerTest = [&](const char* testName, auto spirvKey, const char* logFile) { m_logger->log("Running %s tests...", ILogger::ELL_INFO, testName); typename Tester::PipelineSetupData data; - data.device = m_device; - data.api = m_api; - data.assetMgr = m_assetMgr; - data.logger = m_logger; - data.physicalDevice = m_physicalDevice; + data.device = m_device; + data.api = m_api; + data.assetMgr = m_assetMgr; + data.logger = m_logger; + data.physicalDevice = m_physicalDevice; data.computeFamilyIndex = getComputeQueue()->getFamilyIndex(); - data.shaderKey = spirvKey; - Tester tester(testBatchCount, workgroupSize); + data.shaderKey = spirvKey; + Tester tester(testBatchCount, WORKGROUP_SIZE); tester.setupPipeline(data); pass &= tester.performTestsAndVerifyResults(logFile); }; @@ -307,7 +378,7 @@ class HLSLSamplingTests 
final : public application_templates::MonoDeviceApplicat runSamplerTest.operator()("ProjectedSphericalRectangle sampler", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_rectangle_test">(m_device.get()), "ProjectedSphericalRectangleTestLog.txt"); } - if constexpr (true) + if constexpr (DoBenchmark) { // --- Discrete table construction (CPU) --- { @@ -320,6 +391,7 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat runSamplerTest.operator()("AliasTable GPU sampler", nbl::this_example::builtin::build::get_spirv_key<"alias_table_test">(m_device.get()), "AliasTableTestLog.txt"); runSamplerTest.operator()("CumulativeProbability GPU sampler", nbl::this_example::builtin::build::get_spirv_key<"cumulative_probability_test">(m_device.get()), "CumulativeProbabilityTestLog.txt"); } + logJacobianSkipCounts(m_logger.get()); if (pass) m_logger->log("All sampling tests PASSED.", ILogger::ELL_INFO); else diff --git a/37_HLSLSamplingTests/tests/CAliasTableGPUTester.h b/37_HLSLSamplingTests/tests/CAliasTableGPUTester.h index 87aac65ba..32f0e3b28 100644 --- a/37_HLSLSamplingTests/tests/CAliasTableGPUTester.h +++ b/37_HLSLSamplingTests/tests/CAliasTableGPUTester.h @@ -52,6 +52,7 @@ class CAliasTableGPUTester final : public ITesterlog(" coeffStart=%s coeffEnd=%s", nbl::system::ILogger::ELL_ERROR, - to_string(s.linearCoeffStart).c_str(), to_string(s.linearCoeffEnd).c_str()); + to_string(s.normalizedCoeffStart).c_str(), to_string(s.normalizedCoeffEnd).c_str()); } }; @@ -140,7 +141,7 @@ struct LinearStressConfig { using nbl::system::to_string; logger->log(" coeffStart=%s coeffEnd=%s", nbl::system::ILogger::ELL_ERROR, - to_string(s.linearCoeffStart).c_str(), to_string(s.linearCoeffEnd).c_str()); + to_string(s.normalizedCoeffStart).c_str(), to_string(s.normalizedCoeffEnd).c_str()); } }; diff --git a/37_HLSLSamplingTests/tests/CPolarMappingTester.h b/37_HLSLSamplingTests/tests/CPolarMappingTester.h index f7009176b..6c43f8877 100644 --- 
a/37_HLSLSamplingTests/tests/CPolarMappingTester.h +++ b/37_HLSLSamplingTests/tests/CPolarMappingTester.h @@ -46,7 +46,8 @@ class CPolarMappingTester final : public ITester sizeDist(0.5f, 3.0f); std::uniform_real_distribution uDist(0.0f, 1.0f); - ProjectedSphericalRectangleInputValues input; - // Observer at origin, rect placed in front (negative Z) so the solid angle is valid. - input.observer = nbl::hlsl::float32_t3(0.0f, 0.0f, 0.0f); - const float width = sizeDist(getRandomEngine()); - const float height = sizeDist(getRandomEngine()); - input.rectOrigin = nbl::hlsl::float32_t3(0.0f, 0.0f, -2.0f); - input.right = nbl::hlsl::float32_t3(width, 0.0f, 0.0f); - input.up = nbl::hlsl::float32_t3(0.0f, height, 0.0f); - - // Build shape to use centralized corner check nbl::hlsl::shapes::CompressedSphericalRectangle compressed; - compressed.origin = input.rectOrigin; - compressed.right = input.right; - compressed.up = input.up; + nbl::hlsl::float32_t3 observer; + generateRandomRectangle(getRandomEngine(), compressed, observer); + + ProjectedSphericalRectangleInputValues input; + input.observer = observer; + input.rectOrigin = compressed.origin; + input.right = compressed.right; + input.up = compressed.up; + auto shape = nbl::hlsl::shapes::SphericalRectangle::create(compressed); // Ensure the receiver normal has positive projection onto at least one vertex, @@ -63,25 +58,25 @@ class CProjectedSphericalRectangleTester final : public ITester actual.extents.x || - actual.surfaceOffset.y < 0.0f || actual.surfaceOffset.y > actual.extents.y) + PdfCheck {"ProjectedSphericalRectangle::forwardPdf", &R::forwardPdf}); + VERIFY_JACOBIAN_OR_SKIP(pass, "ProjectedSphericalRectangle::jacobianProduct", 1.0f, actual.jacobianProduct, iteration, seed, testType, 5e-2, 5e-2); + + constexpr float boundsEps = 1e-5f; + if (actual.surfaceOffset.x < -boundsEps || actual.surfaceOffset.x > actual.extents.x + boundsEps || + actual.surfaceOffset.y < -boundsEps || actual.surfaceOffset.y > actual.extents.y 
+ boundsEps) { pass = false; - printTestFail("ProjectedSphericalRectangle::generateSurfaceOffset (inside rect bounds)", actual.extents, actual.surfaceOffset, iteration, seed, testType, 0.0, 0.0); + printTestFail("ProjectedSphericalRectangle::generateSurfaceOffset (inside rect bounds)", actual.extents, actual.surfaceOffset, iteration, seed, testType, 0.0, boundsEps); } // generate must be unit length @@ -90,7 +85,7 @@ class CProjectedSphericalRectangleTester final : public ITester createProjectedRectSampler( +inline nbl::hlsl::sampling::ProjectedSphericalRectangle createProjectedRectSampler( std::mt19937& rng, nbl::hlsl::shapes::CompressedSphericalRectangle& compressed, nbl::hlsl::float32_t3& observer, @@ -121,15 +116,16 @@ inline nbl::hlsl::sampling::ProjectedSphericalRectangle cr outNormal = generateRandomUnitVector(rng); } while (!anyRectCornerAboveHorizon(shape, observer, outNormal)); - return sampling::ProjectedSphericalRectangle::create(shape, observer, outNormal, false); + return sampling::ProjectedSphericalRectangle::create(shape, observer, outNormal, false); } struct ProjectedSphericalRectanglePropertyConfig { - using sampler_type = nbl::hlsl::sampling::ProjectedSphericalRectangle; + // UsePdfAsWeight=false so receiverNormal and projSolidAngle are populated for logSamplerInfo. 
+ using sampler_type = nbl::hlsl::sampling::ProjectedSphericalRectangle; static constexpr uint32_t numConfigurations = 200; - static constexpr uint32_t samplesPerConfig = 20000; + static constexpr uint32_t samplesPerConfig = 50000; static constexpr bool hasMCNormalization = true; static constexpr bool hasGridIntegration = false; static constexpr float64_t mcNormalizationRelTol = 0.08; @@ -155,23 +151,20 @@ struct ProjectedSphericalRectanglePropertyConfig static void logSamplerInfo(nbl::system::ILogger* logger, const sampler_type& s) { using nbl::system::to_string; - logger->log(" r0=%s extents=%s solidAngle=%s rcpSolidAngle=%s rcpProjSolidAngle=%s", + logger->log(" r0=%s extents=%s solidAngle=%s projSolidAngle=%s receiverNormal=%s", nbl::system::ILogger::ELL_ERROR, to_string(s.sphrect.r0).c_str(), to_string(s.sphrect.extents).c_str(), to_string(s.sphrect.solidAngle).c_str(), - to_string(s.rcpSolidAngle).c_str(), - to_string(s.rcpProjSolidAngle).c_str()); - logger->log(" localReceiverNormal=%s receiverWasBSDF=%u", - nbl::system::ILogger::ELL_ERROR, - to_string(s.localReceiverNormal).c_str(), - static_cast(s.receiverWasBSDF)); + to_string(s.projSolidAngle).c_str(), + to_string(s.receiverNormal).c_str()); } }; struct ProjectedSphericalRectangleGrazingConfig { - using sampler_type = nbl::hlsl::sampling::ProjectedSphericalRectangle; + // UsePdfAsWeight=false so receiverNormal and projSolidAngle are populated for logSamplerInfo. 
+ using sampler_type = nbl::hlsl::sampling::ProjectedSphericalRectangle; static constexpr uint32_t numConfigurations = 200; static constexpr uint32_t samplesPerConfig = 20000; @@ -202,17 +195,13 @@ struct ProjectedSphericalRectangleGrazingConfig static void logSamplerInfo(nbl::system::ILogger* logger, const sampler_type& s) { using nbl::system::to_string; - logger->log(" r0=%s extents=%s solidAngle=%s rcpSolidAngle=%s rcpProjSolidAngle=%s", + logger->log(" r0=%s extents=%s solidAngle=%s projSolidAngle=%s receiverNormal=%s", nbl::system::ILogger::ELL_ERROR, to_string(s.sphrect.r0).c_str(), to_string(s.sphrect.extents).c_str(), to_string(s.sphrect.solidAngle).c_str(), - to_string(s.rcpSolidAngle).c_str(), - to_string(s.rcpProjSolidAngle).c_str()); - logger->log(" localReceiverNormal=%s receiverWasBSDF=%u", - nbl::system::ILogger::ELL_ERROR, - to_string(s.localReceiverNormal).c_str(), - static_cast(s.receiverWasBSDF)); + to_string(s.projSolidAngle).c_str(), + to_string(s.receiverNormal).c_str()); } }; diff --git a/37_HLSLSamplingTests/tests/CProjectedSphericalTriangleTester.h b/37_HLSLSamplingTests/tests/CProjectedSphericalTriangleTester.h index 31f85ba02..0460a30ee 100644 --- a/37_HLSLSamplingTests/tests/CProjectedSphericalTriangleTester.h +++ b/37_HLSLSamplingTests/tests/CProjectedSphericalTriangleTester.h @@ -60,17 +60,19 @@ class CProjectedSphericalTriangleTester final : public ITester; + // UsePdfAsWeight=false so receiverNormal is populated for logSamplerInfo. 
+ using sampler_type = nbl::hlsl::sampling::ProjectedSphericalTriangle; static constexpr uint32_t numConfigurations = 200; static constexpr uint32_t samplesPerConfig = 20000; @@ -117,18 +120,19 @@ struct ProjectedSphericalTrianglePropertyConfig // E[1/pdf] = solidAngle * E[1/bilinearPdf] = solidAngle * 1.0 = solidAngle static float64_t expectedCodomainMeasure(const sampler_type& s) { - return 1.0 / static_cast(s.sphtri.base.rcpSolidAngle); + return 1.0 / static_cast(s.sphtri.rcpSolidAngle); } static void logSamplerInfo(nbl::system::ILogger* logger, const sampler_type& s) { - logTriangleInfo(logger, s.sphtri.base.tri_vertices[0], s.sphtri.base.tri_vertices[1], s.sphtri.vertexC, s.receiverNormal); + logTriangleInfo(logger, s.sphtri.tri_vertices[0], s.sphtri.tri_vertices[1], s.sphtri.APlusC - s.sphtri.tri_vertices[0], s.receiverNormal); } }; struct ProjectedSphericalTriangleGrazingConfig { - using sampler_type = nbl::hlsl::sampling::ProjectedSphericalTriangle; + // UsePdfAsWeight=false so receiverNormal is populated for logSamplerInfo. 
+ using sampler_type = nbl::hlsl::sampling::ProjectedSphericalTriangle; static constexpr uint32_t numConfigurations = 200; static constexpr uint32_t samplesPerConfig = 20000; @@ -169,12 +173,12 @@ struct ProjectedSphericalTriangleGrazingConfig static float64_t expectedCodomainMeasure(const sampler_type& s) { - return 1.0 / static_cast(s.sphtri.base.rcpSolidAngle); + return 1.0 / static_cast(s.sphtri.rcpSolidAngle); } static void logSamplerInfo(nbl::system::ILogger* logger, const sampler_type& s) { - logTriangleInfo(logger, s.sphtri.base.tri_vertices[0], s.sphtri.base.tri_vertices[1], s.sphtri.vertexC, s.receiverNormal); + logTriangleInfo(logger, s.sphtri.tri_vertices[0], s.sphtri.tri_vertices[1], s.sphtri.APlusC - s.sphtri.tri_vertices[0], s.receiverNormal); } }; diff --git a/37_HLSLSamplingTests/tests/CSphericalRectangleTester.h b/37_HLSLSamplingTests/tests/CSphericalRectangleTester.h index 2a6030b78..fa5c93ccb 100644 --- a/37_HLSLSamplingTests/tests/CSphericalRectangleTester.h +++ b/37_HLSLSamplingTests/tests/CSphericalRectangleTester.h @@ -20,17 +20,17 @@ class CSphericalRectangleTester final : public ITester sizeDist(0.5f, 3.0f); std::uniform_real_distribution uDist(0.0f, 1.0f); + nbl::hlsl::shapes::CompressedSphericalRectangle compressed; + nbl::hlsl::float32_t3 observer; + generateRandomRectangle(getRandomEngine(), compressed, observer); + SphericalRectangleInputValues input; - // Observer at origin, rect placed in front (negative Z) so the solid angle is valid. 
- input.observer = nbl::hlsl::float32_t3(0.0f, 0.0f, 0.0f); - const float width = sizeDist(getRandomEngine()); - const float height = sizeDist(getRandomEngine()); - input.rectOrigin = nbl::hlsl::float32_t3(0.0f, 0.0f, -2.0f); - input.right = nbl::hlsl::float32_t3(width, 0.0f, 0.0f); - input.up = nbl::hlsl::float32_t3(0.0f, height, 0.0f); + input.observer = observer; + input.rectOrigin = compressed.origin; + input.right = compressed.right; + input.up = compressed.up; input.u = nbl::hlsl::float32_t2(uDist(getRandomEngine()), uDist(getRandomEngine())); m_inputs.push_back(input); return input; @@ -48,16 +48,21 @@ class CSphericalRectangleTester final : public ITester; + using sampler_type = nbl::hlsl::sampling::SphericalTriangle; static constexpr uint32_t numConfigurations = 500; static constexpr uint32_t samplesPerConfig = 20000; @@ -121,7 +124,7 @@ struct SphericalTrianglePropertyConfig static void logSamplerInfo(nbl::system::ILogger* logger, const sampler_type& s) { - logTriangleInfo(logger, s.base.tri_vertices[0], s.base.tri_vertices[1], s.vertexC); + logTriangleInfo(logger, s.tri_vertices[0], s.tri_vertices[1], s.APlusC - s.tri_vertices[0]); } }; @@ -130,7 +133,7 @@ struct SphericalTrianglePropertyConfig // These stress the C_s great-circle intersection and v-recovery in generateInverse. 
struct SphericalTriangleStressConfig { - using sampler_type = nbl::hlsl::sampling::SphericalTriangle; + using sampler_type = nbl::hlsl::sampling::SphericalTriangle; static constexpr uint32_t numConfigurations = 500; static constexpr uint32_t samplesPerConfig = 20000; @@ -218,7 +221,7 @@ struct SphericalTriangleStressConfig static void logSamplerInfo(nbl::system::ILogger* logger, const sampler_type& s) { - logTriangleInfo(logger, s.base.tri_vertices[0], s.base.tri_vertices[1], s.vertexC); + logTriangleInfo(logger, s.tri_vertices[0], s.tri_vertices[1], s.APlusC - s.tri_vertices[0]); } }; diff --git a/37_HLSLSamplingTests/tests/CUniformHemisphereTester.h b/37_HLSLSamplingTests/tests/CUniformHemisphereTester.h index 29994511f..4f2ae08a4 100644 --- a/37_HLSLSamplingTests/tests/CUniformHemisphereTester.h +++ b/37_HLSLSamplingTests/tests/CUniformHemisphereTester.h @@ -45,7 +45,8 @@ class CUniformHemisphereTester final : public ITester& jacobianStats() +{ + static nbl::core::map s; + return s; +} +} // namespace detail + +inline void logJacobianSkipCounts(nbl::system::ILogger* logger) +{ + auto& stats = detail::jacobianStats(); + if (stats.empty()) + return; + logger->log("Jacobian skip summary (skipped samples are NOT counted as passes):", nbl::system::ILogger::ELL_INFO); + for (const auto& [name, s] : stats) + { + const uint64_t skipped = s.skipUDomain + s.skipCrease + s.skipHemiBoundary + s.skipBwdPdfRange + s.skipCodomainSingularity; + if (skipped == 0) + continue; + const double percentage = s.total ? 
(100.0 * double(skipped) / double(s.total)) : 0.0; + logger->log(" [JacobianSkip] %s: %llu / %llu skipped (%.2f%%) -- u-domain=%llu, crease=%llu, hemi-boundary=%llu, bwd-pdf-range=%llu, codomain-singularity=%llu", + nbl::system::ILogger::ELL_WARNING, + name.c_str(), + skipped, + s.total, + percentage, + s.skipUDomain, + s.skipCrease, + s.skipHemiBoundary, + s.skipBwdPdfRange, + s.skipCodomainSingularity); + } +} + +// Verify a jacobianProduct value OR bin it by reason if it is a skip sentinel (< 0). +// Skipped samples are counted by reason and NEVER counted as a pass. +// Must be called from a method that has access to verifyTestValue. +#define VERIFY_JACOBIAN_OR_SKIP(pass, name, expected, actual, iteration, seed, testType, relTol, absTol) \ + do \ + { \ + auto& _jstats = detail::jacobianStats()[(name)]; \ + ++_jstats.total; \ + const float _jval = (actual); \ + if (_jval < 0.0f) \ + { \ + /* Sentinel values are integers at -1..-5, so round-to-nearest on _jval picks the bin. */ \ + const int _bin = static_cast(-_jval + 0.5f); \ + switch (_bin) \ + { \ + case 1: \ + ++_jstats.skipUDomain; \ + break; \ + case 2: \ + ++_jstats.skipCrease; \ + break; \ + case 3: \ + ++_jstats.skipHemiBoundary; \ + break; \ + case 4: \ + ++_jstats.skipBwdPdfRange; \ + break; \ + case 5: \ + ++_jstats.skipCodomainSingularity; \ + break; \ + default: \ + ++_jstats.skipUDomain; \ + break; /* fall-through bucket */ \ + } \ + } \ + else \ + { \ + pass &= verifyTestValue((name), (expected), _jval, (iteration), (seed), (testType), (relTol), (absTol)); \ + } \ + } while (0) + // Check that each PDF field is positive and finite. // Must be called from within a method that has access to printTestFail. -#define VERIFY_PDFS_POSITIVE(pass, actual, iteration, seed, testType, ...) \ - do \ - { \ - auto _pdfChecks = std::make_tuple(__VA_ARGS__); \ - std::apply([&](const auto&... c) { (([&] { \ +#define VERIFY_PDFS_POSITIVE(pass, actual, iteration, seed, testType, ...) 
\ + do \ + { \ + auto _pdfChecks = std::make_tuple(__VA_ARGS__); \ + std::apply([&](const auto&... c) { (([&] { \ if (!((actual).*c.field > 0.0f) || !std::isfinite((actual).*c.field)) \ - { \ - pass = false; \ - printTestFail(std::string(c.name) + " (positive & finite)", \ - 1.0f, (actual).*c.field, iteration, seed, testType, 0.0, 0.0); \ - } \ - }()), \ - ...); }, _pdfChecks); \ + { \ + pass = false; \ + printTestFail(std::string(c.name) + " (positive & finite)", \ + 1.0f, (actual).*c.field, iteration, seed, testType, 0.0, 0.0); \ + } \ + }()), \ + ...); }, _pdfChecks); \ } while (0) // ============================================================================ @@ -139,7 +235,7 @@ inline float64_t gridIntegratePdf1D(const auto& sampler, uint32_t N = 100000) // 2D grid integration of backwardPdf over [0,1]^2 inline float64_t gridIntegratePdf2D(const auto& sampler, uint32_t N = 1000) { - float64_t sum = 0.0; + float64_t sum = 0.0; const float64_t cellArea = 1.0 / static_cast(N * N); for (uint32_t iy = 0; iy < N; iy++) { @@ -190,17 +286,15 @@ inline void buildTangentFrame(nbl::hlsl::float32_t3 dir, nbl::hlsl::float32_t3& // Generate a small equilateral triangle on the unit sphere around baseDir with given half-angle. // Also generates a random normal with decent projection onto the triangle. 
-inline void generateSmallTriangle(std::mt19937& rng, float halfAngle, - nbl::hlsl::float32_t3& v0, nbl::hlsl::float32_t3& v1, nbl::hlsl::float32_t3& v2, - nbl::hlsl::float32_t3& baseDir, nbl::hlsl::float32_t3& normal) +inline void generateSmallTriangle(std::mt19937& rng, float halfAngle, nbl::hlsl::float32_t3& v0, nbl::hlsl::float32_t3& v1, nbl::hlsl::float32_t3& v2, nbl::hlsl::float32_t3& baseDir, nbl::hlsl::float32_t3& normal) { using namespace nbl::hlsl; baseDir = generateRandomUnitVector(rng); float32_t3 t1, t2; buildTangentFrame(baseDir, t1, t2); - v0 = normalize(baseDir + t1 * halfAngle); - v1 = normalize(baseDir - t1 * (halfAngle * 0.5f) + t2 * (halfAngle * 0.866f)); - v2 = normalize(baseDir - t1 * (halfAngle * 0.5f) - t2 * (halfAngle * 0.866f)); + v0 = normalize(baseDir + t1 * halfAngle); + v1 = normalize(baseDir - t1 * (halfAngle * 0.5f) + t2 * (halfAngle * 0.866f)); + v2 = normalize(baseDir - t1 * (halfAngle * 0.5f) - t2 * (halfAngle * 0.866f)); normal = generateRandomUnitVector(rng); if (dot(normal, baseDir) < 0.1f) normal = normalize(normal + baseDir * 2.0f); @@ -221,10 +315,10 @@ inline void generateStressTriangleVertices(std::mt19937& rng, nbl::hlsl::float32 float32_t3 t1, t2; buildTangentFrame(base, t1, t2); float spread = 0.15f + angleDist(rng) * 0.2f; - v0 = normalize(base + t1 * spread); - v1 = normalize(base - t1 * spread); - float far_ = 0.8f + angleDist(rng) * 0.8f; - v2 = normalize(base * std::cos(far_) + t2 * std::sin(far_)); + v0 = normalize(base + t1 * spread); + v1 = normalize(base - t1 * spread); + float far_ = 0.8f + angleDist(rng) * 0.8f; + v2 = normalize(base * std::cos(far_) + t2 * std::sin(far_)); break; } case 1: // Nearly coplanar @@ -233,12 +327,12 @@ inline void generateStressTriangleVertices(std::mt19937& rng, nbl::hlsl::float32 float32_t3 t1, t2; buildTangentFrame(pole, t1, t2); float offset = 0.05f + angleDist(rng) * 0.1f; - float a1 = angleDist(rng) * 6.2832f; - float a2 = a1 + 0.8f + angleDist(rng); - float a3 = a2 + 0.8f + 
angleDist(rng); - v0 = normalize(t1 * std::cos(a1) + t2 * std::sin(a1) + pole * offset); - v1 = normalize(t1 * std::cos(a2) + t2 * std::sin(a2) - pole * offset * 0.5f); - v2 = normalize(t1 * std::cos(a3) + t2 * std::sin(a3) + pole * offset * 0.3f); + float a1 = angleDist(rng) * 6.2832f; + float a2 = a1 + 0.8f + angleDist(rng); + float a3 = a2 + 0.8f + angleDist(rng); + v0 = normalize(t1 * std::cos(a1) + t2 * std::sin(a1) + pole * offset); + v1 = normalize(t1 * std::cos(a2) + t2 * std::sin(a2) - pole * offset * 0.5f); + v2 = normalize(t1 * std::cos(a3) + t2 * std::sin(a3) + pole * offset * 0.3f); break; } default: // One short edge @@ -247,9 +341,9 @@ inline void generateStressTriangleVertices(std::mt19937& rng, nbl::hlsl::float32 float32_t3 t1, t2; buildTangentFrame(base, t1, t2); float shortAngle = 0.32f + angleDist(rng) * 0.1f; - v0 = normalize(base + t1 * shortAngle * 0.5f); - v1 = normalize(base - t1 * shortAngle * 0.5f); - v2 = normalize(t2 + base * (0.3f + angleDist(rng) * 0.5f)); + v0 = normalize(base + t1 * shortAngle * 0.5f); + v1 = normalize(base - t1 * shortAngle * 0.5f); + v2 = normalize(t2 + base * (0.3f + angleDist(rng) * 0.5f)); break; } } @@ -262,65 +356,114 @@ inline void generateStressTriangleVertices(std::mt19937& rng, nbl::hlsl::float32 inline void makeEquilateralTriangle(float64_t theta, nbl::hlsl::float32_t3 verts[3]) { using namespace nbl::hlsl; - const float32_t st = static_cast(std::sin(theta)); - const float32_t ct = static_cast(std::cos(theta)); + const float32_t st = static_cast(std::sin(theta)); + const float32_t ct = static_cast(std::cos(theta)); constexpr float64_t twoPiOver3 = 2.0 * numbers::pi / 3.0; - verts[0] = float32_t3(st, 0.0f, ct); - verts[1] = float32_t3(static_cast(st * std::cos(twoPiOver3)), + verts[0] = float32_t3(st, 0.0f, ct); + verts[1] = float32_t3(static_cast(st * std::cos(twoPiOver3)), static_cast(st * std::sin(twoPiOver3)), ct); - verts[2] = float32_t3(static_cast(st * std::cos(2.0 * twoPiOver3)), + verts[2] = 
float32_t3(static_cast(st * std::cos(2.0 * twoPiOver3)), static_cast(st * std::sin(2.0 * twoPiOver3)), ct); } -// Monte Carlo estimate of projected solid angle: E[abs(dot(L, normal))] * solidAngle. -// Uses abs() to match the BSDF projected solid angle formula (which uses abs so that -// triangles straddling the horizon contribute positively from both hemispheres). -// Samples L uniformly from the spherical triangle. -inline float64_t mcEstimatePSA(const nbl::hlsl::shapes::SphericalTriangle& shape, nbl::hlsl::float32_t3 normal, uint32_t N, std::mt19937& rng) +// Grid estimate of projected solid angle: mean of abs(dot(L, normal)) over a regular +// [0,1]^2 grid, times solidAngle. Uses abs() to match the BSDF projected solid angle +// formula (triangles/rects straddling the horizon contribute from both hemispheres). +// `N` is the total number of samples; the grid side is ceil(sqrt(N)). Grid integration +// is deterministic and has much lower variance than MC at the same sample count, +// so it's a tighter ground truth for PSA-vs-formula comparisons. 
+inline float64_t gridEstimatePSA(const nbl::hlsl::shapes::SphericalTriangle& shape, nbl::hlsl::float32_t3 normal, uint32_t N) { using namespace nbl::hlsl; - auto sampler = sampling::SphericalTriangle::create(shape); - std::uniform_real_distribution uDist(0.0f, 1.0f); - float64_t sum = 0.0; - for (uint32_t i = 0; i < N; i++) + auto sampler = sampling::SphericalTriangle::create(shape); + const uint32_t gridSide = static_cast(std::ceil(std::sqrt(static_cast(N)))); + const float invSide = 1.0f / static_cast(gridSide); + float64_t sum = 0.0; + for (uint32_t iy = 0; iy < gridSide; iy++) { - float32_t2 u(uDist(rng), uDist(rng)); - typename sampling::SphericalTriangle::cache_type cache; - float32_t3 L = sampler.generate(u, cache); - sum += static_cast(hlsl::abs(dot(normal, L))); + const float uy = (static_cast(iy) + 0.5f) * invSide; + for (uint32_t ix = 0; ix < gridSide; ix++) + { + const float ux = (static_cast(ix) + 0.5f) * invSide; + typename sampling::SphericalTriangle::cache_type cache; + const float32_t3 L = sampler.generate(float32_t2(ux, uy), cache); + sum += static_cast(hlsl::abs(dot(normal, L))); + } } - return sum / static_cast(N) * static_cast(shape.solid_angle); + return sum / static_cast(gridSide * gridSide) * static_cast(shape.solid_angle); } -// Monte Carlo estimate of projected solid angle for a rectangle: E[abs(dot(L, normal))] * solidAngle. -// Uses abs() to match the BSDF projected solid angle formula. -// Samples uniformly from the spherical rectangle, reconstructs world-space direction. -inline float64_t mcEstimatePSA( +// Sampler-independent PSA reference for rectangles. Integrates the projected-solid-angle integral +// PSA = integral over rect surface of |cos(theta_receiver)| * |cos(theta_rect)| / d^2 dA +// on a uniform surface grid in (s, t) in [0, extents.x] x [0, extents.y]. No sampler involved, +// so disagreement with a sampler-derived PSA isolates the sampler / formula. 
+inline float64_t surfaceGridEstimatePSA( const nbl::hlsl::shapes::SphericalRectangle& shape, const nbl::hlsl::float32_t3& observer, const nbl::hlsl::float32_t3& normal, - uint32_t N, std::mt19937& rng) + uint32_t N) +{ + using namespace nbl::hlsl; + const float32_t3 rdir = shape.basis[0]; + const float32_t3 udir = shape.basis[1]; + const float32_t3 rectNormal = shape.basis[2]; + const float32_t width = shape.extents.x; + const float32_t height = shape.extents.y; + const uint32_t gridSide = static_cast(std::ceil(std::sqrt(static_cast(N)))); + const float64_t cellArea = static_cast(width) * static_cast(height) / static_cast(gridSide * gridSide); + float64_t sum = 0.0; + for (uint32_t iy = 0; iy < gridSide; iy++) + { + const float32_t t = (static_cast(iy) + 0.5f) * height / static_cast(gridSide); + for (uint32_t ix = 0; ix < gridSide; ix++) + { + const float32_t s = (static_cast(ix) + 0.5f) * width / static_cast(gridSide); + const float32_t3 worldPt = shape.origin + rdir * s + udir * t; + const float32_t3 toSurf = worldPt - observer; + const float64_t d2 = static_cast(dot(toSurf, toSurf)); + const float64_t d = std::sqrt(d2); + const float32_t3 L = toSurf * static_cast(1.0 / d); + const float64_t cosRx = static_cast(hlsl::abs(dot(normal, L))); + const float64_t cosRt = static_cast(hlsl::abs(dot(rectNormal, L))); + sum += cosRx * cosRt / d2; + } + } + return sum * cellArea; +} + +// Grid estimate of projected solid angle for a rectangle: mean of abs(dot(L, normal)) +// over a regular [0,1]^2 grid, times solidAngle. See the triangle overload above. 
+inline float64_t gridEstimatePSA( + const nbl::hlsl::shapes::SphericalRectangle& shape, + const nbl::hlsl::float32_t3& observer, + const nbl::hlsl::float32_t3& normal, + uint32_t N) { using namespace nbl::hlsl; auto sampler = sampling::SphericalRectangle::create(shape, observer); if (sampler.solidAngle <= 0.0f || !std::isfinite(sampler.solidAngle)) return 0.0; - std::uniform_real_distribution uDist(0.0f, 1.0f); - float64_t sum = 0.0; - for (uint32_t i = 0; i < N; i++) + const uint32_t gridSide = static_cast(std::ceil(std::sqrt(static_cast(N)))); + const float invSide = 1.0f / static_cast(gridSide); + float64_t sum = 0.0; + for (uint32_t iy = 0; iy < gridSide; iy++) { - float32_t2 u(uDist(rng), uDist(rng)); - typename sampling::SphericalRectangle::cache_type cache; - float32_t2 gen = sampler.generateSurfaceOffset(u, cache); - // Reconstruct world-space direction from rectangle offset - float32_t3 worldPt = shape.origin - + shape.basis[0] * gen.x - + shape.basis[1] * gen.y; - float32_t3 L = normalize(worldPt - observer); - sum += static_cast(hlsl::abs(dot(normal, L))); + const float uy = (static_cast(iy) + 0.5f) * invSide; + for (uint32_t ix = 0; ix < gridSide; ix++) + { + const float ux = (static_cast(ix) + 0.5f) * invSide; + typename sampling::SphericalRectangle::cache_type cache; + // `generateLocalBasisXY` returns absolute (xu, yv) on the rectangle surface; subtract r0.xy + // to get the offset-from-r0 that the world-space reconstruction below expects. 
+ const float32_t2 absXY = sampler.generateLocalBasisXY(float32_t2(ux, uy), cache); + const float32_t2 gen = absXY - float32_t2(sampler.r0.x, sampler.r0.y); + const float32_t3 worldPt = shape.origin + shape.basis[0] * gen.x + shape.basis[1] * gen.y; + const float32_t3 L = normalize(worldPt - observer); + sum += static_cast(hlsl::abs(dot(normal, L))); + } } - return sum / static_cast(N) * static_cast(sampler.solidAngle); + return sum / static_cast(gridSide * gridSide) * static_cast(sampler.solidAngle); } // Bundles seed + rng + failCount for randomized property tests. @@ -357,14 +500,18 @@ struct SeededTestContext } }; -// Generic PSA vs MC comparison. -// ConfigGen: void(std::mt19937& rng, uint32_t index, float64_t& formulaPSA, float64_t& mcPSA, InfoLogger& info) -// Must set formulaPSA and mcPSA for config `index`, or set both to 0 to skip. +// Generic PSA vs grid-integration comparison. +// ConfigGen: void(std::mt19937& rng, uint32_t index, float64_t& formulaPSA, float64_t& gridPSA, InfoLogger& info) +// Must set formulaPSA and gridPSA for config `index`, or set both to 0 to skip. // `info` is a callable: void(nbl::system::ILogger*, nbl::system::ILogger::E_LOG_LEVEL) that logs // sampler/shape details for the current config. Called on mismatch. -// When diagnostic=true, failures log at ELL_WARNING instead of ELL_ERROR (non-hard-fail). +// Two-tier tolerance: +// - (relTol, absTol): soft threshold. Exceedance counts as a mismatch. With diagnostic=true +// the run still returns true (known-limitation noise); with diagnostic=false it hard-fails. +// - (hardRelTol, hardAbsTol): egregious threshold. Always hard-fails regardless of diagnostic, +// so a catastrophic regression can't hide inside the warning stream. 
template -inline bool testPSAVersusMonteCarlo( +inline bool testPSAVersusGrid( nbl::system::ILogger* logger, const char* tag, const char* label, @@ -372,49 +519,78 @@ inline bool testPSAVersusMonteCarlo( uint32_t numConfigs, float64_t relTol, float64_t absTol, + float64_t hardRelTol, + float64_t hardAbsTol, bool diagnostic = false) { - const auto failLevel = diagnostic ? nbl::system::ILogger::ELL_WARNING : nbl::system::ILogger::ELL_ERROR; + const auto softFailLevel = diagnostic ? nbl::system::ILogger::ELL_WARNING : nbl::system::ILogger::ELL_ERROR; SeededTestContext ctx; + uint32_t hardFailCount = 0; + uint32_t testedCount = 0; for (uint32_t c = 0; c < numConfigs; c++) { - float64_t formulaPSA = 0.0, mcPSA = 0.0; + float64_t formulaPSA = 0.0, gridPSA = 0.0; std::function logInfo = - [](nbl::system::ILogger*, nbl::system::ILogger::E_LOG_LEVEL) {}; - configGenerator(ctx.rng, c, formulaPSA, mcPSA, logInfo); + [](nbl::system::ILogger*, nbl::system::ILogger::E_LOG_LEVEL) { + }; + configGenerator(ctx.rng, c, formulaPSA, gridPSA, logInfo); - if (mcPSA == 0.0 && formulaPSA == 0.0) + if (gridPSA == 0.0 && formulaPSA == 0.0) continue; + testedCount++; + + const float64_t absErr = std::abs(formulaPSA - gridPSA); + const float64_t relErr = (std::abs(gridPSA) > 1e-10) ? absErr / std::abs(gridPSA) : 0.0; - const float64_t absErr = std::abs(formulaPSA - mcPSA); - const float64_t relErr = (std::abs(mcPSA) > 1e-10) ? absErr / std::abs(mcPSA) : 0.0; + const bool softFail = relErr > relTol && absErr > absTol; + const bool hardFail = relErr > hardRelTol && absErr > hardAbsTol; - if (relErr > relTol && absErr > absTol) + if (softFail) { ctx.failCount++; + if (hardFail) + hardFailCount++; if (ctx.failCount <= 5) { - logger->log(" [%s] %s mismatch: formula=%f expected(MC)=%f relErr=%e absErr=%e config %u", - failLevel, tag, label, formulaPSA, mcPSA, relErr, absErr, c); - logInfo(logger, failLevel); + const auto level = hardFail ? 
nbl::system::ILogger::ELL_ERROR : softFailLevel; + logger->log(" [%s] %s %s: formula=%f expected(grid)=%f relErr=%e absErr=%e config %u", + level, tag, label, hardFail ? "HARD mismatch" : "mismatch", + formulaPSA, gridPSA, relErr, absErr, c); + logInfo(logger, level); } } } + const uint32_t skippedCount = numConfigs - testedCount; + if (ctx.failCount == 0) - logger->log(" [%s] %s PASSED (%u configs, relTol=%e absTol=%e)", - nbl::system::ILogger::ELL_PERFORMANCE, tag, label, numConfigs, relTol, absTol); - else { - logger->log(" [%s] %s FAILED (%u/%u configs exceeded tolerance, relTol=%e absTol=%e)", - failLevel, tag, label, ctx.failCount, numConfigs, relTol, absTol); - if (diagnostic) - logger->log(" [%s] reproduce with seed=%u (diagnostic only, not a hard failure)", - nbl::system::ILogger::ELL_WARNING, tag, ctx.seed); + logger->log(" [%s] %s PASSED (%u tested, %u skipped of %u requested, relTol=%e absTol=%e)", + nbl::system::ILogger::ELL_PERFORMANCE, tag, label, + testedCount, skippedCount, numConfigs, relTol, absTol); + return true; } - return diagnostic ? true : ctx.finalize(logger, tag); + const bool hardFailed = hardFailCount > 0; + const auto summaryLevel = hardFailed ? 
nbl::system::ILogger::ELL_ERROR : softFailLevel; + if (hardFailed) + logger->log(" [%s] %s FAILED (%u/%u exceeded soft tol, %u/%u exceeded HARD tol, %u skipped of %u, hardRelTol=%e hardAbsTol=%e)", + summaryLevel, tag, label, ctx.failCount, testedCount, hardFailCount, testedCount, + skippedCount, numConfigs, hardRelTol, hardAbsTol); + else + logger->log(" [%s] %s FAILED (%u/%u configs exceeded tolerance, %u skipped of %u, relTol=%e absTol=%e)", + summaryLevel, tag, label, ctx.failCount, testedCount, skippedCount, numConfigs, relTol, absTol); + + const bool shouldHardFail = hardFailed || !diagnostic; + if (shouldHardFail) + logger->log(" [%s] reproduce with seed=%u", + nbl::system::ILogger::ELL_ERROR, tag, ctx.seed); + else + logger->log(" [%s] reproduce with seed=%u (diagnostic only, not a hard failure)", + nbl::system::ILogger::ELL_WARNING, tag, ctx.seed); + + return !shouldHardFail; } // ============================================================================ @@ -435,23 +611,21 @@ inline void generateRandomRectangle(std::mt19937& rng, float32_t3 t1, t2; buildTangentFrame(normal, t1, t2); - const float width = sizeDist(rng); + const float width = sizeDist(rng); const float height = sizeDist(rng); - const float dist = distDist(rng); + const float dist = distDist(rng); - observer = float32_t3(offsetDist(rng), offsetDist(rng), offsetDist(rng)); + observer = float32_t3(offsetDist(rng), offsetDist(rng), offsetDist(rng)); compressed.origin = observer - normal * dist + t1 * offsetDist(rng) + t2 * offsetDist(rng); - compressed.right = t1 * width; - compressed.up = t2 * height; + compressed.right = t1 * width; + compressed.up = t2 * height; } // Stress rectangles: ill-conditioned geometries that exercise edge cases. 
// - Extreme aspect ratio (10:1 to 20:1) // - Grazing angle (observer nearly in the rectangle plane) // - Observer near corner (most of the rectangle off to one side) -inline void generateStressRectangle(std::mt19937& rng, - nbl::hlsl::shapes::CompressedSphericalRectangle& compressed, - nbl::hlsl::float32_t3& observer) +inline void generateStressRectangle(std::mt19937& rng, nbl::hlsl::shapes::CompressedSphericalRectangle& compressed, nbl::hlsl::float32_t3& observer) { using namespace nbl::hlsl; std::uniform_real_distribution uDist(0.0f, 1.0f); @@ -464,39 +638,39 @@ inline void generateStressRectangle(std::mt19937& rng, switch (caseDist(rng)) { case 0: // Extreme aspect ratio - { - const float longSide = 3.0f + uDist(rng) * 5.0f; - const float shortSide = 0.1f + uDist(rng) * 0.2f; - const float dist = 1.5f + uDist(rng) * 2.0f; - observer = float32_t3(0.0f, 0.0f, 0.0f); - compressed.origin = -normal * dist - t1 * (longSide * 0.5f) - t2 * (shortSide * 0.5f); - compressed.right = t1 * longSide; - compressed.up = t2 * shortSide; - break; - } + { + const float longSide = 3.0f + uDist(rng) * 5.0f; + const float shortSide = 0.1f + uDist(rng) * 0.2f; + const float dist = 1.5f + uDist(rng) * 2.0f; + observer = float32_t3(0.0f, 0.0f, 0.0f); + compressed.origin = -normal * dist - t1 * (longSide * 0.5f) - t2 * (shortSide * 0.5f); + compressed.right = t1 * longSide; + compressed.up = t2 * shortSide; + break; + } case 1: // Grazing angle (observer nearly in the rectangle plane) - { - const float width = 1.0f + uDist(rng) * 2.0f; - const float height = 1.0f + uDist(rng) * 2.0f; - const float normalDist = 0.05f + uDist(rng) * 0.15f; - const float tangentOffset = 0.5f + uDist(rng) * 1.0f; - observer = float32_t3(0.0f, 0.0f, 0.0f); - compressed.origin = -normal * normalDist + t1 * tangentOffset - t2 * (height * 0.5f); - compressed.right = t1 * width; - compressed.up = t2 * height; - break; - } + { + const float width = 1.0f + uDist(rng) * 2.0f; + const float height = 1.0f + 
uDist(rng) * 2.0f; + const float normalDist = 0.05f + uDist(rng) * 0.15f; + const float tangentOffset = 0.5f + uDist(rng) * 1.0f; + observer = float32_t3(0.0f, 0.0f, 0.0f); + compressed.origin = -normal * normalDist + t1 * tangentOffset - t2 * (height * 0.5f); + compressed.right = t1 * width; + compressed.up = t2 * height; + break; + } default: // Observer near corner - { - const float width = 2.0f + uDist(rng) * 3.0f; - const float height = 2.0f + uDist(rng) * 3.0f; - const float dist = 0.5f + uDist(rng) * 1.0f; - observer = float32_t3(0.0f, 0.0f, 0.0f); - compressed.origin = -normal * dist - t1 * (0.05f + uDist(rng) * 0.1f) - t2 * (0.05f + uDist(rng) * 0.1f); - compressed.right = t1 * width; - compressed.up = t2 * height; - break; - } + { + const float width = 2.0f + uDist(rng) * 3.0f; + const float height = 2.0f + uDist(rng) * 3.0f; + const float dist = 0.5f + uDist(rng) * 1.0f; + observer = float32_t3(0.0f, 0.0f, 0.0f); + compressed.origin = -normal * dist - t1 * (0.05f + uDist(rng) * 0.1f) - t2 * (0.05f + uDist(rng) * 0.1f); + compressed.right = t1 * width; + compressed.up = t2 * height; + break; + } } } @@ -590,10 +764,10 @@ inline void logRectInfo( { using namespace nbl::system; using namespace nbl::hlsl; - const float width = length(compressed.right); - const float height = length(compressed.up); + const float width = length(compressed.right); + const float height = length(compressed.up); const float32_t3 normal = normalize(cross(compressed.right, compressed.up)); - const float dist = length(compressed.origin - observer); + const float dist = length(compressed.origin - observer); logger->log(" origin=%s right=%s up=%s observer=%s", ILogger::ELL_ERROR, to_string(compressed.origin).c_str(), @@ -617,14 +791,14 @@ inline bool anyRectCornerAboveHorizon( const nbl::hlsl::float32_t3& normal) { using namespace nbl::hlsl; - const float32_t3 r0 = mul(shape.basis, shape.origin - observer); + const float32_t3 r0 = mul(shape.basis, shape.origin - observer); const 
float32_t3 localN = mul(shape.basis, normal); - const float32_t3 v0 = normalize(r0); - const float32_t3 v1 = normalize(r0 + float32_t3(shape.extents.x, 0.0f, 0.0f)); - const float32_t3 v2 = normalize(r0 + float32_t3(shape.extents.x, shape.extents.y, 0.0f)); - const float32_t3 v3 = normalize(r0 + float32_t3(0.0f, shape.extents.y, 0.0f)); + const float32_t3 v0 = normalize(r0); + const float32_t3 v1 = normalize(r0 + float32_t3(shape.extents.x, 0.0f, 0.0f)); + const float32_t3 v2 = normalize(r0 + float32_t3(shape.extents.x, shape.extents.y, 0.0f)); + const float32_t3 v3 = normalize(r0 + float32_t3(0.0f, shape.extents.y, 0.0f)); return dot(localN, v0) > 0.0f || dot(localN, v1) > 0.0f || - dot(localN, v2) > 0.0f || dot(localN, v3) > 0.0f; + dot(localN, v2) > 0.0f || dot(localN, v3) > 0.0f; } // True if all rectangle corners have positive NdotL with the given normal. @@ -635,14 +809,14 @@ inline bool allRectCornersAboveHorizon( const nbl::hlsl::float32_t3& normal) { using namespace nbl::hlsl; - const float32_t3 r0 = mul(shape.basis, shape.origin - observer); + const float32_t3 r0 = mul(shape.basis, shape.origin - observer); const float32_t3 localN = mul(shape.basis, normal); - const float32_t3 v0 = normalize(r0); - const float32_t3 v1 = normalize(r0 + float32_t3(shape.extents.x, 0.0f, 0.0f)); - const float32_t3 v2 = normalize(r0 + float32_t3(shape.extents.x, shape.extents.y, 0.0f)); - const float32_t3 v3 = normalize(r0 + float32_t3(0.0f, shape.extents.y, 0.0f)); + const float32_t3 v0 = normalize(r0); + const float32_t3 v1 = normalize(r0 + float32_t3(shape.extents.x, 0.0f, 0.0f)); + const float32_t3 v2 = normalize(r0 + float32_t3(shape.extents.x, shape.extents.y, 0.0f)); + const float32_t3 v3 = normalize(r0 + float32_t3(0.0f, shape.extents.y, 0.0f)); return dot(localN, v0) > 0.0f && dot(localN, v1) > 0.0f && - dot(localN, v2) > 0.0f && dot(localN, v3) > 0.0f; + dot(localN, v2) > 0.0f && dot(localN, v3) > 0.0f; } #endif diff --git 
a/37_HLSLSamplingTests/tests/property/CSamplerPropertyTester.h b/37_HLSLSamplingTests/tests/property/CSamplerPropertyTester.h index cb28b63fc..ecb0f606d 100644 --- a/37_HLSLSamplingTests/tests/property/CSamplerPropertyTester.h +++ b/37_HLSLSamplingTests/tests/property/CSamplerPropertyTester.h @@ -414,6 +414,12 @@ class CSphericalTriangleGenerateTester auto sampler = sampling::SphericalTriangle::create(shape); const float64_t SA = static_cast(shape.solid_angle); + // Float32 solid angle (acos sum - pi) loses precision for small + // triangles due to catastrophic cancellation, making the expected + // sub-solid-angle ratio unreliable as a reference value. + // At SA ~ 0.003, the relative error in float32 solid angles reaches + // ~1-3%, comparable to the half-space counting tolerance. + const bool tinyTriangle = SA < 4e-3; // For each cut: pick a vertex and a point on the opposite edge, // forming a great circle that splits the triangle in two. @@ -482,12 +488,20 @@ class CSphericalTriangleGenerateTester testedCuts++; if (absErr > relTol) { - ctx.failCount++; - if (ctx.failCount <= 5) + if (tinyTriangle) { - m_logger->log("[SphericalTriangle::generate] %s half-space: observed=%f expected=%f absErr=%e (tol=%e) tri %u cut %u", - system::ILogger::ELL_ERROR, label, observedFraction, expectedFraction, absErr, relTol, t, c); - logTriangleInfo(m_logger, v0, v1, v2); + m_logger->log("[SphericalTriangle::generate] %s half-space: observed=%f expected=%f absErr=%e (tol=%e) tri %u cut %u -- solid angle %e too small for float32, especially on GPU", + system::ILogger::ELL_WARNING, label, observedFraction, expectedFraction, absErr, relTol, t, c, SA); + } + else + { + ctx.failCount++; + if (ctx.failCount <= 5) + { + m_logger->log("[SphericalTriangle::generate] %s half-space: observed=%f expected=%f absErr=%e (tol=%e) tri %u cut %u", + system::ILogger::ELL_ERROR, label, observedFraction, expectedFraction, absErr, relTol, t, c); + logTriangleInfo(m_logger, v0, v1, v2); + } } } } @@ 
-504,12 +518,20 @@ class CSphericalTriangleGenerateTester } // ------------------------------------------------------------------------- - // Moment matching: E[dot(generate(u), N)] should equal PSA(N) / SA. + // Moment matching: E[dot(generate(u), N)] should equal signedPSA(N) / SA. // // For a uniform distribution over a spherical triangle: // E[f(L)] = (1/SA) * integral_triangle f(L) dw // - // Choosing f(L) = dot(L, N) gives E[dot(L, N)] = PSA(N) / SA. + // Choosing f(L) = dot(L, N) gives E[dot(L, N)] = signedPSA(N) / SA, + // where signedPSA is the exact signed projected solid angle computed + // via the Kelvin-Stokes theorem: + // signedPSA(N) = 0.5 * sum_edges dot(edgeNormal_i, N) * edgeArcLength_i + // + // Note: shapes::SphericalTriangle::projectedSolidAngle() returns a signed result + // (Kelvin-Stokes signed sum); tests abs() the return to compare against the + // |cos(theta)| (BSDF) PSA integral reference. + // // If generate() has a systematic bias (e.g., concentrating samples // near one vertex), this moment will be wrong for most directions N. // Testing multiple random N per triangle makes it very unlikely that @@ -533,11 +555,34 @@ class CSphericalTriangleGenerateTester auto sampler = sampling::SphericalTriangle::create(shape); const float64_t SA = static_cast(shape.solid_angle); + // Precompute edge normals and arc lengths for the signed PSA formula. + // cross(v_j, v_k) * csc_sides[i] gives outward-pointing edge normals + // only when the vertices are CCW as seen from outside the sphere. + // The sign of the triple product dot(v0, cross(v1, v2)) tells us the + // winding: positive = CCW (outward normals), negative = CW (inward). + const float32_t3 crossBC = hlsl::cross(shape.vertices[1], shape.vertices[2]); + const float64_t windingSign = (hlsl::dot(shape.vertices[0], crossBC) >= 0.0f) ? 
1.0 : -1.0; + const float32_t3 edgeNormals[3] = { + crossBC * shape.csc_sides[0], + hlsl::cross(shape.vertices[2], shape.vertices[0]) * shape.csc_sides[1], + hlsl::cross(shape.vertices[0], shape.vertices[1]) * shape.csc_sides[2] + }; + const float64_t edgeAngles[3] = { + std::acos(static_cast(hlsl::clamp(shape.cos_sides[0], -1.0f, 1.0f))), + std::acos(static_cast(hlsl::clamp(shape.cos_sides[1], -1.0f, 1.0f))), + std::acos(static_cast(hlsl::clamp(shape.cos_sides[2], -1.0f, 1.0f))) + }; + for (uint32_t n = 0; n < numNormals; n++) { float32_t3 N = generateRandomUnitVector(ctx.rng); - const float64_t psa = static_cast(shape.projectedSolidAngle(N)); - const float64_t expected = psa / SA; + + // Signed PSA via Kelvin-Stokes: exact for integral dot(L,N) dOmega + float64_t signedPSA = 0.0; + for (uint32_t e = 0; e < 3; e++) + signedPSA += static_cast(hlsl::dot(edgeNormals[e], N)) * edgeAngles[e]; + signedPSA *= 0.5 * windingSign; + const float64_t expected = signedPSA / SA; float64_t sum = 0.0; std::uniform_real_distribution uDist(0.0f, 1.0f); @@ -546,7 +591,7 @@ class CSphericalTriangleGenerateTester float32_t2 u(uDist(ctx.rng), uDist(ctx.rng)); typename sampling::SphericalTriangle::cache_type cache; float32_t3 L = sampler.generate(u, cache); - sum += static_cast(hlsl::abs(dot(L, N))); + sum += static_cast(dot(L, N)); } const float64_t mcEstimate = sum / static_cast(numSamples); @@ -601,7 +646,7 @@ class CSphericalTriangleGenerateTester if (shape.solid_angle <= 0.0f || !std::isfinite(shape.solid_angle)) continue; - auto sampler = sampling::SphericalTriangle::create(shape); + auto sampler = sampling::SphericalTriangle::create(shape); std::uniform_real_distribution uDist(0.0f, 1.0f); for (uint32_t i = 0; i < samplesPerTriangle; i++) @@ -742,7 +787,7 @@ class CSphericalTriangleGenerateTester // Tests two aspects of projected spherical triangles: // // 1. 
PSA formula accuracy: shapes::SphericalTriangle::projectedSolidAngle -// against Monte Carlo ground truth (PSA = integral_{tri} abs(dot(L,N)) dOmega). +// against grid-integration ground truth (PSA = integral_{tri} abs(dot(L,N)) dOmega). // // 2. PST sampler accuracy: how well ProjectedSphericalTriangle's bilinear // importance sampling approximates the true NdotL distribution, and @@ -767,18 +812,21 @@ class CProjectedSphericalTriangleGeometricTester // when edge normals have mixed signs, even when all vertices are above the horizon. // These tests are diagnostic-only until proper hemisphere clipping is implemented. // TODO: make these hard failures once projectedSolidAngle clips to the hemisphere. - testPSAVersusMonteCarlo("random MC", [](std::mt19937& rng, uint32_t, float32_t3& v0, float32_t3& v1, float32_t3& v2, float32_t3& normal) + // Hard-fail thresholds: relErr > 3.0 AND absErr > 0.3 means the formula is catastrophically + // wrong, not just affected by the known abs()-overcount limitation. Catches regressions that + // would otherwise hide in the warning stream. 
+ pass &= testPSAVersusGrid("random", [](std::mt19937& rng, uint32_t, float32_t3& v0, float32_t3& v1, float32_t3& v2, float32_t3& normal) { generateRandomTriangleVertices(rng, v0, v1, v2); - normal = generateRandomUnitVector(rng); }, 200, 500000, 0.05, 0.01, true); - testPSAVersusMonteCarlo("grazing MC", [](std::mt19937& rng, uint32_t, float32_t3& v0, float32_t3& v1, float32_t3& v2, float32_t3& normal) + normal = generateRandomUnitVector(rng); }, 200, 500000, 0.05, 0.01, 3.0, 0.3, true); + pass &= testPSAVersusGrid("grazing", [](std::mt19937& rng, uint32_t, float32_t3& v0, float32_t3& v1, float32_t3& v2, float32_t3& normal) { generateRandomTriangleVertices(rng, v0, v1, v2); float32_t3 triCenter = normalize(v0 + v1 + v2); float32_t3 tangent, unused; buildTangentFrame(triCenter, tangent, unused); std::uniform_real_distribution grazeDist(0.02f, 0.15f); - normal = normalize(tangent + triCenter * grazeDist(rng)); }, 200, 500000, 0.1, 0.01, true); + normal = normalize(tangent + triCenter * grazeDist(rng)); }, 200, 500000, 0.1, 0.01, 3.0, 0.3, true); // Also diagnostic -- same abs() issue affects small triangles testPSASmallTriangle(); @@ -860,7 +908,7 @@ class CProjectedSphericalTriangleGeometricTester // Known analytic cases bool testPSAKnownCases() { - constexpr float64_t psaOctantMCRelTol = 0.05; + constexpr float64_t psaOctantGridRelTol = 0.05; constexpr float64_t psaSymmetryRelTol = 1e-4; SeededTestContext ctx; @@ -872,51 +920,52 @@ class CProjectedSphericalTriangleGeometricTester // By Kelvin-Stokes / direct integration, PSA = pi/4 for any axis-aligned normal. 
{ auto shape = createSphericalTriangleShape(float32_t3(1, 0, 0), float32_t3(0, 1, 0), float32_t3(0, 0, 1)); - const float64_t psaZ = static_cast(shape.projectedSolidAngle(float32_t3(0, 0, 1))); + const float64_t psaZ = std::abs(static_cast(shape.projectedSolidAngle(float32_t3(0, 0, 1)))); - // MC verification: sample many points uniformly from the octant triangle - const float64_t mcPSA = mcEstimatePSA(shape, float32_t3(0, 0, 1), 1000000, ctx.rng); + // Grid verification: evaluate abs(N.L) over a dense grid on the octant triangle + const float64_t gridPSA = gridEstimatePSA(shape, float32_t3(0, 0, 1), 1000000); - const float64_t formulaVsMC = std::abs(psaZ - mcPSA) / std::abs(mcPSA); - m_logger->log(" [PSA] octant z-normal: formula=%f expected(pi/4)=%f reference=%f relErr=%e", - system::ILogger::ELL_PERFORMANCE, psaZ, nbl::hlsl::numbers::pi / 4.0, mcPSA, formulaVsMC); + const float64_t formulaVsGrid = std::abs(psaZ - gridPSA) / std::abs(gridPSA); + m_logger->log(" [TriPSA] octant z-normal: formula=%f expected(pi/4)=%f reference=%f relErr=%e", + system::ILogger::ELL_PERFORMANCE, psaZ, nbl::hlsl::numbers::pi / 4.0, gridPSA, formulaVsGrid); - if (formulaVsMC > psaOctantMCRelTol) + if (formulaVsGrid > psaOctantGridRelTol) { - m_logger->log(" [PSA] octant z-normal FAILED: formula=%f expected(reference)=%f relErr=%e relTol=%e", - system::ILogger::ELL_ERROR, psaZ, mcPSA, formulaVsMC, psaOctantMCRelTol); + m_logger->log(" [TriPSA] octant z-normal FAILED: formula=%f expected(reference)=%f relErr=%e relTol=%e", + system::ILogger::ELL_ERROR, psaZ, gridPSA, formulaVsGrid, psaOctantGridRelTol); pass = false; } // Same octant, normal = (1,0,0): by symmetry same result as z-normal - const float64_t psaX = static_cast(shape.projectedSolidAngle(float32_t3(1, 0, 0))); + const float64_t psaX = std::abs(static_cast(shape.projectedSolidAngle(float32_t3(1, 0, 0)))); const float64_t relDiff = std::abs(psaZ - psaX) / std::max(psaZ, psaX); - m_logger->log(" [PSA] octant symmetry: psaZ=%f 
psaX=%f relDiff=%e", + m_logger->log(" [TriPSA] octant symmetry: psaZ=%f psaX=%f relDiff=%e", system::ILogger::ELL_PERFORMANCE, psaZ, psaX, relDiff); if (relDiff > psaSymmetryRelTol) { - m_logger->log(" [PSA] octant symmetry FAILED: psaZ=%f psaX=%f relDiff=%e relTol=%e", + m_logger->log(" [TriPSA] octant symmetry FAILED: psaZ=%f psaX=%f relDiff=%e relTol=%e", system::ILogger::ELL_ERROR, psaZ, psaX, relDiff, psaSymmetryRelTol); pass = false; } } if (pass) - m_logger->log(" [PSA] known cases PASSED (octant z-normal vs MC relTol=%e, octant symmetry z vs x relTol=%e)", - system::ILogger::ELL_PERFORMANCE, psaOctantMCRelTol, psaSymmetryRelTol); + m_logger->log(" [TriPSA] known cases PASSED (octant z-normal vs grid relTol=%e, octant symmetry z vs x relTol=%e)", + system::ILogger::ELL_PERFORMANCE, psaOctantGridRelTol, psaSymmetryRelTol); - return ctx.finalize(pass, m_logger, "PSA"); + return ctx.finalize(pass, m_logger, "TriPSA"); } - // Helper: run MC comparison of formulaPSA vs E[dot(L,N)]*SA for a set of triangle configs. + // Helper: run grid-integration comparison of formulaPSA vs PSA reference for a set of triangle configs. // TriConfigGen: void(rng, index, v0, v1, v2, normal) — generates triangle vertices + normal. 
template - bool testPSAVersusMonteCarlo(const char* label, TriConfigGen triConfigGenerator, uint32_t numConfigs, uint32_t mcSamples, float64_t relTol, float64_t absTol, bool diagnostic = false) + bool testPSAVersusGrid(const char* label, TriConfigGen triConfigGenerator, uint32_t numConfigs, uint32_t gridSamples, + float64_t relTol, float64_t absTol, float64_t hardRelTol, float64_t hardAbsTol, bool diagnostic = false) { - return ::testPSAVersusMonteCarlo(m_logger, "PSA", label, - [&](std::mt19937& rng, uint32_t c, float64_t& formulaPSA, float64_t& mcPSA, auto& logInfo) + return ::testPSAVersusGrid(m_logger, "TriPSA", label, + [&](std::mt19937& rng, uint32_t c, float64_t& formulaPSA, float64_t& gridPSA, auto& logInfo) { float32_t3 v0, v1, v2, normal; triConfigGenerator(rng, c, v0, v1, v2, normal); @@ -925,8 +974,8 @@ class CProjectedSphericalTriangleGeometricTester if (shape.solid_angle <= 0.0f || !std::isfinite(shape.solid_angle)) return; - formulaPSA = static_cast(shape.projectedSolidAngle(normal)); - mcPSA = mcEstimatePSA(shape, normal, mcSamples, rng); + formulaPSA = std::abs(static_cast(shape.projectedSolidAngle(normal))); + gridPSA = gridEstimatePSA(shape, normal, gridSamples); logInfo = [=](system::ILogger* logger, system::ILogger::E_LOG_LEVEL level) { using nbl::system::to_string; @@ -935,14 +984,14 @@ class CProjectedSphericalTriangleGeometricTester to_string(normal).c_str(), to_string(shape.solid_angle).c_str()); }; }, - numConfigs, relTol, absTol, diagnostic); + numConfigs, relTol, absTol, hardRelTol, hardAbsTol, diagnostic); } - // Small triangles -- PSA should approach MC ground truth + // Small triangles -- PSA should approach grid ground truth bool testPSASmallTriangle() { constexpr float64_t smallTriMeanRelErrTol = 0.1; - constexpr uint32_t smallTriMCSamples = 100000; + constexpr uint32_t smallTriGridSamples = 100000; SeededTestContext ctx; bool pass = true; @@ -973,27 +1022,27 @@ class CProjectedSphericalTriangleGeometricTester if (shape.solid_angle 
<= 0.0f || !std::isfinite(shape.solid_angle)) continue; - const float64_t formulaPSA = static_cast(shape.projectedSolidAngle(normal)); + const float64_t formulaPSA = std::abs(static_cast(shape.projectedSolidAngle(normal))); const float64_t sa = static_cast(shape.solid_angle); const float64_t centerNdotL = static_cast(dot(normal, baseDir)); if (std::abs(centerNdotL) < 0.1 || sa < 1e-10) continue; - // MC ground truth: E[abs(dot(L, N))] * solidAngle - const float64_t mcPSA = mcEstimatePSA(shape, normal, smallTriMCSamples, ctx.rng); + // Grid ground truth: mean over regular [0,1]^2 grid of abs(dot(L, N)) * solidAngle + const float64_t gridPSA = gridEstimatePSA(shape, normal, smallTriGridSamples); - if (std::abs(mcPSA) < 1e-10) + if (std::abs(gridPSA) < 1e-10) continue; - const float64_t relErr = (formulaPSA - mcPSA) / mcPSA; + const float64_t relErr = (formulaPSA - gridPSA) / gridPSA; sumRelErrPerSize[s] += relErr; validTrials[s]++; } } - m_logger->log(" [PSA] small triangle PSA vs MC (signed relErr, positive=overestimate):", system::ILogger::ELL_PERFORMANCE); + m_logger->log(" [TriPSA] small triangle PSA vs grid (signed relErr, positive=overestimate):", system::ILogger::ELL_PERFORMANCE); for (uint32_t s = 0; s < numSizes; s++) { if (validTrials[s] > 0) @@ -1005,14 +1054,14 @@ class CProjectedSphericalTriangleGeometricTester // Skip halfAngle=0.01 (s==5): float32 solid angle precision collapses if (s == 4 && std::abs(meanRelErr) > smallTriMeanRelErrTol) { - m_logger->log(" [PSA] small triangle exceeded tolerance at halfAngle=%.3f meanRelErr=%+e meanRelErrTol=%e (%u trials)", + m_logger->log(" [TriPSA] small triangle exceeded tolerance at halfAngle=%.3f meanRelErr=%+e meanRelErrTol=%e (%u trials)", system::ILogger::ELL_WARNING, halfAngles[s], meanRelErr, smallTriMeanRelErrTol, validTrials[s]); } } } - m_logger->log(" [PSA] small triangle test complete (%u trials across %u sizes, %u MC samples each, meanRelErrTol=%e) -- diagnostic only", - 
system::ILogger::ELL_PERFORMANCE, numTrials, numSizes, smallTriMCSamples, smallTriMeanRelErrTol); + m_logger->log(" [TriPSA] small triangle test complete (%u trials across %u sizes, %u grid samples each, meanRelErrTol=%e) -- diagnostic only", + system::ILogger::ELL_PERFORMANCE, numTrials, numSizes, smallTriGridSamples, smallTriMeanRelErrTol); return true; // diagnostic only -- abs()-based PSA overestimates, not a hard failure } @@ -1076,7 +1125,7 @@ class CProjectedSphericalTriangleGeometricTester if (!std::isfinite(sampler.sphtri.rcpSolidAngle) || sampler.sphtri.rcpSolidAngle <= 0.0f) continue; - const float64_t projSA = static_cast(shape.projectedSolidAngle(cfg.normal)); + const float64_t projSA = std::abs(static_cast(shape.projectedSolidAngle(cfg.normal))); const bool hasPSA = projSA > 0.0 && std::isfinite(projSA); const float64_t rcpPSA = hasPSA ? 1.0 / projSA : 0.0; MISStats& mis = isGrazing ? grazingMIS : normalMIS; @@ -1090,7 +1139,7 @@ class CProjectedSphericalTriangleGeometricTester float32_t3 L = sampler.generate(u, cache); const float64_t trueNdotL = std::max(0.0, static_cast(dot(cfg.normal, L))); - const float64_t bilinearNdotL = static_cast(cache.abs_cos_theta); + const float64_t bilinearNdotL = std::numeric_limits::quiet_NaN(); const float64_t pstPdf = static_cast(sampler.forwardPdf(u, cache)); // Bilinear vs true NdotL @@ -1323,7 +1372,7 @@ class CProjectedSphericalTriangleGeometricTester continue; auto sampler = createSampler(cfg); - const float64_t projSA = static_cast(shape.projectedSolidAngle(cfg.normal)); + const float64_t projSA = std::abs(static_cast(shape.projectedSolidAngle(cfg.normal))); if (projSA <= 0.0 || !std::isfinite(projSA) || !std::isfinite(sampler.sphtri.rcpSolidAngle) || sampler.sphtri.rcpSolidAngle <= 0.0f) @@ -1344,7 +1393,11 @@ class CProjectedSphericalTriangleGeometricTester if (trueNdotL < 1e-6) continue; - const float64_t pstPdf = static_cast(sampler.backwardPdf(L)); + // No direct backwardPdf; evaluate forwardPdf at the 
inverted u to recover pdf(L). + const float32_t2 uInv = sampler.sphtri.generateInverse(L); + typename sampling::ProjectedSphericalTriangle::cache_type pdfCache; + sampler.generate(uInv, pdfCache); + const float64_t pstPdf = static_cast(sampler.forwardPdf(uInv, pdfCache)); const float64_t idealPdf = trueNdotL * rcpPSA; if (!std::isfinite(pstPdf) || pstPdf <= 0.0 || idealPdf <= 0.0) @@ -1416,6 +1469,15 @@ struct UniformRectSamplerPolicy return sampler_type::create(shape, observer); } + // Returns offset-from-r0 on the rectangle surface. Goes through generateLocalBasisXY + // (absolute xy) and subtracts r0.xy so the [0, extents] bounds check still applies. + static float32_t2 generateOffset(sampler_type& s, const float32_t2& u) + { + typename sampler_type::cache_type cache; + const float32_t2 absXY = s.generateLocalBasisXY(u, cache); + return absXY - float32_t2(s.r0.x, s.r0.y); + } + static float getSolidAngle(const sampler_type& s) { return s.solidAngle; } static const char* name() { return "SphericalRectangle"; } @@ -1425,7 +1487,8 @@ struct UniformRectSamplerPolicy struct ProjectedRectSamplerPolicy { - using sampler_type = sampling::ProjectedSphericalRectangle; + // UsePdfAsWeight=false so receiverNormal and projSolidAngle are populated for diagnostic logs. + using sampler_type = sampling::ProjectedSphericalRectangle; static sampler_type createSampler(shapes::SphericalRectangle& shape, const float32_t3& observer, std::mt19937& rng) @@ -1439,6 +1502,17 @@ struct ProjectedRectSamplerPolicy return sampler_type::create(shape, observer, receiverNormal, false); } + // Run u through the bilinear warp then the inner sphrect's generateLocalBasisXY, and subtract + // r0.xy to get offset-from-r0 on the rectangle surface. 
+ static float32_t2 generateOffset(sampler_type& s, const float32_t2& u) + { + typename sampling::Bilinear::cache_type bc; + const float32_t2 warped = s.bilinearPatch.generate(u, bc); + typename sampling::SphericalRectangle::cache_type sphrectCache; + const float32_t2 absXY = s.sphrect.generateLocalBasisXY(warped, sphrectCache); + return absXY - float32_t2(s.sphrect.r0.x, s.sphrect.r0.y); + } + static float getSolidAngle(const sampler_type& s) { return s.sphrect.solidAngle; } static const char* name() { return "ProjectedSphericalRectangle"; } @@ -1635,8 +1709,7 @@ class CRectangleGenerateTester for (uint32_t i = 0; i < numSamples; i++) { float32_t2 u(uDist(ctx.rng), uDist(ctx.rng)); - typename sampler_type::cache_type cache; - float32_t2 gen = sampler.generateSurfaceOffset(u, cache); + float32_t2 gen = Policy::generateOffset(sampler, u); const float coord = cutAlongX ? gen.x : gen.y; if (coord < cutThreshold) countInSub++; @@ -1714,8 +1787,7 @@ class CRectangleGenerateTester for (uint32_t i = 0; i < numSamples; i++) { float32_t2 u(uDist(ctx.rng), uDist(ctx.rng)); - typename sampler_type::cache_type cache; - float32_t2 gen = sampler.generateSurfaceOffset(u, cache); + float32_t2 gen = Policy::generateOffset(sampler, u); float32_t3 dir = reconstructDirection(compressed, shape.extents, observer, gen); sum += static_cast(dot(dir, N)); } @@ -1778,8 +1850,7 @@ class CRectangleGenerateTester for (uint32_t i = 0; i < numSamples; i++) { float32_t2 u(uDist(ctx.rng), uDist(ctx.rng)); - typename sampler_type::cache_type cache; - float32_t2 gen = sampler.generateSurfaceOffset(u, cache); + float32_t2 gen = Policy::generateOffset(sampler, u); if (gen.x < -1e-5f || gen.x > extX + 1e-5f || gen.y < -1e-5f || gen.y > extY + 1e-5f) { @@ -1891,9 +1962,9 @@ using CProjectedSphericalRectangleGenerateTester = CRectangleGenerateTester 3.0 AND absErr > 0.3) still catch catastrophic regressions. 
+ bool pass = true; + pass &= testPSAVersusGrid("random", generateRandomRectangle, 200, 500000, 0.05, 0.01, 3.0, 0.3); + pass &= testPSAVersusGrid("grazing", generateStressRectangle, 200, 500000, 0.1, 0.01, 3.0, 0.3); + return pass; } private: // Reuse rectangle generators from CRectangleGenerateTester using RectGen = void(*)(std::mt19937&, shapes::CompressedSphericalRectangle&, float32_t3&); - bool testPSAVersusMonteCarlo(const char* label, RectGen rectGen, uint32_t numConfigs, uint32_t mcSamples, float64_t relTol, float64_t absTol) + bool testPSAVersusGrid(const char* label, RectGen rectGen, uint32_t numConfigs, uint32_t gridSamples, + float64_t relTol, float64_t absTol, float64_t hardRelTol, float64_t hardAbsTol) { - return ::testPSAVersusMonteCarlo(m_logger, "RectPSA", label, - [&](std::mt19937& rng, uint32_t, float64_t& formulaPSA, float64_t& mcPSA, auto& logInfo) + return ::testPSAVersusGrid(m_logger, "RectPSA", label, + [&](std::mt19937& rng, uint32_t, float64_t& formulaPSA, float64_t& gridPSA, auto& logInfo) { shapes::CompressedSphericalRectangle compressed; float32_t3 observer; @@ -1932,7 +2006,9 @@ class CProjectedSphericalRectangleGeometricTester float32_t3 normal = generateRandomUnitVector(rng); formulaPSA = static_cast(shape.projectedSolidAngle(observer, normal)); - mcPSA = mcEstimatePSA(shape, observer, normal, mcSamples, rng); + // surfaceGridEstimatePSA integrates over the rectangle surface directly (no sampler in + // the loop), so a formula-vs-reference mismatch here isolates the PSA formula. 
+ gridPSA = surfaceGridEstimatePSA(shape, observer, normal, gridSamples); logInfo = [compressed, observer, normal, saValue = sa.value](system::ILogger* logger, system::ILogger::E_LOG_LEVEL level) { using nbl::system::to_string; @@ -1945,7 +2021,7 @@ class CProjectedSphericalRectangleGeometricTester to_string(saValue).c_str()); }; }, - numConfigs, relTol, absTol, true); + numConfigs, relTol, absTol, hardRelTol, hardAbsTol, true); } system::ILogger* m_logger; From a4559b941a9d0f465ccc8687630077e045829403 Mon Sep 17 00:00:00 2001 From: Karim Mohamed Date: Fri, 24 Apr 2026 21:22:28 +0300 Subject: [PATCH 21/26] alias table is packed, 2 versions, consolidated WORKGROUP_SIZE for tests and benchmarks, example 37 and 64 now use a single command buffer for benchmarks --- 37_HLSLSamplingTests/CMakeLists.txt | 31 +- .../app_resources/common/alias_table.hlsl | 102 ++- .../common/discrete_sampler_bench.hlsl | 14 +- .../common/spherical_triangle.hlsl | 2 +- .../shaders/alias_table_test.comp.hlsl | 74 --- .../shaders/bilinear_test.comp.hlsl | 4 - .../box_muller_transform_test.comp.hlsl | 4 - .../shaders/concentric_mapping_test.comp.hlsl | 4 - .../cumulative_probability_test.comp.hlsl | 15 +- .../shaders/linear_test.comp.hlsl | 4 - .../shaders/packed_alias_test.comp.hlsl | 114 ++++ .../shaders/polar_mapping_test.comp.hlsl | 4 - .../projected_hemisphere_test.comp.hlsl | 4 - .../shaders/projected_sphere_test.comp.hlsl | 4 - ...ojected_spherical_rectangle_test.comp.hlsl | 6 +- ...rojected_spherical_triangle_test.comp.hlsl | 4 - .../spherical_rectangle_test.comp.hlsl | 6 +- .../shaders/spherical_triangle.comp.hlsl | 4 +- .../shaders/test_compile.comp.hlsl | 10 - .../shaders/uniform_hemisphere_test.comp.hlsl | 4 - .../shaders/uniform_sphere_test.comp.hlsl | 4 - .../benchmarks/CDiscreteSamplerBenchmark.h | 386 +++++------ .../benchmarks/CSamplerBenchmark.h | 120 ++-- 37_HLSLSamplingTests/main.cpp | 189 +++--- .../tests/CAliasTableGPUTester.h | 52 +- 
37_HLSLSamplingTests/tests/CBilinearTester.h | 2 +- .../tests/CBoxMullerTransformTester.h | 2 +- .../tests/CConcentricMappingTester.h | 2 +- .../tests/CCumulativeProbabilityGPUTester.h | 2 +- .../tests/CDiscreteTableTester.h | 608 +++++++++++------- 37_HLSLSamplingTests/tests/CLinearTester.h | 2 +- .../tests/CPolarMappingTester.h | 2 +- .../tests/CProjectedHemisphereTester.h | 2 +- .../tests/CProjectedSphereTester.h | 2 +- .../CProjectedSphericalRectangleTester.h | 2 +- .../tests/CProjectedSphericalTriangleTester.h | 2 +- .../tests/CSphericalRectangleTester.h | 2 +- .../tests/CSphericalTriangleTester.h | 2 +- .../tests/CUniformHemisphereTester.h | 2 +- .../tests/CUniformSphereTester.h | 2 +- 64_EmulatedFloatTest/main.cpp | 123 +--- 41 files changed, 1031 insertions(+), 893 deletions(-) delete mode 100644 37_HLSLSamplingTests/app_resources/shaders/alias_table_test.comp.hlsl create mode 100644 37_HLSLSamplingTests/app_resources/shaders/packed_alias_test.comp.hlsl diff --git a/37_HLSLSamplingTests/CMakeLists.txt b/37_HLSLSamplingTests/CMakeLists.txt index 12cbb5bb1..e50fe4663 100644 --- a/37_HLSLSamplingTests/CMakeLists.txt +++ b/37_HLSLSamplingTests/CMakeLists.txt @@ -26,7 +26,7 @@ set(DEPENDS app_resources/shaders/projected_spherical_triangle_test.comp.hlsl app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl app_resources/shaders/spherical_rectangle_test.comp.hlsl - app_resources/shaders/alias_table_test.comp.hlsl + app_resources/shaders/packed_alias_test.comp.hlsl app_resources/shaders/cumulative_probability_test.comp.hlsl app_resources/common/linear.hlsl app_resources/common/uniform_hemisphere.hlsl @@ -91,7 +91,7 @@ endif() set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/auto-gen") -set(BENCH_ITERS 2048) +set(BENCH_ITERS 128) set(WORKGROUP_SIZE 64) target_compile_definitions(${EXECUTABLE_NAME} PRIVATE @@ -99,7 +99,7 @@ target_compile_definitions(${EXECUTABLE_NAME} PRIVATE WORKGROUP_SIZE=${WORKGROUP_SIZE} ) -set(BENCH_OPTS 
"\"-DBENCH_ITERS=${BENCH_ITERS}\", \"-DWORKGROUP_SIZE=${WORKGROUP_SIZE}\"") +set(BENCH_OPTS "\"-DBENCH_ITERS=${BENCH_ITERS}\"") set(JSON " [ @@ -340,14 +340,24 @@ set(JSON " \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DBENCH_CREATE_ONLY\", \"-DBENCH_VARIANT_R0_EXTENTS\"] }, { - \"INPUT\": \"app_resources/shaders/alias_table_test.comp.hlsl\", - \"KEY\": \"alias_table_test\" + \"INPUT\": \"app_resources/shaders/packed_alias_test.comp.hlsl\", + \"KEY\": \"packed_alias_a_test\" }, { - \"INPUT\": \"app_resources/shaders/alias_table_test.comp.hlsl\", - \"KEY\": \"alias_table_bench\", + \"INPUT\": \"app_resources/shaders/packed_alias_test.comp.hlsl\", + \"KEY\": \"packed_alias_b_test\", + \"COMPILE_OPTIONS\": [\"-DNBL_PACKED_ALIAS_B\"] + }, + { + \"INPUT\": \"app_resources/shaders/packed_alias_test.comp.hlsl\", + \"KEY\": \"packed_alias_a_bench\", \"COMPILE_OPTIONS\": [${BENCH_OPTS}] }, + { + \"INPUT\": \"app_resources/shaders/packed_alias_test.comp.hlsl\", + \"KEY\": \"packed_alias_b_bench\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DNBL_PACKED_ALIAS_B\"] + }, { \"INPUT\": \"app_resources/shaders/cumulative_probability_test.comp.hlsl\", \"KEY\": \"cumulative_probability_test\" @@ -361,6 +371,11 @@ set(JSON " \"INPUT\": \"app_resources/shaders/cumulative_probability_test.comp.hlsl\", \"KEY\": \"cumulative_probability_yolo_bench\", \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DNBL_CUMPROB_YOLO_READS\"] + }, + { + \"INPUT\": \"app_resources/shaders/cumulative_probability_test.comp.hlsl\", + \"KEY\": \"cumulative_probability_eytzinger_bench\", + \"COMPILE_OPTIONS\": [${BENCH_OPTS}, \"-DNBL_CUMPROB_EYTZINGER\"] } ] ") @@ -370,7 +385,7 @@ NBL_CREATE_NSC_COMPILE_RULES( LINK_TO ${EXECUTABLE_NAME} BINARY_DIR ${OUTPUT_DIRECTORY} MOUNT_POINT_DEFINE NBL_THIS_EXAMPLE_BUILD_MOUNT_POINT - COMMON_OPTIONS -I ${CMAKE_CURRENT_SOURCE_DIR} -T cs_6_8 + COMMON_OPTIONS -I ${CMAKE_CURRENT_SOURCE_DIR} -T cs_6_8 -DWORKGROUP_SIZE=${WORKGROUP_SIZE} OUTPUT_VAR KEYS INCLUDE 
nbl/this_example/builtin/build/spirv/keys.hpp NAMESPACE nbl::this_example::builtin::build diff --git a/37_HLSLSamplingTests/app_resources/common/alias_table.hlsl b/37_HLSLSamplingTests/app_resources/common/alias_table.hlsl index bb1ed54ef..08706408f 100644 --- a/37_HLSLSamplingTests/app_resources/common/alias_table.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/alias_table.hlsl @@ -8,12 +8,28 @@ using namespace nbl::hlsl; NBL_CONSTEXPR uint32_t AliasTestTableSize = 4; +// Log2N = ceil_log2(N) minimises quantisation drift on the stayProb unorm +// (here 30 unorm bits, essentially lossless). +NBL_CONSTEXPR uint32_t AliasTestLog2N = 2; -using AliasTestProbAccessor = ArrayAccessor; -using AliasTestAliasAccessor = ArrayAccessor; -using AliasTestPdfAccessor = ArrayAccessor; +using AliasTestPdfAccessor = ArrayAccessor; +using AliasTestPackedWordAccessor = ArrayAccessor; -using AliasTestSampler = sampling::AliasTable; +// Dedicated struct-valued accessor for PackedAliasEntryB. Field-wise copy +// sidesteps HLSL's struct functional-cast ambiguity. 
+struct AliasTestEntryBAccessor +{ + using value_type = sampling::PackedAliasEntryB; + + template + void get(I i, NBL_REF_ARG(V) val) NBL_CONST_MEMBER_FUNC + { + val.packedWord = data[i].packedWord; + val.ownPdf = data[i].ownPdf; + } + + value_type data[AliasTestTableSize]; +}; struct AliasTableInputValues { @@ -22,7 +38,7 @@ struct AliasTableInputValues struct AliasTableTestResults { - uint32_t generatedIndex; + uint32_t generatedIndex; float32_t forwardPdf; float32_t backwardPdf; float32_t forwardWeight; @@ -31,24 +47,55 @@ struct AliasTableTestResults }; // Pre-computed alias table for weights {1, 2, 3, 4}: -// pdf = {0.1, 0.2, 0.3, 0.4} -// prob = {0.4, 0.8, 1.0, 0.8} -// alias = {3, 3, 2, 2} -struct AliasTableTestExecutor +// pdf = {0.1, 0.2, 0.3, 0.4} +// stayProb = {0.4, 0.8, 1.0, 0.8} +// alias = {3, 3, 2, 2} +// +// Log2N = 2 unorm encoding (30 bits for stayProb, 2 bits for alias): +// packedWord = (alias & 0x3) | (round(stayProb * ((1u<<30) - 1)) << 2) +// bin 0: (3) | (429496729 << 2) = 0x66666667 +// bin 1: (3) | (858993458 << 2) = 0xCCCCCCCB +// bin 2: (2) | (1073741823 << 2) = 0xFFFFFFFE +// bin 3: (2) | (858993458 << 2) = 0xCCCCCCCA + +struct PackedAliasATestExecutor +{ + void operator()(NBL_CONST_REF_ARG(AliasTableInputValues) input, NBL_REF_ARG(AliasTableTestResults) output) + { + AliasTestPackedWordAccessor wordAcc; + wordAcc.data[0] = 0x66666667u; + wordAcc.data[1] = 0xCCCCCCCBu; + wordAcc.data[2] = 0xFFFFFFFEu; + wordAcc.data[3] = 0xCCCCCCCAu; + + AliasTestPdfAccessor pdfAcc; + pdfAcc.data[0] = 0.1f; + pdfAcc.data[1] = 0.2f; + pdfAcc.data[2] = 0.3f; + pdfAcc.data[3] = 0.4f; + + using Sampler = sampling::PackedAliasTableA; + Sampler sampler = Sampler::create(wordAcc, pdfAcc, AliasTestTableSize); + + Sampler::cache_type cache; + output.generatedIndex = sampler.generate(input.u, cache); + output.forwardPdf = sampler.forwardPdf(input.u, cache); + output.backwardPdf = sampler.backwardPdf(output.generatedIndex); + output.forwardWeight = 
sampler.forwardWeight(input.u, cache); + output.backwardWeight = sampler.backwardWeight(output.generatedIndex); + output.jacobianProduct = (float32_t(1.0) / output.forwardPdf) * output.backwardPdf; + } +}; + +struct PackedAliasBTestExecutor { void operator()(NBL_CONST_REF_ARG(AliasTableInputValues) input, NBL_REF_ARG(AliasTableTestResults) output) { - AliasTestProbAccessor probAcc; - probAcc.data[0] = 0.4f; - probAcc.data[1] = 0.8f; - probAcc.data[2] = 1.0f; - probAcc.data[3] = 0.8f; - - AliasTestAliasAccessor aliasAcc; - aliasAcc.data[0] = 3u; - aliasAcc.data[1] = 3u; - aliasAcc.data[2] = 2u; - aliasAcc.data[3] = 2u; + AliasTestEntryBAccessor entryAcc; + entryAcc.data[0].packedWord = 0x66666667u; entryAcc.data[0].ownPdf = 0.1f; + entryAcc.data[1].packedWord = 0xCCCCCCCBu; entryAcc.data[1].ownPdf = 0.2f; + entryAcc.data[2].packedWord = 0xFFFFFFFEu; entryAcc.data[2].ownPdf = 0.3f; + entryAcc.data[3].packedWord = 0xCCCCCCCAu; entryAcc.data[3].ownPdf = 0.4f; AliasTestPdfAccessor pdfAcc; pdfAcc.data[0] = 0.1f; @@ -56,14 +103,15 @@ struct AliasTableTestExecutor pdfAcc.data[2] = 0.3f; pdfAcc.data[3] = 0.4f; - AliasTestSampler sampler = AliasTestSampler::create(probAcc, aliasAcc, pdfAcc, AliasTestTableSize); + using Sampler = sampling::PackedAliasTableB; + Sampler sampler = Sampler::create(entryAcc, pdfAcc, AliasTestTableSize); - AliasTestSampler::cache_type cache; - output.generatedIndex = sampler.generate(input.u, cache); - output.forwardPdf = sampler.forwardPdf(input.u, cache); - output.backwardPdf = sampler.backwardPdf(output.generatedIndex); - output.forwardWeight = sampler.forwardWeight(input.u, cache); - output.backwardWeight = sampler.backwardWeight(output.generatedIndex); + Sampler::cache_type cache; + output.generatedIndex = sampler.generate(input.u, cache); + output.forwardPdf = sampler.forwardPdf(input.u, cache); + output.backwardPdf = sampler.backwardPdf(output.generatedIndex); + output.forwardWeight = sampler.forwardWeight(input.u, cache); + 
output.backwardWeight = sampler.backwardWeight(output.generatedIndex); output.jacobianProduct = (float32_t(1.0) / output.forwardPdf) * output.backwardPdf; } }; diff --git a/37_HLSLSamplingTests/app_resources/common/discrete_sampler_bench.hlsl b/37_HLSLSamplingTests/app_resources/common/discrete_sampler_bench.hlsl index d5c1d313c..198b72faf 100644 --- a/37_HLSLSamplingTests/app_resources/common/discrete_sampler_bench.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/discrete_sampler_bench.hlsl @@ -7,18 +7,20 @@ using namespace nbl::hlsl; NBL_CONSTEXPR uint32_t WorkgroupSize = WORKGROUP_SIZE; -struct AliasTablePushConstants +struct CumProbPushConstants { - uint64_t probAddress; // float probability[N] - uint64_t aliasAddress; // uint32_t alias[N] - uint64_t pdfAddress; // float pdf[N] + uint64_t cumProbAddress; // float cumProb[N-1] uint64_t outputAddress; // uint32_t acc[threadCount] uint32_t tableSize; // N }; -struct CumProbPushConstants +// Variants A and B both take the entry array plus a separate pdf[] array +// (A: 4 B words, B: 8 B {packedWord, ownPdf}; pdf[] has the same contents in +// both but is tapped independently by the sampler). 
+struct PackedAliasABPushConstants { - uint64_t cumProbAddress; // float cumProb[N-1] + uint64_t entriesAddress; // A: uint32_t words[N] (4 B); B: PackedAliasEntryB[N] (8 B) + uint64_t pdfAddress; // float pdf[N] uint64_t outputAddress; // uint32_t acc[threadCount] uint32_t tableSize; // N }; diff --git a/37_HLSLSamplingTests/app_resources/common/spherical_triangle.hlsl b/37_HLSLSamplingTests/app_resources/common/spherical_triangle.hlsl index 1828139d4..d3cd09326 100644 --- a/37_HLSLSamplingTests/app_resources/common/spherical_triangle.hlsl +++ b/37_HLSLSamplingTests/app_resources/common/spherical_triangle.hlsl @@ -59,7 +59,7 @@ struct SphericalTriangleTestExecutor output.backwardWeight = sampler.backwardWeight(output.generated); } // Roundtrip error: ||u - u'|| - output.roundtripError = nbl::hlsl::abs(input.u - output.inverted);. + output.roundtripError = nbl::hlsl::abs(input.u - output.inverted); output.jacobianProduct = computeJacobianProduct(sampler, input.u, 1e-3f, 20.0f); // Domain preservation: diff --git a/37_HLSLSamplingTests/app_resources/shaders/alias_table_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/alias_table_test.comp.hlsl deleted file mode 100644 index 67047f997..000000000 --- a/37_HLSLSamplingTests/app_resources/shaders/alias_table_test.comp.hlsl +++ /dev/null @@ -1,74 +0,0 @@ -#pragma shader_stage(compute) - -#include - -#ifdef BENCH_ITERS -#include "../common/discrete_sampler_bench.hlsl" -#include - -[[vk::push_constant]] AliasTablePushConstants pc; - -struct BdaProbabilityAccessor -{ - template && is_integral_v) - void get(I i, NBL_REF_ARG(V) val) { val = vk::RawBufferLoad(addr + uint64_t(sizeof(V)) * uint64_t(i)); } - uint64_t addr; -}; - -struct BdaAliasIndexAccessor -{ - template && is_integral_v) - void get(I i, NBL_REF_ARG(V) val) { val = vk::RawBufferLoad(addr + uint64_t(sizeof(V)) * uint64_t(i)); } - uint64_t addr; -}; - -struct BdaPdfAccessor -{ - template && is_integral_v) - void get(I i, NBL_REF_ARG(V) val) { val = 
vk::RawBufferLoad(addr + uint64_t(sizeof(V)) * uint64_t(i)); } - uint64_t addr; -}; - -using BenchAliasTable = sampling::AliasTable; -#else -#include "../common/alias_table.hlsl" - -[[vk::binding(0, 0)]] RWStructuredBuffer inputTestValues; -[[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; -#endif - -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 -#endif -[numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] -void main() -{ - const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; - -#ifdef BENCH_ITERS - BdaProbabilityAccessor probAcc; - probAcc.addr = pc.probAddress; - BdaAliasIndexAccessor aliasAcc; - aliasAcc.addr = pc.aliasAddress; - BdaPdfAccessor pdfAcc; - pdfAcc.addr = pc.pdfAddress; - BenchAliasTable sampler = BenchAliasTable::create(probAcc, aliasAcc, pdfAcc, pc.tableSize); - - float32_t xi = float32_t(nbl::hlsl::glsl::bitfieldReverse(invID)) / float32_t(~0u); - NBL_CONSTEXPR float32_t goldenRatio = 0.6180339887498949f; - uint32_t acc = 0u; - for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) - { - xi = frac(xi + goldenRatio); - BenchAliasTable::cache_type cache; - uint32_t generated = sampler.generate(xi, cache); - acc ^= generated ^ asuint(sampler.forwardPdf(xi, cache)); - } - - vk::RawBufferStore(pc.outputAddress + uint64_t(sizeof(uint32_t)) * uint64_t(invID), acc); -#else - AliasTableTestExecutor executor; - executor(inputTestValues[invID], outputTestValues[invID]); -#endif -} diff --git a/37_HLSLSamplingTests/app_resources/shaders/bilinear_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/bilinear_test.comp.hlsl index 03ac7b36a..438eea31e 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/bilinear_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/bilinear_test.comp.hlsl @@ -15,11 +15,7 @@ #define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 -#endif [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void main() { const uint32_t invID = 
nbl::hlsl::glsl::gl_GlobalInvocationID().x; diff --git a/37_HLSLSamplingTests/app_resources/shaders/box_muller_transform_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/box_muller_transform_test.comp.hlsl index 6189d4658..1fb5f6644 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/box_muller_transform_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/box_muller_transform_test.comp.hlsl @@ -15,11 +15,7 @@ #define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 -#endif [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; diff --git a/37_HLSLSamplingTests/app_resources/shaders/concentric_mapping_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/concentric_mapping_test.comp.hlsl index 649c323b2..2a7f1861e 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/concentric_mapping_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/concentric_mapping_test.comp.hlsl @@ -15,11 +15,7 @@ #define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 -#endif [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; diff --git a/37_HLSLSamplingTests/app_resources/shaders/cumulative_probability_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/cumulative_probability_test.comp.hlsl index 1091ee447..f06613b49 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/cumulative_probability_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/cumulative_probability_test.comp.hlsl @@ -12,13 +12,18 @@ struct BdaCumProbAccessor { using value_type = float32_t; template - void get(I i, NBL_REF_ARG(V) val) NBL_CONST_MEMBER_FUNC { val = V(vk::RawBufferLoad(addr + uint64_t(sizeof(value_type)) * uint64_t(i))); } - value_type operator[](uint32_t i) NBL_CONST_MEMBER_FUNC 
{ value_type v; get(i, v); return v; } + void get(I i, NBL_REF_ARG(V) val) NBL_CONST_MEMBER_FUNC { val = V(vk::RawBufferLoad(addr + uint64_t(sizeof(value_type)) * uint64_t(i), sizeof(value_type))); } uint64_t addr; }; -using BenchCumProbSampler = sampling::CumulativeProbabilitySampler; +#if defined(NBL_CUMPROB_EYTZINGER) +using BenchCumProbSampler = sampling::CumulativeProbabilitySampler; +#elif defined(NBL_CUMPROB_YOLO_READS) +using BenchCumProbSampler = sampling::CumulativeProbabilitySampler; +#else +using BenchCumProbSampler = sampling::CumulativeProbabilitySampler; +#endif #else #include "../common/cumulative_probability.hlsl" @@ -26,11 +31,7 @@ using BenchCumProbSampler = sampling::CumulativeProbabilitySampler outputTestValues; #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 -#endif [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; diff --git a/37_HLSLSamplingTests/app_resources/shaders/linear_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/linear_test.comp.hlsl index 17cf83ac5..7b97645b5 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/linear_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/linear_test.comp.hlsl @@ -15,11 +15,7 @@ #define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 -#endif [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; diff --git a/37_HLSLSamplingTests/app_resources/shaders/packed_alias_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/packed_alias_test.comp.hlsl new file mode 100644 index 000000000..b0dbeedac --- /dev/null +++ b/37_HLSLSamplingTests/app_resources/shaders/packed_alias_test.comp.hlsl @@ -0,0 +1,114 @@ +#pragma shader_stage(compute) + +#include + +#ifdef BENCH_ITERS +#include "../common/discrete_sampler_bench.hlsl" +#include + +[[vk::push_constant]] 
PackedAliasABPushConstants pc; + +// Log2N bucket. Covers all sweep sizes up to 2^LOG2N buckets without precision +// loss. The same value must be passed to the host-side packA() / +// packB() call so the bit layouts match. +NBL_CONSTEXPR uint32_t LOG2N_BUCKET = 26; + +// Variant A accessor: 4 B packed words. +struct BdaPackedWordAccessor +{ + using value_type = uint32_t; + + template && is_integral_v) + void get(I i, NBL_REF_ARG(V) val) NBL_CONST_MEMBER_FUNC + { + val = vk::RawBufferLoad(addr + uint64_t(sizeof(V)) * uint64_t(i), sizeof(V)); + } + + uint64_t addr; +}; + +// Variant B accessor: 8 B PackedAliasEntryB. Loads a uint2 and decomposes it +// into the POD entry so DXC never sees a bitfield — avoids the Insert/Extract +// round-trip we observed when the sampler read from a bitfield struct. +struct BdaPackedAliasBAccessor +{ + using value_type = nbl::hlsl::sampling::PackedAliasEntryB; + + template) + void get(I i, NBL_REF_ARG(V) val) NBL_CONST_MEMBER_FUNC + { + const uint64_t loadAddr = addr + uint64_t(8u) * uint64_t(i); + const uint2 raw = vk::RawBufferLoad(loadAddr, 8u); + val.packedWord = raw.x; + val.ownPdf = asfloat(raw.y); + } + + uint64_t addr; +}; + +// Separate 4 B pdf[] accessor. 
+struct BdaPdfAccessor +{ + using value_type = float32_t; + + template && is_integral_v) + void get(I i, NBL_REF_ARG(V) val) NBL_CONST_MEMBER_FUNC + { + val = vk::RawBufferLoad(addr + uint64_t(sizeof(V)) * uint64_t(i), sizeof(V)); + } + + uint64_t addr; +}; + +#ifdef NBL_PACKED_ALIAS_B +using BenchPackedAlias = nbl::hlsl::sampling::PackedAliasTableB; +#else +using BenchPackedAlias = nbl::hlsl::sampling::PackedAliasTableA; +#endif + +#else +#include "../common/alias_table.hlsl" + +[[vk::binding(0, 0)]] RWStructuredBuffer inputTestValues; +[[vk::binding(1, 0)]] RWStructuredBuffer outputTestValues; +#endif + +[numthreads(WORKGROUP_SIZE, 1, 1)] +void main() +{ + const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; + +#ifdef BENCH_ITERS +#ifdef NBL_PACKED_ALIAS_B + BdaPackedAliasBAccessor entryAcc; +#else + BdaPackedWordAccessor entryAcc; +#endif + entryAcc.addr = pc.entriesAddress; + BdaPdfAccessor pdfAcc; + pdfAcc.addr = pc.pdfAddress; + BenchPackedAlias sampler = BenchPackedAlias::create(entryAcc, pdfAcc, pc.tableSize); + + float32_t xi = float32_t(nbl::hlsl::glsl::bitfieldReverse(invID)) / float32_t(~0u); + NBL_CONSTEXPR float32_t goldenRatio = 0.6180339887498949f; + uint32_t acc = 0u; + + [loop] + for (uint32_t i = 0u; i < uint32_t(BENCH_ITERS); i++) + { + xi = frac(xi + goldenRatio); + BenchPackedAlias::cache_type cache; + uint32_t generated = sampler.generate(xi, cache); + acc ^= generated ^ asuint(sampler.forwardPdf(xi, cache)); + } + + vk::RawBufferStore(pc.outputAddress + uint64_t(sizeof(uint32_t)) * uint64_t(invID), acc); +#else +#ifdef NBL_PACKED_ALIAS_B + PackedAliasBTestExecutor executor; +#else + PackedAliasATestExecutor executor; +#endif + executor(inputTestValues[invID], outputTestValues[invID]); +#endif +} diff --git a/37_HLSLSamplingTests/app_resources/shaders/polar_mapping_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/polar_mapping_test.comp.hlsl index e0cf7aea0..b5d48cc36 100644 --- 
a/37_HLSLSamplingTests/app_resources/shaders/polar_mapping_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/polar_mapping_test.comp.hlsl @@ -15,11 +15,7 @@ #define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 -#endif [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; diff --git a/37_HLSLSamplingTests/app_resources/shaders/projected_hemisphere_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/projected_hemisphere_test.comp.hlsl index d1ef313e5..f543d6dc2 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/projected_hemisphere_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/projected_hemisphere_test.comp.hlsl @@ -15,11 +15,7 @@ #define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 -#endif [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; diff --git a/37_HLSLSamplingTests/app_resources/shaders/projected_sphere_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/projected_sphere_test.comp.hlsl index 9b8c234c4..ca4e7eef7 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/projected_sphere_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/projected_sphere_test.comp.hlsl @@ -15,11 +15,7 @@ #define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 -#endif [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; diff --git a/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl index ca9b4d43e..fc4ae03b7 100644 --- 
a/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_rectangle_test.comp.hlsl @@ -17,12 +17,8 @@ #define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 -#endif [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void -main() +void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; #ifdef BENCH_ITERS diff --git a/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_triangle_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_triangle_test.comp.hlsl index 3d8ec8961..e32251ed8 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_triangle_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/projected_spherical_triangle_test.comp.hlsl @@ -15,11 +15,7 @@ #define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 -#endif [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; diff --git a/37_HLSLSamplingTests/app_resources/shaders/spherical_rectangle_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/spherical_rectangle_test.comp.hlsl index b9766d5ff..542d20587 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/spherical_rectangle_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/spherical_rectangle_test.comp.hlsl @@ -17,12 +17,8 @@ #define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 -#endif [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void -main() +void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; #ifdef BENCH_ITERS diff --git a/37_HLSLSamplingTests/app_resources/shaders/spherical_triangle.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/spherical_triangle.comp.hlsl 
index 3595ac86a..bc55facbd 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/spherical_triangle.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/spherical_triangle.comp.hlsl @@ -15,9 +15,7 @@ #define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 -#endif + [numthreads(WORKGROUP_SIZE, 1, 1)] void main() { diff --git a/37_HLSLSamplingTests/app_resources/shaders/test_compile.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/test_compile.comp.hlsl index cd43c630e..3c832e995 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/test_compile.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/test_compile.comp.hlsl @@ -190,16 +190,6 @@ void main() aliasIdx.data[0] = 1u; aliasIdx.data[1] = 2u; aliasIdx.data[2] = 3u; aliasIdx.data[3] = 0u; ArrayAccessor aliasPdf; aliasPdf.data[0] = 0.25; aliasPdf.data[1] = 0.25; aliasPdf.data[2] = 0.25; aliasPdf.data[3] = 0.25; - sampling::AliasTable, ArrayAccessor, ArrayAccessor > aliasTable = - sampling::AliasTable, ArrayAccessor, ArrayAccessor >::create(aliasProb, aliasIdx, aliasPdf, 4u); - sampling::AliasTable, ArrayAccessor, ArrayAccessor >::cache_type aliasCache; - uint32_t aliasBin0 = aliasTable.generate(0.3); - uint32_t aliasBin = aliasTable.generate(0.3, aliasCache); - acc.x += float32_t(aliasBin0 + aliasBin); - acc.x += aliasTable.forwardPdf(0.3, aliasCache); - acc.x += aliasTable.forwardWeight(0.3, aliasCache); - acc.x += aliasTable.backwardPdf(aliasBin); - acc.x += aliasTable.backwardWeight(aliasBin); // CumulativeProbabilitySampler — generate (with/without cache), forwardPdf, backwardPdf, forwardWeight, backwardWeight ArrayAccessor cumProb; diff --git a/37_HLSLSamplingTests/app_resources/shaders/uniform_hemisphere_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/uniform_hemisphere_test.comp.hlsl index 3c43ee119..c0a0e58b2 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/uniform_hemisphere_test.comp.hlsl +++ 
b/37_HLSLSamplingTests/app_resources/shaders/uniform_hemisphere_test.comp.hlsl @@ -15,11 +15,7 @@ #define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 -#endif [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; diff --git a/37_HLSLSamplingTests/app_resources/shaders/uniform_sphere_test.comp.hlsl b/37_HLSLSamplingTests/app_resources/shaders/uniform_sphere_test.comp.hlsl index 5879e28bb..1c810afbf 100644 --- a/37_HLSLSamplingTests/app_resources/shaders/uniform_sphere_test.comp.hlsl +++ b/37_HLSLSamplingTests/app_resources/shaders/uniform_sphere_test.comp.hlsl @@ -15,11 +15,7 @@ #define BENCH_SAMPLES_PER_CREATE (BENCH_ITERS) #endif -#ifndef WORKGROUP_SIZE -#define WORKGROUP_SIZE 64 -#endif [numthreads(WORKGROUP_SIZE, 1, 1)] -[shader("compute")] void main() { const uint32_t invID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; diff --git a/37_HLSLSamplingTests/benchmarks/CDiscreteSamplerBenchmark.h b/37_HLSLSamplingTests/benchmarks/CDiscreteSamplerBenchmark.h index 02fbf58d2..b2a2fad9a 100644 --- a/37_HLSLSamplingTests/benchmarks/CDiscreteSamplerBenchmark.h +++ b/37_HLSLSamplingTests/benchmarks/CDiscreteSamplerBenchmark.h @@ -11,97 +11,103 @@ using namespace nbl; -// Benchmarks alias table vs cumulative probability sampler on the GPU using BDA. -// Builds pipelines once, then sweeps a list of table sizes. For each N it builds -// both tables from the same weight distribution, uploads via BDA buffers, and -// measures GPU throughput using timestamp queries. The cumulative probability -// sampler is run in two variants: the stateful-comparator cache population -// (default) and the "YOLO re-read" variant (cumulative_probability.hlsl). 
class CDiscreteSamplerBenchmark { public: struct SetupData { - core::smart_refctd_ptr device; - core::smart_refctd_ptr api; - core::smart_refctd_ptr assetMgr; - core::smart_refctd_ptr logger; - video::IPhysicalDevice* physicalDevice; - std::string aliasShaderKey; - std::string cumProbShaderKey; - std::string cumProbYoloShaderKey; - uint32_t computeFamilyIndex; - uint32_t dispatchGroupCount; + core::smart_refctd_ptr device; + core::smart_refctd_ptr api; + core::smart_refctd_ptr assetMgr; + core::smart_refctd_ptr logger; + IPhysicalDevice* physicalDevice; + std::string packedAliasAShaderKey; + std::string packedAliasBShaderKey; + std::string cumProbShaderKey; + std::string cumProbYoloShaderKey; + std::string cumProbEytzingerShaderKey; + uint32_t computeFamilyIndex; + uint32_t dispatchGroupCount; }; void setup(const SetupData& data) { - m_device = data.device; - m_logger = data.logger; - m_assetMgr = data.assetMgr; + m_device = data.device; + m_logger = data.logger; + m_assetMgr = data.assetMgr; m_dispatchGroupCount = data.dispatchGroupCount; - m_physicalDevice = data.physicalDevice; + m_physicalDevice = data.physicalDevice; m_queue = m_device->getQueue(data.computeFamilyIndex, 0); + // Staging-upload utility. Without this, BDA buffers land in host-visible (system RAM) + // and every sampler load becomes a PCIe round-trip instead of hitting VRAM/L2. 
+ m_utils = IUtilities::create(core::smart_refctd_ptr(m_device), core::smart_refctd_ptr(m_logger)); + // Command pool + buffers - m_cmdpool = m_device->createCommandPool(data.computeFamilyIndex, video::IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); - m_cmdpool->createCommandBuffers(video::IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_benchCmdbuf); + m_cmdpool = m_device->createCommandPool(data.computeFamilyIndex, IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + m_cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_benchCmdbuf); // Timestamp query pool { - video::IQueryPool::SCreationParams qp = {}; - qp.queryType = video::IQueryPool::TYPE::TIMESTAMP; - qp.queryCount = 2; - qp.pipelineStatisticsFlags = video::IQueryPool::PIPELINE_STATISTICS_FLAGS::NONE; - m_queryPool = m_device->createQueryPool(qp); + IQueryPool::SCreationParams qp = {}; + qp.queryType = IQueryPool::TYPE::TIMESTAMP; + qp.queryCount = 2; + qp.pipelineStatisticsFlags = IQueryPool::PIPELINE_STATISTICS_FLAGS::NONE; + m_queryPool = m_device->createQueryPool(qp); } const uint32_t totalThreads = m_dispatchGroupCount * WORKGROUP_SIZE; - // Shared output buffer (size only depends on thread count) + // Shared output buffer (size only depends on thread count). GPU writes via BDA and + // nothing reads it on the CPU, so pin it to device-local VRAM. 
{ - video::IGPUBuffer::SCreationParams bp = {}; - bp.size = totalThreads * sizeof(uint32_t); - bp.usage = core::bitflag(video::IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | - video::IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - m_outputBuf = m_device->createBuffer(std::move(bp)); - video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = m_outputBuf->getMemoryReqs(); - reqs.memoryTypeBits &= data.physicalDevice->getHostVisibleMemoryTypeBits(); - m_device->allocate(reqs, m_outputBuf.get(), video::IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); + IGPUBuffer::SCreationParams bp = {}; + bp.size = totalThreads * sizeof(uint32_t); + bp.usage = core::bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + m_outputBuf = m_device->createBuffer(std::move(bp)); + IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = m_outputBuf->getMemoryReqs(); + reqs.memoryTypeBits &= data.physicalDevice->getDeviceLocalMemoryTypeBits(); + m_device->allocate(reqs, m_outputBuf.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); } // Pipelines (N-independent; only push constants change per run) - m_aliasPipeline = createPipeline(data.aliasShaderKey, m_aliasPplnLayout, "alias"); - m_cumProbPipeline = createPipeline(data.cumProbShaderKey, m_cumProbPplnLayout, "cumprob-comparator"); - m_cumProbYoloPipeline = createPipeline(data.cumProbYoloShaderKey, m_cumProbYoloPplnLayout, "cumprob-yolo"); + m_packedAliasAPipeline = createPipeline(data.packedAliasAShaderKey, m_packedAliasAPplnLayout, "alias-packed-A"); + m_packedAliasBPipeline = createPipeline(data.packedAliasBShaderKey, m_packedAliasBPplnLayout, "alias-packed-B"); + m_cumProbPipeline = createPipeline(data.cumProbShaderKey, m_cumProbPplnLayout, "cumprob-comparator"); + m_cumProbYoloPipeline = createPipeline(data.cumProbYoloShaderKey, m_cumProbYoloPplnLayout, "cumprob-yolo"); + m_cumProbEytzingerPipeline = createPipeline(data.cumProbEytzingerShaderKey, m_cumProbEytzingerPplnLayout, "cumprob-eytzinger"); } // 
DispatchScheduler: uint32_t N -> std::pair. // Lets the caller trade wall-clock for statistical stability per size: // big-N runs are DRAM-bound and need fewer dispatches to hit the same total sample count. - struct DispatchCounts { uint32_t warmup; uint32_t bench; }; + struct DispatchCounts + { + uint32_t warmup; + uint32_t bench; + }; - // Sweep a list of table sizes. For each N: build tables from a fresh weight - // distribution (deterministic seed = 42 + N so different N's get distinct - // distributions but runs are reproducible), upload via BDA, then run all - // three samplers with the dispatch counts chosen by `scheduler`. template void runSweep(const std::vector& tableSizes, DispatchScheduler scheduler) { const uint32_t totalThreads = m_dispatchGroupCount * WORKGROUP_SIZE; m_logger->log("=== GPU Discrete Sampler Benchmark sweep (%u threads * %u iters/thread; wg=%u; dispatches chosen per-N) ===", - system::ILogger::ELL_PERFORMANCE, totalThreads, BENCH_ITERS, WORKGROUP_SIZE); - m_logger->log("%12s | %-28s | %12s | %12s | %12s | %10s", - system::ILogger::ELL_PERFORMANCE, "N", "Sampler", "ps/sample", "GSamples/s", "ms total", "dispatches"); + ILogger::ELL_PERFORMANCE, totalThreads, BENCH_ITERS, WORKGROUP_SIZE); + m_logger->log("%12s | %-34s | %12s | %12s | %12s | %10s", ILogger::ELL_PERFORMANCE, + "N", "Sampler", "ps/sample", "GSamples/s", "ms total", "dispatches"); for (uint32_t N : tableSizes) { const DispatchCounts dc = scheduler(N); buildAndUpload(N); - runSingle(N, "AliasTable", m_aliasPipeline, m_aliasPplnLayout, SamplerKind::Alias, dc.warmup, dc.bench); - runSingle(N, "CumulativeProbability", m_cumProbPipeline, m_cumProbPplnLayout, SamplerKind::CumProbCompare, dc.warmup, dc.bench); - runSingle(N, "CumulativeProbability (YOLO)", m_cumProbYoloPipeline, m_cumProbYoloPplnLayout, SamplerKind::CumProbYolo, dc.warmup, dc.bench); + // Packed A wins N<=16k; Packed B wins N>=32k. 
SoA and Packed C were dominated + // across every N measured, removed from the sweep. + runSingle(N, "AliasTable (packed A, 4 B)", m_packedAliasAPipeline, m_packedAliasAPplnLayout, SamplerKind::AliasPackedA, dc.warmup, dc.bench); + runSingle(N, "AliasTable (packed B, 8 B)", m_packedAliasBPipeline, m_packedAliasBPplnLayout, SamplerKind::AliasPackedB, dc.warmup, dc.bench); + runSingle(N, "CumulativeProbability", m_cumProbPipeline, m_cumProbPplnLayout, SamplerKind::CumProbCompare, dc.warmup, dc.bench); + runSingle(N, "CumulativeProbability (YOLO)", m_cumProbYoloPipeline, m_cumProbYoloPplnLayout, SamplerKind::CumProbYolo, dc.warmup, dc.bench); + runSingle(N, "CumulativeProbability (Eytzinger)", m_cumProbEytzingerPipeline, m_cumProbEytzingerPplnLayout, SamplerKind::CumProbEytzinger, dc.warmup, dc.bench); releaseTables(); } } @@ -109,76 +115,74 @@ class CDiscreteSamplerBenchmark // Convenience: sweep with fixed dispatch counts for every size. void runSweep(const std::vector& tableSizes, uint32_t warmupIterations = 500, uint32_t benchmarkIterations = 5000) { - runSweep(tableSizes, [warmupIterations, benchmarkIterations](uint32_t) -> DispatchCounts { - return {warmupIterations, benchmarkIterations}; - }); + runSweep(tableSizes, [warmupIterations, benchmarkIterations](uint32_t) -> DispatchCounts + { return {warmupIterations, benchmarkIterations}; }); } private: - enum class SamplerKind { Alias, CumProbCompare, CumProbYolo }; + enum class SamplerKind + { + AliasPackedA, + AliasPackedB, + CumProbCompare, + CumProbYolo, + CumProbEytzinger + }; template - core::smart_refctd_ptr createPipeline(const std::string& shaderKey, core::smart_refctd_ptr& outLayout, const char* tag) + core::smart_refctd_ptr createPipeline(const std::string& shaderKey, core::smart_refctd_ptr& outLayout, const char* tag) { - const asset::SPushConstantRange pcRange = { - .stageFlags = asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, - .offset = 0, - .size = sizeof(PushConstantT)}; + const SPushConstantRange 
pcRange = { + .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE, + .offset = 0, + .size = sizeof(PushConstantT)}; auto layout = m_device->createPipelineLayout({&pcRange, 1}); if (!layout) - m_logger->log("CDiscreteSamplerBenchmark: failed to create %s pipeline layout", system::ILogger::ELL_ERROR, tag); - - asset::IAssetLoader::SAssetLoadParams lp = {}; - lp.logger = m_logger.get(); - lp.workingDirectory = "app_resources"; - auto bundle = m_assetMgr->getAsset(shaderKey, lp); - auto source = asset::IAsset::castDown(bundle.getContents()[0]); - auto shader = m_device->compileShader({.source = source.get()}); + m_logger->log("CDiscreteSamplerBenchmark: failed to create %s pipeline layout", ILogger::ELL_ERROR, tag); + + IAssetLoader::SAssetLoadParams lp = {}; + lp.logger = m_logger.get(); + lp.workingDirectory = "app_resources"; + auto bundle = m_assetMgr->getAsset(shaderKey, lp); + auto source = IAsset::castDown(bundle.getContents()[0]); + auto shader = m_device->compileShader({.source = source.get()}); if (!shader) - m_logger->log("CDiscreteSamplerBenchmark: failed to load %s shader", system::ILogger::ELL_ERROR, tag); + m_logger->log("CDiscreteSamplerBenchmark: failed to load %s shader", ILogger::ELL_ERROR, tag); - video::IGPUComputePipeline::SCreationParams pp = {}; - pp.layout = layout.get(); - pp.shader.shader = shader.get(); - pp.shader.entryPoint = "main"; + IGPUComputePipeline::SCreationParams pp = {}; + pp.layout = layout.get(); + pp.shader.shader = shader.get(); + pp.shader.entryPoint = "main"; if (m_device->getEnabledFeatures().pipelineExecutableInfo) { - pp.flags |= video::IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_STATISTICS | video::IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_INTERNAL_REPRESENTATIONS; + pp.flags |= IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_STATISTICS | IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_INTERNAL_REPRESENTATIONS; } - core::smart_refctd_ptr pipeline; + core::smart_refctd_ptr pipeline; if 
(!m_device->createComputePipelines(nullptr, {&pp, 1}, &pipeline)) - m_logger->log("CDiscreteSamplerBenchmark: failed to create %s compute pipeline", system::ILogger::ELL_ERROR, tag); + m_logger->log("CDiscreteSamplerBenchmark: failed to create %s compute pipeline", ILogger::ELL_ERROR, tag); if (m_device->getEnabledFeatures().pipelineExecutableInfo) { auto report = system::to_string(pipeline->getExecutableInfo()); - m_logger->log("%s Sampling Pipeline Executable Report:\n%s", system::ILogger::ELL_PERFORMANCE, tag, report.c_str()); + m_logger->log("%s Sampling Pipeline Executable Report:\n%s", ILogger::ELL_PERFORMANCE, tag, report.c_str()); } outLayout = std::move(layout); return pipeline; } - core::smart_refctd_ptr createBdaBuffer(const void* srcData, size_t bytes) + core::smart_refctd_ptr createBdaBuffer(const void* srcData, size_t bytes) { - video::IGPUBuffer::SCreationParams bp = {}; - bp.size = bytes; - bp.usage = core::bitflag(video::IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | - video::IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - auto buf = m_device->createBuffer(std::move(bp)); - - video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = buf->getMemoryReqs(); - reqs.memoryTypeBits &= m_physicalDevice->getHostVisibleMemoryTypeBits(); - auto alloc = m_device->allocate(reqs, buf.get(), video::IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); - - const auto allocSize = alloc.memory->getAllocationSize(); - if (alloc.memory->map({0ull, allocSize}, video::IDeviceMemoryAllocation::EMCAF_WRITE)) - { - std::memcpy(alloc.memory->getMappedPointer(), srcData, bytes); - video::ILogicalDevice::MappedMemoryRange flushRange(alloc.memory.get(), 0ull, allocSize); - m_device->flushMappedMemoryRanges(1u, &flushRange); - alloc.memory->unmap(); - } + IGPUBuffer::SCreationParams bp = {}; + bp.size = bytes; + bp.usage = core::bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | + IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT | + IGPUBuffer::EUF_TRANSFER_DST_BIT; + + core::smart_refctd_ptr buf; + 
auto future = m_utils->createFilledDeviceLocalBufferOnDedMem( + SIntendedSubmitInfo {.queue = m_queue}, std::move(bp), srcData); + future.move_into(buf); return buf; } @@ -186,46 +190,53 @@ class CDiscreteSamplerBenchmark { m_currentN = N; - std::vector weights(N); - std::mt19937 rng(42u + N); + std::vector weights(N); + std::mt19937 rng(42u + N); std::uniform_real_distribution dist(0.001f, 100.0f); for (uint32_t i = 0; i < N; i++) weights[i] = dist(rng); - // Alias table - std::vector aliasProb(N); - std::vector aliasIdx(N); - std::vector aliasPdf(N); - std::vector workspace(N); - nbl::hlsl::sampling::AliasTableBuilder::build({weights}, aliasProb.data(), aliasIdx.data(), aliasPdf.data(), workspace.data()); + // Build the alias table SoA (intermediate form), then pack it for variants A and B. + // Builder may pad PoT N to N+1 for cache-friendly stride; returned size drives + // every downstream buffer / push-constant value. + std::vector aliasProb; + std::vector aliasIdx; + std::vector aliasPdf; + m_aliasTableN = sampling::AliasTableBuilder::build({weights}, aliasProb, aliasIdx, aliasPdf); + + constexpr uint32_t kPackedLog2N = 26u; + std::vector packedA(m_aliasTableN); + std::vector> packedB(m_aliasTableN); + sampling::AliasTableBuilder::packA({aliasProb}, {aliasIdx}, packedA.data()); + sampling::AliasTableBuilder::packB({aliasProb}, {aliasIdx}, {aliasPdf}, packedB.data()); // Cumulative probability (N-1 entries, last bucket implicitly 1.0) - std::vector cumProb(N > 0 ? N - 1 : 0); - nbl::hlsl::sampling::computeNormalizedCumulativeHistogram({weights}, cumProb.data()); - - m_aliasProbBuf = createBdaBuffer(aliasProb.data(), N * sizeof(float)); - m_aliasIdxBuf = createBdaBuffer(aliasIdx.data(), N * sizeof(uint32_t)); - m_aliasPdfBuf = createBdaBuffer(aliasPdf.data(), N * sizeof(float)); - const size_t cumProbBytes = (N > 0 ? (N - 1) : 0) * sizeof(float); - m_cumProbBuf = cumProbBytes ? 
createBdaBuffer(cumProb.data(), cumProbBytes) : nullptr; + std::vector cumProb(N - 1u); + sampling::computeNormalizedCumulativeHistogram({weights}, cumProb.data()); + + // Eytzinger level-order tree: 2*P entries where P = nextPot(N) + const uint32_t eytzingerP = sampling::eytzingerLeafCount(N); + const uint32_t eytzingerTreeSize = 2u * eytzingerP; + std::vector cumProbEytzinger(eytzingerTreeSize); + sampling::buildEytzinger({weights}, cumProbEytzinger.data()); + + m_aliasPdfBuf = createBdaBuffer(aliasPdf.data(), m_aliasTableN * sizeof(float)); + m_packedAliasABuf = createBdaBuffer(packedA.data(), m_aliasTableN * sizeof(uint32_t)); + m_packedAliasBBuf = createBdaBuffer(packedB.data(), m_aliasTableN * sizeof(sampling::PackedAliasEntryB)); + m_cumProbBuf = createBdaBuffer(cumProb.data(), (N - 1u) * sizeof(float)); + m_cumProbEytzingerBuf = createBdaBuffer(cumProbEytzinger.data(), eytzingerTreeSize * sizeof(float)); } void releaseTables() { - m_aliasProbBuf = nullptr; - m_aliasIdxBuf = nullptr; - m_aliasPdfBuf = nullptr; - m_cumProbBuf = nullptr; + m_aliasPdfBuf = nullptr; + m_packedAliasABuf = nullptr; + m_packedAliasBBuf = nullptr; + m_cumProbBuf = nullptr; + m_cumProbEytzingerBuf = nullptr; } - void runSingle( - uint32_t N, - const char* name, - const core::smart_refctd_ptr& pipeline, - const core::smart_refctd_ptr& layout, - SamplerKind kind, - uint32_t warmupIterations, - uint32_t benchmarkIterations) + void runSingle(uint32_t N, const char* name, const core::smart_refctd_ptr& pipeline, const core::smart_refctd_ptr& layout, SamplerKind kind, uint32_t warmupIterations, uint32_t benchmarkIterations) { m_device->waitIdle(); @@ -241,96 +252,103 @@ class CDiscreteSamplerBenchmark // don't measure a tail where the GPU is already winding down. 
const uint32_t cooldownIterations = warmupIterations; - m_benchCmdbuf->reset(video::IGPUCommandBuffer::RESET_FLAGS::NONE); - m_benchCmdbuf->begin(video::IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + m_benchCmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::NONE); + m_benchCmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); m_benchCmdbuf->resetQueryPool(m_queryPool.get(), 0, 2); m_benchCmdbuf->bindComputePipeline(pipeline.get()); - if (kind == SamplerKind::Alias) + if (kind == SamplerKind::AliasPackedA || kind == SamplerKind::AliasPackedB) { - AliasTablePushConstants pc = {}; - pc.probAddress = m_aliasProbBuf->getDeviceAddress(); - pc.aliasAddress = m_aliasIdxBuf->getDeviceAddress(); - pc.pdfAddress = m_aliasPdfBuf->getDeviceAddress(); - pc.outputAddress = m_outputBuf->getDeviceAddress(); - pc.tableSize = N; - m_benchCmdbuf->pushConstants(layout.get(), asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc); + PackedAliasABPushConstants pc = {}; + pc.entriesAddress = (kind == SamplerKind::AliasPackedA ? m_packedAliasABuf : m_packedAliasBBuf)->getDeviceAddress(); + pc.pdfAddress = m_aliasPdfBuf->getDeviceAddress(); + pc.outputAddress = m_outputBuf->getDeviceAddress(); + pc.tableSize = m_aliasTableN; + m_benchCmdbuf->pushConstants(layout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc); } else { - CumProbPushConstants pc = {}; - pc.cumProbAddress = m_cumProbBuf ? m_cumProbBuf->getDeviceAddress() : 0ull; - pc.outputAddress = m_outputBuf->getDeviceAddress(); - pc.tableSize = N; - m_benchCmdbuf->pushConstants(layout.get(), asset::IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc); + CumProbPushConstants pc = {}; + const auto& buf = (kind == SamplerKind::CumProbEytzinger) ? 
m_cumProbEytzingerBuf : m_cumProbBuf; + pc.cumProbAddress = buf->getDeviceAddress(); + pc.outputAddress = m_outputBuf->getDeviceAddress(); + pc.tableSize = N; + m_benchCmdbuf->pushConstants(layout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc); } for (uint32_t i = 0u; i < warmupIterations; ++i) m_benchCmdbuf->dispatch(m_dispatchGroupCount, 1, 1); - m_benchCmdbuf->writeTimestamp(asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 0); + m_benchCmdbuf->writeTimestamp(PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 0); for (uint32_t i = 0u; i < benchmarkIterations; ++i) m_benchCmdbuf->dispatch(m_dispatchGroupCount, 1, 1); - m_benchCmdbuf->writeTimestamp(asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 1); + m_benchCmdbuf->writeTimestamp(PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 1); for (uint32_t i = 0u; i < cooldownIterations; ++i) m_benchCmdbuf->dispatch(m_dispatchGroupCount, 1, 1); m_benchCmdbuf->end(); - auto semaphore = m_device->createSemaphore(0u); - const video::IQueue::SSubmitInfo::SCommandBufferInfo benchCmds[] = {{.cmdbuf = m_benchCmdbuf.get()}}; - const video::IQueue::SSubmitInfo::SSemaphoreInfo signalSem[] = { - {.semaphore = semaphore.get(), .value = 1u, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT}}; - video::IQueue::SSubmitInfo submit = {}; - submit.commandBuffers = benchCmds; - submit.signalSemaphores = signalSem; + auto semaphore = m_device->createSemaphore(0u); + const IQueue::SSubmitInfo::SCommandBufferInfo benchCmds[] = {{.cmdbuf = m_benchCmdbuf.get()}}; + const IQueue::SSubmitInfo::SSemaphoreInfo signalSem[] = { + {.semaphore = semaphore.get(), .value = 1u, .stageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT}}; + IQueue::SSubmitInfo submit = {}; + submit.commandBuffers = benchCmds; + submit.signalSemaphores = signalSem; m_queue->submit({&submit, 1u}); m_device->waitIdle(); - uint64_t timestamps[2] = {}; - const auto flags = 
core::bitflag(video::IQueryPool::RESULTS_FLAGS::_64_BIT) | - core::bitflag(video::IQueryPool::RESULTS_FLAGS::WAIT_BIT); + uint64_t timestamps[2] = {}; + const auto flags = core::bitflag(IQueryPool::RESULTS_FLAGS::_64_BIT) | + core::bitflag(IQueryPool::RESULTS_FLAGS::WAIT_BIT); m_device->getQueryPoolResults(m_queryPool.get(), 0, 2, timestamps, sizeof(uint64_t), flags); - constexpr uint32_t benchIters = BENCH_ITERS; - const float64_t timestampPeriod = float64_t(m_physicalDevice->getLimits().timestampPeriodInNanoSeconds); - const float64_t elapsed_ns = float64_t(timestamps[1] - timestamps[0]) * timestampPeriod; - const uint64_t totalThreads = uint64_t(m_dispatchGroupCount) * uint64_t(WORKGROUP_SIZE); - const uint64_t totalSamples = uint64_t(benchmarkIterations) * totalThreads * uint64_t(benchIters); - const float64_t ps_per_sample = elapsed_ns * 1e3 / float64_t(totalSamples); - const float64_t gsamples_per_s = float64_t(totalSamples) / elapsed_ns; - const float64_t elapsed_ms = elapsed_ns * 1e-6; - - m_logger->log("%12u | %-28s | %12.3f | %12.3f | %12.3f | %10u", - system::ILogger::ELL_PERFORMANCE, N, name, ps_per_sample, gsamples_per_s, elapsed_ms, benchmarkIterations); + constexpr uint32_t benchIters = BENCH_ITERS; + const float64_t timestampPeriod = float64_t(m_physicalDevice->getLimits().timestampPeriodInNanoSeconds); + const float64_t elapsed_ns = float64_t(timestamps[1] - timestamps[0]) * timestampPeriod; + const uint64_t totalThreads = uint64_t(m_dispatchGroupCount) * uint64_t(WORKGROUP_SIZE); + const uint64_t totalSamples = uint64_t(benchmarkIterations) * totalThreads * uint64_t(benchIters); + const float64_t ps_per_sample = elapsed_ns * 1e3 / float64_t(totalSamples); + const float64_t gsamples_per_s = float64_t(totalSamples) / elapsed_ns; + const float64_t elapsed_ms = elapsed_ns * 1e-6; + + m_logger->log("%12u | %-34s | %12.3f | %12.3f | %12.3f | %10u", + ILogger::ELL_PERFORMANCE, N, name, ps_per_sample, gsamples_per_s, elapsed_ms, benchmarkIterations); } - 
core::smart_refctd_ptr m_device; - core::smart_refctd_ptr m_logger; - core::smart_refctd_ptr m_assetMgr; - core::smart_refctd_ptr m_cmdpool; - core::smart_refctd_ptr m_benchCmdbuf; - core::smart_refctd_ptr m_queryPool; + core::smart_refctd_ptr m_device; + core::smart_refctd_ptr m_logger; + core::smart_refctd_ptr m_assetMgr; + core::smart_refctd_ptr m_utils; + core::smart_refctd_ptr m_cmdpool; + core::smart_refctd_ptr m_benchCmdbuf; + core::smart_refctd_ptr m_queryPool; // Pipelines (set up once) - core::smart_refctd_ptr m_aliasPplnLayout; - core::smart_refctd_ptr m_aliasPipeline; - core::smart_refctd_ptr m_cumProbPplnLayout; - core::smart_refctd_ptr m_cumProbPipeline; - core::smart_refctd_ptr m_cumProbYoloPplnLayout; - core::smart_refctd_ptr m_cumProbYoloPipeline; - - // Per-N data buffers (rebuilt each sweep step) - core::smart_refctd_ptr m_aliasProbBuf; - core::smart_refctd_ptr m_aliasIdxBuf; - core::smart_refctd_ptr m_aliasPdfBuf; - core::smart_refctd_ptr m_cumProbBuf; + core::smart_refctd_ptr m_packedAliasAPplnLayout; + core::smart_refctd_ptr m_packedAliasAPipeline; + core::smart_refctd_ptr m_packedAliasBPplnLayout; + core::smart_refctd_ptr m_packedAliasBPipeline; + core::smart_refctd_ptr m_cumProbPplnLayout; + core::smart_refctd_ptr m_cumProbPipeline; + core::smart_refctd_ptr m_cumProbYoloPplnLayout; + core::smart_refctd_ptr m_cumProbYoloPipeline; + core::smart_refctd_ptr m_cumProbEytzingerPplnLayout; + core::smart_refctd_ptr m_cumProbEytzingerPipeline; + + // Per-N data buffers (rebuilt each sweep step). pdf[] is shared between A and B. 
+ core::smart_refctd_ptr m_aliasPdfBuf; + core::smart_refctd_ptr m_packedAliasABuf; + core::smart_refctd_ptr m_packedAliasBBuf; + core::smart_refctd_ptr m_cumProbBuf; + core::smart_refctd_ptr m_cumProbEytzingerBuf; // Shared - core::smart_refctd_ptr m_outputBuf; - video::IQueue* m_queue = nullptr; - video::IPhysicalDevice* m_physicalDevice = nullptr; - uint32_t m_dispatchGroupCount = 0; - uint32_t m_currentN = 0; + core::smart_refctd_ptr m_outputBuf; + IQueue* m_queue = nullptr; + IPhysicalDevice* m_physicalDevice = nullptr; + uint32_t m_dispatchGroupCount = 0; + uint32_t m_currentN = 0; + uint32_t m_aliasTableN = 0; }; #endif diff --git a/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h b/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h index 9f9854ac5..d95d7f103 100644 --- a/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h +++ b/37_HLSLSamplingTests/benchmarks/CSamplerBenchmark.h @@ -35,14 +35,12 @@ class CSamplerBenchmark m_logger = data.logger; m_dispatchGroupCount = data.dispatchGroupCount; - // Command pool + 3 command buffers: benchmark (multi-submit), before/after timestamp + // Single cmdbuf holds [warmup dispatches][ts 0][bench dispatches][ts 1][cooldown dispatches] + // so the driver can pipeline adjacent dispatches and the trailing bench dispatches + // aren't measured in a winding-down tail. 
m_cmdpool = m_device->createCommandPool(data.computeFamilyIndex, video::IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); if (!m_cmdpool->createCommandBuffers(video::IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_benchmarkCmdbuf)) m_logger->log("CSamplerBenchmark: failed to create benchmark cmdbuf", system::ILogger::ELL_ERROR); - if (!m_cmdpool->createCommandBuffers(video::IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_timestampBeforeCmdbuf)) - m_logger->log("CSamplerBenchmark: failed to create timestamp-before cmdbuf", system::ILogger::ELL_ERROR); - if (!m_cmdpool->createCommandBuffers(video::IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_timestampAfterCmdbuf)) - m_logger->log("CSamplerBenchmark: failed to create timestamp-after cmdbuf", system::ILogger::ELL_ERROR); // Timestamp query pool (2 queries: before and after) { @@ -101,26 +99,22 @@ class CSamplerBenchmark m_executableReport = system::to_string(m_pipeline->getExecutableInfo()); } - // Allocate input buffer (host-visible, zero-filled, correctness irrelevant for benchmarking) + // Allocate input buffer (device-local VRAM, zero-filled via cmdFillBuffer; correctness + // irrelevant for benchmarking but we want deterministic input, not garbage) core::smart_refctd_ptr inputBuf; { video::IGPUBuffer::SCreationParams bparams = {}; bparams.size = data.inputBufferBytes; - bparams.usage = video::IGPUBuffer::EUF_STORAGE_BUFFER_BIT; + bparams.usage = core::bitflag(video::IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | video::IGPUBuffer::EUF_TRANSFER_DST_BIT; inputBuf = m_device->createBuffer(std::move(bparams)); video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = inputBuf->getMemoryReqs(); - reqs.memoryTypeBits &= data.physicalDevice->getHostVisibleMemoryTypeBits(); + reqs.memoryTypeBits &= data.physicalDevice->getDeviceLocalMemoryTypeBits(); m_inputAlloc = m_device->allocate(reqs, inputBuf.get(), video::IDeviceMemoryAllocation::EMAF_NONE); if (!m_inputAlloc.isValid()) m_logger->log("CSamplerBenchmark: failed to 
allocate input buffer memory", system::ILogger::ELL_ERROR); - if (m_inputAlloc.memory->map({ 0ull, m_inputAlloc.memory->getAllocationSize() }, video::IDeviceMemoryAllocation::EMCAF_READ)) - { - std::memset(m_inputAlloc.memory->getMappedPointer(), 0, m_inputAlloc.memory->getAllocationSize()); - m_inputAlloc.memory->unmap(); - } } - // Allocate output buffer (host-visible, GPU writes garbage, never read back) + // Allocate output buffer (device-local VRAM, GPU writes, never read back) core::smart_refctd_ptr outputBuf; { video::IGPUBuffer::SCreationParams bparams = {}; @@ -128,12 +122,29 @@ class CSamplerBenchmark bparams.usage = video::IGPUBuffer::EUF_STORAGE_BUFFER_BIT; outputBuf = m_device->createBuffer(std::move(bparams)); video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = outputBuf->getMemoryReqs(); - reqs.memoryTypeBits &= data.physicalDevice->getHostVisibleMemoryTypeBits(); + reqs.memoryTypeBits &= data.physicalDevice->getDeviceLocalMemoryTypeBits(); m_outputAlloc = m_device->allocate(reqs, outputBuf.get(), video::IDeviceMemoryAllocation::EMAF_NONE); if (!m_outputAlloc.isValid()) m_logger->log("CSamplerBenchmark: failed to allocate output buffer memory", system::ILogger::ELL_ERROR); } + // Zero-fill the input buffer once on the GPU + { + core::smart_refctd_ptr initCmdbuf; + m_cmdpool->createCommandBuffers(video::IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &initCmdbuf); + initCmdbuf->begin(video::IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + const asset::SBufferRange range = { .offset = 0, .size = data.inputBufferBytes, .buffer = inputBuf }; + initCmdbuf->fillBuffer(range, 0u); + initCmdbuf->end(); + + auto queue = m_device->getQueue(data.computeFamilyIndex, 0); + const video::IQueue::SSubmitInfo::SCommandBufferInfo cmds[] = { {.cmdbuf = initCmdbuf.get()} }; + video::IQueue::SSubmitInfo submit = {}; + submit.commandBuffers = cmds; + queue->submit({&submit, 1u}); + m_device->waitIdle(); + } + // Descriptor set: bind both buffers auto pool = 
m_device->createDescriptorPoolForDSLayouts(video::IDescriptorPool::ECF_NONE, { &dsLayout.get(), 1 }); m_ds = pool->createDescriptorSet(core::smart_refctd_ptr(dsLayout)); @@ -161,43 +172,36 @@ class CSamplerBenchmark m_logger->log("%s Sampler Benchmark Pipeline Executable Report:\n%s", ILogger::ELL_PERFORMANCE, name.c_str(), m_executableReport.c_str()); } - // Runs warmupIterations submits (unclocked), then benchmarkIterations submits under GPU timestamps. void run(const std::string& samplerName, const std::string& mode, uint32_t warmupIterations = 500, uint32_t benchmarkIterations = 5000) { m_device->waitIdle(); - recordBenchmarkCmdBuf(); - recordTimestampCmdBufs(); - - auto semaphore = m_device->createSemaphore(0u); - uint64_t semCounter = 0u; - const video::IQueue::SSubmitInfo::SCommandBufferInfo benchCmds[] = { {.cmdbuf = m_benchmarkCmdbuf.get()} }; - const video::IQueue::SSubmitInfo::SCommandBufferInfo beforeCmds[] = { {.cmdbuf = m_timestampBeforeCmdbuf.get()} }; - const video::IQueue::SSubmitInfo::SCommandBufferInfo afterCmds[] = { {.cmdbuf = m_timestampAfterCmdbuf.get()} }; - - // Chains submissions via a timeline semaphore so they execute strictly in order - auto submitSerial = [&](const video::IQueue::SSubmitInfo::SCommandBufferInfo* cmds, uint32_t count) - { - const video::IQueue::SSubmitInfo::SSemaphoreInfo waitSem[] = { - {.semaphore = semaphore.get(), .value = semCounter, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT} - }; - const video::IQueue::SSubmitInfo::SSemaphoreInfo signalSem[] = { - {.semaphore = semaphore.get(), .value = ++semCounter, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT} - }; - video::IQueue::SSubmitInfo submit = {}; - submit.commandBuffers = {cmds, count}; - submit.waitSemaphores = waitSem; - submit.signalSemaphores = signalSem; - m_queue->submit({&submit, 1u}); - }; + const uint32_t cooldownIterations = warmupIterations; + m_benchmarkCmdbuf->reset(video::IGPUCommandBuffer::RESET_FLAGS::NONE); + 
m_benchmarkCmdbuf->begin(video::IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + m_benchmarkCmdbuf->resetQueryPool(m_queryPool.get(), 0, 2); + m_benchmarkCmdbuf->bindComputePipeline(m_pipeline.get()); + m_benchmarkCmdbuf->bindDescriptorSets(asset::EPBP_COMPUTE, m_pplnLayout.get(), 0, 1, &m_ds.get()); for (uint32_t i = 0u; i < warmupIterations; ++i) - submitSerial(benchCmds, 1u); - - submitSerial(beforeCmds, 1u); + m_benchmarkCmdbuf->dispatch(m_dispatchGroupCount, 1, 1); + m_benchmarkCmdbuf->writeTimestamp(asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 0); for (uint32_t i = 0u; i < benchmarkIterations; ++i) - submitSerial(benchCmds, 1u); - submitSerial(afterCmds, 1u); + m_benchmarkCmdbuf->dispatch(m_dispatchGroupCount, 1, 1); + m_benchmarkCmdbuf->writeTimestamp(asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 1); + for (uint32_t i = 0u; i < cooldownIterations; ++i) + m_benchmarkCmdbuf->dispatch(m_dispatchGroupCount, 1, 1); + m_benchmarkCmdbuf->end(); + + auto semaphore = m_device->createSemaphore(0u); + const video::IQueue::SSubmitInfo::SCommandBufferInfo benchCmds[] = { {.cmdbuf = m_benchmarkCmdbuf.get()} }; + const video::IQueue::SSubmitInfo::SSemaphoreInfo signalSem[] = { + {.semaphore = semaphore.get(), .value = 1u, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT} + }; + video::IQueue::SSubmitInfo submit = {}; + submit.commandBuffers = benchCmds; + submit.signalSemaphores = signalSem; + m_queue->submit({&submit, 1u}); m_device->waitIdle(); @@ -219,36 +223,10 @@ class CSamplerBenchmark } private: - void recordBenchmarkCmdBuf() - { - m_benchmarkCmdbuf->reset(video::IGPUCommandBuffer::RESET_FLAGS::NONE); - m_benchmarkCmdbuf->begin(video::IGPUCommandBuffer::USAGE::SIMULTANEOUS_USE_BIT); - m_benchmarkCmdbuf->bindComputePipeline(m_pipeline.get()); - m_benchmarkCmdbuf->bindDescriptorSets(asset::EPBP_COMPUTE, m_pplnLayout.get(), 0, 1, &m_ds.get()); - m_benchmarkCmdbuf->dispatch(m_dispatchGroupCount, 1, 1); - 
m_benchmarkCmdbuf->end(); - } - - void recordTimestampCmdBufs() - { - m_timestampBeforeCmdbuf->reset(video::IGPUCommandBuffer::RESET_FLAGS::NONE); - m_timestampBeforeCmdbuf->begin(video::IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - m_timestampBeforeCmdbuf->resetQueryPool(m_queryPool.get(), 0, 2); - m_timestampBeforeCmdbuf->writeTimestamp(asset::PIPELINE_STAGE_FLAGS::NONE, m_queryPool.get(), 0); - m_timestampBeforeCmdbuf->end(); - - m_timestampAfterCmdbuf->reset(video::IGPUCommandBuffer::RESET_FLAGS::NONE); - m_timestampAfterCmdbuf->begin(video::IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - m_timestampAfterCmdbuf->writeTimestamp(asset::PIPELINE_STAGE_FLAGS::NONE, m_queryPool.get(), 1); - m_timestampAfterCmdbuf->end(); - } - core::smart_refctd_ptr m_device; core::smart_refctd_ptr m_logger; core::smart_refctd_ptr m_cmdpool; core::smart_refctd_ptr m_benchmarkCmdbuf; - core::smart_refctd_ptr m_timestampBeforeCmdbuf; - core::smart_refctd_ptr m_timestampAfterCmdbuf; core::smart_refctd_ptr m_queryPool; core::smart_refctd_ptr m_pplnLayout; core::smart_refctd_ptr m_pipeline; diff --git a/37_HLSLSamplingTests/main.cpp b/37_HLSLSamplingTests/main.cpp index 470132aba..e0248d034 100644 --- a/37_HLSLSamplingTests/main.cpp +++ b/37_HLSLSamplingTests/main.cpp @@ -1,5 +1,7 @@ #include +#include + #include "nbl/examples/examples.hpp" #include "nbl/this_example/builtin/build/spirv/keys.hpp" @@ -109,12 +111,19 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat // Note: all samplers almost satisfy BasicSampler, but they have cache parameters in generate(). 
static_assert(sampling::concepts::BasicSampler>); static_assert(sampling::concepts::BasicSampler>); - static_assert(sampling::concepts::BasicSampler); - static_assert(sampling::concepts::BasicSampler); + static_assert(sampling::concepts::BasicSampler, sampling::TRACKING>>); + static_assert(sampling::concepts::BasicSampler, sampling::YOLO>>); + static_assert(sampling::concepts::BasicSampler, sampling::EYTZINGER>>); + static_assert(sampling::concepts::BasicSampler, ReadOnlyAccessor, 26>>); + static_assert(sampling::concepts::BasicSampler, 4>, ReadOnlyAccessor, 26>>); // --- TractableSampler (level 2) --- generate(domain_type, out cache_type) -> codomain_type, forwardPdf(domain_type, cache_type) -> density_type - static_assert(sampling::concepts::TractableSampler); - static_assert(sampling::concepts::TractableSampler); + ; + static_assert(sampling::concepts::TractableSampler, sampling::TRACKING>>); + static_assert(sampling::concepts::TractableSampler, sampling::YOLO>>); + static_assert(sampling::concepts::TractableSampler, sampling::EYTZINGER>>); + static_assert(sampling::concepts::TractableSampler, ReadOnlyAccessor, 26>>); + static_assert(sampling::concepts::TractableSampler, 4>, ReadOnlyAccessor, 26>>); static_assert(sampling::concepts::TractableSampler>); static_assert(sampling::concepts::TractableSampler>); static_assert(sampling::concepts::TractableSampler>); @@ -130,8 +139,11 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat static_assert(sampling::concepts::TractableSampler>); // --- ResamplableSampler (level 3, parallel) --- generate(domain_type, out cache_type) -> codomain_type, forwardWeight(domain_type, cache_type), backwardWeight(codomain_type) - static_assert(sampling::concepts::ResamplableSampler); - static_assert(sampling::concepts::ResamplableSampler); + static_assert(sampling::concepts::ResamplableSampler, sampling::TRACKING>>); + static_assert(sampling::concepts::ResamplableSampler, sampling::YOLO>>); + 
static_assert(sampling::concepts::ResamplableSampler, sampling::EYTZINGER>>); + static_assert(sampling::concepts::ResamplableSampler, ReadOnlyAccessor, 26>>); + static_assert(sampling::concepts::ResamplableSampler, 4>, ReadOnlyAccessor, 26>>); static_assert(sampling::concepts::ResamplableSampler>); static_assert(sampling::concepts::ResamplableSampler>); static_assert(sampling::concepts::ResamplableSampler>); @@ -179,11 +191,8 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat // ====================================================================== // GPU throughput benchmarks // ====================================================================== - // 4096 workgroups * WORKGROUP_SIZE(64) = 256k invocations per dispatch — enough - // to saturate a 3080 (68 SMs * ~1536 resident invocations) so memory latency is - // hidden by hyperthreading rather than by cross-dispatch overlap. constexpr uint32_t testBatchCount = 4096; - constexpr bool DoBenchmark = true; + constexpr bool DoBenchmark = true; if constexpr (DoBenchmark) { @@ -195,8 +204,8 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat struct BenchEntry { CSamplerBenchmark bench; - std::string sampler; - std::string mode; + std::string sampler; + std::string mode; }; std::vector benchmarks; @@ -222,44 +231,47 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat }; // Bench shaders don't read input (hardcoded values) and write a single uint32_t per thread via RWByteAddressBuffer - constexpr size_t benchInputBytes = sizeof(uint32_t); // unused but binding must exist, didn't bother removing because some samplers need more complex inputs and it's easier to have a consistent buffer setup for all benchmarks - constexpr size_t benchOutputBytes = sizeof(uint32_t) * totalThreadsPerDispatch; - //addBench("Linear", "1:1", nbl::this_example::builtin::build::get_spirv_key<"linear_bench_1_1">(m_device.get()), benchInputBytes, 
benchOutputBytes); - //addBench("Linear", "1:16", nbl::this_example::builtin::build::get_spirv_key<"linear_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("Bilinear", "1:1", nbl::this_example::builtin::build::get_spirv_key<"bilinear_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("Bilinear", "1:16", nbl::this_example::builtin::build::get_spirv_key<"bilinear_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("BoxMullerTransform", "1:1", nbl::this_example::builtin::build::get_spirv_key<"box_muller_transform_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("BoxMullerTransform", "1:16", nbl::this_example::builtin::build::get_spirv_key<"box_muller_transform_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("UniformHemisphere", "1:1", nbl::this_example::builtin::build::get_spirv_key<"uniform_hemisphere_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("UniformHemisphere", "1:16", nbl::this_example::builtin::build::get_spirv_key<"uniform_hemisphere_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("UniformSphere", "1:1", nbl::this_example::builtin::build::get_spirv_key<"uniform_sphere_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("UniformSphere", "1:16", nbl::this_example::builtin::build::get_spirv_key<"uniform_sphere_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("ConcentricMapping", "1:1", nbl::this_example::builtin::build::get_spirv_key<"concentric_mapping_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("ConcentricMapping", "1:16", nbl::this_example::builtin::build::get_spirv_key<"concentric_mapping_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("PolarMapping", "1:1", 
nbl::this_example::builtin::build::get_spirv_key<"polar_mapping_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("PolarMapping", "1:16", nbl::this_example::builtin::build::get_spirv_key<"polar_mapping_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("ProjectedHemisphere", "1:1", nbl::this_example::builtin::build::get_spirv_key<"projected_hemisphere_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("ProjectedHemisphere", "1:16", nbl::this_example::builtin::build::get_spirv_key<"projected_hemisphere_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("ProjectedSphere", "1:1", nbl::this_example::builtin::build::get_spirv_key<"projected_sphere_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("ProjectedSphere", "1:16", nbl::this_example::builtin::build::get_spirv_key<"projected_sphere_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("SphericalRectangle", "1:1 (shape,observer)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_1_shape_observer">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("SphericalRectangle", "1:16 (shape,observer)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_16_shape_observer">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("SphericalRectangle", "1:1 (sa,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_1_sa_extents">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("SphericalRectangle", "1:16 (sa,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_16_sa_extents">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("SphericalRectangle", "1:1 (r0,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_1_r0_extents">(m_device.get()), 
benchInputBytes, benchOutputBytes); - //addBench("SphericalRectangle", "1:16 (r0,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_16_r0_extents">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("SphericalRectangle", "create-only (shape,observer)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_create_only_shape_observer">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("SphericalRectangle", "create-only (sa,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_create_only_sa_extents">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("SphericalRectangle", "create-only (r0,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_create_only_r0_extents">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("ProjectedSphericalRectangle", "1:1", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_rectangle_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("ProjectedSphericalRectangle", "1:16", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_rectangle_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("ProjectedSphericalRectangle", "create-only", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_rectangle_bench_create_only">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("SphericalTriangle", "1:1", nbl::this_example::builtin::build::get_spirv_key<"spherical_triangle_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("SphericalTriangle", "1:16", nbl::this_example::builtin::build::get_spirv_key<"spherical_triangle_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("SphericalTriangle", "create-only", 
nbl::this_example::builtin::build::get_spirv_key<"spherical_triangle_bench_create_only">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("ProjectedSphericalTriangle", "1:1", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_triangle_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("ProjectedSphericalTriangle", "1:16", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_triangle_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); - //addBench("ProjectedSphericalTriangle", "create-only", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_triangle_bench_create_only">(m_device.get()), benchInputBytes, benchOutputBytes); + if constexpr (true) + { + constexpr size_t benchInputBytes = sizeof(uint32_t); // unused but binding must exist, didn't bother removing because some samplers need more complex inputs and it's easier to have a consistent buffer setup for all benchmarks + constexpr size_t benchOutputBytes = sizeof(uint32_t) * totalThreadsPerDispatch; + addBench("Linear", "1:1", nbl::this_example::builtin::build::get_spirv_key<"linear_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("Linear", "1:16", nbl::this_example::builtin::build::get_spirv_key<"linear_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("Bilinear", "1:1", nbl::this_example::builtin::build::get_spirv_key<"bilinear_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("Bilinear", "1:16", nbl::this_example::builtin::build::get_spirv_key<"bilinear_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("BoxMullerTransform", "1:1", nbl::this_example::builtin::build::get_spirv_key<"box_muller_transform_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("BoxMullerTransform", "1:16", 
nbl::this_example::builtin::build::get_spirv_key<"box_muller_transform_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("UniformHemisphere", "1:1", nbl::this_example::builtin::build::get_spirv_key<"uniform_hemisphere_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("UniformHemisphere", "1:16", nbl::this_example::builtin::build::get_spirv_key<"uniform_hemisphere_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("UniformSphere", "1:1", nbl::this_example::builtin::build::get_spirv_key<"uniform_sphere_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("UniformSphere", "1:16", nbl::this_example::builtin::build::get_spirv_key<"uniform_sphere_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("ConcentricMapping", "1:1", nbl::this_example::builtin::build::get_spirv_key<"concentric_mapping_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("ConcentricMapping", "1:16", nbl::this_example::builtin::build::get_spirv_key<"concentric_mapping_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("PolarMapping", "1:1", nbl::this_example::builtin::build::get_spirv_key<"polar_mapping_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("PolarMapping", "1:16", nbl::this_example::builtin::build::get_spirv_key<"polar_mapping_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("ProjectedHemisphere", "1:1", nbl::this_example::builtin::build::get_spirv_key<"projected_hemisphere_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("ProjectedHemisphere", "1:16", nbl::this_example::builtin::build::get_spirv_key<"projected_hemisphere_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("ProjectedSphere", "1:1", nbl::this_example::builtin::build::get_spirv_key<"projected_sphere_bench_1_1">(m_device.get()), benchInputBytes, 
benchOutputBytes); + addBench("ProjectedSphere", "1:16", nbl::this_example::builtin::build::get_spirv_key<"projected_sphere_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("SphericalRectangle", "1:1 (shape,observer)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_1_shape_observer">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("SphericalRectangle", "1:16 (shape,observer)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_16_shape_observer">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("SphericalRectangle", "1:1 (sa,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_1_sa_extents">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("SphericalRectangle", "1:16 (sa,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_16_sa_extents">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("SphericalRectangle", "1:1 (r0,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_1_r0_extents">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("SphericalRectangle", "1:16 (r0,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_1_16_r0_extents">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("SphericalRectangle", "create-only (shape,observer)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_create_only_shape_observer">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("SphericalRectangle", "create-only (sa,extents)", nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_create_only_sa_extents">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("SphericalRectangle", "create-only (r0,extents)", 
nbl::this_example::builtin::build::get_spirv_key<"spherical_rectangle_bench_create_only_r0_extents">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("ProjectedSphericalRectangle", "1:1", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_rectangle_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("ProjectedSphericalRectangle", "1:16", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_rectangle_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("ProjectedSphericalRectangle", "create-only", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_rectangle_bench_create_only">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("SphericalTriangle", "1:1", nbl::this_example::builtin::build::get_spirv_key<"spherical_triangle_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("SphericalTriangle", "1:16", nbl::this_example::builtin::build::get_spirv_key<"spherical_triangle_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("SphericalTriangle", "create-only", nbl::this_example::builtin::build::get_spirv_key<"spherical_triangle_bench_create_only">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("ProjectedSphericalTriangle", "1:1", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_triangle_bench_1_1">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("ProjectedSphericalTriangle", "1:16", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_triangle_bench_1_16">(m_device.get()), benchInputBytes, benchOutputBytes); + addBench("ProjectedSphericalTriangle", "create-only", nbl::this_example::builtin::build::get_spirv_key<"projected_spherical_triangle_bench_create_only">(m_device.get()), benchInputBytes, benchOutputBytes); + } // Print all pipeline reports first for (auto& entry : benchmarks) @@ -268,16 +280,18 @@ class 
HLSLSamplingTests final : public application_templates::MonoDeviceApplicat // Discrete sampler benchmark: alias table vs cumulative probability (BDA) { CDiscreteSamplerBenchmark::SetupData dsData; - dsData.device = m_device; - dsData.api = m_api; - dsData.assetMgr = m_assetMgr; - dsData.logger = m_logger; - dsData.physicalDevice = m_physicalDevice; - dsData.computeFamilyIndex = getComputeQueue()->getFamilyIndex(); - dsData.aliasShaderKey = nbl::this_example::builtin::build::get_spirv_key<"alias_table_bench">(m_device.get()); - dsData.cumProbShaderKey = nbl::this_example::builtin::build::get_spirv_key<"cumulative_probability_bench">(m_device.get()); - dsData.cumProbYoloShaderKey = nbl::this_example::builtin::build::get_spirv_key<"cumulative_probability_yolo_bench">(m_device.get()); - dsData.dispatchGroupCount = testBatchCount; + dsData.device = m_device; + dsData.api = m_api; + dsData.assetMgr = m_assetMgr; + dsData.logger = m_logger; + dsData.physicalDevice = m_physicalDevice; + dsData.computeFamilyIndex = getComputeQueue()->getFamilyIndex(); + dsData.packedAliasAShaderKey = nbl::this_example::builtin::build::get_spirv_key<"packed_alias_a_bench">(m_device.get()); + dsData.packedAliasBShaderKey = nbl::this_example::builtin::build::get_spirv_key<"packed_alias_b_bench">(m_device.get()); + dsData.cumProbShaderKey = nbl::this_example::builtin::build::get_spirv_key<"cumulative_probability_bench">(m_device.get()); + dsData.cumProbYoloShaderKey = nbl::this_example::builtin::build::get_spirv_key<"cumulative_probability_yolo_bench">(m_device.get()); + dsData.cumProbEytzingerShaderKey = nbl::this_example::builtin::build::get_spirv_key<"cumulative_probability_eytzinger_bench">(m_device.get()); + dsData.dispatchGroupCount = testBatchCount; CDiscreteSamplerBenchmark discreteBench; discreteBench.setup(dsData); @@ -295,41 +309,26 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat } { - // Sweep covers both the YOLO-vs-Comparator comparison 
(explicit points at - // N=100, 10k, 1M for wg=WORKGROUP_SIZE) and an alias-vs-CDF ramp from - // N=4 up to 32M in a roughly-power-of-8 progression. + // If you change something here, better change kBenchTable below too const std::vector discreteSizes = { - 4u, - 16u, - 32u, - 100u, - 128u, - 512u, - 8192u, - 10000u, - 131072u, - 1000000u, - 2097152u, - 16777216u, - 33554432u, - }; + 2u, 4u, 8u, 16u, 32u, 64u, 100u, 128u, 256u, 400u, 512u, 1024u, 2048u, 2049u, 3000u, 4096u, 7000u, 8192u, 10'000u, 16'384u, 32'768u, + 65'536u, 131'072u, 262'144u, 524'288u, 1'000'000u, 1'048'576u, 2'097'152u, 16'777'216u, 20'971'520u, 25'165'824u, 33'554'432u}; - // Adaptive dispatch scheduler: pick dispatch counts so total wall-clock - // per sampler-per-N stays near 1.5 s. Cost model comes from the prior - // sweep (order-of-magnitude ps/sample vs N). + // Per-N dispatch counts calibrated from a prior measured run auto dispatchScheduler = [](uint32_t N) -> CDiscreteSamplerBenchmark::DispatchCounts { - double ps_per_sample; - if (N < 1000u) ps_per_sample = 15.0; // L1-resident - else if (N < 100000u) ps_per_sample = 100.0; // L1/L2 - else if (N < 2000000u) ps_per_sample = 1000.0; // L2-edge - else ps_per_sample = 8000.0; // DRAM-bound - - constexpr double targetNs = 1.5e9; // ~1.5 s per bench - constexpr uint64_t samplesPerDispatch = uint64_t(WORKGROUP_SIZE) * uint64_t(testBatchCount) * uint64_t(BENCH_ITERS); - const uint64_t targetSamples = uint64_t((targetNs * 1000.0) / ps_per_sample); - const uint32_t bench = std::max(10u, uint32_t(targetSamples / samplesPerDispatch)); - const uint32_t warmup = std::max(20u, bench / 10u); + static constexpr std::pair kBenchTable[] = { + {2u, 7180u}, {4u, 5993u}, {8u, 4490u}, {16u, 4099u}, {32u, 3110u}, {64u, 3026u}, {100u, 2507u}, {128u, 2498u}, {256u, 2477u}, {400u, 2001u}, + {512u, 1827u}, {1024u, 1372u}, {2048u, 1010u}, {2049u, 1010u}, {3000u, 859u}, {4096u, 962u}, {7000u, 742u}, {8192u, 833u}, {10'000u, 590u}, {16'384u, 786u}, {32'768u, 608u}, 
+ {65'536u, 283u}, {131'072u, 174u}, {262'144u, 160u}, {524'288u, 133u}, {1'000'000u, 77u}, {1'048'576u, 128u}, {2'097'152u, 106u}, {16'777'216u, 17u}, {20'971'520u, 17u}, {25'165'824u, 16u}, {33'554'432u, 14u}}; + uint32_t bench = 10u; // fallback for any N not in the table + for (const auto& e : kBenchTable) + if (e.first == N) + { + bench = e.second; + break; + } + const uint32_t warmup = std::max(5u, bench / 10u); return {warmup, bench}; }; @@ -354,8 +353,8 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat data.logger = m_logger; data.physicalDevice = m_physicalDevice; data.computeFamilyIndex = getComputeQueue()->getFamilyIndex(); - data.shaderKey = spirvKey; - Tester tester(testBatchCount, WORKGROUP_SIZE); + data.shaderKey = std::move(spirvKey); + Tester tester(testBatchCount); tester.setupPipeline(data); pass &= tester.performTestsAndVerifyResults(logFile); }; @@ -388,7 +387,8 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat } // --- GPU table sampler tests --- - runSamplerTest.operator()("AliasTable GPU sampler", nbl::this_example::builtin::build::get_spirv_key<"alias_table_test">(m_device.get()), "AliasTableTestLog.txt"); + runSamplerTest.operator()("PackedAliasA GPU sampler", nbl::this_example::builtin::build::get_spirv_key<"packed_alias_a_test">(m_device.get()), "PackedAliasATestLog.txt"); + runSamplerTest.operator()("PackedAliasB GPU sampler", nbl::this_example::builtin::build::get_spirv_key<"packed_alias_b_test">(m_device.get()), "PackedAliasBTestLog.txt"); runSamplerTest.operator()("CumulativeProbability GPU sampler", nbl::this_example::builtin::build::get_spirv_key<"cumulative_probability_test">(m_device.get()), "CumulativeProbabilityTestLog.txt"); } logJacobianSkipCounts(m_logger.get()); @@ -470,6 +470,7 @@ class HLSLSamplingTests final : public application_templates::MonoDeviceApplicat // ================================================================ // Solid angle accuracy and 
small triangle convergence tests (CPU-only) // ================================================================ + if constexpr (true) { m_logger->log("Running geometry tests (CPU)...", ILogger::ELL_INFO); m_logger->log("WARNING: CPU math may use higher intermediate precision than GPU shaders. Tolerances that pass here may be too tight for GPU.", ILogger::ELL_WARNING); diff --git a/37_HLSLSamplingTests/tests/CAliasTableGPUTester.h b/37_HLSLSamplingTests/tests/CAliasTableGPUTester.h index 32f0e3b28..7665ebbb7 100644 --- a/37_HLSLSamplingTests/tests/CAliasTableGPUTester.h +++ b/37_HLSLSamplingTests/tests/CAliasTableGPUTester.h @@ -6,13 +6,31 @@ #include "nbl/examples/Tester/ITester.h" #include "SamplerTestHelpers.h" -class CAliasTableGPUTester final : public ITester +// Shared GPU correctness harness for the packed alias variants. Labels for +// failed-field messages are selected from the Executor type at compile time. +template +class CPackedAliasTableGPUTester final : public ITester { - using base_t = ITester; - using R = AliasTableTestResults; + using base_t = ITester; + using R = AliasTableTestResults; + + using typename base_t::TestType; + using base_t::getRandomEngine; + using base_t::verifyTestValue; + using base_t::printTestFail; + + static constexpr bool kIsA = std::is_same_v; + static constexpr const char* kGeneratedIdxName = kIsA ? "PackedAliasA::generatedIndex" : "PackedAliasB::generatedIndex"; + static constexpr const char* kForwardPdfName = kIsA ? "PackedAliasA::forwardPdf" : "PackedAliasB::forwardPdf"; + static constexpr const char* kBackwardPdfName = kIsA ? "PackedAliasA::backwardPdf" : "PackedAliasB::backwardPdf"; + static constexpr const char* kForwardWeightName = kIsA ? "PackedAliasA::forwardWeight" : "PackedAliasB::forwardWeight"; + static constexpr const char* kBackwardWeightName = kIsA ? "PackedAliasA::backwardWeight" : "PackedAliasB::backwardWeight"; + static constexpr const char* kJacobianName = kIsA ? 
"PackedAliasA::jacobianProduct" : "PackedAliasB::jacobianProduct"; + static constexpr const char* kPdfConsistencyName = kIsA ? "PackedAliasA::pdf consistency" : "PackedAliasB::pdf consistency"; + static constexpr const char* kWeightConsistencyName = kIsA ? "PackedAliasA::weight consistency" : "PackedAliasB::weight consistency"; public: - CAliasTableGPUTester(const uint32_t testBatchCount, const uint32_t workgroupSize) : base_t(testBatchCount, workgroupSize) {} + CPackedAliasTableGPUTester(const uint32_t testBatchCount) : base_t(testBatchCount, WORKGROUP_SIZE) {} private: AliasTableInputValues generateInputTestValues() override @@ -27,7 +45,7 @@ class CAliasTableGPUTester final : public ITester; +using CPackedAliasBGPUTester = CPackedAliasTableGPUTester; + #endif diff --git a/37_HLSLSamplingTests/tests/CBilinearTester.h b/37_HLSLSamplingTests/tests/CBilinearTester.h index 739af4584..f5bea6896 100644 --- a/37_HLSLSamplingTests/tests/CBilinearTester.h +++ b/37_HLSLSamplingTests/tests/CBilinearTester.h @@ -14,7 +14,7 @@ class CBilinearTester final : public ITester #include #include +#include // Generic ReadOnly accessor wrapping a raw pointer template + requires std::is_arithmetic_v struct ReadOnlyAccessor { - using value_type = T; - template requires std::is_arithmetic_v - void get(I i, V& val) const { val = V(data[i]); } - T operator[](uint32_t i) const { return data[i]; } + using value_type = T; + template + requires std::is_arithmetic_v + void get(I i, V& val) const { val = V(data[i]); } - const T* data; + const T* data; }; -using ProbabilityAccessor = ReadOnlyAccessor; -using AliasIndexAccessor = ReadOnlyAccessor; -using PdfAccessor = ReadOnlyAccessor; - -using TestAliasTable = nbl::hlsl::sampling::AliasTable; -using TestCumulativeProbabilitySampler = nbl::hlsl::sampling::CumulativeProbabilitySampler>; - // Tests table construction for both alias method and cumulative probability. 
// Sampler generate/pdf correctness is verified by GPU testers (CAliasTableGPUTester, CCumulativeProbabilityGPUTester). class CDiscreteTableTester { -public: - CDiscreteTableTester(system::ILogger* logger) : m_logger(logger) {} - - bool run() - { - bool pass = true; - auto cases = createTestCases(); - - m_logger->log("AliasTableBuilder tests:", system::ILogger::ELL_INFO); - for (const auto& tc : cases) - pass &= testAliasTable(tc.name, tc.weights); - - m_logger->log("CumulativeProbability tests:", system::ILogger::ELL_INFO); - for (const auto& tc : cases) - pass &= testCumulativeProbability(tc.name, tc.weights); - - return pass; - } - -private: - struct TestCase - { - const char* name; - std::vector weights; - }; - - static std::vector createTestCases() - { - std::vector cases; - cases.push_back({"Uniform(4)", {1.0f, 1.0f, 1.0f, 1.0f}}); - cases.push_back({"NonUniform(1,2,3,4)", {1.0f, 2.0f, 3.0f, 4.0f}}); - - { - std::vector w(32, 1.0f); - w[31] = 97.0f; - cases.push_back({"SingleDominant(32)", std::move(w)}); - } - { - std::vector w(64); - for (uint32_t i = 0; i < 64; i++) - w[i] = 1.0f / float(i + 1); - cases.push_back({"PowerLaw(64)", std::move(w)}); - } - - cases.push_back({"SingleNonZero(4)", {0.0f, 0.0f, 5.0f, 0.0f}}); - - { - std::vector w(1024); - std::mt19937 rng(42); - std::uniform_real_distribution dist(0.001f, 100.0f); - for (uint32_t i = 0; i < 1024; i++) - w[i] = dist(rng); - cases.push_back({"Random(1024)", std::move(w)}); - } - - return cases; - } - - // Verify all values in array are in [0, 1] - bool verifyRange01(const char* prefix, const char* name, const char* arrayName, const float* data, uint32_t count) const - { - bool pass = true; - for (uint32_t i = 0; i < count; i++) - { - if (data[i] < 0.0f || data[i] > 1.0f + 1e-6f) - { - m_logger->log("%s[%s] %s[%u] = %f out of range [0, 1]", - system::ILogger::ELL_ERROR, prefix, name, arrayName, i, data[i]); - pass = false; - } - } - return pass; - } - - // Shared: verify PDFs sum to 1 and each 
matches weight/totalWeight - bool verifyPdf(const char* prefix, const char* name, const float* pdf, const std::vector& weights) const - { - const uint32_t N = static_cast(weights.size()); - float totalWeight = 0.0f; - for (uint32_t i = 0; i < N; i++) - totalWeight += weights[i]; - - bool pass = true; - - float pdfSum = 0.0f; - for (uint32_t i = 0; i < N; i++) - pdfSum += pdf[i]; - - if (std::abs(pdfSum - 1.0f) > 1e-5f) - { - m_logger->log("%s[%s] PDF sum: expected 1.0, got %f", system::ILogger::ELL_ERROR, prefix, name, pdfSum); - pass = false; - } - - for (uint32_t i = 0; i < N; i++) - { - const float expected = weights[i] / totalWeight; - const float err = std::abs(expected - pdf[i]); - if (err > 1e-6f) - { - m_logger->log("%s[%s] pdf[%u]: expected %f, got %f (err=%e)", system::ILogger::ELL_ERROR, prefix, name, i, expected, pdf[i], err); - pass = false; - } - } - - return pass; - } - - // Verify alias table builder output: - // - bucket contributions reconstruct correct probabilities - // - PDFs sum to 1 and match weight/totalWeight - // - alias indices in range, probabilities in [0, 1] - bool testAliasTable(const char* name, const std::vector& weights) const - { - const uint32_t N = static_cast(weights.size()); - - std::vector outProbability(N); - std::vector outAlias(N); - std::vector outPdf(N); - std::vector workspace(N); - - nbl::hlsl::sampling::AliasTableBuilder::build({ weights },outProbability.data(), outAlias.data(), outPdf.data(), workspace.data()); - - // Accumulate bucket contributions - std::vector dest(N, 0.0f); - for (uint32_t i = 0; i < N; i++) - { - dest[i] += outProbability[i]; - dest[outAlias[i]] += (1.0f - outProbability[i]); - } - - bool pass = true; - - float totalWeight = 0.0f; - for (uint32_t i = 0; i < N; i++) - totalWeight += weights[i]; - - for (uint32_t i = 0; i < N; i++) - { - const float expected = weights[i] / totalWeight * float(N); - const float err = std::abs(expected - dest[i]); - const float tolerance = std::max(1e-5f * float(N), 
1e-4f); - - if (err > tolerance) - { - m_logger->log("AliasTable[%s] bucket %u: expected %f, got %f (err=%e)", - system::ILogger::ELL_ERROR, name, i, expected, dest[i], err); - pass = false; - } - } - - // Alias indices in range - for (uint32_t i = 0; i < N; i++) - { - if (outAlias[i] >= N) - { - m_logger->log("AliasTable[%s] alias[%u] = %u out of range [0, %u)", - system::ILogger::ELL_ERROR, name, i, outAlias[i], N); - pass = false; - } - } - - pass &= verifyPdf("AliasTable", name, outPdf.data(), weights); - pass &= verifyRange01("AliasTable", name, "probability", outProbability.data(), N); - - if (pass) - m_logger->log(" [%s] PASSED", system::ILogger::ELL_PERFORMANCE, name); - - return pass; - } - - // Verify CDF table construction: - // - cumulative probabilities are monotonically non-decreasing - // - PDFs match weight/totalWeight - // - PDFs sum to 1 - bool testCumulativeProbability(const char* name, const std::vector& weights) const - { - const uint32_t N = static_cast(weights.size()); - - std::vector cumProb(N - 1); - - nbl::hlsl::sampling::computeNormalizedCumulativeHistogram( - std::span(weights), - cumProb.data()); - - bool pass = true; - - // Monotonically non-decreasing - for (uint32_t i = 1; i < N - 1; i++) - { - if (cumProb[i] < cumProb[i - 1] - 1e-7f) - { - m_logger->log("CumProb[%s] non-monotonic at %u: cumProb[%u]=%f < cumProb[%u]=%f", - system::ILogger::ELL_ERROR, name, i, i, cumProb[i], i - 1, cumProb[i - 1]); - pass = false; - } - } - - // Last stored entry should be < 1.0 (the Nth bucket is implicitly 1.0) - if (N > 1 && cumProb[N - 2] >= 1.0f + 1e-6f) - { - m_logger->log("CumProb[%s] last stored entry %f >= 1.0", - system::ILogger::ELL_ERROR, name, cumProb[N - 2]); - pass = false; - } - - // Derive PDF from CDF for verification - std::vector pdf(N); - for (uint32_t i = 0; i < N; i++) - { - const float cur = (i < N - 1) ? cumProb[i] : 1.0f; - const float prev = (i > 0) ? 
cumProb[i - 1] : 0.0f; - pdf[i] = cur - prev; - } - - pass &= verifyPdf("CumProb", name, pdf.data(), weights); - pass &= verifyRange01("CumProb", name, "cumProb", cumProb.data(), N - 1); - - if (pass) - m_logger->log(" [%s] PASSED", system::ILogger::ELL_PERFORMANCE, name); - - return pass; - } - - system::ILogger* m_logger; + public: + CDiscreteTableTester(system::ILogger* logger) : m_logger(logger) {} + + bool run() + { + bool pass = true; + auto cases = createTestCases(); + + m_logger->log("AliasTableBuilder tests:", system::ILogger::ELL_INFO); + for (const auto& tc : cases) + pass &= testAliasTable(tc.name, tc.weights); + + m_logger->log("CumulativeProbability tests:", system::ILogger::ELL_INFO); + for (const auto& tc : cases) + pass &= testCumulativeProbability(tc.name, tc.weights); + + m_logger->log("CumulativeProbabilitySampler tests (TRACKING / YOLO / EYTZINGER):", system::ILogger::ELL_INFO); + for (const auto& tc : cases) + pass &= testSamplers(tc.name, tc.weights); + + return pass; + } + + private: + struct TestCase + { + const char* name; + std::vector weights; + }; + + static std::vector createTestCases() + { + std::vector cases; + cases.push_back({"Uniform(4)", {1.0f, 1.0f, 1.0f, 1.0f}}); + cases.push_back({"NonUniform(1,2,3,4)", {1.0f, 2.0f, 3.0f, 4.0f}}); + + { + std::vector w(32, 1.0f); + w[31] = 97.0f; + cases.push_back({"SingleDominant(32)", std::move(w)}); + } + { + std::vector w(64); + for (uint32_t i = 0; i < 64; i++) + w[i] = 1.0f / float(i + 1); + cases.push_back({"PowerLaw(64)", std::move(w)}); + } + + cases.push_back({"SingleNonZero(4)", {0.0f, 0.0f, 5.0f, 0.0f}}); + + { + std::vector w(1024); + std::mt19937 rng(42); + std::uniform_real_distribution dist(0.001f, 100.0f); + for (uint32_t i = 0; i < 1024; i++) + w[i] = dist(rng); + cases.push_back({"Random(1024)", std::move(w)}); + } + + // NPoT cases exercise EYTZINGER padded-leaf territory (P > N). 
+ cases.push_back({"NonPot(7)", {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f}}); + { + std::vector w(1000); + std::mt19937 rng(4242); + std::uniform_real_distribution dist(0.001f, 100.0f); + for (uint32_t i = 0; i < 1000; i++) + w[i] = dist(rng); + cases.push_back({"Random(1000)", std::move(w)}); + } + + return cases; + } + + // Verify all values in array are in [0, 1] + bool verifyRange01(const char* prefix, const char* name, const char* arrayName, const float* data, uint32_t count) const + { + bool pass = true; + for (uint32_t i = 0; i < count; i++) + { + if (data[i] < 0.0f || data[i] > 1.0f + 1e-6f) + { + m_logger->log("%s[%s] %s[%u] = %f out of range [0, 1]", + system::ILogger::ELL_ERROR, prefix, name, arrayName, i, data[i]); + pass = false; + } + } + return pass; + } + + // Shared: verify PDFs sum to 1 and each matches weight/totalWeight + bool verifyPdf(const char* prefix, const char* name, const float* pdf, const std::vector& weights) const + { + const uint32_t N = static_cast(weights.size()); + float totalWeight = 0.0f; + for (uint32_t i = 0; i < N; i++) + totalWeight += weights[i]; + + bool pass = true; + + float pdfSum = 0.0f; + for (uint32_t i = 0; i < N; i++) + pdfSum += pdf[i]; + + if (std::abs(pdfSum - 1.0f) > 1e-5f) + { + m_logger->log("%s[%s] PDF sum: expected 1.0, got %f", system::ILogger::ELL_ERROR, prefix, name, pdfSum); + pass = false; + } + + for (uint32_t i = 0; i < N; i++) + { + const float expected = weights[i] / totalWeight; + const float err = std::abs(expected - pdf[i]); + if (err > 1e-6f) + { + m_logger->log("%s[%s] pdf[%u]: expected %f, got %f (err=%e)", system::ILogger::ELL_ERROR, prefix, name, i, expected, pdf[i], err); + pass = false; + } + } + + return pass; + } + + // Verify alias table builder output: + // - bucket contributions reconstruct correct scaled probabilities + // - PDFs sum to 1 and match weight/totalWeight + // - alias indices in range, probabilities in [0, 1] + // Builder transparently pads PoT N to N+1; actual table 
size comes back + // as `tableN` and is what gets compared against. + bool testAliasTable(const char* name, const std::vector& weights) const + { + const uint32_t userN = static_cast(weights.size()); + + std::vector outProbability; + std::vector outAlias; + std::vector outPdf; + const uint32_t tableN = nbl::hlsl::sampling::AliasTableBuilder::build({weights}, outProbability, outAlias, outPdf); + + // Accumulate bucket contributions over the full (possibly padded) table + std::vector dest(tableN, 0.0f); + for (uint32_t i = 0; i < tableN; i++) + { + dest[i] += outProbability[i]; + dest[outAlias[i]] += (1.0f - outProbability[i]); + } + + bool pass = true; + + float totalWeight = 0.0f; + for (uint32_t i = 0; i < userN; i++) + totalWeight += weights[i]; + + // Real buckets: expected scaled prob = weight/total * tableN + for (uint32_t i = 0; i < userN; i++) + { + const float expected = weights[i] / totalWeight * float(tableN); + const float err = std::abs(expected - dest[i]); + const float tolerance = std::max(1e-5f * float(tableN), 1e-4f); + + if (err > tolerance) + { + m_logger->log("AliasTable[%s] bucket %u: expected %f, got %f (err=%e)", + system::ILogger::ELL_ERROR, name, i, expected, dest[i], err); + pass = false; + } + } + + // Dummy bucket (only when padded): no real bucket aliases to it -> dest[userN] should be 0. 
+ if (tableN != userN && std::abs(dest[userN]) > 1e-4f) + { + m_logger->log("AliasTable[%s] dummy bucket %u has non-zero reconstructed probability %f", + system::ILogger::ELL_ERROR, name, userN, dest[userN]); + pass = false; + } + + // Alias indices in range [0, tableN) + for (uint32_t i = 0; i < tableN; i++) + { + if (outAlias[i] >= tableN) + { + m_logger->log("AliasTable[%s] alias[%u] = %u out of range [0, %u)", + system::ILogger::ELL_ERROR, name, i, outAlias[i], tableN); + pass = false; + } + } + + pass &= verifyPdf("AliasTable", name, outPdf.data(), weights); + pass &= verifyRange01("AliasTable", name, "probability", outProbability.data(), tableN); + + if (pass) + m_logger->log(" [%s] PASSED", system::ILogger::ELL_PERFORMANCE, name); + + return pass; + } + + // Verify CDF table construction: monotonicity, implicit-1.0 invariant, and + // stored entries in [0, 1]. PDF-from-CDF correctness is covered by the + // TRACKING sampler test below (same cdf[i] - cdf[i-1] derivation via + // sampler.backwardPdf), so it's not repeated here. 
+ bool testCumulativeProbability(const char* name, const std::vector& weights) const + { + const uint32_t N = static_cast(weights.size()); + + std::vector cumProb(N - 1); + + nbl::hlsl::sampling::computeNormalizedCumulativeHistogram(std::span(weights), cumProb.data()); + + bool pass = true; + + // Monotonically non-decreasing + for (uint32_t i = 1; i < N - 1; i++) + { + if (cumProb[i] < cumProb[i - 1] - 1e-7f) + { + m_logger->log("CumProb[%s] non-monotonic at %u: cumProb[%u]=%f < cumProb[%u]=%f", + system::ILogger::ELL_ERROR, name, i, i, cumProb[i], i - 1, cumProb[i - 1]); + pass = false; + } + } + + // Last stored entry should be < 1.0 (the Nth bucket is implicitly 1.0) + if (N > 1 && cumProb[N - 2] >= 1.0f + 1e-6f) + { + m_logger->log("CumProb[%s] last stored entry %f >= 1.0", system::ILogger::ELL_ERROR, name, cumProb[N - 2]); + pass = false; + } + + pass &= verifyRange01("CumProb", name, "cumProb", cumProb.data(), N - 1); + + if (pass) + m_logger->log(" [%s] PASSED", system::ILogger::ELL_PERFORMANCE, name); + + return pass; + } + + // Reference binary search over the full N-entry CDF (last entry == 1.0). + static uint32_t referenceUpperBound(const std::vector& fullCdf, float u) + { + auto it = std::upper_bound(fullCdf.begin(), fullCdf.end(), u); + return static_cast(std::distance(fullCdf.begin(), it)); + } + + // Run TRACKING, YOLO, and EYTZINGER samplers against the same reference + // distribution. Each mode is instantiated via the dual-compile sampler and + // exercised entirely on the CPU. 
+ bool testSamplers(const char* name, const std::vector& weights) const + { + const uint32_t N = static_cast(weights.size()); + if (N < 2) + return true; + + float totalWeight = 0.0f; + for (uint32_t i = 0; i < N; i++) + totalWeight += weights[i]; + const float rcpTotal = 1.0f / totalWeight; + + std::vector pdfRef(N); + std::vector fullCdf(N); + float acc = 0.0f; + for (uint32_t i = 0; i < N; i++) + { + pdfRef[i] = weights[i] * rcpTotal; + acc += pdfRef[i]; + fullCdf[i] = acc; + } + fullCdf[N - 1] = 1.0f; // pin the last entry; reference must treat it as exact + + // Storage for TRACKING / YOLO (N-1 entries, last bucket implicit at 1.0). + std::vector cdfStorage(N - 1); + nbl::hlsl::sampling::computeNormalizedCumulativeHistogram({weights}, cdfStorage.data()); + + // Storage for EYTZINGER (2*P entries, level-order implicit binary tree). + const uint32_t P = nbl::hlsl::sampling::eytzingerLeafCount(N); + std::vector treeStorage(2u * P, 0.0f); + nbl::hlsl::sampling::buildEytzinger({weights}, treeStorage.data()); + + bool pass = true; + pass &= testSamplerMode("TRACKING", name, N, pdfRef, fullCdf, cdfStorage.data()); + pass &= testSamplerMode("YOLO", name, N, pdfRef, fullCdf, cdfStorage.data()); + pass &= testSamplerMode("EYTZINGER", name, N, pdfRef, fullCdf, treeStorage.data()); + return pass; + } + + template + bool testSamplerMode(const char* modeName, const char* caseName, uint32_t N, + const std::vector& pdfRef, const std::vector& fullCdf, const float* accessorData) const + { + using Sampler = nbl::hlsl::sampling::CumulativeProbabilitySampler< + float, float, uint32_t, ReadOnlyAccessor, Mode>; + + ReadOnlyAccessor accessor {accessorData}; + Sampler sampler = Sampler::create(accessor, N); + + bool pass = true; + + // backwardPdf(v) == pdfRef[v], and the implied PDF sums to 1. 
+ float backwardSum = 0.0f; + for (uint32_t v = 0; v < N; v++) + { + const float got = sampler.backwardPdf(v); + const float expected = pdfRef[v]; + const float err = std::abs(got - expected); + const float tol = 1e-5f; + if (err > tol) + { + m_logger->log("Sampler[%s][%s] backwardPdf[%u]: expected %e, got %e (err=%e)", + system::ILogger::ELL_ERROR, modeName, caseName, v, expected, got, err); + pass = false; + } + backwardSum += got; + } + if (std::abs(backwardSum - 1.0f) > 1e-5f) + { + m_logger->log("Sampler[%s][%s] backwardPdf sum: expected 1.0, got %f", + system::ILogger::ELL_ERROR, modeName, caseName, backwardSum); + pass = false; + } + + // generate(u) lands in the correct bucket for a grid of u values, and + // generate(u, cache) produces forwardPdf matching backwardPdf(result). + std::mt19937 rng(1234u + N); + std::uniform_real_distribution udist(0.0f, std::nextafter(1.0f, 0.0f)); + constexpr uint32_t kTrials = 2048; + + for (uint32_t k = 0; k < kTrials; k++) + { + const float u = udist(rng); + const uint32_t ref = referenceUpperBound(fullCdf, u); + + const uint32_t idx = sampler.generate(u); + if (idx != ref) + { + m_logger->log("Sampler[%s][%s] generate(%.7f): expected bucket %u, got %u", + system::ILogger::ELL_ERROR, modeName, caseName, u, ref, idx); + pass = false; + continue; + } + + typename Sampler::cache_type cache; + const uint32_t idxCache = sampler.generate(u, cache); + if (idxCache != ref) + { + m_logger->log("Sampler[%s][%s] generate(u,cache)(%.7f): expected %u, got %u", + system::ILogger::ELL_ERROR, modeName, caseName, u, ref, idxCache); + pass = false; + continue; + } + + const float forwardP = sampler.forwardPdf(u, cache); + const float backwardP = sampler.backwardPdf(idxCache); + if (std::abs(forwardP - backwardP) > 1e-6f) + { + m_logger->log("Sampler[%s][%s] fwd/bwd pdf mismatch at u=%.7f bucket=%u: fwd=%e bwd=%e", + system::ILogger::ELL_ERROR, modeName, caseName, u, idxCache, forwardP, backwardP); + pass = false; + } + } + + if (pass) + 
m_logger->log(" [%-9s %s] PASSED", system::ILogger::ELL_PERFORMANCE, modeName, caseName); + return pass; + } + + system::ILogger* m_logger; }; #endif diff --git a/37_HLSLSamplingTests/tests/CLinearTester.h b/37_HLSLSamplingTests/tests/CLinearTester.h index 814fbb1d7..394b68721 100644 --- a/37_HLSLSamplingTests/tests/CLinearTester.h +++ b/37_HLSLSamplingTests/tests/CLinearTester.h @@ -14,7 +14,7 @@ class CLinearTester final : public ITestergetFamilyIndex(); m_cmdpool = base.m_device->createCommandPool(m_queueFamily, IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); - //core::smart_refctd_ptr* cmdBuffs[] = { &m_cmdbuf, &m_timestampBeforeCmdBuff, &m_timestampAfterCmdBuff }; if (!m_cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_cmdbuf)) base.logFail("Failed to create Command Buffers!\n"); - if (!m_cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_timestampBeforeCmdBuff)) - base.logFail("Failed to create Command Buffers!\n"); - if (!m_cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_timestampAfterCmdBuff)) - base.logFail("Failed to create Command Buffers!\n"); // Load shaders, set up pipeline { @@ -1024,6 +1019,7 @@ class CompatibilityTest final : public MonoDeviceApplication, public BuiltinReso dummyBuff->setObjectDebugName("benchmark buffer"); nbl::video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = dummyBuff->getMemoryReqs(); + reqs.memoryTypeBits &= base.m_physicalDevice->getDeviceLocalMemoryTypeBits(); m_allocation = base.m_device->allocate(reqs, dummyBuff.get(), nbl::video::IDeviceMemoryAllocation::EMAF_NONE); if (!m_allocation.isValid()) @@ -1075,104 +1071,51 @@ class CompatibilityTest final : public MonoDeviceApplication, public BuiltinReso { m_device->waitIdle(); - recordTimestampQueryCmdBuffers(); - - uint64_t semaphoreCounter = 0; - smart_refctd_ptr semaphore = m_device->createSemaphore(semaphoreCounter); - - IQueue::SSubmitInfo::SSemaphoreInfo signals[] = { 
{.semaphore = semaphore.get(), .value = 0u, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT} }; - IQueue::SSubmitInfo::SSemaphoreInfo waits[] = { {.semaphore = semaphore.get(), .value = 0u, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT } }; - - IQueue::SSubmitInfo beforeTimestapSubmitInfo[1] = {}; - const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufsBegin[] = { {.cmdbuf = m_timestampBeforeCmdBuff.get()} }; - beforeTimestapSubmitInfo[0].commandBuffers = cmdbufsBegin; - beforeTimestapSubmitInfo[0].signalSemaphores = signals; - beforeTimestapSubmitInfo[0].waitSemaphores = waits; - - IQueue::SSubmitInfo afterTimestapSubmitInfo[1] = {}; - const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufsEnd[] = { {.cmdbuf = m_timestampAfterCmdBuff.get()} }; - afterTimestapSubmitInfo[0].commandBuffers = cmdbufsEnd; - afterTimestapSubmitInfo[0].signalSemaphores = signals; - afterTimestapSubmitInfo[0].waitSemaphores = waits; - - IQueue::SSubmitInfo benchmarkSubmitInfos[1] = {}; - const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[] = { {.cmdbuf = m_cmdbuf.get()} }; - benchmarkSubmitInfos[0].commandBuffers = cmdbufs; - benchmarkSubmitInfos[0].signalSemaphores = signals; - benchmarkSubmitInfos[0].waitSemaphores = waits; - - m_pushConstants.benchmarkMode = mode; - recordCmdBuff(); - // warmup runs - for (int i = 0; i < WarmupIterations; ++i) - { - if(i == 0) - m_api->startCapture(); - waits[0].value = semaphoreCounter; - signals[0].value = ++semaphoreCounter; - m_computeQueue->submit(benchmarkSubmitInfos); - if (i == 0) - m_api->endCapture(); - } - - waits[0].value = semaphoreCounter; - signals[0].value = ++semaphoreCounter; - m_computeQueue->submit(beforeTimestapSubmitInfo); - - // actual benchmark runs - for (int i = 0; i < Iterations; ++i) - { - waits[0].value = semaphoreCounter; - signals[0].value = ++semaphoreCounter; - m_computeQueue->submit(benchmarkSubmitInfos); - } - - waits[0].value = semaphoreCounter; - signals[0].value = ++semaphoreCounter; - 
m_computeQueue->submit(afterTimestapSubmitInfo); - - m_device->waitIdle(); + // [warmup dispatches][ts 0][bench dispatches][ts 1][cooldown dispatches] in one cmdbuf, + // one submit. Per-submit semaphore chaining adds sync cost and blocks driver pipelining; + // the cooldown keeps the GPU in steady state across ts 1 so the trailing bench + // dispatches don't land in a winding-down tail. + constexpr int CooldownIterations = WarmupIterations; - const uint64_t nativeBenchmarkTimeElapsedNanoseconds = calcTimeElapsed(); - const float nativeBenchmarkTimeElapsedSeconds = double(nativeBenchmarkTimeElapsedNanoseconds) / 1000000000.0; - - m_logger->log("%llu ns, %f s", ILogger::ELL_PERFORMANCE, nativeBenchmarkTimeElapsedNanoseconds, nativeBenchmarkTimeElapsedSeconds); - } - - void recordCmdBuff() - { - m_cmdbuf->begin(IGPUCommandBuffer::USAGE::SIMULTANEOUS_USE_BIT); + m_cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::NONE); + m_cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); m_cmdbuf->beginDebugMarker("emulated_float64_t compute dispatch", vectorSIMDf(0, 1, 0, 1)); + m_cmdbuf->resetQueryPool(m_queryPool.get(), 0, 2); m_cmdbuf->bindComputePipeline(m_pipeline.get()); m_cmdbuf->bindDescriptorSets(nbl::asset::EPBP_COMPUTE, m_pplnLayout.get(), 0, 1, &m_ds.get()); m_cmdbuf->pushConstants(m_pplnLayout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(BenchmarkPushConstants), &m_pushConstants); - m_cmdbuf->dispatch(BENCHMARK_WORKGROUP_COUNT, 1, 1); + for (int i = 0; i < WarmupIterations; ++i) + m_cmdbuf->dispatch(BENCHMARK_WORKGROUP_COUNT, 1, 1); + m_cmdbuf->writeTimestamp(PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 0); + for (int i = 0; i < Iterations; ++i) + m_cmdbuf->dispatch(BENCHMARK_WORKGROUP_COUNT, 1, 1); + m_cmdbuf->writeTimestamp(PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 1); + for (int i = 0; i < CooldownIterations; ++i) + m_cmdbuf->dispatch(BENCHMARK_WORKGROUP_COUNT, 1, 1); m_cmdbuf->endDebugMarker(); m_cmdbuf->end(); - 
} - void recordTimestampQueryCmdBuffers() - { - static bool firstInvocation = true; + smart_refctd_ptr semaphore = m_device->createSemaphore(0u); + const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[] = { {.cmdbuf = m_cmdbuf.get()} }; + const IQueue::SSubmitInfo::SSemaphoreInfo signalSem[] = { + {.semaphore = semaphore.get(), .value = 1u, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT} + }; + IQueue::SSubmitInfo submit = {}; + submit.commandBuffers = cmdbufs; + submit.signalSemaphores = signalSem; - if (!firstInvocation) - { - m_timestampBeforeCmdBuff->reset(IGPUCommandBuffer::RESET_FLAGS::NONE); - m_timestampBeforeCmdBuff->reset(IGPUCommandBuffer::RESET_FLAGS::NONE); - } + m_api->startCapture(); + m_computeQueue->submit({&submit, 1u}); + m_api->endCapture(); - m_timestampBeforeCmdBuff->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - m_timestampBeforeCmdBuff->resetQueryPool(m_queryPool.get(), 0, 2); - m_timestampBeforeCmdBuff->writeTimestamp(PIPELINE_STAGE_FLAGS::NONE, m_queryPool.get(), 0); - m_timestampBeforeCmdBuff->end(); + m_device->waitIdle(); - m_timestampAfterCmdBuff->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - m_timestampAfterCmdBuff->writeTimestamp(PIPELINE_STAGE_FLAGS::NONE, m_queryPool.get(), 1); - m_timestampAfterCmdBuff->end(); + const uint64_t nativeBenchmarkTimeElapsedNanoseconds = calcTimeElapsed(); + const float nativeBenchmarkTimeElapsedSeconds = double(nativeBenchmarkTimeElapsedNanoseconds) / 1000000000.0; - firstInvocation = false; + m_logger->log("%llu ns, %f s", ILogger::ELL_PERFORMANCE, nativeBenchmarkTimeElapsedNanoseconds, nativeBenchmarkTimeElapsedSeconds); } uint64_t calcTimeElapsed() @@ -1196,8 +1139,6 @@ class CompatibilityTest final : public MonoDeviceApplication, public BuiltinReso BenchmarkPushConstants m_pushConstants; smart_refctd_ptr m_pipeline; - smart_refctd_ptr m_timestampBeforeCmdBuff = nullptr; - smart_refctd_ptr m_timestampAfterCmdBuff = nullptr; smart_refctd_ptr m_queryPool = nullptr; 
uint32_t m_queueFamily; From 07e76965a740d8a420779860de133dd88a3081f6 Mon Sep 17 00:00:00 2001 From: Karim Mohamed Date: Mon, 27 Apr 2026 22:06:49 +0300 Subject: [PATCH 22/26] major refactor, better drawing, better benchmarks --- 31_HLSLPathTracer/main.cpp | 2 +- 73_SolidAngleVisualizer/CMakeLists.txt | 64 +- .../app_resources/hlsl/Drawing.hlsl | 980 ++--- .../hlsl/benchmark/benchmark.comp.hlsl | 326 +- .../app_resources/hlsl/benchmark/common.hlsl | 3 +- .../app_resources/hlsl/common.hlsl | 263 +- .../app_resources/hlsl/debug_vis.hlsl | 150 + .../app_resources/hlsl/gpu_common.hlsl | 175 - .../hlsl/parallelogram_sampling.hlsl | 40 +- .../app_resources/hlsl/pyramid_sampling.hlsl | 817 ++-- .../hlsl/pyramid_sampling/bilinear.hlsl | 54 +- .../hlsl/pyramid_sampling/biquadratic.hlsl | 166 +- .../hlsl/pyramid_sampling/urena.hlsl | 87 - .../app_resources/hlsl/ray_vis.frag.hlsl | 239 +- .../app_resources/hlsl/silhouette.hlsl | 589 +-- .../hlsl/solid_angle_vis.frag.hlsl | 460 +-- .../app_resources/hlsl/triangle_sampling.hlsl | 375 +- .../app_resources/hlsl/utils.hlsl | 43 +- 73_SolidAngleVisualizer/include/transform.hpp | 2 +- 73_SolidAngleVisualizer/main.cpp | 3419 +++++++++-------- 20 files changed, 3810 insertions(+), 4444 deletions(-) create mode 100644 73_SolidAngleVisualizer/app_resources/hlsl/debug_vis.hlsl delete mode 100644 73_SolidAngleVisualizer/app_resources/hlsl/gpu_common.hlsl delete mode 100644 73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling/urena.hlsl diff --git a/31_HLSLPathTracer/main.cpp b/31_HLSLPathTracer/main.cpp index 4668580bd..749c2787e 100644 --- a/31_HLSLPathTracer/main.cpp +++ b/31_HLSLPathTracer/main.cpp @@ -439,7 +439,7 @@ class HLSLComputePathtracer final : public SimpleWindowedApplication, public Bui nullptr, nullptr ); - m_presentPipeline = fsTriProtoPPln.createPipeline(fragSpec, presentLayout.get(), scRes->getRenderpass(), 0u, {}, hlsl::SurfaceTransform::FLAG_BITS::IDENTITY_BIT, m_pipelineCache.object.get()); + m_presentPipeline 
= fsTriProtoPPln.createPipeline(fragSpec, presentLayout.get(), scRes->getRenderpass(), 0u, {}, {}, hlsl::SurfaceTransform::FLAG_BITS::IDENTITY_BIT, m_pipelineCache.object.get()); if (!m_presentPipeline) return logFail("Could not create Graphics Pipeline!"); m_pipelineCache.dirty = true; diff --git a/73_SolidAngleVisualizer/CMakeLists.txt b/73_SolidAngleVisualizer/CMakeLists.txt index 6438c8e06..6dbc19664 100644 --- a/73_SolidAngleVisualizer/CMakeLists.txt +++ b/73_SolidAngleVisualizer/CMakeLists.txt @@ -11,6 +11,7 @@ if(NBL_BUILD_IMGUI) imtestengine imguizmo "${NBL_EXT_IMGUI_UI_LIB}" + Nabla::ext::FullScreenTriangle ) if(NBL_EMBED_BUILTIN_RESOURCES) @@ -40,14 +41,21 @@ if(NBL_BUILD_IMGUI) set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/auto-gen") set(DEPENDS app_resources/hlsl/common.hlsl - app_resources/hlsl/gpu_common.hlsl - app_resources/hlsl/Drawing.hlsl - app_resources/hlsl/Sampling.hlsl + app_resources/hlsl/debug_vis.hlsl + app_resources/hlsl/drawing.hlsl app_resources/hlsl/silhouette.hlsl app_resources/hlsl/utils.hlsl + app_resources/hlsl/triangle_sampling.hlsl app_resources/hlsl/parallelogram_sampling.hlsl + app_resources/hlsl/pyramid_sampling.hlsl + + app_resources/hlsl/pyramid_sampling/bilinear.hlsl + app_resources/hlsl/pyramid_sampling/biquadratic.hlsl + + + app_resources/hlsl/solid_angle_vis.frag.hlsl + app_resources/hlsl/ray_vis.frag.hlsl - # app_resources/hlsl/test.comp.hlsl app_resources/hlsl/benchmark/benchmark.comp.hlsl app_resources/hlsl/benchmark/common.hlsl ) @@ -55,20 +63,54 @@ if(NBL_BUILD_IMGUI) set_source_files_properties(${DEPENDS} PROPERTIES HEADER_FILE_ONLY ON) set(SM 6_8) + set(SA_VIS "app_resources/hlsl/solid_angle_vis.frag.hlsl") + set(RAY_VIS "app_resources/hlsl/ray_vis.frag.hlsl") + set(BENCH "app_resources/hlsl/benchmark/benchmark.comp.hlsl") + set(JSON [=[ [ - - { - "INPUT": "app_resources/hlsl/benchmark/benchmark.comp.hlsl", - "KEY": "benchmark", - }, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_tri_sa", "COMPILE_OPTIONS": ["-T", 
"ps_${SM}", "-O3","-DSAMPLING_MODE_CONST=0", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_tri_sa_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-O3","-DSAMPLING_MODE_CONST=0", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_tri_psa", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-O3","-DSAMPLING_MODE_CONST=1", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_tri_psa_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-O3","-DSAMPLING_MODE_CONST=1", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_para", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-O3","-DSAMPLING_MODE_CONST=2", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_para_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-O3","-DSAMPLING_MODE_CONST=2", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_rectangle", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-O3","-DSAMPLING_MODE_CONST=3", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_rectangle_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-O3","-DSAMPLING_MODE_CONST=3", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_biquad", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-O3","-DSAMPLING_MODE_CONST=4", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_biquad_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-O3","-DSAMPLING_MODE_CONST=4", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_bilinear", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-O3","-DSAMPLING_MODE_CONST=5", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_bilinear_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-O3","-DSAMPLING_MODE_CONST=5", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_proj_rectangle", 
"COMPILE_OPTIONS": ["-T", "ps_${SM}", "-O3","-DSAMPLING_MODE_CONST=6", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_proj_rectangle_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-O3","-DSAMPLING_MODE_CONST=6", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_silhouette", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-O3","-DSAMPLING_MODE_CONST=7", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_silhouette_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-O3","-DSAMPLING_MODE_CONST=7", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_pyramid", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-O3","-DSAMPLING_MODE_CONST=8", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_pyramid_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-O3","-DSAMPLING_MODE_CONST=8", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, + + {"INPUT": "${RAY_VIS}", "KEY": "ray_vis", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, + {"INPUT": "${RAY_VIS}", "KEY": "ray_vis_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, + + {"INPUT": "${BENCH}", "KEY": "benchmark_tri_sa", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_CONST=0"]}, + {"INPUT": "${BENCH}", "KEY": "benchmark_tri_psa", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_CONST=1"]}, + {"INPUT": "${BENCH}", "KEY": "benchmark_para", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_CONST=2"]}, + {"INPUT": "${BENCH}", "KEY": "benchmark_rectangle", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_CONST=3"]}, + {"INPUT": "${BENCH}", "KEY": "benchmark_biquad", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_CONST=4"]}, + {"INPUT": "${BENCH}", "KEY": "benchmark_bilinear", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_CONST=5"]}, + {"INPUT": "${BENCH}", "KEY": 
"benchmark_proj_rectangle", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_CONST=6"]}, + {"INPUT": "${BENCH}", "KEY": "benchmark_silhouette", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_CONST=7"]}, + {"INPUT": "${BENCH}", "KEY": "benchmark_pyramid_creation", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_CONST=8"]}, ] ]=]) string(CONFIGURE "${JSON}" JSON) set(COMPILE_OPTIONS -I "${CMAKE_CURRENT_SOURCE_DIR}" - -T lib_${SM} + -Zi -Qembed_debug + -fspv-debug=file + -fspv-debug=source + -fspv-debug=line + -enable-16bit-types ) NBL_CREATE_NSC_COMPILE_RULES( @@ -77,7 +119,7 @@ if(NBL_BUILD_IMGUI) DEPENDS ${DEPENDS} BINARY_DIR ${OUTPUT_DIRECTORY} MOUNT_POINT_DEFINE NBL_THIS_EXAMPLE_BUILD_MOUNT_POINT - COMMON_OPTIONS ${COMPILE_OPTIONS} + COMMON_OPTIONS ${COMPILE_OPTIONS} OUTPUT_VAR KEYS INCLUDE nbl/this_example/builtin/build/spirv/keys.hpp NAMESPACE nbl::this_example::builtin::build diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/Drawing.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/Drawing.hlsl index 4338bd958..8fe9adbb8 100644 --- a/73_SolidAngleVisualizer/app_resources/hlsl/Drawing.hlsl +++ b/73_SolidAngleVisualizer/app_resources/hlsl/Drawing.hlsl @@ -1,594 +1,424 @@ -//// Copyright (C) 2026-2026 - DevSH Graphics Programming Sp. z O.O. +//// Copyright (C) 2026 - DevSH Graphics Programming Sp. z O.O. //// This file is part of the "Nabla Engine". 
//// For conditions of distribution and use, see copyright notice in nabla.h #ifndef _SOLID_ANGLE_VIS_EXAMPLE_DRAWING_HLSL_INCLUDED_ #define _SOLID_ANGLE_VIS_EXAMPLE_DRAWING_HLSL_INCLUDED_ #include "common.hlsl" -#include "gpu_common.hlsl" +#include "silhouette.hlsl" +#include -// Check if a face on the hemisphere is visible from camera at origin -bool isFaceVisible(float32_t3 faceCenter, float32_t3 faceNormal) -{ - float32_t3 viewVec = normalize(-faceCenter); // Vector from camera to face - return dot(faceNormal, viewVec) > 0.0f; -} - -// doesn't change Z coordinate -float32_t3 sphereToCircle(float32_t3 spherePoint) -{ - if (spherePoint.z >= 0.0f) - { - return float32_t3(spherePoint.xy * CIRCLE_RADIUS, spherePoint.z); - } - else - { - float32_t r2 = (1.0f - spherePoint.z) / (1.0f + spherePoint.z); - float32_t uv2Plus1 = r2 + 1.0f; - return float32_t3((spherePoint.xy * uv2Plus1 / 2.0f) * CIRCLE_RADIUS, spherePoint.z); - } -} - -#if VISUALIZE_SAMPLES - -float32_t drawGreatCircleArc(float32_t3 fragPos, float32_t3 points[2], float32_t aaWidth, float32_t width = 0.01f) -{ - float32_t3 v0 = normalize(points[0]); - float32_t3 v1 = normalize(points[1]); - float32_t3 ndc = normalize(fragPos); - - float32_t3 arcNormal = normalize(cross(v0, v1)); - float32_t dist = abs(dot(ndc, arcNormal)); - - float32_t dotMid = dot(v0, v1); - bool onArc = (dot(ndc, v0) >= dotMid) && (dot(ndc, v1) >= dotMid); - - if (!onArc) - return 0.0f; - - float32_t avgDepth = (length(points[0]) + length(points[1])) * 0.5f; - float32_t depthScale = 3.0f / avgDepth; - - width = min(width * depthScale, 0.02f); - float32_t alpha = 1.0f - smoothstep(width - aaWidth, width + aaWidth, dist); - - return alpha; -} - -float32_t drawCross2D(float32_t2 fragPos, float32_t2 center, float32_t size, float32_t thickness) -{ - float32_t2 ndc = abs(fragPos - center); - - // Check if point is inside the cross (horizontal or vertical bar) - bool inHorizontal = (ndc.x <= size && ndc.y <= thickness); - bool inVertical = 
(ndc.y <= size && ndc.x <= thickness); - - return (inHorizontal || inVertical) ? 1.0f : 0.0f; -} - -float32_t4 drawHiddenEdges(float32_t3x4 modelMatrix, float32_t3 spherePos, uint32_t silEdgeMask, float32_t aaWidth) -{ - float32_t4 color = 0; - float32_t3 hiddenEdgeColor = float32_t3(0.1, 0.1, 0.1); - - NBL_UNROLL - for (uint32_t i = 0; i < 12; i++) - { - // skip silhouette edges - if (silEdgeMask & (1u << i)) - continue; - - uint32_t2 edge = allEdges[i]; - - float32_t3 v0 = normalize(getVertex(modelMatrix, edge.x)); - float32_t3 v1 = normalize(getVertex(modelMatrix, edge.y)); - - bool neg0 = v0.z < 0.0f; - bool neg1 = v1.z < 0.0f; - - // fully hidden - if (neg0 && neg1) - continue; - - float32_t3 p0 = v0; - float32_t3 p1 = v1; - - // clip if needed - if (neg0 ^ neg1) - { - float32_t t = v0.z / (v0.z - v1.z); - float32_t3 clip = normalize(lerp(v0, v1, t)); - - p0 = neg0 ? clip : v0; - p1 = neg1 ? clip : v1; - } - - float32_t3 pts[2] = {p0, p1}; - float32_t c = drawGreatCircleArc(spherePos, pts, aaWidth, 0.003f); - color += float32_t4(hiddenEdgeColor * c, c); - } - - return color; -} - -float32_t4 drawCorner(float32_t3 cornerNDCPos, float32_t2 ndc, float32_t aaWidth, float32_t dotSize, float32_t innerDotSize, float32_t3 dotColor) -{ - float32_t4 color = float32_t4(0, 0, 0, 0); - float32_t dist = length(ndc - cornerNDCPos.xy); - - // outer dot - float32_t outerAlpha = 1.0f - smoothstep(dotSize - aaWidth, - dotSize + aaWidth, - dist); - - if (outerAlpha <= 0.0f) - return color; +using namespace nbl::hlsl; - color += float32_t4(dotColor * outerAlpha, outerAlpha); - - // ------------------------------------------------- - // inner black dot for hidden corners - // ------------------------------------------------- - if (cornerNDCPos.z < 0.0f && innerDotSize > 0.0) - { - float32_t innerAlpha = 1.0f - smoothstep(innerDotSize - aaWidth, - innerDotSize + aaWidth, - dist); - - // ensure it stays inside the outer dot - innerAlpha *= outerAlpha; - - color -= 
float32_t4(innerAlpha.xxx, 0.0f); - } - - return color; -} - -// Draw a line segment in NDC space -float32_t lineSegment(float32_t2 ndc, float32_t2 a, float32_t2 b, float32_t thickness) -{ - float32_t2 pa = ndc - a; - float32_t2 ba = b - a; - float32_t h = saturate(dot(pa, ba) / dot(ba, ba)); - float32_t dist = length(pa - ba * h); - return smoothstep(thickness, thickness * 0.5, dist); -} - -// Draw an arrow head (triangle) in NDC space -float32_t arrowHead(float32_t2 ndc, float32_t2 tip, float32_t2 direction, float32_t size) -{ - // Create perpendicular vector - float32_t2 perp = float32_t2(-direction.y, direction.x); - - // Three points of the arrow head triangle - float32_t2 p1 = tip; - float32_t2 p2 = tip - direction * size + perp * size * 0.5; - float32_t2 p3 = tip - direction * size - perp * size * 0.5; - - // Check if point is inside triangle using barycentric coordinates - float32_t2 v0 = p3 - p1; - float32_t2 v1 = p2 - p1; - float32_t2 v2 = ndc - p1; - - float32_t dot00 = dot(v0, v0); - float32_t dot01 = dot(v0, v1); - float32_t dot02 = dot(v0, v2); - float32_t dot11 = dot(v1, v1); - float32_t dot12 = dot(v1, v2); - - float32_t invDenom = 1.0 / (dot00 * dot11 - dot01 * dot01); - float32_t u = (dot11 * dot02 - dot01 * dot12) * invDenom; - float32_t v = (dot00 * dot12 - dot01 * dot02) * invDenom; - - bool inside = (u >= 0.0) && (v >= 0.0) && (u + v <= 1.0); - - // Add some antialiasing - float32_t minDist = min(min( - length(ndc - p1), - length(ndc - p2)), - length(ndc - p3)); - - return inside ? 
1.0 : smoothstep(0.02, 0.0, minDist); -} - -// Helper to draw an edge with proper color mapping -float32_t4 drawEdge(uint32_t originalEdgeIdx, float32_t3 pts[2], float32_t3 spherePos, float32_t aaWidth, float32_t width = 0.003f) -{ - float32_t4 edgeContribution = drawGreatCircleArc(spherePos, pts, aaWidth, width); - return float32_t4(colorLUT[originalEdgeIdx] * edgeContribution.a, edgeContribution.a); -}; - -float32_t4 drawCorners(float32_t3x4 modelMatrix, float32_t2 ndc, float32_t aaWidth, float32_t dotSize) -{ - float32_t4 color = float32_t4(0, 0, 0, 0); - - float32_t innerDotSize = dotSize * 0.5f; - - for (uint32_t i = 0; i < 8; i++) - { - float32_t3 cornerCirclePos = sphereToCircle(normalize(getVertex(modelMatrix, i))); - color += drawCorner(cornerCirclePos, ndc, aaWidth, dotSize, 0.0, colorLUT[i]); - } - - return color; -} - -#ifdef _SOLID_ANGLE_VIS_EXAMPLE_SILHOUETTE_HLSL_INCLUDED_ -float32_t4 drawClippedSilhouetteVertices(float32_t2 ndc, ClippedSilhouette silhouette, float32_t aaWidth) +// ============================================================================ +// SphereDrawer: all visualization primitives for the solid angle visualizer. +// All methods are static and read VisContext for ndc/spherePos/aaWidth. 
+// ============================================================================ +struct SphereDrawer { - float32_t4 color = 0; - float32_t dotSize = 0.03f; - - for (uint i = 0; i < silhouette.count; i++) - { - float32_t3 cornerCirclePos = sphereToCircle(normalize(silhouette.vertices[i])); - float32_t dist = length(ndc - cornerCirclePos.xy); - - // Smooth circle for the vertex - float32_t alpha = 1.0f - smoothstep(dotSize * 0.8f, dotSize, dist); - - if (alpha > 0.0f) - { - // Color gradient: Red (index 0) to Cyan (last index) - // This helps verify the CCW winding order visually + // ======================================================================== + // Coordinate helpers + // ======================================================================== + + // Project sphere point to circle-space (doesn't change Z) + static float32_t3 sphereToCircle(float32_t3 spherePoint) + { + if (spherePoint.z >= 0.0f) + { + return float32_t3(spherePoint.xy * CIRCLE_RADIUS, spherePoint.z); + } + else + { + float32_t r2 = (1.0f - spherePoint.z) / (1.0f + spherePoint.z); + float32_t uv2Plus1 = r2 + 1.0f; + return float32_t3((spherePoint.xy * uv2Plus1 / 2.0f) * CIRCLE_RADIUS, spherePoint.z); + } + } + + // ======================================================================== + // Primitives + // ======================================================================== + + // Great circle arc between two points on the sphere + static float32_t drawGreatCircleArc(float32_t3 points[2], float32_t width = 0.01f) + { + float32_t3 v0 = normalize(points[0]); + float32_t3 v1 = normalize(points[1]); + float32_t3 ndc = normalize(VisContext::spherePos()); + + float32_t3 arcNormal = normalize(cross(v0, v1)); + float32_t dist = abs(dot(ndc, arcNormal)); + + float32_t dotMid = dot(v0, v1); + bool onArc = (dot(ndc, v0) >= dotMid) && (dot(ndc, v1) >= dotMid); + + if (!onArc) + return 0.0f; + + float32_t avgDepth = (length(points[0]) + length(points[1])) * 0.5f; + float32_t depthScale = 3.0f / 
avgDepth; + + width = min(width * depthScale, 0.02f); + const float32_t aaWidth = VisContext::aaWidth(); + float32_t alpha = 1.0f - smoothstep(width - aaWidth, width + aaWidth, dist); + + return alpha; + } + + // 2D cross marker + static float32_t drawCross2D(float32_t2 fragPos, float32_t2 center, float32_t size, float32_t thickness) + { + float32_t2 ndc = abs(fragPos - center); + + bool inHorizontal = (ndc.x <= size && ndc.y <= thickness); + bool inVertical = (ndc.y <= size && ndc.x <= thickness); + + return (inHorizontal || inVertical) ? 1.0f : 0.0f; + } + + // Dot (circle) with optional inner hollow for hidden corners + static float32_t4 drawDot(float32_t3 cornerNDCPos, float32_t dotSize, float32_t innerDotSize, float32_t3 dotColor) + { + float32_t4 color = float32_t4(0, 0, 0, 0); + const float32_t aaWidth = VisContext::aaWidth(); + const float32_t2 ndc = VisContext::ndc(); + const float32_t dist = length(ndc - cornerNDCPos.xy); + + float32_t outerAlpha = 1.0f - smoothstep(dotSize - aaWidth, dotSize + aaWidth, dist); + + if (outerAlpha <= 0.0f) + return color; + + color += float32_t4(dotColor * outerAlpha, outerAlpha); + + if (cornerNDCPos.z < 0.0f && innerDotSize > 0.0) + { + float32_t innerAlpha = 1.0f - smoothstep(innerDotSize - aaWidth, innerDotSize + aaWidth, dist); + innerAlpha *= outerAlpha; + color -= float32_t4(innerAlpha.xxx, 0.0f); + } + + return color; + } + + // Line segment in NDC space + static float32_t lineSegment(float32_t2 ndc, float32_t2 a, float32_t2 b, float32_t thickness) + { + float32_t2 pa = ndc - a; + float32_t2 ba = b - a; + float32_t h = saturate(dot(pa, ba) / dot(ba, ba)); + float32_t dist = length(pa - ba * h); + return smoothstep(thickness, thickness * 0.5, dist); + } + + // Draw half of a great circle (visible half of a lune boundary) + static float32_t4 drawGreatCircleHalf(float32_t3 normal, float32_t3 axis3, float32_t3 color, float32_t thickness) + { + // Point is on great circle if dot(point, normal) ~= 0 + // Only draw the 
half where dot(point, axis3) > 0 (toward silhouette) + const float32_t3 spherePos = VisContext::spherePos(); + const float32_t aaWidth = VisContext::aaWidth(); + + float32_t dist = abs(dot(spherePos, normal)); + float32_t sideFade = smoothstep(-0.1f, 0.1f, dot(spherePos, axis3)); + float32_t alpha = (1.0f - smoothstep(thickness - aaWidth, thickness + aaWidth, dist)) * sideFade; + return float32_t4(color * alpha, alpha); + } + + // Unit-circle ring + static float32_t4 drawRing(float32_t2 ndc) + { + const float32_t aaWidth = VisContext::aaWidth(); + float32_t ringWidth = 0.003f; + float32_t positionLength = length(ndc); + + float32_t ringDistance = abs(positionLength - CIRCLE_RADIUS); + float32_t ringAlpha = 1.0f - smoothstep(ringWidth - aaWidth, ringWidth + aaWidth, ringDistance); + return ringAlpha * float32_t4(1, 1, 1, 1); + } + + // ======================================================================== + // Composite drawing helpers + // ======================================================================== + + // Silhouette edge with color from LUT + static float32_t4 drawEdge(uint32_t originalEdgeIdx, float32_t3 pts[2], float32_t width = 0.003f) + { + float32_t alpha = drawGreatCircleArc(pts, width); + return float32_t4(colorLUT[originalEdgeIdx] * alpha, alpha); + } + + static float32_t4 drawCorner(float32_t3 cornerPos, float32_t dotSize, float32_t innerDotSize, float32_t3 dotColor) + { + float32_t3 cornerCirclePos = sphereToCircle(cornerPos); + return drawDot(cornerCirclePos, dotSize, innerDotSize, dotColor); + } + + // All 8 cube corners as colored dots + static float32_t4 drawCorners(float32_t3x4 modelMatrix, float32_t dotSize) + { + float32_t4 color = float32_t4(0, 0, 0, 0); + float32_t innerDotSize = dotSize * 0.5f; + + shapes::OBBView view = shapes::OBBView::create(modelMatrix); + + for (uint32_t i = 0; i < 8; i++) + { + color += drawCorner(normalize(view.getVertex(i)), dotSize, innerDotSize, colorLUT[i]); + } + + return color; + } + + // Clipped 
silhouette vertices with red-to-cyan gradient + static float32_t4 drawClippedSilhouetteVertices(ClippedSilhouette silhouette) + { + float32_t4 color = 0; + float32_t dotSize = 0.03f; + + for (uint i = 0; i < silhouette.count; i++) + { + float32_t3 cornerCirclePos = sphereToCircle(normalize(silhouette.vertices[i])); + float32_t dist = length(VisContext::ndc() - cornerCirclePos.xy); + + float32_t alpha = 1.0f - smoothstep(dotSize * 0.8f, dotSize, dist); + + if (alpha > 0.0f) + { float32_t t = float32_t(i) / float32_t(max(1u, silhouette.count - 1)); float32_t3 vertexColor = lerp(float32_t3(1, 0, 0), float32_t3(0, 1, 1), t); color += float32_t4(vertexColor * alpha, alpha); - } - } - return color; -} -#endif // _SOLID_ANGLE_VIS_EXAMPLE_SILHOUETTE_HLSL_INCLUDED_ - -float32_t4 drawRing(float32_t2 ndc, float32_t aaWidth) -{ - float32_t positionLength = length(ndc); - float32_t ringWidth = 0.003f; - float32_t ringDistance = abs(positionLength - CIRCLE_RADIUS); - float32_t ringAlpha = 1.0f - smoothstep(ringWidth - aaWidth, ringWidth + aaWidth, ringDistance); - return ringAlpha * float32_t4(1, 1, 1, 1); -} - -// Returns the number of visible faces and populates the faceIndices array -uint getVisibleFaces(int3 region, out uint faceIndices[3]) -{ - uint count = 0; - - // Check X axis - if (region.x == 0) - faceIndices[count++] = 3; // X+ - else if (region.x == 2) - faceIndices[count++] = 2; // X- - - // Check Y axis - if (region.y == 0) - faceIndices[count++] = 5; // Y+ - else if (region.y == 2) - faceIndices[count++] = 4; // Y- - - // Check Z axis - if (region.z == 0) - faceIndices[count++] = 1; // Z+ - else if (region.z == 2) - faceIndices[count++] = 0; // Z- - - return count; -} - -float32_t4 drawVisibleFaceOverlay(float32_t3x4 modelMatrix, float32_t3 spherePos, int3 region, float32_t aaWidth) -{ - uint faceIndices[3]; - uint count = getVisibleFaces(region, faceIndices); - - float32_t4 color = 0; - - for (uint i = 0; i < count; i++) - { - uint fIdx = faceIndices[i]; - 
float32_t3 n = localNormals[fIdx]; - - // Transform normal to world space (using the same logic as your corners) - float32_t3 worldNormal = -normalize(mul((float3x3)modelMatrix, n)); - worldNormal.z = -worldNormal.z; // Invert Z for correct orientation - - // Very basic visualization: highlight if the sphere position - // is generally pointing towards that face's normal - float32_t alignment = dot(spherePos, worldNormal); - if (alignment > 0.95f) - { - // Use different colors for different face indices - color += float32_t4(colorLUT[fIdx % 24], 0.5f); - } - } - return color; -} - -float32_t4 drawFaces(float32_t3x4 modelMatrix, float32_t3 spherePos, float32_t aaWidth) -{ - float32_t4 color = 0.0f; - float32_t3 ndc = normalize(spherePos); - - float3x3 rotMatrix = (float3x3)modelMatrix; - - // Check each of the 6 faces - for (uint32_t faceIdx = 0; faceIdx < 6; faceIdx++) - { - float32_t3 n_world = mul(rotMatrix, localNormals[faceIdx]); - - // Check if face is visible - if (!isFaceVisible(faceCenters[faceIdx], n_world)) - continue; - - // Get the 4 corners of this face - float32_t3 faceVerts[4]; - for (uint32_t i = 0; i < 4; i++) - { - uint32_t cornerIdx = faceToCorners[faceIdx][i]; - faceVerts[i] = normalize(getVertex(modelMatrix, cornerIdx)); - } - - // Compute face center for winding - float32_t3 faceCenter = float32_t3(0, 0, 0); - for (uint32_t i = 0; i < 4; i++) - faceCenter += faceVerts[i]; - faceCenter = normalize(faceCenter); - - // Check if point is inside this face - bool isInside = true; - float32_t minDist = 1e10; - - for (uint32_t i = 0; i < 4; i++) - { - float32_t3 v0 = faceVerts[i]; - float32_t3 v1 = faceVerts[(i + 1) % 4]; - - // Skip edges behind camera - if (v0.z < 0.0f && v1.z < 0.0f) - { - isInside = false; - break; - } - - // Great circle normal - float32_t3 edgeNormal = normalize(cross(v0, v1)); - - // Ensure normal points inward - if (dot(edgeNormal, faceCenter) < 0.0f) - edgeNormal = -edgeNormal; - - float32_t d = dot(ndc, edgeNormal); - - if (d 
< -1e-6f) + } + } + return color; + } + + // Non-silhouette cube edges (drawn as faint lines) + static float32_t4 drawHiddenEdges(float32_t3x4 modelMatrix, float32_t3 spherePos, uint32_t silEdgeMask) + { + float32_t4 color = 0; + float32_t3 hiddenEdgeColor = float32_t3(0.1, 0.1, 0.1); + + shapes::OBBView view = shapes::OBBView::create(modelMatrix); + + // Enumerate all 12 cube edges: for each of 3 axes, 4 edges parallel to that axis. + // compact (0..3) is the 2-bit corner index with the axis bit stripped out. + // Reconstruct the full corner by re-inserting the axis bit as 0. + NBL_UNROLL + for (uint32_t axis = 0; axis < 3; axis++) + { + NBL_UNROLL + for (uint32_t compact = 0; compact < 4; compact++) + { + uint32_t edgeIdx = axis * 4 + compact; + if (silEdgeMask & (1u << edgeIdx)) + continue; + + // Re-insert the axis bit (as 0) to recover the low corner index + uint32_t below = compact & ((1u << axis) - 1u); + uint32_t above = compact >> axis; + uint32_t corner = (above << (axis + 1u)) | below; + + float32_t3 v0 = normalize(view.getVertex(corner)); + float32_t3 v1 = normalize(view.getVertex(corner | (1u << axis))); + + bool neg0 = v0.z < 0.0f; + bool neg1 = v1.z < 0.0f; + + // fully behind camera + if (neg0 && neg1) + continue; + + float32_t3 p0 = v0; + float32_t3 p1 = v1; + + // clip if one vertex is behind camera + if (neg0 ^ neg1) { - isInside = false; - break; - } - - minDist = min(minDist, abs(d)); - } - - if (isInside) - { - float32_t alpha = smoothstep(0.0f, aaWidth * 2.0f, minDist); - - // Use colorLUT based on face index (0-5) - float32_t3 faceColor = colorLUT[faceIdx]; - - float32_t shading = saturate(ndc.z * 0.8f + 0.2f); - color += float32_t4(faceColor * shading * alpha, alpha); - } - } - - return color; -} - -// ============================================================================ -// Spherical geometry drawing helpers (for pyramid visualization) -// ============================================================================ - -// Draw a 
great circle where dot(p, axis) = 0 -// Used to visualize caliper planes -float32_t4 drawGreatCirclePlane( - float32_t3 axis, - float32_t3 spherePos, - float32_t aaWidth, - float32_t3 color, - float32_t width = 0.005f) -{ - float32_t3 fragDir = normalize(spherePos); - - // Only draw on front hemisphere - if (fragDir.z < 0.0f) - return float32_t4(0, 0, 0, 0); - - // Distance from the great circle plane - float32_t distFromPlane = abs(dot(fragDir, axis)); - - float32_t alpha = 1.0f - smoothstep(width - aaWidth, width + aaWidth, distFromPlane); - - return float32_t4(color * alpha, alpha); -} - -// Draw lune boundaries - two small circles at dot(p, axis) = offset ± halfWidth -// halfWidth and offset are in sin-space (not radians) -float32_t4 drawLuneBoundary(float32_t3 axis, float32_t halfWidth, float32_t offset, float32_t3 spherePos, float32_t aaWidth, float32_t3 color, float32_t lineWidth = 0.004f) -{ - float32_t3 fragDir = normalize(spherePos); - - // Only draw on front hemisphere - if (fragDir.z < 0.0f) - return float32_t4(0, 0, 0, 0); - - // The lune boundaries are where dot(p, axis) = offset ± halfWidth - float32_t dotWithAxis = dot(fragDir, axis); - - // Draw both boundaries of the lune (accounting for offset) - float32_t upperBound = offset + halfWidth; - float32_t lowerBound = offset - halfWidth; - float32_t distFromUpperBoundary = abs(dotWithAxis - upperBound); - float32_t distFromLowerBoundary = abs(dotWithAxis - lowerBound); - - float32_t alphaUpper = 1.0f - smoothstep(lineWidth - aaWidth, lineWidth + aaWidth, distFromUpperBoundary); - float32_t alphaLower = 1.0f - smoothstep(lineWidth - aaWidth, lineWidth + aaWidth, distFromLowerBoundary); - - float32_t alpha = max(alphaUpper, alphaLower); - - return float32_t4(color * alpha, alpha); -} - -// Draw axis direction markers (dots at +/- axis from center) -float32_t4 drawAxisMarkers( - float32_t3 axis, - float32_t3 center, - float32_t2 ndc, - float32_t aaWidth, - float32_t3 color, - float32_t extent = 0.25f) -{ 
- float32_t4 result = float32_t4(0, 0, 0, 0); - - // Positive axis endpoint - float32_t3 axisEndPos = normalize(center + axis * extent); - float32_t3 axisEndPosCircle = sphereToCircle(axisEndPos); - result += drawCorner(axisEndPosCircle, ndc, aaWidth, 0.025f, 0.0f, color); - - // Negative axis endpoint (smaller, dimmer) - float32_t3 axisEndNeg = normalize(center - axis * extent); - float32_t3 axisEndNegCircle = sphereToCircle(axisEndNeg); - result += drawCorner(axisEndNegCircle, ndc, aaWidth, 0.015f, 0.0f, color * 0.5f); - - return result; -} - -// ============================================================================ -// Visualization -// ============================================================================ - -// Draw half of a great circle (the visible half of a lune boundary) -float32_t4 drawGreatCircleHalf(float32_t3 normal, float32_t3 spherePos, float32_t3 axis3, float32_t aaWidth, float32_t3 color, float32_t thickness) -{ - // Point is on great circle if dot(point, normal) ≈ 0 - // Only draw the half where dot(point, axis3) > 0 (toward silhouette) - float32_t dist = abs(dot(spherePos, normal)); - float32_t sideFade = smoothstep(-0.1f, 0.1f, dot(spherePos, axis3)); - float32_t alpha = (1.0f - smoothstep(thickness - aaWidth, thickness + aaWidth, dist)) * sideFade; - return float32_t4(color * alpha, alpha); -} - -// Visualize the best caliper edge (the edge that determined axis1) -float32_t4 visualizeBestCaliperEdge(const float32_t3 vertices[MAX_SILHOUETTE_VERTICES], uint32_t bestEdgeIdx, uint32_t count, float32_t3 spherePos, float32_t aaWidth) -{ - float32_t4 result = float32_t4(0, 0, 0, 0); - - if (bestEdgeIdx >= count) - return result; - - uint32_t nextIdx = (bestEdgeIdx + 1 < count) ? 
bestEdgeIdx + 1 : 0; - float32_t3 v0 = vertices[bestEdgeIdx]; - float32_t3 v1 = vertices[nextIdx]; - - // Draw the best caliper edge with a thicker, gold line - float32_t3 pts[2] = {v0, v1}; - float32_t3 highlightColor = float32_t3(1.0f, 0.8f, 0.0f); - float32_t alpha = drawGreatCircleArc(spherePos, pts, aaWidth, 0.008f); - result += float32_t4(highlightColor * alpha, alpha); - - return result; -} - -#endif // VISUALIZE_SAMPLES - -#if DEBUG_DATA - -uint32_t getEdgeVisibility(float32_t3x4 modelMatrix, uint32_t edgeIdx) -{ - - // Adjacency of edges to faces - // Corrected Adjacency of edges to faces - static const uint32_t2 edgeToFaces[12] = { - // Edge Index: | allEdges[i] | Shared Faces: - - /* 0 (0-1) */ {4, 0}, // Y- (4) and Z- (0) - /* 1 (2-3) */ {5, 0}, // Y+ (5) and Z- (0) - /* 2 (4-5) */ {4, 1}, // Y- (4) and Z+ (1) - /* 3 (6-7) */ {5, 1}, // Y+ (5) and Z+ (1) + float32_t t = v0.z / (v0.z - v1.z); + float32_t3 clip = normalize(lerp(v0, v1, t)); - /* 4 (0-2) */ {2, 0}, // X- (2) and Z- (0) - /* 5 (1-3) */ {3, 0}, // X+ (3) and Z- (0) - /* 6 (4-6) */ {2, 1}, // X- (2) and Z+ (1) - /* 7 (5-7) */ {3, 1}, // X+ (3) and Z+ (1) - - /* 8 (0-4) */ {2, 4}, // X- (2) and Y- (4) - /* 9 (1-5) */ {3, 4}, // X+ (3) and Y- (4) - /* 10 (2-6) */ {2, 5}, // X- (2) and Y+ (5) - /* 11 (3-7) */ {3, 5} // X+ (3) and Y+ (5) - }; - - uint32_t2 faces = edgeToFaces[edgeIdx]; - - // Transform normals to world space - float3x3 rotMatrix = (float3x3)modelMatrix; - float32_t3 n_world_f1 = mul(rotMatrix, localNormals[faces.x]); - float32_t3 n_world_f2 = mul(rotMatrix, localNormals[faces.y]); - - bool visible1 = isFaceVisible(faceCenters[faces.x], n_world_f1); - bool visible2 = isFaceVisible(faceCenters[faces.y], n_world_f2); - - // Silhouette: exactly one face visible - if (visible1 != visible2) - return 1; - - // Inner edge: both faces visible - if (visible1 && visible2) - return 2; - - // Hidden edge: both faces hidden - return 0; -} - -uint32_t computeGroundTruthEdgeMask(float32_t3x4 
modelMatrix) -{ - uint32_t mask = 0u; - NBL_UNROLL - for (uint32_t j = 0; j < 12; j++) - { - // getEdgeVisibility returns 1 for a silhouette edge based on 3D geometry - if (getEdgeVisibility(modelMatrix, j) == 1) - { - mask |= (1u << j); - } - } - return mask; -} - -void validateEdgeVisibility(float32_t3x4 modelMatrix, uint32_t sil, uint32_t vertexCount, uint32_t generatedSilMask) -{ - uint32_t mismatchAccumulator = 0; - - // The Ground Truth now represents the full 3D silhouette, clipped or not. - uint32_t groundTruthMask = computeGroundTruthEdgeMask(modelMatrix); - - // The comparison checks if the generated mask perfectly matches the full 3D ground truth. - uint32_t mismatchMask = groundTruthMask ^ generatedSilMask; - - if (mismatchMask != 0) - { - NBL_UNROLL - for (uint32_t j = 0; j < 12; j++) - { - if ((mismatchMask >> j) & 1u) - { - uint32_t2 edge = allEdges[j]; - // Accumulate vertex indices where error occurred - mismatchAccumulator |= (1u << edge.x) | (1u << edge.y); + p0 = neg0 ? clip : v0; + p1 = neg1 ? 
clip : v1; } - } - } - // Simple Write (assuming all fragments calculate the same result) - InterlockedOr(DebugDataBuffer[0].edgeVisibilityMismatch, mismatchAccumulator); -} -#endif // DEBUG_DATA + float32_t3 pts[2] = {p0, p1}; + float32_t c = drawGreatCircleArc(pts, 0.003f); + color += float32_t4(hiddenEdgeColor * c, c); + } + } + + return color; + } + + // Best caliper edge highlighted in gold + static float32_t4 visualizeBestCaliperEdge(NBL_CONST_REF_ARG(ClippedSilhouette) silhouette, uint32_t bestEdgeIdx) + { + float32_t4 result = float32_t4(0, 0, 0, 0); + + if (bestEdgeIdx >= silhouette.count) + return result; + + float32_t3 v0 = silhouette.vertices[bestEdgeIdx]; + float32_t3 v1 = silhouette.vertices[(bestEdgeIdx + 1) % silhouette.count]; + + float32_t3 pts[2] = {v0, v1}; + float32_t3 highlightColor = float32_t3(1.0f, 0.8f, 0.0f); + float32_t alpha = drawGreatCircleArc(pts, 0.008f); + result += float32_t4(highlightColor * alpha, alpha); + + return result; + } + + // ======================================================================== + // Sample visualization (sphere dot + parameter-space square overlay) + // ======================================================================== + + static float32_t4 visualizeSample(float32_t3 sampleDir, float32_t2 xi, uint32_t colorIndex, float32_t2 screenUV) + { + float32_t4 accumColor = 0; + float32_t3 sampleColor = colorLUT[colorIndex].rgb; + + // 3D dot on the sphere + float32_t dist3D = distance(sampleDir, normalize(VisContext::spherePos())); + float32_t alpha3D = 1.0f - smoothstep(0.0f, 0.02f, dist3D); + if (alpha3D > 0.0f) + accumColor += float32_t4(sampleColor * alpha3D, alpha3D); + + // Parameter-space square (PSS) overlay + static const float32_t2 pssSize = float32_t2(0.2, 0.2); + static const float32_t2 pssPos = float32_t2(0.01, 0.01); + bool isInsidePSS = all(and(screenUV >= pssPos, screenUV <= (pssPos + pssSize))); + + if (isInsidePSS) + { + // Cross marker at the sample's xi position + float32_t2 
xiPixelPos = pssPos + xi * pssSize; + float32_t alpha2D = drawCross2D(screenUV, xiPixelPos, 0.005f, 0.001f); + if (alpha2D > 0.0f) + accumColor += float32_t4(sampleColor * alpha2D, alpha2D); + + // Faint border outline + float32_t2 edgeDist = min(screenUV - pssPos, (pssPos + pssSize) - screenUV); + float32_t borderDist = min(edgeDist.x, edgeDist.y); + float32_t borderAlpha = 1.0f - smoothstep(0.001f, 0.003f, borderDist); + if (borderAlpha > 0.0f) + accumColor += float32_t4(0.3f, 0.3f, 0.3f, 1.0f) * borderAlpha; + } + + return accumColor; + } + + // ======================================================================== + // 3D ray arrow visualization + // ======================================================================== + + // Project 3D point to NDC space + static float32_t2 projectToNDC(float32_t3 worldPos, float32_t4x4 viewProj, float32_t aspect) + { + float32_t4 clipPos = mul(viewProj, float32_t4(worldPos, 1.0)); + clipPos /= clipPos.w; + clipPos.x *= aspect; + return clipPos.xy; + } + + struct ArrowResult + { + float32_t4 color; + float32_t depth; + }; + + // Visualize a ray as an arrow from origin in NDC space. + // Returns color (rgb), intensity (a), and depth. 
+ static ArrowResult visualizeRayAsArrow(float32_t3 rayOrigin, float32_t4 directionAndPdf, float32_t arrowLength, + float32_t2 ndcPos, float32_t aspect, float32_t4x4 viewProjMatrix) + { + ArrowResult result; + result.color = float32_t4(0, 0, 0, 0); + result.depth = 0.0; // Far plane in reversed-Z + + float32_t3 rayDir = normalize(directionAndPdf.xyz); + float32_t pdf = directionAndPdf.w; + + // Define the 3D line segment + float32_t3 worldStart = rayOrigin; + float32_t3 worldEnd = rayOrigin + rayDir * arrowLength; + + float32_t4 clipStart = mul(viewProjMatrix, float32_t4(worldStart, 1.0)); + float32_t4 clipEnd = mul(viewProjMatrix, float32_t4(worldEnd, 1.0)); + + // Clip against near plane (w = 0 plane in clip space) + // If both points are behind camera, reject + if (clipStart.w <= 0.001 && clipEnd.w <= 0.001) + return result; + + // If line crosses the near plane, clip it + float32_t t0 = 0.0; + float32_t t1 = 1.0; + + if (clipStart.w <= 0.001) + { + float32_t t = (0.001 - clipStart.w) / (clipEnd.w - clipStart.w); + t0 = saturate(t); + clipStart = lerp(clipStart, clipEnd, t0); + worldStart = lerp(worldStart, worldEnd, t0); + } + + if (clipEnd.w <= 0.001) + { + float32_t t = (0.001 - clipStart.w) / (clipEnd.w - clipStart.w); + t1 = saturate(t); + clipEnd = lerp(clipStart, clipEnd, t1); + worldEnd = lerp(worldStart, worldEnd, t1); + } + + // Now check if the clipped segment is valid + if (t0 >= t1) + return result; + + // Perspective divide to NDC + float32_t2 ndcStart = clipStart.xy / clipStart.w; + float32_t2 ndcEnd = clipEnd.xy / clipEnd.w; + + // Apply aspect ratio correction + ndcStart.x *= aspect; + ndcEnd.x *= aspect; + + // Calculate arrow direction in NDC + float32_t2 arrowVec = ndcEnd - ndcStart; + float32_t arrowNDCLength = length(arrowVec); + + // Skip if arrow is too small on screen + if (arrowNDCLength < 0.005) + return result; + + // Calculate perpendicular distance to line segment in NDC space + float32_t2 toPixel = ndcPos - ndcStart; + float32_t 
t_ndc = saturate(dot(toPixel, arrowVec) / dot(arrowVec, arrowVec)); + + // Draw line shaft + float32_t lineThickness = 0.002; + float32_t lineIntensity = lineSegment(ndcPos, ndcStart, ndcEnd, lineThickness); + + // Calculate perspective-correct depth + if (lineIntensity > 0.0) + { + float32_t4 clipPos = lerp(clipStart, clipEnd, t_ndc); + float32_t depthNDC = clipPos.z / clipPos.w; + result.depth = 1.0f - depthNDC; + + if (result.depth < 0.0 || result.depth > 1.0) + lineIntensity = 0.0; + } + + // Modulate by PDF + float32_t pdfIntensity = saturate(pdf * 0.5); + float32_t3 finalColor = float32_t3(pdfIntensity, pdfIntensity, pdfIntensity); + + result.color = float32_t4(finalColor, lineIntensity); + return result; + } +}; #endif // _SOLID_ANGLE_VIS_EXAMPLE_DRAWING_HLSL_INCLUDED_ diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/benchmark/benchmark.comp.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/benchmark/benchmark.comp.hlsl index 3b49d17ca..d21dfaf73 100644 --- a/73_SolidAngleVisualizer/app_resources/hlsl/benchmark/benchmark.comp.hlsl +++ b/73_SolidAngleVisualizer/app_resources/hlsl/benchmark/benchmark.comp.hlsl @@ -1,4 +1,4 @@ -//// Copyright (C) 2026-2026 - DevSH Graphics Programming Sp. z O.O. +//// Copyright (C) 2026 - DevSH Graphics Programming Sp. z O.O. //// This file is part of the "Nabla Engine". 
//// For conditions of distribution and use, see copyright notice in nabla.h #pragma shader_stage(compute) @@ -17,112 +17,224 @@ using namespace nbl::hlsl; static const SAMPLING_MODE benchmarkMode = (SAMPLING_MODE)SAMPLING_MODE_CONST; +float32_t2 stratifiedXi(uint32_t sampleIdx, uint32_t threadIdx) +{ + return float32_t2( + (float32_t(sampleIdx & 7u) + 0.5f) / 8.0f + float32_t(threadIdx) * 1e-9f, + (float32_t((sampleIdx >> 3u) & 7u) + 0.5f) / 8.0f + float32_t(threadIdx) * 1e-9f); // wrap the row index so xi.y stays inside [0,1) once sampleIdx >= 64 +} + +struct PyramidSetup +{ + SphericalPyramid pyramid; + SilEdgeNormals silEdgeNormals; + + static PyramidSetup create(ClippedSilhouette silhouette) + { + PyramidSetup s; + s.pyramid = SphericalPyramid::create(silhouette, s.silEdgeNormals); + s.silEdgeNormals.transformToLocal(s.pyramid.axis1, s.pyramid.axis2, s.pyramid.getAxis3()); + return s; + } +}; + +// Per-thread input perturbation: scatters threads across the 27 OBB regions and +// generates a fresh silhouette per outer-loop iteration so creation work can't +// be hoisted out by the compiler. 
+ClippedSilhouette makePerturbedSilhouette(float32_t3 baseOffset, NBL_REF_ARG(random::PCG32) rng, float32_t rcpU32) +{ + const float32_t3 cJ = float32_t3( + (float32_t(rng()) * rcpU32 - 0.5f) * 8.0f, + (float32_t(rng()) * rcpU32 - 0.5f) * 8.0f, + (float32_t(rng()) * rcpU32 - 0.5f) * 8.0f); + float32_t3x4 cM = pc.modelMatrix; + cM[0][3] += baseOffset.x + cJ.x; + cM[1][3] += baseOffset.y + cJ.y; + cM[2][3] += baseOffset.z + cJ.z; + shapes::OBBView cV = shapes::OBBView::create(cM); + return ClippedSilhouette::create(cV); +} + [numthreads(BENCHMARK_WORKGROUP_DIMENSION_SIZE_X, 1, 1)] - [shader("compute")] void - main(uint32_t3 invocationID : SV_DispatchThreadID) +void main() { - // Perturb model matrix slightly per sample group - float32_t3x4 perturbedMatrix = pc.modelMatrix; - perturbedMatrix[0][3] += float32_t(invocationID.x) * 1e-6f; - - uint32_t3 region; - uint32_t configIndex; - uint32_t vertexCount; - uint32_t sil = ClippedSilhouette::computeRegionAndConfig(perturbedMatrix, region, configIndex, vertexCount); - - ClippedSilhouette silhouette = (ClippedSilhouette)0; - silhouette.compute(perturbedMatrix, vertexCount, sil); - - float32_t pdf; - uint32_t triIdx; - uint32_t validSampleCount = 0; - float32_t3 sampleDir = float32_t3(0.0, 0.0, 0.0); - - bool sampleValid; - if (benchmarkMode == SAMPLING_MODE::TRIANGLE_SOLID_ANGLE || - benchmarkMode == SAMPLING_MODE::TRIANGLE_PROJECTED_SOLID_ANGLE) - { - TriangleFanSampler samplingData; - samplingData = TriangleFanSampler::create(silhouette, benchmarkMode); - - for (uint32_t i = 0; i < pc.sampleCount; i++) - { - float32_t2 xi = float32_t2( - (float32_t(i & 7u) + 0.5f) / 8.0f, - (float32_t(i >> 3u) + 0.5f) / 8.0f); - - sampleDir += samplingData.sample(silhouette, xi, pdf, triIdx); - validSampleCount++; - } - } - else if (benchmarkMode == SAMPLING_MODE::PROJECTED_PARALLELOGRAM_SOLID_ANGLE) - { - // Precompute parallelogram for sampling - silhouette.normalize(); - SilEdgeNormals silEdgeNormals; - Parallelogram parallelogram = 
Parallelogram::create(silhouette, silEdgeNormals); - for (uint32_t i = 0; i < pc.sampleCount; i++) - { - float32_t2 xi = float32_t2( - (float32_t(i & 7u) + 0.5f) / 8.0f, - (float32_t(i >> 3u) + 0.5f) / 8.0f); - - sampleDir += parallelogram.sample(silEdgeNormals, xi, pdf, sampleValid); - validSampleCount += sampleValid ? 1u : 0u; - } - } - else if (benchmarkMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_RECTANGLE) - { - // Precompute spherical pyramid and Urena sampler once (edge normals fused) - SilEdgeNormals silEdgeNormals; - SphericalPyramid pyramid = SphericalPyramid::create(silhouette, silEdgeNormals); - UrenaSampler urena = UrenaSampler::create(pyramid); - - for (uint32_t i = 0; i < pc.sampleCount; i++) - { - float32_t2 xi = float32_t2( - (float32_t(i & 7u) + 0.5f) / 8.0f, - (float32_t(i >> 3u) + 0.5f) / 8.0f); - - sampleDir += urena.sample(pyramid, silEdgeNormals, xi, pdf, sampleValid); - validSampleCount += sampleValid ? 1u : 0u; - } - } - else if (benchmarkMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_BIQUADRATIC) - { - // Precompute spherical pyramid and biquadratic sampler once (edge normals fused) - SilEdgeNormals silEdgeNormals; - SphericalPyramid pyramid = SphericalPyramid::create(silhouette, silEdgeNormals); - BiquadraticSampler biquad = BiquadraticSampler::create(pyramid); - - for (uint32_t i = 0; i < pc.sampleCount; i++) - { - float32_t2 xi = float32_t2( - (float32_t(i & 7u) + 0.5f) / 8.0f, - (float32_t(i >> 3u) + 0.5f) / 8.0f); - - sampleDir += biquad.sample(pyramid, silEdgeNormals, xi, pdf, sampleValid); - validSampleCount += sampleValid ? 
1u : 0u; - } - } - else if (benchmarkMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_BILINEAR) - { - // Precompute spherical pyramid and bilinear sampler once (edge normals fused) - SilEdgeNormals silEdgeNormals; - SphericalPyramid pyramid = SphericalPyramid::create(silhouette, silEdgeNormals); - BilinearSampler bilin = BilinearSampler::create(pyramid); - - for (uint32_t i = 0; i < pc.sampleCount; i++) - { - float32_t2 xi = float32_t2( - (float32_t(i & 7u) + 0.5f) / 8.0f, - (float32_t(i >> 3u) + 0.5f) / 8.0f); - - sampleDir += bilin.sample(pyramid, silEdgeNormals, xi, pdf, sampleValid); - validSampleCount += sampleValid ? 1u : 0u; - } - } - - const uint32_t offset = sizeof(uint32_t) * invocationID.x; - outputBuffer.Store(offset, pdf + validSampleCount + triIdx + asuint(sampleDir.x) + asuint(sampleDir.y) + asuint(sampleDir.z)); + const uint32_t invocationID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; + + // Scatter the OBB translation per invocation so threads span all 27 regions + random::PCG32 rng = random::PCG32::construct(invocationID.x + 0x9e3779b9u); + const float32_t rcpU32 = 1.0f / 4294967296.0f; + const float32_t3 rndOffset = float32_t3( + (float32_t(rng()) * rcpU32 - 0.5f) * 8.0f, + (float32_t(rng()) * rcpU32 - 0.5f) * 8.0f, + (float32_t(rng()) * rcpU32 - 0.5f) * 8.0f); + + // XOR sink: every output XORs into this to prevent DCE. + uint32_t sink = 0; + + bool sampleValid; + + // Sampling modes use a nested loop: outer iterates over `creations`, inner over + // `samplesPerCreation`. Total samples per thread = sampleCount. 
+ const uint32_t creations = pc.sampleCount / max(pc.samplesPerCreation, 1u); // guard: integer division by zero is undefined on the GPU + + if (benchmarkMode == SAMPLING_MODE::TRIANGLE_SOLID_ANGLE || + benchmarkMode == SAMPLING_MODE::TRIANGLE_PROJECTED_SOLID_ANGLE) + { + for (uint32_t c = 0; c < creations; c++) + { + ClippedSilhouette silhouette = makePerturbedSilhouette(rndOffset, rng, rcpU32); + TriangleFanSampler samplingData = TriangleFanSampler::create(silhouette, benchmarkMode); + + for (uint32_t s = 0; s < pc.samplesPerCreation; s++) + { + float32_t2 xi = stratifiedXi(c * pc.samplesPerCreation + s, invocationID); + float32_t pdf; + uint32_t triIdx; + float32_t3 dir = samplingData.sample(silhouette, xi, pdf, triIdx); + sink ^= asuint(dir.x) ^ asuint(dir.y) ^ asuint(dir.z) ^ asuint(pdf) ^ triIdx; + } + } + } + else if (benchmarkMode == SAMPLING_MODE::PROJECTED_PARALLELOGRAM_SOLID_ANGLE) + { + for (uint32_t c = 0; c < creations; c++) + { + ClippedSilhouette silhouette = makePerturbedSilhouette(rndOffset, rng, rcpU32); + silhouette.normalize(); + SilEdgeNormals silEdgeNormals; + Parallelogram parallelogram = Parallelogram::create(silhouette, silEdgeNormals); + + for (uint32_t s = 0; s < pc.samplesPerCreation; s++) + { + float32_t2 xi = stratifiedXi(c * pc.samplesPerCreation + s, invocationID); + float32_t pdf; + float32_t3 dir = parallelogram.sample(silEdgeNormals, xi, pdf, sampleValid); + sink ^= asuint(dir.x) ^ asuint(dir.y) ^ asuint(dir.z) ^ asuint(pdf) ^ (uint32_t)sampleValid; + } + } + } + else if (benchmarkMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_RECTANGLE) + { + for (uint32_t c = 0; c < creations; c++) + { + ClippedSilhouette silhouette = makePerturbedSilhouette(rndOffset, rng, rcpU32); + PyramidSetup ps = PyramidSetup::create(silhouette); + sampling::SphericalRectangle rectSampler = sampling::SphericalRectangle::create(float32_t3x3(ps.pyramid.axis1, ps.pyramid.axis2, ps.pyramid.getAxis3()), float32_t3(ps.pyramid.rectR0, 1.0f), ps.pyramid.rectExtents); + + for (uint32_t s = 0; s < pc.samplesPerCreation; s++) 
+ { + float32_t2 xi = stratifiedXi(c * pc.samplesPerCreation + s, invocationID); + sampling::SphericalRectangle::cache_type cache; + float32_t hitDist; + float32_t3 localDir = rectSampler.generateNormalizedLocal(xi, cache, hitDist); + float32_t3 dir = localDir.x * ps.pyramid.axis1 + localDir.y * ps.pyramid.axis2 + localDir.z * ps.pyramid.getAxis3(); + float32_t localX = localDir.x / localDir.z; + float32_t localY = localDir.y / localDir.z; + sampleValid = dir.z > 0.0f && ps.silEdgeNormals.isInsideLocal(localX, localY); + float32_t pdf = rectSampler.forwardPdf(xi, cache); + sink ^= asuint(dir.x) ^ asuint(dir.y) ^ asuint(dir.z) ^ asuint(pdf) ^ (uint32_t)sampleValid; + } + } + } + else if (benchmarkMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_PROJECTED_SOLID_ANGLE_RECTANGLE) + { + for (uint32_t c = 0; c < creations; c++) + { + ClippedSilhouette silhouette = makePerturbedSilhouette(rndOffset, rng, rcpU32); + PyramidSetup ps = PyramidSetup::create(silhouette); + + const float32_t3 axis3 = ps.pyramid.getAxis3(); + shapes::CompressedSphericalRectangle compressed; + compressed.origin = ps.pyramid.axis1 * ps.pyramid.rectR0.x + ps.pyramid.axis2 * ps.pyramid.rectR0.y + axis3; + compressed.right = ps.pyramid.axis1 * ps.pyramid.rectExtents.x; + compressed.up = ps.pyramid.axis2 * ps.pyramid.rectExtents.y; + sampling::ProjectedSphericalRectangle projRectSampler = sampling::ProjectedSphericalRectangle::create(compressed, float32_t3(0.0f, 0.0f, 0.0f), float32_t3(0.0f, 0.0f, 1.0f), false); + + for (uint32_t s = 0; s < pc.samplesPerCreation; s++) + { + float32_t2 xi = stratifiedXi(c * pc.samplesPerCreation + s, invocationID); + sampling::ProjectedSphericalRectangle::cache_type cache; + float32_t hitDist; + float32_t3 localDir = projRectSampler.generateNormalizedLocal(xi, cache, hitDist); + float32_t3 dir = localDir.x * ps.pyramid.axis1 + localDir.y * ps.pyramid.axis2 + localDir.z * ps.pyramid.getAxis3(); + float32_t localX = localDir.x / localDir.z; + float32_t localY = localDir.y / 
localDir.z; + sampleValid = dir.z > 0.0f && ps.silEdgeNormals.isInsideLocal(localX, localY); + float32_t pdf = projRectSampler.forwardPdf(xi, cache); + sink ^= asuint(dir.x) ^ asuint(dir.y) ^ asuint(dir.z) ^ asuint(pdf) ^ (uint32_t)sampleValid; + } + } + } + else if (benchmarkMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_BIQUADRATIC) + { + for (uint32_t c = 0; c < creations; c++) + { + ClippedSilhouette silhouette = makePerturbedSilhouette(rndOffset, rng, rcpU32); + PyramidSetup ps = PyramidSetup::create(silhouette); + BiquadraticSampler biquad = BiquadraticSampler::create(ps.pyramid); + + for (uint32_t s = 0; s < pc.samplesPerCreation; s++) + { + float32_t2 xi = stratifiedXi(c * pc.samplesPerCreation + s, invocationID); + float32_t pdf; + float32_t3 dir = biquad.sample(ps.pyramid, ps.silEdgeNormals, xi, pdf, sampleValid); + sink ^= asuint(dir.x) ^ asuint(dir.y) ^ asuint(dir.z) ^ asuint(pdf) ^ (uint32_t)sampleValid; + } + } + } + else if (benchmarkMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_BILINEAR) + { + for (uint32_t c = 0; c < creations; c++) + { + ClippedSilhouette silhouette = makePerturbedSilhouette(rndOffset, rng, rcpU32); + PyramidSetup ps = PyramidSetup::create(silhouette); + BilinearSampler bilin = BilinearSampler::create(ps.pyramid); + + for (uint32_t s = 0; s < pc.samplesPerCreation; s++) + { + float32_t2 xi = stratifiedXi(c * pc.samplesPerCreation + s, invocationID); + float32_t pdf; + float32_t3 dir = bilin.sample(ps.pyramid, ps.silEdgeNormals, xi, pdf, sampleValid); + sink ^= asuint(dir.x) ^ asuint(dir.y) ^ asuint(dir.z) ^ asuint(pdf) ^ (uint32_t)sampleValid; + } + } + } + else if (benchmarkMode == SAMPLING_MODE::SILHOUETTE_CREATION_ONLY) + { + for (uint32_t i = 0; i < pc.sampleCount; i++) + { + ClippedSilhouette iterSilhouette = makePerturbedSilhouette(rndOffset, rng, rcpU32); + + sink ^= iterSilhouette.count; + NBL_UNROLL + for (uint32_t j = 0; j < MAX_SILHOUETTE_VERTICES; j++) + sink ^= asuint(iterSilhouette.vertices[j].x) ^ 
asuint(iterSilhouette.vertices[j].y) ^ asuint(iterSilhouette.vertices[j].z); + } + } + else if (benchmarkMode == SAMPLING_MODE::PYRAMID_CREATION_ONLY) + { + for (uint32_t i = 0; i < pc.sampleCount; i++) + { + ClippedSilhouette synthSil = (ClippedSilhouette)0; + synthSil.count = 5; + + NBL_UNROLL + for (uint32_t v = 0; v < 5; v++) + { + float32_t x = (float32_t(rng()) * rcpU32 - 0.5f) * 1.2f; + float32_t y = (float32_t(rng()) * rcpU32 - 0.5f) * 1.2f; + synthSil.vertices[v] = normalize(float32_t3(x, y, 1.0f)); + } + + SilEdgeNormals silEdgeNormals; + SphericalPyramid pyramid = SphericalPyramid::create(synthSil, silEdgeNormals); + + uint32_t pyramidBits = asuint(pyramid.axis1.x) ^ asuint(pyramid.axis2.x) ^ asuint(pyramid.rectR0.x) ^ asuint(pyramid.rectR0.y) ^ asuint(pyramid.rectExtents.x) ^ asuint(pyramid.rectExtents.y); + uint32_t edgeBits = asuint(float32_t(silEdgeNormals.edgeNormals[0].x)) ^ asuint(float32_t(silEdgeNormals.edgeNormals[1].x)); + sink ^= pyramidBits ^ edgeBits; + } + } + + const uint32_t offset = sizeof(uint32_t) * invocationID.x; + outputBuffer.Store(offset, sink); } diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/benchmark/common.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/benchmark/common.hlsl index 3091bc793..c3fa6db7c 100644 --- a/73_SolidAngleVisualizer/app_resources/hlsl/benchmark/common.hlsl +++ b/73_SolidAngleVisualizer/app_resources/hlsl/benchmark/common.hlsl @@ -7,5 +7,4 @@ NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR uint32_t BENCHMARK_WORKGROUP_DIMENSION_SIZE_X = 64u; NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR uint32_t BENCHMARK_WORKGROUP_DIMENSION_SIZE_Y = 1u; NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR uint32_t BENCHMARK_WORKGROUP_DIMENSION_SIZE_Z = 1u; -NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR uint32_t BENCHMARK_WORKGROUP_COUNT = 1000000u; - +NBL_CONSTEXPR_INLINE_NSPC_SCOPE_VAR uint32_t BENCHMARK_WORKGROUP_COUNT = 4096u; diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/common.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/common.hlsl index 
d63ec3c6a..f55f27067 100644 --- a/73_SolidAngleVisualizer/app_resources/hlsl/common.hlsl +++ b/73_SolidAngleVisualizer/app_resources/hlsl/common.hlsl @@ -1,4 +1,4 @@ -//// Copyright (C) 2026-2026 - DevSH Graphics Programming Sp. z O.O. +//// Copyright (C) 2026 - DevSH Graphics Programming Sp. z O.O. //// This file is part of the "Nabla Engine". //// For conditions of distribution and use, see copyright notice in nabla.h #ifndef _SOLID_ANGLE_VIS_EXAMPLE_COMMON_HLSL_INCLUDED_ @@ -6,131 +6,150 @@ #include "nbl/builtin/hlsl/cpp_compat.hlsl" -#define FAST 1 +#define MAX_SILHOUETTE_VERTICES 7 namespace nbl { - namespace hlsl - { - // Sampling mode enum - enum SAMPLING_MODE : uint32_t - { - TRIANGLE_SOLID_ANGLE, - TRIANGLE_PROJECTED_SOLID_ANGLE, - PROJECTED_PARALLELOGRAM_SOLID_ANGLE, - SYMMETRIC_PYRAMID_SOLID_ANGLE_RECTANGLE, - SYMMETRIC_PYRAMID_SOLID_ANGLE_BIQUADRATIC, - SYMMETRIC_PYRAMID_SOLID_ANGLE_BILINEAR, - Count - }; - - struct ResultData - { - // Silhouette - uint32_t3 region; - uint32_t silhouetteIndex; - uint32_t silhouetteVertexCount; - uint32_t silhouette; - uint32_t positiveVertCount; - uint32_t edgeVisibilityMismatch; - uint32_t clipMask; - uint32_t clipCount; - uint32_t rotatedSil; - uint32_t wrapAround; - uint32_t rotatedClipMask; - uint32_t rotateAmount; - uint32_t vertices[6]; - uint32_t clippedSilhouetteVertexCount; - float32_t3 clippedSilhouetteVertices[7]; - uint32_t clippedSilhouetteVerticesIndices[7]; - - // Parallelogram - uint32_t parallelogramDoesNotBound; - float32_t parallelogramArea; - uint32_t failedVertexIndex; - uint32_t edgeIsConvex[4]; - uint32_t parallelogramVerticesInside; - uint32_t parallelogramEdgesInside; - float32_t2 parallelogramCorners[4]; - - // spherical triangle - uint32_t maxTrianglesExceeded; - uint32_t sphericalLuneDetected; - uint32_t triangleCount; - float32_t solidAngles[5]; - float32_t totalSolidAngles; - - // Sampling ray visualization data - uint32_t sampleCount; - float32_t4 rayData[512]; // xyz = direction, w = PDF 
- - // Pyramid sampling debug data - float32_t3 pyramidAxis1; // First caliper axis direction - float32_t3 pyramidAxis2; // Second caliper axis direction - float32_t3 pyramidCenter; // Silhouette center direction - float32_t pyramidHalfWidth1; // Half-width along axis1 (sin-space) - float32_t pyramidHalfWidth2; // Half-width along axis2 (sin-space) - float32_t pyramidOffset1; // Center offset along axis1 - float32_t pyramidOffset2; // Center offset along axis2 - float32_t pyramidSolidAngle; // Bounding region solid angle - uint32_t pyramidBestEdge; // Which edge produced best caliper - uint32_t pyramidSpansHemisphere; // Warning: silhouette >= hemisphere - float32_t pyramidMin1; // Min dot product along axis1 - float32_t pyramidMax1; // Max dot product along axis1 - float32_t pyramidMin2; // Min dot product along axis2 - float32_t pyramidMax2; // Max dot product along axis2 - uint32_t axis2BiggerThanAxis1; - - // Sampling stats - uint32_t validSampleCount; - uint32_t threadCount; // Used as a hack for fragment shader, as dividend for validSampleCount - }; - -#ifdef __HLSL_VERSION - [[vk::binding(0, 0)]] RWStructuredBuffer DebugDataBuffer; -#endif - - struct PushConstants - { - float32_t3x4 modelMatrix; - float32_t4 viewport; - uint32_t sampleCount; - uint32_t frameIndex; - }; - - struct PushConstantRayVis - { - float32_t4x4 viewProjMatrix; - float32_t3x4 viewMatrix; - float32_t3x4 modelMatrix; - float32_t3x4 invModelMatrix; - float32_t4 viewport; - uint32_t frameIndex; - }; - - struct BenchmarkPushConstants - { - float32_t3x4 modelMatrix; - uint32_t sampleCount; - }; - - static const float32_t3 colorLUT[27] = { - float32_t3(0, 0, 0), float32_t3(0.5, 0.5, 0.5), - float32_t3(1, 0, 0), float32_t3(0, 1, 0), float32_t3(0, 0, 1), - float32_t3(1, 1, 0), float32_t3(1, 0, 1), float32_t3(0, 1, 1), - float32_t3(1, 0.5, 0), float32_t3(1, 0.65, 0), float32_t3(0.8, 0.4, 0), - float32_t3(1, 0.4, 0.7), float32_t3(1, 0.75, 0.8), float32_t3(0.7, 0.1, 0.3), - float32_t3(0.5, 0, 0.5), 
float32_t3(0.6, 0.4, 0.8), float32_t3(0.3, 0, 0.5), - float32_t3(0, 0.5, 0), float32_t3(0.5, 1, 0), float32_t3(0, 0.5, 0.25), - float32_t3(0, 0, 0.5), float32_t3(0.3, 0.7, 1), float32_t3(0, 0.4, 0.6), - float32_t3(0.6, 0.4, 0.2), float32_t3(0.8, 0.7, 0.3), float32_t3(0.4, 0.3, 0.1), float32_t3(1, 1, 1)}; +namespace hlsl +{ + +// Sampling mode enum +enum SAMPLING_MODE : uint32_t +{ + TRIANGLE_SOLID_ANGLE, + TRIANGLE_PROJECTED_SOLID_ANGLE, + PROJECTED_PARALLELOGRAM_SOLID_ANGLE, + SYMMETRIC_PYRAMID_SOLID_ANGLE_RECTANGLE, + SYMMETRIC_PYRAMID_SOLID_ANGLE_BIQUADRATIC, + SYMMETRIC_PYRAMID_SOLID_ANGLE_BILINEAR, + SYMMETRIC_PYRAMID_PROJECTED_SOLID_ANGLE_RECTANGLE, + SILHOUETTE_CREATION_ONLY, + PYRAMID_CREATION_ONLY, + Count +}; + +struct ResultData +{ + struct SilhouetteData + { + uint32_t3 region; + uint32_t silhouetteIndex; + uint32_t silhouetteVertexCount; + uint32_t silhouette; + uint32_t vertices[6]; + + // Clipping + uint32_t clipMask; + uint32_t clipCount; + uint32_t rotatedClipMask; + uint32_t rotateAmount; + uint32_t positiveVertCount; + uint32_t wrapAround; + uint32_t rotatedSil; + uint32_t edgeVisibilityMismatch; + + // Clipped output (layout matches ClippedSilhouette: vertices[7] then count) + float32_t3 clippedVertices[MAX_SILHOUETTE_VERTICES]; + uint32_t clippedVertexCount; + uint32_t clippedVertexIndices[MAX_SILHOUETTE_VERTICES]; + } silhouette; + + struct TriangleFanData + { + uint32_t maxTrianglesExceeded; + uint32_t sphericalLuneDetected; + uint32_t triangleCount; + float32_t solidAngles[5]; + float32_t totalSolidAngles; + } triangleFan; + + struct ParallelogramData + { + float32_t2 corners[4]; + uint32_t edgeIsConvex[4]; + uint32_t n3Mask; + uint32_t doesNotBound; + uint32_t failedVertexIndex; + uint32_t verticesInside; + uint32_t edgesInside; + float32_t area; + } parallelogram; + + struct PyramidData + { + float32_t3 axis1; // First caliper axis direction + float32_t3 axis2; // Second caliper axis direction + float32_t3 center; // Silhouette center 
direction + float32_t halfWidth1; // Half-width along axis1 (sin-space) + float32_t halfWidth2; // Half-width along axis2 (sin-space) + float32_t offset1; // Center offset along axis1 + float32_t offset2; // Center offset along axis2 + float32_t solidAngle; // Bounding region solid angle + uint32_t bestEdge; // Which edge produced best caliper + float32_t min1; // Min dot product along axis1 + float32_t max1; // Max dot product along axis1 + float32_t min2; // Min dot product along axis2 + float32_t max2; // Max dot product along axis2 + uint32_t axis2BiggerThanAxis1; + } pyramid; + + struct SamplingData + { + uint32_t sampleCount; + uint32_t validSampleCount; + uint32_t threadCount; // Per-fragment counter, used as divisor for validSampleCount + float32_t4 rayData[512]; // xyz = direction, w = PDF + } sampling; +}; + +struct PushConstants +{ + float32_t3x4 modelMatrix; + float32_t4 viewport; + uint32_t sampleCount; + uint32_t frameIndex; +}; + +struct PushConstantRayVis +{ + float32_t4x4 viewProjMatrix; + float32_t3x4 viewMatrix; + float32_t3x4 modelMatrix; + float32_t3x4 invModelMatrix; + float32_t4 viewport; + uint32_t frameIndex; +}; + +struct BenchmarkPushConstants +{ + float32_t3x4 modelMatrix; + uint32_t sampleCount; // total samples per thread (= creations * samplesPerCreation) + uint32_t samplesPerCreation; // inner-loop count; outer-loop count = sampleCount / samplesPerCreation +}; + +static const float32_t3 colorLUT[27] = { + float32_t3(0, 0, 0), float32_t3(0.5, 0.5, 0.5), + float32_t3(1, 0, 0), float32_t3(0, 1, 0), float32_t3(0, 0, 1), + float32_t3(1, 1, 0), float32_t3(1, 0, 1), float32_t3(0, 1, 1), + float32_t3(1, 0.5, 0), float32_t3(1, 0.65, 0), float32_t3(0.8, 0.4, 0), + float32_t3(1, 0.4, 0.7), float32_t3(1, 0.75, 0.8), float32_t3(0.7, 0.1, 0.3), + float32_t3(0.5, 0, 0.5), float32_t3(0.6, 0.4, 0.8), float32_t3(0.3, 0, 0.5), + float32_t3(0, 0.5, 0), float32_t3(0.5, 1, 0), float32_t3(0, 0.5, 0.25), + float32_t3(0, 0, 0.5), float32_t3(0.3, 0.7, 1), 
float32_t3(0, 0.4, 0.6), + float32_t3(0.6, 0.4, 0.2), float32_t3(0.8, 0.7, 0.3), float32_t3(0.4, 0.3, 0.1), float32_t3(1, 1, 1)}; #ifndef __HLSL_VERSION - static const char *colorNames[27] = {"Black", "Gray", "Red", "Green", "Blue", "Yellow", "Magenta", "Cyan", - "Orange", "Light Orange", "Dark Orange", "Pink", "Light Pink", "Deep Rose", "Purple", "Light Purple", - "Indigo", "Dark Green", "Lime", "Forest Green", "Navy", "Sky Blue", "Teal", "Brown", - "Tan/Beige", "Dark Brown", "White"}; +static const char* colorNames[27] = {"Black", "Gray", "Red", "Green", "Blue", "Yellow", "Magenta", "Cyan", + "Orange", "Light Orange", "Dark Orange", "Pink", "Light Pink", "Deep Rose", "Purple", "Light Purple", + "Indigo", "Dark Green", "Lime", "Forest Green", "Navy", "Sky Blue", "Teal", "Brown", + "Tan/Beige", "Dark Brown", "White"}; #endif // __HLSL_VERSION - } -} + +} // namespace hlsl + +} // namespace nbl + +static const nbl::hlsl::float32_t CIRCLE_RADIUS = 0.5f; +static const nbl::hlsl::float32_t INV_CIRCLE_RADIUS = 1.0f / CIRCLE_RADIUS; + #endif // _SOLID_ANGLE_VIS_EXAMPLE_COMMON_HLSL_INCLUDED_ diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/debug_vis.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/debug_vis.hlsl new file mode 100644 index 000000000..916390323 --- /dev/null +++ b/73_SolidAngleVisualizer/app_resources/hlsl/debug_vis.hlsl @@ -0,0 +1,150 @@ +//// Copyright (C) 2026 - DevSH Graphics Programming Sp. z O.O. +//// This file is part of the "Nabla Engine". 
+//// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _SOLID_ANGLE_VIS_EXAMPLE_DEBUG_VIS_HLSL_INCLUDED_ +#define _SOLID_ANGLE_VIS_EXAMPLE_DEBUG_VIS_HLSL_INCLUDED_ + +#include "common.hlsl" + +#ifdef __HLSL_VERSION +[[vk::binding(0, 0)]] RWStructuredBuffer DebugDataBuffer; +#endif + +struct DebugRecorder +{ +#if DEBUG_DATA + static void recordClippedVertex(uint32_t slot, float32_t3 pos, uint32_t originalIndex) + { + DebugDataBuffer[0].silhouette.clippedVertices[slot] = pos; + DebugDataBuffer[0].silhouette.clippedVertexIndices[slot] = originalIndex; + } + + static void recordClipResult(uint32_t vertexCount, uint32_t clipMask, uint32_t clipCount, + uint32_t rotatedClipMask, uint32_t rotateAmount, uint32_t positiveCount, + bool wrapAround, uint32_t rotatedSil) + { + DebugDataBuffer[0].silhouette.clippedVertexCount = vertexCount; + DebugDataBuffer[0].silhouette.clipMask = clipMask; + DebugDataBuffer[0].silhouette.clipCount = clipCount; + DebugDataBuffer[0].silhouette.rotatedClipMask = rotatedClipMask; + DebugDataBuffer[0].silhouette.rotateAmount = rotateAmount; + DebugDataBuffer[0].silhouette.positiveVertCount = positiveCount; + DebugDataBuffer[0].silhouette.wrapAround = (uint32_t)wrapAround; + DebugDataBuffer[0].silhouette.rotatedSil = rotatedSil; + } + + static void recordTriangleFan(bool luneDetected, uint32_t count, float32_t totalWeight, float32_t solidAngles[5]) + { + DebugDataBuffer[0].triangleFan.sphericalLuneDetected = (uint32_t)luneDetected; + DebugDataBuffer[0].triangleFan.maxTrianglesExceeded = (count > 5); + DebugDataBuffer[0].triangleFan.triangleCount = count; + DebugDataBuffer[0].triangleFan.totalSolidAngles = totalWeight; + for (uint32_t tri = 0; tri < min(count, 5u); tri++) // clamp: both solidAngles arrays hold only 5 entries, overflow is reported via maxTrianglesExceeded + DebugDataBuffer[0].triangleFan.solidAngles[tri] = solidAngles[tri]; + } + + static void recordParallelogram(float32_t area, uint32_t convexMask, uint32_t n3Mask, + float32_t2 corner, float32_t2 axisDir, float32_t width, float32_t height) + { + 
DebugDataBuffer[0].parallelogram.area = area; + + // Store per-edge convex and N3 flags + DebugDataBuffer[0].parallelogram.n3Mask = n3Mask; + for (uint32_t i = 0; i < 4; i++) + DebugDataBuffer[0].parallelogram.edgeIsConvex[i] = (convexMask >> i) & 1u; + + // Compute and store the 4 parallelogram corners in circle-space + float32_t2 perpDir = float32_t2(-axisDir.y, axisDir.x); + DebugDataBuffer[0].parallelogram.corners[0] = corner; + DebugDataBuffer[0].parallelogram.corners[1] = corner + width * axisDir; + DebugDataBuffer[0].parallelogram.corners[2] = corner + width * axisDir + height * perpDir; + DebugDataBuffer[0].parallelogram.corners[3] = corner + height * perpDir; + } + + static void recordPyramid(float32_t3 axis1, float32_t3 axis2, float32_t3 center, + float32_t4 bounds, float32_t solidAngle, uint32_t bestEdge) + { + DebugDataBuffer[0].pyramid.axis1 = axis1; + DebugDataBuffer[0].pyramid.axis2 = axis2; + DebugDataBuffer[0].pyramid.center = normalize(center); + DebugDataBuffer[0].pyramid.halfWidth1 = (atan(bounds.z) - atan(bounds.x)) * 0.5f; + DebugDataBuffer[0].pyramid.halfWidth2 = (atan(bounds.w) - atan(bounds.y)) * 0.5f; + DebugDataBuffer[0].pyramid.solidAngle = solidAngle; + DebugDataBuffer[0].pyramid.bestEdge = bestEdge; + DebugDataBuffer[0].pyramid.min1 = bounds.x; + DebugDataBuffer[0].pyramid.max1 = bounds.z; + DebugDataBuffer[0].pyramid.min2 = bounds.y; + DebugDataBuffer[0].pyramid.max2 = bounds.w; + } + + static void recordSampleCount(uint32_t count) { DebugDataBuffer[0].sampling.sampleCount = count; } + static void recordRay(uint32_t i, float32_t3 dir, float32_t pdf) { DebugDataBuffer[0].sampling.rayData[i] = float32_t4(dir, pdf); } + + static void recordFrameEnd(uint32_t3 region, uint32_t configIndex, uint32_t silSize, + uint32_t silData, uint32_t vertexIndices[6], uint32_t validSampleCount) + { + InterlockedAdd(DebugDataBuffer[0].sampling.validSampleCount, validSampleCount); + InterlockedAdd(DebugDataBuffer[0].sampling.threadCount, 1u); + 
DebugDataBuffer[0].silhouette.region = region; + DebugDataBuffer[0].silhouette.silhouetteIndex = configIndex; + DebugDataBuffer[0].silhouette.silhouetteVertexCount = silSize; + for (uint32_t i = 0; i < 6; i++) + DebugDataBuffer[0].silhouette.vertices[i] = vertexIndices[i]; + DebugDataBuffer[0].silhouette.silhouette = silData; + } +#else + static void recordClippedVertex(uint32_t slot, float32_t3 pos, uint32_t originalIndex) {} + static void recordClipResult(uint32_t vertexCount, uint32_t clipMask, uint32_t clipCount, + uint32_t rotatedClipMask, uint32_t rotateAmount, uint32_t positiveCount, + bool wrapAround, uint32_t rotatedSil) {} + static void recordTriangleFan(bool luneDetected, uint32_t count, float32_t totalWeight, float32_t solidAngles[5]) {} + static void recordParallelogram(float32_t area, uint32_t convexMask, uint32_t n3Mask, + float32_t2 corner, float32_t2 axisDir, float32_t width, float32_t height) {} + static void recordPyramid(float32_t3 axis1, float32_t3 axis2, float32_t3 center, + float32_t4 bounds, float32_t solidAngle, uint32_t bestEdge) {} + static void recordSampleCount(uint32_t count) {} + static void recordRay(uint32_t i, float32_t3 dir, float32_t pdf) {} + static void recordFrameEnd(uint32_t3 region, uint32_t configIndex, uint32_t silSize, + uint32_t silData, uint32_t vertexIndices[6], uint32_t validSampleCount) {} +#endif +}; + +// Module-scope visualization state (per-thread in fragment shaders) +#if VISUALIZE_SAMPLES +static float32_t2 g_visNdc; +static float32_t3 g_visSpherePos; +static float32_t g_visAaWidth; +static float32_t4 g_visColor; +#endif + +struct VisContext +{ +#if VISUALIZE_SAMPLES + static void begin(float32_t2 ndc, float32_t3 spherePos, float32_t _aaWidth) + { + g_visNdc = ndc; + g_visSpherePos = spherePos; + g_visAaWidth = _aaWidth; + g_visColor = float32_t4(0, 0, 0, 0); + } + + static void add(float32_t4 c) { g_visColor += c; } + static float32_t4 flush() { return g_visColor; } + + static float32_t2 ndc() { return 
g_visNdc; } + static float32_t3 spherePos() { return g_visSpherePos; } + static float32_t aaWidth() { return g_visAaWidth; } + static bool enabled() { return true; } +#else + static void begin(nbl::hlsl::float32_t2 ndc, nbl::hlsl::float32_t3 spherePos, nbl::hlsl::float32_t aaWidth) {} + static void add(nbl::hlsl::float32_t4 c) {} + static nbl::hlsl::float32_t4 flush() { return nbl::hlsl::float32_t4(0, 0, 0, 0); } + + static nbl::hlsl::float32_t2 ndc() { return nbl::hlsl::float32_t2(0, 0); } + static nbl::hlsl::float32_t3 spherePos() { return nbl::hlsl::float32_t3(0, 0, 0); } + static nbl::hlsl::float32_t aaWidth() { return 0; } + static bool enabled() { return false; } +#endif +}; + +#endif // _SOLID_ANGLE_VIS_EXAMPLE_DEBUG_VIS_HLSL_INCLUDED_ diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/gpu_common.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/gpu_common.hlsl deleted file mode 100644 index 142471493..000000000 --- a/73_SolidAngleVisualizer/app_resources/hlsl/gpu_common.hlsl +++ /dev/null @@ -1,175 +0,0 @@ -//// Copyright (C) 2026-2026 - DevSH Graphics Programming Sp. z O.O. -//// This file is part of the "Nabla Engine". 
-//// For conditions of distribution and use, see copyright notice in nabla.h -#ifndef _SOLID_ANGLE_VIS_EXAMPLE_GPU_COMMON_HLSL_INCLUDED_ -#define _SOLID_ANGLE_VIS_EXAMPLE_GPU_COMMON_HLSL_INCLUDED_ - -#include "utils.hlsl" - -static const float32_t CIRCLE_RADIUS = 0.5f; -static const float32_t INV_CIRCLE_RADIUS = 1.0f / CIRCLE_RADIUS; - -// --- Geometry Utils --- -#define MAX_SILHOUETTE_VERTICES 7 - -// Special index values for clip points -static const uint32_t CLIP_POINT_A = 23; // Clip point between last positive and first negative -static const uint32_t CLIP_POINT_B = 24; // Clip point between last negative and first positive - -static const float32_t3 constCorners[8] = { - float32_t3(-0.5f, -0.5f, -0.5f), float32_t3(0.5f, -0.5f, -0.5f), float32_t3(-0.5f, 0.5f, -0.5f), float32_t3(0.5f, 0.5f, -0.5f), - float32_t3(-0.5f, -0.5f, 0.5f), float32_t3(0.5f, -0.5f, 0.5f), float32_t3(-0.5f, 0.5f, 0.5f), float32_t3(0.5f, 0.5f, 0.5f)}; - -static const uint32_t2 allEdges[12] = { - {0, 1}, - {2, 3}, - {4, 5}, - {6, 7}, // X axis - {0, 2}, - {1, 3}, - {4, 6}, - {5, 7}, // Y axis - {0, 4}, - {1, 5}, - {2, 6}, - {3, 7}, // Z axis -}; - -// Maps face index (0-5) to its 4 corner indices in CCW order -static const uint32_t faceToCorners[6][4] = { - {0, 2, 3, 1}, // Face 0: Z- - {4, 5, 7, 6}, // Face 1: Z+ - {0, 4, 6, 2}, // Face 2: X- - {1, 3, 7, 5}, // Face 3: X+ - {0, 1, 5, 4}, // Face 4: Y- - {2, 6, 7, 3} // Face 5: Y+ -}; - -static float32_t3 corners[8]; -static float32_t3 faceCenters[6] = { - float32_t3(0, 0, 0), float32_t3(0, 0, 0), float32_t3(0, 0, 0), - float32_t3(0, 0, 0), float32_t3(0, 0, 0), float32_t3(0, 0, 0)}; - -static const float32_t3 localNormals[6] = { - float32_t3(0, 0, -1), // Face 0 (Z-) - float32_t3(0, 0, 1), // Face 1 (Z+) - float32_t3(-1, 0, 0), // Face 2 (X-) - float32_t3(1, 0, 0), // Face 3 (X+) - float32_t3(0, -1, 0), // Face 4 (Y-) - float32_t3(0, 1, 0) // Face 5 (Y+) -}; - -// TODO: unused, remove later -// Vertices are ordered CCW relative to the 
camera view. -static const uint32_t silhouettes[27][7] = { - {6, 1, 3, 2, 6, 4, 5}, // 0: Black - {6, 2, 6, 4, 5, 7, 3}, // 1: White - {6, 0, 4, 5, 7, 3, 2}, // 2: Gray - {6, 1, 3, 7, 6, 4, 5}, // 3: Red - {4, 4, 5, 7, 6, 0, 0}, // 4: Green - {6, 0, 4, 5, 7, 6, 2}, // 5: Blue - {6, 0, 1, 3, 7, 6, 4}, // 6: Yellow - {6, 0, 1, 5, 7, 6, 4}, // 7: Magenta - {6, 0, 1, 5, 7, 6, 2}, // 8: Cyan - {6, 1, 3, 2, 6, 7, 5}, // 9: Orange - {4, 2, 6, 7, 3, 0, 0}, // 10: Light Orange - {6, 0, 4, 6, 7, 3, 2}, // 11: Dark Orange - {4, 1, 3, 7, 5, 0, 0}, // 12: Pink - {4, 0, 4, 6, 7, 3, 2}, // 13: Light Pink - {4, 0, 4, 6, 2, 0, 0}, // 14: Deep Rose - {6, 0, 1, 3, 7, 5, 4}, // 15: Purple - {4, 0, 1, 5, 4, 0, 0}, // 16: Light Purple - {6, 0, 1, 5, 4, 6, 2}, // 17: Indigo - {6, 0, 2, 6, 7, 5, 1}, // 18: Dark Green - {6, 0, 2, 6, 7, 3, 1}, // 19: Lime - {6, 0, 4, 6, 7, 3, 1}, // 20: Forest Green - {6, 0, 2, 3, 7, 5, 1}, // 21: Navy - {4, 0, 2, 3, 1, 0, 0}, // 22: Sky Blue - {6, 0, 4, 6, 2, 3, 1}, // 23: Teal - {6, 0, 2, 3, 7, 5, 4}, // 24: Brown - {6, 0, 2, 3, 1, 5, 4}, // 25: Tan/Beige - {6, 1, 5, 4, 6, 2, 3} // 26: Dark Brown -}; - -// Binary packed silhouettes -static const uint32_t binSilhouettes[27] = { - 0b11000000000000101100110010011001, - 0b11000000000000011111101100110010, - 0b11000000000000010011111101100000, - 0b11000000000000101100110111011001, - 0b10000000000000000000110111101100, - 0b11000000000000010110111101100000, - 0b11000000000000100110111011001000, - 0b11000000000000100110111101001000, - 0b11000000000000010110111101001000, - 0b11000000000000101111110010011001, - 0b10000000000000000000011111110010, - 0b11000000000000010011111110100000, - 0b10000000000000000000101111011001, - 0b11000000000000010011111110100000, - 0b10000000000000000000010110100000, - 0b11000000000000100101111011001000, - 0b10000000000000000000100101001000, - 0b11000000000000010110100101001000, - 0b11000000000000001101111110010000, - 0b11000000000000001011111110010000, - 
0b11000000000000001011111110100000, - 0b11000000000000001101111011010000, - 0b10000000000000000000001011010000, - 0b11000000000000001011010110100000, - 0b11000000000000100101111011010000, - 0b11000000000000100101001011010000, - 0b11000000000000011010110100101001, -}; - -uint32_t getSilhouetteVertex(uint32_t packedSil, uint32_t index) -{ - return (packedSil >> (3u * index)) & 0x7u; -} - -// Get silhouette size -uint32_t getSilhouetteSize(uint32_t sil) -{ - return (sil >> 29u) & 0x7u; -} - -// Check if vertex has negative z -bool getVertexZNeg(float32_t3x4 modelMatrix, uint32_t vertexIdx) -{ -#if FAST - float32_t3 localPos = float32_t3( - (vertexIdx & 1) ? 0.5f : -0.5f, - (vertexIdx & 2) ? 0.5f : -0.5f, - (vertexIdx & 4) ? 0.5f : -0.5f); - - float32_t transformedZ = nbl::hlsl::dot(modelMatrix[2].xyz, localPos) + modelMatrix[2].w; - return transformedZ < 0.0f; -#else - return corners[vertexIdx].z < 0.0f; -#endif -} - -// Get world position of cube vertex -float32_t3 getVertex(float32_t3x4 modelMatrix, uint32_t vertexIdx) -{ -#if FAST - // Reconstruct local cube corner from index bits - float32_t sx = (vertexIdx & 1) ? 0.5f : -0.5f; - float32_t sy = (vertexIdx & 2) ? 0.5f : -0.5f; - float32_t sz = (vertexIdx & 4) ? 
0.5f : -0.5f; - - float32_t4x3 model = transpose(modelMatrix); - - // Transform to world - // Full position, not just Z like getVertexZNeg - return model[0].xyz * sx + - model[1].xyz * sy + - model[2].xyz * sz + - model[3].xyz; - // return mul(modelMatrix, float32_t4(sx, sy, sz, 1.0f)); -#else - return corners[vertexIdx]; -#endif -} - -#endif // _SOLID_ANGLE_VIS_EXAMPLE_GPU_COMMON_HLSL_INCLUDED_ diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/parallelogram_sampling.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/parallelogram_sampling.hlsl index cd02171af..7c99a3363 100644 --- a/73_SolidAngleVisualizer/app_resources/hlsl/parallelogram_sampling.hlsl +++ b/73_SolidAngleVisualizer/app_resources/hlsl/parallelogram_sampling.hlsl @@ -1,4 +1,4 @@ -//// Copyright (C) 2026-2026 - DevSH Graphics Programming Sp. z O.O. +//// Copyright (C) 2026 - DevSH Graphics Programming Sp. z O.O. //// This file is part of the "Nabla Engine". //// For conditions of distribution and use, see copyright notice in nabla.h #ifndef _SOLID_ANGLE_VIS_EXAMPLE_PARALLELOGRAM_SAMPLING_HLSL_INCLUDED_ @@ -28,7 +28,7 @@ struct Parallelogram static float32_t3 circleToSphere(float32_t2 circlePoint) { - float32_t2 xy = circlePoint / CIRCLE_RADIUS; + float32_t2 xy = circlePoint * INV_CIRCLE_RADIUS; float32_t xy_len_sq = dot(xy, xy); return float32_t3(xy, sqrt(1.0f - xy_len_sq)); } @@ -305,25 +305,18 @@ struct Parallelogram computeBoundsForAxis(minAlong, maxAlong, minPerp, maxPerp, silhouette, convexMask, n3Mask, dir, perpDir); Parallelogram result; - result.width = float16_t(maxAlong - minAlong); - result.height = float16_t(maxPerp - minPerp); + result.width = (float16_t)(maxAlong - minAlong); + result.height = (float16_t)(maxPerp - minPerp); result.axisDir = float16_t2(dir); - result.corner = float16_t2(minAlong * dir + minPerp * float16_t2(-dir.y, dir.x)); + result.corner = float16_t2(minAlong * dir + minPerp * perpDir); return result; } // Silhouette vertices must be normalized before calling 
create() - static Parallelogram create(const ClippedSilhouette silhouette, out SilEdgeNormals precompSil -#if VISUALIZE_SAMPLES - , - float32_t2 ndc, float32_t3 spherePos, float32_t aaWidth, - inout float32_t4 color -#endif - ) + static Parallelogram create(const ClippedSilhouette silhouette, out SilEdgeNormals precompSil) { precompSil = (SilEdgeNormals)0; - precompSil.count = silhouette.count; uint32_t convexMask = 0; uint32_t n3Mask = 0; @@ -355,7 +348,6 @@ struct Parallelogram Parallelogram best = buildForAxis(silhouette, convexMask, n3Mask, bestDir); -#if VISUALIZE_SAMPLES for (uint32_t i = 0; i < silhouette.count; i++) { if (convexMask & (1u << i)) @@ -376,22 +368,19 @@ struct Parallelogram computeApexClamped(p0, midPoint, t0, tangentAtMid, apex0); computeApexClamped(midPoint, p1, tangentAtMid, endTangent, apex1); - color += drawCorner(float32_t3(apex0, 0.0f), ndc, aaWidth, 0.03, 0.0f, float32_t3(1, 0, 1)); - color += drawCorner(float32_t3(midPoint, 0.0f), ndc, aaWidth, 0.02, 0.0f, float32_t3(0, 1, 0)); - color += drawCorner(float32_t3(apex1, 0.0f), ndc, aaWidth, 0.03, 0.0f, float32_t3(1, 0.5, 0)); + VisContext::add(SphereDrawer::drawDot(float32_t3(apex0, 0.0f), 0.03, 0.0f, float32_t3(1, 0, 1))); + VisContext::add(SphereDrawer::drawDot(float32_t3(midPoint, 0.0f), 0.02, 0.0f, float32_t3(0, 1, 0))); + VisContext::add(SphereDrawer::drawDot(float32_t3(apex1, 0.0f), 0.03, 0.0f, float32_t3(1, 0.5, 0))); } else { float32_t2 apex; computeApexClamped(p0, p1, t0, endTangent, apex); - color += drawCorner(float32_t3(apex, 0.0f), ndc, aaWidth, 0.03, 0.0f, float32_t3(1, 0, 1)); + VisContext::add(SphereDrawer::drawDot(float32_t3(apex, 0.0f), 0.03, 0.0f, float32_t3(1, 0, 1))); } } } -#endif -#if DEBUG_DATA - DebugDataBuffer[0].parallelogramArea = best.width * best.height; -#endif + DebugRecorder::recordParallelogram(float32_t(best.width) * float32_t(best.height), convexMask, n3Mask, float32_t2(best.corner), float32_t2(best.axisDir), float32_t(best.width), 
float32_t(best.height)); return best; } @@ -401,8 +390,8 @@ struct Parallelogram float16_t2 perpDir = float16_t2(-axisDir.y, axisDir.x); float16_t2 circleXY = corner + - float16_t(xi.x) * width * axisDir + - float16_t(xi.y) * height * perpDir; + (float16_t)(xi.x) * width * axisDir + + (float16_t)(xi.y) * height * perpDir; float32_t3 direction = circleToSphere(circleXY); @@ -415,4 +404,7 @@ struct Parallelogram } }; +#undef MAX_CURVE_APEXES +#undef GET_PROJ_VERT + #endif // _SOLID_ANGLE_VIS_EXAMPLE_PARALLELOGRAM_SAMPLING_HLSL_INCLUDED_ diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling.hlsl index fab111b3e..afd60914c 100644 --- a/73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling.hlsl +++ b/73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling.hlsl @@ -1,567 +1,300 @@ -//// Copyright (C) 2026-2026 - DevSH Graphics Programming Sp. z O.O. +//// Copyright (C) 2026 - DevSH Graphics Programming Sp. z O.O. //// This file is part of the "Nabla Engine". //// For conditions of distribution and use, see copyright notice in nabla.h #ifndef _SOLID_ANGLE_VIS_EXAMPLE_PYRAMID_SAMPLING_HLSL_INCLUDED_ #define _SOLID_ANGLE_VIS_EXAMPLE_PYRAMID_SAMPLING_HLSL_INCLUDED_ -#include "gpu_common.hlsl" +#include "common.hlsl" #include #include #include #include +#include #include "silhouette.hlsl" #include "drawing.hlsl" // ============================================================================ -// Spherical Rectangle Bound via Rotating Calipers +// Spherical Pyramid: gnomonic bounding rectangle for silhouette sampling. // -// Bounds the silhouette with a spherical rectangle (intersection of two -// orthogonal lunes). Each lune is defined by two great circles (planes -// through the origin). The rectangle is parameterized for downstream -// samplers (Urena, bilinear, biquadratic) in pyramid_sampling/*.hlsl. +// Algorithm (SphericalPyramid::create): +// 1. 
Adaptive axis3: blend silhouette centroid toward (0,0,1) to keep +// all vertices in the positive gnomonic half-space. Branchless. +// 2. Rotating calipers: try each edge projected perpendicular to axis3, +// keep the axis1/axis2 rotation with minimum gnomonic bounding area. +// Edge normals are fused into this pass (cross products from the same +// vertex loads). +// 3. Sign-stabilize axis1 against a world-space reference. // -// Algorithm: -// 1. Rotating Calipers: Find the edge that minimizes the lune-width proxy -// dot(cross(A, B), C) = sin(edge_len) * sin(angular_dist) -// No per-edge normalization needed, scalar triple product suffices. -// -// 2. Build orthonormal frame from the minimum-width edge: -// - axis1 = normalize(cross(A, B)), pole of the primary lune -// - axis2, axis3 complete the frame via edge-based candidate search -// (tryPrimaryFrameCandidate), oriented toward silhouette center -// -// 3. Project vertices onto the frame as (x/z, y/z) -// to find the bounding rectangle extents (rectR0, rectExtents) -// -// 4. Fallback: if the primary frame leaves vertices near the z=0 plane, -// fix axis3 = camera forward (0,0,1) and search axis1/axis2 via -// tryFallbackFrameCandidate -// -// Key property: If all vertices are inside a great circle half-space, -// then all edges (geodesic arcs) are also inside. No edge extremum -// checking needed (unlike parallelogram_sampling which works in -// projected 2D space where arcs can bulge beyond vertices). +// axis3 is not stored, reconstructed as cross(axis1, axis2). +// rectR0 is float2 (z is always 1.0 in gnomonic space). // ============================================================================ -// Spherical rectangle bound: stores the orthonormal frame and gnomonic -// projection extents. Consumed by UrenaSampler, BilinearSampler, BiquadraticSampler. 
struct SphericalPyramid { - // Orthonormal frame for the bounding region - float32_t3 axis1; // Primary axis (from minimum-width edge's great circle normal) - float32_t3 axis2; // Secondary axis (perpendicular to axis1) - float32_t3 axis3; // Forward axis, toward silhouette (primary) or camera forward (fallback) - - // SphericalRectangle parameters (in the local frame where axis3 is Z) - float32_t3 rectR0; // Corner position in local frame - float32_t2 rectExtents; // Width (along axis1) and height (along axis2) - float32_t solidAngle; // Solid angle of the bounding region (steradians) - - // ======================================================================== - // Rotating Calipers - Minimum Width Edge Finding (Scalar Triple Product) - // ======================================================================== - - // Simplified metric: dot(cross(A, B), C) = sin(edge_len) * sin(angular_dist) - // This is a lune-area proxy, no per-edge normalization needed for comparison. - // Per-vertex cost: one dot product with precomputed edge normal. - // Per-edge cost: one cross product (replaces addition + rsqrt). - // - // Triangular column-major traversal (rotating calipers pattern): - // Vertex V_j checks against edges 0..j-2. - // V2 -> edge 0; V3 -> edges 0,1; V4 -> edges 0,1,2; etc. - // Total checks: (N-2)(N-1)/2 instead of N(N-2). - // - // Endpoints: dot(cross(A,B), A) = dot(cross(A,B), B) = 0, never affect max. 
- static void findMinimumWidthEdge(const ClippedSilhouette silhouette, out uint32_t bestEdge, out float32_t3 bestV0, out float32_t3 bestV1, out float32_t bestWidth, out SilEdgeNormals precompSil) - { - precompSil = (SilEdgeNormals)0; - precompSil.count = silhouette.count; - - // Edge normals: cross(v[i], v[i+1]), inward-facing for CCW-from-origin winding - float32_t3 en0 = cross(silhouette.vertices[0], silhouette.vertices[1]); - precompSil.edgeNormals[0] = float16_t3(en0); - float32_t3 en1 = cross(silhouette.vertices[1], silhouette.vertices[2]); - precompSil.edgeNormals[1] = float16_t3(en1); - - // Per-edge max(dot(en_i, v_j)), positive = inside, maximum = widest vertex - float32_t maxDot0 = dot(silhouette.vertices[2], en0); // V2 vs edge 0 - - float32_t maxDot1 = 1e10f; - float32_t maxDot2 = 1e10f; - float32_t maxDot3 = 1e10f; - float32_t maxDot4 = 1e10f; - - if (silhouette.count > 3) - { - float32_t3 en2 = cross(silhouette.vertices[2], silhouette.vertices[3]); - precompSil.edgeNormals[2] = float16_t3(en2); - - // V3 vs edges 0, 1 - float32_t3 v3 = silhouette.vertices[3]; - maxDot0 = max(maxDot0, dot(v3, en0)); - maxDot1 = dot(v3, en1); - - if (silhouette.count > 4) - { - float32_t3 en3 = cross(silhouette.vertices[3], silhouette.vertices[4]); - precompSil.edgeNormals[3] = float16_t3(en3); - - // V4 vs edges 0, 1, 2 - float32_t3 v4 = silhouette.vertices[4]; - maxDot0 = max(maxDot0, dot(v4, en0)); - maxDot1 = max(maxDot1, dot(v4, en1)); - maxDot2 = dot(v4, en2); - - if (silhouette.count > 5) - { - float32_t3 en4 = cross(silhouette.vertices[4], silhouette.vertices[5]); - precompSil.edgeNormals[4] = float16_t3(en4); - - // V5 vs edges 0, 1, 2, 3 - float32_t3 v5 = silhouette.vertices[5]; - maxDot0 = max(maxDot0, dot(v5, en0)); - maxDot1 = max(maxDot1, dot(v5, en1)); - maxDot2 = max(maxDot2, dot(v5, en2)); - maxDot3 = dot(v5, en3); - - if (silhouette.count > 6) - { - // V6 vs edges 0, 1, 2, 3, 4 - float32_t3 v6 = silhouette.vertices[6]; - maxDot0 = max(maxDot0, dot(v6, 
en0)); - maxDot1 = max(maxDot1, dot(v6, en1)); - maxDot2 = max(maxDot2, dot(v6, en2)); - maxDot3 = max(maxDot3, dot(v6, en3)); - maxDot4 = dot(v6, en4); - } - } - } - } - - // Best edge: minimum maxDot, no per-edge normalization needed. - // Relative epsilon prevents tie-breaking flicker when two edges have - // nearly identical widths — the current winner is "sticky" unless a - // new edge is meaningfully better (0.1% narrower). - const float32_t EDGE_SELECT_EPS = 1e-3f; - - bestWidth = maxDot0; - bestEdge = 0; - bestV0 = silhouette.vertices[0]; - bestV1 = silhouette.vertices[1]; - - if (silhouette.count > 3) - { - bool better = maxDot1 < bestWidth * (1.0f - EDGE_SELECT_EPS); - bestWidth = better ? maxDot1 : bestWidth; - bestEdge = better ? 1 : bestEdge; - bestV0 = better ? silhouette.vertices[1] : bestV0; - bestV1 = better ? silhouette.vertices[2] : bestV1; - - if (silhouette.count > 4) - { - better = maxDot2 < bestWidth * (1.0f - EDGE_SELECT_EPS); - bestWidth = better ? maxDot2 : bestWidth; - bestEdge = better ? 2 : bestEdge; - bestV0 = better ? silhouette.vertices[2] : bestV0; - bestV1 = better ? silhouette.vertices[3] : bestV1; - - if (silhouette.count > 5) - { - better = maxDot3 < bestWidth * (1.0f - EDGE_SELECT_EPS); - bestWidth = better ? maxDot3 : bestWidth; - bestEdge = better ? 3 : bestEdge; - bestV0 = better ? silhouette.vertices[3] : bestV0; - bestV1 = better ? silhouette.vertices[4] : bestV1; - - if (silhouette.count > 6) - { - better = maxDot4 < bestWidth * (1.0f - EDGE_SELECT_EPS); - bestWidth = better ? maxDot4 : bestWidth; - bestEdge = better ? 4 : bestEdge; - bestV0 = better ? silhouette.vertices[4] : bestV0; - bestV1 = better ? 
silhouette.vertices[5] : bestV1; - } - } - } - } - - // Check the last 2 edges missed by the triangular traversal: - // Edge count-2: vertices[count-2] -> vertices[count-1], check V0..V[count-3] - // Edge count-1: vertices[count-1] -> vertices[0], check V1..V[count-2] - // Explicit per-count unrolling avoids the generic loop with runtime index comparisons. - { - // Penultimate edge: vertices[count-2] -> vertices[count-1] - const uint32_t penIdx = silhouette.count - 2; - float32_t3 enPen = cross(silhouette.vertices[penIdx], silhouette.vertices[penIdx + 1]); - precompSil.edgeNormals[penIdx] = float16_t3(enPen); - float32_t maxDotPen = dot(silhouette.vertices[0], enPen); - if (silhouette.count > 3) - { - maxDotPen = max(maxDotPen, dot(silhouette.vertices[1], enPen)); - if (silhouette.count > 4) - { - maxDotPen = max(maxDotPen, dot(silhouette.vertices[2], enPen)); - if (silhouette.count > 5) - { - maxDotPen = max(maxDotPen, dot(silhouette.vertices[3], enPen)); - if (silhouette.count > 6) - { - maxDotPen = max(maxDotPen, dot(silhouette.vertices[4], enPen)); - } - } - } - } - - bool betterPen = maxDotPen < bestWidth * (1.0f - EDGE_SELECT_EPS); - bestWidth = betterPen ? maxDotPen : bestWidth; - bestEdge = betterPen ? penIdx : bestEdge; - bestV0 = betterPen ? silhouette.vertices[penIdx] : bestV0; - bestV1 = betterPen ? 
silhouette.vertices[penIdx + 1] : bestV1; - - // Last edge: vertices[count-1] -> vertices[0] (wrap-around) - const uint32_t lastIdx = silhouette.count - 1; - float32_t3 enLast = cross(silhouette.vertices[lastIdx], silhouette.vertices[0]); - precompSil.edgeNormals[lastIdx] = float16_t3(enLast); - float32_t maxDotLast = dot(silhouette.vertices[1], enLast); - if (silhouette.count > 3) - { - maxDotLast = max(maxDotLast, dot(silhouette.vertices[2], enLast)); - if (silhouette.count > 4) - { - maxDotLast = max(maxDotLast, dot(silhouette.vertices[3], enLast)); - if (silhouette.count > 5) - { - maxDotLast = max(maxDotLast, dot(silhouette.vertices[4], enLast)); - if (silhouette.count > 6) - { - maxDotLast = max(maxDotLast, dot(silhouette.vertices[5], enLast)); - } - } - } - } - - bool betterLast = maxDotLast < bestWidth * (1.0f - EDGE_SELECT_EPS); - bestWidth = betterLast ? maxDotLast : bestWidth; - bestEdge = betterLast ? lastIdx : bestEdge; - bestV0 = betterLast ? silhouette.vertices[lastIdx] : bestV0; - bestV1 = betterLast ? 
silhouette.vertices[0] : bestV1; - } - } - - // ======================================================================== - // Template-Unrolled Projection Helpers - // ======================================================================== - - // Project a single vertex onto candidate axes, updating bounds and minZ in one fused pass - template - static void projectAndBound(const float32_t3 vertices[MAX_SILHOUETTE_VERTICES], float32_t3 projAxis1, float32_t3 projAxis2, float32_t3 projAxis3, NBL_REF_ARG(float32_t4) bound, NBL_REF_ARG(float32_t) minZ) - { - float32_t3 v = vertices[I]; - float32_t x = dot(v, projAxis1); - float32_t y = dot(v, projAxis2); - float32_t z = dot(v, projAxis3); - minZ = min(minZ, z); - float32_t rcpZ = rcp(z); - float32_t projX = x * rcpZ; - float32_t projY = y * rcpZ; - bound.x = min(bound.x, projX); - bound.y = min(bound.y, projY); - bound.z = max(bound.z, projX); - bound.w = max(bound.w, projY); - } - - // Project all silhouette vertices (template-unrolled, fused bounds + minZ) - static void projectAllVertices(const ClippedSilhouette silhouette, float32_t3 projAxis1, float32_t3 projAxis2, float32_t3 projAxis3, NBL_REF_ARG(float32_t4) bound, NBL_REF_ARG(float32_t) minZ) - { - bound = float32_t4(1e10f, 1e10f, -1e10f, -1e10f); - minZ = 1e10f; - projectAndBound<0>(silhouette.vertices, projAxis1, projAxis2, projAxis3, bound, minZ); - projectAndBound<1>(silhouette.vertices, projAxis1, projAxis2, projAxis3, bound, minZ); - projectAndBound<2>(silhouette.vertices, projAxis1, projAxis2, projAxis3, bound, minZ); - if (silhouette.count > 3) - { - projectAndBound<3>(silhouette.vertices, projAxis1, projAxis2, projAxis3, bound, minZ); - if (silhouette.count > 4) - { - projectAndBound<4>(silhouette.vertices, projAxis1, projAxis2, projAxis3, bound, minZ); - if (silhouette.count > 5) - { - projectAndBound<5>(silhouette.vertices, projAxis1, projAxis2, projAxis3, bound, minZ); - if (silhouette.count > 6) - { - projectAndBound<6>(silhouette.vertices, 
projAxis1, projAxis2, projAxis3, bound, minZ); - } - } - } - } - } - - // ======================================================================== - // Template-Unrolled Frame Candidate Selection - // ======================================================================== - - // Try an edge as frame candidate for the primary path (axis1 fixed, find best axis2/axis3) - template - static void tryPrimaryFrameCandidate(NBL_CONST_REF_ARG(ClippedSilhouette) silhouette, float32_t3 fixedAxis1, float32_t3 axis3Ref, - NBL_REF_ARG(float32_t) bestArea, NBL_REF_ARG(float32_t3) bestAxis2, - NBL_REF_ARG(float32_t3) bestAxis3, NBL_REF_ARG(bool) found, - NBL_REF_ARG(float32_t) bestMinZ, NBL_REF_ARG(float32_t4) bestBound) - { - const uint32_t j = CheckCount ? ((I + 1 < silhouette.count) ? I + 1 : 0) : I + 1; - float32_t3 edge = silhouette.vertices[j] - silhouette.vertices[I]; - - // Candidate axis2: perpendicular to edge, in plane perpendicular to axis1 - float32_t3 axis2Cand = cross(fixedAxis1, edge); - float32_t lenSq = dot(axis2Cand, axis2Cand); - if (lenSq < 1e-14f) - return; - axis2Cand *= rsqrt(lenSq); - - // Candidate axis3: completes the frame - float32_t3 axis3Cand = cross(fixedAxis1, axis2Cand); - - // Ensure axis3 points toward center (same hemisphere as reference) - if (dot(axis3Cand, axis3Ref) < 0.0f) - { - axis2Cand = -axis2Cand; - axis3Cand = -axis3Cand; - } - - // Fused: check all vertices have positive z AND compute bounding rect in one pass - float32_t4 bound; - float32_t minZ; - projectAllVertices(silhouette, fixedAxis1, axis2Cand, axis3Cand, bound, minZ); - - // Skip if any vertex would have z <= 0 - if (minZ <= 1e-6f) - return; - - float32_t rectArea = (bound.z - bound.x) * (bound.w - bound.y); - if (rectArea < bestArea) - { - bestArea = rectArea; - bestAxis2 = axis2Cand; - bestAxis3 = axis3Cand; - bestMinZ = minZ; - bestBound = bound; - found = true; - } - } - - // Try an edge as frame candidate for the fallback path (axis3 fixed, find best axis1/axis2) - 
template - static void tryFallbackFrameCandidate(NBL_CONST_REF_ARG(ClippedSilhouette) silhouette, float32_t3 fixedAxis3, NBL_REF_ARG(float32_t) bestArea, NBL_REF_ARG(float32_t3) bestAxis1, NBL_REF_ARG(float32_t3) bestAxis2, NBL_REF_ARG(uint32_t) bestEdge, NBL_REF_ARG(float32_t4) bestBound) - { - const uint32_t j = CheckCount ? ((I + 1 < silhouette.count) ? I + 1 : 0) : I + 1; - float32_t3 edge = silhouette.vertices[j] - silhouette.vertices[I]; - - float32_t3 edgeInPlane = edge - fixedAxis3 * dot(edge, fixedAxis3); - float32_t lenSq = dot(edgeInPlane, edgeInPlane); - if (lenSq < 1e-14f) - return; - - float32_t3 axis1Cand = edgeInPlane * rsqrt(lenSq); - float32_t3 axis2Cand = cross(fixedAxis3, axis1Cand); - - float32_t4 bound; - float32_t minZ; - projectAllVertices(silhouette, axis1Cand, axis2Cand, fixedAxis3, bound, minZ); - - float32_t rectArea = (bound.z - bound.x) * (bound.w - bound.y); - if (rectArea < bestArea) - { - bestArea = rectArea; - bestAxis1 = axis1Cand; - bestAxis2 = axis2Cand; - bestBound = bound; - bestEdge = I; - } - } - - // ======================================================================== - // Visualization - // ======================================================================== - -#if VISUALIZE_SAMPLES - float32_t4 visualize(float32_t3 spherePos, float32_t2 ndc, float32_t aaWidth) - { - float32_t4 color = float32_t4(0, 0, 0, 0); - - // Colors for visualization - float32_t3 boundColor1 = float32_t3(1.0f, 0.5f, 0.5f); // Light red for axis1 bounds - float32_t3 boundColor2 = float32_t3(0.5f, 0.5f, 1.0f); // Light blue for axis2 bounds - float32_t3 centerColor = float32_t3(1.0f, 1.0f, 0.0f); // Yellow for center - - float32_t x0 = rectR0.x; - float32_t x1 = rectR0.x + rectExtents.x; - float32_t y0 = rectR0.y; - float32_t y1 = rectR0.y + rectExtents.y; - float32_t z = rectR0.z; - - // Great circle normals for the 4 edges (in local frame, then transform to world) - float32_t3 bottomNormalLocal = normalize(float32_t3(0, -z, y0)); - 
float32_t3 topNormalLocal = normalize(float32_t3(0, z, -y1)); - float32_t3 leftNormalLocal = normalize(float32_t3(-z, 0, x0)); - float32_t3 rightNormalLocal = normalize(float32_t3(z, 0, -x1)); - - // Transform to world space - float32_t3 bottomNormal = bottomNormalLocal.x * axis1 + bottomNormalLocal.y * axis2 + bottomNormalLocal.z * axis3; - float32_t3 topNormal = topNormalLocal.x * axis1 + topNormalLocal.y * axis2 + topNormalLocal.z * axis3; - float32_t3 leftNormal = leftNormalLocal.x * axis1 + leftNormalLocal.y * axis2 + leftNormalLocal.z * axis3; - float32_t3 rightNormal = rightNormalLocal.x * axis1 + rightNormalLocal.y * axis2 + rightNormalLocal.z * axis3; - - // Draw the 4 bounding great circles - color += drawGreatCircleHalf(bottomNormal, spherePos, axis3, aaWidth, boundColor2, 0.004f); - color += drawGreatCircleHalf(topNormal, spherePos, axis3, aaWidth, boundColor2, 0.004f); - color += drawGreatCircleHalf(leftNormal, spherePos, axis3, aaWidth, boundColor1, 0.004f); - color += drawGreatCircleHalf(rightNormal, spherePos, axis3, aaWidth, boundColor1, 0.004f); - - // Draw center point (center of the rectangle projected onto sphere) - float32_t centerX = (x0 + x1) * 0.5f; - float32_t centerY = (y0 + y1) * 0.5f; - float32_t3 centerLocal = normalize(float32_t3(centerX, centerY, z)); - float32_t3 centerWorld = centerLocal.x * axis1 - centerLocal.y * axis2 + centerLocal.z * axis3; - - float32_t3 centerCircle = sphereToCircle(centerWorld); - color += drawCorner(centerCircle, ndc, aaWidth, 0.025f, 0.0f, centerColor); - - color += drawCorner(axis1, ndc, aaWidth, 0.025f, 0.0f, float32_t3(1.0f, 0.0f, 0.0f)); - color += drawCorner(axis2, ndc, aaWidth, 0.025f, 0.0f, float32_t3(0.0f, 1.0f, 0.0f)); - color += drawCorner(axis3, ndc, aaWidth, 0.025f, 0.0f, float32_t3(0.0f, 0.0f, 1.0f)); - - return color; - } -#endif // VISUALIZE_SAMPLES - - // ======================================================================== - // Factory - // 
======================================================================== - - static SphericalPyramid create(NBL_CONST_REF_ARG(ClippedSilhouette) silhouette, NBL_REF_ARG(SilEdgeNormals) silEdgeNormals -#if VISUALIZE_SAMPLES - , - float32_t2 ndc, float32_t3 spherePos, float32_t aaWidth, inout float32_t4 color -#endif - ) - { - SphericalPyramid self; - - // Step 1: Find minimum-width edge using rotating calipers with lune metric - uint32_t bestEdge; - float32_t3 bestV0, bestV1; - float32_t minWidth; - findMinimumWidthEdge(silhouette, bestEdge, bestV0, bestV1, minWidth, silEdgeNormals); - - // Step 2: Build orthonormal frame from best edge - // axis1 = perpendicular to the best edge's great circle (primary caliper direction) - self.axis1 = normalize(cross(bestV0, bestV1)); - - // Compute centroid for reference direction - float32_t3 center = silhouette.getCenter(); - float32_t3 centerInPlane = center - self.axis1 * dot(center, self.axis1); - float32_t3 axis3Ref = normalize(centerInPlane); - - // Step 2b: Try each edge-aligned rotation around axis1 to find the axis2/axis3 - // orientation that keeps all vertices in the positive half-space with minimum - // bounding rectangle area - float32_t bestRectArea = 1e20f; - float32_t3 bestAxis2 = cross(axis3Ref, self.axis1); - float32_t3 bestAxis3 = axis3Ref; - bool foundValidFrame = false; - float32_t bestMinZ = 0.0f; - float32_t4 bounds = float32_t4(-0.1f, -0.1f, 0.1f, 0.1f); - - tryPrimaryFrameCandidate<0>(silhouette, self.axis1, axis3Ref, bestRectArea, bestAxis2, bestAxis3, foundValidFrame, bestMinZ, bounds); - tryPrimaryFrameCandidate<1>(silhouette, self.axis1, axis3Ref, bestRectArea, bestAxis2, bestAxis3, foundValidFrame, bestMinZ, bounds); - tryPrimaryFrameCandidate<2>(silhouette, self.axis1, axis3Ref, bestRectArea, bestAxis2, bestAxis3, foundValidFrame, bestMinZ, bounds); - if (silhouette.count > 3) - { - tryPrimaryFrameCandidate<3, true>(silhouette, self.axis1, axis3Ref, bestRectArea, bestAxis2, bestAxis3, 
foundValidFrame, bestMinZ, bounds); - if (silhouette.count > 4) - { - tryPrimaryFrameCandidate<4, true>(silhouette, self.axis1, axis3Ref, bestRectArea, bestAxis2, bestAxis3, foundValidFrame, bestMinZ, bounds); - if (silhouette.count > 5) - { - tryPrimaryFrameCandidate<5, true>(silhouette, self.axis1, axis3Ref, bestRectArea, bestAxis2, bestAxis3, foundValidFrame, bestMinZ, bounds); - if (silhouette.count > 6) - { - tryPrimaryFrameCandidate<6, true>(silhouette, self.axis1, axis3Ref, bestRectArea, bestAxis2, bestAxis3, foundValidFrame, bestMinZ, bounds); - } - } - } - } - - self.axis2 = bestAxis2; - self.axis3 = bestAxis3; - - // Fallback: if the primary path failed (no valid frame found, or axis3 leaves - // vertices too close to the z=0 singularity), fix axis3 = camera forward and - // search for the best axis1/axis2 rotation around it. - if (!foundValidFrame || bestMinZ < 0.15f) - { - // Use camera forward as axis3 (all silhouette vertices have z > 0 by construction) - self.axis3 = float32_t3(0.0f, 0.0f, 1.0f); - - // Find optimal axis1/axis2 rotation around axis3 by trying each edge - float32_t bestFallbackArea = 1e20f; - // axis3 = (0,0,1), so cross((0,0,1), (1,0,0)) = (0,1,0), cross((0,0,1), (0,1,0)) = (-1,0,0) - self.axis1 = float32_t3(0.0f, 1.0f, 0.0f); - self.axis2 = float32_t3(-1.0f, 0.0f, 0.0f); - - tryFallbackFrameCandidate<0>(silhouette, self.axis3, bestFallbackArea, self.axis1, self.axis2, bestEdge, bounds); - tryFallbackFrameCandidate<1>(silhouette, self.axis3, bestFallbackArea, self.axis1, self.axis2, bestEdge, bounds); - tryFallbackFrameCandidate<2>(silhouette, self.axis3, bestFallbackArea, self.axis1, self.axis2, bestEdge, bounds); - if (silhouette.count > 3) - { - tryFallbackFrameCandidate<3, true>(silhouette, self.axis3, bestFallbackArea, self.axis1, self.axis2, bestEdge, bounds); - if (silhouette.count > 4) - { - tryFallbackFrameCandidate<4, true>(silhouette, self.axis3, bestFallbackArea, self.axis1, self.axis2, bestEdge, bounds); - if 
(silhouette.count > 5) - { - tryFallbackFrameCandidate<5, true>(silhouette, self.axis3, bestFallbackArea, self.axis1, self.axis2, bestEdge, bounds); - if (silhouette.count > 6) - { - tryFallbackFrameCandidate<6, true>(silhouette, self.axis3, bestFallbackArea, self.axis1, self.axis2, bestEdge, bounds); - } - } - } - } - } - - // Degenerate bounds check (single computation, after primary/fallback decision) - if (bounds.x >= bounds.z || bounds.y >= bounds.w) - bounds = float32_t4(-0.1f, -0.1f, 0.1f, 0.1f); - - self.rectR0 = float32_t3(bounds.xy, 1.0f); - self.rectExtents = float32_t2(bounds.zw - bounds.xy); - -#if VISUALIZE_SAMPLES - color += drawCorner(center, ndc, aaWidth, 0.05f, 0.0f, float32_t3(1.0f, 0.0f, 1.0f)); - color += visualizeBestCaliperEdge(silhouette.vertices, bestEdge, silhouette.count, spherePos, aaWidth); - color += self.visualize(spherePos, ndc, aaWidth); -#endif - -#if DEBUG_DATA - DebugDataBuffer[0].pyramidAxis1 = self.axis1; - DebugDataBuffer[0].pyramidAxis2 = self.axis2; - DebugDataBuffer[0].pyramidCenter = center; - DebugDataBuffer[0].pyramidHalfWidth1 = (atan(bounds.z) - atan(bounds.x)) * 0.5f; - DebugDataBuffer[0].pyramidHalfWidth2 = (atan(bounds.w) - atan(bounds.y)) * 0.5f; - DebugDataBuffer[0].pyramidSolidAngle = self.solidAngle; - DebugDataBuffer[0].pyramidBestEdge = bestEdge; - DebugDataBuffer[0].pyramidMin1 = bounds.x; - DebugDataBuffer[0].pyramidMin2 = bounds.y; - DebugDataBuffer[0].pyramidMax1 = bounds.z; - DebugDataBuffer[0].pyramidMax2 = bounds.w; -#endif - - return self; - } + float32_t3 axis1; // edge-aligned, perpendicular to axis3 + float32_t3 axis2; // = cross(axis3, axis1); axis3 reconstructed via getAxis3() + float32_t2 rectR0; // gnomonic bounding rect corner (z=1 implicit) + float32_t2 rectExtents; + + float32_t3 getAxis3() NBL_CONST_MEMBER_FUNC { return cross(axis1, axis2); } + + // ======================================================================== + // Gnomonic Projection + // 
======================================================================== + template + static void projectAndBound(const float32_t3 vertices[MAX_SILHOUETTE_VERTICES], float32_t3 projAxis1, float32_t3 projAxis2, float32_t3 projAxis3, NBL_REF_ARG(float32_t4) bound) + { + float32_t3 v = vertices[I]; + float32_t x = dot(v, projAxis1); + float32_t y = dot(v, projAxis2); + float32_t z = dot(v, projAxis3); + float32_t rcpZ = (z > 0.0f) ? rcp(z) : 0.0f; + float32_t projX = x * rcpZ; + float32_t projY = y * rcpZ; + bound.x = min(bound.x, projX); + bound.y = min(bound.y, projY); + bound.z = max(bound.z, projX); + bound.w = max(bound.w, projY); + } + + // Template-unrolled projection of all vertices. + static void projectAllVertices(const ClippedSilhouette silhouette, float32_t3 projAxis1, float32_t3 projAxis2, float32_t3 projAxis3, NBL_REF_ARG(float32_t4) bound) + { + bound = float32_t4(1e10f, 1e10f, -1e10f, -1e10f); + projectAndBound<0>(silhouette.vertices, projAxis1, projAxis2, projAxis3, bound); + projectAndBound<1>(silhouette.vertices, projAxis1, projAxis2, projAxis3, bound); + projectAndBound<2>(silhouette.vertices, projAxis1, projAxis2, projAxis3, bound); + if (silhouette.count > 3) + { + projectAndBound<3>(silhouette.vertices, projAxis1, projAxis2, projAxis3, bound); + if (silhouette.count > 4) + { + projectAndBound<4>(silhouette.vertices, projAxis1, projAxis2, projAxis3, bound); + if (silhouette.count > 5) + { + projectAndBound<5>(silhouette.vertices, projAxis1, projAxis2, projAxis3, bound); + if (silhouette.count > 6) + { + projectAndBound<6>(silhouette.vertices, projAxis1, projAxis2, projAxis3, bound); + } + } + } + } + } + + // ======================================================================== + // Adaptive Axis3 + // ======================================================================== + + // t = max blend keeping dot(v, centroid*t + (0,0,1)) >= margin. 
+ template + static float32_t blendLimit(const float32_t3 vertices[MAX_SILHOUETTE_VERTICES], float32_t3 center, float32_t margin, float32_t curMin) + { + float32_t cd = dot(vertices[I], center); + float32_t tLimit = (cd < 0.0f) ? ((vertices[I].z - margin) / -cd) : 1e10f; + return min(curMin, tLimit); + } + + static float32_t computeBlendFactor(NBL_CONST_REF_ARG(ClippedSilhouette) silhouette, float32_t3 center, float32_t margin) + { + float32_t t = 1e10f; + t = blendLimit<0>(silhouette.vertices, center, margin, t); + t = blendLimit<1>(silhouette.vertices, center, margin, t); + t = blendLimit<2>(silhouette.vertices, center, margin, t); + if (silhouette.count > 3) + { + t = blendLimit<3>(silhouette.vertices, center, margin, t); + if (silhouette.count > 4) + { + t = blendLimit<4>(silhouette.vertices, center, margin, t); + if (silhouette.count > 5) + { + t = blendLimit<5>(silhouette.vertices, center, margin, t); + if (silhouette.count > 6) + { + t = blendLimit<6>(silhouette.vertices, center, margin, t); + } + } + } + } + return max(t, 0.0f); + } + + // ======================================================================== + // Rotating Calipers (fused edge normal computation) + // ======================================================================== + template + static void tryCaliperCandidate(NBL_CONST_REF_ARG(ClippedSilhouette) silhouette, float32_t3 fixedAxis3, + NBL_REF_ARG(float32_t) bestArea, NBL_REF_ARG(float32_t3) bestAxis1, + NBL_REF_ARG(float32_t3) bestAxis2, NBL_REF_ARG(float32_t4) bestBound, + NBL_REF_ARG(uint32_t) bestEdge, NBL_REF_ARG(SilEdgeNormals) silEdgeNormals) + { + const uint32_t j = CheckCount ? ((I + 1 < silhouette.count) ? I + 1 : 0) : I + 1; + float32_t3 vI = silhouette.vertices[I]; + float32_t3 vJ = silhouette.vertices[j]; + + // Fused: edge normal from the same vertex pair (vertices already in registers) + silEdgeNormals.edgeNormals[I] = cross(vI, vJ); + + float32_t3 edge = vJ - vI; + + // Project edge perpendicular to axis3. 
Skip edges nearly parallel to axis3. + float32_t3 edgeInPlane = edge - fixedAxis3 * dot(edge, fixedAxis3); + float32_t lenSq = dot(edgeInPlane, edgeInPlane); + if (lenSq < 0.01f * dot(edge, edge)) + return; + + float32_t3 axis1Cand = edgeInPlane * rsqrt(lenSq); + float32_t3 axis2Cand = cross(fixedAxis3, axis1Cand); + + float32_t4 bound; + projectAllVertices(silhouette, axis1Cand, axis2Cand, fixedAxis3, bound); + + // Sticky selection: new edge must be meaningfully better (1% smaller area) + // to prevent jitter when two edges have nearly identical bounding rects. + float32_t rectArea = (bound.z - bound.x) * (bound.w - bound.y); + if (rectArea < bestArea * (1.0f - 1e-2f)) + { + bestArea = rectArea; + bestAxis1 = axis1Cand; + bestAxis2 = axis2Cand; + bestBound = bound; + bestEdge = I; + } + } + + // ======================================================================== + // Factory + // ======================================================================== + + static SphericalPyramid create(NBL_CONST_REF_ARG(ClippedSilhouette) silhouette, NBL_REF_ARG(SilEdgeNormals) silEdgeNormals) + { + SphericalPyramid self; + silEdgeNormals = (SilEdgeNormals)0; + + // Step 1: Adaptive axis3 (local var, reconstructed via getAxis3() after construction). + float32_t3 center = silhouette.getUnnormalizedCenter(); + const float32_t AXIS3_MARGIN = 0.15f; + float32_t tBlend = computeBlendFactor(silhouette, center, AXIS3_MARGIN); + float32_t3 axis3 = normalize(center * tBlend + float32_t3(0.0f, 0.0f, 1.0f)); + + // Step 2: Rotating calipers, min-area gnomonic bounding rectangle. + float32_t bestArea = 1e20f; + self.axis1 = float32_t3(0.0f, 1.0f, 0.0f); + self.axis2 = float32_t3(-1.0f, 0.0f, 0.0f); + float32_t4 bounds = float32_t4(-0.1f, -0.1f, 0.1f, 0.1f); + uint32_t bestEdge = 0; + + // Each candidate also computes cross(v[I], v[j]) for edge normals. + // I=2 needs the wrap check because count can be exactly 3 (j must wrap to 0). 
+ tryCaliperCandidate<0>(silhouette, axis3, bestArea, self.axis1, self.axis2, bounds, bestEdge, silEdgeNormals); + tryCaliperCandidate<1>(silhouette, axis3, bestArea, self.axis1, self.axis2, bounds, bestEdge, silEdgeNormals); + tryCaliperCandidate<2, true>(silhouette, axis3, bestArea, self.axis1, self.axis2, bounds, bestEdge, silEdgeNormals); + if (silhouette.count > 3) + { + tryCaliperCandidate<3, true>(silhouette, axis3, bestArea, self.axis1, self.axis2, bounds, bestEdge, silEdgeNormals); + if (silhouette.count > 4) + { + tryCaliperCandidate<4, true>(silhouette, axis3, bestArea, self.axis1, self.axis2, bounds, bestEdge, silEdgeNormals); + if (silhouette.count > 5) + { + tryCaliperCandidate<5, true>(silhouette, axis3, bestArea, self.axis1, self.axis2, bounds, bestEdge, silEdgeNormals); + if (silhouette.count > 6) + { + tryCaliperCandidate<6, true>(silhouette, axis3, bestArea, self.axis1, self.axis2, bounds, bestEdge, silEdgeNormals); + } + } + } + } + + // Step 3: Stabilize axis1 sign against a world-space reference. + { + float32_t3 worldRef = nbl::hlsl::select(abs(axis3.x) < 0.9f, float32_t3(1.0f, 0.0f, 0.0f), float32_t3(0.0f, 1.0f, 0.0f)); + float32_t3 axis1Ref = worldRef - axis3 * dot(worldRef, axis3); + if (dot(self.axis1, axis1Ref) < 0.0f) + { + self.axis1 = -self.axis1; + // axis2 also flips (recomputed below), so mirror both x and y bounds. + bounds = float32_t4(-bounds.z, -bounds.w, -bounds.x, -bounds.y); + } + } + + // Step 4: Recompute axis2 so getAxis3() = cross(axis1, axis2) recovers axis3. 
+ self.axis2 = cross(axis3, self.axis1); + + // Degenerate bounds check + if (bounds.x >= bounds.z || bounds.y >= bounds.w) + bounds = float32_t4(-0.1f, -0.1f, 0.1f, 0.1f); + + self.rectR0 = bounds.xy; + self.rectExtents = float32_t2(bounds.zw - bounds.xy); + + float32_t solidAngle; + { + nbl::hlsl::sampling::SphericalRectangle rectSampler = nbl::hlsl::sampling::SphericalRectangle::create(float32_t3x3(self.axis1, self.axis2, self.getAxis3()), float32_t3(self.rectR0, 1.0f), self.rectExtents); + solidAngle = rectSampler.solidAngle; + } + + VisContext::add(SphereDrawer::drawDot(normalize(center), 0.05f, 0.0f, float32_t3(1.0f, 0.0f, 1.0f))); + VisContext::add(SphereDrawer::visualizeBestCaliperEdge(silhouette, bestEdge)); + self.visualize(); + + DebugRecorder::recordPyramid(self.axis1, self.axis2, center, bounds, solidAngle, bestEdge); + + return self; + } + + // ======================================================================== + // Visualization + // ======================================================================== + + void visualize() + { + // Colors for visualization + float32_t3 boundColor1 = float32_t3(1.0f, 0.5f, 0.5f); // Light red for axis1 bounds + float32_t3 boundColor2 = float32_t3(0.5f, 0.5f, 1.0f); // Light blue for axis2 bounds + float32_t3 centerColor = float32_t3(1.0f, 1.0f, 0.0f); // Yellow for center + + float32_t3 a3 = getAxis3(); + float32_t x0 = rectR0.x; + float32_t x1 = rectR0.x + rectExtents.x; + float32_t y0 = rectR0.y; + float32_t y1 = rectR0.y + rectExtents.y; + const float32_t z = 1.0f; + + // Great circle normals for the 4 edges (in local frame, then transform to world) + float32_t3 bottomNormalLocal = normalize(float32_t3(0, -z, y0)); + float32_t3 topNormalLocal = normalize(float32_t3(0, z, -y1)); + float32_t3 leftNormalLocal = normalize(float32_t3(-z, 0, x0)); + float32_t3 rightNormalLocal = normalize(float32_t3(z, 0, -x1)); + + // Transform to world space + float32_t3 bottomNormal = bottomNormalLocal.x * axis1 + 
bottomNormalLocal.y * axis2 + bottomNormalLocal.z * a3; + float32_t3 topNormal = topNormalLocal.x * axis1 + topNormalLocal.y * axis2 + topNormalLocal.z * a3; + float32_t3 leftNormal = leftNormalLocal.x * axis1 + leftNormalLocal.y * axis2 + leftNormalLocal.z * a3; + float32_t3 rightNormal = rightNormalLocal.x * axis1 + rightNormalLocal.y * axis2 + rightNormalLocal.z * a3; + + // Draw center point (center of the rectangle projected onto sphere) + float32_t centerX = (x0 + x1) * 0.5f; + float32_t centerY = (y0 + y1) * 0.5f; + float32_t3 centerLocal = normalize(float32_t3(centerX, centerY, z)); + float32_t3 centerWorld = centerLocal.x * axis1 + centerLocal.y * axis2 + centerLocal.z * a3; + + VisContext::add(SphereDrawer::drawCorner(centerWorld, 0.025f, 0.0f, centerColor)); + // Draw the 4 bounding great circles + VisContext::add(SphereDrawer::drawGreatCircleHalf(bottomNormal, a3, boundColor2, 0.004f)); + VisContext::add(SphereDrawer::drawGreatCircleHalf(topNormal, a3, boundColor2, 0.004f)); + VisContext::add(SphereDrawer::drawGreatCircleHalf(leftNormal, a3, boundColor1, 0.004f)); + VisContext::add(SphereDrawer::drawGreatCircleHalf(rightNormal, a3, boundColor1, 0.004f)); + + VisContext::add(SphereDrawer::drawDot(axis1, 0.025f, 0.0f, float32_t3(1.0f, 0.0f, 0.0f))); + VisContext::add(SphereDrawer::drawDot(axis2, 0.025f, 0.0f, float32_t3(0.0f, 1.0f, 0.0f))); + VisContext::add(SphereDrawer::drawDot(a3, 0.025f, 0.0f, float32_t3(0.0f, 0.0f, 1.0f))); + } }; -#include "pyramid_sampling/urena.hlsl" + #include "pyramid_sampling/bilinear.hlsl" #include "pyramid_sampling/biquadratic.hlsl" diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling/bilinear.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling/bilinear.hlsl index 7d3319a7c..4094e6bd3 100644 --- a/73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling/bilinear.hlsl +++ b/73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling/bilinear.hlsl @@ -1,4 +1,4 @@ -//// Copyright (C) 
2026-2026 - DevSH Graphics Programming Sp. z O.O. +//// Copyright (C) 2026 - DevSH Graphics Programming Sp. z O.O. //// This file is part of the "Nabla Engine". //// For conditions of distribution and use, see copyright notice in nabla.h #ifndef _SOLID_ANGLE_VIS_EXAMPLE_SAMPLING_BILINEAR_HLSL_INCLUDED_ @@ -13,8 +13,7 @@ struct BilinearSampler { nbl::hlsl::sampling::Bilinear sampler; - float32_t rcpTotalIntegral; - float32_t rectArea; + float32_t rcpRectArea; // Precompute bilinear sampler from pyramid static BilinearSampler create(NBL_CONST_REF_ARG(SphericalPyramid) pyramid) @@ -31,53 +30,44 @@ struct BilinearSampler const float32_t xx0 = x0 * x0, xx1 = x1 * x1; const float32_t yy0 = y0 * y0, yy1 = y1 * y1; - float32_t d; - d = xx0 + yy0 + 1.0f; - const float32_t v00 = rsqrt(d) / d; // x0y0 - d = xx1 + yy0 + 1.0f; - const float32_t v10 = rsqrt(d) / d; // x1y0 - d = xx0 + yy1 + 1.0f; - const float32_t v01 = rsqrt(d) / d; // x0y1 - d = xx1 + yy1 + 1.0f; - const float32_t v11 = rsqrt(d) / d; // x1y1 + // d^{-3/2} = rsqrt(d)^3: 1 rsqrt + 2 mul instead of 1 rsqrt + 1 div + float32_t r; + r = rsqrt(xx0 + yy0 + 1.0f); + const float32_t v00 = r * r * r; // x0y0 + r = rsqrt(xx1 + yy0 + 1.0f); + const float32_t v10 = r * r * r; // x1y0 + r = rsqrt(xx0 + yy1 + 1.0f); + const float32_t v01 = r * r * r; // x0y1 + r = rsqrt(xx1 + yy1 + 1.0f); + const float32_t v11 = r * r * r; // x1y1 // Bilinear layout: (x0y0, x0y1, x1y0, x1y1) self.sampler = nbl::hlsl::sampling::Bilinear::create(float32_t4(v00, v01, v10, v11)); - - // Total integral = average of 4 corners (bilinear integral over unit square) - const float32_t totalIntegral = (v00 + v10 + v01 + v11) * 0.25f; - self.rcpTotalIntegral = 1.0f / max(totalIntegral, 1e-20f); - self.rectArea = pyramid.rectExtents.x * pyramid.rectExtents.y; + self.rcpRectArea = rcp(max(pyramid.rectExtents.x * pyramid.rectExtents.y, 1e-20f)); return self; } // Sample a direction on the spherical pyramid using bilinear importance sampling. 
// Returns the world-space direction; outputs pdf in solid-angle space and validity flag. - float32_t3 sample(NBL_CONST_REF_ARG(SphericalPyramid) pyramid, NBL_CONST_REF_ARG(SilEdgeNormals) silhouette, float32_t2 xi, out float32_t pdf, out bool valid) + float32_t3 sample(NBL_CONST_REF_ARG(SphericalPyramid) pyramid, NBL_CONST_REF_ARG(SilEdgeNormals) silEdgeNormals, float32_t2 xi, out float32_t pdf, out bool valid) { - // Step 1: Sample UV from bilinear distribution (closed-form via quadratic formula) - float32_t rcpPdf; - float32_t2 uv = sampler.generate(rcpPdf, xi); + nbl::hlsl::sampling::Bilinear::cache_type cache; + float32_t2 uv = sampler.generate(xi, cache); - // Step 2: UV to direction - // Bilinear sampler convention: u.y = first-sampled axis (X), u.x = second-sampled axis (Y) - const float32_t localX = pyramid.rectR0.x + uv.y * pyramid.rectExtents.x; - const float32_t localY = pyramid.rectR0.y + uv.x * pyramid.rectExtents.y; + const float32_t localX = pyramid.rectR0.x + uv.x * pyramid.rectExtents.x; + const float32_t localY = pyramid.rectR0.y + uv.y * pyramid.rectExtents.y; - // Compute dist2 and rcpLen once, reuse for both normalization and dSA const float32_t dist2 = localX * localX + localY * localY + 1.0f; const float32_t rcpLen = rsqrt(dist2); float32_t3 direction = (localX * pyramid.axis1 + localY * pyramid.axis2 + - pyramid.axis3) * rcpLen; + pyramid.getAxis3()) * rcpLen; - valid = direction.z > 0.0f && silhouette.isInside(direction); + valid = direction.z > 0.0f && silEdgeNormals.isInsideLocal(localX, localY); - // PDF in solid angle space: 1 / (rcpPdf * dSA * rectArea) - // rcpPdf already = 1/pdfUV from Bilinear::generate, avoid redundant reciprocal - const float32_t dsa = rcpLen / dist2; - pdf = 1.0f / max(rcpPdf * dsa * rectArea, 1e-7f); + // PDF in solid angle space: pdfBilinear * dist2^{3/2} * rcpRectArea + pdf = sampler.forwardPdf(xi, cache) * dist2 * dist2 * rcpLen * rcpRectArea; return direction; } diff --git 
a/73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling/biquadratic.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling/biquadratic.hlsl index e75c89595..fa9e391cc 100644 --- a/73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling/biquadratic.hlsl +++ b/73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling/biquadratic.hlsl @@ -1,155 +1,77 @@ -//// Copyright (C) 2026-2026 - DevSH Graphics Programming Sp. z O.O. +//// Copyright (C) 2026 - DevSH Graphics Programming Sp. z O.O. //// This file is part of the "Nabla Engine". //// For conditions of distribution and use, see copyright notice in nabla.h #ifndef _SOLID_ANGLE_VIS_EXAMPLE_SAMPLING_BIQUADRATIC_HLSL_INCLUDED_ #define _SOLID_ANGLE_VIS_EXAMPLE_SAMPLING_BIQUADRATIC_HLSL_INCLUDED_ +#include // reuse basic structure // ============================================================================ -// Biquadratic Approximation Sampling (Hart et al. 2020) +// Biquadratic Approximation Sampling (cheap solid-angle approximation) // ============================================================================ -// -// Precomputed biquadratic sampler for importance sampling solid angle density. -// Build once from a SphericalPyramid, then call sample() per random pair. - struct BiquadraticSampler { - // Column-major: cols[i] = (row0[i], row1[i], row2[i]) for fast sliceAtY via dot - float32_t3x3 cols; - - // Precomputed marginal (Y) polynomial: f(y) = c0 + y*(c1 + y*c2) - float32_t margC0, margC1, margC2, margIntegral; + nbl::hlsl::sampling::Bilinear baseSampler; // underlying bilinear generator - float32_t rcpTotalIntegral; - float32_t rcpIntegralTimesRcpArea; // rcpTotalIntegral / rectArea (fused for PDF computation) - - // Newton-Raphson CDF inversion for a quadratic PDF (2 iterations) - // Solves: c0*t + (c1/2)*t^2 + (c2/3)*t^3 = u * integral - // Returns sampled t and the PDF value at t (avoids redundant recomputation by caller). 
- // 2 iterations give ~4 decimal digits, should be sufficient for importance sampling with rejection? - static float32_t sampleQuadraticCDF(float32_t u, float32_t c0, float32_t c1, float32_t c2, float32_t integral, out float32_t lastPdfVal) - { - const float32_t target = u * integral; - const float32_t c1half = c1 * 0.5f; - const float32_t c2third = c2 * (1.0f / 3.0f); - float32_t t = u; - - // Iteration 1 - float32_t cdfVal = t * (c0 + t * (c1half + t * c2third)); - lastPdfVal = c0 + t * (c1 + t * c2); - t = clamp(t - (cdfVal - target) / lastPdfVal, 0.0f, 1.0f); - - // Iteration 2 - cdfVal = t * (c0 + t * (c1half + t * c2third)); - lastPdfVal = c0 + t * (c1 + t * c2); - t = clamp(t - (cdfVal - target) / lastPdfVal, 0.0f, 1.0f); - - return t; - } + float32_t rcpRectArea; - // Precompute biquadratic sampler from pyramid (call ONCE, reuse for all samples) + // Precompute biquadratic sampler from pyramid static BiquadraticSampler create(NBL_CONST_REF_ARG(SphericalPyramid) pyramid) { BiquadraticSampler self; - // 3x3 grid positions on the rectangle + // 4 corner positions on the rectangle const float32_t x0 = pyramid.rectR0.x; - const float32_t x1 = x0 + 0.5f * pyramid.rectExtents.x; - const float32_t x2 = x0 + pyramid.rectExtents.x; + const float32_t x1 = x0 + pyramid.rectExtents.x; const float32_t y0 = pyramid.rectR0.y; - const float32_t y1 = y0 + 0.5f * pyramid.rectExtents.y; - const float32_t y2 = y0 + pyramid.rectExtents.y; - - // dSA(x,y) = rsqrt(x^2+y^2+1) / (x^2+y^2+1) [z = rectR0.z = 1.0] - const float32_t xx0 = x0 * x0, xx1 = x1 * x1, xx2 = x2 * x2; - const float32_t yy0 = y0 * y0, yy1 = y1 * y1, yy2 = y2 * y2; - - float32_t3 row0, row1, row2; - float32_t d; - - d = xx0 + yy0 + 1.0f; - row0.x = rsqrt(d) / d; - d = xx1 + yy0 + 1.0f; - row0.y = rsqrt(d) / d; - d = xx2 + yy0 + 1.0f; - row0.z = rsqrt(d) / d; - - d = xx0 + yy1 + 1.0f; - row1.x = rsqrt(d) / d; - d = xx1 + yy1 + 1.0f; - row1.y = rsqrt(d) / d; - d = xx2 + yy1 + 1.0f; - row1.z = rsqrt(d) / d; - - d = 
xx0 + yy2 + 1.0f; - row2.x = rsqrt(d) / d; - d = xx1 + yy2 + 1.0f; - row2.y = rsqrt(d) / d; - d = xx2 + yy2 + 1.0f; - row2.z = rsqrt(d) / d; - - // Store column-major for sliceAtY: cols[i] = (row0[i], row1[i], row2[i]) - self.cols[0] = float32_t3(row0.x, row1.x, row2.x); - self.cols[1] = float32_t3(row0.y, row1.y, row2.y); - self.cols[2] = float32_t3(row0.z, row1.z, row2.z); - - // Marginal along Y: Simpson's rule integral of each row - const float32_t3 marginal = float32_t3( - (row0.x + 4.0f * row0.y + row0.z) / 6.0f, - (row1.x + 4.0f * row1.y + row1.z) / 6.0f, - (row2.x + 4.0f * row2.y + row2.z) / 6.0f); - - // Precompute marginal polynomial: f(y) = c0 + y*(c1 + y*c2) - self.margC0 = marginal[0]; - self.margC1 = -3.0f * marginal[0] + 4.0f * marginal[1] - marginal[2]; - self.margC2 = 2.0f * (marginal[0] - 2.0f * marginal[1] + marginal[2]); - self.margIntegral = (marginal[0] + 4.0f * marginal[1] + marginal[2]) / 6.0f; - - self.rcpTotalIntegral = 1.0f / max(self.margIntegral, 1e-20f); - const float32_t rectArea = pyramid.rectExtents.x * pyramid.rectExtents.y; - self.rcpIntegralTimesRcpArea = self.rcpTotalIntegral / max(rectArea, 1e-20f); + const float32_t y1 = y0 + pyramid.rectExtents.y; + + // Compute solid-angle weights at corners: d^{-3/2} + const float32_t xx0 = x0 * x0, xx1 = x1 * x1; + const float32_t yy0 = y0 * y0, yy1 = y1 * y1; + + // d^{-3/2} = rsqrt(d)^3 + float32_t r; + r = rsqrt(xx0 + yy0 + 1.0f); + const float32_t v00 = r * r * r; + r = rsqrt(xx1 + yy0 + 1.0f); + const float32_t v10 = r * r * r; + r = rsqrt(xx0 + yy1 + 1.0f); + const float32_t v01 = r * r * r; + r = rsqrt(xx1 + yy1 + 1.0f); + const float32_t v11 = r * r * r; + + self.baseSampler = nbl::hlsl::sampling::Bilinear::create(float32_t4(v00, v01, v10, v11)); + self.rcpRectArea = rcp(max(pyramid.rectExtents.x * pyramid.rectExtents.y, 1e-20f)); return self; } // Sample a direction on the spherical pyramid using biquadratic importance sampling. 
- // Returns the world-space direction; outputs pdf in solid-angle space and validity flag. - float32_t3 sample(NBL_CONST_REF_ARG(SphericalPyramid) pyramid, NBL_CONST_REF_ARG(SilEdgeNormals) silhouette, float32_t2 xi, out float32_t pdf, out bool valid) + // Applies a quadratic warp f(t) = t*(2-t) after bilinear sampling to redistribute + // samples. The warp Jacobian f'(t) = 2*(1-t) is accounted for in the PDF. + float32_t3 sample(NBL_CONST_REF_ARG(SphericalPyramid) pyramid, NBL_CONST_REF_ARG(SilEdgeNormals) silEdgeNormals, float32_t2 xi, out float32_t pdf, out bool valid) { - // Step 1: Sample Y from precomputed marginal polynomial - float32_t margPdfAtY; - const float32_t y = sampleQuadraticCDF(xi.y, margC0, margC1, margC2, margIntegral, margPdfAtY); - - // Step 2: Compute conditional X slice at sampled Y via Lagrange basis - const float32_t y2 = y * y; - const float32_t3 Ly = float32_t3(2.0f * y2 - 3.0f * y + 1.0f, -4.0f * y2 + 4.0f * y, 2.0f * y2 - y); - const float32_t3 slice = float32_t3(dot(cols[0], Ly), dot(cols[1], Ly), dot(cols[2], Ly)); - - // Step 3: Build conditional polynomial and sample X - const float32_t condC0 = slice[0]; - const float32_t condC1 = -3.0f * slice[0] + 4.0f * slice[1] - slice[2]; - const float32_t condC2 = 2.0f * (slice[0] - 2.0f * slice[1] + slice[2]); - const float32_t condIntegral = (slice[0] + 4.0f * slice[1] + slice[2]) / 6.0f; - float32_t condPdfAtX; - const float32_t x = sampleQuadraticCDF(xi.x, condC0, condC1, condC2, condIntegral, condPdfAtX); - - // Step 4: UV to direction - const float32_t localX = pyramid.rectR0.x + x * pyramid.rectExtents.x; - const float32_t localY = pyramid.rectR0.y + y * pyramid.rectExtents.y; - - // Compute dist2 and rcpLen once, reuse for both normalization and dSA + nbl::hlsl::sampling::Bilinear::cache_type cache; + float32_t2 uv = baseSampler.generate(xi, cache); + + // Quadratic warp: f(t) = t * (2 - t), f'(t) = 2 * (1 - t) + const float32_t rcpWarpJacobian = rcp(4.0f * (1.0f - uv.x) * (1.0f - 
uv.y)); + uv = float32_t2(uv.x * (2.0f - uv.x), uv.y * (2.0f - uv.y)); + + const float32_t localX = pyramid.rectR0.x + uv.y * pyramid.rectExtents.x; + const float32_t localY = pyramid.rectR0.y + uv.x * pyramid.rectExtents.y; + const float32_t dist2 = localX * localX + localY * localY + 1.0f; const float32_t rcpLen = rsqrt(dist2); float32_t3 direction = (localX * pyramid.axis1 + localY * pyramid.axis2 + - pyramid.axis3) * + pyramid.getAxis3()) * rcpLen; - valid = direction.z > 0.0f && silhouette.isInside(direction); + valid = direction.z > 0.0f && silEdgeNormals.isInsideLocal(localX, localY); - // Step 5: PDF in solid angle space = condPdfAtX / (totalIntegral * dSA * rectArea) - // condPdfAtX is reused from the last Newton iteration - const float32_t dsa = rcpLen / dist2; - pdf = condPdfAtX * rcpIntegralTimesRcpArea / max(dsa, 1e-7f); + // PDF in solid-angle space, accounting for warp Jacobian + pdf = baseSampler.forwardPdf(xi, cache) * dist2 * dist2 * rcpLen * rcpRectArea * rcpWarpJacobian; return direction; } diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling/urena.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling/urena.hlsl deleted file mode 100644 index 6709bf7da..000000000 --- a/73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling/urena.hlsl +++ /dev/null @@ -1,87 +0,0 @@ -//// Copyright (C) 2026-2026 - DevSH Graphics Programming Sp. z O.O. -//// This file is part of the "Nabla Engine". 
-//// For conditions of distribution and use, see copyright notice in nabla.h -#ifndef _SOLID_ANGLE_VIS_EXAMPLE_SAMPLING_URENA_HLSL_INCLUDED_ -#define _SOLID_ANGLE_VIS_EXAMPLE_SAMPLING_URENA_HLSL_INCLUDED_ - -// ============================================================================ -// Sampling using Urena 2003 (SphericalRectangle) -// ============================================================================ - -struct UrenaSampler -{ - float32_t solidAngle; // Solid angle of the bounding region (steradians) - float32_t samplerK; // = 2*pi - q (angle offset for horizontal sampling) - float32_t samplerB0; // = n_z[0] (normalized edge parameter) - float32_t samplerB1; // = n_z[2] (normalized edge parameter) - - // Precompute solid angle AND sampler intermediates in one pass - // (solidAngleOfRectangle and generate() both compute n_z/cosGamma -- fuse them) - static UrenaSampler create(NBL_CONST_REF_ARG(SphericalPyramid) pyramid) - { - UrenaSampler self; - - const float32_t4 denorm_n_z = float32_t4(-pyramid.rectR0.y, pyramid.rectR0.x + pyramid.rectExtents.x, pyramid.rectR0.y + pyramid.rectExtents.y, -pyramid.rectR0.x); - const float32_t4 n_z = denorm_n_z / sqrt((float32_t4)(pyramid.rectR0.z * pyramid.rectR0.z) + denorm_n_z * denorm_n_z); - const float32_t4 cosGamma = float32_t4(-n_z[0] * n_z[1], -n_z[1] * n_z[2], - -n_z[2] * n_z[3], -n_z[3] * n_z[0]); - - nbl::hlsl::math::sincos_accumulator adder = nbl::hlsl::math::sincos_accumulator::create(cosGamma[0]); - adder.addCosine(cosGamma[1]); - const float32_t p = adder.getSumofArccos(); - adder = nbl::hlsl::math::sincos_accumulator::create(cosGamma[2]); - adder.addCosine(cosGamma[3]); - const float32_t q = adder.getSumofArccos(); - - self.solidAngle = p + q - 2.0f * nbl::hlsl::numbers::pi; - self.samplerK = 2.0f * nbl::hlsl::numbers::pi - q; - self.samplerB0 = n_z[0]; - self.samplerB1 = n_z[2]; - - return self; - } - - float32_t3 sample(NBL_CONST_REF_ARG(SphericalPyramid) pyramid, NBL_CONST_REF_ARG(SilEdgeNormals) 
silhouette, float32_t2 xi, out float32_t pdf, out bool valid) - { - // Inlined Urena 2003 with algebraic simplifications: - const float32_t r1x = pyramid.rectR0.x + pyramid.rectExtents.x; - const float32_t r1y = pyramid.rectR0.y + pyramid.rectExtents.y; - - // Horizontal CDF inversion - const float32_t au = xi.x * solidAngle + samplerK; - float32_t sinAu, cosAu; - sincos(au, sinAu, cosAu); - const float32_t fu = (cosAu * samplerB0 - samplerB1) / sinAu; - - // cu = sign(fu)/sqrt(cu_2), xu = cu/sqrt(1-cu^2) - // Fused: xu = sign(fu)/sqrt(cu_2 - 1) [eliminates 2 sqrt + 2 div -> 1 rsqrt] - const float32_t cu_2 = max(fu * fu + samplerB0 * samplerB0, 1.0f); - const float32_t xu = clamp( - (fu >= 0.0f ? 1.0f : -1.0f) * rsqrt(max(cu_2 - 1.0f, 1e-10f)), - pyramid.rectR0.x, r1x); - const float32_t d_2 = xu * xu + 1.0f; - - // Vertical sampling in h-space (div -> rsqrt + mul) - const float32_t h0 = pyramid.rectR0.y * rsqrt(d_2 + pyramid.rectR0.y * pyramid.rectR0.y); - const float32_t h1 = r1y * rsqrt(d_2 + r1y * r1y); - const float32_t hv = h0 + xi.y * (h1 - h0); - - // Normalized direction via ||(xu,yv,1)||^2 = d_2/(1-hv^2): - // localDir.y = yv/||v|| = hv (exact cancellation) - // localDir.xz = (xu, 1) * t where t = sqrt(1-hv^2)/sqrt(d_2) - // Eliminates: sqrt(d_2), yv computation, and normalize() - const float32_t t = sqrt(max(1.0f - hv * hv, 0.0f)) * rsqrt(d_2); - const float32_t3 localDir = float32_t3(xu * t, hv, t); - - float32_t3 direction = localDir.x * pyramid.axis1 + - localDir.y * pyramid.axis2 + - localDir.z * pyramid.axis3; - - valid = direction.z > 0.0f && silhouette.isInside(direction); - pdf = 1.0f / max(solidAngle, 1e-7f); - - return direction; - } -}; - -#endif // _SOLID_ANGLE_VIS_EXAMPLE_SAMPLING_URENA_HLSL_INCLUDED_ diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/ray_vis.frag.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/ray_vis.frag.hlsl index d01b3a07f..360bfa510 100644 --- a/73_SolidAngleVisualizer/app_resources/hlsl/ray_vis.frag.hlsl +++ 
b/73_SolidAngleVisualizer/app_resources/hlsl/ray_vis.frag.hlsl @@ -1,204 +1,30 @@ -//// Copyright (C) 2026-2026 - DevSH Graphics Programming Sp. z O.O. +//// Copyright (C) 2026 - DevSH Graphics Programming Sp. z O.O. //// This file is part of the "Nabla Engine". //// For conditions of distribution and use, see copyright notice in nabla.h #pragma wave shader_stage(fragment) #include "common.hlsl" +#include "debug_vis.hlsl" #include #include "utils.hlsl" using namespace nbl::hlsl; using namespace ext::FullScreenTriangle; -// Visualizes a ray as an arrow from origin in NDC space -// Returns color (rgb), intensity (a), and depth (in extra component) -struct ArrowResult -{ - float32_t4 color : SV_Target0; - float32_t depth : SV_Depth; -}; - [[vk::push_constant]] struct PushConstantRayVis pc; -#if VISUALIZE_SAMPLES #include "drawing.hlsl" -// Ray-AABB intersection in world space -// Returns the distance to the nearest intersection point, or -1 if no hit -float32_t rayAABBIntersection(float32_t3 rayOrigin, float32_t3 rayDir, float32_t3 aabbMin, float32_t3 aabbMax) -{ - float32_t3 invDir = 1.0f / rayDir; - float32_t3 t0 = (aabbMin - rayOrigin) * invDir; - float32_t3 t1 = (aabbMax - rayOrigin) * invDir; - - float32_t3 tmin = min(t0, t1); - float32_t3 tmax = max(t0, t1); - - float32_t tNear = max(max(tmin.x, tmin.y), tmin.z); - float32_t tFar = min(min(tmax.x, tmax.y), tmax.z); - - // Check if ray intersects AABB - if (tNear > tFar || tFar < 0.0) - return -1.0; - - // Return the nearest positive intersection - return tNear >= 0.0 ? 
tNear : tFar; -} - -// Project 3D point to NDC space -float32_t2 projectToNDC(float32_t3 worldPos, float32_t4x4 viewProj, float32_t aspect) -{ - float32_t4 clipPos = mul(viewProj, float32_t4(worldPos, 1.0)); - clipPos /= clipPos.w; - - // Apply aspect ratio correction - clipPos.x *= aspect; - - return clipPos.xy; -} - -ArrowResult visualizeRayAsArrow(float32_t3 rayOrigin, float32_t4 directionAndPdf, float32_t arrowLength, float32_t2 ndcPos, float32_t aspect) -{ - ArrowResult result; - result.color = float32_t4(0, 0, 0, 0); - result.depth = 0.0; // Far plane in reversed-Z - - float32_t3 rayDir = normalize(directionAndPdf.xyz); - float32_t pdf = directionAndPdf.w; - - // Define the 3D line segment - float32_t3 worldStart = rayOrigin; - float32_t3 worldEnd = rayOrigin + rayDir * arrowLength; - - // Transform to view space (camera space) for clipping - float32_t4x4 viewMatrix = pc.viewProjMatrix; // If you have view matrix separately, use that - // For now, we'll work in clip space and check w values - - float32_t4 clipStart = mul(pc.viewProjMatrix, float32_t4(worldStart, 1.0)); - float32_t4 clipEnd = mul(pc.viewProjMatrix, float32_t4(worldEnd, 1.0)); - - // Clip against near plane (w = 0 plane in clip space) - // If both points are behind camera, reject - if (clipStart.w <= 0.001 && clipEnd.w <= 0.001) - return result; - - // If line crosses the near plane, clip it - float32_t t0 = 0.0; - float32_t t1 = 1.0; - - if (clipStart.w <= 0.001) - { - // Start is behind camera, clip to near plane - float32_t t = (0.001 - clipStart.w) / (clipEnd.w - clipStart.w); - t0 = saturate(t); - clipStart = lerp(clipStart, clipEnd, t0); - worldStart = lerp(worldStart, worldEnd, t0); - } - - if (clipEnd.w <= 0.001) - { - // End is behind camera, clip to near plane - float32_t t = (0.001 - clipStart.w) / (clipEnd.w - clipStart.w); - t1 = saturate(t); - clipEnd = lerp(clipStart, clipEnd, t1); - worldEnd = lerp(worldStart, worldEnd, t1); - } - - // Now check if the clipped segment is valid - 
if (t0 >= t1) - return result; - - // Perspective divide to NDC - float32_t2 ndcStart = clipStart.xy / clipStart.w; - float32_t2 ndcEnd = clipEnd.xy / clipEnd.w; - - // Apply aspect ratio correction - ndcStart.x *= aspect; - ndcEnd.x *= aspect; - - // Calculate arrow direction in NDC - float32_t2 arrowVec = ndcEnd - ndcStart; - float32_t arrowNDCLength = length(arrowVec); - - // Skip if arrow is too small on screen - if (arrowNDCLength < 0.005) - return result; - - // Calculate perpendicular distance to line segment in NDC space - float32_t2 toPixel = ndcPos - ndcStart; - float32_t t_ndc = saturate(dot(toPixel, arrowVec) / dot(arrowVec, arrowVec)); - - // Draw line shaft - float32_t lineThickness = 0.002; - float32_t lineIntensity = lineSegment(ndcPos, ndcStart, ndcEnd, lineThickness); - - // Calculate perspective-correct depth - if (lineIntensity > 0.0) - { - // Interpolate in clip space - float32_t4 clipPos = lerp(clipStart, clipEnd, t_ndc); - - // Compute NDC depth for reversed-Z - float32_t depthNDC = clipPos.z / clipPos.w; - result.depth = 1.0f - depthNDC; - - // Clip against valid depth range - if (result.depth < 0.0 || result.depth > 1.0) - { - lineIntensity = 0.0; - } - } - - // Modulate by PDF - float32_t pdfIntensity = saturate(pdf * 0.5); - float32_t3 finalColor = float32_t3(pdfIntensity, pdfIntensity, pdfIntensity); - - result.color = float32_t4(finalColor, lineIntensity); - return result; -} - -// Returns both tMin (entry) and tMax (exit) for ray-AABB intersection -struct AABBIntersection +struct RayVisOutput { - float32_t tMin; // Distance to front face (entry point) - float32_t tMax; // Distance to back face (exit point) - bool hit; // Whether ray intersects the AABB at all + float32_t4 color : SV_Target0; + float32_t depth : SV_Depth; }; -AABBIntersection rayAABBIntersectionFull(float32_t3 origin, float32_t3 dir, float32_t3 boxMin, float32_t3 boxMax) -{ - AABBIntersection result; - result.hit = false; - result.tMin = 0.0f; - result.tMax = 0.0f; - - 
float32_t3 invDir = 1.0f / dir; - float32_t3 t0 = (boxMin - origin) * invDir; - float32_t3 t1 = (boxMax - origin) * invDir; - - float32_t3 tmin = min(t0, t1); - float32_t3 tmax = max(t0, t1); - - result.tMin = max(max(tmin.x, tmin.y), tmin.z); - result.tMax = min(min(tmax.x, tmax.y), tmax.z); - - // Ray intersects if tMax >= tMin and tMax > 0 - result.hit = (result.tMax >= result.tMin) && (result.tMax > 0.0f); - - // If we're inside the box, tMin will be negative - // In that case, we want to use tMax (exit point) - if (result.tMin < 0.0f) - result.tMin = 0.0f; - - return result; -} -#endif // VISUALIZE_SAMPLES - // [shader("pixel")] -[[vk::location(0)]] ArrowResult main(SVertexAttributes vx) +[[vk::location(0)]] RayVisOutput main(SVertexAttributes vx) { - ArrowResult output; -#if VISUALIZE_SAMPLES + RayVisOutput output; output.color = float32_t4(0.0, 0.0, 0.0, 0.0); output.depth = 0.0; // Far plane in reversed-Z (near=0, far=1) float32_t maxDepth = 0.0; // Track closest depth (minimum in reversed-Z) @@ -208,63 +34,56 @@ AABBIntersection rayAABBIntersectionFull(float32_t3 origin, float32_t3 dir, floa float32_t2 ndcPos = vx.uv * 2.0f - 1.0f; float32_t aspect = pc.viewport.z / pc.viewport.w; ndcPos.x *= aspect; + VisContext::begin(ndcPos, float32_t3(0, 0, 0), aaWidth); - for (uint32_t v = 0; v < DebugDataBuffer[0].clippedSilhouetteVertexCount; v++) + // Draw vertices in 3D + for (uint32_t v = 0; v < DebugDataBuffer[0].silhouette.clippedVertexCount; v++) { - float32_t4 clipPos = mul(pc.viewProjMatrix, float32_t4(DebugDataBuffer[0].clippedSilhouetteVertices[v], 1.0)); + float32_t4 clipPos = mul(pc.viewProjMatrix, float32_t4(DebugDataBuffer[0].silhouette.clippedVertices[v], 1.0)); float32_t3 ndcPosVertex = clipPos.xyz / clipPos.w; + ndcPosVertex.x *= aspect; if (ndcPosVertex.z < maxDepth) continue; - float32_t4 intensity = drawCorner(ndcPosVertex, ndcPos, aaWidth, 0.03, 0.0, colorLUT[DebugDataBuffer[0].clippedSilhouetteVerticesIndices[v]]); + float32_t4 intensity = 
SphereDrawer::drawDot(ndcPosVertex, 0.03, 0.0, colorLUT[DebugDataBuffer[0].silhouette.clippedVertexIndices[v]]); // Update depth only where we drew something - if (any(intensity.rgb > 0.0)) + if (intensity.a > 0.0) { - output.color.rgb += intensity.rgb; + VisContext::add(intensity); maxDepth = max(maxDepth, 1.0f - ndcPosVertex.z); } } - uint32_t sampleCount = DebugDataBuffer[0].sampleCount; - - for (uint32_t i = 0; i < sampleCount; i++) + // Draw sample rays + for (uint32_t i = 0; i < DebugDataBuffer[0].sampling.sampleCount; i++) { float32_t3 rayOrigin = float32_t3(0, 0, 0); - float32_t4 directionAndPdf = DebugDataBuffer[0].rayData[i]; + float32_t4 directionAndPdf = DebugDataBuffer[0].sampling.rayData[i]; float32_t3 rayDir = normalize(directionAndPdf.xyz); - // Define cube bounds in local space - float32_t3 cubeLocalMin = float32_t3(-0.5, -0.5, -0.5); - float32_t3 cubeLocalMax = float32_t3(0.5, 0.5, 0.5); - - // Transform ray to local space of the cube (using precomputed inverse) - float32_t3 localRayOrigin = mul(pc.invModelMatrix, float32_t4(rayOrigin, 1.0)).xyz; - float32_t3 localRayDir = normalize(mul(pc.invModelMatrix, float32_t4(rayDir, 0.0)).xyz); - - // Get both entry and exit distances - AABBIntersection intersection = rayAABBIntersectionFull(localRayOrigin, localRayDir, cubeLocalMin, cubeLocalMax); + shapes::OBBView obb = shapes::OBBView::create(pc.modelMatrix); + shapes::OBBView::Intersection intersection = obb.rayIntersection(rayOrigin, rayDir); float32_t arrowLength; float32_t3 arrowColor; if (intersection.hit) { - // Use tMax (exit point at back face) instead of tMin (entry point at front face) - float32_t3 localExitPoint = localRayOrigin + localRayDir * intersection.tMax; - float32_t3 worldExitPoint = mul(pc.modelMatrix, float32_t4(localExitPoint, 1.0)).xyz; - arrowLength = length(worldExitPoint - rayOrigin); + // Use tMax (exit point at back face) + float32_t3 worldExitPoint = rayOrigin + rayDir * intersection.tMax; + arrowLength = intersection.tMax; 
arrowColor = float32_t3(0.0, 1.0, 0.0); // Green for valid samples } else { - // Ray doesn't intersect - THIS SHOULD NEVER HAPPEN with correct sampling! - float32_t3 cubeCenter = mul(pc.modelMatrix, float32_t4(0, 0, 0, 1)).xyz; - arrowLength = length(cubeCenter - rayOrigin) + 2.0; + // Ray doesn't intersect + float32_t3 cubeCenter = obb.getCenter(); + arrowLength = length(cubeCenter - rayOrigin) + 2.0; // make it a little taller arrowColor = float32_t3(1.0, 0.0, 0.0); // Red for BROKEN samples } - ArrowResult arrow = visualizeRayAsArrow(rayOrigin, directionAndPdf, arrowLength, ndcPos, aspect); + SphereDrawer::ArrowResult arrow = SphereDrawer::visualizeRayAsArrow(rayOrigin, directionAndPdf, arrowLength, ndcPos, aspect, pc.viewProjMatrix); // Only update depth if arrow was actually drawn if (arrow.color.a > 0.0) @@ -273,17 +92,17 @@ AABBIntersection rayAABBIntersectionFull(float32_t3 origin, float32_t3 dir, floa } // Modulate arrow color by its alpha (only add where arrow is visible) - output.color.rgb += arrowColor * arrow.color.a; + VisContext::add(float32_t4(arrowColor * arrow.color.a, 0.0)); output.color.a = max(output.color.a, arrow.color.a); } // Clamp to prevent overflow + output.color.rgb += VisContext::flush().rgb; output.color = saturate(output.color); output.color.a = 1.0; // Write the closest depth (minimum in reversed-Z) output.depth = maxDepth; -#endif return output; } diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/silhouette.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/silhouette.hlsl index 8213c17fc..58afa5345 100644 --- a/73_SolidAngleVisualizer/app_resources/hlsl/silhouette.hlsl +++ b/73_SolidAngleVisualizer/app_resources/hlsl/silhouette.hlsl @@ -1,244 +1,399 @@ -//// Copyright (C) 2026-2026 - DevSH Graphics Programming Sp. z O.O. +//// Copyright (C) 2026 - DevSH Graphics Programming Sp. z O.O. //// This file is part of the "Nabla Engine". 
//// For conditions of distribution and use, see copyright notice in nabla.h #ifndef _SOLID_ANGLE_VIS_EXAMPLE_SILHOUETTE_HLSL_INCLUDED_ #define _SOLID_ANGLE_VIS_EXAMPLE_SILHOUETTE_HLSL_INCLUDED_ -#include "gpu_common.hlsl" +#include "common.hlsl" +#include "debug_vis.hlsl" +#include "utils.hlsl" +#include +#include +#include + +using namespace nbl::hlsl; + +// TODO: unused, remove later +// Vertices are ordered CCW relative to the camera view. +static const uint32_t silhouettes[27][7] = { + {6, 1, 3, 2, 6, 4, 5}, // 0: Black + {6, 2, 6, 4, 5, 7, 3}, // 1: White + {6, 0, 4, 5, 7, 3, 2}, // 2: Gray + {6, 1, 3, 7, 6, 4, 5}, // 3: Red + {4, 4, 5, 7, 6, 0, 0}, // 4: Green + {6, 0, 4, 5, 7, 6, 2}, // 5: Blue + {6, 0, 1, 3, 7, 6, 4}, // 6: Yellow + {6, 0, 1, 5, 7, 6, 4}, // 7: Magenta + {6, 0, 1, 5, 7, 6, 2}, // 8: Cyan + {6, 1, 3, 2, 6, 7, 5}, // 9: Orange + {4, 2, 6, 7, 3, 0, 0}, // 10: Light Orange + {6, 0, 4, 6, 7, 3, 2}, // 11: Dark Orange + {4, 1, 3, 7, 5, 0, 0}, // 12: Pink + {4, 0, 4, 6, 7, 3, 2}, // 13: Light Pink + {4, 0, 4, 6, 2, 0, 0}, // 14: Deep Rose + {6, 0, 1, 3, 7, 5, 4}, // 15: Purple + {4, 0, 1, 5, 4, 0, 0}, // 16: Light Purple + {6, 0, 1, 5, 4, 6, 2}, // 17: Indigo + {6, 0, 2, 6, 7, 5, 1}, // 18: Dark Green + {6, 0, 2, 6, 7, 3, 1}, // 19: Lime + {6, 0, 4, 6, 7, 3, 1}, // 20: Forest Green + {6, 0, 2, 3, 7, 5, 1}, // 21: Navy + {4, 0, 2, 3, 1, 0, 0}, // 22: Sky Blue + {6, 0, 4, 6, 2, 3, 1}, // 23: Teal + {6, 0, 2, 3, 7, 5, 4}, // 24: Brown + {6, 0, 2, 3, 1, 5, 4}, // 25: Tan/Beige + {6, 1, 5, 4, 6, 2, 3} // 26: Dark Brown +}; + +// Binary packed silhouettes +static const uint32_t binSilhouettes[27] = { + 0b11000000000000101100110010011001, + 0b11000000000000011111101100110010, + 0b11000000000000010011111101100000, + 0b11000000000000101100110111011001, + 0b10000000000000000000110111101100, + 0b11000000000000010110111101100000, + 0b11000000000000100110111011001000, + 0b11000000000000100110111101001000, + 0b11000000000000010110111101001000, + 
0b11000000000000101111110010011001, + 0b10000000000000000000011111110010, + 0b11000000000000010011111110100000, + 0b10000000000000000000101111011001, + 0b11000000000000010011111110100000, + 0b10000000000000000000010110100000, + 0b11000000000000100101111011001000, + 0b10000000000000000000100101001000, + 0b11000000000000010110100101001000, + 0b11000000000000001101111110010000, + 0b11000000000000001011111110010000, + 0b11000000000000001011111110100000, + 0b11000000000000001101111011010000, + 0b10000000000000000000001011010000, + 0b11000000000000001011010110100000, + 0b11000000000000100101111011010000, + 0b11000000000000100101001011010000, + 0b11000000000000011010110100101001, +}; + +struct BinSilhouette +{ + static BinSilhouette create(uint32_t configIndex) + { + BinSilhouette s = (BinSilhouette)0; + s.data = binSilhouettes[configIndex]; + return s; + } + + uint32_t getVertexIndex(uint32_t index) NBL_CONST_MEMBER_FUNC + { + return (data >> (3u * index)) & 0x7u; + } + + // Get silhouette size + uint32_t getSilhouetteSize() NBL_CONST_MEMBER_FUNC + { + return (data >> 29u) & 0x7u; + } + + // Build a 12-bit mask of which cube edges are part of the silhouette. 
+ // Edge enumeration: for axis in {0,1,2}, for each corner with axis-bit + // clear, edge = (corner, corner | (1<> (axis + 1u); + uint32_t compact = (above << axis) | below; + mask |= 1u << (axis * 4u + compact); + } + return mask; + } + + void rotr(uint32_t shift, uint32_t size) + { + data = nbl::hlsl::rotr(data, shift, size); + } + + void rotl(uint32_t shift, uint32_t size) + { + data = nbl::hlsl::rotl(data, shift, size); + } + + uint32_t data; +}; struct ClippedSilhouette { - float32_t3 vertices[MAX_SILHOUETTE_VERTICES]; // Max 7 vertices after clipping, unnormalized - uint32_t count; - - void normalize() - { - vertices[0] = nbl::hlsl::normalize(vertices[0]); - vertices[1] = nbl::hlsl::normalize(vertices[1]); - vertices[2] = nbl::hlsl::normalize(vertices[2]); - if (count > 3) - { - vertices[3] = nbl::hlsl::normalize(vertices[3]); - if (count > 4) + + static ClippedSilhouette create(shapes::OBBView view) + { + uint32_t3 region; + uint32_t configIndex, vertexCount; + BinSilhouette sil = computeRegionAndConfig(view, region, configIndex, vertexCount); + ClippedSilhouette s = (ClippedSilhouette)0; + s.compute(view, vertexCount, sil); + return s; + } + + // only used by projected parallelogram + void normalize() + { + vertices[0] = nbl::hlsl::normalize(vertices[0]); + vertices[1] = nbl::hlsl::normalize(vertices[1]); + vertices[2] = nbl::hlsl::normalize(vertices[2]); + if (count > 3) + { + vertices[3] = nbl::hlsl::normalize(vertices[3]); + if (count > 4) + { + vertices[4] = nbl::hlsl::normalize(vertices[4]); + if (count > 5) { - vertices[4] = nbl::hlsl::normalize(vertices[4]); - if (count > 5) - { - vertices[5] = nbl::hlsl::normalize(vertices[5]); - if (count > 6) - { - vertices[6] = nbl::hlsl::normalize(vertices[6]); - } - } + vertices[5] = nbl::hlsl::normalize(vertices[5]); + if (count > 6) + { + vertices[6] = nbl::hlsl::normalize(vertices[6]); + } } - } - } - - // Compute the silhouette centroid (average direction) - float32_t3 getCenter() - { - float32_t3 sum = 
float32_t3(0, 0, 0); - - NBL_UNROLL - for (uint32_t i = 0; i < MAX_SILHOUETTE_VERTICES; i++) - { - if (i < count) - sum += vertices[i]; - } - - return nbl::hlsl::normalize(sum); - } - - static uint32_t computeRegionAndConfig(float32_t3x4 modelMatrix, out uint32_t3 region, out uint32_t configIndex, out uint32_t vertexCount) - { - float32_t4x3 columnModel = transpose(modelMatrix); - float32_t3 obbCenter = columnModel[3].xyz; - float32_t3x3 upper3x3 = (float32_t3x3)columnModel; - - float32_t3 rcpSqScales = rcp(float32_t3( - dot(upper3x3[0], upper3x3[0]), - dot(upper3x3[1], upper3x3[1]), - dot(upper3x3[2], upper3x3[2]))); - - float32_t3 normalizedProj = mul(upper3x3, obbCenter) * rcpSqScales; - - region = uint32_t3( - normalizedProj.x < -0.5f ? 0 : (normalizedProj.x > 0.5f ? 2 : 1), - normalizedProj.y < -0.5f ? 0 : (normalizedProj.y > 0.5f ? 2 : 1), - normalizedProj.z < -0.5f ? 0 : (normalizedProj.z > 0.5f ? 2 : 1)); - - configIndex = region.x + region.y * 3u + region.z * 9u; - - uint32_t sil = binSilhouettes[configIndex]; - vertexCount = getSilhouetteSize(sil); - - return sil; - } - - void compute(float32_t3x4 modelMatrix, uint32_t vertexCount, uint32_t sil) - { - count = 0; - - // Build clip mask (z < 0) - uint32_t clipMask = 0u; - NBL_UNROLL - for (uint32_t i = 0; i < 4; i++) - clipMask |= (getVertexZNeg(modelMatrix, getSilhouetteVertex(sil, i)) ? 1u : 0u) << i; - - if (vertexCount == 6) - { - NBL_UNROLL - for (uint32_t i = 4; i < 6; i++) - clipMask |= (getVertexZNeg(modelMatrix, getSilhouetteVertex(sil, i)) ? 1u : 0u) << i; - } - - uint32_t clipCount = countbits(clipMask); - - // Invert clip mask to find first positive vertex - uint32_t invertedMask = ~clipMask & ((1u << vertexCount) - 1u); - - // Check if wrap-around is needed (first and last bits negative) - bool wrapAround = ((clipMask & 1u) != 0u) && ((clipMask & (1u << (vertexCount - 1))) != 0u); - - // Compute rotation amount - uint32_t rotateAmount = wrapAround - ? 
firstbitlow(invertedMask) // first positive - : firstbithigh(clipMask) + 1; // first vertex after last negative - - // Rotate masks - uint32_t rotatedClipMask = rotr(clipMask, rotateAmount, vertexCount); - uint32_t rotatedSil = rotr(sil, rotateAmount * 3, vertexCount * 3); - uint32_t positiveCount = vertexCount - clipCount; - - // ALWAYS compute both clip points - uint32_t lastPosIdx = positiveCount - 1; - uint32_t firstNegIdx = positiveCount; - - float32_t3 vLastPos = getVertex(modelMatrix, getSilhouetteVertex(rotatedSil, lastPosIdx)); - float32_t3 vFirstNeg = getVertex(modelMatrix, getSilhouetteVertex(rotatedSil, firstNegIdx)); - float32_t t = vLastPos.z / (vLastPos.z - vFirstNeg.z); - float32_t3 clipA = lerp(vLastPos, vFirstNeg, t); - - float32_t3 vLastNeg = getVertex(modelMatrix, getSilhouetteVertex(rotatedSil, vertexCount - 1)); - float32_t3 vFirstPos = getVertex(modelMatrix, getSilhouetteVertex(rotatedSil, 0)); - t = vLastNeg.z / (vLastNeg.z - vFirstPos.z); - float32_t3 clipB = lerp(vLastNeg, vFirstPos, t); - - NBL_UNROLL - for (uint32_t i = 0; i < positiveCount; i++) - { - float32_t3 v0 = getVertex(modelMatrix, getSilhouetteVertex(rotatedSil, i)); - -#if DEBUG_DATA - uint32_t originalIndex = (i + rotateAmount) % vertexCount; - DebugDataBuffer[0].clippedSilhouetteVertices[count] = v0; - DebugDataBuffer[0].clippedSilhouetteVerticesIndices[count] = originalIndex; -#endif - vertices[count++] = v0; - } - - if (clipCount > 0 && clipCount < vertexCount) - { -#if DEBUG_DATA - DebugDataBuffer[0].clippedSilhouetteVertices[count] = clipA; - DebugDataBuffer[0].clippedSilhouetteVerticesIndices[count] = CLIP_POINT_A; -#endif - vertices[count++] = clipA; - -#if DEBUG_DATA - DebugDataBuffer[0].clippedSilhouetteVertices[count] = clipB; - DebugDataBuffer[0].clippedSilhouetteVerticesIndices[count] = CLIP_POINT_B; -#endif - vertices[count++] = clipB; - } - -#if DEBUG_DATA - DebugDataBuffer[0].clippedSilhouetteVertexCount = count; - DebugDataBuffer[0].clipMask = clipMask; - 
DebugDataBuffer[0].clipCount = clipCount; - DebugDataBuffer[0].rotatedClipMask = rotatedClipMask; - DebugDataBuffer[0].rotateAmount = rotateAmount; - DebugDataBuffer[0].positiveVertCount = positiveCount; - DebugDataBuffer[0].wrapAround = (uint32_t)wrapAround; - DebugDataBuffer[0].rotatedSil = rotatedSil; -#endif - } + } + } + } + + // Compute the silhouette centroid (average direction) + // Returns unnormalized centroid (sum of vertices). The direction is what + // matters for the adaptive axis3 blend, the magnitude cancels out after + // normalize(center * tBlend + (0,0,1)). just as small optimization. + float32_t3 getUnnormalizedCenter() + { + float32_t3 sum = float32_t3(0, 0, 0); + + NBL_UNROLL + for (uint32_t i = 0; i < MAX_SILHOUETTE_VERTICES; i++) + { + if (i < count) + sum += vertices[i]; + } + + return sum; + } + + static BinSilhouette computeRegionAndConfig(shapes::OBBView view, out uint32_t3 region, out uint32_t configIndex, out uint32_t vertexCount) + { + // With [0,1]^3 local space, the observer's unnormalized OBB-local + // coordinate along axis i is proj_i = -dot(col_i, minCorner). + // Compare against 0 and |col_i|^2 (the unnormalized [0,1] bounds) + // to classify into the 27-configuration LUT. + float32_t3 sqScales = float32_t3( + dot(view.columns[0], view.columns[0]), + dot(view.columns[1], view.columns[1]), + dot(view.columns[2], view.columns[2])); + + float32_t3 proj = -float32_t3( + dot(view.columns[0], view.minCorner), + dot(view.columns[1], view.minCorner), + dot(view.columns[2], view.minCorner)); + + region = uint32_t3( + proj.x < 0 ? 2 : (proj.x > sqScales.x ? 0 : 1), + proj.y < 0 ? 2 : (proj.y > sqScales.y ? 0 : 1), + proj.z < 0 ? 2 : (proj.z > sqScales.z ? 
0 : 1)); + + configIndex = region.x + region.y * 3u + region.z * 9u; + + BinSilhouette sil = BinSilhouette::create(configIndex); + vertexCount = sil.getSilhouetteSize(); + + return sil; + } + + void compute(shapes::OBBView view, uint32_t vertexCount, BinSilhouette sil) + { + + // Build clip mask (z < 0) + uint32_t clipMask = 0u; + NBL_UNROLL + for (uint32_t i = 0; i < 4; i++) + clipMask |= (view.getVertexZ(sil.getVertexIndex(i)) < 0.0f ? 1u : 0u) << i; + + if (vertexCount == 6) + { + NBL_UNROLL + for (uint32_t i = 4; i < 6; i++) + clipMask |= (view.getVertexZ(sil.getVertexIndex(i)) < 0.0f ? 1u : 0u) << i; + } + + uint32_t clipCount = countbits(clipMask); + + // Invert clip mask to find first positive vertex + uint32_t invertedMask = ~clipMask & ((1u << vertexCount) - 1u); + + // Check if wrap-around is needed (first and last bits negative) + bool wrapAround = ((clipMask & 1u) != 0u) && ((clipMask & (1u << (vertexCount - 1))) != 0u); + + // Compute rotation amount + uint32_t rotateAmount = nbl::hlsl::select(wrapAround, firstbitlow(invertedMask), // first positive + firstbithigh(clipMask) + 1); // first vertex after last negative + + // Rotate masks + uint32_t rotatedClipMask = nbl::hlsl::rotr(clipMask, rotateAmount, vertexCount); + sil.rotr(rotateAmount * 3, vertexCount * 3); + uint32_t positiveCount = vertexCount - clipCount; + + // Compute all 4 clip endpoints up front , independent obbVertex calls + // give the compiler maximum ILP alongside the positive-vertex loop. 
+ uint32_t lastPosIdx = positiveCount - 1; + uint32_t firstNegIdx = positiveCount; + + float32_t3 vLastPos = view.getVertex(sil.getVertexIndex(lastPosIdx)); + float32_t3 vFirstNeg = view.getVertex(sil.getVertexIndex(firstNegIdx)); + float32_t t = vLastPos.z / (vLastPos.z - vFirstNeg.z); + float32_t3 clipA = lerp(vLastPos, vFirstNeg, t); + + float32_t3 vLastNeg = view.getVertex(sil.getVertexIndex(vertexCount - 1)); + float32_t3 vFirstPos = view.getVertex(sil.getVertexIndex(0)); + t = vLastNeg.z / (vLastNeg.z - vFirstPos.z); + float32_t3 clipB = lerp(vLastNeg, vFirstPos, t); + + count = 0; + + NBL_UNROLL + for (uint32_t i = 0; i < positiveCount; i++) + { + float32_t3 v0 = view.getVertex(sil.getVertexIndex(i)); + DebugRecorder::recordClippedVertex(count, v0, (i + rotateAmount) % vertexCount); + vertices[count++] = v0; + } + + if (clipCount > 0 && clipCount < vertexCount) + { + DebugRecorder::recordClippedVertex(count, clipA, 23); + vertices[count++] = clipA; + + DebugRecorder::recordClippedVertex(count, clipB, 24); + vertices[count++] = clipB; + } + + DebugRecorder::recordClipResult(count, clipMask, clipCount, rotatedClipMask, + rotateAmount, positiveCount, wrapAround, sil.data); + } + + float32_t3 vertices[MAX_SILHOUETTE_VERTICES]; // Max 7 vertices after clipping, unnormalized + uint32_t count; }; struct SilEdgeNormals { - float16_t3 edgeNormals[MAX_SILHOUETTE_VERTICES]; // 10.5 floats instead of 21 - uint32_t count; + // Better not use and calculate it while creating the sampler + static SilEdgeNormals create(NBL_CONST_REF_ARG(ClippedSilhouette) sil) + { + SilEdgeNormals result = (SilEdgeNormals)0; - // Better not use and calculate it while creating the sampler - static SilEdgeNormals create(NBL_CONST_REF_ARG(ClippedSilhouette) sil) - { - SilEdgeNormals result = (SilEdgeNormals)0; - result.count = sil.count; + float32_t3 v0 = sil.vertices[0]; + float32_t3 v1 = sil.vertices[1]; + float32_t3 v2 = sil.vertices[2]; - float32_t3 v0 = sil.vertices[0]; - float32_t3 v1 = 
sil.vertices[1]; - float32_t3 v2 = sil.vertices[2]; + result.edgeNormals[0] = cross(v0, v1); + result.edgeNormals[1] = cross(v1, v2); - result.edgeNormals[0] = float16_t3(cross(v0, v1)); - result.edgeNormals[1] = float16_t3(cross(v1, v2)); + if (sil.count > 3) + { + float32_t3 v3 = sil.vertices[3]; + result.edgeNormals[2] = cross(v2, v3); - if (sil.count > 3) - { - float32_t3 v3 = sil.vertices[3]; - result.edgeNormals[2] = float16_t3(cross(v2, v3)); + if (sil.count > 4) + { + float32_t3 v4 = sil.vertices[4]; + result.edgeNormals[3] = cross(v3, v4); - if (sil.count > 4) + if (sil.count > 5) { - float32_t3 v4 = sil.vertices[4]; - result.edgeNormals[3] = float16_t3(cross(v3, v4)); - - if (sil.count > 5) - { - float32_t3 v5 = sil.vertices[5]; - result.edgeNormals[4] = float16_t3(cross(v4, v5)); - - if (sil.count > 6) - { - float32_t3 v6 = sil.vertices[6]; - result.edgeNormals[5] = float16_t3(cross(v5, v6)); - result.edgeNormals[6] = float16_t3(cross(v6, v0)); - } - else - { - result.edgeNormals[5] = float16_t3(cross(v5, v0)); - } - } - else - { - result.edgeNormals[4] = float16_t3(cross(v4, v0)); - } + float32_t3 v5 = sil.vertices[5]; + result.edgeNormals[4] = cross(v4, v5); + + if (sil.count > 6) + { + float32_t3 v6 = sil.vertices[6]; + result.edgeNormals[5] = cross(v5, v6); + result.edgeNormals[6] = cross(v6, v0); + } + else + { + result.edgeNormals[5] = cross(v5, v0); + } } else { - result.edgeNormals[3] = float16_t3(cross(v3, v0)); + result.edgeNormals[4] = cross(v4, v0); } - } - else - { - result.edgeNormals[2] = float16_t3(cross(v2, v0)); - } - - return result; - } - - bool isInside(float32_t3 dir) - { - float16_t3 d = float16_t3(dir); - half maxDot = dot(d, edgeNormals[0]); - maxDot = max(maxDot, dot(d, edgeNormals[1])); - maxDot = max(maxDot, dot(d, edgeNormals[2])); - maxDot = max(maxDot, dot(d, edgeNormals[3])); - maxDot = max(maxDot, dot(d, edgeNormals[4])); - maxDot = max(maxDot, dot(d, edgeNormals[5])); - maxDot = max(maxDot, dot(d, edgeNormals[6])); - 
return maxDot <= float16_t(0.0f); - } + } + else + { + result.edgeNormals[3] = cross(v3, v0); + } + } + else + { + result.edgeNormals[2] = cross(v2, v0); + } + + return result; + } + + bool isInside(float32_t3 dir) + { + float32_t maxDot = dot(dir, edgeNormals[0]); + maxDot = max(maxDot, dot(dir, edgeNormals[1])); + maxDot = max(maxDot, dot(dir, edgeNormals[2])); + maxDot = max(maxDot, dot(dir, edgeNormals[3])); + maxDot = max(maxDot, dot(dir, edgeNormals[4])); + maxDot = max(maxDot, dot(dir, edgeNormals[5])); + maxDot = max(maxDot, dot(dir, edgeNormals[6])); + return maxDot <= 0.0f; + } + + // Transform edge normals from world-space to the pyramid's local frame in-place. + // After this, edgeNormals[i] = (dot(n, axis1), dot(n, axis2), dot(n, axis3)) + // and isInsideLocal() can do 2-FMA half-plane tests without extra storage. + // NOTE: destroys world-space normals , isInside() will no longer work correctly. + void transformToLocal(float32_t3 axis1, float32_t3 axis2, float32_t3 axis3) + { + NBL_UNROLL + for (uint32_t i = 0; i < MAX_SILHOUETTE_VERTICES; i++) + { + float32_t3 n = edgeNormals[i]; + edgeNormals[i] = float32_t3(dot(n, axis1), dot(n, axis2), dot(n, axis3)); + } + } + + // 2D gnomonic containment test after transformToLocal(). 
+ // dot(dir_unnorm, n_local) = localX * n.x + localY * n.y + n.z + bool isInsideLocal(float32_t localX, float32_t localY) + { + float32_t maxDot = localX * edgeNormals[0].x + localY * edgeNormals[0].y + edgeNormals[0].z; + maxDot = max(maxDot, localX * edgeNormals[1].x + localY * edgeNormals[1].y + edgeNormals[1].z); + maxDot = max(maxDot, localX * edgeNormals[2].x + localY * edgeNormals[2].y + edgeNormals[2].z); + maxDot = max(maxDot, localX * edgeNormals[3].x + localY * edgeNormals[3].y + edgeNormals[3].z); + maxDot = max(maxDot, localX * edgeNormals[4].x + localY * edgeNormals[4].y + edgeNormals[4].z); + maxDot = max(maxDot, localX * edgeNormals[5].x + localY * edgeNormals[5].y + edgeNormals[5].z); + maxDot = max(maxDot, localX * edgeNormals[6].x + localY * edgeNormals[6].y + edgeNormals[6].z); + return maxDot <= 0.0f; + } + + float32_t3 edgeNormals[MAX_SILHOUETTE_VERTICES]; }; #endif // _SOLID_ANGLE_VIS_EXAMPLE_SILHOUETTE_HLSL_INCLUDED_ diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/solid_angle_vis.frag.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/solid_angle_vis.frag.hlsl index bba9aba75..82728531c 100644 --- a/73_SolidAngleVisualizer/app_resources/hlsl/solid_angle_vis.frag.hlsl +++ b/73_SolidAngleVisualizer/app_resources/hlsl/solid_angle_vis.frag.hlsl @@ -1,4 +1,4 @@ -//// Copyright (C) 2026-2026 - DevSH Graphics Programming Sp. z O.O. +//// Copyright (C) 2026 - DevSH Graphics Programming Sp. z O.O. //// This file is part of the "Nabla Engine". 
//// For conditions of distribution and use, see copyright notice in nabla.h #pragma wave shader_stage(fragment) @@ -20,286 +20,198 @@ using namespace ext::FullScreenTriangle; static const SAMPLING_MODE samplingMode = (SAMPLING_MODE)SAMPLING_MODE_CONST; -void computeCubeGeo() -{ - for (uint32_t i = 0; i < 8; i++) - corners[i] = mul(pc.modelMatrix, float32_t4(constCorners[i], 1.0f)).xyz; - - for (uint32_t f = 0; f < 6; f++) - { - faceCenters[f] = float32_t3(0, 0, 0); - for (uint32_t v = 0; v < 4; v++) - faceCenters[f] += corners[faceToCorners[f][v]]; - faceCenters[f] /= 4.0f; - } -} - -void validateSilhouetteEdges(uint32_t sil, uint32_t vertexCount, inout uint32_t silEdgeMask) -{ -#if DEBUG_DATA - { - for (uint32_t i = 0; i < vertexCount; i++) - { - uint32_t vIdx = i % vertexCount; - uint32_t v1Idx = (i + 1) % vertexCount; - - uint32_t v0Corner = getSilhouetteVertex(sil, vIdx); - uint32_t v1Corner = getSilhouetteVertex(sil, v1Idx); - // Mark edge as part of silhouette - for (uint32_t e = 0; e < 12; e++) - { - uint32_t2 edge = allEdges[e]; - if ((edge.x == v0Corner && edge.y == v1Corner) || - (edge.x == v1Corner && edge.y == v0Corner)) - { - silEdgeMask |= (1u << e); - } - } - } - validateEdgeVisibility(pc.modelMatrix, sil, vertexCount, silEdgeMask); - } -#endif -} - void computeSpherePos(SVertexAttributes vx, out float32_t2 ndc, out float32_t3 spherePos) { - ndc = vx.uv * 2.0f - 1.0f; - float32_t aspect = pc.viewport.z / pc.viewport.w; - ndc.x *= aspect; - - float32_t2 normalized = ndc / CIRCLE_RADIUS; - float32_t r2 = dot(normalized, normalized); - - if (r2 <= 1.0f) - { - spherePos = float32_t3(normalized.x, normalized.y, sqrt(1.0f - r2)); - } - else - { - float32_t uv2Plus1 = r2 + 1.0f; - spherePos = float32_t3(normalized.x * 2.0f, normalized.y * 2.0f, 1.0f - r2) / uv2Plus1; - } - spherePos = normalize(spherePos); + ndc = vx.uv * 2.0f - 1.0f; + float32_t aspect = pc.viewport.z / pc.viewport.w; + ndc.x *= aspect; + + float32_t2 normalized = ndc / CIRCLE_RADIUS; + 
float32_t r2 = dot(normalized, normalized); + + if (r2 <= 1.0f) + { + spherePos = float32_t3(normalized.x, normalized.y, sqrt(1.0f - r2)); + } + else + { + float32_t uv2Plus1 = r2 + 1.0f; + spherePos = float32_t3(normalized.x * 2.0f, normalized.y * 2.0f, 1.0f - r2) / uv2Plus1; + } + spherePos = normalize(spherePos); } -#if VISUALIZE_SAMPLES -float32_t4 visualizeSample(float32_t3 sampleDir, float32_t2 xi, uint32_t index, float32_t2 screenUV, float32_t3 spherePos, float32_t2 ndc, float32_t aaWidth -#if DEBUG_DATA - , - inout RWStructuredBuffer DebugDataBuffer -#endif -) +// Sample a direction from a pyramid-based rectangle sampler, returning validity +template +float32_t3 sampleFromPyramid(inout Sampler sampler, SphericalPyramid pyramid, SilEdgeNormals silEdgeNormals, float32_t2 xi, out float32_t pdf, out bool valid) { - float32_t4 accumColor = 0; - - float32_t2 pssSize = float32_t2(0.3, 0.3); // 30% of screen - float32_t2 pssPos = float32_t2(0.01, 0.01); // Offset from corner - bool isInsidePSS = all(and(screenUV >= pssPos, screenUV <= (pssPos + pssSize))); - - float32_t dist3D = distance(sampleDir, normalize(spherePos)); - float32_t alpha3D = 1.0f - smoothstep(0.0f, 0.02f, dist3D); - - if (alpha3D > 0.0f /* && !isInsidePSS*/) - { - float32_t3 sampleColor = colorLUT[index].rgb; - accumColor += float32_t4(sampleColor * alpha3D, alpha3D); - } - - // if (isInsidePSS) - // { - // // Map the raw xi to the PSS square dimensions - // float32_t2 xiPixelPos = pssPos + xi * pssSize; - // float32_t dist2D = distance(screenUV, xiPixelPos); - - // float32_t alpha2D = drawCross2D(screenUV, xiPixelPos, 0.005f, 0.001f); - // if (alpha2D > 0.0f) - // { - // float32_t3 sampleColor = colorLUT[index].rgb; - // accumColor += float32_t4(sampleColor * alpha2D, alpha2D); - // } - // } - - // // just the outline of the PSS - // if (isInsidePSS && accumColor.a < 0.1) - // accumColor = float32_t4(0.1, 0.1, 0.1, 1.0); - - return accumColor; + typename Sampler::cache_type cache; + float32_t 
hitDist; + float32_t3 localDir = sampler.generateNormalizedLocal(xi, cache, hitDist); + float32_t3 dir = localDir.x * pyramid.axis1 + localDir.y * pyramid.axis2 + localDir.z * pyramid.getAxis3(); + float32_t localX = localDir.x / localDir.z; + float32_t localY = localDir.y / localDir.z; + valid = dir.z > 0.0f && silEdgeNormals.isInsideLocal(localX, localY); + pdf = sampler.forwardPdf(xi, cache); + return dir; } -#endif // VISUALIZE_SAMPLES -// [shader("pixel")] [[vk::location(0)]] float32_t4 main(SVertexAttributes vx) : SV_Target0 { - float32_t4 color = float32_t4(0, 0, 0, 0); - for (uint32_t i = 0; i < 1; i++) - { - float32_t aaWidth = length(float32_t2(ddx(vx.uv.x), ddy(vx.uv.y))); - float32_t3 spherePos; - float32_t2 ndc; - computeSpherePos(vx, ndc, spherePos); -#if !FAST || DEBUG_DATA - computeCubeGeo(); -#endif - uint32_t3 region; - uint32_t configIndex; - uint32_t vertexCount; - uint32_t sil = ClippedSilhouette::computeRegionAndConfig(pc.modelMatrix, region, configIndex, vertexCount); - - uint32_t silEdgeMask = 0; // TODO: take from 'fast' compute() -#if DEBUG_DATA - validateSilhouetteEdges(sil, vertexCount, silEdgeMask); -#endif - ClippedSilhouette silhouette; - silhouette.compute(pc.modelMatrix, vertexCount, sil); - -#if VISUALIZE_SAMPLES - // Draw silhouette edges on the sphere - for (uint32_t ei = 0; ei < silhouette.count; ei++) - { - float32_t3 v0 = normalize(silhouette.vertices[ei]); - float32_t3 v1 = normalize(silhouette.vertices[(ei + 1) % silhouette.count]); - float32_t3 pts[2] = {v0, v1}; - color += drawEdge(0, pts, spherePos, aaWidth); - } -#endif - - TriangleFanSampler samplingData; - Parallelogram parallelogram; - SphericalPyramid pyramid; - UrenaSampler urena; - BiquadraticSampler biquad; - BilinearSampler bilin; - - SilEdgeNormals silEdgeNormals; - //===================================================================== - // Building - //===================================================================== - if (samplingMode == 
SAMPLING_MODE::TRIANGLE_SOLID_ANGLE || - samplingMode == SAMPLING_MODE::TRIANGLE_PROJECTED_SOLID_ANGLE) - { - samplingData = TriangleFanSampler::create(silhouette, samplingMode); - } - else if (samplingMode == SAMPLING_MODE::PROJECTED_PARALLELOGRAM_SOLID_ANGLE) - { - silhouette.normalize(); - parallelogram = Parallelogram::create(silhouette, silEdgeNormals -#if VISUALIZE_SAMPLES - , - ndc, spherePos, aaWidth, color -#endif - ); - } - else if (samplingMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_RECTANGLE || - samplingMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_BIQUADRATIC || - samplingMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_BILINEAR) - { - pyramid = SphericalPyramid::create(silhouette, silEdgeNormals -#if VISUALIZE_SAMPLES - , - ndc, spherePos, aaWidth, color -#endif - ); - - if (samplingMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_RECTANGLE) - urena = UrenaSampler::create(pyramid); - else if (samplingMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_BIQUADRATIC) - biquad = BiquadraticSampler::create(pyramid); - else if (samplingMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_BILINEAR) - bilin = BilinearSampler::create(pyramid); - } - -#if DEBUG_DATA - uint32_t validSampleCount = 0u; - DebugDataBuffer[0].sampleCount = pc.sampleCount; -#endif - //===================================================================== - // Sampling - //===================================================================== - for (uint32_t i = 0; i < pc.sampleCount; i++) - { - // Hash the invocation to offset the grid - float32_t2 xi = float32_t2( - (float32_t(i & 7u) + 0.5) / 8.0f, - (float32_t(i >> 3u) + 0.5) / 8.0f); - - float32_t pdf; - uint32_t index = 0; - float32_t3 sampleDir; - bool valid; - - if (samplingMode == SAMPLING_MODE::TRIANGLE_SOLID_ANGLE || samplingMode == SAMPLING_MODE::TRIANGLE_PROJECTED_SOLID_ANGLE) - sampleDir = samplingData.sample(silhouette, xi, pdf, index); - else if (samplingMode == 
SAMPLING_MODE::PROJECTED_PARALLELOGRAM_SOLID_ANGLE) - sampleDir = parallelogram.sample(silEdgeNormals, xi, pdf, valid); - else if (samplingMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_RECTANGLE) - sampleDir = urena.sample(pyramid, silEdgeNormals, xi, pdf, valid); - else if (samplingMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_BIQUADRATIC) - sampleDir = biquad.sample(pyramid, silEdgeNormals, xi, pdf, valid); - else if (samplingMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_BILINEAR) - sampleDir = bilin.sample(pyramid, silEdgeNormals, xi, pdf, valid); - - if (!valid) - { - pdf = 0.0f; - // sampleDir = float32_t3(0, 0, 1); - } -#if DEBUG_DATA - else - { - validSampleCount++; - } - - DebugDataBuffer[0].rayData[i] = float32_t4(sampleDir, pdf); -#endif - -#if VISUALIZE_SAMPLES - // Draw samples on sphere - color += visualizeSample(sampleDir, xi, index, vx.uv, spherePos, ndc, aaWidth -#if DEBUG_DATA - , - DebugDataBuffer -#endif - ); -#else - if (pdf > 0.0f) - color += float4(sampleDir * 0.02f / pdf, 1.0f); -#endif // VISUALIZE_SAMPLES - } - -#if VISUALIZE_SAMPLES - - // For debugging: Draw a small indicator of which faces are found - // color += drawVisibleFaceOverlay(pc.modelMatrix, spherePos, region, aaWidth); - - // color += drawFaces(pc.modelMatrix, spherePos, aaWidth); - - // Draw clipped silhouette vertices - // color += drawClippedSilhouetteVertices(ndc, silhouette, aaWidth); - // color += drawHiddenEdges(pc.modelMatrix, spherePos, silEdgeMask, aaWidth); - // color += drawCorners(pc.modelMatrix, ndc, aaWidth, 0.05f); - color += drawRing(ndc, aaWidth); - - if (all(vx.uv >= float32_t2(0.f, 0.97f)) && all(vx.uv <= float32_t2(0.03f, 1.0f))) - { - return float32_t4(colorLUT[configIndex], 1.0f); - } -#else -#endif // VISUALIZE_SAMPLES - -#if DEBUG_DATA - InterlockedAdd(DebugDataBuffer[0].validSampleCount, validSampleCount); - InterlockedAdd(DebugDataBuffer[0].threadCount, 1u); - DebugDataBuffer[0].region = uint32_t3(region); - 
DebugDataBuffer[0].silhouetteIndex = uint32_t(configIndex); - DebugDataBuffer[0].silhouetteVertexCount = uint32_t(getSilhouetteSize(sil)); - for (uint32_t i = 0; i < 6; i++) - { - DebugDataBuffer[0].vertices[i] = uint32_t(getSilhouetteVertex(sil, i)); - } - DebugDataBuffer[0].silhouette = sil; - -#endif - } - - return color; + float32_t aaWidth = length(float32_t2(ddx(vx.uv.x), ddy(vx.uv.y))); + float32_t3 spherePos; + float32_t2 ndc; + computeSpherePos(vx, ndc, spherePos); + VisContext::begin(ndc, spherePos, aaWidth); + + shapes::OBBView view = shapes::OBBView::create(pc.modelMatrix); + uint32_t3 region; + uint32_t configIndex; + uint32_t vertexCount; + BinSilhouette sil = ClippedSilhouette::computeRegionAndConfig(view, region, configIndex, vertexCount); + + ClippedSilhouette silhouette; + silhouette.compute(view, vertexCount, sil); + + if (samplingMode == SAMPLING_MODE::SILHOUETTE_CREATION_ONLY) + { + shapes::OBBView perturbedView = view; + perturbedView.minCorner += float32_t3(ndc.x, ndc.y, 0.0f) * 1e-7f; + ClippedSilhouette pSilhouette = ClippedSilhouette::create(perturbedView); + + uint32_t sink = pSilhouette.count; + NBL_UNROLL + for (uint32_t i = 0; i < MAX_SILHOUETTE_VERTICES; i++) + sink ^= asuint(pSilhouette.vertices[i].x) ^ asuint(pSilhouette.vertices[i].y) ^ asuint(pSilhouette.vertices[i].z); + return (float32_t4)asfloat(sink); + } + + // Draw silhouette edges on the sphere + for (uint32_t ei = 0; ei < silhouette.count; ei++) + { + float32_t3 v0 = normalize(silhouette.vertices[ei]); + float32_t3 v1 = normalize(silhouette.vertices[(ei + 1) % silhouette.count]); + float32_t3 pts[2] = {v0, v1}; + VisContext::add(SphereDrawer::drawEdge(0, pts, aaWidth)); + } + + // ===================================================================== + // Build sampler + // ===================================================================== + TriangleFanSampler samplingData; + Parallelogram parallelogram; + SphericalPyramid pyramid; + sampling::SphericalRectangle 
rectSampler; + sampling::ProjectedSphericalRectangle projRectSampler; + BiquadraticSampler biquad; + BilinearSampler bilin; + SilEdgeNormals silEdgeNormals; + + if (samplingMode == SAMPLING_MODE::TRIANGLE_SOLID_ANGLE || + samplingMode == SAMPLING_MODE::TRIANGLE_PROJECTED_SOLID_ANGLE) + { + samplingData = TriangleFanSampler::create(silhouette, samplingMode); + } + else if (samplingMode == SAMPLING_MODE::PROJECTED_PARALLELOGRAM_SOLID_ANGLE) + { + silhouette.normalize(); + parallelogram = Parallelogram::create(silhouette, silEdgeNormals); + } + else if (samplingMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_RECTANGLE || + samplingMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_BIQUADRATIC || + samplingMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_BILINEAR || + samplingMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_PROJECTED_SOLID_ANGLE_RECTANGLE || + samplingMode == SAMPLING_MODE::PYRAMID_CREATION_ONLY) + { + pyramid = SphericalPyramid::create(silhouette, silEdgeNormals); + silEdgeNormals.transformToLocal(pyramid.axis1, pyramid.axis2, pyramid.getAxis3()); + + if (samplingMode == SAMPLING_MODE::PYRAMID_CREATION_ONLY) + { + uint32_t sink = 0; + for (uint32_t j = 0; j < pc.sampleCount; j++) + { + ClippedSilhouette pertSil = silhouette; + float32_t pertScale = (float32_t(j) + ndc.x + ndc.y) * 0.001f; + NBL_UNROLL + for (uint32_t i = 0; i < MAX_SILHOUETTE_VERTICES; i++) + pertSil.vertices[i] = normalize(pertSil.vertices[i] + float32_t3(pertScale * float32_t(i + 1), pertScale * 0.7f, 0.0f)); + + SilEdgeNormals pertEdgeNormals; + SphericalPyramid pertPyramid = SphericalPyramid::create(pertSil, pertEdgeNormals); + sink ^= asuint(pertPyramid.axis1.x) ^ asuint(pertPyramid.axis2.x) ^ asuint(pertPyramid.rectR0.x) ^ asuint(pertPyramid.rectExtents.x) ^ asuint(float32_t(pertEdgeNormals.edgeNormals[0].x)); + } + return (float32_t4)asfloat(sink); + } + + if (samplingMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_RECTANGLE) + rectSampler = 
sampling::SphericalRectangle::create(float32_t3x3(pyramid.axis1, pyramid.axis2, pyramid.getAxis3()), float32_t3(pyramid.rectR0, 1.0f), pyramid.rectExtents); + else if (samplingMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_PROJECTED_SOLID_ANGLE_RECTANGLE) + { + shapes::CompressedSphericalRectangle compressed; + compressed.origin = pyramid.axis1 * pyramid.rectR0.x + pyramid.axis2 * pyramid.rectR0.y + pyramid.getAxis3(); + compressed.right = pyramid.axis1 * pyramid.rectExtents.x; + compressed.up = pyramid.axis2 * pyramid.rectExtents.y; + projRectSampler = sampling::ProjectedSphericalRectangle::create(compressed, float32_t3(0.0f, 0.0f, 0.0f), float32_t3(0.0f, 0.0f, 1.0f), false); + } + else if (samplingMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_BIQUADRATIC) + biquad = BiquadraticSampler::create(pyramid); + else if (samplingMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_BILINEAR) + bilin = BilinearSampler::create(pyramid); + } + + // ===================================================================== + // Sample loop + // ===================================================================== + uint32_t validSampleCount = 0; + DebugRecorder::recordSampleCount(pc.sampleCount); + + for (uint32_t i = 0; i < pc.sampleCount; i++) + { + float32_t2 xi = float32_t2( + (float32_t(i & 7u) + 0.5) / sqrt(pc.sampleCount) + ndc.x * 1e-9f, + (float32_t(i >> 3u) + 0.5) / sqrt(pc.sampleCount) + ndc.y * 1e-9f); + + float32_t pdf; + uint32_t index = 0; + float32_t3 sampleDir; + bool valid; + + if (samplingMode == SAMPLING_MODE::TRIANGLE_SOLID_ANGLE || samplingMode == SAMPLING_MODE::TRIANGLE_PROJECTED_SOLID_ANGLE) + sampleDir = samplingData.sample(silhouette, xi, pdf, index); + else if (samplingMode == SAMPLING_MODE::PROJECTED_PARALLELOGRAM_SOLID_ANGLE) + sampleDir = parallelogram.sample(silEdgeNormals, xi, pdf, valid); + else if (samplingMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_RECTANGLE) + sampleDir = sampleFromPyramid(rectSampler, pyramid, silEdgeNormals, xi, pdf, 
valid); + else if (samplingMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_PROJECTED_SOLID_ANGLE_RECTANGLE) + sampleDir = sampleFromPyramid(projRectSampler, pyramid, silEdgeNormals, xi, pdf, valid); + else if (samplingMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_BIQUADRATIC) + sampleDir = biquad.sample(pyramid, silEdgeNormals, xi, pdf, valid); + else if (samplingMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_BILINEAR) + sampleDir = bilin.sample(pyramid, silEdgeNormals, xi, pdf, valid); + + if (!valid) + pdf = 0.0f; + else + validSampleCount++; + + DebugRecorder::recordRay(i, sampleDir, pdf); + + if (VisContext::enabled()) + VisContext::add(SphereDrawer::visualizeSample(sampleDir, xi, index, vx.uv)); + else if (pdf > 0.0f) + VisContext::add(float4(sampleDir * 0.02f / pdf, 1.0f)); + } + + VisContext::add(SphereDrawer::drawRing(ndc)); + + if (VisContext::enabled() && all(vx.uv >= float32_t2(0.f, 0.97f)) && all(vx.uv <= float32_t2(0.03f, 1.0f))) + return float32_t4(colorLUT[configIndex], 1.0f); + + uint32_t vertexIndices[6]; + for (uint32_t i = 0; i < 6; i++) + vertexIndices[i] = uint32_t(sil.getVertexIndex(i)); + DebugRecorder::recordFrameEnd(region, configIndex, sil.getSilhouetteSize(), sil.data, vertexIndices, validSampleCount); + + return VisContext::flush(); } diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/triangle_sampling.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/triangle_sampling.hlsl index 46277ca27..9053807ca 100644 --- a/73_SolidAngleVisualizer/app_resources/hlsl/triangle_sampling.hlsl +++ b/73_SolidAngleVisualizer/app_resources/hlsl/triangle_sampling.hlsl @@ -1,11 +1,11 @@ -//// Copyright (C) 2026-2026 - DevSH Graphics Programming Sp. z O.O. +//// Copyright (C) 2026 - DevSH Graphics Programming Sp. z O.O. //// This file is part of the "Nabla Engine". 
//// For conditions of distribution and use, see copyright notice in nabla.h #ifndef _SOLID_ANGLE_VIS_EXAMPLE_TRIANGLE_SAMPLING_HLSL_INCLUDED_ #define _SOLID_ANGLE_VIS_EXAMPLE_TRIANGLE_SAMPLING_HLSL_INCLUDED_ // Include the spherical triangle utilities -#include "gpu_common.hlsl" +#include "common.hlsl" #include #include #include @@ -22,220 +22,163 @@ using namespace nbl::hlsl; struct TriangleFanSampler { - uint32_t count; // Number of valid triangles - uint32_t samplingMode; // Mode used during build - float32_t totalWeight; // Sum of all triangle weights - float32_t3 faceNormal; // Face normal (only used for projected mode) - float32_t triangleSolidAngles[MAX_TRIANGLES]; // Weight per triangle (for selection) - uint32_t triangleIndices[MAX_TRIANGLES]; // Vertex index i (forms triangle with v0, vi, vi+1) - - float32_t computeProjectedSolidAngleFallback(float32_t3 v0, float32_t3 v1, float32_t3 v2, float32_t3 N) - { - // 1. Get edge normals (unit vectors) - // We use the cross product of the vertices (unit vectors on sphere) - float32_t3 n0 = cross(v0, v1); - float32_t3 n1 = cross(v1, v2); - float32_t3 n2 = cross(v2, v0); - - // 2. Normalize edge normals (magnitude is sin of the arc length) - float32_t l0 = length(n0); - float32_t l1 = length(n1); - float32_t l2 = length(n2); - - // Guard against degenerate triangles - if (l0 < 1e-7 || l1 < 1e-7 || l2 < 1e-7) - return 0.0f; - - n0 /= l0; - n1 /= l1; - n2 /= l2; - - // 3. Get arc lengths (angles in radians) - float32_t a = asin(clamp(l0, -1.0f, 1.0f)); // side v0-v1 - float32_t b = asin(clamp(l1, -1.0f, 1.0f)); // side v1-v2 - float32_t c = asin(clamp(l2, -1.0f, 1.0f)); // side v2-v0 - - // Handle acos/asin quadrant if dot product is negative - if (dot(v0, v1) < 0) - a = 3.14159265 - a; - if (dot(v1, v2) < 0) - b = 3.14159265 - b; - if (dot(v2, v0) < 0) - c = 3.14159265 - c; - - // 4. 
Compute projected solid angle - float32_t Gamma = 0.5f * (a * dot(n0, N) + b * dot(n1, N) + c * dot(n2, N)); - - // Return the absolute value of the total - return abs(Gamma); - } - - // Build fan triangulation, cache weights for triangle selection - static TriangleFanSampler create(ClippedSilhouette silhouette, uint32_t mode) - { - TriangleFanSampler self; - self.count = 0; - self.totalWeight = 0.0f; - self.samplingMode = mode; - self.faceNormal = float32_t3(0, 0, 0); - - if (silhouette.count < 3) - return self; - - const float32_t3 v0 = silhouette.vertices[0]; - const float32_t3 origin = float32_t3(0, 0, 0); - - // Compute face normal ONCE before the loop - silhouette is planar! - if (mode == SAMPLING_MODE::TRIANGLE_PROJECTED_SOLID_ANGLE) - { - float32_t3 v1 = silhouette.vertices[1]; - float32_t3 v2 = silhouette.vertices[2]; - self.faceNormal = normalize(cross(v1 - v0, v2 - v0)); - } - - // Build fan triangulation from v0 - NBL_UNROLL - for (uint32_t i = 1; i < silhouette.count - 1; i++) - { - float32_t3 v1 = silhouette.vertices[i]; - float32_t3 v2 = silhouette.vertices[i + 1]; - - shapes::SphericalTriangle shapeTri = shapes::SphericalTriangle::create(v0, v1, v2, origin); - - // Skip degenerate triangles - if (shapeTri.pyramidAngles()) - continue; - - // Calculate triangle solid angle - float32_t solidAngle; - if (mode == SAMPLING_MODE::TRIANGLE_PROJECTED_SOLID_ANGLE) - { - float32_t3 cos_vertices = clamp( - (shapeTri.cos_sides - shapeTri.cos_sides.yzx * shapeTri.cos_sides.zxy) * - shapeTri.csc_sides.yzx * shapeTri.csc_sides.zxy, - float32_t3(-1.0f, -1.0f, -1.0f), - float32_t3(1.0f, 1.0f, 1.0f)); - solidAngle = shapeTri.projectedSolidAngleOfTriangle(self.faceNormal, shapeTri.cos_sides, shapeTri.csc_sides, cos_vertices); - } - else - { - solidAngle = shapeTri.solidAngleOfTriangle(); - } - - if (solidAngle <= 0.0f) - continue; - - // Store only what's needed for weighted selection - self.triangleSolidAngles[self.count] = solidAngle; - 
self.triangleIndices[self.count] = i; - self.totalWeight += solidAngle; - self.count++; - } - -#if DEBUG_DATA - // Validate no antipodal edges exist (would create spherical lune) - for (uint32_t i = 0; i < silhouette.count; i++) - { - uint32_t j = (i + 1) % silhouette.count; - float32_t3 n1 = normalize(silhouette.vertices[i]); - float32_t3 n2 = normalize(silhouette.vertices[j]); - - if (dot(n1, n2) < -0.99f) - { - DebugDataBuffer[0].sphericalLuneDetected = 1; - assert(false && "Spherical lune detected: antipodal silhouette edge"); - } - } - DebugDataBuffer[0].maxTrianglesExceeded = (self.count > MAX_TRIANGLES); - DebugDataBuffer[0].triangleCount = self.count; - DebugDataBuffer[0].totalSolidAngles = self.totalWeight; - for (uint32_t tri = 0; tri < self.count; tri++) - { - DebugDataBuffer[0].solidAngles[tri] = self.triangleSolidAngles[tri]; - } -#endif - - return self; - } - - // Sample using cached selection weights, recompute geometry on-demand - float32_t3 sample(ClippedSilhouette silhouette, float32_t2 xi, out float32_t pdf, out uint32_t selectedIdx) - { - selectedIdx = 0; - - // Handle empty or invalid data - if (count == 0 || totalWeight <= 0.0f) - { - pdf = 0.0f; - return float32_t3(0, 0, 1); - } - - // Select triangle using cached weighted random selection - float32_t targetWeight = xi.x * totalWeight; - float32_t cumulativeWeight = 0.0f; - float32_t prevCumulativeWeight = 0.0f; - - NBL_UNROLL - for (uint32_t i = 0; i < count; i++) - { - prevCumulativeWeight = cumulativeWeight; - cumulativeWeight += triangleSolidAngles[i]; - - if (targetWeight <= cumulativeWeight) - { - selectedIdx = i; - break; - } - } - - // Remap xi.x to [0,1] within selected triangle's solidAngle interval - float32_t triSolidAngle = triangleSolidAngles[selectedIdx]; - float32_t u = (targetWeight - prevCumulativeWeight) / max(triSolidAngle, 1e-7f); - - // Reconstruct the selected triangle geometry - uint32_t vertexIdx = triangleIndices[selectedIdx]; - float32_t3 v0 = 
silhouette.vertices[0]; - float32_t3 v1 = silhouette.vertices[vertexIdx]; - float32_t3 v2 = silhouette.vertices[vertexIdx + 1]; - - float32_t3 fn = normalize(cross(v1 - v0, v2 - v0)); - - float32_t3 origin = float32_t3(0, 0, 0); - - shapes::SphericalTriangle shapeTri = shapes::SphericalTriangle::create(v0, v1, v2, origin); - - // Compute vertex angles once - float32_t3 cos_vertices = clamp( - (shapeTri.cos_sides - shapeTri.cos_sides.yzx * shapeTri.cos_sides.zxy) * - shapeTri.csc_sides.yzx * shapeTri.csc_sides.zxy, - float32_t3(-1.0f, -1.0f, -1.0f), - float32_t3(1.0f, 1.0f, 1.0f)); - float32_t3 sin_vertices = sqrt(float32_t3(1.0f, 1.0f, 1.0f) - cos_vertices * cos_vertices); - - // Sample based on mode - float32_t3 direction; - float32_t rcpPdf; - - if (samplingMode == SAMPLING_MODE::TRIANGLE_PROJECTED_SOLID_ANGLE) - { - sampling::ProjectedSphericalTriangle samplingTri = sampling::ProjectedSphericalTriangle::create(shapeTri); - - direction = samplingTri.generate(rcpPdf, triSolidAngle, cos_vertices, sin_vertices, shapeTri.cos_sides[0], shapeTri.cos_sides[2], shapeTri.csc_sides[1], shapeTri.csc_sides[2], fn, false, float32_t2(u, xi.y)); - triSolidAngle = rcpPdf; // projected solid angle returned as rcpPdf - } - else - { - sampling::SphericalTriangle samplingTri = sampling::SphericalTriangle::create(shapeTri); - direction = samplingTri.generate(triSolidAngle, cos_vertices, sin_vertices, shapeTri.cos_sides[0], shapeTri.cos_sides[2], shapeTri.csc_sides[1], shapeTri.csc_sides[2], float32_t2(u, xi.y)); - } - - // Calculate PDF - float32_t trianglePdf = 1.0f / triSolidAngle; - float32_t selectionProb = triSolidAngle / totalWeight; - pdf = trianglePdf * selectionProb; - - return normalize(direction); - } + uint32_t count; // Number of valid triangles + uint32_t samplingMode; // Mode used during build + float32_t totalWeight; // Sum of all triangle weights (for PDF computation) + float32_t3 faceNormal; // Face normal (only used for projected mode) + float32_t 
cdf[MAX_TRIANGLES]; // Normalized CDF: cdf[i] = sum(weight[0..i]) / totalWeight + float32_t triangleSolidAngles[MAX_TRIANGLES]; // Raw weight per triangle (for PDF after selection) + uint32_t triangleIndices[MAX_TRIANGLES]; // Vertex index i (forms triangle with v0, vi, vi+1) + + // Build fan triangulation, cache weights for triangle selection + static TriangleFanSampler create(ClippedSilhouette silhouette, uint32_t mode) + { + TriangleFanSampler self; + self.count = 0; + self.totalWeight = 0.0f; + self.samplingMode = mode; + self.faceNormal = float32_t3(0, 0, 0); + + if (silhouette.count < 3) + return self; + + const float32_t3 v0 = silhouette.vertices[0]; + const float32_t3 origin = float32_t3(0, 0, 0); + + // Compute face normal ONCE before the loop - silhouette is planar! + if (mode == SAMPLING_MODE::TRIANGLE_PROJECTED_SOLID_ANGLE) + { + float32_t3 v1 = silhouette.vertices[1]; + float32_t3 v2 = silhouette.vertices[2]; + self.faceNormal = normalize(cross(v1 - v0, v2 - v0)); + } + + // Build fan triangulation from v0 + NBL_UNROLL + for (uint32_t i = 1; i < silhouette.count - 1; i++) + { + float32_t3 v1 = silhouette.vertices[i]; + float32_t3 v2 = silhouette.vertices[i + 1]; + + const float32_t3 triVerts[3] = {v0, v1, v2}; + shapes::SphericalTriangle shapeTri = shapes::SphericalTriangle::create(triVerts, origin); + + // Skip degenerate triangles + if (shapeTri.solid_angle <= 0.0f) + continue; + + // Calculate triangle solid angle + float32_t solidAngle; + if (mode == SAMPLING_MODE::TRIANGLE_PROJECTED_SOLID_ANGLE) + solidAngle = shapeTri.projectedSolidAngle(self.faceNormal); + else + solidAngle = shapeTri.solid_angle; + + if (solidAngle <= 0.0f) + continue; + + // Store only what's needed for weighted selection + self.triangleSolidAngles[self.count] = solidAngle; + self.triangleIndices[self.count] = i; + self.totalWeight += solidAngle; + self.count++; + } + + // Build normalized CDF from raw weights + { + float32_t rcpTotal = (self.totalWeight > 0.0f) ? 
(1.0f / self.totalWeight) : 0.0f; + float32_t cumulative = 0.0f; + for (uint32_t i = 0; i < self.count; i++) + { + cumulative += self.triangleSolidAngles[i]; + self.cdf[i] = cumulative * rcpTotal; + } + } + + bool luneDetected = false; + for (uint32_t i = 0; i < silhouette.count; i++) + { + uint32_t j = (i + 1) % silhouette.count; + float32_t3 n1 = normalize(silhouette.vertices[i]); + float32_t3 n2 = normalize(silhouette.vertices[j]); + if (dot(n1, n2) < -0.99f) + { + luneDetected = true; + assert(false && "Spherical lune detected: antipodal silhouette edge"); + } + } + DebugRecorder::recordTriangleFan(luneDetected, self.count, self.totalWeight, self.triangleSolidAngles); + + return self; + } + + // Sample using cached selection weights, recompute geometry on-demand + float32_t3 sample(ClippedSilhouette silhouette, float32_t2 xi, out float32_t pdf, out uint32_t selectedIdx) + { + selectedIdx = 0; + + // Handle empty or invalid data + if (count == 0 || totalWeight <= 0.0f) + { + pdf = 0.0f; + return float32_t3(0, 0, 1); + } + + // Select triangle via precomputed normalized CDF + float32_t prevCdf = 0.0f; + NBL_UNROLL + for (uint32_t i = 0; i < count; i++) + { + if (xi.x <= cdf[i]) + { + selectedIdx = i; + break; + } + prevCdf = cdf[i]; + } + + // Remap xi.x to [0,1] within selected triangle's CDF interval + float32_t cdfWidth = cdf[selectedIdx] - prevCdf; + float32_t u = (xi.x - prevCdf) / max(cdfWidth, 1e-7f); + float32_t triSolidAngle = triangleSolidAngles[selectedIdx]; + + // Reconstruct the selected triangle geometry + uint32_t vertexIdx = triangleIndices[selectedIdx]; + float32_t3 v0 = silhouette.vertices[0]; + float32_t3 v1 = silhouette.vertices[vertexIdx]; + float32_t3 v2 = silhouette.vertices[vertexIdx + 1]; + + float32_t3 origin = float32_t3(0, 0, 0); + + const float32_t3 triVerts[3] = {v0, v1, v2}; + shapes::SphericalTriangle shapeTri = shapes::SphericalTriangle::create(triVerts, origin); + + // Sample based on mode + float32_t3 direction; + const 
float32_t2 u2 = float32_t2(u, xi.y); + + if (samplingMode == SAMPLING_MODE::TRIANGLE_PROJECTED_SOLID_ANGLE) + { + // faceNormal was precomputed during create() -- silhouette is planar + sampling::ProjectedSphericalTriangle samplingTri = sampling::ProjectedSphericalTriangle::create(shapeTri, faceNormal, false); + sampling::ProjectedSphericalTriangle::cache_type cache; + direction = samplingTri.generate(u2, cache); + triSolidAngle = 1.0f / samplingTri.forwardPdf(u2, cache); + } + else + { + sampling::SphericalTriangle samplingTri = sampling::SphericalTriangle::create(shapeTri); + sampling::SphericalTriangle::cache_type cache; + direction = samplingTri.generate(u2, cache); + } + + // Calculate PDF + float32_t trianglePdf = 1.0f / triSolidAngle; + float32_t selectionProb = triSolidAngle / totalWeight; + pdf = trianglePdf * selectionProb; + + return normalize(direction); + } }; #endif // _SOLID_ANGLE_VIS_EXAMPLE_TRIANGLE_SAMPLING_HLSL_INCLUDED_ diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/utils.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/utils.hlsl index 832204cf2..5100b2fc0 100644 --- a/73_SolidAngleVisualizer/app_resources/hlsl/utils.hlsl +++ b/73_SolidAngleVisualizer/app_resources/hlsl/utils.hlsl @@ -1,36 +1,13 @@ -//// Copyright (C) 2026-2026 - DevSH Graphics Programming Sp. z O.O. +//// Copyright (C) 2026 - DevSH Graphics Programming Sp. z O.O. //// This file is part of the "Nabla Engine". //// For conditions of distribution and use, see copyright notice in nabla.h #ifndef _SOLID_ANGLE_VIS_EXAMPLE_UTILS_HLSL_INCLUDED_ #define _SOLID_ANGLE_VIS_EXAMPLE_UTILS_HLSL_INCLUDED_ +#include #include #include -// TODO: implemented somewhere else? -// Bit rotation helpers -uint32_t rotl(uint32_t value, uint32_t bits, uint32_t width) -{ - // mask for the width - uint32_t mask = (width == 32) ? 
0xFFFFFFFFu : ((1u << width) - 1u); - value &= mask; - - // Map bits==width -> 0 - bits &= -(bits < width); - - return ((value << bits) | (value >> (width - bits))) & mask; -} - -uint32_t rotr(uint32_t value, uint32_t bits, uint32_t width) -{ - uint32_t mask = ((1u << width) - 1u); - value &= mask; - - // Map bits==width -> 0 - bits &= -(bits < width); - - return ((value >> bits) | (value << (width - bits))) & mask; -} - +// unused uint32_t packSilhouette(const uint32_t s[7]) { uint32_t packed = 0; @@ -51,18 +28,4 @@ uint32_t packSilhouette(const uint32_t s[7]) return packed; } -float32_t2 hammersleySample(uint32_t i, uint32_t numSamples) -{ - return float32_t2( - float32_t(i) / float32_t(numSamples), - float32_t(reversebits(i)) / 4294967295.0f); -} - -float32_t2 nextRandomUnorm2(inout nbl::hlsl::Xoroshiro64StarStar rnd) -{ - return float32_t2( - float32_t(rnd()) * 2.3283064365386963e-10, - float32_t(rnd()) * 2.3283064365386963e-10); -} - #endif // _SOLID_ANGLE_VIS_EXAMPLE_UTILS_HLSL_INCLUDED_ diff --git a/73_SolidAngleVisualizer/include/transform.hpp b/73_SolidAngleVisualizer/include/transform.hpp index e1ffcd764..ecacae17d 100644 --- a/73_SolidAngleVisualizer/include/transform.hpp +++ b/73_SolidAngleVisualizer/include/transform.hpp @@ -24,7 +24,7 @@ TransformReturnInfo EditTransform(float* cameraView, const float* cameraProjecti static ImGuizmo::MODE mCurrentGizmoMode(ImGuizmo::LOCAL); static bool useSnap = false; static float snap[3] = { 1.f, 1.f, 1.f }; - static float bounds[] = { -0.5f, -0.5f, -0.5f, 0.5f, 0.5f, 0.5f }; + static float bounds[] = { 0.0f, 0.0f, 0.0f, 1.0f, 1.0f, 1.0f }; static float boundsSnap[] = { 0.1f, 0.1f, 0.1f }; static bool boundSizing = false; static bool boundSizingSnap = false; diff --git a/73_SolidAngleVisualizer/main.cpp b/73_SolidAngleVisualizer/main.cpp index c60952394..a1441d9bd 100644 --- a/73_SolidAngleVisualizer/main.cpp +++ b/73_SolidAngleVisualizer/main.cpp @@ -3,12 +3,12 @@ // For conditions of distribution and use, see 
copyright notice in nabla.h #include "nbl/this_example/builtin/build/spirv/keys.hpp" -#include "common.hpp" -#include -#include -#include "app_resources/hlsl/common.hlsl" #include "app_resources/hlsl/benchmark/common.hlsl" +#include "app_resources/hlsl/common.hlsl" +#include "common.hpp" #include "nbl/ext/FullScreenTriangle/FullScreenTriangle.h" +#include +#include /* Renders scene texture to an offscreen framebuffer whose color attachment is then sampled into a imgui window. @@ -17,605 +17,617 @@ Written with Nabla's UI extension and got integrated with ImGuizmo to handle sce */ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinResourcesApplication { - using device_base_t = MonoWindowApplication; - using asset_base_t = BuiltinResourcesApplication; - -public: - inline SolidAngleVisualizer(const path &_localInputCWD, const path &_localOutputCWD, const path &_sharedInputCWD, const path &_sharedOutputCWD) - : IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD), - device_base_t({2048, 1024}, EF_UNKNOWN, _localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) - { - } - - inline bool onAppInitialized(smart_refctd_ptr &&system) override - { - if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system))) - return false; - if (!device_base_t::onAppInitialized(smart_refctd_ptr(system))) - return false; - - interface.m_visualizer = this; - - m_semaphore = m_device->createSemaphore(m_realFrameIx); - if (!m_semaphore) - return logFail("Failed to Create a Semaphore!"); - - auto pool = m_device->createCommandPool(getGraphicsQueue()->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); - for (auto i = 0u; i < MaxFramesInFlight; i++) - { - if (!pool) - return logFail("Couldn't create Command Pool!"); - if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, {m_cmdBufs.data() + i, 1})) - return logFail("Couldn't create Command Buffer!"); - } - - const uint32_t 
addtionalBufferOwnershipFamilies[] = {getGraphicsQueue()->getFamilyIndex()}; - m_scene = CGeometryCreatorScene::create( - {.transferQueue = getTransferUpQueue(), - .utilities = m_utils.get(), - .logger = m_logger.get(), - .addtionalBufferOwnershipFamilies = addtionalBufferOwnershipFamilies}, - CSimpleDebugRenderer::DefaultPolygonGeometryPatch); - - // for the scene drawing pass - { - IGPURenderpass::SCreationParams params = {}; - const IGPURenderpass::SCreationParams::SDepthStencilAttachmentDescription depthAttachments[] = { - {{{.format = sceneRenderDepthFormat, - .samples = IGPUImage::ESCF_1_BIT, - .mayAlias = false}, - /*.loadOp =*/{IGPURenderpass::LOAD_OP::CLEAR}, - /*.storeOp =*/{IGPURenderpass::STORE_OP::STORE}, - /*.initialLayout =*/{IGPUImage::LAYOUT::UNDEFINED}, - /*.finalLayout =*/{IGPUImage::LAYOUT::ATTACHMENT_OPTIMAL}}}, - IGPURenderpass::SCreationParams::DepthStencilAttachmentsEnd}; - params.depthStencilAttachments = depthAttachments; - const IGPURenderpass::SCreationParams::SColorAttachmentDescription colorAttachments[] = { - {{ - {.format = finalSceneRenderFormat, - .samples = IGPUImage::E_SAMPLE_COUNT_FLAGS::ESCF_1_BIT, - .mayAlias = false}, - /*.loadOp =*/IGPURenderpass::LOAD_OP::CLEAR, - /*.storeOp =*/IGPURenderpass::STORE_OP::STORE, - /*.initialLayout =*/IGPUImage::LAYOUT::UNDEFINED, - /*.finalLayout =*/IGPUImage::LAYOUT::READ_ONLY_OPTIMAL // ImGUI shall read - }}, - IGPURenderpass::SCreationParams::ColorAttachmentsEnd}; - params.colorAttachments = colorAttachments; - IGPURenderpass::SCreationParams::SSubpassDescription subpasses[] = { - {}, - IGPURenderpass::SCreationParams::SubpassesEnd}; - subpasses[0].depthStencilAttachment = {{.render = {.attachmentIndex = 0, .layout = IGPUImage::LAYOUT::ATTACHMENT_OPTIMAL}}}; - subpasses[0].colorAttachments[0] = {.render = {.attachmentIndex = 0, .layout = IGPUImage::LAYOUT::ATTACHMENT_OPTIMAL}}; - params.subpasses = subpasses; - - const static IGPURenderpass::SCreationParams::SSubpassDependency 
dependencies[] = { - // wipe-transition of Color to ATTACHMENT_OPTIMAL and depth - { - .srcSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, - .dstSubpass = 0, - .memoryBarrier = { - // last place where the depth can get modified in previous frame, `COLOR_ATTACHMENT_OUTPUT_BIT` is implicitly later - // while color is sampled by ImGUI - .srcStageMask = PIPELINE_STAGE_FLAGS::LATE_FRAGMENT_TESTS_BIT | PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT, - // don't want any writes to be available, as we are clearing both attachments - .srcAccessMask = ACCESS_FLAGS::NONE, - // destination needs to wait as early as possible - // TODO: `COLOR_ATTACHMENT_OUTPUT_BIT` shouldn't be needed, because its a logically later stage, see TODO in `ECommonEnums.h` - .dstStageMask = PIPELINE_STAGE_FLAGS::EARLY_FRAGMENT_TESTS_BIT | PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, - // because depth and color get cleared first no read mask - .dstAccessMask = ACCESS_FLAGS::DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT} - // leave view offsets and flags default - }, - { - .srcSubpass = 0, .dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, .memoryBarrier = {// last place where the color can get modified, depth is implicitly earlier - .srcStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, - // only write ops, reads can't be made available, also won't be using depth so don't care about it being visible to anyone else - .srcAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT, - // the ImGUI will sample the color, then next frame we overwrite both attachments - .dstStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT | PIPELINE_STAGE_FLAGS::EARLY_FRAGMENT_TESTS_BIT, - // but we only care about the availability-visibility chain between renderpass and imgui - .dstAccessMask = ACCESS_FLAGS::SAMPLED_READ_BIT} - // leave view offsets and flags default - }, - IGPURenderpass::SCreationParams::DependenciesEnd}; - 
params.dependencies = dependencies; - auto solidAngleRenderpassParams = params; - m_mainRenderpass = m_device->createRenderpass(std::move(params)); - if (!m_mainRenderpass) - return logFail("Failed to create Main Renderpass!"); - - m_solidAngleRenderpass = m_device->createRenderpass(std::move(solidAngleRenderpassParams)); - if (!m_solidAngleRenderpass) - return logFail("Failed to create Solid Angle Renderpass!"); - } - - const auto &geometries = m_scene->getInitParams().geometries; - m_renderer = CSimpleDebugRenderer::create(m_assetMgr.get(), m_solidAngleRenderpass.get(), 0, {&geometries.front().get(), geometries.size()}); - // special case - { - const auto &pipelines = m_renderer->getInitParams().pipelines; - auto ix = 0u; - for (const auto &name : m_scene->getInitParams().geometryNames) - { - if (name == "Cone") - m_renderer->getGeometry(ix).pipeline = pipelines[CSimpleDebugRenderer::SInitParams::PipelineType::Cone]; - ix++; - } - } - // we'll only display one thing at a time - m_renderer->m_instances.resize(1); - - // Create graphics pipeline - { - auto loadPrecompiledShader = [&](auto key) -> smart_refctd_ptr - { - IAssetLoader::SAssetLoadParams lp = {}; - lp.logger = m_logger.get(); - lp.workingDirectory = "app_resources"; - auto assetBundle = m_assetMgr->getAsset(key.data(), lp); - const auto assets = assetBundle.getContents(); - if (assets.empty()) - { - m_logger->log("Could not load precompiled shader!", ILogger::ELL_ERROR); - std::exit(-1); - } - assert(assets.size() == 1); - auto shader = IAsset::castDown(assets[0]); - if (!shader) - { - m_logger->log("Failed to load precompiled shader!", ILogger::ELL_ERROR); - std::exit(-1); - } - return shader; - }; - - ext::FullScreenTriangle::ProtoPipeline fsTriProtoPPln(m_assetMgr.get(), m_device.get(), m_logger.get()); - if (!fsTriProtoPPln) - return logFail("Failed to create Full Screen Triangle protopipeline or load its vertex shader!"); - - // Load pre-compiled fragment shaders (6 modes x 2 debug = 12 
SolidAngleVis + 2 RayVis) - // Can't use string literal template args in a loop, so unroll manually - // Index: mode * 2 + debugFlag (0=release, 1=debug) - smart_refctd_ptr saVisShaders[SAMPLING_MODE::Count * DebugPermutations]; - saVisShaders[0] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_tri_sa">(m_device.get())); - saVisShaders[1] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_tri_sa_dbg">(m_device.get())); - saVisShaders[2] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_tri_psa">(m_device.get())); - saVisShaders[3] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_tri_psa_dbg">(m_device.get())); - saVisShaders[4] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_para">(m_device.get())); - saVisShaders[5] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_para_dbg">(m_device.get())); - saVisShaders[6] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_rectangle">(m_device.get())); - saVisShaders[7] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_rectangle_dbg">(m_device.get())); - saVisShaders[8] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_biquad">(m_device.get())); - saVisShaders[9] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_biquad_dbg">(m_device.get())); - saVisShaders[10] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_bilinear">(m_device.get())); - saVisShaders[11] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_bilinear_dbg">(m_device.get())); - - smart_refctd_ptr rayVisShaders[DebugPermutations]; - rayVisShaders[0] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"ray_vis">(m_device.get())); - rayVisShaders[1] = 
loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"ray_vis_dbg">(m_device.get())); - - smart_refctd_ptr solidAngleVisLayout, rayVisLayout; - nbl::video::IGPUDescriptorSetLayout::SBinding bindings[1] = - { - {.binding = 0, - .type = nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, - .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, - .stageFlags = ShaderStage::ESS_FRAGMENT, - .count = 1}}; - smart_refctd_ptr dsLayout = m_device->createDescriptorSetLayout(bindings); - - const asset::SPushConstantRange saRanges[] = {{.stageFlags = hlsl::ShaderStage::ESS_FRAGMENT, .offset = 0, .size = sizeof(PushConstants)}}; - const asset::SPushConstantRange rayRanges[] = {{.stageFlags = hlsl::ShaderStage::ESS_FRAGMENT, .offset = 0, .size = sizeof(PushConstantRayVis)}}; - - if (!dsLayout) - logFail("Failed to create a Descriptor Layout!\n"); - - solidAngleVisLayout = m_device->createPipelineLayout(saRanges, dsLayout); - - rayVisLayout = m_device->createPipelineLayout(rayRanges, dsLayout); - - { - // Create all SolidAngleVis pipeline variants - for (uint32_t i = 0; i < SAMPLING_MODE::Count * DebugPermutations; i++) - { - const IGPUPipelineBase::SShaderSpecInfo fragSpec = { - .shader = saVisShaders[i].get(), - .entryPoint = "main"}; - m_solidAngleVisPipelines[i] = fsTriProtoPPln.createPipeline(fragSpec, solidAngleVisLayout.get(), m_solidAngleRenderpass.get()); - if (!m_solidAngleVisPipelines[i]) - return logFail("Could not create SolidAngleVis Graphics Pipeline variant %d!", i); - } - - asset::SRasterizationParams rasterParams = ext::FullScreenTriangle::ProtoPipeline::DefaultRasterParams; - rasterParams.depthWriteEnable = true; - rasterParams.depthCompareOp = asset::E_COMPARE_OP::ECO_GREATER; - - // Create all RayVis pipeline variants - for (uint32_t i = 0; i < DebugPermutations; i++) - { - const IGPUPipelineBase::SShaderSpecInfo fragSpec = { - .shader = rayVisShaders[i].get(), - .entryPoint = "main"}; - m_rayVisPipelines[i] = 
fsTriProtoPPln.createPipeline(fragSpec, rayVisLayout.get(), m_mainRenderpass.get(), 0, {}, rasterParams); - if (!m_rayVisPipelines[i]) - return logFail("Could not create RayVis Graphics Pipeline variant %d!", i); - } - } - // Allocate the memory - { - constexpr size_t BufferSize = sizeof(ResultData); - - nbl::video::IGPUBuffer::SCreationParams params = {}; - params.size = BufferSize; - params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT; - m_outputStorageBuffer = m_device->createBuffer(std::move(params)); - if (!m_outputStorageBuffer) - logFail("Failed to create a GPU Buffer of size %d!\n", params.size); - - m_outputStorageBuffer->setObjectDebugName("ResultData output buffer"); - - nbl::video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = m_outputStorageBuffer->getMemoryReqs(); - reqs.memoryTypeBits &= m_physicalDevice->getHostVisibleMemoryTypeBits(); - - m_allocation = m_device->allocate(reqs, m_outputStorageBuffer.get(), nbl::video::IDeviceMemoryAllocation::EMAF_NONE); - if (!m_allocation.isValid()) - logFail("Failed to allocate Device Memory compatible with our GPU Buffer!\n"); - - assert(m_outputStorageBuffer->getBoundMemory().memory == m_allocation.memory.get()); - smart_refctd_ptr pool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE, {&dsLayout.get(), 1}); - - m_ds = pool->createDescriptorSet(std::move(dsLayout)); - { - IGPUDescriptorSet::SDescriptorInfo info[1]; - info[0].desc = smart_refctd_ptr(m_outputStorageBuffer); - info[0].info.buffer = {.offset = 0, .size = BufferSize}; - IGPUDescriptorSet::SWriteDescriptorSet writes[1] = { - {.dstSet = m_ds.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = info}}; - m_device->updateDescriptorSets(writes, {}); - } - } - - if (!m_allocation.memory->map({0ull, m_allocation.memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ)) - logFail("Failed to map the Device Memory!\n"); - - // if the mapping is not coherent the range needs to be 
invalidated to pull in new data for the CPU's caches - const ILogicalDevice::MappedMemoryRange memoryRange(m_allocation.memory.get(), 0ull, m_allocation.memory->getAllocationSize()); - if (!m_allocation.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) - m_device->invalidateMappedMemoryRanges(1, &memoryRange); - } - - // Create ImGUI - { - auto scRes = static_cast(m_surface->getSwapchainResources()); - ext::imgui::UI::SCreationParameters params = {}; - params.resources.texturesInfo = {.setIx = 0u, .bindingIx = TexturesImGUIBindingIndex}; - params.resources.samplersInfo = {.setIx = 0u, .bindingIx = 1u}; - params.utilities = m_utils; - params.transfer = getTransferUpQueue(); - params.pipelineLayout = ext::imgui::UI::createDefaultPipelineLayout(m_utils->getLogicalDevice(), params.resources.texturesInfo, params.resources.samplersInfo, MaxImGUITextures); - params.assetManager = make_smart_refctd_ptr(smart_refctd_ptr(m_system)); - params.renderpass = smart_refctd_ptr(scRes->getRenderpass()); - params.subpassIx = 0u; - params.pipelineCache = nullptr; - interface.imGUI = ext::imgui::UI::create(std::move(params)); - if (!interface.imGUI) - return logFail("Failed to create `nbl::ext::imgui::UI` class"); - } - - // create rest of User Interface - { - auto *imgui = interface.imGUI.get(); - // create the suballocated descriptor set - { - // note that we use default layout provided by our extension, but you are free to create your own by filling ext::imgui::UI::S_CREATION_PARAMETERS::resources - const auto *layout = interface.imGUI->getPipeline()->getLayout()->getDescriptorSetLayout(0u); - auto pool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT, {&layout, 1}); - auto ds = pool->createDescriptorSet(smart_refctd_ptr(layout)); - interface.subAllocDS = make_smart_refctd_ptr(std::move(ds)); - if (!interface.subAllocDS) - return logFail("Failed to create the descriptor set"); - // make 
sure Texture Atlas slot is taken for eternity - { - auto dummy = SubAllocatedDescriptorSet::invalid_value; - interface.subAllocDS->multi_allocate(0, 1, &dummy); - assert(dummy == ext::imgui::UI::FontAtlasTexId); - } - // write constant descriptors, note we don't create info & write pair for the samplers because UI extension's are immutable and baked into DS layout - IGPUDescriptorSet::SDescriptorInfo info = {}; - info.desc = smart_refctd_ptr(interface.imGUI->getFontAtlasView()); - info.info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL; - const IGPUDescriptorSet::SWriteDescriptorSet write = { - .dstSet = interface.subAllocDS->getDescriptorSet(), - .binding = TexturesImGUIBindingIndex, - .arrayElement = ext::imgui::UI::FontAtlasTexId, - .count = 1, - .info = &info}; - if (!m_device->updateDescriptorSets({&write, 1}, {})) - return logFail("Failed to write the descriptor set"); - } - imgui->registerListener([this]() - { interface(); }); - } - - interface.camera.mapKeysToWASD(); - - onAppInitializedFinish(); - return true; - } - - // - virtual inline bool onAppTerminated() - { - SubAllocatedDescriptorSet::value_type fontAtlasDescIx = ext::imgui::UI::FontAtlasTexId; - IGPUDescriptorSet::SDropDescriptorSet dummy[1]; - interface.subAllocDS->multi_deallocate(dummy, TexturesImGUIBindingIndex, 1, &fontAtlasDescIx); - return device_base_t::onAppTerminated(); - } - - inline IQueue::SSubmitInfo::SSemaphoreInfo renderFrame(const std::chrono::microseconds nextPresentationTimestamp) override - { - // CPU events - update(nextPresentationTimestamp); - - { - const auto &virtualSolidAngleWindowRes = interface.solidAngleViewTransformReturnInfo.sceneResolution; - const auto &virtualMainWindowRes = interface.mainViewTransformReturnInfo.sceneResolution; - if (!m_solidAngleViewFramebuffer || m_solidAngleViewFramebuffer->getCreationParameters().width != virtualSolidAngleWindowRes[0] || m_solidAngleViewFramebuffer->getCreationParameters().height != virtualSolidAngleWindowRes[1] || - 
!m_mainViewFramebuffer || m_mainViewFramebuffer->getCreationParameters().width != virtualMainWindowRes[0] || m_mainViewFramebuffer->getCreationParameters().height != virtualMainWindowRes[1]) - recreateFramebuffers(); - } - - // - const auto resourceIx = m_realFrameIx % MaxFramesInFlight; - - auto *const cb = m_cmdBufs.data()[resourceIx].get(); - cb->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); - cb->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - - if (m_solidAngleViewFramebuffer) - { - asset::SBufferRange range{ - .offset = 0, - .size = m_outputStorageBuffer->getSize(), - .buffer = m_outputStorageBuffer}; - cb->fillBuffer(range, 0u); - { - - const auto &creationParams = m_solidAngleViewFramebuffer->getCreationParameters(); - cb->beginDebugMarker("Draw Circle View Frame"); - { - const IGPUCommandBuffer::SClearDepthStencilValue farValue = {.depth = 0.f}; - const IGPUCommandBuffer::SClearColorValue clearValue = {.float32 = {0.f, 0.f, 0.f, 1.f}}; - const IGPUCommandBuffer::SRenderpassBeginInfo renderpassInfo = - { - .framebuffer = m_solidAngleViewFramebuffer.get(), - .colorClearValues = &clearValue, - .depthStencilClearValues = &farValue, - .renderArea = { - .offset = {0, 0}, - .extent = {creationParams.width, creationParams.height}}}; - beginRenderpass(cb, renderpassInfo); - } - // draw scene - { - static uint32_t lastFrameSeed = 0u; - lastFrameSeed = m_frameSeeding ? static_cast(m_realFrameIx) : lastFrameSeed; - PushConstants pc{ - .modelMatrix = hlsl::float32_t3x4(hlsl::transpose(interface.m_OBBModelMatrix)), - .viewport = {0.f, 0.f, static_cast(creationParams.width), static_cast(creationParams.height)}, - .sampleCount = static_cast(m_SampleCount), - .frameIndex = lastFrameSeed}; - const uint32_t debugIdx = m_debugVisualization ? 
1u : 0u; - auto pipeline = m_solidAngleVisPipelines[m_samplingMode * DebugPermutations + debugIdx]; - cb->bindGraphicsPipeline(pipeline.get()); - cb->pushConstants(pipeline->getLayout(), hlsl::ShaderStage::ESS_FRAGMENT, 0, sizeof(pc), &pc); - cb->bindDescriptorSets(nbl::asset::EPBP_GRAPHICS, pipeline->getLayout(), 0, 1, &m_ds.get()); - ext::FullScreenTriangle::recordDrawCall(cb); - } - cb->endRenderPass(); - cb->endDebugMarker(); - } - - if (m_debugVisualization) - { - m_device->waitIdle(); - std::memcpy(&m_GPUOutResulData, static_cast(m_allocation.memory->getMappedPointer()), sizeof(ResultData)); - m_device->waitIdle(); - } - } - // draw main view - if (m_mainViewFramebuffer) - { - { - auto creationParams = m_mainViewFramebuffer->getCreationParameters(); - const IGPUCommandBuffer::SClearDepthStencilValue farValue = {.depth = 0.f}; - const IGPUCommandBuffer::SClearColorValue clearValue = {.float32 = {0.1f, 0.1f, 0.1f, 1.f}}; - const IGPUCommandBuffer::SRenderpassBeginInfo renderpassInfo = - { - .framebuffer = m_mainViewFramebuffer.get(), - .colorClearValues = &clearValue, - .depthStencilClearValues = &farValue, - .renderArea = { - .offset = {0, 0}, - .extent = {creationParams.width, creationParams.height}}}; - beginRenderpass(cb, renderpassInfo); - } - { // draw rays visualization - auto creationParams = m_mainViewFramebuffer->getCreationParameters(); - - cb->beginDebugMarker("Draw Rays visualization"); - // draw scene - { - float32_t4x4 viewProj = *reinterpret_cast(&interface.camera.getConcatenatedMatrix()); - float32_t3x4 view = *reinterpret_cast(&interface.camera.getViewMatrix()); - PushConstantRayVis pc{ - .viewProjMatrix = viewProj, - .viewMatrix = view, - .modelMatrix = hlsl::float32_t3x4(hlsl::transpose(interface.m_OBBModelMatrix)), - .invModelMatrix = hlsl::float32_t3x4(hlsl::transpose(hlsl::inverse(interface.m_OBBModelMatrix))), - .viewport = {0.f, 0.f, static_cast(creationParams.width), static_cast(creationParams.height)}, - .frameIndex = m_frameSeeding ? 
static_cast(m_realFrameIx) : 0u}; - auto pipeline = m_rayVisPipelines[m_debugVisualization ? 1u : 0u]; - cb->bindGraphicsPipeline(pipeline.get()); - cb->pushConstants(pipeline->getLayout(), hlsl::ShaderStage::ESS_FRAGMENT, 0, sizeof(pc), &pc); - cb->bindDescriptorSets(nbl::asset::EPBP_GRAPHICS, pipeline->getLayout(), 0, 1, &m_ds.get()); - ext::FullScreenTriangle::recordDrawCall(cb); - } - cb->endDebugMarker(); - } - // draw scene - { - cb->beginDebugMarker("Main Scene Frame"); - - float32_t3x4 viewMatrix; - float32_t4x4 viewProjMatrix; - // TODO: get rid of legacy matrices - { - const auto &camera = interface.camera; - memcpy(&viewMatrix, &camera.getViewMatrix(), sizeof(viewMatrix)); - memcpy(&viewProjMatrix, &camera.getConcatenatedMatrix(), sizeof(viewProjMatrix)); - } - const auto viewParams = CSimpleDebugRenderer::SViewParams(viewMatrix, viewProjMatrix); - - // tear down scene every frame - auto &instance = m_renderer->m_instances[0]; - instance.world = float32_t3x4(hlsl::transpose(interface.m_OBBModelMatrix)); - instance.packedGeo = m_renderer->getGeometries().data(); // cube // +interface.gcIndex; - m_renderer->render(cb, viewParams); // draw the cube/OBB - - instance.world = float32_t3x4(1.0f); - instance.packedGeo = m_renderer->getGeometries().data() + 2; // disk - m_renderer->render(cb, viewParams); - } - - cb->endDebugMarker(); - cb->endRenderPass(); - } - - { - cb->beginDebugMarker("SolidAngleVisualizer IMGUI Frame"); - { - auto scRes = static_cast(m_surface->getSwapchainResources()); - const IGPUCommandBuffer::SClearColorValue clearValue = {.float32 = {0.f, 0.f, 0.f, 1.f}}; - const IGPUCommandBuffer::SRenderpassBeginInfo renderpassInfo = - { - .framebuffer = scRes->getFramebuffer(device_base_t::getCurrentAcquire().imageIndex), - .colorClearValues = &clearValue, - .depthStencilClearValues = nullptr, - .renderArea = { - .offset = {0, 0}, - .extent = {m_window->getWidth(), m_window->getHeight()}}}; - beginRenderpass(cb, renderpassInfo); - } - // draw ImGUI 
- { - auto *imgui = interface.imGUI.get(); - auto *pipeline = imgui->getPipeline(); - cb->bindGraphicsPipeline(pipeline); - // note that we use default UI pipeline layout where uiParams.resources.textures.setIx == uiParams.resources.samplers.setIx - const auto *ds = interface.subAllocDS->getDescriptorSet(); - cb->bindDescriptorSets(EPBP_GRAPHICS, pipeline->getLayout(), imgui->getCreationParameters().resources.texturesInfo.setIx, 1u, &ds); - // a timepoint in the future to release streaming resources for geometry - const ISemaphore::SWaitInfo drawFinished = {.semaphore = m_semaphore.get(), .value = m_realFrameIx + 1u}; - if (!imgui->render(cb, drawFinished)) - { - m_logger->log("TODO: need to present acquired image before bailing because its already acquired.", ILogger::ELL_ERROR); - return {}; - } - } - cb->endRenderPass(); - cb->endDebugMarker(); - } - cb->end(); - - IQueue::SSubmitInfo::SSemaphoreInfo retval = - { - .semaphore = m_semaphore.get(), - .value = ++m_realFrameIx, - .stageMask = PIPELINE_STAGE_FLAGS::ALL_GRAPHICS_BITS}; - const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[] = - { - {.cmdbuf = cb}}; - const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = { - {.semaphore = device_base_t::getCurrentAcquire().semaphore, - .value = device_base_t::getCurrentAcquire().acquireCount, - .stageMask = PIPELINE_STAGE_FLAGS::NONE}}; - const IQueue::SSubmitInfo infos[] = - { - {.waitSemaphores = acquired, - .commandBuffers = commandBuffers, - .signalSemaphores = {&retval, 1}}}; - - if (getGraphicsQueue()->submit(infos) != IQueue::RESULT::SUCCESS) - { - retval.semaphore = nullptr; // so that we don't wait on semaphore that will never signal - m_realFrameIx--; - } - - m_window->setCaption("[Nabla Engine] UI App Test Demo"); - return retval; - } - -protected: - const video::IGPURenderpass::SCreationParams::SSubpassDependency *getDefaultSubpassDependencies() const override - { - // Subsequent submits don't wait for each other, but they wait for acquire and get 
waited on by present - const static IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] = { - // don't want any writes to be available, we'll clear, only thing to worry about is the layout transition - { - .srcSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, - .dstSubpass = 0, - .memoryBarrier = { - .srcStageMask = PIPELINE_STAGE_FLAGS::NONE, // should sync against the semaphore wait anyway - .srcAccessMask = ACCESS_FLAGS::NONE, - // layout transition needs to finish before the color write - .dstStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, - .dstAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT} - // leave view offsets and flags default - }, - // want layout transition to begin after all color output is done - { - .srcSubpass = 0, .dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, .memoryBarrier = { - // last place where the color can get modified, depth is implicitly earlier - .srcStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, - // only write ops, reads can't be made available - .srcAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT - // spec says nothing is needed when presentation is the destination - } - // leave view offsets and flags default - }, - IGPURenderpass::SCreationParams::DependenciesEnd}; - return dependencies; - } - -private: - inline void update(const std::chrono::microseconds nextPresentationTimestamp) - { - auto &camera = interface.camera; - camera.setMoveSpeed(interface.moveSpeed); - camera.setRotateSpeed(interface.rotateSpeed); - - m_inputSystem->getDefaultMouse(&mouse); - m_inputSystem->getDefaultKeyboard(&keyboard); - - struct - { - std::vector mouse{}; - std::vector keyboard{}; - } uiEvents; - - // TODO: should be a member really - static std::chrono::microseconds previousEventTimestamp{}; - - // I think begin/end should always be called on camera, just events shouldn't be fed, why? 
- // If you stop begin/end, whatever keys were up/down get their up/down values frozen leading to - // `perActionDt` becoming obnoxiously large the first time the even processing resumes due to - // `timeDiff` being computed since `lastVirtualUpTimeStamp` - camera.beginInputProcessing(nextPresentationTimestamp); - { - mouse.consumeEvents([&](const IMouseEventChannel::range_t &events) -> void - { + using device_base_t = MonoWindowApplication; + using asset_base_t = BuiltinResourcesApplication; + + public: + inline SolidAngleVisualizer(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) + : IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD), + device_base_t({2048, 1024}, EF_UNKNOWN, _localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) + { + } + + virtual SPhysicalDeviceFeatures getPreferredDeviceFeatures() const override + { + auto retval = device_base_t::getPreferredDeviceFeatures(); + retval.pipelineExecutableInfo = true; + return retval; + } + + inline bool onAppInitialized(smart_refctd_ptr&& system) override + { + if (!asset_base_t::onAppInitialized(smart_refctd_ptr(system))) + return false; + if (!device_base_t::onAppInitialized(smart_refctd_ptr(system))) + return false; + + interface.m_visualizer = this; + + m_semaphore = m_device->createSemaphore(m_realFrameIx); + if (!m_semaphore) + return logFail("Failed to Create a Semaphore!"); + + auto pool = m_device->createCommandPool(getGraphicsQueue()->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + for (auto i = 0u; i < MaxFramesInFlight; i++) + { + if (!pool) + return logFail("Couldn't create Command Pool!"); + if (!pool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, {m_cmdBufs.data() + i, 1})) + return logFail("Couldn't create Command Buffer!"); + } + + const uint32_t addtionalBufferOwnershipFamilies[] = {getGraphicsQueue()->getFamilyIndex()}; + m_scene = 
CGeometryCreatorScene::create( + {.transferQueue = getTransferUpQueue(), + .utilities = m_utils.get(), + .logger = m_logger.get(), + .addtionalBufferOwnershipFamilies = addtionalBufferOwnershipFamilies}, + CSimpleDebugRenderer::DefaultPolygonGeometryPatch); + + // for the scene drawing pass + { + IGPURenderpass::SCreationParams params = {}; + const IGPURenderpass::SCreationParams::SDepthStencilAttachmentDescription depthAttachments[] = { + {{{.format = sceneRenderDepthFormat, + .samples = IGPUImage::ESCF_1_BIT, + .mayAlias = false}, + /*.loadOp =*/ {IGPURenderpass::LOAD_OP::CLEAR}, + /*.storeOp =*/ {IGPURenderpass::STORE_OP::STORE}, + /*.initialLayout =*/ {IGPUImage::LAYOUT::UNDEFINED}, + /*.finalLayout =*/ {IGPUImage::LAYOUT::ATTACHMENT_OPTIMAL}}}, + IGPURenderpass::SCreationParams::DepthStencilAttachmentsEnd}; + params.depthStencilAttachments = depthAttachments; + const IGPURenderpass::SCreationParams::SColorAttachmentDescription colorAttachments[] = { + {{ + {.format = finalSceneRenderFormat, + .samples = IGPUImage::E_SAMPLE_COUNT_FLAGS::ESCF_1_BIT, + .mayAlias = false}, + /*.loadOp =*/IGPURenderpass::LOAD_OP::CLEAR, + /*.storeOp =*/IGPURenderpass::STORE_OP::STORE, + /*.initialLayout =*/IGPUImage::LAYOUT::UNDEFINED, + /*.finalLayout =*/IGPUImage::LAYOUT::READ_ONLY_OPTIMAL // ImGUI shall read + }}, + IGPURenderpass::SCreationParams::ColorAttachmentsEnd}; + params.colorAttachments = colorAttachments; + IGPURenderpass::SCreationParams::SSubpassDescription subpasses[] = { + {}, + IGPURenderpass::SCreationParams::SubpassesEnd}; + subpasses[0].depthStencilAttachment = {{.render = {.attachmentIndex = 0, .layout = IGPUImage::LAYOUT::ATTACHMENT_OPTIMAL}}}; + subpasses[0].colorAttachments[0] = {.render = {.attachmentIndex = 0, .layout = IGPUImage::LAYOUT::ATTACHMENT_OPTIMAL}}; + params.subpasses = subpasses; + + const static IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] = { + // wipe-transition of Color to ATTACHMENT_OPTIMAL and depth + { + 
.srcSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, + .dstSubpass = 0, + .memoryBarrier = { + // last place where the depth can get modified in previous frame, `COLOR_ATTACHMENT_OUTPUT_BIT` is implicitly later + // while color is sampled by ImGUI + .srcStageMask = PIPELINE_STAGE_FLAGS::LATE_FRAGMENT_TESTS_BIT | PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT, + // don't want any writes to be available, as we are clearing both attachments + .srcAccessMask = ACCESS_FLAGS::NONE, + // destination needs to wait as early as possible + // TODO: `COLOR_ATTACHMENT_OUTPUT_BIT` shouldn't be needed, because its a logically later stage, see TODO in `ECommonEnums.h` + .dstStageMask = PIPELINE_STAGE_FLAGS::EARLY_FRAGMENT_TESTS_BIT | PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, + // because depth and color get cleared first no read mask + .dstAccessMask = ACCESS_FLAGS::DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT} + // leave view offsets and flags default + }, + { + .srcSubpass = 0, .dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, .memoryBarrier = {// last place where the color can get modified, depth is implicitly earlier + .srcStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, + // only write ops, reads can't be made available, also won't be using depth so don't care about it being visible to anyone else + .srcAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT, + // the ImGUI will sample the color, then next frame we overwrite both attachments + .dstStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT | PIPELINE_STAGE_FLAGS::EARLY_FRAGMENT_TESTS_BIT, + // but we only care about the availability-visibility chain between renderpass and imgui + .dstAccessMask = ACCESS_FLAGS::SAMPLED_READ_BIT} + // leave view offsets and flags default + }, + IGPURenderpass::SCreationParams::DependenciesEnd}; + params.dependencies = dependencies; + auto solidAngleRenderpassParams = params; + 
m_mainRenderpass = m_device->createRenderpass(std::move(params)); + if (!m_mainRenderpass) + return logFail("Failed to create Main Renderpass!"); + + m_solidAngleRenderpass = m_device->createRenderpass(std::move(solidAngleRenderpassParams)); + if (!m_solidAngleRenderpass) + return logFail("Failed to create Solid Angle Renderpass!"); + } + + const auto& geometries = m_scene->getInitParams().geometries; + m_renderer = CSimpleDebugRenderer::create(m_assetMgr.get(), m_solidAngleRenderpass.get(), 0, {&geometries.front().get(), geometries.size()}); + // special case + { + const auto& pipelines = m_renderer->getInitParams().pipelines; + auto ix = 0u; + for (const auto& name : m_scene->getInitParams().geometryNames) + { + if (name == "Cone") + m_renderer->getGeometry(ix).pipeline = pipelines[CSimpleDebugRenderer::SInitParams::PipelineType::Cone]; + ix++; + } + } + // we'll only display one thing at a time + m_renderer->m_instances.resize(1); + + // Create graphics pipeline + { + auto loadPrecompiledShader = [&](auto key) -> smart_refctd_ptr + { + IAssetLoader::SAssetLoadParams lp = {}; + lp.logger = m_logger.get(); + lp.workingDirectory = "app_resources"; + auto assetBundle = m_assetMgr->getAsset(key.data(), lp); + const auto assets = assetBundle.getContents(); + if (assets.empty()) + { + m_logger->log("Could not load precompiled shader!", ILogger::ELL_ERROR); + std::exit(-1); + } + assert(assets.size() == 1); + auto shader = IAsset::castDown(assets[0]); + if (!shader) + { + m_logger->log("Failed to load precompiled shader!", ILogger::ELL_ERROR); + std::exit(-1); + } + return shader; + }; + + ext::FullScreenTriangle::ProtoPipeline fsTriProtoPPln(m_assetMgr.get(), m_device.get(), m_logger.get()); + if (!fsTriProtoPPln) + return logFail("Failed to create Full Screen Triangle protopipeline or load its vertex shader!"); + + // Load pre-compiled fragment shaders (6 modes x 2 debug = 12 SolidAngleVis + 2 RayVis) + // Can't use string literal template args in a loop, so unroll 
manually + // Index: mode * 2 + debugFlag (0=release, 1=debug) + smart_refctd_ptr saVisShaders[SAMPLING_MODE::Count * DebugPermutations]; + saVisShaders[0] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_tri_sa">(m_device.get())); + saVisShaders[1] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_tri_sa_dbg">(m_device.get())); + saVisShaders[2] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_tri_psa">(m_device.get())); + saVisShaders[3] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_tri_psa_dbg">(m_device.get())); + saVisShaders[4] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_para">(m_device.get())); + saVisShaders[5] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_para_dbg">(m_device.get())); + saVisShaders[6] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_rectangle">(m_device.get())); + saVisShaders[7] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_rectangle_dbg">(m_device.get())); + saVisShaders[8] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_biquad">(m_device.get())); + saVisShaders[9] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_biquad_dbg">(m_device.get())); + saVisShaders[10] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_bilinear">(m_device.get())); + saVisShaders[11] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_bilinear_dbg">(m_device.get())); + saVisShaders[12] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_proj_rectangle">(m_device.get())); + saVisShaders[13] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_proj_rectangle_dbg">(m_device.get())); + saVisShaders[14] = 
loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_silhouette">(m_device.get())); + saVisShaders[15] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_silhouette_dbg">(m_device.get())); + saVisShaders[16] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_pyramid">(m_device.get())); + saVisShaders[17] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_pyramid_dbg">(m_device.get())); + + smart_refctd_ptr rayVisShaders[DebugPermutations]; + rayVisShaders[0] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"ray_vis">(m_device.get())); + rayVisShaders[1] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"ray_vis_dbg">(m_device.get())); + + smart_refctd_ptr solidAngleVisLayout, rayVisLayout; + nbl::video::IGPUDescriptorSetLayout::SBinding bindings[1] = + { + {.binding = 0, + .type = nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, + .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = ShaderStage::ESS_FRAGMENT, + .count = 1}}; + smart_refctd_ptr dsLayout = m_device->createDescriptorSetLayout(bindings); + + const asset::SPushConstantRange saRanges[] = {{.stageFlags = hlsl::ShaderStage::ESS_FRAGMENT, .offset = 0, .size = sizeof(PushConstants)}}; + const asset::SPushConstantRange rayRanges[] = {{.stageFlags = hlsl::ShaderStage::ESS_FRAGMENT, .offset = 0, .size = sizeof(PushConstantRayVis)}}; + + if (!dsLayout) + logFail("Failed to create a Descriptor Layout!\n"); + + solidAngleVisLayout = m_device->createPipelineLayout(saRanges, dsLayout); + + rayVisLayout = m_device->createPipelineLayout(rayRanges, dsLayout); + + { + // Create all SolidAngleVis pipeline variants + for (uint32_t i = 0; i < SAMPLING_MODE::Count * DebugPermutations; i++) + { + const IGPUPipelineBase::SShaderSpecInfo fragSpec = { + .shader = saVisShaders[i].get(), + .entryPoint = "main"}; + 
m_solidAngleVisPipelines[i] = fsTriProtoPPln.createPipeline(fragSpec, solidAngleVisLayout.get(), m_solidAngleRenderpass.get()); + if (!m_solidAngleVisPipelines[i]) + return logFail("Could not create SolidAngleVis Graphics Pipeline variant %d!", i); + } + + asset::SRasterizationParams rasterParams = ext::FullScreenTriangle::ProtoPipeline::DefaultRasterParams; + rasterParams.depthWriteEnable = true; + rasterParams.depthCompareOp = asset::E_COMPARE_OP::ECO_GREATER; + + // Create all RayVis pipeline variants + for (uint32_t i = 0; i < DebugPermutations; i++) + { + const IGPUPipelineBase::SShaderSpecInfo fragSpec = { + .shader = rayVisShaders[i].get(), + .entryPoint = "main"}; + m_rayVisPipelines[i] = fsTriProtoPPln.createPipeline(fragSpec, rayVisLayout.get(), m_mainRenderpass.get(), 0, {}, rasterParams); + if (!m_rayVisPipelines[i]) + return logFail("Could not create RayVis Graphics Pipeline variant %d!", i); + } + } + // Allocate the memory + { + constexpr size_t BufferSize = sizeof(ResultData); + + nbl::video::IGPUBuffer::SCreationParams params = {}; + params.size = BufferSize; + params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT; + m_outputStorageBuffer = m_device->createBuffer(std::move(params)); + if (!m_outputStorageBuffer) + logFail("Failed to create a GPU Buffer of size %d!\n", params.size); + + m_outputStorageBuffer->setObjectDebugName("ResultData output buffer"); + + nbl::video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = m_outputStorageBuffer->getMemoryReqs(); + reqs.memoryTypeBits &= m_physicalDevice->getHostVisibleMemoryTypeBits(); + + m_allocation = m_device->allocate(reqs, m_outputStorageBuffer.get(), nbl::video::IDeviceMemoryAllocation::EMAF_NONE); + if (!m_allocation.isValid()) + logFail("Failed to allocate Device Memory compatible with our GPU Buffer!\n"); + + assert(m_outputStorageBuffer->getBoundMemory().memory == m_allocation.memory.get()); + smart_refctd_ptr pool = 
m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE, {&dsLayout.get(), 1}); + + m_ds = pool->createDescriptorSet(std::move(dsLayout)); + { + IGPUDescriptorSet::SDescriptorInfo info[1]; + info[0].desc = smart_refctd_ptr(m_outputStorageBuffer); + info[0].info.buffer = {.offset = 0, .size = BufferSize}; + IGPUDescriptorSet::SWriteDescriptorSet writes[1] = { + {.dstSet = m_ds.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = info}}; + m_device->updateDescriptorSets(writes, {}); + } + } + + if (!m_allocation.memory->map({0ull, m_allocation.memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ)) + logFail("Failed to map the Device Memory!\n"); + + // if the mapping is not coherent the range needs to be invalidated to pull in new data for the CPU's caches + const ILogicalDevice::MappedMemoryRange memoryRange(m_allocation.memory.get(), 0ull, m_allocation.memory->getAllocationSize()); + if (!m_allocation.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + m_device->invalidateMappedMemoryRanges(1, &memoryRange); + } + + // Create ImGUI + { + auto scRes = static_cast(m_surface->getSwapchainResources()); + ext::imgui::UI::SCreationParameters params = {}; + params.resources.texturesInfo = {.setIx = 0u, .bindingIx = TexturesImGUIBindingIndex}; + params.resources.samplersInfo = {.setIx = 0u, .bindingIx = 1u}; + params.utilities = m_utils; + params.transfer = getTransferUpQueue(); + params.pipelineLayout = ext::imgui::UI::createDefaultPipelineLayout(m_utils->getLogicalDevice(), params.resources.texturesInfo, params.resources.samplersInfo, MaxImGUITextures); + params.assetManager = make_smart_refctd_ptr(smart_refctd_ptr(m_system)); + params.renderpass = smart_refctd_ptr(scRes->getRenderpass()); + params.subpassIx = 0u; + params.pipelineCache = nullptr; + interface.imGUI = ext::imgui::UI::create(std::move(params)); + if (!interface.imGUI) + return logFail("Failed to create `nbl::ext::imgui::UI` class"); 
+ } + + // create rest of User Interface + { + auto* imgui = interface.imGUI.get(); + // create the suballocated descriptor set + { + // note that we use default layout provided by our extension, but you are free to create your own by filling ext::imgui::UI::S_CREATION_PARAMETERS::resources + const auto* layout = interface.imGUI->getPipeline()->getLayout()->getDescriptorSetLayout(0u); + auto pool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT, {&layout, 1}); + auto ds = pool->createDescriptorSet(smart_refctd_ptr(layout)); + interface.subAllocDS = make_smart_refctd_ptr(std::move(ds)); + if (!interface.subAllocDS) + return logFail("Failed to create the descriptor set"); + // make sure Texture Atlas slot is taken for eternity + { + auto dummy = SubAllocatedDescriptorSet::invalid_value; + interface.subAllocDS->multi_allocate(0, 1, &dummy); + assert(dummy == ext::imgui::UI::FontAtlasTexId); + } + // write constant descriptors, note we don't create info & write pair for the samplers because UI extension's are immutable and baked into DS layout + IGPUDescriptorSet::SDescriptorInfo info = {}; + info.desc = smart_refctd_ptr(interface.imGUI->getFontAtlasView()); + info.info.image.imageLayout = IImage::LAYOUT::READ_ONLY_OPTIMAL; + const IGPUDescriptorSet::SWriteDescriptorSet write = { + .dstSet = interface.subAllocDS->getDescriptorSet(), + .binding = TexturesImGUIBindingIndex, + .arrayElement = ext::imgui::UI::FontAtlasTexId, + .count = 1, + .info = &info}; + if (!m_device->updateDescriptorSets({&write, 1}, {})) + return logFail("Failed to write the descriptor set"); + } + imgui->registerListener([this]() + { interface(); }); + } + + interface.camera.mapKeysToWASD(); + + onAppInitializedFinish(); + return true; + } + + // + virtual inline bool onAppTerminated() + { + SubAllocatedDescriptorSet::value_type fontAtlasDescIx = ext::imgui::UI::FontAtlasTexId; + IGPUDescriptorSet::SDropDescriptorSet dummy[1]; + 
interface.subAllocDS->multi_deallocate(dummy, TexturesImGUIBindingIndex, 1, &fontAtlasDescIx); + return device_base_t::onAppTerminated(); + } + + inline IQueue::SSubmitInfo::SSemaphoreInfo renderFrame(const std::chrono::microseconds nextPresentationTimestamp) override + { + // CPU events + update(nextPresentationTimestamp); + + { + const auto& virtualSolidAngleWindowRes = interface.solidAngleViewTransformReturnInfo.sceneResolution; + const auto& virtualMainWindowRes = interface.mainViewTransformReturnInfo.sceneResolution; + if (!m_solidAngleViewFramebuffer || m_solidAngleViewFramebuffer->getCreationParameters().width != virtualSolidAngleWindowRes[0] || m_solidAngleViewFramebuffer->getCreationParameters().height != virtualSolidAngleWindowRes[1] || + !m_mainViewFramebuffer || m_mainViewFramebuffer->getCreationParameters().width != virtualMainWindowRes[0] || m_mainViewFramebuffer->getCreationParameters().height != virtualMainWindowRes[1]) + recreateFramebuffers(); + } + + // + const auto resourceIx = m_realFrameIx % MaxFramesInFlight; + + auto* const cb = m_cmdBufs.data()[resourceIx].get(); + cb->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); + cb->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + + if (m_solidAngleViewFramebuffer) + { + asset::SBufferRange range { + .offset = 0, + .size = m_outputStorageBuffer->getSize(), + .buffer = m_outputStorageBuffer}; + cb->fillBuffer(range, 0u); + { + const auto& creationParams = m_solidAngleViewFramebuffer->getCreationParameters(); + cb->beginDebugMarker("Draw Circle View Frame"); + { + const IGPUCommandBuffer::SClearDepthStencilValue farValue = {.depth = 0.f}; + const IGPUCommandBuffer::SClearColorValue clearValue = {.float32 = {0.f, 0.f, 0.f, 1.f}}; + const IGPUCommandBuffer::SRenderpassBeginInfo renderpassInfo = + { + .framebuffer = m_solidAngleViewFramebuffer.get(), + .colorClearValues = &clearValue, + .depthStencilClearValues = &farValue, + .renderArea = { + .offset = {0, 0}, + .extent = 
{creationParams.width, creationParams.height}}}; + beginRenderpass(cb, renderpassInfo); + } + // draw scene + { + static uint32_t lastFrameSeed = 0u; + lastFrameSeed = m_frameSeeding ? static_cast(m_realFrameIx) : lastFrameSeed; + PushConstants pc { + .modelMatrix = hlsl::float32_t3x4(hlsl::transpose(interface.m_OBBModelMatrix)), + .viewport = {0.f, 0.f, static_cast(creationParams.width), static_cast(creationParams.height)}, + .sampleCount = static_cast(m_SampleCount), + .frameIndex = lastFrameSeed}; + const uint32_t debugIdx = m_debugVisualization ? 1u : 0u; + auto pipeline = m_solidAngleVisPipelines[m_samplingMode * DebugPermutations + debugIdx]; + cb->bindGraphicsPipeline(pipeline.get()); + cb->pushConstants(pipeline->getLayout(), hlsl::ShaderStage::ESS_FRAGMENT, 0, sizeof(pc), &pc); + cb->bindDescriptorSets(nbl::asset::EPBP_GRAPHICS, pipeline->getLayout(), 0, 1, &m_ds.get()); + ext::FullScreenTriangle::recordDrawCall(cb); + } + cb->endRenderPass(); + cb->endDebugMarker(); + } + + if (m_debugVisualization) + { + m_device->waitIdle(); + std::memcpy(&m_GPUOutResulData, static_cast(m_allocation.memory->getMappedPointer()), sizeof(ResultData)); + m_device->waitIdle(); + } + } + // draw main view + if (m_mainViewFramebuffer) + { + { + auto creationParams = m_mainViewFramebuffer->getCreationParameters(); + const IGPUCommandBuffer::SClearDepthStencilValue farValue = {.depth = 0.f}; + const IGPUCommandBuffer::SClearColorValue clearValue = {.float32 = {0.1f, 0.1f, 0.1f, 1.f}}; + const IGPUCommandBuffer::SRenderpassBeginInfo renderpassInfo = + { + .framebuffer = m_mainViewFramebuffer.get(), + .colorClearValues = &clearValue, + .depthStencilClearValues = &farValue, + .renderArea = { + .offset = {0, 0}, + .extent = {creationParams.width, creationParams.height}}}; + beginRenderpass(cb, renderpassInfo); + } + { // draw rays visualization + auto creationParams = m_mainViewFramebuffer->getCreationParameters(); + + cb->beginDebugMarker("Draw Rays visualization"); + // draw scene 
+ { + float32_t4x4 viewProj = *reinterpret_cast(&interface.camera.getConcatenatedMatrix()); + float32_t3x4 view = *reinterpret_cast(&interface.camera.getViewMatrix()); + PushConstantRayVis pc { + .viewProjMatrix = viewProj, + .viewMatrix = view, + .modelMatrix = hlsl::float32_t3x4(hlsl::transpose(interface.m_OBBModelMatrix)), + .invModelMatrix = hlsl::float32_t3x4(hlsl::transpose(hlsl::inverse(interface.m_OBBModelMatrix))), + .viewport = {0.f, 0.f, static_cast(creationParams.width), static_cast(creationParams.height)}, + .frameIndex = m_frameSeeding ? static_cast(m_realFrameIx) : 0u}; + auto pipeline = m_rayVisPipelines[m_debugVisualization ? 1u : 0u]; + cb->bindGraphicsPipeline(pipeline.get()); + cb->pushConstants(pipeline->getLayout(), hlsl::ShaderStage::ESS_FRAGMENT, 0, sizeof(pc), &pc); + cb->bindDescriptorSets(nbl::asset::EPBP_GRAPHICS, pipeline->getLayout(), 0, 1, &m_ds.get()); + ext::FullScreenTriangle::recordDrawCall(cb); + } + cb->endDebugMarker(); + } + // draw scene + { + cb->beginDebugMarker("Main Scene Frame"); + + float32_t3x4 viewMatrix; + float32_t4x4 viewProjMatrix; + // TODO: get rid of legacy matrices + { + const auto& camera = interface.camera; + memcpy(&viewMatrix, &camera.getViewMatrix(), sizeof(viewMatrix)); + memcpy(&viewProjMatrix, &camera.getConcatenatedMatrix(), sizeof(viewProjMatrix)); + } + const auto viewParams = CSimpleDebugRenderer::SViewParams(viewMatrix, viewProjMatrix); + + // tear down scene every frame + auto& instance = m_renderer->m_instances[0]; + instance.world = float32_t3x4(hlsl::transpose(interface.m_OBBModelMatrix)); + instance.packedGeo = m_renderer->getGeometries().data(); // cube // +interface.gcIndex; + m_renderer->render(cb, viewParams); // draw the cube/OBB + + instance.world = float32_t3x4(1.0f); + instance.packedGeo = m_renderer->getGeometries().data() + 2; // disk + m_renderer->render(cb, viewParams); + } + + cb->endDebugMarker(); + cb->endRenderPass(); + } + + { + cb->beginDebugMarker("SolidAngleVisualizer 
IMGUI Frame"); + { + auto scRes = static_cast(m_surface->getSwapchainResources()); + const IGPUCommandBuffer::SClearColorValue clearValue = {.float32 = {0.f, 0.f, 0.f, 1.f}}; + const IGPUCommandBuffer::SRenderpassBeginInfo renderpassInfo = + { + .framebuffer = scRes->getFramebuffer(device_base_t::getCurrentAcquire().imageIndex), + .colorClearValues = &clearValue, + .depthStencilClearValues = nullptr, + .renderArea = { + .offset = {0, 0}, + .extent = {m_window->getWidth(), m_window->getHeight()}}}; + beginRenderpass(cb, renderpassInfo); + } + // draw ImGUI + { + auto* imgui = interface.imGUI.get(); + auto* pipeline = imgui->getPipeline(); + cb->bindGraphicsPipeline(pipeline); + // note that we use default UI pipeline layout where uiParams.resources.textures.setIx == uiParams.resources.samplers.setIx + const auto* ds = interface.subAllocDS->getDescriptorSet(); + cb->bindDescriptorSets(EPBP_GRAPHICS, pipeline->getLayout(), imgui->getCreationParameters().resources.texturesInfo.setIx, 1u, &ds); + // a timepoint in the future to release streaming resources for geometry + const ISemaphore::SWaitInfo drawFinished = {.semaphore = m_semaphore.get(), .value = m_realFrameIx + 1u}; + if (!imgui->render(cb, drawFinished)) + { + m_logger->log("TODO: need to present acquired image before bailing because its already acquired.", ILogger::ELL_ERROR); + return {}; + } + } + cb->endRenderPass(); + cb->endDebugMarker(); + } + cb->end(); + + IQueue::SSubmitInfo::SSemaphoreInfo retval = + { + .semaphore = m_semaphore.get(), + .value = ++m_realFrameIx, + .stageMask = PIPELINE_STAGE_FLAGS::ALL_GRAPHICS_BITS}; + const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[] = + { + {.cmdbuf = cb}}; + const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = { + {.semaphore = device_base_t::getCurrentAcquire().semaphore, + .value = device_base_t::getCurrentAcquire().acquireCount, + .stageMask = PIPELINE_STAGE_FLAGS::NONE}}; + const IQueue::SSubmitInfo infos[] = + { + {.waitSemaphores = acquired, 
+ .commandBuffers = commandBuffers, + .signalSemaphores = {&retval, 1}}}; + + if (getGraphicsQueue()->submit(infos) != IQueue::RESULT::SUCCESS) + { + retval.semaphore = nullptr; // so that we don't wait on semaphore that will never signal + m_realFrameIx--; + } + + m_window->setCaption("[Nabla Engine] UI App Test Demo"); + return retval; + } + + protected: + const video::IGPURenderpass::SCreationParams::SSubpassDependency* getDefaultSubpassDependencies() const override + { + // Subsequent submits don't wait for each other, but they wait for acquire and get waited on by present + const static IGPURenderpass::SCreationParams::SSubpassDependency dependencies[] = { + // don't want any writes to be available, we'll clear, only thing to worry about is the layout transition + { + .srcSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, + .dstSubpass = 0, + .memoryBarrier = { + .srcStageMask = PIPELINE_STAGE_FLAGS::NONE, // should sync against the semaphore wait anyway + .srcAccessMask = ACCESS_FLAGS::NONE, + // layout transition needs to finish before the color write + .dstStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, + .dstAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT} + // leave view offsets and flags default + }, + // want layout transition to begin after all color output is done + { + .srcSubpass = 0, .dstSubpass = IGPURenderpass::SCreationParams::SSubpassDependency::External, .memoryBarrier = { + // last place where the color can get modified, depth is implicitly earlier + .srcStageMask = PIPELINE_STAGE_FLAGS::COLOR_ATTACHMENT_OUTPUT_BIT, + // only write ops, reads can't be made available + .srcAccessMask = ACCESS_FLAGS::COLOR_ATTACHMENT_WRITE_BIT + // spec says nothing is needed when presentation is the destination + } + // leave view offsets and flags default + }, + IGPURenderpass::SCreationParams::DependenciesEnd}; + return dependencies; + } + + private: + inline void update(const std::chrono::microseconds 
nextPresentationTimestamp) + { + auto& camera = interface.camera; + camera.setMoveSpeed(interface.moveSpeed); + camera.setRotateSpeed(interface.rotateSpeed); + + m_inputSystem->getDefaultMouse(&mouse); + m_inputSystem->getDefaultKeyboard(&keyboard); + + struct + { + std::vector mouse {}; + std::vector keyboard {}; + } uiEvents; + + // TODO: should be a member really + static std::chrono::microseconds previousEventTimestamp {}; + + // I think begin/end should always be called on camera, just events shouldn't be fed, why? + // If you stop begin/end, whatever keys were up/down get their up/down values frozen leading to + // `perActionDt` becoming obnoxiously large the first time the even processing resumes due to + // `timeDiff` being computed since `lastVirtualUpTimeStamp` + camera.beginInputProcessing(nextPresentationTimestamp); + { + mouse.consumeEvents([&](const IMouseEventChannel::range_t& events) -> void + { if (interface.move) camera.mouseProcess(events); // don't capture the events, only let camera handle them with its impl else @@ -635,9 +647,9 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR // interface.gcIndex = core::clamp(interface.gcIndex, 0ull, m_renderer->getGeometries().size() - 1); //} } }, - m_logger.get()); - keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t &events) -> void - { + m_logger.get()); + keyboard.consumeEvents([&](const IKeyboardEventChannel::range_t& events) -> void + { if (interface.move) camera.keyboardProcess(events); // don't capture the events, only let camera handle them with its impl @@ -649,294 +661,299 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR previousEventTimestamp = e.timeStamp; uiEvents.keyboard.emplace_back(e); } }, - m_logger.get()); - } - camera.endInputProcessing(nextPresentationTimestamp); - - const auto cursorPosition = m_window->getCursorControl()->getPosition(); - - ext::imgui::UI::SUpdateParameters params = - { - 
.mousePosition = float32_t2(cursorPosition.x, cursorPosition.y) - float32_t2(m_window->getX(), m_window->getY()), - .displaySize = {m_window->getWidth(), m_window->getHeight()}, - .mouseEvents = uiEvents.mouse, - .keyboardEvents = uiEvents.keyboard}; - - // interface.objectName = m_scene->getInitParams().geometryNames[interface.gcIndex]; - interface.imGUI->update(params); - } - - void recreateFramebuffers() - { - - auto createImageAndView = [&](const uint16_t2 resolution, E_FORMAT format) -> smart_refctd_ptr - { - auto image = m_device->createImage({{.type = IGPUImage::ET_2D, - .samples = IGPUImage::ESCF_1_BIT, - .format = format, - .extent = {resolution.x, resolution.y, 1}, - .mipLevels = 1, - .arrayLayers = 1, - .usage = IGPUImage::EUF_RENDER_ATTACHMENT_BIT | IGPUImage::EUF_SAMPLED_BIT}}); - if (!m_device->allocate(image->getMemoryReqs(), image.get()).isValid()) - return nullptr; - IGPUImageView::SCreationParams params = { - .image = std::move(image), - .viewType = IGPUImageView::ET_2D, - .format = format}; - params.subresourceRange.aspectMask = isDepthOrStencilFormat(format) ? 
IGPUImage::EAF_DEPTH_BIT : IGPUImage::EAF_COLOR_BIT; - return m_device->createImageView(std::move(params)); - }; - - smart_refctd_ptr solidAngleView; - smart_refctd_ptr mainView; - const uint16_t2 solidAngleViewRes = interface.solidAngleViewTransformReturnInfo.sceneResolution; - const uint16_t2 mainViewRes = interface.mainViewTransformReturnInfo.sceneResolution; - - // detect window minimization - if (solidAngleViewRes.x < 0x4000 && solidAngleViewRes.y < 0x4000 || - mainViewRes.x < 0x4000 && mainViewRes.y < 0x4000) - { - solidAngleView = createImageAndView(solidAngleViewRes, finalSceneRenderFormat); - auto solidAngleDepthView = createImageAndView(solidAngleViewRes, sceneRenderDepthFormat); - m_solidAngleViewFramebuffer = m_device->createFramebuffer({{.renderpass = m_solidAngleRenderpass, - .depthStencilAttachments = &solidAngleDepthView.get(), - .colorAttachments = &solidAngleView.get(), - .width = solidAngleViewRes.x, - .height = solidAngleViewRes.y}}); - - mainView = createImageAndView(mainViewRes, finalSceneRenderFormat); - auto mainDepthView = createImageAndView(mainViewRes, sceneRenderDepthFormat); - m_mainViewFramebuffer = m_device->createFramebuffer({{.renderpass = m_mainRenderpass, - .depthStencilAttachments = &mainDepthView.get(), - .colorAttachments = &mainView.get(), - .width = mainViewRes.x, - .height = mainViewRes.y}}); - } - else - { - m_solidAngleViewFramebuffer = nullptr; - m_mainViewFramebuffer = nullptr; - } - - // release previous slot and its image - interface.subAllocDS->multi_deallocate(0, static_cast(CInterface::Count), interface.renderColorViewDescIndices, {.semaphore = m_semaphore.get(), .value = m_realFrameIx + 1}); - // - if (solidAngleView && mainView) - { - interface.subAllocDS->multi_allocate(0, static_cast(CInterface::Count), interface.renderColorViewDescIndices); - // update descriptor set - IGPUDescriptorSet::SDescriptorInfo infos[static_cast(CInterface::Count)] = {}; - infos[0].desc = mainView; - infos[0].info.image.imageLayout = 
IGPUImage::LAYOUT::READ_ONLY_OPTIMAL; - infos[1].desc = solidAngleView; - infos[1].info.image.imageLayout = IGPUImage::LAYOUT::READ_ONLY_OPTIMAL; - const IGPUDescriptorSet::SWriteDescriptorSet write[static_cast(CInterface::Count)] = { - {.dstSet = interface.subAllocDS->getDescriptorSet(), - .binding = TexturesImGUIBindingIndex, - .arrayElement = interface.renderColorViewDescIndices[static_cast(CInterface::ERV_MAIN_VIEW)], - .count = 1, - .info = &infos[static_cast(CInterface::ERV_MAIN_VIEW)]}, - {.dstSet = interface.subAllocDS->getDescriptorSet(), - .binding = TexturesImGUIBindingIndex, - .arrayElement = interface.renderColorViewDescIndices[static_cast(CInterface::ERV_SOLID_ANGLE_VIEW)], - .count = 1, - .info = &infos[static_cast(CInterface::ERV_SOLID_ANGLE_VIEW)]}}; - m_device->updateDescriptorSets({write, static_cast(CInterface::Count)}, {}); - } - interface.transformParams.sceneTexDescIx = interface.renderColorViewDescIndices[CInterface::ERV_MAIN_VIEW]; - } - - inline void beginRenderpass(IGPUCommandBuffer *cb, const IGPUCommandBuffer::SRenderpassBeginInfo &info) - { - cb->beginRenderPass(info, IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE); - cb->setScissor(0, 1, &info.renderArea); - const SViewport viewport = { - .x = 0, - .y = 0, - .width = static_cast(info.renderArea.extent.width), - .height = static_cast(info.renderArea.extent.height)}; - cb->setViewport(0u, 1u, &viewport); - } - - ~SolidAngleVisualizer() override - { - m_allocation.memory->unmap(); - } - - // Maximum frames which can be simultaneously submitted, used to cycle through our per-frame resources like command buffers - constexpr static inline uint32_t MaxFramesInFlight = 3u; - constexpr static inline auto sceneRenderDepthFormat = EF_D32_SFLOAT; - constexpr static inline auto finalSceneRenderFormat = EF_R8G8B8A8_SRGB; - constexpr static inline auto TexturesImGUIBindingIndex = 0u; - // we create the Descriptor Set with a few slots extra to spare, so we don't have to `waitIdle` the device whenever 
ImGUI virtual window resizes - constexpr static inline auto MaxImGUITextures = 2u + MaxFramesInFlight; - - static inline SAMPLING_MODE m_samplingMode = SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_RECTANGLE; - static inline bool m_debugVisualization = true; - static inline int m_SampleCount = 64; - static inline bool m_frameSeeding = true; - static inline ResultData m_GPUOutResulData; - // - smart_refctd_ptr m_scene; - smart_refctd_ptr m_solidAngleRenderpass; - smart_refctd_ptr m_mainRenderpass; - smart_refctd_ptr m_renderer; - smart_refctd_ptr m_solidAngleViewFramebuffer; - smart_refctd_ptr m_mainViewFramebuffer; - // Pipeline variants: SolidAngleVis indexed by [mode * 2 + debugFlag], RayVis by [debugFlag] - static constexpr uint32_t DebugPermutations = 2; - smart_refctd_ptr m_solidAngleVisPipelines[SAMPLING_MODE::Count * DebugPermutations]; - smart_refctd_ptr m_rayVisPipelines[DebugPermutations]; - // - nbl::video::IDeviceMemoryAllocator::SAllocation m_allocation = {}; - smart_refctd_ptr m_outputStorageBuffer; - smart_refctd_ptr m_ds = nullptr; - smart_refctd_ptr m_semaphore; - uint64_t m_realFrameIx = 0; - std::array, MaxFramesInFlight> m_cmdBufs; - // - InputSystem::ChannelReader mouse; - InputSystem::ChannelReader keyboard; - // UI stuff - struct CInterface - { - void operator()() - { - ImGuiIO &io = ImGui::GetIO(); - - // TODO: why is this a lambda and not just an assignment in a scope ? - camera.setProjectionMatrix([&]() - { - hlsl::float32_t4x4 projection; - - if (isPerspective) - if (isLH) - projection = hlsl::math::thin_lens::lhPerspectiveFovMatrix(core::radians(fov), io.DisplaySize.x / io.DisplaySize.y * 0.5f, zNear, zFar); // TODO: why do I need to divide aspect ratio by 2? 
- else - projection = hlsl::math::thin_lens::rhPerspectiveFovMatrix(core::radians(fov), io.DisplaySize.x / io.DisplaySize.y * 0.5f, zNear, zFar); - else - { - float viewHeight = viewWidth * io.DisplaySize.y / io.DisplaySize.x; - - if (isLH) - projection = hlsl::math::thin_lens::lhPerspectiveFovMatrix(viewWidth, viewHeight, zNear, zFar); - else - projection = hlsl::math::thin_lens::rhPerspectiveFovMatrix(viewWidth, viewHeight, zNear, zFar); - } - - return projection; }()); - - ImGuizmo::SetOrthographic(!isPerspective); - ImGuizmo::BeginFrame(); - - ImGui::SetNextWindowPos(ImVec2(1024, 100), ImGuiCond_Appearing); - ImGui::SetNextWindowSize(ImVec2(256, 256), ImGuiCond_Appearing); - - // create a window and insert the inspector - ImGui::SetNextWindowPos(ImVec2(10, 10), ImGuiCond_Appearing); - ImGui::SetNextWindowSize(ImVec2(320, 340), ImGuiCond_Appearing); - ImGui::Begin("Editor"); - - ImGui::Text("Benchmarking Solid Angle Visualizer"); - - if (ImGui::Button("Run Benchmark")) - { - SolidAngleVisualizer::SamplingBenchmark benchmark(*m_visualizer); - benchmark.run(); - } - ImGui::Separator(); - - ImGui::Text("Sampling Mode:"); - ImGui::SameLine(); - - const char *samplingModes[] = - { - "Triangle Solid Angle", - "Triangle Projected Solid Angle", - "Parallelogram Projected Solid Angle", - "Rectangle Pyramid Solid Angle", - "Biquadratic pyramid solid angle", - "Bilinear pyramid solid angle"}; - - int currentMode = static_cast(m_samplingMode); - - if (ImGui::Combo("##SamplingMode", ¤tMode, samplingModes, IM_ARRAYSIZE(samplingModes))) - { - m_samplingMode = static_cast(currentMode); - } - - ImGui::Checkbox("Debug Visualization", &m_debugVisualization); - ImGui::Text("Pipeline idx: SA=%d, Ray=%d", - static_cast(m_samplingMode) * DebugPermutations + (m_debugVisualization ? 1 : 0), - m_debugVisualization ? 
1 : 0); - ImGui::Checkbox("Frame seeding", &m_frameSeeding); - - ImGui::SliderInt("Sample Count", &m_SampleCount, 0, 512); - - ImGui::Separator(); - - ImGui::Text("Camera"); - - if (ImGui::RadioButton("LH", isLH)) - isLH = true; - - ImGui::SameLine(); - - if (ImGui::RadioButton("RH", !isLH)) - isLH = false; - - if (ImGui::RadioButton("Perspective", isPerspective)) - isPerspective = true; - - ImGui::SameLine(); - - if (ImGui::RadioButton("Orthographic", !isPerspective)) - isPerspective = false; - - ImGui::Checkbox("Enable \"view manipulate\"", &transformParams.enableViewManipulate); - // ImGui::Checkbox("Enable camera movement", &move); - ImGui::SliderFloat("Move speed", &moveSpeed, 0.1f, 10.f); - ImGui::SliderFloat("Rotate speed", &rotateSpeed, 0.1f, 10.f); - - // ImGui::Checkbox("Flip Gizmo's Y axis", &flipGizmoY); // let's not expose it to be changed in UI but keep the logic in case - - if (isPerspective) - ImGui::SliderFloat("Fov", &fov, 20.f, 150.f); - else - ImGui::SliderFloat("Ortho width", &viewWidth, 1, 20); - - ImGui::SliderFloat("zNear", &zNear, 0.1f, 100.f); - ImGui::SliderFloat("zFar", &zFar, 110.f, 10000.f); - - if (firstFrame) - { - camera.setPosition(cameraIntialPosition); - camera.setTarget(cameraInitialTarget); - camera.setUpVector(cameraInitialUp); - - camera.recomputeViewMatrix(); - } - firstFrame = false; - - ImGui::Text("X: %f Y: %f", io.MousePos.x, io.MousePos.y); - if (ImGuizmo::IsUsing()) - { - ImGui::Text("Using gizmo"); - } - else - { - ImGui::Text(ImGuizmo::IsOver() ? "Over gizmo" : ""); - ImGui::SameLine(); - ImGui::Text(ImGuizmo::IsOver(ImGuizmo::TRANSLATE) ? "Over translate gizmo" : ""); - ImGui::SameLine(); - ImGui::Text(ImGuizmo::IsOver(ImGuizmo::ROTATE) ? "Over rotate gizmo" : ""); - ImGui::SameLine(); - ImGui::Text(ImGuizmo::IsOver(ImGuizmo::SCALE) ? 
"Over scale gizmo" : ""); - } - ImGui::Separator(); - - /* + m_logger.get()); + } + camera.endInputProcessing(nextPresentationTimestamp); + + const auto cursorPosition = m_window->getCursorControl()->getPosition(); + + ext::imgui::UI::SUpdateParameters params = + { + .mousePosition = float32_t2(cursorPosition.x, cursorPosition.y) - float32_t2(m_window->getX(), m_window->getY()), + .displaySize = {m_window->getWidth(), m_window->getHeight()}, + .mouseEvents = uiEvents.mouse, + .keyboardEvents = uiEvents.keyboard}; + + // interface.objectName = m_scene->getInitParams().geometryNames[interface.gcIndex]; + interface.imGUI->update(params); + } + + void recreateFramebuffers() + { + auto createImageAndView = [&](const uint16_t2 resolution, E_FORMAT format) -> smart_refctd_ptr + { + auto image = m_device->createImage({{.type = IGPUImage::ET_2D, + .samples = IGPUImage::ESCF_1_BIT, + .format = format, + .extent = {resolution.x, resolution.y, 1}, + .mipLevels = 1, + .arrayLayers = 1, + .usage = IGPUImage::EUF_RENDER_ATTACHMENT_BIT | IGPUImage::EUF_SAMPLED_BIT}}); + if (!m_device->allocate(image->getMemoryReqs(), image.get()).isValid()) + return nullptr; + IGPUImageView::SCreationParams params = { + .image = std::move(image), + .viewType = IGPUImageView::ET_2D, + .format = format}; + params.subresourceRange.aspectMask = isDepthOrStencilFormat(format) ? 
IGPUImage::EAF_DEPTH_BIT : IGPUImage::EAF_COLOR_BIT; + return m_device->createImageView(std::move(params)); + }; + + smart_refctd_ptr solidAngleView; + smart_refctd_ptr mainView; + const uint16_t2 solidAngleViewRes = interface.solidAngleViewTransformReturnInfo.sceneResolution; + const uint16_t2 mainViewRes = interface.mainViewTransformReturnInfo.sceneResolution; + + // detect window minimization + if (solidAngleViewRes.x < 0x4000 && solidAngleViewRes.y < 0x4000 || + mainViewRes.x < 0x4000 && mainViewRes.y < 0x4000) + { + solidAngleView = createImageAndView(solidAngleViewRes, finalSceneRenderFormat); + auto solidAngleDepthView = createImageAndView(solidAngleViewRes, sceneRenderDepthFormat); + m_solidAngleViewFramebuffer = m_device->createFramebuffer({{.renderpass = m_solidAngleRenderpass, + .depthStencilAttachments = &solidAngleDepthView.get(), + .colorAttachments = &solidAngleView.get(), + .width = solidAngleViewRes.x, + .height = solidAngleViewRes.y}}); + + mainView = createImageAndView(mainViewRes, finalSceneRenderFormat); + auto mainDepthView = createImageAndView(mainViewRes, sceneRenderDepthFormat); + m_mainViewFramebuffer = m_device->createFramebuffer({{.renderpass = m_mainRenderpass, + .depthStencilAttachments = &mainDepthView.get(), + .colorAttachments = &mainView.get(), + .width = mainViewRes.x, + .height = mainViewRes.y}}); + } + else + { + m_solidAngleViewFramebuffer = nullptr; + m_mainViewFramebuffer = nullptr; + } + + // release previous slot and its image + interface.subAllocDS->multi_deallocate(0, static_cast(CInterface::Count), interface.renderColorViewDescIndices, {.semaphore = m_semaphore.get(), .value = m_realFrameIx + 1}); + // + if (solidAngleView && mainView) + { + interface.subAllocDS->multi_allocate(0, static_cast(CInterface::Count), interface.renderColorViewDescIndices); + // update descriptor set + IGPUDescriptorSet::SDescriptorInfo infos[static_cast(CInterface::Count)] = {}; + infos[0].desc = mainView; + infos[0].info.image.imageLayout = 
IGPUImage::LAYOUT::READ_ONLY_OPTIMAL; + infos[1].desc = solidAngleView; + infos[1].info.image.imageLayout = IGPUImage::LAYOUT::READ_ONLY_OPTIMAL; + const IGPUDescriptorSet::SWriteDescriptorSet write[static_cast(CInterface::Count)] = { + {.dstSet = interface.subAllocDS->getDescriptorSet(), + .binding = TexturesImGUIBindingIndex, + .arrayElement = interface.renderColorViewDescIndices[static_cast(CInterface::ERV_MAIN_VIEW)], + .count = 1, + .info = &infos[static_cast(CInterface::ERV_MAIN_VIEW)]}, + {.dstSet = interface.subAllocDS->getDescriptorSet(), + .binding = TexturesImGUIBindingIndex, + .arrayElement = interface.renderColorViewDescIndices[static_cast(CInterface::ERV_SOLID_ANGLE_VIEW)], + .count = 1, + .info = &infos[static_cast(CInterface::ERV_SOLID_ANGLE_VIEW)]}}; + m_device->updateDescriptorSets({write, static_cast(CInterface::Count)}, {}); + } + interface.transformParams.sceneTexDescIx = interface.renderColorViewDescIndices[CInterface::ERV_MAIN_VIEW]; + } + + inline void beginRenderpass(IGPUCommandBuffer* cb, const IGPUCommandBuffer::SRenderpassBeginInfo& info) + { + cb->beginRenderPass(info, IGPUCommandBuffer::SUBPASS_CONTENTS::INLINE); + cb->setScissor(0, 1, &info.renderArea); + const SViewport viewport = { + .x = 0, + .y = 0, + .width = static_cast(info.renderArea.extent.width), + .height = static_cast(info.renderArea.extent.height)}; + cb->setViewport(0u, 1u, &viewport); + } + + ~SolidAngleVisualizer() override + { + m_allocation.memory->unmap(); + } + + // Maximum frames which can be simultaneously submitted, used to cycle through our per-frame resources like command buffers + constexpr static inline uint32_t MaxFramesInFlight = 3u; + constexpr static inline auto sceneRenderDepthFormat = EF_D32_SFLOAT; + constexpr static inline auto finalSceneRenderFormat = EF_R8G8B8A8_SRGB; + constexpr static inline auto TexturesImGUIBindingIndex = 0u; + // we create the Descriptor Set with a few slots extra to spare, so we don't have to `waitIdle` the device whenever 
ImGUI virtual window resizes + constexpr static inline auto MaxImGUITextures = 2u + MaxFramesInFlight; + + static inline SAMPLING_MODE m_samplingMode = SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_RECTANGLE; + static inline bool m_debugVisualization = true; + static inline int m_SampleCount = 64; + static inline int m_BenchmarkSampleCount = 128; + static inline bool m_frameSeeding = true; + static inline ResultData m_GPUOutResulData; + // + smart_refctd_ptr m_scene; + smart_refctd_ptr m_solidAngleRenderpass; + smart_refctd_ptr m_mainRenderpass; + smart_refctd_ptr m_renderer; + smart_refctd_ptr m_solidAngleViewFramebuffer; + smart_refctd_ptr m_mainViewFramebuffer; + // Pipeline variants: SolidAngleVis indexed by [mode * 2 + debugFlag], RayVis by [debugFlag] + static constexpr uint32_t DebugPermutations = 2; + smart_refctd_ptr m_solidAngleVisPipelines[SAMPLING_MODE::Count * DebugPermutations]; + smart_refctd_ptr m_rayVisPipelines[DebugPermutations]; + // + nbl::video::IDeviceMemoryAllocator::SAllocation m_allocation = {}; + smart_refctd_ptr m_outputStorageBuffer; + smart_refctd_ptr m_ds = nullptr; + smart_refctd_ptr m_semaphore; + uint64_t m_realFrameIx = 0; + std::array, MaxFramesInFlight> m_cmdBufs; + // + InputSystem::ChannelReader mouse; + InputSystem::ChannelReader keyboard; + // UI stuff + struct CInterface + { + void operator()() + { + ImGuiIO& io = ImGui::GetIO(); + + // TODO: why is this a lambda and not just an assignment in a scope ? + camera.setProjectionMatrix([&]() + { + hlsl::float32_t4x4 projection; + + if (isPerspective) + if (isLH) + projection = hlsl::math::thin_lens::lhPerspectiveFovMatrix(core::radians(fov), io.DisplaySize.x / io.DisplaySize.y * 0.5f, zNear, zFar); // TODO: why do I need to divide aspect ratio by 2? 
+ else + projection = hlsl::math::thin_lens::rhPerspectiveFovMatrix(core::radians(fov), io.DisplaySize.x / io.DisplaySize.y * 0.5f, zNear, zFar); + else + { + float viewHeight = viewWidth * io.DisplaySize.y / io.DisplaySize.x; + + if (isLH) + projection = hlsl::math::thin_lens::lhPerspectiveFovMatrix(viewWidth, viewHeight, zNear, zFar); + else + projection = hlsl::math::thin_lens::rhPerspectiveFovMatrix(viewWidth, viewHeight, zNear, zFar); + } + + return projection; + }()); + + ImGuizmo::SetOrthographic(!isPerspective); + ImGuizmo::BeginFrame(); + + ImGui::SetNextWindowPos(ImVec2(1024, 100), ImGuiCond_Appearing); + ImGui::SetNextWindowSize(ImVec2(256, 256), ImGuiCond_Appearing); + + // create a window and insert the inspector + ImGui::SetNextWindowPos(ImVec2(10, 10), ImGuiCond_Appearing); + ImGui::SetNextWindowSize(ImVec2(320, 340), ImGuiCond_Appearing); + ImGui::Begin("Editor"); + + ImGui::Text("Benchmarking Solid Angle Visualizer"); + + if (ImGui::Button("Run Benchmark")) + { + SolidAngleVisualizer::SamplingBenchmark benchmark(*m_visualizer); + benchmark.run(); + } + ImGui::Separator(); + + ImGui::Text("Sampling Mode:"); + ImGui::SameLine(); + + const char* samplingModes[] = + { + "Triangle Solid Angle", + "Triangle Projected Solid Angle", + "Parallelogram Projected Solid Angle", + "Rectangle Pyramid Solid Angle", + "Biquadratic pyramid solid angle", + "Bilinear pyramid solid angle", + "Projected Rectangle Pyramid", + "Silhouette only (benchmark)", + "Pyramid only (benchmark)"}; + + int currentMode = static_cast(m_samplingMode); + + if (ImGui::Combo("##SamplingMode", ¤tMode, samplingModes, IM_ARRAYSIZE(samplingModes))) + { + m_samplingMode = static_cast(currentMode); + } + + ImGui::Checkbox("Debug Visualization", &m_debugVisualization); + ImGui::Text("Pipeline idx: SA=%d, Ray=%d", + static_cast(m_samplingMode) * DebugPermutations + (m_debugVisualization ? 1 : 0), + m_debugVisualization ? 
1 : 0); + ImGui::Checkbox("Frame seeding", &m_frameSeeding); + + ImGui::SliderInt("Sample Count", &m_SampleCount, 0, 512); + ImGui::SliderInt("Benchmark Sample Count", &m_BenchmarkSampleCount, 0, 8096); + + ImGui::Separator(); + + ImGui::Text("Camera"); + + if (ImGui::RadioButton("LH", isLH)) + isLH = true; + + ImGui::SameLine(); + + if (ImGui::RadioButton("RH", !isLH)) + isLH = false; + + if (ImGui::RadioButton("Perspective", isPerspective)) + isPerspective = true; + + ImGui::SameLine(); + + if (ImGui::RadioButton("Orthographic", !isPerspective)) + isPerspective = false; + + ImGui::Checkbox("Enable \"view manipulate\"", &transformParams.enableViewManipulate); + // ImGui::Checkbox("Enable camera movement", &move); + ImGui::SliderFloat("Move speed", &moveSpeed, 0.1f, 10.f); + ImGui::SliderFloat("Rotate speed", &rotateSpeed, 0.1f, 10.f); + + // ImGui::Checkbox("Flip Gizmo's Y axis", &flipGizmoY); // let's not expose it to be changed in UI but keep the logic in case + + if (isPerspective) + ImGui::SliderFloat("Fov", &fov, 20.f, 150.f); + else + ImGui::SliderFloat("Ortho width", &viewWidth, 1, 20); + + ImGui::SliderFloat("zNear", &zNear, 0.1f, 100.f); + ImGui::SliderFloat("zFar", &zFar, 110.f, 10000.f); + + if (firstFrame) + { + camera.setPosition(cameraIntialPosition); + camera.setTarget(cameraInitialTarget); + camera.setUpVector(cameraInitialUp); + + camera.recomputeViewMatrix(); + } + firstFrame = false; + + ImGui::Text("X: %f Y: %f", io.MousePos.x, io.MousePos.y); + if (ImGuizmo::IsUsing()) + { + ImGui::Text("Using gizmo"); + } + else + { + ImGui::Text(ImGuizmo::IsOver() ? "Over gizmo" : ""); + ImGui::SameLine(); + ImGui::Text(ImGuizmo::IsOver(ImGuizmo::TRANSLATE) ? "Over translate gizmo" : ""); + ImGui::SameLine(); + ImGui::Text(ImGuizmo::IsOver(ImGuizmo::ROTATE) ? "Over rotate gizmo" : ""); + ImGui::SameLine(); + ImGui::Text(ImGuizmo::IsOver(ImGuizmo::SCALE) ? 
"Over scale gizmo" : ""); + } + ImGui::Separator(); + + /* * ImGuizmo expects view & perspective matrix to be column major both with 4x4 layout * and Nabla uses row major matricies - 3x4 matrix for view & 4x4 for projection @@ -980,798 +997,828 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR * note it also modifies input view matrix but projection matrix is immutable */ - if (ImGui::IsKeyPressed(ImGuiKey_End)) - { - m_TRS = TRS{}; - } - - { - static struct - { - float32_t4x4 view, projection, model; - } imguizmoM16InOut; - - ImGuizmo::SetID(0u); - - // TODO: camera will return hlsl::float32_tMxN - auto view = camera.getViewMatrix(); - imguizmoM16InOut.view = hlsl::transpose(hlsl::math::linalg::promote_affine<4, 4>(view)); - - // TODO: camera will return hlsl::float32_tMxN - imguizmoM16InOut.projection = hlsl::transpose(camera.getProjectionMatrix()); - ImGuizmo::RecomposeMatrixFromComponents(&m_TRS.translation.x, &m_TRS.rotation.x, &m_TRS.scale.x, &imguizmoM16InOut.model[0][0]); - - if (flipGizmoY) // note we allow to flip gizmo just to match our coordinates - imguizmoM16InOut.projection[1][1] *= -1.f; // https://johannesugb.github.io/gpu-programming/why-do-opengl-proj-matrices-fail-in-vulkan/ - - transformParams.editTransformDecomposition = true; - mainViewTransformReturnInfo = EditTransform(&imguizmoM16InOut.view[0][0], &imguizmoM16InOut.projection[0][0], &imguizmoM16InOut.model[0][0], transformParams); - move = mainViewTransformReturnInfo.allowCameraMovement; - - ImGuizmo::DecomposeMatrixToComponents(&imguizmoM16InOut.model[0][0], &m_TRS.translation.x, &m_TRS.rotation.x, &m_TRS.scale.x); - ImGuizmo::RecomposeMatrixFromComponents(&m_TRS.translation.x, &m_TRS.rotation.x, &m_TRS.scale.x, &imguizmoM16InOut.model[0][0]); - } - // object meta display - //{ - // ImGui::Begin("Object"); - // ImGui::Text("type: \"%s\"", objectName.data()); - // ImGui::End(); - //} - - // solid angle view window - { - ImGui::SetNextWindowSize(ImVec2(800, 
800), ImGuiCond_Appearing); - ImGui::SetNextWindowPos(ImVec2(1240, 20), ImGuiCond_Appearing); - static bool isOpen = true; - ImGui::Begin("Projected Solid Angle View", &isOpen, 0); - - ImVec2 contentRegionSize = ImGui::GetContentRegionAvail(); - solidAngleViewTransformReturnInfo.sceneResolution = uint16_t2(static_cast(contentRegionSize.x), static_cast(contentRegionSize.y)); - solidAngleViewTransformReturnInfo.allowCameraMovement = false; // not used in this view - ImGui::Image({renderColorViewDescIndices[ERV_SOLID_ANGLE_VIEW]}, contentRegionSize); - ImGui::End(); - } - - // Show data coming from GPU - if (m_debugVisualization) - { - if (ImGui::Begin("Result Data")) - { - auto drawColorField = [&](const char *fieldName, uint32_t index) - { - ImGui::Text("%s: %u", fieldName, index); - - if (index >= 27) - { - ImGui::SameLine(); - ImGui::Text(""); - return; - } - - const auto &c = colorLUT[index]; // uses the combined LUT we made earlier - - ImGui::SameLine(); - - // Color preview button - ImGui::ColorButton( - fieldName, - ImVec4(c.r, c.g, c.b, 1.0f), - 0, - ImVec2(20, 20)); - - ImGui::SameLine(); - ImGui::Text("%s", colorNames[index]); - }; - - // Vertices - if (ImGui::CollapsingHeader("Vertices", ImGuiTreeNodeFlags_DefaultOpen)) - { - for (uint32_t i = 0; i < 6; ++i) - { - if (i < m_GPUOutResulData.silhouetteVertexCount) - { - ImGui::Text("corners[%u]", i); - ImGui::SameLine(); - drawColorField(":", m_GPUOutResulData.vertices[i]); - ImGui::SameLine(); - static const float32_t3 constCorners[8] = { - float32_t3(-1, -1, -1), float32_t3(1, -1, -1), float32_t3(-1, 1, -1), float32_t3(1, 1, -1), - float32_t3(-1, -1, 1), float32_t3(1, -1, 1), float32_t3(-1, 1, 1), float32_t3(1, 1, 1)}; - float32_t3 vertexLocation = constCorners[m_GPUOutResulData.vertices[i]]; - ImGui::Text(" : (%.3f, %.3f, %.3f", vertexLocation.x, vertexLocation.y, vertexLocation.z); - } - else - { - ImGui::Text("corners[%u] :: ", i); - ImGui::SameLine(); - ImGui::ColorButton( - "", - ImVec4(0.0f, 0.0f, 
0.0f, 0.0f), - 0, - ImVec2(20, 20)); - ImGui::SameLine(); - ImGui::Text(""); - } - } - } - - if (ImGui::CollapsingHeader("Color LUT Map")) - { - for (int i = 0; i < 27; i++) - drawColorField(" ", i); - } - - ImGui::Separator(); - ImGui::Text("Valid Samples: %u / %u", m_GPUOutResulData.validSampleCount / hlsl::max(m_GPUOutResulData.threadCount, 1u), m_GPUOutResulData.sampleCount); - ImGui::ProgressBar(static_cast(m_GPUOutResulData.validSampleCount / hlsl::max(m_GPUOutResulData.threadCount, 1u)) / static_cast(m_GPUOutResulData.sampleCount)); - ImGui::Separator(); - - // Silhouette - if (ImGui::CollapsingHeader("Silhouette")) - { - drawColorField("silhouetteIndex", m_GPUOutResulData.silhouetteIndex); - ImGui::Text("Region: (%u, %u, %u)", m_GPUOutResulData.region.x, m_GPUOutResulData.region.y, m_GPUOutResulData.region.z); - ImGui::Text("Silhouette Vertex Count: %u", m_GPUOutResulData.silhouetteVertexCount); - ImGui::Text("Positive Vertex Count: %u", m_GPUOutResulData.positiveVertCount); - ImGui::Text("Edge Visibility Mismatch: %s", m_GPUOutResulData.edgeVisibilityMismatch ? "true" : "false"); - ImGui::Text("Max Triangles Exceeded: %s", m_GPUOutResulData.maxTrianglesExceeded ? "true" : "false"); - for (uint32_t i = 0; i < 6; i++) - ImGui::Text("Vertex[%u]: %u", i, m_GPUOutResulData.vertices[i]); - ImGui::Text("Clipped Silhouette Vertex Count: %u", m_GPUOutResulData.clippedSilhouetteVertexCount); - for (uint32_t i = 0; i < 7; i++) - ImGui::Text("Clipped Vertex[%u]: (%.3f, %.3f, %.3f) Index: %u", i, - m_GPUOutResulData.clippedSilhouetteVertices[i].x, - m_GPUOutResulData.clippedSilhouetteVertices[i].y, - m_GPUOutResulData.clippedSilhouetteVertices[i].z, - m_GPUOutResulData.clippedSilhouetteVerticesIndices[i]); - - // Silhouette mask printed in binary - auto printBin = [](uint32_t bin, const char *name) - { - char buf[33]; - for (int i = 0; i < 32; i++) - buf[i] = (bin & (1u << (31 - i))) ? 
'1' : '0'; - buf[32] = '\0'; - ImGui::Text("%s: 0x%08X", name, bin); - ImGui::Text("binary: 0b%s", buf); - ImGui::Separator(); - }; - printBin(m_GPUOutResulData.silhouette, "Silhouette"); - printBin(m_GPUOutResulData.rotatedSil, "rotatedSilhouette"); - - printBin(m_GPUOutResulData.clipCount, "clipCount"); - printBin(m_GPUOutResulData.clipMask, "clipMask"); - printBin(m_GPUOutResulData.rotatedClipMask, "rotatedClipMask"); - printBin(m_GPUOutResulData.rotateAmount, "rotateAmount"); - printBin(m_GPUOutResulData.wrapAround, "wrapAround"); - } - - // Parallelogram - if (m_samplingMode == PROJECTED_PARALLELOGRAM_SOLID_ANGLE && ImGui::CollapsingHeader("Projected Parallelogram", ImGuiTreeNodeFlags_DefaultOpen)) - { - ImGui::Text("Does Not Bound: %s", m_GPUOutResulData.parallelogramDoesNotBound ? "true" : "false"); - ImGui::Text("Area: %.3f", m_GPUOutResulData.parallelogramArea); - ImGui::Text("Failed Vertex Index: %u", m_GPUOutResulData.failedVertexIndex); - for (uint32_t i = 0; i < 4; i++) - ImGui::Text("Edge Is Convex[%u]: %s", i, m_GPUOutResulData.edgeIsConvex[i] ? "true" : "false"); - ImGui::Text("Vertices Inside: %s", m_GPUOutResulData.parallelogramVerticesInside ? "true" : "false"); - ImGui::Text("Edges Inside: %s", m_GPUOutResulData.parallelogramEdgesInside ? "true" : "false"); - for (uint32_t i = 0; i < 4; i++) - ImGui::Text("Corner[%u]: (%.3f, %.3f)", i, m_GPUOutResulData.parallelogramCorners[i].x, m_GPUOutResulData.parallelogramCorners[i].y); - } - else if ((m_samplingMode == SYMMETRIC_PYRAMID_SOLID_ANGLE_RECTANGLE || m_samplingMode == SYMMETRIC_PYRAMID_SOLID_ANGLE_BIQUADRATIC ||m_samplingMode == SYMMETRIC_PYRAMID_SOLID_ANGLE_BILINEAR) && ImGui::CollapsingHeader("Spherical Pyramid", ImGuiTreeNodeFlags_DefaultOpen)) - { - ImGui::Text("Spans Hemisphere: %s", m_GPUOutResulData.pyramidSpansHemisphere ? 
"YES (warning)" : "no"); - ImGui::Text("Best Caliper Edge: %u", m_GPUOutResulData.pyramidBestEdge); - ImGui::Separator(); - - ImGui::Text("Axis 1: (%.4f, %.4f, %.4f)", - m_GPUOutResulData.pyramidAxis1.x, m_GPUOutResulData.pyramidAxis1.y, m_GPUOutResulData.pyramidAxis1.z); - ImGui::Text(" Half-Width: %.4f Offset: %.4f", - m_GPUOutResulData.pyramidHalfWidth1, m_GPUOutResulData.pyramidOffset1); - ImGui::Text(" Bounds: [%.4f, %.4f]", - m_GPUOutResulData.pyramidMin1, m_GPUOutResulData.pyramidMax1); - - ImGui::Text("Axis 2: (%.4f, %.4f, %.4f)", - m_GPUOutResulData.pyramidAxis2.x, m_GPUOutResulData.pyramidAxis2.y, m_GPUOutResulData.pyramidAxis2.z); - ImGui::Text(" Half-Width: %.4f Offset: %.4f", - m_GPUOutResulData.pyramidHalfWidth2, m_GPUOutResulData.pyramidOffset2); - ImGui::Text(" Bounds: [%.4f, %.4f]", - m_GPUOutResulData.pyramidMin2, m_GPUOutResulData.pyramidMax2); - - ImGui::Separator(); - ImGui::Text("Center: (%.4f, %.4f, %.4f)", - m_GPUOutResulData.pyramidCenter.x, m_GPUOutResulData.pyramidCenter.y, m_GPUOutResulData.pyramidCenter.z); - ImGui::Text("Solid Angle (bound): %.6f sr", m_GPUOutResulData.pyramidSolidAngle); - } - else if (m_samplingMode == TRIANGLE_SOLID_ANGLE || m_samplingMode == TRIANGLE_PROJECTED_SOLID_ANGLE && ImGui::CollapsingHeader("Spherical Triangle", ImGuiTreeNodeFlags_DefaultOpen)) - { - ImGui::Text("Spherical Lune Detected: %s", m_GPUOutResulData.sphericalLuneDetected ? 
"true" : "false"); - ImGui::Text("Triangle Count: %u", m_GPUOutResulData.triangleCount); - // print solidAngles for each triangle - { - ImGui::Text("Solid Angles per Triangle:"); - ImGui::BeginTable("SolidAnglesTable", 2); - ImGui::TableSetupColumn("Triangle Index"); - ImGui::TableSetupColumn("Solid Angle"); - ImGui::TableHeadersRow(); - for (uint32_t i = 0; i < m_GPUOutResulData.triangleCount; ++i) - { - ImGui::TableNextRow(); - ImGui::TableSetColumnIndex(0); - ImGui::Text("%u", i); - ImGui::TableSetColumnIndex(1); - ImGui::Text("%.6f", m_GPUOutResulData.solidAngles[i]); - } - ImGui::Text("Total: %.6f", m_GPUOutResulData.totalSolidAngles); - ImGui::EndTable(); - } - } - - { - float32_t3 xAxis = m_OBBModelMatrix[0].xyz; - float32_t3 yAxis = m_OBBModelMatrix[1].xyz; - float32_t3 zAxis = m_OBBModelMatrix[2].xyz; - - float32_t3 nx = normalize(xAxis); - float32_t3 ny = normalize(yAxis); - float32_t3 nz = normalize(zAxis); - - const float epsilon = 1e-4; - bool hasSkew = false; - if (abs(dot(nx, ny)) > epsilon || abs(dot(nx, nz)) > epsilon || abs(dot(ny, nz)) > epsilon) - hasSkew = true; - ImGui::Separator(); - ImGui::Text("Matrix Has Skew: %s", hasSkew ? 
"true" : "false"); - } - - static bool modalShown = false; - static bool modalDismissed = false; - static uint32_t lastSilhouetteIndex = ~0u; - - // Reset modal flags if silhouette configuration changed - if (m_GPUOutResulData.silhouetteIndex != lastSilhouetteIndex) - { - modalShown = false; - modalDismissed = false; // Allow modal to show again for new configuration - lastSilhouetteIndex = m_GPUOutResulData.silhouetteIndex; - } - - // Reset flags when mismatch is cleared - if (!m_GPUOutResulData.edgeVisibilityMismatch && !m_GPUOutResulData.maxTrianglesExceeded && !m_GPUOutResulData.sphericalLuneDetected) - { - modalShown = false; - modalDismissed = false; - } - - // Open modal only if not already shown/dismissed - if ((m_GPUOutResulData.edgeVisibilityMismatch || m_GPUOutResulData.maxTrianglesExceeded || m_GPUOutResulData.sphericalLuneDetected) && m_GPUOutResulData.silhouetteIndex != 13 && !modalShown && !modalDismissed) // Don't reopen if user dismissed it - { - ImGui::OpenPopup("Edge Visibility Mismatch Warning"); - modalShown = true; - } - - // Modal popup - if (ImGui::BeginPopupModal("Edge Visibility Mismatch Warning", NULL, ImGuiWindowFlags_AlwaysAutoResize)) - { - ImGui::TextColored(ImVec4(1.0f, 0.5f, 0.0f, 1.0f), "Warning: Edge Visibility Mismatch Detected!"); - ImGui::Separator(); - ImGui::Text("The silhouette lookup table (LUT) does not match the computed edge visibility."); - ImGui::Text("This indicates the pre-computed silhouette data may be incorrect."); - ImGui::Spacing(); - ImGui::TextWrapped("Configuration Index: %u", m_GPUOutResulData.silhouetteIndex); - ImGui::TextWrapped("Region: (%u, %u, %u)", m_GPUOutResulData.region.x, m_GPUOutResulData.region.y, m_GPUOutResulData.region.z); - ImGui::Spacing(); - ImGui::Text("Mismatched Vertices (bitmask): 0x%08X", m_GPUOutResulData.edgeVisibilityMismatch); - ImGui::Text("Vertices involved in mismatched edges:"); - ImGui::Indent(); - for (int i = 0; i < 8; i++) - { - if (m_GPUOutResulData.edgeVisibilityMismatch 
& (1u << i)) - { - ImGui::BulletText("Vertex %d", i); - } - } - ImGui::Unindent(); - ImGui::Spacing(); - if (ImGui::Button("OK", ImVec2(120, 0))) - { - ImGui::CloseCurrentPopup(); - modalShown = false; - modalDismissed = true; // Mark as dismissed to prevent reopening - } - ImGui::EndPopup(); - } - } - ImGui::End(); - } - - // view matrices editor - { - ImGui::Begin("Matrices"); - - auto addMatrixTable = [&](const char *topText, const char *tableName, const int rows, const int columns, const float *pointer, const bool withSeparator = true) - { - ImGui::Text(topText); - if (ImGui::BeginTable(tableName, columns)) - { - for (int y = 0; y < rows; ++y) - { - ImGui::TableNextRow(); - for (int x = 0; x < columns; ++x) - { - ImGui::TableSetColumnIndex(x); - ImGui::Text("%.3f", *(pointer + (y * columns) + x)); - } - } - ImGui::EndTable(); - } - - if (withSeparator) - ImGui::Separator(); - }; - - static RandomSampler rng(0x45); // Initialize RNG with seed - - // Helper function to check if cube intersects unit sphere at origin - auto isCubeOutsideUnitSphere = [](const float32_t3 &translation, const float32_t3 &scale) -> bool - { - float cubeRadius = glm::length(scale) * 0.5f; - float distanceToCenter = glm::length(translation); - return (distanceToCenter - cubeRadius) > 1.0f; - }; - - static TRS lastTRS = {}; - if (ImGui::Button("Randomize Translation")) - { - lastTRS = m_TRS; // Backup before randomizing - int attempts = 0; - do - { - m_TRS.translation = float32_t3(rng.nextFloat(-3.f, 3.f), rng.nextFloat(-3.f, 3.f), rng.nextFloat(-1.f, 3.f)); - attempts++; - } while (!isCubeOutsideUnitSphere(m_TRS.translation, m_TRS.scale) && attempts < 100); - } - ImGui::SameLine(); - if (ImGui::Button("Randomize Rotation")) - { - lastTRS = m_TRS; // Backup before randomizing - m_TRS.rotation = float32_t3(rng.nextFloat(-180.f, 180.f), rng.nextFloat(-180.f, 180.f), rng.nextFloat(-180.f, 180.f)); - } - ImGui::SameLine(); - if (ImGui::Button("Randomize Scale")) - { - lastTRS = m_TRS; // 
Backup before randomizing - int attempts = 0; - do - { - m_TRS.scale = float32_t3(rng.nextFloat(0.5f, 2.0f), rng.nextFloat(0.5f, 2.0f), rng.nextFloat(0.5f, 2.0f)); - attempts++; - } while (!isCubeOutsideUnitSphere(m_TRS.translation, m_TRS.scale) && attempts < 100); - } - // ImGui::SameLine(); - if (ImGui::Button("Randomize All")) - { - lastTRS = m_TRS; // Backup before randomizing - int attempts = 0; - do - { - m_TRS.translation = float32_t3(rng.nextFloat(-3.f, 3.f), rng.nextFloat(-3.f, 3.f), rng.nextFloat(-1.f, 3.f)); - m_TRS.rotation = float32_t3(rng.nextFloat(-180.f, 180.f), rng.nextFloat(-180.f, 180.f), rng.nextFloat(-180.f, 180.f)); - m_TRS.scale = float32_t3(rng.nextFloat(0.5f, 2.0f), rng.nextFloat(0.5f, 2.0f), rng.nextFloat(0.5f, 2.0f)); - attempts++; - } while (!isCubeOutsideUnitSphere(m_TRS.translation, m_TRS.scale) && attempts < 100); - } - ImGui::SameLine(); - if (ImGui::Button("Revert to Last")) - { - m_TRS = lastTRS; // Restore backed-up TRS - } - - addMatrixTable("Model Matrix", "ModelMatrixTable", 4, 4, &m_OBBModelMatrix[0][0]); - addMatrixTable("Camera View Matrix", "ViewMatrixTable", 3, 4, &camera.getViewMatrix()[0].x); - addMatrixTable("Camera View Projection Matrix", "ViewProjectionMatrixTable", 4, 4, &camera.getProjectionMatrix()[0].x, false); - - ImGui::End(); - } - - // Nabla Imgui backend MDI buffer info - // To be 100% accurate and not overly conservative we'd have to explicitly `cull_frees` and defragment each time, - // so unless you do that, don't use this basic info to optimize the size of your IMGUI buffer. 
- { - auto *streaminingBuffer = imGUI->getStreamingBuffer(); - - const size_t total = streaminingBuffer->get_total_size(); // total memory range size for which allocation can be requested - const size_t freeSize = streaminingBuffer->getAddressAllocator().get_free_size(); // max total free bloock memory size we can still allocate from total memory available - const size_t consumedMemory = total - freeSize; // memory currently consumed by streaming buffer - - float freePercentage = 100.0f * (float)(freeSize) / (float)total; - float allocatedPercentage = (float)(consumedMemory) / (float)total; - - ImVec2 barSize = ImVec2(400, 30); - float windowPadding = 10.0f; - float verticalPadding = ImGui::GetStyle().FramePadding.y; - - ImGui::SetNextWindowSize(ImVec2(barSize.x + 2 * windowPadding, 110 + verticalPadding), ImGuiCond_Always); - ImGui::Begin("Nabla Imgui MDI Buffer Info", nullptr, ImGuiWindowFlags_NoResize | ImGuiWindowFlags_NoScrollbar); - - ImGui::Text("Total Allocated Size: %zu bytes", total); - ImGui::Text("In use: %zu bytes", consumedMemory); - ImGui::Text("Buffer Usage:"); - - ImGui::SetCursorPosX(windowPadding); - - if (freePercentage > 70.0f) - ImGui::PushStyleColor(ImGuiCol_PlotHistogram, ImVec4(0.0f, 1.0f, 0.0f, 0.4f)); // Green - else if (freePercentage > 30.0f) - ImGui::PushStyleColor(ImGuiCol_PlotHistogram, ImVec4(1.0f, 1.0f, 0.0f, 0.4f)); // Yellow - else - ImGui::PushStyleColor(ImGuiCol_PlotHistogram, ImVec4(1.0f, 0.0f, 0.0f, 0.4f)); // Red - - ImGui::ProgressBar(allocatedPercentage, barSize, ""); - - ImGui::PopStyleColor(); - - ImDrawList *drawList = ImGui::GetWindowDrawList(); - - ImVec2 progressBarPos = ImGui::GetItemRectMin(); - ImVec2 progressBarSize = ImGui::GetItemRectSize(); - - const char *text = "%.2f%% free"; - char textBuffer[64]; - snprintf(textBuffer, sizeof(textBuffer), text, freePercentage); - - ImVec2 textSize = ImGui::CalcTextSize(textBuffer); - ImVec2 textPos = ImVec2( - progressBarPos.x + (progressBarSize.x - textSize.x) * 0.5f, - 
progressBarPos.y + (progressBarSize.y - textSize.y) * 0.5f); - - ImVec4 bgColor = ImGui::GetStyleColorVec4(ImGuiCol_WindowBg); - drawList->AddRectFilled( - ImVec2(textPos.x - 5, textPos.y - 2), - ImVec2(textPos.x + textSize.x + 5, textPos.y + textSize.y + 2), - ImGui::GetColorU32(bgColor)); - - ImGui::SetCursorScreenPos(textPos); - ImGui::Text("%s", textBuffer); - - ImGui::Dummy(ImVec2(0.0f, verticalPadding)); - - ImGui::End(); - } - ImGui::End(); - - ImGuizmo::RecomposeMatrixFromComponents(&m_TRS.translation.x, &m_TRS.rotation.x, &m_TRS.scale.x, &m_OBBModelMatrix[0][0]); - } - - smart_refctd_ptr imGUI; - - // descriptor set - smart_refctd_ptr subAllocDS; - enum E_RENDER_VIEWS : uint8_t - { - ERV_MAIN_VIEW, - ERV_SOLID_ANGLE_VIEW, - Count - }; - SubAllocatedDescriptorSet::value_type renderColorViewDescIndices[E_RENDER_VIEWS::Count] = {SubAllocatedDescriptorSet::invalid_value, SubAllocatedDescriptorSet::invalid_value}; - // - Camera camera = Camera(cameraIntialPosition, cameraInitialTarget, {}, 1, 1, nbl::core::vectorSIMDf(0.0f, 0.0f, 1.0f)); - // mutables - struct TRS // Source of truth - { - float32_t3 translation{0.0f, 0.0f, 1.5f}; - float32_t3 rotation{0.0f}; // MUST stay orthonormal - float32_t3 scale{1.0f}; - } m_TRS; - float32_t4x4 m_OBBModelMatrix; // always overwritten from TRS - - // std::string_view objectName; - TransformRequestParams transformParams; - TransformReturnInfo mainViewTransformReturnInfo; - TransformReturnInfo solidAngleViewTransformReturnInfo; - - const static inline core::vectorSIMDf cameraIntialPosition{-3.0f, 6.0f, 3.0f}; - const static inline core::vectorSIMDf cameraInitialTarget{0.f, 0.0f, 3.f}; - const static inline core::vectorSIMDf cameraInitialUp{0.f, 0.f, 1.f}; - - float fov = 90.f, zNear = 0.1f, zFar = 10000.f, moveSpeed = 1.f, rotateSpeed = 1.f; - float viewWidth = 10.f; - // uint16_t gcIndex = {}; // note: this is dirty however since I assume only single object in scene I can leave it now, when this example is upgraded to 
support multiple objects this needs to be changed - bool isPerspective = true, isLH = true, flipGizmoY = true, move = true; - bool firstFrame = true; - - SolidAngleVisualizer *m_visualizer; - } interface; - - class SamplingBenchmark final - { - public: - SamplingBenchmark(SolidAngleVisualizer &base) - : m_api(base.m_api), m_device(base.m_device), m_logger(base.m_logger), m_visualizer(&base) - { - - // setting up pipeline in the constructor - m_queueFamily = base.getComputeQueue()->getFamilyIndex(); - m_cmdpool = base.m_device->createCommandPool(m_queueFamily, IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); - // core::smart_refctd_ptr* cmdBuffs[] = { &m_cmdbuf, &m_timestampBeforeCmdBuff, &m_timestampAfterCmdBuff }; - if (!m_cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_cmdbuf)) - base.logFail("Failed to create Command Buffers!\n"); - if (!m_cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_timestampBeforeCmdBuff)) - base.logFail("Failed to create Command Buffers!\n"); - if (!m_cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_timestampAfterCmdBuff)) - base.logFail("Failed to create Command Buffers!\n"); - - // Load shaders, set up pipelines (one per sampling mode) - { - auto loadShader = [&](auto key) -> smart_refctd_ptr - { - IAssetLoader::SAssetLoadParams lp = {}; - lp.logger = base.m_logger.get(); - lp.workingDirectory = "app_resources"; - auto assetBundle = base.m_assetMgr->getAsset(key.data(), lp); - const auto assets = assetBundle.getContents(); - if (assets.empty()) - { - base.logFail("Could not load shader!"); - assert(0); - } - assert(assets.size() == 1); - auto shader = IAsset::castDown(assets[0]); - if (!shader) - base.logFail("Failed to load precompiled benchmark shader!\n"); - return shader; - }; - - smart_refctd_ptr shaders[SAMPLING_MODE::Count] = { - loadShader(nbl::this_example::builtin::build::get_spirv_key<"benchmark_tri_sa">(m_device.get())), - 
loadShader(nbl::this_example::builtin::build::get_spirv_key<"benchmark_tri_psa">(m_device.get())), - loadShader(nbl::this_example::builtin::build::get_spirv_key<"benchmark_para">(m_device.get())), - loadShader(nbl::this_example::builtin::build::get_spirv_key<"benchmark_rectangle">(m_device.get())), - loadShader(nbl::this_example::builtin::build::get_spirv_key<"benchmark_biquad">(m_device.get())), - loadShader(nbl::this_example::builtin::build::get_spirv_key<"benchmark_bilinear">(m_device.get())), - }; - - nbl::video::IGPUDescriptorSetLayout::SBinding bindings[1] = { - {.binding = 0, - .type = nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, - .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, - .stageFlags = ShaderStage::ESS_COMPUTE, - .count = 1}}; - smart_refctd_ptr dsLayout = base.m_device->createDescriptorSetLayout(bindings); - if (!dsLayout) - base.logFail("Failed to create a Descriptor Layout!\n"); - - SPushConstantRange pushConstantRanges[] = { - {.stageFlags = ShaderStage::ESS_COMPUTE, - .offset = 0, - .size = sizeof(BenchmarkPushConstants)}}; - m_pplnLayout = base.m_device->createPipelineLayout(pushConstantRanges, smart_refctd_ptr(dsLayout)); - if (!m_pplnLayout) - base.logFail("Failed to create a Pipeline Layout!\n"); - - for (uint32_t i = 0; i < SAMPLING_MODE::Count; i++) - { - IGPUComputePipeline::SCreationParams params = {}; - params.layout = m_pplnLayout.get(); - params.shader.entryPoint = "main"; - params.shader.shader = shaders[i].get(); - if (!base.m_device->createComputePipelines(nullptr, {¶ms, 1}, &m_pipelines[i])) - base.logFail("Failed to create pipelines (compile & link shaders)!\n"); - } - - // Allocate the memory - { - constexpr size_t BufferSize = BENCHMARK_WORKGROUP_COUNT * BENCHMARK_WORKGROUP_DIMENSION_SIZE_X * - BENCHMARK_WORKGROUP_DIMENSION_SIZE_Y * BENCHMARK_WORKGROUP_DIMENSION_SIZE_Z * sizeof(uint32_t); - - nbl::video::IGPUBuffer::SCreationParams params = {}; - params.size = BufferSize; - params.usage = 
IGPUBuffer::EUF_STORAGE_BUFFER_BIT; - smart_refctd_ptr dummyBuff = base.m_device->createBuffer(std::move(params)); - if (!dummyBuff) - base.logFail("Failed to create a GPU Buffer of size %d!\n", params.size); - - dummyBuff->setObjectDebugName("benchmark buffer"); - - nbl::video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = dummyBuff->getMemoryReqs(); - - m_allocation = base.m_device->allocate(reqs, dummyBuff.get(), nbl::video::IDeviceMemoryAllocation::EMAF_NONE); - if (!m_allocation.isValid()) - base.logFail("Failed to allocate Device Memory compatible with our GPU Buffer!\n"); - - assert(dummyBuff->getBoundMemory().memory == m_allocation.memory.get()); - smart_refctd_ptr pool = base.m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE, {&dsLayout.get(), 1}); - - m_ds = pool->createDescriptorSet(std::move(dsLayout)); - { - IGPUDescriptorSet::SDescriptorInfo info[1]; - info[0].desc = smart_refctd_ptr(dummyBuff); - info[0].info.buffer = {.offset = 0, .size = BufferSize}; - IGPUDescriptorSet::SWriteDescriptorSet writes[1] = { - {.dstSet = m_ds.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = info}}; - base.m_device->updateDescriptorSets(writes, {}); - } - } - } - - IQueryPool::SCreationParams queryPoolCreationParams{}; - queryPoolCreationParams.queryType = IQueryPool::TYPE::TIMESTAMP; - queryPoolCreationParams.queryCount = 2; - queryPoolCreationParams.pipelineStatisticsFlags = IQueryPool::PIPELINE_STATISTICS_FLAGS::NONE; - m_queryPool = m_device->createQueryPool(queryPoolCreationParams); - - m_computeQueue = m_device->getQueue(m_queueFamily, 0); - } - - void run() - { - m_logger->log("\n\nsampling benchmark result:", ILogger::ELL_PERFORMANCE); - - m_logger->log("sampling benchmark, SYMMETRIC_PYRAMID_SOLID_ANGLE_RECTANGLE result:", ILogger::ELL_PERFORMANCE); - performBenchmark(SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_RECTANGLE); - - m_logger->log("sampling benchmark, SYMMETRIC_PYRAMID_SOLID_ANGLE_BIQUADRATIC result:", 
ILogger::ELL_PERFORMANCE); - performBenchmark(SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_BIQUADRATIC); - - m_logger->log("sampling benchmark, SYMMETRIC_PYRAMID_SOLID_ANGLE_BILINEAR result:", ILogger::ELL_PERFORMANCE); - performBenchmark(SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_BILINEAR); - - m_logger->log("sampling benchmark, PROJECTED_PARALLELOGRAM_SOLID_ANGLE result:", ILogger::ELL_PERFORMANCE); - performBenchmark(SAMPLING_MODE::PROJECTED_PARALLELOGRAM_SOLID_ANGLE); - - m_logger->log("sampling benchmark, TRIANGLE_SOLID_ANGLE result:", ILogger::ELL_PERFORMANCE); - performBenchmark(SAMPLING_MODE::TRIANGLE_SOLID_ANGLE); - - // m_logger->log("sampling benchmark, triangle projected solid angle result:", ILogger::ELL_PERFORMANCE); - // performBenchmark(SAMPLING_MODE::TRIANGLE_PROJECTED_SOLID_ANGLE); - } - - private: - void performBenchmark(SAMPLING_MODE mode) - { - m_device->waitIdle(); - - recordTimestampQueryCmdBuffers(); - - uint64_t semaphoreCounter = 0; - smart_refctd_ptr semaphore = m_device->createSemaphore(semaphoreCounter); - - IQueue::SSubmitInfo::SSemaphoreInfo signals[] = {{.semaphore = semaphore.get(), .value = 0u, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT}}; - IQueue::SSubmitInfo::SSemaphoreInfo waits[] = {{.semaphore = semaphore.get(), .value = 0u, .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT}}; - - IQueue::SSubmitInfo beforeTimestapSubmitInfo[1] = {}; - const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufsBegin[] = {{.cmdbuf = m_timestampBeforeCmdBuff.get()}}; - beforeTimestapSubmitInfo[0].commandBuffers = cmdbufsBegin; - beforeTimestapSubmitInfo[0].signalSemaphores = signals; - beforeTimestapSubmitInfo[0].waitSemaphores = waits; - - IQueue::SSubmitInfo afterTimestapSubmitInfo[1] = {}; - const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufsEnd[] = {{.cmdbuf = m_timestampAfterCmdBuff.get()}}; - afterTimestapSubmitInfo[0].commandBuffers = cmdbufsEnd; - afterTimestapSubmitInfo[0].signalSemaphores = signals; - 
afterTimestapSubmitInfo[0].waitSemaphores = waits; - - IQueue::SSubmitInfo benchmarkSubmitInfos[1] = {}; - const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[] = {{.cmdbuf = m_cmdbuf.get()}}; - benchmarkSubmitInfos[0].commandBuffers = cmdbufs; - benchmarkSubmitInfos[0].signalSemaphores = signals; - benchmarkSubmitInfos[0].waitSemaphores = waits; - - m_pushConstants.modelMatrix = float32_t3x4(transpose(m_visualizer->interface.m_OBBModelMatrix)); - m_pushConstants.sampleCount = m_SampleCount; - recordCmdBuff(mode); - - // warmup runs - for (int i = 0; i < WarmupIterations; ++i) - { - if (i == 0) - m_api->startCapture(); - waits[0].value = semaphoreCounter; - signals[0].value = ++semaphoreCounter; - m_computeQueue->submit(benchmarkSubmitInfos); - if (i == 0) - m_api->endCapture(); - } - - waits[0].value = semaphoreCounter; - signals[0].value = ++semaphoreCounter; - m_computeQueue->submit(beforeTimestapSubmitInfo); - - // actual benchmark runs - for (int i = 0; i < Iterations; ++i) - { - waits[0].value = semaphoreCounter; - signals[0].value = ++semaphoreCounter; - m_computeQueue->submit(benchmarkSubmitInfos); - } - - waits[0].value = semaphoreCounter; - signals[0].value = ++semaphoreCounter; - m_computeQueue->submit(afterTimestapSubmitInfo); - - m_device->waitIdle(); - - const uint64_t nativeBenchmarkTimeElapsedNanoseconds = calcTimeElapsed(); - const float nativeBenchmarkTimeElapsedSeconds = double(nativeBenchmarkTimeElapsedNanoseconds) / 1000000000.0; - - m_logger->log("%llu ns, %f s", ILogger::ELL_PERFORMANCE, nativeBenchmarkTimeElapsedNanoseconds, nativeBenchmarkTimeElapsedSeconds); - } - - void recordCmdBuff(SAMPLING_MODE mode) - { - m_cmdbuf->begin(IGPUCommandBuffer::USAGE::SIMULTANEOUS_USE_BIT); - m_cmdbuf->beginDebugMarker("sampling compute dispatch", vectorSIMDf(0, 1, 0, 1)); - m_cmdbuf->bindComputePipeline(m_pipelines[mode].get()); - m_cmdbuf->bindDescriptorSets(nbl::asset::EPBP_COMPUTE, m_pplnLayout.get(), 0, 1, &m_ds.get()); - 
m_cmdbuf->pushConstants(m_pplnLayout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(BenchmarkPushConstants), &m_pushConstants); - m_cmdbuf->dispatch(BENCHMARK_WORKGROUP_COUNT, 1, 1); - m_cmdbuf->endDebugMarker(); - m_cmdbuf->end(); - } - - void recordTimestampQueryCmdBuffers() - { - static bool firstInvocation = true; - - if (!firstInvocation) - { - m_timestampBeforeCmdBuff->reset(IGPUCommandBuffer::RESET_FLAGS::NONE); - m_timestampBeforeCmdBuff->reset(IGPUCommandBuffer::RESET_FLAGS::NONE); - } - - m_timestampBeforeCmdBuff->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - m_timestampBeforeCmdBuff->resetQueryPool(m_queryPool.get(), 0, 2); - m_timestampBeforeCmdBuff->writeTimestamp(PIPELINE_STAGE_FLAGS::NONE, m_queryPool.get(), 0); - m_timestampBeforeCmdBuff->end(); - - m_timestampAfterCmdBuff->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - m_timestampAfterCmdBuff->writeTimestamp(PIPELINE_STAGE_FLAGS::NONE, m_queryPool.get(), 1); - m_timestampAfterCmdBuff->end(); - - firstInvocation = false; - } - - uint64_t calcTimeElapsed() - { - uint64_t timestamps[2]; - const core::bitflag flags = core::bitflag(IQueryPool::RESULTS_FLAGS::_64_BIT) | core::bitflag(IQueryPool::RESULTS_FLAGS::WAIT_BIT); - m_device->getQueryPoolResults(m_queryPool.get(), 0, 2, ×tamps, sizeof(uint64_t), flags); - return timestamps[1] - timestamps[0]; - } - - private: - core::smart_refctd_ptr m_api; - smart_refctd_ptr m_device; - smart_refctd_ptr m_logger; - SolidAngleVisualizer *m_visualizer; - - nbl::video::IDeviceMemoryAllocator::SAllocation m_allocation = {}; - smart_refctd_ptr m_cmdpool = nullptr; - smart_refctd_ptr m_cmdbuf = nullptr; - smart_refctd_ptr m_ds = nullptr; - smart_refctd_ptr m_pplnLayout = nullptr; - BenchmarkPushConstants m_pushConstants; - smart_refctd_ptr m_pipelines[SAMPLING_MODE::Count]; - - smart_refctd_ptr m_timestampBeforeCmdBuff = nullptr; - smart_refctd_ptr m_timestampAfterCmdBuff = nullptr; - smart_refctd_ptr m_queryPool = nullptr; - - uint32_t 
m_queueFamily; - IQueue *m_computeQueue; - static constexpr int WarmupIterations = 50; - static constexpr int Iterations = 1; - }; - - template - inline bool logFail(const char *msg, Args &&...args) - { - m_logger->log(msg, ILogger::ELL_ERROR, std::forward(args)...); - return false; - } - - std::ofstream m_logFile; + if (ImGui::IsKeyPressed(ImGuiKey_End)) + { + m_TRS = TRS {}; + } + + { + static struct + { + float32_t4x4 view, projection, model; + } imguizmoM16InOut; + + ImGuizmo::SetID(0u); + + // TODO: camera will return hlsl::float32_tMxN + auto view = camera.getViewMatrix(); + imguizmoM16InOut.view = hlsl::transpose(hlsl::math::linalg::promote_affine<4, 4>(view)); + + // TODO: camera will return hlsl::float32_tMxN + imguizmoM16InOut.projection = hlsl::transpose(camera.getProjectionMatrix()); + ImGuizmo::RecomposeMatrixFromComponents(&m_TRS.translation.x, &m_TRS.rotation.x, &m_TRS.scale.x, &imguizmoM16InOut.model[0][0]); + + if (flipGizmoY) // note we allow to flip gizmo just to match our coordinates + imguizmoM16InOut.projection[1][1] *= -1.f; // https://johannesugb.github.io/gpu-programming/why-do-opengl-proj-matrices-fail-in-vulkan/ + + transformParams.editTransformDecomposition = true; + mainViewTransformReturnInfo = EditTransform(&imguizmoM16InOut.view[0][0], &imguizmoM16InOut.projection[0][0], &imguizmoM16InOut.model[0][0], transformParams); + move = mainViewTransformReturnInfo.allowCameraMovement; + + ImGuizmo::DecomposeMatrixToComponents(&imguizmoM16InOut.model[0][0], &m_TRS.translation.x, &m_TRS.rotation.x, &m_TRS.scale.x); + ImGuizmo::RecomposeMatrixFromComponents(&m_TRS.translation.x, &m_TRS.rotation.x, &m_TRS.scale.x, &imguizmoM16InOut.model[0][0]); + } + // object meta display + //{ + // ImGui::Begin("Object"); + // ImGui::Text("type: \"%s\"", objectName.data()); + // ImGui::End(); + //} + + // solid angle view window + { + ImGui::SetNextWindowSize(ImVec2(800, 800), ImGuiCond_Appearing); + ImGui::SetNextWindowPos(ImVec2(1240, 20), 
ImGuiCond_Appearing); + static bool isOpen = true; + ImGui::Begin("Projected Solid Angle View", &isOpen, 0); + + ImVec2 contentRegionSize = ImGui::GetContentRegionAvail(); + solidAngleViewTransformReturnInfo.sceneResolution = uint16_t2(static_cast(contentRegionSize.x), static_cast(contentRegionSize.y)); + solidAngleViewTransformReturnInfo.allowCameraMovement = false; // not used in this view + ImGui::Image({renderColorViewDescIndices[ERV_SOLID_ANGLE_VIEW]}, contentRegionSize); + ImGui::End(); + } + + // Show data coming from GPU + if (m_debugVisualization) + { + if (ImGui::Begin("Result Data")) + { + auto drawColorField = [&](const char* fieldName, uint32_t index) + { + ImGui::Text("%s: %u", fieldName, index); + + if (index >= 27) + { + ImGui::SameLine(); + ImGui::Text(""); + return; + } + + const auto& c = colorLUT[index]; // uses the combined LUT we made earlier + + ImGui::SameLine(); + + // Color preview button + ImGui::ColorButton( + fieldName, + ImVec4(c.r, c.g, c.b, 1.0f), + 0, + ImVec2(20, 20)); + + ImGui::SameLine(); + ImGui::Text("%s", colorNames[index]); + }; + + // Vertices + if (ImGui::CollapsingHeader("Vertices", ImGuiTreeNodeFlags_DefaultOpen)) + { + for (uint32_t i = 0; i < 6; ++i) + { + if (i < m_GPUOutResulData.silhouette.silhouetteVertexCount) + { + ImGui::Text("corners[%u]", i); + ImGui::SameLine(); + drawColorField(":", m_GPUOutResulData.silhouette.vertices[i]); + ImGui::SameLine(); + static const float32_t3 constCorners[8] = { + float32_t3(0, 0, 0), float32_t3(1, 0, 0), float32_t3(0, 1, 0), float32_t3(1, 1, 0), + float32_t3(0, 0, 1), float32_t3(1, 0, 1), float32_t3(0, 1, 1), float32_t3(1, 1, 1)}; + float32_t3 vertexLocation = constCorners[m_GPUOutResulData.silhouette.vertices[i]]; + ImGui::Text(" : (%.3f, %.3f, %.3f", vertexLocation.x, vertexLocation.y, vertexLocation.z); + } + else + { + ImGui::Text("corners[%u] :: ", i); + ImGui::SameLine(); + ImGui::ColorButton( + "", + ImVec4(0.0f, 0.0f, 0.0f, 0.0f), + 0, + ImVec2(20, 20)); + 
ImGui::SameLine(); + ImGui::Text(""); + } + } + } + + if (ImGui::CollapsingHeader("Color LUT Map")) + { + for (int i = 0; i < 27; i++) + drawColorField(" ", i); + } + + ImGui::Separator(); + ImGui::Text("Valid Samples: %u / %u", m_GPUOutResulData.sampling.validSampleCount / hlsl::max(m_GPUOutResulData.sampling.threadCount, 1u), m_GPUOutResulData.sampling.sampleCount); + ImGui::ProgressBar(static_cast(m_GPUOutResulData.sampling.validSampleCount / hlsl::max(m_GPUOutResulData.sampling.threadCount, 1u)) / static_cast(m_GPUOutResulData.sampling.sampleCount)); + ImGui::Separator(); + + // Silhouette + if (ImGui::CollapsingHeader("Silhouette")) + { + drawColorField("silhouetteIndex", m_GPUOutResulData.silhouette.silhouetteIndex); + ImGui::Text("Region: (%u, %u, %u)", m_GPUOutResulData.silhouette.region.x, m_GPUOutResulData.silhouette.region.y, m_GPUOutResulData.silhouette.region.z); + ImGui::Text("Silhouette Vertex Count: %u", m_GPUOutResulData.silhouette.silhouetteVertexCount); + ImGui::Text("Positive Vertex Count: %u", m_GPUOutResulData.silhouette.positiveVertCount); + ImGui::Text("Edge Visibility Mismatch: %s", m_GPUOutResulData.silhouette.edgeVisibilityMismatch ? "true" : "false"); + ImGui::Text("Max Triangles Exceeded: %s", m_GPUOutResulData.triangleFan.maxTrianglesExceeded ? 
"true" : "false"); + for (uint32_t i = 0; i < 6; i++) + ImGui::Text("Vertex[%u]: %u", i, m_GPUOutResulData.silhouette.vertices[i]); + ImGui::Text("Clipped Silhouette Vertex Count: %u", m_GPUOutResulData.silhouette.clippedVertexCount); + for (uint32_t i = 0; i < 7; i++) + ImGui::Text("Clipped Vertex[%u]: (%.3f, %.3f, %.3f) Index: %u", i, + m_GPUOutResulData.silhouette.clippedVertices[i].x, + m_GPUOutResulData.silhouette.clippedVertices[i].y, + m_GPUOutResulData.silhouette.clippedVertices[i].z, + m_GPUOutResulData.silhouette.clippedVertexIndices[i]); + + // Silhouette mask printed in binary + auto printBin = [](uint32_t bin, const char* name) + { + char buf[33]; + for (int i = 0; i < 32; i++) + buf[i] = (bin & (1u << (31 - i))) ? '1' : '0'; + buf[32] = '\0'; + ImGui::Text("%s: 0x%08X", name, bin); + ImGui::Text("binary: 0b%s", buf); + ImGui::Separator(); + }; + printBin(m_GPUOutResulData.silhouette.silhouette, "Silhouette"); + printBin(m_GPUOutResulData.silhouette.rotatedSil, "rotatedSilhouette"); + + printBin(m_GPUOutResulData.silhouette.clipCount, "clipCount"); + printBin(m_GPUOutResulData.silhouette.clipMask, "clipMask"); + printBin(m_GPUOutResulData.silhouette.rotatedClipMask, "rotatedClipMask"); + printBin(m_GPUOutResulData.silhouette.rotateAmount, "rotateAmount"); + printBin(m_GPUOutResulData.silhouette.wrapAround, "wrapAround"); + } + + // Parallelogram + if (m_samplingMode == PROJECTED_PARALLELOGRAM_SOLID_ANGLE && ImGui::CollapsingHeader("Projected Parallelogram", ImGuiTreeNodeFlags_DefaultOpen)) + { + ImGui::Text("Area: %.3f", m_GPUOutResulData.parallelogram.area); + ImGui::Text("N3 Mask: 0x%02X", m_GPUOutResulData.parallelogram.n3Mask); + for (uint32_t i = 0; i < 4; i++) + { + bool convex = m_GPUOutResulData.parallelogram.edgeIsConvex[i] != 0; + bool n3 = (m_GPUOutResulData.parallelogram.n3Mask >> i) & 1u; + ImGui::Text("Edge[%u]: %s%s", i, + convex ? "convex" : "concave", + n3 ? 
" (N3 split)" : ""); + } + for (uint32_t i = 0; i < 4; i++) + ImGui::Text("Corner[%u]: (%.3f, %.3f)", i, m_GPUOutResulData.parallelogram.corners[i].x, m_GPUOutResulData.parallelogram.corners[i].y); + } + else if ((m_samplingMode == SYMMETRIC_PYRAMID_SOLID_ANGLE_RECTANGLE || m_samplingMode == SYMMETRIC_PYRAMID_SOLID_ANGLE_BIQUADRATIC || m_samplingMode == SYMMETRIC_PYRAMID_SOLID_ANGLE_BILINEAR) && ImGui::CollapsingHeader("Spherical Pyramid", ImGuiTreeNodeFlags_DefaultOpen)) + { + ImGui::Text("Best Caliper Edge: %u", m_GPUOutResulData.pyramid.bestEdge); + ImGui::Separator(); + + ImGui::Text("Axis 1: (%.4f, %.4f, %.4f)", + m_GPUOutResulData.pyramid.axis1.x, m_GPUOutResulData.pyramid.axis1.y, m_GPUOutResulData.pyramid.axis1.z); + ImGui::Text(" Half-Width: %.4f Offset: %.4f", + m_GPUOutResulData.pyramid.halfWidth1, m_GPUOutResulData.pyramid.offset1); + ImGui::Text(" Bounds: [%.4f, %.4f]", + m_GPUOutResulData.pyramid.min1, m_GPUOutResulData.pyramid.max1); + + ImGui::Text("Axis 2: (%.4f, %.4f, %.4f)", + m_GPUOutResulData.pyramid.axis2.x, m_GPUOutResulData.pyramid.axis2.y, m_GPUOutResulData.pyramid.axis2.z); + ImGui::Text(" Half-Width: %.4f Offset: %.4f", + m_GPUOutResulData.pyramid.halfWidth2, m_GPUOutResulData.pyramid.offset2); + ImGui::Text(" Bounds: [%.4f, %.4f]", + m_GPUOutResulData.pyramid.min2, m_GPUOutResulData.pyramid.max2); + + ImGui::Separator(); + ImGui::Text("Center: (%.4f, %.4f, %.4f)", + m_GPUOutResulData.pyramid.center.x, m_GPUOutResulData.pyramid.center.y, m_GPUOutResulData.pyramid.center.z); + ImGui::Text("Solid Angle (bound): %.6f sr", m_GPUOutResulData.pyramid.solidAngle); + } + else if (m_samplingMode == TRIANGLE_SOLID_ANGLE || m_samplingMode == TRIANGLE_PROJECTED_SOLID_ANGLE && ImGui::CollapsingHeader("Spherical Triangle", ImGuiTreeNodeFlags_DefaultOpen)) + { + ImGui::Text("Spherical Lune Detected: %s", m_GPUOutResulData.triangleFan.sphericalLuneDetected ? 
"true" : "false"); + ImGui::Text("Triangle Count: %u", m_GPUOutResulData.triangleFan.triangleCount); + // print solidAngles for each triangle + { + ImGui::Text("Solid Angles per Triangle:"); + ImGui::BeginTable("SolidAnglesTable", 2); + ImGui::TableSetupColumn("Triangle Index"); + ImGui::TableSetupColumn("Solid Angle"); + ImGui::TableHeadersRow(); + for (uint32_t i = 0; i < m_GPUOutResulData.triangleFan.triangleCount; ++i) + { + ImGui::TableNextRow(); + ImGui::TableSetColumnIndex(0); + ImGui::Text("%u", i); + ImGui::TableSetColumnIndex(1); + ImGui::Text("%.6f", m_GPUOutResulData.triangleFan.solidAngles[i]); + } + ImGui::Text("Total: %.6f", m_GPUOutResulData.triangleFan.totalSolidAngles); + ImGui::EndTable(); + } + } + + { + float32_t3 xAxis = m_OBBModelMatrix[0].xyz; + float32_t3 yAxis = m_OBBModelMatrix[1].xyz; + float32_t3 zAxis = m_OBBModelMatrix[2].xyz; + + float32_t3 nx = normalize(xAxis); + float32_t3 ny = normalize(yAxis); + float32_t3 nz = normalize(zAxis); + + const float epsilon = 1e-4; + bool hasSkew = false; + if (abs(dot(nx, ny)) > epsilon || abs(dot(nx, nz)) > epsilon || abs(dot(ny, nz)) > epsilon) + hasSkew = true; + ImGui::Separator(); + ImGui::Text("Matrix Has Skew: %s", hasSkew ? 
"true" : "false"); + } + + static bool modalShown = false; + static bool modalDismissed = false; + static uint32_t lastSilhouetteIndex = ~0u; + + // Reset modal flags if silhouette configuration changed + if (m_GPUOutResulData.silhouette.silhouetteIndex != lastSilhouetteIndex) + { + modalShown = false; + modalDismissed = false; // Allow modal to show again for new configuration + lastSilhouetteIndex = m_GPUOutResulData.silhouette.silhouetteIndex; + } + + // Reset flags when mismatch is cleared + if (!m_GPUOutResulData.silhouette.edgeVisibilityMismatch && !m_GPUOutResulData.triangleFan.maxTrianglesExceeded && !m_GPUOutResulData.triangleFan.sphericalLuneDetected) + { + modalShown = false; + modalDismissed = false; + } + + // Open modal only if not already shown/dismissed + if ((m_GPUOutResulData.silhouette.edgeVisibilityMismatch || m_GPUOutResulData.triangleFan.maxTrianglesExceeded || m_GPUOutResulData.triangleFan.sphericalLuneDetected) && m_GPUOutResulData.silhouette.silhouetteIndex != 13 && !modalShown && !modalDismissed) // Don't reopen if user dismissed it + { + ImGui::OpenPopup("Edge Visibility Mismatch Warning"); + modalShown = true; + } + + // Modal popup + if (ImGui::BeginPopupModal("Edge Visibility Mismatch Warning", NULL, ImGuiWindowFlags_AlwaysAutoResize)) + { + ImGui::TextColored(ImVec4(1.0f, 0.5f, 0.0f, 1.0f), "Warning: Edge Visibility Mismatch Detected!"); + ImGui::Separator(); + ImGui::Text("The silhouette lookup table (LUT) does not match the computed edge visibility."); + ImGui::Text("This indicates the pre-computed silhouette data may be incorrect."); + ImGui::Spacing(); + ImGui::TextWrapped("Configuration Index: %u", m_GPUOutResulData.silhouette.silhouetteIndex); + ImGui::TextWrapped("Region: (%u, %u, %u)", m_GPUOutResulData.silhouette.region.x, m_GPUOutResulData.silhouette.region.y, m_GPUOutResulData.silhouette.region.z); + ImGui::Spacing(); + ImGui::Text("Mismatched Vertices (bitmask): 0x%08X", 
m_GPUOutResulData.silhouette.edgeVisibilityMismatch); + ImGui::Text("Vertices involved in mismatched edges:"); + ImGui::Indent(); + for (int i = 0; i < 8; i++) + { + if (m_GPUOutResulData.silhouette.edgeVisibilityMismatch & (1u << i)) + { + ImGui::BulletText("Vertex %d", i); + } + } + ImGui::Unindent(); + ImGui::Spacing(); + if (ImGui::Button("OK", ImVec2(120, 0))) + { + ImGui::CloseCurrentPopup(); + modalShown = false; + modalDismissed = true; // Mark as dismissed to prevent reopening + } + ImGui::EndPopup(); + } + } + ImGui::End(); + } + + // view matrices editor + { + ImGui::Begin("Matrices"); + + auto addMatrixTable = [&](const char* topText, const char* tableName, const int rows, const int columns, const float* pointer, const bool withSeparator = true) + { + ImGui::Text(topText); + if (ImGui::BeginTable(tableName, columns)) + { + for (int y = 0; y < rows; ++y) + { + ImGui::TableNextRow(); + for (int x = 0; x < columns; ++x) + { + ImGui::TableSetColumnIndex(x); + ImGui::Text("%.3f", *(pointer + (y * columns) + x)); + } + } + ImGui::EndTable(); + } + + if (withSeparator) + ImGui::Separator(); + }; + + static RandomSampler rng(0x45); // Initialize RNG with seed + + // Helper function to check if cube intersects unit sphere at origin + auto isCubeOutsideUnitSphere = [](const float32_t3& translation, const float32_t3& scale) -> bool + { + float cubeRadius = glm::length(scale) * 0.5f; + float distanceToCenter = glm::length(translation); + return (distanceToCenter - cubeRadius) > 1.0f; + }; + + static TRS lastTRS = {}; + if (ImGui::Button("Randomize Translation")) + { + lastTRS = m_TRS; // Backup before randomizing + int attempts = 0; + do + { + m_TRS.translation = float32_t3(rng.nextFloat(-3.f, 3.f), rng.nextFloat(-3.f, 3.f), rng.nextFloat(-1.f, 3.f)); + attempts++; + } while (!isCubeOutsideUnitSphere(m_TRS.translation, m_TRS.scale) && attempts < 100); + } + ImGui::SameLine(); + if (ImGui::Button("Randomize Rotation")) + { + lastTRS = m_TRS; // Backup before 
randomizing + m_TRS.rotation = float32_t3(rng.nextFloat(-180.f, 180.f), rng.nextFloat(-180.f, 180.f), rng.nextFloat(-180.f, 180.f)); + } + ImGui::SameLine(); + if (ImGui::Button("Randomize Scale")) + { + lastTRS = m_TRS; // Backup before randomizing + int attempts = 0; + do + { + m_TRS.scale = float32_t3(rng.nextFloat(0.5f, 2.0f), rng.nextFloat(0.5f, 2.0f), rng.nextFloat(0.5f, 2.0f)); + attempts++; + } while (!isCubeOutsideUnitSphere(m_TRS.translation, m_TRS.scale) && attempts < 100); + } + // ImGui::SameLine(); + if (ImGui::Button("Randomize All")) + { + lastTRS = m_TRS; // Backup before randomizing + int attempts = 0; + do + { + m_TRS.translation = float32_t3(rng.nextFloat(-3.f, 3.f), rng.nextFloat(-3.f, 3.f), rng.nextFloat(-1.f, 3.f)); + m_TRS.rotation = float32_t3(rng.nextFloat(-180.f, 180.f), rng.nextFloat(-180.f, 180.f), rng.nextFloat(-180.f, 180.f)); + m_TRS.scale = float32_t3(rng.nextFloat(0.5f, 2.0f), rng.nextFloat(0.5f, 2.0f), rng.nextFloat(0.5f, 2.0f)); + attempts++; + } while (!isCubeOutsideUnitSphere(m_TRS.translation, m_TRS.scale) && attempts < 100); + } + ImGui::SameLine(); + if (ImGui::Button("Revert to Last")) + { + m_TRS = lastTRS; // Restore backed-up TRS + } + + addMatrixTable("Model Matrix", "ModelMatrixTable", 4, 4, &m_OBBModelMatrix[0][0]); + addMatrixTable("Camera View Matrix", "ViewMatrixTable", 3, 4, &camera.getViewMatrix()[0].x); + addMatrixTable("Camera View Projection Matrix", "ViewProjectionMatrixTable", 4, 4, &camera.getProjectionMatrix()[0].x, false); + + ImGui::End(); + } + + // Nabla Imgui backend MDI buffer info + // To be 100% accurate and not overly conservative we'd have to explicitly `cull_frees` and defragment each time, + // so unless you do that, don't use this basic info to optimize the size of your IMGUI buffer. 
+ { + auto* streaminingBuffer = imGUI->getStreamingBuffer(); + + const size_t total = streaminingBuffer->get_total_size(); // total memory range size for which allocation can be requested + const size_t freeSize = streaminingBuffer->getAddressAllocator().get_free_size(); // max total free bloock memory size we can still allocate from total memory available + const size_t consumedMemory = total - freeSize; // memory currently consumed by streaming buffer + + float freePercentage = 100.0f * (float)(freeSize) / (float)total; + float allocatedPercentage = (float)(consumedMemory) / (float)total; + + ImVec2 barSize = ImVec2(400, 30); + float windowPadding = 10.0f; + float verticalPadding = ImGui::GetStyle().FramePadding.y; + + ImGui::SetNextWindowSize(ImVec2(barSize.x + 2 * windowPadding, 110 + verticalPadding), ImGuiCond_Always); + ImGui::Begin("Nabla Imgui MDI Buffer Info", nullptr, ImGuiWindowFlags_NoResize | ImGuiWindowFlags_NoScrollbar); + + ImGui::Text("Total Allocated Size: %zu bytes", total); + ImGui::Text("In use: %zu bytes", consumedMemory); + ImGui::Text("Buffer Usage:"); + + ImGui::SetCursorPosX(windowPadding); + + if (freePercentage > 70.0f) + ImGui::PushStyleColor(ImGuiCol_PlotHistogram, ImVec4(0.0f, 1.0f, 0.0f, 0.4f)); // Green + else if (freePercentage > 30.0f) + ImGui::PushStyleColor(ImGuiCol_PlotHistogram, ImVec4(1.0f, 1.0f, 0.0f, 0.4f)); // Yellow + else + ImGui::PushStyleColor(ImGuiCol_PlotHistogram, ImVec4(1.0f, 0.0f, 0.0f, 0.4f)); // Red + + ImGui::ProgressBar(allocatedPercentage, barSize, ""); + + ImGui::PopStyleColor(); + + ImDrawList* drawList = ImGui::GetWindowDrawList(); + + ImVec2 progressBarPos = ImGui::GetItemRectMin(); + ImVec2 progressBarSize = ImGui::GetItemRectSize(); + + const char* text = "%.2f%% free"; + char textBuffer[64]; + snprintf(textBuffer, sizeof(textBuffer), text, freePercentage); + + ImVec2 textSize = ImGui::CalcTextSize(textBuffer); + ImVec2 textPos = ImVec2( + progressBarPos.x + (progressBarSize.x - textSize.x) * 0.5f, + 
progressBarPos.y + (progressBarSize.y - textSize.y) * 0.5f); + + ImVec4 bgColor = ImGui::GetStyleColorVec4(ImGuiCol_WindowBg); + drawList->AddRectFilled( + ImVec2(textPos.x - 5, textPos.y - 2), + ImVec2(textPos.x + textSize.x + 5, textPos.y + textSize.y + 2), + ImGui::GetColorU32(bgColor)); + + ImGui::SetCursorScreenPos(textPos); + ImGui::Text("%s", textBuffer); + + ImGui::Dummy(ImVec2(0.0f, verticalPadding)); + + ImGui::End(); + } + ImGui::End(); + + ImGuizmo::RecomposeMatrixFromComponents(&m_TRS.translation.x, &m_TRS.rotation.x, &m_TRS.scale.x, &m_OBBModelMatrix[0][0]); + } + + smart_refctd_ptr imGUI; + + // descriptor set + smart_refctd_ptr subAllocDS; + enum E_RENDER_VIEWS : uint8_t + { + ERV_MAIN_VIEW, + ERV_SOLID_ANGLE_VIEW, + Count + }; + SubAllocatedDescriptorSet::value_type renderColorViewDescIndices[E_RENDER_VIEWS::Count] = {SubAllocatedDescriptorSet::invalid_value, SubAllocatedDescriptorSet::invalid_value}; + // + Camera camera = Camera(cameraIntialPosition, cameraInitialTarget, {}, 1, 1, nbl::core::vectorSIMDf(0.0f, 0.0f, 1.0f)); + // mutables + struct TRS // Source of truth + { + float32_t3 translation {0.0f, 0.0f, 1.5f}; + float32_t3 rotation {0.0f}; // MUST stay orthonormal + float32_t3 scale {1.0f}; + } m_TRS; + float32_t4x4 m_OBBModelMatrix; // always overwritten from TRS + + // std::string_view objectName; + TransformRequestParams transformParams; + TransformReturnInfo mainViewTransformReturnInfo; + TransformReturnInfo solidAngleViewTransformReturnInfo; + + const static inline core::vectorSIMDf cameraIntialPosition {-3.0f, 6.0f, 3.0f}; + const static inline core::vectorSIMDf cameraInitialTarget {0.f, 0.0f, 3.f}; + const static inline core::vectorSIMDf cameraInitialUp {0.f, 0.f, 1.f}; + + float fov = 90.f, zNear = 0.1f, zFar = 10000.f, moveSpeed = 1.f, rotateSpeed = 1.f; + float viewWidth = 10.f; + // uint16_t gcIndex = {}; // note: this is dirty however since I assume only single object in scene I can leave it now, when this example is upgraded to 
support multiple objects this needs to be changed + bool isPerspective = true, isLH = true, flipGizmoY = true, move = true; + bool firstFrame = true; + + SolidAngleVisualizer* m_visualizer; + } interface; + + class SamplingBenchmark final + { + public: + SamplingBenchmark(SolidAngleVisualizer& base) + : m_api(base.m_api), m_device(base.m_device), m_logger(base.m_logger), m_visualizer(&base) + { + // setting up pipeline in the constructor + m_queueFamily = base.getComputeQueue()->getFamilyIndex(); + m_cmdpool = base.m_device->createCommandPool(m_queueFamily, IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + if (!m_cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &m_cmdbuf)) + base.logFail("Failed to create Command Buffers!\n"); + + // Load shaders, set up pipelines (one per sampling mode) + { + auto loadShader = [&](auto key) -> smart_refctd_ptr + { + IAssetLoader::SAssetLoadParams lp = {}; + lp.logger = base.m_logger.get(); + lp.workingDirectory = "app_resources"; + auto assetBundle = base.m_assetMgr->getAsset(key.data(), lp); + const auto assets = assetBundle.getContents(); + if (assets.empty()) + { + base.logFail("Could not load shader!"); + assert(0); + } + assert(assets.size() == 1); + auto shader = IAsset::castDown(assets[0]); + if (!shader) + base.logFail("Failed to load precompiled benchmark shader!\n"); + return shader; + }; + + const char* shaderNames[SAMPLING_MODE::Count] = { + "benchmark_tri_sa", + "benchmark_tri_psa", + "benchmark_para", + "benchmark_rectangle", + "benchmark_biquad", + "benchmark_bilinear", + "benchmark_proj_rectangle", + "benchmark_silhouette", + "benchmark_pyramid_creation", + }; + smart_refctd_ptr shaders[SAMPLING_MODE::Count] = { + loadShader(nbl::this_example::builtin::build::get_spirv_key<"benchmark_tri_sa">(m_device.get())), + loadShader(nbl::this_example::builtin::build::get_spirv_key<"benchmark_tri_psa">(m_device.get())), + 
loadShader(nbl::this_example::builtin::build::get_spirv_key<"benchmark_para">(m_device.get())), + loadShader(nbl::this_example::builtin::build::get_spirv_key<"benchmark_rectangle">(m_device.get())), + loadShader(nbl::this_example::builtin::build::get_spirv_key<"benchmark_biquad">(m_device.get())), + loadShader(nbl::this_example::builtin::build::get_spirv_key<"benchmark_bilinear">(m_device.get())), + loadShader(nbl::this_example::builtin::build::get_spirv_key<"benchmark_proj_rectangle">(m_device.get())), + loadShader(nbl::this_example::builtin::build::get_spirv_key<"benchmark_silhouette">(m_device.get())), + loadShader(nbl::this_example::builtin::build::get_spirv_key<"benchmark_pyramid_creation">(m_device.get())), + }; + + nbl::video::IGPUDescriptorSetLayout::SBinding bindings[1] = { + {.binding = 0, + .type = nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, + .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = ShaderStage::ESS_COMPUTE, + .count = 1}}; + smart_refctd_ptr dsLayout = base.m_device->createDescriptorSetLayout(bindings); + if (!dsLayout) + base.logFail("Failed to create a Descriptor Layout!\n"); + + SPushConstantRange pushConstantRanges[] = { + {.stageFlags = ShaderStage::ESS_COMPUTE, + .offset = 0, + .size = sizeof(BenchmarkPushConstants)}}; + m_pplnLayout = base.m_device->createPipelineLayout(pushConstantRanges, smart_refctd_ptr(dsLayout)); + if (!m_pplnLayout) + base.logFail("Failed to create a Pipeline Layout!\n"); + + for (uint32_t i = 0; i < SAMPLING_MODE::Count; i++) + { + IGPUComputePipeline::SCreationParams params = {}; + params.layout = m_pplnLayout.get(); + params.shader.entryPoint = "main"; + params.shader.shader = shaders[i].get(); + if (base.m_device->getEnabledFeatures().pipelineExecutableInfo) + { + params.flags |= IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_STATISTICS; + params.flags |= IGPUComputePipeline::SCreationParams::FLAGS::CAPTURE_INTERNAL_REPRESENTATIONS; + } + if 
(!base.m_device->createComputePipelines(nullptr, {¶ms, 1}, &m_pipelines[i])) + base.logFail("Failed to create pipelines (compile & link shaders)!\n"); + if (base.m_device->getEnabledFeatures().pipelineExecutableInfo) + { + m_pipelineReports[i] = system::to_string(m_pipelines[i]->getExecutableInfo()); + m_pipelineReportNames[i] = shaderNames[i]; + } + } + + // Allocate the memory + { + constexpr size_t BufferSize = BENCHMARK_WORKGROUP_COUNT * BENCHMARK_WORKGROUP_DIMENSION_SIZE_X * + BENCHMARK_WORKGROUP_DIMENSION_SIZE_Y * BENCHMARK_WORKGROUP_DIMENSION_SIZE_Z * sizeof(uint32_t); + + nbl::video::IGPUBuffer::SCreationParams params = {}; + params.size = BufferSize; + params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT; + smart_refctd_ptr dummyBuff = base.m_device->createBuffer(std::move(params)); + if (!dummyBuff) + base.logFail("Failed to create a GPU Buffer of size %d!\n", params.size); + + dummyBuff->setObjectDebugName("benchmark buffer"); + + nbl::video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = dummyBuff->getMemoryReqs(); + + m_allocation = base.m_device->allocate(reqs, dummyBuff.get(), nbl::video::IDeviceMemoryAllocation::EMAF_NONE); + if (!m_allocation.isValid()) + base.logFail("Failed to allocate Device Memory compatible with our GPU Buffer!\n"); + + assert(dummyBuff->getBoundMemory().memory == m_allocation.memory.get()); + smart_refctd_ptr pool = base.m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE, {&dsLayout.get(), 1}); + + m_ds = pool->createDescriptorSet(std::move(dsLayout)); + { + IGPUDescriptorSet::SDescriptorInfo info[1]; + info[0].desc = smart_refctd_ptr(dummyBuff); + info[0].info.buffer = {.offset = 0, .size = BufferSize}; + IGPUDescriptorSet::SWriteDescriptorSet writes[1] = { + {.dstSet = m_ds.get(), .binding = 0, .arrayElement = 0, .count = 1, .info = info}}; + base.m_device->updateDescriptorSets(writes, {}); + } + } + } + + IQueryPool::SCreationParams queryPoolCreationParams {}; + queryPoolCreationParams.queryType = 
IQueryPool::TYPE::TIMESTAMP; + queryPoolCreationParams.queryCount = 2; + queryPoolCreationParams.pipelineStatisticsFlags = IQueryPool::PIPELINE_STATISTICS_FLAGS::NONE; + m_queryPool = m_device->createQueryPool(queryPoolCreationParams); + + m_computeQueue = m_device->getQueue(m_queueFamily, 0); + m_physicalDevice = base.m_device->getPhysicalDevice(); + m_timestampPeriodNs = float64_t(m_physicalDevice->getLimits().timestampPeriodInNanoSeconds); + } + + void run() + { + // Pipeline executable reports first so the timings cluster at the bottom of the log. + for (uint32_t i = 0; i < SAMPLING_MODE::Count; i++) + { + if (!m_pipelineReports[i].empty()) + m_logger->log("%s Pipeline Executable Report:\n%s", ILogger::ELL_PERFORMANCE, m_pipelineReportNames[i], m_pipelineReports[i].c_str()); + } + + const uint64_t totalThreads = (uint64_t)BENCHMARK_WORKGROUP_COUNT * BENCHMARK_WORKGROUP_DIMENSION_SIZE_X; + m_logger->log("\n\n=== GPU Sampler Benchmarks (%d dispatches, %llu threads/dispatch, %d samples/thread, ps/sample is per all GPU threads) ===", + ILogger::ELL_PERFORMANCE, Dispatches, totalThreads, m_BenchmarkSampleCount); + m_logger->log(" timestampPeriod = %.1f ps/tick", ILogger::ELL_PERFORMANCE, m_timestampPeriodNs * 1000.0); + m_logger->log("%-28s | %-12s | %9s | %10s | %10s", + ILogger::ELL_PERFORMANCE, "Sampler", "Mode", "ps/sample", "GSamples/s", "ms total"); + + struct SamplerEntry + { + const char* name; + SAMPLING_MODE mode; + }; + const SamplerEntry samplers[] = { + {"PYRAMID_RECTANGLE", SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_RECTANGLE}, + {"PYRAMID_PROJ_RECTANGLE", SAMPLING_MODE::SYMMETRIC_PYRAMID_PROJECTED_SOLID_ANGLE_RECTANGLE}, + {"PYRAMID_BIQUADRATIC", SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_BIQUADRATIC}, + {"PYRAMID_BILINEAR", SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_BILINEAR}, + {"PARALLELOGRAM", SAMPLING_MODE::PROJECTED_PARALLELOGRAM_SOLID_ANGLE}, + {"TRIANGLE_SA", SAMPLING_MODE::TRIANGLE_SOLID_ANGLE}, + {"TRIANGLE_PSA", 
SAMPLING_MODE::TRIANGLE_PROJECTED_SOLID_ANGLE}, + }; + + // Creation-only modes: report per-creation, not per-sample. + performBenchmark("SILHOUETTE_CREATION_ONLY", SAMPLING_MODE::SILHOUETTE_CREATION_ONLY, totalThreads, 0); + performBenchmark("PYRAMID_CREATION_ONLY", SAMPLING_MODE::PYRAMID_CREATION_ONLY, totalThreads, 0); + + // Modes per sampler: 1 creation per N samples. 1 = no amortization, sampleCount = full amortization. + const uint32_t modeRatios[] = {1u, 16u, uint32_t(m_BenchmarkSampleCount)}; + for (uint32_t spc : modeRatios) + for (const auto& s : samplers) + performBenchmark(s.name, s.mode, totalThreads, spc); + + + } + + private: + // samplesPerCreation: > 0 selects sampling mode with that 1:N ratio; 0 means create-only mode (label "create-only"). + void performBenchmark(const char* name, SAMPLING_MODE mode, uint64_t totalThreads, uint32_t samplesPerCreation) + { + m_device->waitIdle(); + + const bool isCreationBenchmark = (mode == SAMPLING_MODE::SILHOUETTE_CREATION_ONLY || mode == SAMPLING_MODE::PYRAMID_CREATION_ONLY); + + m_pushConstants.modelMatrix = float32_t3x4(transpose(m_visualizer->interface.m_OBBModelMatrix)); + m_pushConstants.sampleCount = m_BenchmarkSampleCount; + // For create-only modes the inner loop is unused; pick any divisor of sampleCount to keep the shader's `creations = sampleCount / samplesPerCreation` well-defined. + m_pushConstants.samplesPerCreation = isCreationBenchmark ? uint32_t(m_BenchmarkSampleCount) : samplesPerCreation; + recordCmdBuff(mode); + + // Nabla's IQueue::submit rejects submissions without a signal semaphore + // (SSubmitInfo::valid() requires signalSemaphores non-empty so the + // submission's resources can be tracked on a timeline). 
+ smart_refctd_ptr done = m_device->createSemaphore(0); + const IQueue::SSubmitInfo::SSemaphoreInfo signals[] = {{.semaphore = done.get(), .value = 1, .stageMask = asset::PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS}}; + + IQueue::SSubmitInfo submitInfos[1] = {}; + const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[] = {{.cmdbuf = m_cmdbuf.get()}}; + submitInfos[0].commandBuffers = cmdbufs; + submitInfos[0].signalSemaphores = signals; + + m_api->startCapture(); + m_computeQueue->submit(submitInfos); + const ISemaphore::SWaitInfo waitInfo[] = {{.semaphore = done.get(), .value = 1}}; + m_device->blockForSemaphores(waitInfo); + m_api->endCapture(); + + const float64_t elapsed_ps = float64_t(calcTimeElapsed()) * m_timestampPeriodNs * 1000.0; + + const uint64_t totalOps = uint64_t(Dispatches) * totalThreads * uint64_t(m_BenchmarkSampleCount); + const float64_t ps_per_op = elapsed_ps / float64_t(totalOps); + const float64_t gops_per_s = float64_t(totalOps) / elapsed_ps * 1e3; // ops / (ps × 1e-12) / 1e9 + const float64_t elapsed_ms = elapsed_ps * 1e-9; + + char modeBuf[16]; + if (isCreationBenchmark) + snprintf(modeBuf, sizeof(modeBuf), "create-only"); + else + snprintf(modeBuf, sizeof(modeBuf), "1:%u", samplesPerCreation); + + m_logger->log("%-28s | %-12s | %9.2f | %10.2f | %10.3f", + ILogger::ELL_PERFORMANCE, name, modeBuf, ps_per_op, gops_per_s, elapsed_ms); + } + + void recordCmdBuff(SAMPLING_MODE mode) + { + m_cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); + m_cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + m_cmdbuf->resetQueryPool(m_queryPool.get(), 0, 2); + m_cmdbuf->beginDebugMarker("sampling compute dispatch", vectorSIMDf(0, 1, 0, 1)); + m_cmdbuf->bindComputePipeline(m_pipelines[mode].get()); + m_cmdbuf->bindDescriptorSets(nbl::asset::EPBP_COMPUTE, m_pplnLayout.get(), 0, 1, &m_ds.get()); + m_cmdbuf->pushConstants(m_pplnLayout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(BenchmarkPushConstants), &m_pushConstants); + 
+ // Serialize back-to-back dispatches so each completes before the next begins + // (matches the original semaphore-chain methodology — measurement is per-dispatch + // time, not pipelined throughput). + const asset::SMemoryBarrier serializeDispatch = { + .srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, + .srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS, + .dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, + .dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS, + }; + const IGPUCommandBuffer::SPipelineBarrierDependencyInfo barrierInfo = {.memBarriers = {&serializeDispatch, 1}}; + + for (int i = 0; i < WarmupDispatches; ++i) + { + m_cmdbuf->dispatch(BENCHMARK_WORKGROUP_COUNT, 1, 1); + m_cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE, barrierInfo); + } + + m_cmdbuf->writeTimestamp(PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 0); + + for (int i = 0; i < Dispatches; ++i) + { + m_cmdbuf->dispatch(BENCHMARK_WORKGROUP_COUNT, 1, 1); + if (i + 1 < Dispatches) + m_cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE, barrierInfo); + } + + m_cmdbuf->writeTimestamp(PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, m_queryPool.get(), 1); + m_cmdbuf->endDebugMarker(); + m_cmdbuf->end(); + } + + uint64_t calcTimeElapsed() + { + uint64_t timestamps[2]; + const core::bitflag flags = core::bitflag(IQueryPool::RESULTS_FLAGS::_64_BIT) | core::bitflag(IQueryPool::RESULTS_FLAGS::WAIT_BIT); + m_device->getQueryPoolResults(m_queryPool.get(), 0, 2, ×tamps, sizeof(uint64_t), flags); + return timestamps[1] - timestamps[0]; + } + + private: + core::smart_refctd_ptr m_api; + smart_refctd_ptr m_device; + smart_refctd_ptr m_logger; + SolidAngleVisualizer* m_visualizer; + + nbl::video::IDeviceMemoryAllocator::SAllocation m_allocation = {}; + smart_refctd_ptr m_cmdpool = nullptr; + smart_refctd_ptr m_cmdbuf = nullptr; + smart_refctd_ptr m_ds = nullptr; + smart_refctd_ptr m_pplnLayout = nullptr; + BenchmarkPushConstants m_pushConstants; + smart_refctd_ptr 
m_pipelines[SAMPLING_MODE::Count]; + + smart_refctd_ptr m_queryPool = nullptr; + + std::string m_pipelineReports[SAMPLING_MODE::Count]; + const char* m_pipelineReportNames[SAMPLING_MODE::Count] = {}; + + uint32_t m_queueFamily; + IQueue* m_computeQueue; + const nbl::video::IPhysicalDevice* m_physicalDevice = nullptr; + float64_t m_timestampPeriodNs = 1.0; + static constexpr int WarmupDispatches = 100; + static constexpr int Dispatches = 1000; + }; + + template + inline bool logFail(const char* msg, Args&&... args) + { + m_logger->log(msg, ILogger::ELL_ERROR, std::forward(args)...); + return false; + } + + std::ofstream m_logFile; }; NBL_MAIN_FUNC(SolidAngleVisualizer) \ No newline at end of file From f573c61bdbd5dc7ca0b5a446fb70bccd7ed31e2a Mon Sep 17 00:00:00 2001 From: Karim Mohamed Date: Tue, 28 Apr 2026 13:50:41 +0300 Subject: [PATCH 23/26] Kelvin Stokes centroid, new O(N) algo --- .../app_resources/hlsl/debug_vis.hlsl | 23 +- .../app_resources/hlsl/pyramid_sampling.hlsl | 352 ++++++++++-------- 73_SolidAngleVisualizer/main.cpp | 19 +- 3 files changed, 204 insertions(+), 190 deletions(-) diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/debug_vis.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/debug_vis.hlsl index 916390323..c34b76c65 100644 --- a/73_SolidAngleVisualizer/app_resources/hlsl/debug_vis.hlsl +++ b/73_SolidAngleVisualizer/app_resources/hlsl/debug_vis.hlsl @@ -19,9 +19,7 @@ struct DebugRecorder DebugDataBuffer[0].silhouette.clippedVertexIndices[slot] = originalIndex; } - static void recordClipResult(uint32_t vertexCount, uint32_t clipMask, uint32_t clipCount, - uint32_t rotatedClipMask, uint32_t rotateAmount, uint32_t positiveCount, - bool wrapAround, uint32_t rotatedSil) + static void recordClipResult(uint32_t vertexCount, uint32_t clipMask, uint32_t clipCount, uint32_t rotatedClipMask, uint32_t rotateAmount, uint32_t positiveCount, bool wrapAround, uint32_t rotatedSil) { DebugDataBuffer[0].silhouette.clippedVertexCount = vertexCount; 
DebugDataBuffer[0].silhouette.clipMask = clipMask; @@ -43,8 +41,7 @@ struct DebugRecorder DebugDataBuffer[0].triangleFan.solidAngles[tri] = solidAngles[tri]; } - static void recordParallelogram(float32_t area, uint32_t convexMask, uint32_t n3Mask, - float32_t2 corner, float32_t2 axisDir, float32_t width, float32_t height) + static void recordParallelogram(float32_t area, uint32_t convexMask, uint32_t n3Mask, float32_t2 corner, float32_t2 axisDir, float32_t width, float32_t height) { DebugDataBuffer[0].parallelogram.area = area; @@ -61,8 +58,7 @@ struct DebugRecorder DebugDataBuffer[0].parallelogram.corners[3] = corner + height * perpDir; } - static void recordPyramid(float32_t3 axis1, float32_t3 axis2, float32_t3 center, - float32_t4 bounds, float32_t solidAngle, uint32_t bestEdge) + static void recordPyramid(float32_t3 axis1, float32_t3 axis2, float32_t3 center, float32_t4 bounds, float32_t solidAngle, uint32_t bestEdge) { DebugDataBuffer[0].pyramid.axis1 = axis1; DebugDataBuffer[0].pyramid.axis2 = axis2; @@ -80,8 +76,7 @@ struct DebugRecorder static void recordSampleCount(uint32_t count) { DebugDataBuffer[0].sampling.sampleCount = count; } static void recordRay(uint32_t i, float32_t3 dir, float32_t pdf) { DebugDataBuffer[0].sampling.rayData[i] = float32_t4(dir, pdf); } - static void recordFrameEnd(uint32_t3 region, uint32_t configIndex, uint32_t silSize, - uint32_t silData, uint32_t vertexIndices[6], uint32_t validSampleCount) + static void recordFrameEnd(uint32_t3 region, uint32_t configIndex, uint32_t silSize, uint32_t silData, uint32_t vertexIndices[6], uint32_t validSampleCount) { InterlockedAdd(DebugDataBuffer[0].sampling.validSampleCount, validSampleCount); InterlockedAdd(DebugDataBuffer[0].sampling.threadCount, 1u); @@ -94,14 +89,10 @@ struct DebugRecorder } #else static void recordClippedVertex(uint32_t slot, float32_t3 pos, uint32_t originalIndex) {} - static void recordClipResult(uint32_t vertexCount, uint32_t clipMask, uint32_t clipCount, - uint32_t 
rotatedClipMask, uint32_t rotateAmount, uint32_t positiveCount, - bool wrapAround, uint32_t rotatedSil) {} + static void recordClipResult(uint32_t vertexCount, uint32_t clipMask, uint32_t clipCount, uint32_t rotatedClipMask, uint32_t rotateAmount, uint32_t positiveCount, bool wrapAround, uint32_t rotatedSil) {} static void recordTriangleFan(bool luneDetected, uint32_t count, float32_t totalWeight, float32_t solidAngles[5]) {} - static void recordParallelogram(float32_t area, uint32_t convexMask, uint32_t n3Mask, - float32_t2 corner, float32_t2 axisDir, float32_t width, float32_t height) {} - static void recordPyramid(float32_t3 axis1, float32_t3 axis2, float32_t3 center, - float32_t4 bounds, float32_t solidAngle, uint32_t bestEdge) {} + static void recordParallelogram(float32_t area, uint32_t convexMask, uint32_t n3Mask, float32_t2 corner, float32_t2 axisDir, float32_t width, float32_t height) {} + static void recordPyramid(float32_t3 axis1, float32_t3 axis2, float32_t3 center, float32_t4 bounds, float32_t solidAngle, uint32_t bestEdge) {} static void recordSampleCount(uint32_t count) {} static void recordRay(uint32_t i, float32_t3 dir, float32_t pdf) {} static void recordFrameEnd(uint32_t3 region, uint32_t configIndex, uint32_t silSize, diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling.hlsl index afd60914c..1c7f3aaba 100644 --- a/73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling.hlsl +++ b/73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling.hlsl @@ -7,8 +7,10 @@ #include "common.hlsl" #include +#include #include #include +#include // acos_csc_approx #include #include @@ -19,16 +21,24 @@ // Spherical Pyramid: gnomonic bounding rectangle for silhouette sampling. // // Algorithm (SphericalPyramid::create): -// 1. Adaptive axis3: blend silhouette centroid toward (0,0,1) to keep -// all vertices in the positive gnomonic half-space. Branchless. -// 2. 
Rotating calipers: try each edge projected perpendicular to axis3, -// keep the axis1/axis2 rotation with minimum gnomonic bounding area. -// Edge normals are fused into this pass (cross products from the same -// vertex loads). -// 3. Sign-stabilize axis1 against a world-space reference. +// 1. Pass 1: walk the silhouette CCW, accumulating +// unnormCentroid = sum(cross(v_i, v_{i+1}) * acos_csc_approx(dot(v_i, v_{i+1}))) +// which is the sum of normalized outward edge normals weighted by arc length +// (Kelvin-Stokes form). This is the true spherical centroid of the polygon +// and serves as a much better gnomonic-projection axis than blending the raw +// vertex centroid toward (0,0,1). The cross products are also written into +// silEdgeNormals.edgeNormals[i] (used later by the inside-polygon test). +// 2. axis3 = normalize(unnormCentroid). +// 3. Pass 2: Frisvad basis (u, v) orthogonal to axis3; project all silhouette +// vertices to 2D gnomonic coordinates in (u, v) once, up front. +// 4. Pass 3: "guesstimate" calipers: pick the longest 2D edge as axis1, do +// a single bound pass. O(N) edge-length compares + 1 bound pass, vs the old +// O(N^2) cascade. The bound is slightly looser than the true min-area rect +// but the rejection sampler tolerates that. +// 5. Reconstruct 3D axis1, axis2; sign-stabilize axis1 against a world ref. // // axis3 is not stored, reconstructed as cross(axis1, axis2). -// rectR0 is float2 (z is always 1.0 in gnomonic space). +// rectR0 is float2 (z is always 1.0 in the local gnomonic frame). 
// ============================================================================ struct SphericalPyramid { @@ -40,127 +50,93 @@ struct SphericalPyramid float32_t3 getAxis3() NBL_CONST_MEMBER_FUNC { return cross(axis1, axis2); } // ======================================================================== - // Gnomonic Projection + // Pass 1: per-edge cross + arc-length-weighted accumulate // ======================================================================== - template - static void projectAndBound(const float32_t3 vertices[MAX_SILHOUETTE_VERTICES], float32_t3 projAxis1, float32_t3 projAxis2, float32_t3 projAxis3, NBL_REF_ARG(float32_t4) bound) + template + static void accumulateEdge(NBL_CONST_REF_ARG(ClippedSilhouette) silhouette, NBL_REF_ARG(float32_t3) unnormCentroid, NBL_REF_ARG(SilEdgeNormals) silEdgeNormals) { - float32_t3 v = vertices[I]; - float32_t x = dot(v, projAxis1); - float32_t y = dot(v, projAxis2); - float32_t z = dot(v, projAxis3); - float32_t rcpZ = (z > 0.0f) ? rcp(z) : 0.0f; - float32_t projX = x * rcpZ; - float32_t projY = y * rcpZ; - bound.x = min(bound.x, projX); - bound.y = min(bound.y, projY); - bound.z = max(bound.z, projX); - bound.w = max(bound.w, projY); + const uint32_t j = CheckCount ? ((I + 1 < silhouette.count) ? I + 1 : 0) : I + 1; + float32_t3 vI = silhouette.vertices[I]; + float32_t3 vJ = silhouette.vertices[j]; + float32_t3 c = cross(vI, vJ); + silEdgeNormals.edgeNormals[I] = c; + // |c| = sin(arc) since vI, vJ are unit; so c/|c| * arc = c * acos(dot)/sin(arc) = c * acos_csc(dot). + // Clamp away from -1: acos_csc_approx contains log2(1+arg), which goes -inf at arg=-1 and + // produces inf-inf = NaN inside the order-2 polynomial for near-antipodal edges (which can + // occur for "wide" silhouettes whose adjacent vertices sit far apart on the sphere). 
+ // TODO: will be moved to it's own namespace + const float32_t cos_arc = max(dot(vI, vJ), -1.0f + 1e-5f); + unnormCentroid += c * nbl::hlsl::shapes::acos_csc_approx(cos_arc); } - // Template-unrolled projection of all vertices. - static void projectAllVertices(const ClippedSilhouette silhouette, float32_t3 projAxis1, float32_t3 projAxis2, float32_t3 projAxis3, NBL_REF_ARG(float32_t4) bound) + // ======================================================================== + // Pass 2: gnomonic project a single silhouette vertex into the (u,v) plane. + // Skips the (w_dot > 0) guard, axis3 = normalize(unnormCentroid) is the + // polygon's interior direction so all vertices have w_dot > 0 by construction. + // ======================================================================== + template + static float32_t2 projectVertex2D(NBL_CONST_REF_ARG(ClippedSilhouette) silhouette, float32_t3 axis_u, float32_t3 axis_v, float32_t3 axis3) { - bound = float32_t4(1e10f, 1e10f, -1e10f, -1e10f); - projectAndBound<0>(silhouette.vertices, projAxis1, projAxis2, projAxis3, bound); - projectAndBound<1>(silhouette.vertices, projAxis1, projAxis2, projAxis3, bound); - projectAndBound<2>(silhouette.vertices, projAxis1, projAxis2, projAxis3, bound); - if (silhouette.count > 3) - { - projectAndBound<3>(silhouette.vertices, projAxis1, projAxis2, projAxis3, bound); - if (silhouette.count > 4) - { - projectAndBound<4>(silhouette.vertices, projAxis1, projAxis2, projAxis3, bound); - if (silhouette.count > 5) - { - projectAndBound<5>(silhouette.vertices, projAxis1, projAxis2, projAxis3, bound); - if (silhouette.count > 6) - { - projectAndBound<6>(silhouette.vertices, projAxis1, projAxis2, projAxis3, bound); - } - } - } - } + float32_t3 vert = silhouette.vertices[I]; + float32_t rcpW = rcp(dot(vert, axis3)); + return float32_t2(dot(vert, axis_u), dot(vert, axis_v)) * rcpW; } // ======================================================================== - // Adaptive Axis3 + // Pass 3: 2D 
rotating-calipers helpers // ======================================================================== - - // t = max blend keeping dot(v, centroid*t + (0,0,1)) >= margin. - template - static float32_t blendLimit(const float32_t3 vertices[MAX_SILHOUETTE_VERTICES], float32_t3 center, float32_t margin, float32_t curMin) + template + static void boundOne2D(const float32_t2 verts2d[MAX_SILHOUETTE_VERTICES], float32_t2 axis2d, float32_t2 perp2d, NBL_REF_ARG(float32_t4) bound) { - float32_t cd = dot(vertices[I], center); - float32_t tLimit = (cd < 0.0f) ? ((vertices[I].z - margin) / -cd) : 1e10f; - return min(curMin, tLimit); + float32_t2 v2 = verts2d[K]; + float32_t x = dot(v2, axis2d); + float32_t y = dot(v2, perp2d); + bound.x = min(bound.x, x); + bound.y = min(bound.y, y); + bound.z = max(bound.z, x); + bound.w = max(bound.w, y); } - static float32_t computeBlendFactor(NBL_CONST_REF_ARG(ClippedSilhouette) silhouette, float32_t3 center, float32_t margin) + static void computeBound2D(const float32_t2 verts2d[MAX_SILHOUETTE_VERTICES], uint32_t count, float32_t2 axis2d, float32_t2 perp2d, NBL_REF_ARG(float32_t4) bound) { - float32_t t = 1e10f; - t = blendLimit<0>(silhouette.vertices, center, margin, t); - t = blendLimit<1>(silhouette.vertices, center, margin, t); - t = blendLimit<2>(silhouette.vertices, center, margin, t); - if (silhouette.count > 3) + bound = float32_t4(1e10f, 1e10f, -1e10f, -1e10f); + boundOne2D<0>(verts2d, axis2d, perp2d, bound); + boundOne2D<1>(verts2d, axis2d, perp2d, bound); + boundOne2D<2>(verts2d, axis2d, perp2d, bound); + if (count > 3) { - t = blendLimit<3>(silhouette.vertices, center, margin, t); - if (silhouette.count > 4) + boundOne2D<3>(verts2d, axis2d, perp2d, bound); + if (count > 4) { - t = blendLimit<4>(silhouette.vertices, center, margin, t); - if (silhouette.count > 5) + boundOne2D<4>(verts2d, axis2d, perp2d, bound); + if (count > 5) { - t = blendLimit<5>(silhouette.vertices, center, margin, t); - if (silhouette.count > 6) - { - t = 
blendLimit<6>(silhouette.vertices, center, margin, t); - } + boundOne2D<5>(verts2d, axis2d, perp2d, bound); + if (count > 6) + boundOne2D<6>(verts2d, axis2d, perp2d, bound); } } } - return max(t, 0.0f); } - // ======================================================================== - // Rotating Calipers (fused edge normal computation) - // ======================================================================== + // "Guesstimate" pass 3: pick the longest 2D edge as axis1 and do ONE bound + // computation, instead of trying every edge as a caliper candidate. O(N) + + // one bound pass, vs old O(N^2) of bound passes. The bound is slightly + // looser than the true min-area rect (typically a few percent for OBB + // silhouettes), but the rejection sampler tolerates that. template - static void tryCaliperCandidate(NBL_CONST_REF_ARG(ClippedSilhouette) silhouette, float32_t3 fixedAxis3, - NBL_REF_ARG(float32_t) bestArea, NBL_REF_ARG(float32_t3) bestAxis1, - NBL_REF_ARG(float32_t3) bestAxis2, NBL_REF_ARG(float32_t4) bestBound, - NBL_REF_ARG(uint32_t) bestEdge, NBL_REF_ARG(SilEdgeNormals) silEdgeNormals) + static void considerEdge(const float32_t2 verts2d[MAX_SILHOUETTE_VERTICES], uint32_t count, NBL_REF_ARG(float32_t) bestLenSq, NBL_REF_ARG(float32_t2) bestEdge2d, NBL_REF_ARG(uint32_t) bestEdge) { - const uint32_t j = CheckCount ? ((I + 1 < silhouette.count) ? I + 1 : 0) : I + 1; - float32_t3 vI = silhouette.vertices[I]; - float32_t3 vJ = silhouette.vertices[j]; - - // Fused: edge normal from the same vertex pair (vertices already in registers) - silEdgeNormals.edgeNormals[I] = cross(vI, vJ); - - float32_t3 edge = vJ - vI; - - // Project edge perpendicular to axis3. Skip edges nearly parallel to axis3. 
- float32_t3 edgeInPlane = edge - fixedAxis3 * dot(edge, fixedAxis3); - float32_t lenSq = dot(edgeInPlane, edgeInPlane); - if (lenSq < 0.01f * dot(edge, edge)) - return; - - float32_t3 axis1Cand = edgeInPlane * rsqrt(lenSq); - float32_t3 axis2Cand = cross(fixedAxis3, axis1Cand); - - float32_t4 bound; - projectAllVertices(silhouette, axis1Cand, axis2Cand, fixedAxis3, bound); - - // Sticky selection: new edge must be meaningfully better (1% smaller area) - // to prevent jitter when two edges have nearly identical bounding rects. - float32_t rectArea = (bound.z - bound.x) * (bound.w - bound.y); - if (rectArea < bestArea * (1.0f - 1e-2f)) + const uint32_t j = CheckCount ? ((I + 1 < count) ? I + 1 : 0) : I + 1; + float32_t2 edge2d = verts2d[j] - verts2d[I]; + float32_t lenSq = dot(edge2d, edge2d); + // Sticky 1% threshold (in lenSq, ~0.5% in length) prevents axis1 from flipping + // between two near-equal-length edges as the silhouette deforms. + if (lenSq > bestLenSq * (1.0f + 1e-2f)) { - bestArea = rectArea; - bestAxis1 = axis1Cand; - bestAxis2 = axis2Cand; - bestBound = bound; - bestEdge = I; + bestLenSq = lenSq; + bestEdge2d = edge2d; + bestEdge = I; } } @@ -173,74 +149,122 @@ struct SphericalPyramid SphericalPyramid self; silEdgeNormals = (SilEdgeNormals)0; - // Step 1: Adaptive axis3 (local var, reconstructed via getAxis3() after construction). - float32_t3 center = silhouette.getUnnormalizedCenter(); - const float32_t AXIS3_MARGIN = 0.15f; - float32_t tBlend = computeBlendFactor(silhouette, center, AXIS3_MARGIN); - float32_t3 axis3 = normalize(center * tBlend + float32_t3(0.0f, 0.0f, 1.0f)); - - // Step 2: Rotating calipers, min-area gnomonic bounding rectangle. - float32_t bestArea = 1e20f; - self.axis1 = float32_t3(0.0f, 1.0f, 0.0f); - self.axis2 = float32_t3(-1.0f, 0.0f, 0.0f); - float32_t4 bounds = float32_t4(-0.1f, -0.1f, 0.1f, 0.1f); - uint32_t bestEdge = 0; - - // Each candidate also computes cross(v[I], v[j]) for edge normals. 
- // I=2 needs the wrap check because count can be exactly 3 (j must wrap to 0). - tryCaliperCandidate<0>(silhouette, axis3, bestArea, self.axis1, self.axis2, bounds, bestEdge, silEdgeNormals); - tryCaliperCandidate<1>(silhouette, axis3, bestArea, self.axis1, self.axis2, bounds, bestEdge, silEdgeNormals); - tryCaliperCandidate<2, true>(silhouette, axis3, bestArea, self.axis1, self.axis2, bounds, bestEdge, silEdgeNormals); + // Pass 1: build unnormCentroid (true spherical centroid) and edgeNormals. + // Seed with a tiny scaled vertex centroid so symmetric / near-cancelling + // shapes don't degenerate to a zero direction on `normalize`. + float32_t3 unnormCentroid = silhouette.getUnnormalizedCenter() * 1e-6f; + + // Count-cascade: silhouette.vertices[I] for I >= count is uninitialized in some + // call sites (e.g. solid_angle_vis.frag.hlsl declares ClippedSilhouette without + // zero-init), so we must NOT read past count. I=2 needs the wrap check because + // count can be exactly 3 (j must wrap to 0). 
+ accumulateEdge<0>(silhouette, unnormCentroid, silEdgeNormals); + accumulateEdge<1>(silhouette, unnormCentroid, silEdgeNormals); + accumulateEdge<2, true>(silhouette, unnormCentroid, silEdgeNormals); if (silhouette.count > 3) { - tryCaliperCandidate<3, true>(silhouette, axis3, bestArea, self.axis1, self.axis2, bounds, bestEdge, silEdgeNormals); + accumulateEdge<3, true>(silhouette, unnormCentroid, silEdgeNormals); if (silhouette.count > 4) { - tryCaliperCandidate<4, true>(silhouette, axis3, bestArea, self.axis1, self.axis2, bounds, bestEdge, silEdgeNormals); + accumulateEdge<4, true>(silhouette, unnormCentroid, silEdgeNormals); if (silhouette.count > 5) { - tryCaliperCandidate<5, true>(silhouette, axis3, bestArea, self.axis1, self.axis2, bounds, bestEdge, silEdgeNormals); + accumulateEdge<5, true>(silhouette, unnormCentroid, silEdgeNormals); if (silhouette.count > 6) - { - tryCaliperCandidate<6, true>(silhouette, axis3, bestArea, self.axis1, self.axis2, bounds, bestEdge, silEdgeNormals); - } + accumulateEdge<6, true>(silhouette, unnormCentroid, silEdgeNormals); } } } - // Step 3: Stabilize axis1 sign against a world-space reference. + const float32_t3 axis3 = normalize(-unnormCentroid); + + // Pass 2: Frisvad basis + 2D gnomonic projection (one-time, before calipers). + float32_t3 u, v; + nbl::hlsl::math::frisvad(axis3, u, v); + + // Project only the first `count` vertices; entries past `count` are unread by + // try2DCaliper since its cascade is also count-gated. 
+ float32_t2 verts2d[MAX_SILHOUETTE_VERTICES]; + verts2d[0] = projectVertex2D<0>(silhouette, u, v, axis3); + verts2d[1] = projectVertex2D<1>(silhouette, u, v, axis3); + verts2d[2] = projectVertex2D<2>(silhouette, u, v, axis3); + if (silhouette.count > 3) { - float32_t3 worldRef = nbl::hlsl::select(abs(axis3.x) < 0.9f, float32_t3(1.0f, 0.0f, 0.0f), float32_t3(0.0f, 1.0f, 0.0f)); - float32_t3 axis1Ref = worldRef - axis3 * dot(worldRef, axis3); - if (dot(self.axis1, axis1Ref) < 0.0f) + verts2d[3] = projectVertex2D<3>(silhouette, u, v, axis3); + if (silhouette.count > 4) { - self.axis1 = -self.axis1; - // axis2 also flips (recomputed below), so mirror both x and y bounds. - bounds = float32_t4(-bounds.z, -bounds.w, -bounds.x, -bounds.y); + verts2d[4] = projectVertex2D<4>(silhouette, u, v, axis3); + if (silhouette.count > 5) + { + verts2d[5] = projectVertex2D<5>(silhouette, u, v, axis3); + if (silhouette.count > 6) + verts2d[6] = projectVertex2D<6>(silhouette, u, v, axis3); + } } } - // Step 4: Recompute axis2 so getAxis3() = cross(axis1, axis2) recovers axis3. - self.axis2 = cross(axis3, self.axis1); + // Pass 3: pick longest 2D edge as axis1 ("guesstimate" rotating calipers). + // O(N) edge-length comparisons, then ONE bound pass after the winner is known. 
+ float32_t bestLenSq = 0.0f; + float32_t2 bestEdge2d = float32_t2(1.0f, 0.0f); + uint32_t bestEdge = 0; - // Degenerate bounds check - if (bounds.x >= bounds.z || bounds.y >= bounds.w) - bounds = float32_t4(-0.1f, -0.1f, 0.1f, 0.1f); - - self.rectR0 = bounds.xy; - self.rectExtents = float32_t2(bounds.zw - bounds.xy); + considerEdge<0>(verts2d, silhouette.count, bestLenSq, bestEdge2d, bestEdge); + considerEdge<1>(verts2d, silhouette.count, bestLenSq, bestEdge2d, bestEdge); + considerEdge<2, true>(verts2d, silhouette.count, bestLenSq, bestEdge2d, bestEdge); + if (silhouette.count > 3) + { + considerEdge<3, true>(verts2d, silhouette.count, bestLenSq, bestEdge2d, bestEdge); + if (silhouette.count > 4) + { + considerEdge<4, true>(verts2d, silhouette.count, bestLenSq, bestEdge2d, bestEdge); + if (silhouette.count > 5) + { + considerEdge<5, true>(verts2d, silhouette.count, bestLenSq, bestEdge2d, bestEdge); + if (silhouette.count > 6) + considerEdge<6, true>(verts2d, silhouette.count, bestLenSq, bestEdge2d, bestEdge); + } + } + } - float32_t solidAngle; + // Single bound pass with the winning edge as axis1. Fall back to (1,0) if + // every edge degenerated (silhouette projects to a single point). + const float32_t2 bestAxis2d = bestLenSq > 1e-12f ? bestEdge2d * rsqrt(bestLenSq) : float32_t2(1.0f, 0.0f); + const float32_t2 bestPerp2d = float32_t2(-bestAxis2d.y, bestAxis2d.x); + float32_t4 bestBound; + computeBound2D(verts2d, silhouette.count, bestAxis2d, bestPerp2d, bestBound); + + // Pass 4: reconstruct 3D, sign-stabilize axis1 against a world reference. + // For right-handed (u, v, axis3) Frisvad basis, cross(axis3, u) = v and cross(axis3, v) = -u, + // so axis1 = u*a + v*b => axis2 = cross(axis3, axis1) = v*a - u*b. Skip the 3D `cross`. 
+ const float32_t3 axis1Raw = u * bestAxis2d.x + v * bestAxis2d.y; + const float32_t3 axis2Raw = v * bestAxis2d.x - u * bestAxis2d.y; { - nbl::hlsl::sampling::SphericalRectangle rectSampler = nbl::hlsl::sampling::SphericalRectangle::create(float32_t3x3(self.axis1, self.axis2, self.getAxis3()), float32_t3(self.rectR0, 1.0f), self.rectExtents); - solidAngle = rectSampler.solidAngle; + // Sign-stabilize axis1 against a world reference, branchless. + // axis1 is already perpendicular to axis3, so dot(axis1, worldRef - axis3*dot(worldRef,axis3)) + // == dot(axis1, worldRef). Flipping axis1 also flips axis2 (both negate together since + // axis2 = cross(axis3, axis1)); mirror both x and y bounds simultaneously. + const float32_t3 worldRef = nbl::hlsl::select(abs(axis3.x) < 0.9f, float32_t3(1.0f, 0.0f, 0.0f), float32_t3(0.0f, 1.0f, 0.0f)); + const bool flip = dot(axis1Raw, worldRef) < 0.0f; + self.axis1 = nbl::hlsl::select(flip, -axis1Raw, axis1Raw); + self.axis2 = nbl::hlsl::select(flip, -axis2Raw, axis2Raw); + bestBound = nbl::hlsl::select(flip, float32_t4(-bestBound.z, -bestBound.w, -bestBound.x, -bestBound.y), bestBound); } - VisContext::add(SphereDrawer::drawDot(normalize(center), 0.05f, 0.0f, float32_t3(1.0f, 0.0f, 1.0f))); + // Degenerate bounds fallback (branchless). 
+ const bool degenerateBounds = bestBound.x >= bestBound.z || bestBound.y >= bestBound.w; + bestBound = nbl::hlsl::select(degenerateBounds, float32_t4(-0.1f, -0.1f, 0.1f, 0.1f), bestBound); + + self.rectR0 = bestBound.xy; + self.rectExtents = float32_t2(bestBound.zw - bestBound.xy); + + VisContext::add(SphereDrawer::drawDot(normalize(-unnormCentroid), 0.05f, 0.0f, float32_t3(1.0f, 0.0f, 1.0f))); VisContext::add(SphereDrawer::visualizeBestCaliperEdge(silhouette, bestEdge)); self.visualize(); - - DebugRecorder::recordPyramid(self.axis1, self.axis2, center, bounds, solidAngle, bestEdge); + + // DCE + nbl::hlsl::sampling::SphericalRectangle rectSampler = nbl::hlsl::sampling::SphericalRectangle::create(float32_t3x3(self.axis1, self.axis2, axis3), float32_t3(self.rectR0, 1.0f), self.rectExtents); + DebugRecorder::recordPyramid(self.axis1, self.axis2, -unnormCentroid, bestBound, rectSampler.solidAngle, bestEdge); return self; } @@ -256,28 +280,28 @@ struct SphericalPyramid float32_t3 boundColor2 = float32_t3(0.5f, 0.5f, 1.0f); // Light blue for axis2 bounds float32_t3 centerColor = float32_t3(1.0f, 1.0f, 0.0f); // Yellow for center - float32_t3 a3 = getAxis3(); - float32_t x0 = rectR0.x; - float32_t x1 = rectR0.x + rectExtents.x; - float32_t y0 = rectR0.y; - float32_t y1 = rectR0.y + rectExtents.y; - const float32_t z = 1.0f; + float32_t3 a3 = getAxis3(); + float32_t x0 = rectR0.x; + float32_t x1 = rectR0.x + rectExtents.x; + float32_t y0 = rectR0.y; + float32_t y1 = rectR0.y + rectExtents.y; + const float32_t z = 1.0f; // Great circle normals for the 4 edges (in local frame, then transform to world) float32_t3 bottomNormalLocal = normalize(float32_t3(0, -z, y0)); - float32_t3 topNormalLocal = normalize(float32_t3(0, z, -y1)); - float32_t3 leftNormalLocal = normalize(float32_t3(-z, 0, x0)); - float32_t3 rightNormalLocal = normalize(float32_t3(z, 0, -x1)); + float32_t3 topNormalLocal = normalize(float32_t3(0, z, -y1)); + float32_t3 leftNormalLocal = 
normalize(float32_t3(-z, 0, x0)); + float32_t3 rightNormalLocal = normalize(float32_t3(z, 0, -x1)); // Transform to world space float32_t3 bottomNormal = bottomNormalLocal.x * axis1 + bottomNormalLocal.y * axis2 + bottomNormalLocal.z * a3; - float32_t3 topNormal = topNormalLocal.x * axis1 + topNormalLocal.y * axis2 + topNormalLocal.z * a3; - float32_t3 leftNormal = leftNormalLocal.x * axis1 + leftNormalLocal.y * axis2 + leftNormalLocal.z * a3; - float32_t3 rightNormal = rightNormalLocal.x * axis1 + rightNormalLocal.y * axis2 + rightNormalLocal.z * a3; + float32_t3 topNormal = topNormalLocal.x * axis1 + topNormalLocal.y * axis2 + topNormalLocal.z * a3; + float32_t3 leftNormal = leftNormalLocal.x * axis1 + leftNormalLocal.y * axis2 + leftNormalLocal.z * a3; + float32_t3 rightNormal = rightNormalLocal.x * axis1 + rightNormalLocal.y * axis2 + rightNormalLocal.z * a3; // Draw center point (center of the rectangle projected onto sphere) - float32_t centerX = (x0 + x1) * 0.5f; - float32_t centerY = (y0 + y1) * 0.5f; + float32_t centerX = (x0 + x1) * 0.5f; + float32_t centerY = (y0 + y1) * 0.5f; float32_t3 centerLocal = normalize(float32_t3(centerX, centerY, z)); float32_t3 centerWorld = centerLocal.x * axis1 + centerLocal.y * axis2 + centerLocal.z * a3; diff --git a/73_SolidAngleVisualizer/main.cpp b/73_SolidAngleVisualizer/main.cpp index a1441d9bd..ecc3694f5 100644 --- a/73_SolidAngleVisualizer/main.cpp +++ b/73_SolidAngleVisualizer/main.cpp @@ -1599,8 +1599,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR // Allocate the memory { - constexpr size_t BufferSize = BENCHMARK_WORKGROUP_COUNT * BENCHMARK_WORKGROUP_DIMENSION_SIZE_X * - BENCHMARK_WORKGROUP_DIMENSION_SIZE_Y * BENCHMARK_WORKGROUP_DIMENSION_SIZE_Z * sizeof(uint32_t); + constexpr size_t BufferSize = BENCHMARK_WORKGROUP_COUNT * BENCHMARK_WORKGROUP_DIMENSION_SIZE_X * BENCHMARK_WORKGROUP_DIMENSION_SIZE_Y * BENCHMARK_WORKGROUP_DIMENSION_SIZE_Z * sizeof(uint32_t); 
nbl::video::IGPUBuffer::SCreationParams params = {}; params.size = BufferSize; @@ -1665,13 +1664,13 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR SAMPLING_MODE mode; }; const SamplerEntry samplers[] = { - {"PYRAMID_RECTANGLE", SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_RECTANGLE}, - {"PYRAMID_PROJ_RECTANGLE", SAMPLING_MODE::SYMMETRIC_PYRAMID_PROJECTED_SOLID_ANGLE_RECTANGLE}, - {"PYRAMID_BIQUADRATIC", SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_BIQUADRATIC}, - {"PYRAMID_BILINEAR", SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_BILINEAR}, - {"PARALLELOGRAM", SAMPLING_MODE::PROJECTED_PARALLELOGRAM_SOLID_ANGLE}, - {"TRIANGLE_SA", SAMPLING_MODE::TRIANGLE_SOLID_ANGLE}, - {"TRIANGLE_PSA", SAMPLING_MODE::TRIANGLE_PROJECTED_SOLID_ANGLE}, + {.name = "PYRAMID_RECTANGLE", .mode = SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_RECTANGLE}, + {.name = "PYRAMID_PROJ_RECTANGLE", .mode = SAMPLING_MODE::SYMMETRIC_PYRAMID_PROJECTED_SOLID_ANGLE_RECTANGLE}, + {.name = "PYRAMID_BIQUADRATIC", .mode = SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_BIQUADRATIC}, + {.name = "PYRAMID_BILINEAR", .mode = SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_BILINEAR}, + {.name = "PARALLELOGRAM", .mode = SAMPLING_MODE::PROJECTED_PARALLELOGRAM_SOLID_ANGLE}, + {.name = "TRIANGLE_SA", .mode = SAMPLING_MODE::TRIANGLE_SOLID_ANGLE}, + {.name = "TRIANGLE_PSA", .mode = SAMPLING_MODE::TRIANGLE_PROJECTED_SOLID_ANGLE}, }; // Creation-only modes: report per-creation, not per-sample. 
@@ -1776,7 +1775,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR m_cmdbuf->end(); } - uint64_t calcTimeElapsed() + uint64_t calcTimeElapsed() const { uint64_t timestamps[2]; const core::bitflag flags = core::bitflag(IQueryPool::RESULTS_FLAGS::_64_BIT) | core::bitflag(IQueryPool::RESULTS_FLAGS::WAIT_BIT); From f55fc3f83f8d93c438c20f4b911193ea55bc7f2c Mon Sep 17 00:00:00 2001 From: Karim Mohamed Date: Tue, 28 Apr 2026 14:25:09 +0300 Subject: [PATCH 24/26] fix CI --- 73_SolidAngleVisualizer/CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/73_SolidAngleVisualizer/CMakeLists.txt b/73_SolidAngleVisualizer/CMakeLists.txt index 6dbc19664..067395863 100644 --- a/73_SolidAngleVisualizer/CMakeLists.txt +++ b/73_SolidAngleVisualizer/CMakeLists.txt @@ -14,6 +14,9 @@ if(NBL_BUILD_IMGUI) Nabla::ext::FullScreenTriangle ) + # TODO; Arek I removed `NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET` from the last parameter here, doesn't this macro have 4 arguments anyway !? + nbl_create_executable_project("${NBL_EXTRA_SOURCES}" "" "${NBL_INCLUDE_SERACH_DIRECTORIES}" "${NBL_LIBRARIES}") + if(NBL_EMBED_BUILTIN_RESOURCES) set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData) set(RESOURCE_DIR "app_resources") @@ -33,9 +36,6 @@ if(NBL_BUILD_IMGUI) LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_}) endif() - # TODO; Arek I removed `NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET` from the last parameter here, doesn't this macro have 4 arguments anyway !? 
- nbl_create_executable_project("${NBL_EXTRA_SOURCES}" "" "${NBL_INCLUDE_SERACH_DIRECTORIES}" "${NBL_LIBRARIES}") - # TODO: Arek temporarily disabled cause I haven't figured out how to make this target yet # LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} nblExamplesGeometrySpirvBRD) set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/auto-gen") From 8edcb1a6d3828c6c0a8fafc8a7ae266978641283 Mon Sep 17 00:00:00 2001 From: Karim Mohamed Date: Tue, 5 May 2026 04:04:35 +0300 Subject: [PATCH 25/26] Samplers now conform to `TractableSampler` concept, way more organized code, less code duplication, faster pyramid creation, and - Way more organized code - Less code duplication - Faster pyramid creation - Faster silhouette creation - Added back the rotating calipers with Lexell version (O(N*(N-2)), templated bool) --- 73_SolidAngleVisualizer/CMakeLists.txt | 68 +-- .../app_resources/hlsl/Drawing.hlsl | 160 ++--- .../hlsl/benchmark/benchmark.comp.hlsl | 282 ++++----- .../app_resources/hlsl/common.hlsl | 70 ++- .../app_resources/hlsl/debug_vis.hlsl | 15 +- .../hlsl/parallelogram_sampling.hlsl | 244 +++++--- .../app_resources/hlsl/pyramid_sampling.hlsl | 557 +++++++++++------- .../hlsl/pyramid_sampling/bilinear.hlsl | 130 ++-- .../hlsl/pyramid_sampling/biquadratic.hlsl | 80 --- .../app_resources/hlsl/silhouette.hlsl | 415 +++++++------ .../hlsl/solid_angle_vis.frag.hlsl | 224 +++---- .../app_resources/hlsl/triangle_sampling.hlsl | 392 ++++++++---- 73_SolidAngleVisualizer/main.cpp | 341 ++++++++--- 13 files changed, 1739 insertions(+), 1239 deletions(-) delete mode 100644 73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling/biquadratic.hlsl diff --git a/73_SolidAngleVisualizer/CMakeLists.txt b/73_SolidAngleVisualizer/CMakeLists.txt index 067395863..8112efd1b 100644 --- a/73_SolidAngleVisualizer/CMakeLists.txt +++ b/73_SolidAngleVisualizer/CMakeLists.txt @@ -50,8 +50,6 @@ if(NBL_BUILD_IMGUI) app_resources/hlsl/pyramid_sampling.hlsl
app_resources/hlsl/pyramid_sampling/bilinear.hlsl - app_resources/hlsl/pyramid_sampling/biquadratic.hlsl - app_resources/hlsl/solid_angle_vis.frag.hlsl app_resources/hlsl/ray_vis.frag.hlsl @@ -69,37 +67,40 @@ if(NBL_BUILD_IMGUI) set(JSON [=[ [ - {"INPUT": "${SA_VIS}", "KEY": "sa_vis_tri_sa", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-O3","-DSAMPLING_MODE_CONST=0", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, - {"INPUT": "${SA_VIS}", "KEY": "sa_vis_tri_sa_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-O3","-DSAMPLING_MODE_CONST=0", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, - {"INPUT": "${SA_VIS}", "KEY": "sa_vis_tri_psa", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-O3","-DSAMPLING_MODE_CONST=1", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, - {"INPUT": "${SA_VIS}", "KEY": "sa_vis_tri_psa_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-O3","-DSAMPLING_MODE_CONST=1", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, - {"INPUT": "${SA_VIS}", "KEY": "sa_vis_para", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-O3","-DSAMPLING_MODE_CONST=2", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, - {"INPUT": "${SA_VIS}", "KEY": "sa_vis_para_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-O3","-DSAMPLING_MODE_CONST=2", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, - {"INPUT": "${SA_VIS}", "KEY": "sa_vis_rectangle", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-O3","-DSAMPLING_MODE_CONST=3", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, - {"INPUT": "${SA_VIS}", "KEY": "sa_vis_rectangle_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-O3","-DSAMPLING_MODE_CONST=3", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, - {"INPUT": "${SA_VIS}", "KEY": "sa_vis_biquad", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-O3","-DSAMPLING_MODE_CONST=4", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, - {"INPUT": "${SA_VIS}", "KEY": "sa_vis_biquad_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-O3","-DSAMPLING_MODE_CONST=4", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, - {"INPUT": "${SA_VIS}", "KEY": "sa_vis_bilinear", "COMPILE_OPTIONS": ["-T", 
"ps_${SM}", "-O3","-DSAMPLING_MODE_CONST=5", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, - {"INPUT": "${SA_VIS}", "KEY": "sa_vis_bilinear_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-O3","-DSAMPLING_MODE_CONST=5", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, - {"INPUT": "${SA_VIS}", "KEY": "sa_vis_proj_rectangle", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-O3","-DSAMPLING_MODE_CONST=6", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, - {"INPUT": "${SA_VIS}", "KEY": "sa_vis_proj_rectangle_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-O3","-DSAMPLING_MODE_CONST=6", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, - {"INPUT": "${SA_VIS}", "KEY": "sa_vis_silhouette", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-O3","-DSAMPLING_MODE_CONST=7", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, - {"INPUT": "${SA_VIS}", "KEY": "sa_vis_silhouette_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-O3","-DSAMPLING_MODE_CONST=7", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, - {"INPUT": "${SA_VIS}", "KEY": "sa_vis_pyramid", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-O3","-DSAMPLING_MODE_CONST=8", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, - {"INPUT": "${SA_VIS}", "KEY": "sa_vis_pyramid_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-O3","-DSAMPLING_MODE_CONST=8", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_tri_sa", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::TRIANGLE_SOLID_ANGLE", "-DSAMPLING_MODE_DENSE_ID=3", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_tri_sa_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::TRIANGLE_SOLID_ANGLE", "-DSAMPLING_MODE_DENSE_ID=3", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_tri_psa", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::TRIANGLE_PROJECTED_SOLID_ANGLE", "-DSAMPLING_MODE_DENSE_ID=4", "-DDEBUG_DATA=0", 
"-DVISUALIZE_SAMPLES=0"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_tri_psa_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::TRIANGLE_PROJECTED_SOLID_ANGLE", "-DSAMPLING_MODE_DENSE_ID=4", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_para", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::PROJECTED_PARALLELOGRAM_SOLID_ANGLE", "-DSAMPLING_MODE_DENSE_ID=5", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_para_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::PROJECTED_PARALLELOGRAM_SOLID_ANGLE", "-DSAMPLING_MODE_DENSE_ID=5", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_rectangle", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::SPH_RECT_FROM_PYRAMID", "-DSAMPLING_MODE_DENSE_ID=1", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_rectangle_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::SPH_RECT_FROM_PYRAMID", "-DSAMPLING_MODE_DENSE_ID=1", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_bilinear", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::BILINEAR_FROM_PYRAMID", "-DSAMPLING_MODE_DENSE_ID=6", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_bilinear_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::BILINEAR_FROM_PYRAMID", "-DSAMPLING_MODE_DENSE_ID=6", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_proj_rectangle", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::PROJ_SPH_RECT_FROM_PYRAMID", "-DSAMPLING_MODE_DENSE_ID=2", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, + {"INPUT": 
"${SA_VIS}", "KEY": "sa_vis_proj_rectangle_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::PROJ_SPH_RECT_FROM_PYRAMID", "-DSAMPLING_MODE_DENSE_ID=2", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_silhouette", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::SILHOUETTE_CREATION_ONLY", "-DSAMPLING_MODE_DENSE_ID=7", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_silhouette_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::SILHOUETTE_CREATION_ONLY", "-DSAMPLING_MODE_DENSE_ID=7", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_pyramid", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::PYRAMID_CREATION_ONLY", "-DSAMPLING_MODE_DENSE_ID=8", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_pyramid_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::PYRAMID_CREATION_ONLY", "-DSAMPLING_MODE_DENSE_ID=8", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_caliper_pyramid", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::CALIPER_PYRAMID_CREATION_ONLY", "-DSAMPLING_MODE_DENSE_ID=9", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_caliper_pyramid_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::CALIPER_PYRAMID_CREATION_ONLY", "-DSAMPLING_MODE_DENSE_ID=9", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_caliper_rectangle", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::SPH_RECT_FROM_CALIPER_PYRAMID", "-DSAMPLING_MODE_DENSE_ID=0", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, + {"INPUT": "${SA_VIS}", 
"KEY": "sa_vis_caliper_rectangle_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::SPH_RECT_FROM_CALIPER_PYRAMID", "-DSAMPLING_MODE_DENSE_ID=0", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, {"INPUT": "${RAY_VIS}", "KEY": "ray_vis", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, {"INPUT": "${RAY_VIS}", "KEY": "ray_vis_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, - {"INPUT": "${BENCH}", "KEY": "benchmark_tri_sa", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_CONST=0"]}, - {"INPUT": "${BENCH}", "KEY": "benchmark_tri_psa", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_CONST=1"]}, - {"INPUT": "${BENCH}", "KEY": "benchmark_para", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_CONST=2"]}, - {"INPUT": "${BENCH}", "KEY": "benchmark_rectangle", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_CONST=3"]}, - {"INPUT": "${BENCH}", "KEY": "benchmark_biquad", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_CONST=4"]}, - {"INPUT": "${BENCH}", "KEY": "benchmark_bilinear", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_CONST=5"]}, - {"INPUT": "${BENCH}", "KEY": "benchmark_proj_rectangle", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_CONST=6"]}, - {"INPUT": "${BENCH}", "KEY": "benchmark_silhouette", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_CONST=7"]}, - {"INPUT": "${BENCH}", "KEY": "benchmark_pyramid_creation", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_CONST=8"]}, + {"INPUT": "${BENCH}", "KEY": "benchmark_tri_sa", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::TRIANGLE_SOLID_ANGLE", "-DSAMPLING_MODE_DENSE_ID=3"]}, + {"INPUT": "${BENCH}", "KEY": "benchmark_tri_psa", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::TRIANGLE_PROJECTED_SOLID_ANGLE", "-DSAMPLING_MODE_DENSE_ID=4"]}, + {"INPUT": "${BENCH}", 
"KEY": "benchmark_para", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::PROJECTED_PARALLELOGRAM_SOLID_ANGLE", "-DSAMPLING_MODE_DENSE_ID=5"]}, + {"INPUT": "${BENCH}", "KEY": "benchmark_bilinear", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::BILINEAR_FROM_PYRAMID", "-DSAMPLING_MODE_DENSE_ID=6"]}, + {"INPUT": "${BENCH}", "KEY": "benchmark_rectangle", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::SPH_RECT_FROM_PYRAMID", "-DSAMPLING_MODE_DENSE_ID=1"]}, + {"INPUT": "${BENCH}", "KEY": "benchmark_proj_rectangle", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::PROJ_SPH_RECT_FROM_PYRAMID", "-DSAMPLING_MODE_DENSE_ID=2"]}, + {"INPUT": "${BENCH}", "KEY": "benchmark_silhouette", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::SILHOUETTE_CREATION_ONLY", "-DSAMPLING_MODE_DENSE_ID=7"]}, + {"INPUT": "${BENCH}", "KEY": "benchmark_pyramid_creation", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::PYRAMID_CREATION_ONLY", "-DSAMPLING_MODE_DENSE_ID=8"]}, + {"INPUT": "${BENCH}", "KEY": "benchmark_caliper_pyramid_creation", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::CALIPER_PYRAMID_CREATION_ONLY", "-DSAMPLING_MODE_DENSE_ID=9"]}, + {"INPUT": "${BENCH}", "KEY": "benchmark_caliper_rectangle", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::SPH_RECT_FROM_CALIPER_PYRAMID", "-DSAMPLING_MODE_DENSE_ID=0"]}, ] ]=]) string(CONFIGURE "${JSON}" JSON) @@ -107,9 +108,10 @@ if(NBL_BUILD_IMGUI) set(COMPILE_OPTIONS -I "${CMAKE_CURRENT_SOURCE_DIR}" -Zi -Qembed_debug - -fspv-debug=file - -fspv-debug=source - -fspv-debug=line + + # -fspv-debug=file + # -fspv-debug=source + # -fspv-debug=line -enable-16bit-types ) @@ -119,7 +121,7 @@ if(NBL_BUILD_IMGUI) DEPENDS ${DEPENDS} BINARY_DIR 
${OUTPUT_DIRECTORY} MOUNT_POINT_DEFINE NBL_THIS_EXAMPLE_BUILD_MOUNT_POINT - COMMON_OPTIONS ${COMPILE_OPTIONS} + COMMON_OPTIONS ${COMPILE_OPTIONS} OUTPUT_VAR KEYS INCLUDE nbl/this_example/builtin/build/spirv/keys.hpp NAMESPACE nbl::this_example::builtin::build diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/Drawing.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/Drawing.hlsl index 8fe9adbb8..c2239037b 100644 --- a/73_SolidAngleVisualizer/app_resources/hlsl/Drawing.hlsl +++ b/73_SolidAngleVisualizer/app_resources/hlsl/Drawing.hlsl @@ -29,7 +29,7 @@ struct SphereDrawer } else { - float32_t r2 = (1.0f - spherePoint.z) / (1.0f + spherePoint.z); + float32_t r2 = (1.0f - spherePoint.z) / (1.0f + spherePoint.z); float32_t uv2Plus1 = r2 + 1.0f; return float32_t3((spherePoint.xy * uv2Plus1 / 2.0f) * CIRCLE_RADIUS, spherePoint.z); } @@ -42,25 +42,25 @@ struct SphereDrawer // Great circle arc between two points on the sphere static float32_t drawGreatCircleArc(float32_t3 points[2], float32_t width = 0.01f) { - float32_t3 v0 = normalize(points[0]); - float32_t3 v1 = normalize(points[1]); + float32_t3 v0 = normalize(points[0]); + float32_t3 v1 = normalize(points[1]); float32_t3 ndc = normalize(VisContext::spherePos()); float32_t3 arcNormal = normalize(cross(v0, v1)); - float32_t dist = abs(dot(ndc, arcNormal)); + float32_t dist = abs(dot(ndc, arcNormal)); float32_t dotMid = dot(v0, v1); - bool onArc = (dot(ndc, v0) >= dotMid) && (dot(ndc, v1) >= dotMid); + bool onArc = (dot(ndc, v0) >= dotMid) && (dot(ndc, v1) >= dotMid); if (!onArc) return 0.0f; - float32_t avgDepth = (length(points[0]) + length(points[1])) * 0.5f; + float32_t avgDepth = (length(points[0]) + length(points[1])) * 0.5f; float32_t depthScale = 3.0f / avgDepth; - width = min(width * depthScale, 0.02f); + width = min(width * depthScale, 0.02f); const float32_t aaWidth = VisContext::aaWidth(); - float32_t alpha = 1.0f - smoothstep(width - aaWidth, width + aaWidth, dist); + float32_t alpha = 1.0f - 
smoothstep(width - aaWidth, width + aaWidth, dist); return alpha; } @@ -71,7 +71,7 @@ struct SphereDrawer float32_t2 ndc = abs(fragPos - center); bool inHorizontal = (ndc.x <= size && ndc.y <= thickness); - bool inVertical = (ndc.y <= size && ndc.x <= thickness); + bool inVertical = (ndc.y <= size && ndc.x <= thickness); return (inHorizontal || inVertical) ? 1.0f : 0.0f; } @@ -79,10 +79,10 @@ struct SphereDrawer // Dot (circle) with optional inner hollow for hidden corners static float32_t4 drawDot(float32_t3 cornerNDCPos, float32_t dotSize, float32_t innerDotSize, float32_t3 dotColor) { - float32_t4 color = float32_t4(0, 0, 0, 0); - const float32_t aaWidth = VisContext::aaWidth(); - const float32_t2 ndc = VisContext::ndc(); - const float32_t dist = length(ndc - cornerNDCPos.xy); + float32_t4 color = float32_t4(0, 0, 0, 0); + const float32_t aaWidth = VisContext::aaWidth(); + const float32_t2 ndc = VisContext::ndc(); + const float32_t dist = length(ndc - cornerNDCPos.xy); float32_t outerAlpha = 1.0f - smoothstep(dotSize - aaWidth, dotSize + aaWidth, dist); @@ -95,7 +95,7 @@ struct SphereDrawer { float32_t innerAlpha = 1.0f - smoothstep(innerDotSize - aaWidth, innerDotSize + aaWidth, dist); innerAlpha *= outerAlpha; - color -= float32_t4(innerAlpha.xxx, 0.0f); + color -= float32_t4(hlsl::promote(innerAlpha), 0.0f); } return color; @@ -104,10 +104,10 @@ struct SphereDrawer // Line segment in NDC space static float32_t lineSegment(float32_t2 ndc, float32_t2 a, float32_t2 b, float32_t thickness) { - float32_t2 pa = ndc - a; - float32_t2 ba = b - a; - float32_t h = saturate(dot(pa, ba) / dot(ba, ba)); - float32_t dist = length(pa - ba * h); + float32_t2 pa = ndc - a; + float32_t2 ba = b - a; + float32_t h = saturate(dot(pa, ba) / dot(ba, ba)); + float32_t dist = length(pa - ba * h); return smoothstep(thickness, thickness * 0.5, dist); } @@ -117,24 +117,24 @@ struct SphereDrawer // Point is on great circle if dot(point, normal) ~= 0 // Only draw the half where dot(point, 
axis3) > 0 (toward silhouette) const float32_t3 spherePos = VisContext::spherePos(); - const float32_t aaWidth = VisContext::aaWidth(); + const float32_t aaWidth = VisContext::aaWidth(); - float32_t dist = abs(dot(spherePos, normal)); + float32_t dist = abs(dot(spherePos, normal)); float32_t sideFade = smoothstep(-0.1f, 0.1f, dot(spherePos, axis3)); - float32_t alpha = (1.0f - smoothstep(thickness - aaWidth, thickness + aaWidth, dist)) * sideFade; + float32_t alpha = (1.0f - smoothstep(thickness - aaWidth, thickness + aaWidth, dist)) * sideFade; return float32_t4(color * alpha, alpha); } // Unit-circle ring static float32_t4 drawRing(float32_t2 ndc) { - const float32_t aaWidth = VisContext::aaWidth(); - float32_t ringWidth = 0.003f; - float32_t positionLength = length(ndc); + const float32_t aaWidth = VisContext::aaWidth(); + float32_t ringWidth = 0.003f; + float32_t positionLength = length(ndc); float32_t ringDistance = abs(positionLength - CIRCLE_RADIUS); - float32_t ringAlpha = 1.0f - smoothstep(ringWidth - aaWidth, ringWidth + aaWidth, ringDistance); - return ringAlpha * float32_t4(1, 1, 1, 1); + float32_t ringAlpha = 1.0f - smoothstep(ringWidth - aaWidth, ringWidth + aaWidth, ringDistance); + return ringAlpha * float32_t4(0, 0, 0, 1); } // ======================================================================== @@ -157,8 +157,8 @@ struct SphereDrawer // All 8 cube corners as colored dots static float32_t4 drawCorners(float32_t3x4 modelMatrix, float32_t dotSize) { - float32_t4 color = float32_t4(0, 0, 0, 0); - float32_t innerDotSize = dotSize * 0.5f; + float32_t4 color = float32_t4(0, 0, 0, 0); + float32_t innerDotSize = dotSize * 0.5f; shapes::OBBView view = shapes::OBBView::create(modelMatrix); @@ -170,34 +170,34 @@ struct SphereDrawer return color; } - // Clipped silhouette vertices with red-to-cyan gradient - static float32_t4 drawClippedSilhouetteVertices(ClippedSilhouette silhouette) + static float32_t4 drawClippedSilhouetteVertices(float32_t3 
vertices[MAX_SILHOUETTE_VERTICES], uint32_t count) { + const float32_t dotSize = 0.03f; + const float32_t2 ndc = VisContext::ndc(); + const float32_t rcpDenom = rcp(float32_t(max(1u, count - 1))); + float32_t4 color = 0; - float32_t dotSize = 0.03f; - for (uint i = 0; i < silhouette.count; i++) + for (uint32_t i = 0; i < count; i++) { - float32_t3 cornerCirclePos = sphereToCircle(normalize(silhouette.vertices[i])); - float32_t dist = length(VisContext::ndc() - cornerCirclePos.xy); - - float32_t alpha = 1.0f - smoothstep(dotSize * 0.8f, dotSize, dist); - + const float32_t3 cornerCirclePos = sphereToCircle(normalize(vertices[i])); + const float32_t dist = length(ndc - cornerCirclePos.xy); + const float32_t alpha = 1.0f - smoothstep(dotSize * 0.8f, dotSize, dist); if (alpha > 0.0f) { - float32_t t = float32_t(i) / float32_t(max(1u, silhouette.count - 1)); - float32_t3 vertexColor = lerp(float32_t3(1, 0, 0), float32_t3(0, 1, 1), t); - + const float32_t t = float32_t(i) * rcpDenom; + const float32_t3 vertexColor = lerp(float32_t3(1, 0, 0), float32_t3(0, 1, 1), t); color += float32_t4(vertexColor * alpha, alpha); } } + return color; } // Non-silhouette cube edges (drawn as faint lines) - static float32_t4 drawHiddenEdges(float32_t3x4 modelMatrix, float32_t3 spherePos, uint32_t silEdgeMask) + static float32_t4 drawHiddenEdges(float32_t3x4 modelMatrix, uint32_t silEdgeMask) { - float32_t4 color = 0; + float32_t4 color = 0; float32_t3 hiddenEdgeColor = float32_t3(0.1, 0.1, 0.1); shapes::OBBView view = shapes::OBBView::create(modelMatrix); @@ -216,8 +216,8 @@ struct SphereDrawer continue; // Re-insert the axis bit (as 0) to recover the low corner index - uint32_t below = compact & ((1u << axis) - 1u); - uint32_t above = compact >> axis; + uint32_t below = compact & ((1u << axis) - 1u); + uint32_t above = compact >> axis; uint32_t corner = (above << (axis + 1u)) | below; float32_t3 v0 = normalize(view.getVertex(corner)); @@ -236,7 +236,7 @@ struct SphereDrawer // clip if one 
vertex is behind camera if (neg0 ^ neg1) { - float32_t t = v0.z / (v0.z - v1.z); + float32_t t = v0.z / (v0.z - v1.z); float32_t3 clip = normalize(lerp(v0, v1, t)); p0 = neg0 ? clip : v0; @@ -244,7 +244,7 @@ struct SphereDrawer } float32_t3 pts[2] = {p0, p1}; - float32_t c = drawGreatCircleArc(pts, 0.003f); + float32_t c = drawGreatCircleArc(pts, 0.003f); color += float32_t4(hiddenEdgeColor * c, c); } } @@ -253,19 +253,19 @@ struct SphereDrawer } // Best caliper edge highlighted in gold - static float32_t4 visualizeBestCaliperEdge(NBL_CONST_REF_ARG(ClippedSilhouette) silhouette, uint32_t bestEdgeIdx) + static float32_t4 visualizeBestCaliperEdge(float32_t3 vertices[MAX_SILHOUETTE_VERTICES], uint32_t count, uint32_t bestEdgeIdx) { float32_t4 result = float32_t4(0, 0, 0, 0); - if (bestEdgeIdx >= silhouette.count) + if (bestEdgeIdx >= count) return result; - float32_t3 v0 = silhouette.vertices[bestEdgeIdx]; - float32_t3 v1 = silhouette.vertices[(bestEdgeIdx + 1) % silhouette.count]; + float32_t3 v0 = vertices[bestEdgeIdx]; + float32_t3 v1 = vertices[(bestEdgeIdx + 1) % count]; - float32_t3 pts[2] = {v0, v1}; + float32_t3 pts[2] = {v0, v1}; float32_t3 highlightColor = float32_t3(1.0f, 0.8f, 0.0f); - float32_t alpha = drawGreatCircleArc(pts, 0.008f); + float32_t alpha = drawGreatCircleArc(pts, 0.008f); result += float32_t4(highlightColor * alpha, alpha); return result; @@ -277,32 +277,32 @@ struct SphereDrawer static float32_t4 visualizeSample(float32_t3 sampleDir, float32_t2 xi, uint32_t colorIndex, float32_t2 screenUV) { - float32_t4 accumColor = 0; + float32_t4 accumColor = 0; float32_t3 sampleColor = colorLUT[colorIndex].rgb; // 3D dot on the sphere - float32_t dist3D = distance(sampleDir, normalize(VisContext::spherePos())); + float32_t dist3D = distance(sampleDir, normalize(VisContext::spherePos())); float32_t alpha3D = 1.0f - smoothstep(0.0f, 0.02f, dist3D); if (alpha3D > 0.0f) accumColor += float32_t4(sampleColor * alpha3D, alpha3D); // Parameter-space square 
(PSS) overlay - static const float32_t2 pssSize = float32_t2(0.2, 0.2); - static const float32_t2 pssPos = float32_t2(0.01, 0.01); - bool isInsidePSS = all(and(screenUV >= pssPos, screenUV <= (pssPos + pssSize))); + static const float32_t2 pssSize = float32_t2(0.2, 0.2); + static const float32_t2 pssPos = float32_t2(0.01, 0.01); + bool isInsidePSS = all(and(screenUV >= pssPos, screenUV <= (pssPos + pssSize))); if (isInsidePSS) { // Cross marker at the sample's xi position float32_t2 xiPixelPos = pssPos + xi * pssSize; - float32_t alpha2D = drawCross2D(screenUV, xiPixelPos, 0.005f, 0.001f); + float32_t alpha2D = drawCross2D(screenUV, xiPixelPos, 0.005f, 0.001f); if (alpha2D > 0.0f) accumColor += float32_t4(sampleColor * alpha2D, alpha2D); // Faint border outline - float32_t2 edgeDist = min(screenUV - pssPos, (pssPos + pssSize) - screenUV); - float32_t borderDist = min(edgeDist.x, edgeDist.y); - float32_t borderAlpha = 1.0f - smoothstep(0.001f, 0.003f, borderDist); + float32_t2 edgeDist = min(screenUV - pssPos, (pssPos + pssSize) - screenUV); + float32_t borderDist = min(edgeDist.x, edgeDist.y); + float32_t borderAlpha = 1.0f - smoothstep(0.001f, 0.003f, borderDist); if (borderAlpha > 0.0f) accumColor += float32_t4(0.3f, 0.3f, 0.3f, 1.0f) * borderAlpha; } @@ -326,7 +326,7 @@ struct SphereDrawer struct ArrowResult { float32_t4 color; - float32_t depth; + float32_t depth; }; // Visualize a ray as an arrow from origin in NDC space. 
@@ -339,14 +339,14 @@ struct SphereDrawer result.depth = 0.0; // Far plane in reversed-Z float32_t3 rayDir = normalize(directionAndPdf.xyz); - float32_t pdf = directionAndPdf.w; + float32_t pdf = directionAndPdf.w; // Define the 3D line segment float32_t3 worldStart = rayOrigin; - float32_t3 worldEnd = rayOrigin + rayDir * arrowLength; + float32_t3 worldEnd = rayOrigin + rayDir * arrowLength; float32_t4 clipStart = mul(viewProjMatrix, float32_t4(worldStart, 1.0)); - float32_t4 clipEnd = mul(viewProjMatrix, float32_t4(worldEnd, 1.0)); + float32_t4 clipEnd = mul(viewProjMatrix, float32_t4(worldEnd, 1.0)); // Clip against near plane (w = 0 plane in clip space) // If both points are behind camera, reject @@ -360,17 +360,17 @@ struct SphereDrawer if (clipStart.w <= 0.001) { float32_t t = (0.001 - clipStart.w) / (clipEnd.w - clipStart.w); - t0 = saturate(t); - clipStart = lerp(clipStart, clipEnd, t0); - worldStart = lerp(worldStart, worldEnd, t0); + t0 = saturate(t); + clipStart = lerp(clipStart, clipEnd, t0); + worldStart = lerp(worldStart, worldEnd, t0); } if (clipEnd.w <= 0.001) { float32_t t = (0.001 - clipStart.w) / (clipEnd.w - clipStart.w); - t1 = saturate(t); - clipEnd = lerp(clipStart, clipEnd, t1); - worldEnd = lerp(worldStart, worldEnd, t1); + t1 = saturate(t); + clipEnd = lerp(clipStart, clipEnd, t1); + worldEnd = lerp(worldStart, worldEnd, t1); } // Now check if the clipped segment is valid @@ -379,15 +379,15 @@ struct SphereDrawer // Perspective divide to NDC float32_t2 ndcStart = clipStart.xy / clipStart.w; - float32_t2 ndcEnd = clipEnd.xy / clipEnd.w; + float32_t2 ndcEnd = clipEnd.xy / clipEnd.w; // Apply aspect ratio correction ndcStart.x *= aspect; ndcEnd.x *= aspect; // Calculate arrow direction in NDC - float32_t2 arrowVec = ndcEnd - ndcStart; - float32_t arrowNDCLength = length(arrowVec); + float32_t2 arrowVec = ndcEnd - ndcStart; + float32_t arrowNDCLength = length(arrowVec); // Skip if arrow is too small on screen if (arrowNDCLength < 0.005) @@ 
-395,7 +395,7 @@ struct SphereDrawer // Calculate perpendicular distance to line segment in NDC space float32_t2 toPixel = ndcPos - ndcStart; - float32_t t_ndc = saturate(dot(toPixel, arrowVec) / dot(arrowVec, arrowVec)); + float32_t t_ndc = saturate(dot(toPixel, arrowVec) / dot(arrowVec, arrowVec)); // Draw line shaft float32_t lineThickness = 0.002; @@ -404,17 +404,17 @@ struct SphereDrawer // Calculate perspective-correct depth if (lineIntensity > 0.0) { - float32_t4 clipPos = lerp(clipStart, clipEnd, t_ndc); - float32_t depthNDC = clipPos.z / clipPos.w; - result.depth = 1.0f - depthNDC; + float32_t4 clipPos = lerp(clipStart, clipEnd, t_ndc); + float32_t depthNDC = clipPos.z / clipPos.w; + result.depth = 1.0f - depthNDC; if (result.depth < 0.0 || result.depth > 1.0) lineIntensity = 0.0; } // Modulate by PDF - float32_t pdfIntensity = saturate(pdf * 0.5); - float32_t3 finalColor = float32_t3(pdfIntensity, pdfIntensity, pdfIntensity); + float32_t pdfIntensity = saturate(pdf * 0.5); + float32_t3 finalColor = float32_t3(pdfIntensity, pdfIntensity, pdfIntensity); result.color = float32_t4(finalColor, lineIntensity); return result; diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/benchmark/benchmark.comp.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/benchmark/benchmark.comp.hlsl index d21dfaf73..8df778c34 100644 --- a/73_SolidAngleVisualizer/app_resources/hlsl/benchmark/benchmark.comp.hlsl +++ b/73_SolidAngleVisualizer/app_resources/hlsl/benchmark/benchmark.comp.hlsl @@ -12,10 +12,10 @@ using namespace nbl::hlsl; -[[vk::binding(0, 0)]] RWByteAddressBuffer outputBuffer; +[[vk::binding(0, 0)]] RWByteAddressBuffer outputBuffer; [[vk::push_constant]] BenchmarkPushConstants pc; -static const SAMPLING_MODE benchmarkMode = (SAMPLING_MODE)SAMPLING_MODE_CONST; +static const SAMPLING_MODE_FLAGS benchmarkMode = SAMPLING_MODE_FLAGS_CONST; float32_t2 stratifiedXi(uint32_t sampleIdx, uint32_t threadIdx) { @@ -24,24 +24,11 @@ float32_t2 stratifiedXi(uint32_t sampleIdx, 
uint32_t threadIdx) (float32_t(sampleIdx >> 3u) + 0.5f) / 8.0f + float32_t(threadIdx) * 1e-9f); } -struct PyramidSetup -{ - SphericalPyramid pyramid; - SilEdgeNormals silEdgeNormals; - - static PyramidSetup create(ClippedSilhouette silhouette) - { - PyramidSetup s; - s.pyramid = SphericalPyramid::create(silhouette, s.silEdgeNormals); - s.silEdgeNormals.transformToLocal(s.pyramid.axis1, s.pyramid.axis2, s.pyramid.getAxis3()); - return s; - } -}; - // Per-thread input perturbation: scatters threads across the 27 OBB regions and -// generates a fresh silhouette per outer-loop iteration so creation work can't -// be hoisted out by the compiler. -ClippedSilhouette makePerturbedSilhouette(float32_t3 baseOffset, NBL_REF_ARG(random::PCG32) rng, float32_t rcpU32) +// generates a fresh OBBView per outer-loop iteration so creation work can't be +// hoisted out by the compiler. Returns just the view; callers build their own +// ClippedSilhouette + materialized verts from it as needed. +shapes::OBBView makePerturbedView(float32_t3 baseOffset, NBL_REF_ARG(Xoroshiro64Star) rng, float32_t rcpU32) { const float32_t3 cJ = float32_t3( (float32_t(rng()) * rcpU32 - 0.5f) * 8.0f, @@ -51,18 +38,84 @@ ClippedSilhouette makePerturbedSilhouette(float32_t3 baseOffset, NBL_REF_ARG(ran cM[0][3] += baseOffset.x + cJ.x; cM[1][3] += baseOffset.y + cJ.y; cM[2][3] += baseOffset.z + cJ.z; - shapes::OBBView cV = shapes::OBBView::create(cM); - return ClippedSilhouette::create(cV); + return shapes::OBBView::create(cM); } -[numthreads(BENCHMARK_WORKGROUP_DIMENSION_SIZE_X, 1, 1)] -void main() +// Shared create-and-sample loop for any sampler with the standard +// `create(silhouette, view)` + `generate/forwardPdf/selectedIdx(cache)` shape. +// XORs all outputs into the returned sink to defeat DCE. 
+template +uint32_t runCreateAndSample(uint32_t creations, NBL_REF_ARG(Xoroshiro64Star) rng, float32_t rcpU32, uint32_t invocationID, float32_t3 rndOffset) { - const uint32_t invocationID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; + uint32_t sink = 0; + for (uint32_t c = 0; c < creations; c++) + { + shapes::OBBView view = makePerturbedView(rndOffset, rng, rcpU32); + ClippedSilhouette silhouette = ClippedSilhouette::create(view); + SamplerT sampler = SamplerT::create(silhouette, view); - // Scatter the OBB translation per invocation so threads span all 27 regions - random::PCG32 rng = random::PCG32::construct(invocationID.x + 0x9e3779b9u); - const float32_t rcpU32 = 1.0f / 4294967296.0f; + for (uint32_t s = 0; s < pc.samplesPerCreation; s++) + { + float32_t2 xi = stratifiedXi(c * pc.samplesPerCreation + s, invocationID); + typename SamplerT::cache_type cache; + float32_t3 dir = sampler.generate(xi, cache); + float32_t pdf = sampler.forwardPdf(xi, cache); + sink ^= asuint(dir.x) ^ asuint(dir.y) ^ asuint(dir.z) ^ asuint(pdf) ^ sampler.selectedIdx(cache); + } + } + return sink; +} + +// Pyramid-create-only benchmark using synthetic random vertices. Templated on +// UseCaliper so PYRAMID_CREATION_ONLY and CALIPER_PYRAMID_CREATION_ONLY share +// one body. Inner sampler is unused (no generate() calls), so default to SphRect. 
+template +uint32_t runPyramidCreationOnly(NBL_REF_ARG(Xoroshiro64Star) rng, float32_t rcpU32) +{ + typedef SphericalPyramid > PyramidT; + uint32_t sink = 0; + for (uint32_t i = 0; i < pc.sampleCount; i++) + { + float32_t3 synthVerts[MAX_SILHOUETTE_VERTICES]; + NBL_UNROLL + for (uint32_t init = 0; init < MAX_SILHOUETTE_VERTICES; init++) + synthVerts[init] = float32_t3(0, 0, 0); + const uint32_t synthCount = 5; + + for (uint32_t v = 0; v < synthCount; v++) + { + float32_t x = (float32_t(rng()) * rcpU32 - 0.5f) * 1.2f; + float32_t y = (float32_t(rng()) * rcpU32 - 0.5f) * 1.2f; + // Diagnostic raw-rng sink: forces rng+normalize cost into the timing + // even if the entire pyramid create() gets DCE'd downstream. + sink ^= asuint(x) ^ asuint(y); + synthVerts[v] = normalize(float32_t3(x, y, 1.0f)); + sink ^= asuint(synthVerts[v].x) ^ asuint(synthVerts[v].y) ^ asuint(synthVerts[v].z); + } + + float32_t2 dummyR0, dummyExt; + PyramidT pyramid = PyramidT::createFromVertices(synthVerts, synthCount, dummyR0, dummyExt); + + const float32_t3 axis3 = pyramid.getAxis3(); + sink ^= asuint(pyramid.axis1.x) ^ asuint(pyramid.axis1.y) ^ asuint(pyramid.axis1.z); + sink ^= asuint(pyramid.axis2.x) ^ asuint(pyramid.axis2.y) ^ asuint(pyramid.axis2.z); + sink ^= asuint(axis3.x) ^ asuint(axis3.y) ^ asuint(axis3.z); + NBL_UNROLL + for (uint32_t e = 0; e < 5; e++) + { + const float32_t3 n = pyramid.silEdgeNormals.edgeNormals[e]; + sink ^= asuint(n.x) ^ asuint(n.y) ^ asuint(n.z); + } + } + return sink; +} + +[numthreads(BENCHMARK_WORKGROUP_DIMENSION_SIZE_X, 1, 1)] void main() +{ + const uint32_t invocationID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; + + Xoroshiro64Star rng = Xoroshiro64Star::construct(uint32_t2(invocationID.x + 0x9e3779b9u, invocationID.x * 0x85ebca77u + 1u)); + const float32_t rcpU32 = 1.0f / 4294967296.0f; const float32_t3 rndOffset = float32_t3( (float32_t(rng()) * rcpU32 - 0.5f) * 8.0f, (float32_t(rng()) * rcpU32 - 0.5f) * 8.0f, @@ -77,164 +130,45 @@ void main() // 
`samplesPerCreation`. Total samples per thread = sampleCount. const uint32_t creations = pc.sampleCount / pc.samplesPerCreation; - if (benchmarkMode == SAMPLING_MODE::TRIANGLE_SOLID_ANGLE || - benchmarkMode == SAMPLING_MODE::TRIANGLE_PROJECTED_SOLID_ANGLE) - { - for (uint32_t c = 0; c < creations; c++) - { - ClippedSilhouette silhouette = makePerturbedSilhouette(rndOffset, rng, rcpU32); - TriangleFanSampler samplingData = TriangleFanSampler::create(silhouette, benchmarkMode); - - for (uint32_t s = 0; s < pc.samplesPerCreation; s++) - { - float32_t2 xi = stratifiedXi(c * pc.samplesPerCreation + s, invocationID); - float32_t pdf; - uint32_t triIdx; - float32_t3 dir = samplingData.sample(silhouette, xi, pdf, triIdx); - sink ^= asuint(dir.x) ^ asuint(dir.y) ^ asuint(dir.z) ^ asuint(pdf) ^ triIdx; - } - } - } - else if (benchmarkMode == SAMPLING_MODE::PROJECTED_PARALLELOGRAM_SOLID_ANGLE) - { - for (uint32_t c = 0; c < creations; c++) - { - ClippedSilhouette silhouette = makePerturbedSilhouette(rndOffset, rng, rcpU32); - silhouette.normalize(); - SilEdgeNormals silEdgeNormals; - Parallelogram parallelogram = Parallelogram::create(silhouette, silEdgeNormals); - - for (uint32_t s = 0; s < pc.samplesPerCreation; s++) - { - float32_t2 xi = stratifiedXi(c * pc.samplesPerCreation + s, invocationID); - float32_t pdf; - float32_t3 dir = parallelogram.sample(silEdgeNormals, xi, pdf, sampleValid); - sink ^= asuint(dir.x) ^ asuint(dir.y) ^ asuint(dir.z) ^ asuint(pdf) ^ (uint32_t)sampleValid; - } - } - } - else if (benchmarkMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_RECTANGLE) - { - for (uint32_t c = 0; c < creations; c++) - { - ClippedSilhouette silhouette = makePerturbedSilhouette(rndOffset, rng, rcpU32); - PyramidSetup ps = PyramidSetup::create(silhouette); - sampling::SphericalRectangle rectSampler = sampling::SphericalRectangle::create(float32_t3x3(ps.pyramid.axis1, ps.pyramid.axis2, ps.pyramid.getAxis3()), float32_t3(ps.pyramid.rectR0, 1.0f), ps.pyramid.rectExtents); 
- - for (uint32_t s = 0; s < pc.samplesPerCreation; s++) - { - float32_t2 xi = stratifiedXi(c * pc.samplesPerCreation + s, invocationID); - sampling::SphericalRectangle::cache_type cache; - float32_t hitDist; - float32_t3 localDir = rectSampler.generateNormalizedLocal(xi, cache, hitDist); - float32_t3 dir = localDir.x * ps.pyramid.axis1 + localDir.y * ps.pyramid.axis2 + localDir.z * ps.pyramid.getAxis3(); - float32_t localX = localDir.x / localDir.z; - float32_t localY = localDir.y / localDir.z; - sampleValid = dir.z > 0.0f && ps.silEdgeNormals.isInsideLocal(localX, localY); - float32_t pdf = rectSampler.forwardPdf(xi, cache); - sink ^= asuint(dir.x) ^ asuint(dir.y) ^ asuint(dir.z) ^ asuint(pdf) ^ (uint32_t)sampleValid; - } - } - } - else if (benchmarkMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_PROJECTED_SOLID_ANGLE_RECTANGLE) - { - for (uint32_t c = 0; c < creations; c++) - { - ClippedSilhouette silhouette = makePerturbedSilhouette(rndOffset, rng, rcpU32); - PyramidSetup ps = PyramidSetup::create(silhouette); - - const float32_t3 axis3 = ps.pyramid.getAxis3(); - shapes::CompressedSphericalRectangle compressed; - compressed.origin = ps.pyramid.axis1 * ps.pyramid.rectR0.x + ps.pyramid.axis2 * ps.pyramid.rectR0.y + axis3; - compressed.right = ps.pyramid.axis1 * ps.pyramid.rectExtents.x; - compressed.up = ps.pyramid.axis2 * ps.pyramid.rectExtents.y; - sampling::ProjectedSphericalRectangle projRectSampler = sampling::ProjectedSphericalRectangle::create(compressed, float32_t3(0.0f, 0.0f, 0.0f), float32_t3(0.0f, 0.0f, 1.0f), false); - - for (uint32_t s = 0; s < pc.samplesPerCreation; s++) - { - float32_t2 xi = stratifiedXi(c * pc.samplesPerCreation + s, invocationID); - sampling::ProjectedSphericalRectangle::cache_type cache; - float32_t hitDist; - float32_t3 localDir = projRectSampler.generateNormalizedLocal(xi, cache, hitDist); - float32_t3 dir = localDir.x * ps.pyramid.axis1 + localDir.y * ps.pyramid.axis2 + localDir.z * ps.pyramid.getAxis3(); - float32_t localX = 
localDir.x / localDir.z; - float32_t localY = localDir.y / localDir.z; - sampleValid = dir.z > 0.0f && ps.silEdgeNormals.isInsideLocal(localX, localY); - float32_t pdf = projRectSampler.forwardPdf(xi, cache); - sink ^= asuint(dir.x) ^ asuint(dir.y) ^ asuint(dir.z) ^ asuint(pdf) ^ (uint32_t)sampleValid; - } - } - } - else if (benchmarkMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_BIQUADRATIC) - { - for (uint32_t c = 0; c < creations; c++) - { - ClippedSilhouette silhouette = makePerturbedSilhouette(rndOffset, rng, rcpU32); - PyramidSetup ps = PyramidSetup::create(silhouette); - BiquadraticSampler biquad = BiquadraticSampler::create(ps.pyramid); - - for (uint32_t s = 0; s < pc.samplesPerCreation; s++) - { - float32_t2 xi = stratifiedXi(c * pc.samplesPerCreation + s, invocationID); - float32_t pdf; - float32_t3 dir = biquad.sample(ps.pyramid, ps.silEdgeNormals, xi, pdf, sampleValid); - sink ^= asuint(dir.x) ^ asuint(dir.y) ^ asuint(dir.z) ^ asuint(pdf) ^ (uint32_t)sampleValid; - } - } - } - else if (benchmarkMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_BILINEAR) - { - for (uint32_t c = 0; c < creations; c++) - { - ClippedSilhouette silhouette = makePerturbedSilhouette(rndOffset, rng, rcpU32); - PyramidSetup ps = PyramidSetup::create(silhouette); - BilinearSampler bilin = BilinearSampler::create(ps.pyramid); - - for (uint32_t s = 0; s < pc.samplesPerCreation; s++) - { - float32_t2 xi = stratifiedXi(c * pc.samplesPerCreation + s, invocationID); - float32_t pdf; - float32_t3 dir = bilin.sample(ps.pyramid, ps.silEdgeNormals, xi, pdf, sampleValid); - sink ^= asuint(dir.x) ^ asuint(dir.y) ^ asuint(dir.z) ^ asuint(pdf) ^ (uint32_t)sampleValid; - } - } - } - else if (benchmarkMode == SAMPLING_MODE::SILHOUETTE_CREATION_ONLY) + if (benchmarkMode == SAMPLING_MODE_FLAGS::SILHOUETTE_CREATION_ONLY) { + // Measure full silhouette-prep cost = create + materialize. 
The previous + // ClippedSilhouette did both inline; the metadata-only ClippedSilhouette + // splits them, so we exercise both here to keep this benchmark + // apples-to-apples. for (uint32_t i = 0; i < pc.sampleCount; i++) { - ClippedSilhouette iterSilhouette = makePerturbedSilhouette(rndOffset, rng, rcpU32); + shapes::OBBView iterView = makePerturbedView(rndOffset, rng, rcpU32); + ClippedSilhouette iterSilhouette = ClippedSilhouette::create(iterView); + float32_t3 iterVerts[MAX_SILHOUETTE_VERTICES]; + iterSilhouette.materialize(iterView, iterVerts); sink ^= iterSilhouette.count; NBL_UNROLL for (uint32_t j = 0; j < MAX_SILHOUETTE_VERTICES; j++) - sink ^= asuint(iterSilhouette.vertices[j].x) ^ asuint(iterSilhouette.vertices[j].y) ^ asuint(iterSilhouette.vertices[j].z); + sink ^= asuint(iterVerts[j].x) ^ asuint(iterVerts[j].y) ^ asuint(iterVerts[j].z); } } - else if (benchmarkMode == SAMPLING_MODE::PYRAMID_CREATION_ONLY) + else if ((benchmarkMode & SAMPLING_MODE_FLAGS::FLAG_PYRAMID) != 0u + && (benchmarkMode & SAMPLING_MODE_FLAGS::FLAG_CREATE_ONLY) != 0u) + sink ^= runPyramidCreationOnly<(benchmarkMode & SAMPLING_MODE_FLAGS::FLAG_CALIPER) != 0u>(rng, rcpU32); + // Caliper variant: tighter rect → different rejection rate, only interesting when samplesPerCreation > 1. 
+ else if (benchmarkMode == SAMPLING_MODE_FLAGS::SPH_RECT_FROM_CALIPER_PYRAMID) + sink ^= runCreateAndSample > >(creations, rng, rcpU32, invocationID, rndOffset); + else if (benchmarkMode == SAMPLING_MODE_FLAGS::SPH_RECT_FROM_PYRAMID) + sink ^= runCreateAndSample > >(creations, rng, rcpU32, invocationID, rndOffset); + else if (benchmarkMode == SAMPLING_MODE_FLAGS::PROJ_SPH_RECT_FROM_PYRAMID) + sink ^= runCreateAndSample > >(creations, rng, rcpU32, invocationID, rndOffset); + else if ((benchmarkMode & SAMPLING_MODE_FLAGS::FLAG_TRIANGLE) != 0u) + sink ^= runCreateAndSample >(creations, rng, rcpU32, invocationID, rndOffset); + else if (benchmarkMode == SAMPLING_MODE_FLAGS::PROJECTED_PARALLELOGRAM_SOLID_ANGLE) + sink ^= runCreateAndSample(creations, rng, rcpU32, invocationID, rndOffset); + else if (benchmarkMode == SAMPLING_MODE_FLAGS::BILINEAR_FROM_PYRAMID) + sink ^= runCreateAndSample >(creations, rng, rcpU32, invocationID, rndOffset); + else { - for (uint32_t i = 0; i < pc.sampleCount; i++) - { - ClippedSilhouette synthSil = (ClippedSilhouette)0; - synthSil.count = 5; - - NBL_UNROLL - for (uint32_t v = 0; v < 5; v++) - { - float32_t x = (float32_t(rng()) * rcpU32 - 0.5f) * 1.2f; - float32_t y = (float32_t(rng()) * rcpU32 - 0.5f) * 1.2f; - synthSil.vertices[v] = normalize(float32_t3(x, y, 1.0f)); - } - - SilEdgeNormals silEdgeNormals; - SphericalPyramid pyramid = SphericalPyramid::create(synthSil, silEdgeNormals); - - uint32_t pyramidBits = asuint(pyramid.axis1.x) ^ asuint(pyramid.axis2.x) ^ asuint(pyramid.rectR0.x) ^ asuint(pyramid.rectR0.y) ^ asuint(pyramid.rectExtents.x) ^ asuint(pyramid.rectExtents.y); - uint32_t edgeBits = asuint(float32_t(silEdgeNormals.edgeNormals[0].x)) ^ asuint(float32_t(silEdgeNormals.edgeNormals[1].x)); - sink ^= pyramidBits ^ edgeBits; - } + assert(false); } - const uint32_t offset = sizeof(uint32_t) * invocationID.x; outputBuffer.Store(offset, sink); } diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/common.hlsl 
b/73_SolidAngleVisualizer/app_resources/hlsl/common.hlsl index f55f27067..632cd7856 100644 --- a/73_SolidAngleVisualizer/app_resources/hlsl/common.hlsl +++ b/73_SolidAngleVisualizer/app_resources/hlsl/common.hlsl @@ -13,20 +13,63 @@ namespace nbl namespace hlsl { -// Sampling mode enum -enum SAMPLING_MODE : uint32_t +// Sampling mode enum -- bit-encoded: low byte is the dense ID (0..Count-1), +// high bits are family/variant flags so callers can do `mode & FLAG_X` instead +// of long `||` chains. Host C++ that needs a dense index wraps mode access +// with `(uint32_t(mode) & DENSE_ID_MASK)`. +enum SAMPLING_MODE_FLAGS : uint32_t { - TRIANGLE_SOLID_ANGLE, - TRIANGLE_PROJECTED_SOLID_ANGLE, - PROJECTED_PARALLELOGRAM_SOLID_ANGLE, - SYMMETRIC_PYRAMID_SOLID_ANGLE_RECTANGLE, - SYMMETRIC_PYRAMID_SOLID_ANGLE_BIQUADRATIC, - SYMMETRIC_PYRAMID_SOLID_ANGLE_BILINEAR, - SYMMETRIC_PYRAMID_PROJECTED_SOLID_ANGLE_RECTANGLE, - SILHOUETTE_CREATION_ONLY, - PYRAMID_CREATION_ONLY, - Count + // ---- family flags (which underlying geometry/sampler family) ---- + FLAG_PYRAMID = 0x100, + FLAG_TRIANGLE = 0x200, + FLAG_PARALLELOGRAM = 0x400, + FLAG_SILHOUETTE = 0x800, + + // ---- variant flags (modifiers on the family) ---- + FLAG_CALIPER = 0x1000, + FLAG_PROJECTED = 0x2000, + FLAG_BILINEAR = 0x4000, + FLAG_CREATE_ONLY = 0x8000, + + // ---- dense-ID extractor for host-side array indexing ---- + DENSE_ID_MASK = 0xFF, + + // ---- modes: dense ID in low byte | family/variant flags ---- + SPH_RECT_FROM_CALIPER_PYRAMID = 0 | FLAG_PYRAMID | FLAG_CALIPER, + SPH_RECT_FROM_PYRAMID = 1 | FLAG_PYRAMID, + PROJ_SPH_RECT_FROM_PYRAMID = 2 | FLAG_PYRAMID | FLAG_PROJECTED, + + TRIANGLE_SOLID_ANGLE = 3 | FLAG_TRIANGLE, + TRIANGLE_PROJECTED_SOLID_ANGLE = 4 | FLAG_TRIANGLE | FLAG_PROJECTED, + + PROJECTED_PARALLELOGRAM_SOLID_ANGLE = 5 | FLAG_PARALLELOGRAM, + + BILINEAR_FROM_PYRAMID = 6 | FLAG_PYRAMID | FLAG_BILINEAR, + + SILHOUETTE_CREATION_ONLY = 7 | FLAG_SILHOUETTE | FLAG_CREATE_ONLY, + PYRAMID_CREATION_ONLY = 8 | 
FLAG_PYRAMID | FLAG_CREATE_ONLY, + CALIPER_PYRAMID_CREATION_ONLY = 9 | FLAG_PYRAMID | FLAG_CALIPER | FLAG_CREATE_ONLY, + + Count = 10 // count of distinct dense IDs +}; + +#ifndef __HLSL_VERSION +// Host helpers: dense IDs for array indexing + a parallel array for combo/iteration. +inline uint32_t denseIdOf(SAMPLING_MODE_FLAGS m) { return uint32_t(m) & uint32_t(SAMPLING_MODE_FLAGS::DENSE_ID_MASK); } + +constexpr SAMPLING_MODE_FLAGS kAllModes[SAMPLING_MODE_FLAGS::Count] = { + SAMPLING_MODE_FLAGS::SPH_RECT_FROM_CALIPER_PYRAMID, // dense 0 + SAMPLING_MODE_FLAGS::SPH_RECT_FROM_PYRAMID, // dense 1 + SAMPLING_MODE_FLAGS::PROJ_SPH_RECT_FROM_PYRAMID, // dense 2 + SAMPLING_MODE_FLAGS::TRIANGLE_SOLID_ANGLE, // dense 3 + SAMPLING_MODE_FLAGS::TRIANGLE_PROJECTED_SOLID_ANGLE, // dense 4 + SAMPLING_MODE_FLAGS::PROJECTED_PARALLELOGRAM_SOLID_ANGLE, // dense 5 + SAMPLING_MODE_FLAGS::BILINEAR_FROM_PYRAMID, // dense 6 + SAMPLING_MODE_FLAGS::SILHOUETTE_CREATION_ONLY, // dense 7 + SAMPLING_MODE_FLAGS::PYRAMID_CREATION_ONLY, // dense 8 + SAMPLING_MODE_FLAGS::CALIPER_PYRAMID_CREATION_ONLY, // dense 9 }; +#endif struct ResultData { @@ -48,7 +91,8 @@ struct ResultData uint32_t rotatedSil; uint32_t edgeVisibilityMismatch; - // Clipped output (layout matches ClippedSilhouette: vertices[7] then count) + // Clipped output: positions written via DebugRecorder::recordClippedVertex + // by callers that materialize silhouette vertices; indices recorded in parallel. 
float32_t3 clippedVertices[MAX_SILHOUETTE_VERTICES]; uint32_t clippedVertexCount; uint32_t clippedVertexIndices[MAX_SILHOUETTE_VERTICES]; diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/debug_vis.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/debug_vis.hlsl index c34b76c65..96ad9abf3 100644 --- a/73_SolidAngleVisualizer/app_resources/hlsl/debug_vis.hlsl +++ b/73_SolidAngleVisualizer/app_resources/hlsl/debug_vis.hlsl @@ -73,19 +73,20 @@ struct DebugRecorder DebugDataBuffer[0].pyramid.max2 = bounds.w; } - static void recordSampleCount(uint32_t count) { DebugDataBuffer[0].sampling.sampleCount = count; } static void recordRay(uint32_t i, float32_t3 dir, float32_t pdf) { DebugDataBuffer[0].sampling.rayData[i] = float32_t4(dir, pdf); } - static void recordFrameEnd(uint32_t3 region, uint32_t configIndex, uint32_t silSize, uint32_t silData, uint32_t vertexIndices[6], uint32_t validSampleCount) + static void recordFrameEnd(uint32_t3 region, uint32_t configIndex, uint32_t silSize, uint32_t silData, uint32_t vertexIndices[6], uint32_t validSampleCount, uint32_t sampleCount) { - InterlockedAdd(DebugDataBuffer[0].sampling.validSampleCount, validSampleCount); - InterlockedAdd(DebugDataBuffer[0].sampling.threadCount, 1u); DebugDataBuffer[0].silhouette.region = region; DebugDataBuffer[0].silhouette.silhouetteIndex = configIndex; DebugDataBuffer[0].silhouette.silhouetteVertexCount = silSize; for (uint32_t i = 0; i < 6; i++) - DebugDataBuffer[0].silhouette.vertices[i] = vertexIndices[i]; + DebugDataBuffer[0].silhouette.vertices[i] = vertexIndices[i]; DebugDataBuffer[0].silhouette.silhouette = silData; + + InterlockedAdd(DebugDataBuffer[0].sampling.validSampleCount, validSampleCount); + InterlockedAdd(DebugDataBuffer[0].sampling.threadCount, 1u); + DebugDataBuffer[0].sampling.sampleCount = sampleCount; } #else static void recordClippedVertex(uint32_t slot, float32_t3 pos, uint32_t originalIndex) {} @@ -93,10 +94,8 @@ struct DebugRecorder static void recordTriangleFan(bool 
luneDetected, uint32_t count, float32_t totalWeight, float32_t solidAngles[5]) {} static void recordParallelogram(float32_t area, uint32_t convexMask, uint32_t n3Mask, float32_t2 corner, float32_t2 axisDir, float32_t width, float32_t height) {} static void recordPyramid(float32_t3 axis1, float32_t3 axis2, float32_t3 center, float32_t4 bounds, float32_t solidAngle, uint32_t bestEdge) {} - static void recordSampleCount(uint32_t count) {} static void recordRay(uint32_t i, float32_t3 dir, float32_t pdf) {} - static void recordFrameEnd(uint32_t3 region, uint32_t configIndex, uint32_t silSize, - uint32_t silData, uint32_t vertexIndices[6], uint32_t validSampleCount) {} + static void recordFrameEnd(uint32_t3 region, uint32_t configIndex, uint32_t silSize, uint32_t silData, uint32_t vertexIndices[6], uint32_t validSampleCount, uint32_t sampleCount) {} #endif }; diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/parallelogram_sampling.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/parallelogram_sampling.hlsl index 7c99a3363..1751f1524 100644 --- a/73_SolidAngleVisualizer/app_resources/hlsl/parallelogram_sampling.hlsl +++ b/73_SolidAngleVisualizer/app_resources/hlsl/parallelogram_sampling.hlsl @@ -10,17 +10,40 @@ #include "drawing.hlsl" #define MAX_CURVE_APEXES 2 -#define GET_PROJ_VERT(i) silhouette.vertices[i].xy *CIRCLE_RADIUS +#define GET_PROJ_VERT(i) vertices[i].xy *CIRCLE_RADIUS // ============================================================================ // Minimum bounding rectangle on projected sphere +// +// All internal helpers operate on a pre-materialized + pre-normalized vertex +// array `verts[7]`. The factory `create(silhouette)` materializes verts +// locally via the silhouette's +/- walk (using its stored view) and absorbs +// SilEdgeNormals as a member so sample(xi, pdf) needs no extra args. 
// ============================================================================ struct Parallelogram { - float16_t2 corner; - float16_t2 axisDir; - float16_t width; - float16_t height; + using scalar_type = float32_t; + using vector2_type = float32_t2; + using vector3_type = float32_t3; + using domain_type = vector2_type; + using codomain_type = vector3_type; + using density_type = scalar_type; + using weight_type = density_type; + + // Cache for the TractableSampler concept: stores enough state from + // generate() that forwardPdf()/forwardWeight() are O(1) lookups instead + // of redoing the inside test. selectedIdx is unused for Parallelogram + // (no subdivision) but kept for uniform extraction by visualizeSample(). + struct cache_type + { + density_type pdf; + }; + + float16_t2 corner; + float16_t2 axisDir; + float16_t width; + float16_t height; + SilEdgeNormals normals; // per-edge cross products in world frame for the inside test in sample() // ======================================================================== // Projection helpers @@ -152,9 +175,9 @@ struct Parallelogram // Accurate=false (Fast): tests vertex + midpoint only. Used O(N^2) times for axis ranking. // Accurate=true: also computes tangent-line apex for convex edges. Used once for final rect. template - static void testEdgeForAxis(inout float32_t minAlong, inout float32_t maxAlong, inout float32_t minPerp, inout float32_t maxPerp, const ClippedSilhouette silhouette, uint32_t convexMask, uint32_t n3Mask, float32_t2 dir, float32_t2 perpDir) + static void testEdgeForAxis(inout float32_t minAlong, inout float32_t maxAlong, inout float32_t minPerp, inout float32_t maxPerp, float32_t3 vertices[MAX_SILHOUETTE_VERTICES], uint32_t count, uint32_t convexMask, uint32_t n3Mask, float32_t2 dir, float32_t2 perpDir) { - const uint32_t nextIdx = (I + 1 < silhouette.count) ? I + 1 : 0; + const uint32_t nextIdx = (I + 1 < count) ? 
I + 1 : 0; const float32_t2 projectedVertex = GET_PROJ_VERT(I); testPoint(minAlong, maxAlong, minPerp, maxPerp, projectedVertex, dir, perpDir); @@ -168,8 +191,8 @@ struct Parallelogram if (!isN3 && !isConvex) return; - float32_t3 S = silhouette.vertices[I]; - float32_t3 E = silhouette.vertices[nextIdx]; + float32_t3 S = vertices[I]; + float32_t3 E = vertices[nextIdx]; float32_t2 midPoint = evalCurvePoint(S, E, 0.5f); if (isN3) @@ -210,7 +233,7 @@ struct Parallelogram { if (isN3) { - float32_t2 midPoint = evalCurvePoint(silhouette.vertices[I], silhouette.vertices[nextIdx], 0.5f); + float32_t2 midPoint = evalCurvePoint(vertices[I], vertices[nextIdx], 0.5f); testPoint(minAlong, maxAlong, minPerp, maxPerp, midPoint, dir, perpDir); } } @@ -220,30 +243,30 @@ struct Parallelogram // Accurate=false: fast path for axis ranking during candidate selection. // Accurate=true: tight bounds with apex computation for the final rectangle. template - static void computeBoundsForAxis(inout float32_t minAlong, inout float32_t maxAlong, inout float32_t minPerp, inout float32_t maxPerp, const ClippedSilhouette silhouette, uint32_t convexMask, uint32_t n3Mask, float32_t2 dir, float32_t2 perpDir) + static void computeBoundsForAxis(inout float32_t minAlong, inout float32_t maxAlong, inout float32_t minPerp, inout float32_t maxPerp, float32_t3 vertices[MAX_SILHOUETTE_VERTICES], uint32_t count, uint32_t convexMask, uint32_t n3Mask, float32_t2 dir, float32_t2 perpDir) { - testEdgeForAxis<0, Accurate>(minAlong, maxAlong, minPerp, maxPerp, silhouette, convexMask, n3Mask, dir, perpDir); - testEdgeForAxis<1, Accurate>(minAlong, maxAlong, minPerp, maxPerp, silhouette, convexMask, n3Mask, dir, perpDir); - testEdgeForAxis<2, Accurate>(minAlong, maxAlong, minPerp, maxPerp, silhouette, convexMask, n3Mask, dir, perpDir); - if (silhouette.count > 3) + testEdgeForAxis<0, Accurate>(minAlong, maxAlong, minPerp, maxPerp, vertices, count, convexMask, n3Mask, dir, perpDir); + testEdgeForAxis<1, 
Accurate>(minAlong, maxAlong, minPerp, maxPerp, vertices, count, convexMask, n3Mask, dir, perpDir); + testEdgeForAxis<2, Accurate>(minAlong, maxAlong, minPerp, maxPerp, vertices, count, convexMask, n3Mask, dir, perpDir); + if (count > 3) { - testEdgeForAxis<3, Accurate>(minAlong, maxAlong, minPerp, maxPerp, silhouette, convexMask, n3Mask, dir, perpDir); - if (silhouette.count > 4) + testEdgeForAxis<3, Accurate>(minAlong, maxAlong, minPerp, maxPerp, vertices, count, convexMask, n3Mask, dir, perpDir); + if (count > 4) { - testEdgeForAxis<4, Accurate>(minAlong, maxAlong, minPerp, maxPerp, silhouette, convexMask, n3Mask, dir, perpDir); - if (silhouette.count > 5) + testEdgeForAxis<4, Accurate>(minAlong, maxAlong, minPerp, maxPerp, vertices, count, convexMask, n3Mask, dir, perpDir); + if (count > 5) { - testEdgeForAxis<5, Accurate>(minAlong, maxAlong, minPerp, maxPerp, silhouette, convexMask, n3Mask, dir, perpDir); - if (silhouette.count > 6) + testEdgeForAxis<5, Accurate>(minAlong, maxAlong, minPerp, maxPerp, vertices, count, convexMask, n3Mask, dir, perpDir); + if (count > 6) { - testEdgeForAxis<6, Accurate>(minAlong, maxAlong, minPerp, maxPerp, silhouette, convexMask, n3Mask, dir, perpDir); + testEdgeForAxis<6, Accurate>(minAlong, maxAlong, minPerp, maxPerp, vertices, count, convexMask, n3Mask, dir, perpDir); } } } } } - static void tryCaliperDir(inout float32_t bestArea, inout float32_t2 bestDir, const float32_t2 dir, const ClippedSilhouette silhouette, uint32_t n3Mask) + static void tryCaliperDir(inout float32_t bestArea, inout float32_t2 bestDir, const float32_t2 dir, float32_t3 vertices[MAX_SILHOUETTE_VERTICES], uint32_t count, uint32_t n3Mask) { float32_t2 perpDir = float32_t2(-dir.y, dir.x); @@ -252,7 +275,7 @@ struct Parallelogram float32_t minPerp = 1e10f; float32_t maxPerp = -1e10f; - computeBoundsForAxis(minAlong, maxAlong, minPerp, maxPerp, silhouette, 0, n3Mask, dir, perpDir); + computeBoundsForAxis(minAlong, maxAlong, minPerp, maxPerp, vertices, count, 
0, n3Mask, dir, perpDir); float32_t area = (maxAlong - minAlong) * (maxPerp - minPerp); if (area < bestArea) @@ -263,28 +286,28 @@ struct Parallelogram } template - static void processEdge(inout float32_t bestArea, inout float32_t2 bestDir, inout uint32_t convexMask, inout uint32_t n3Mask, const ClippedSilhouette silhouette, inout SilEdgeNormals precompSil) + static void processEdge(inout float32_t bestArea, inout float32_t2 bestDir, inout uint32_t convexMask, inout uint32_t n3Mask, float32_t3 vertices[MAX_SILHOUETTE_VERTICES], uint32_t count, inout SilEdgeNormals precompSil) { - const uint32_t nextIdx = (I + 1 < silhouette.count) ? I + 1 : 0; - float32_t3 S = silhouette.vertices[I]; - float32_t3 E = silhouette.vertices[nextIdx]; + const uint32_t nextIdx = (I + 1 < count) ? I + 1 : 0; + float32_t3 S = vertices[I]; + float32_t3 E = vertices[nextIdx]; precompSil.edgeNormals[I] = float16_t3(cross(S, E)); float32_t2 t0, t1; getProjectedTangents(S, E, t0, t1); - tryCaliperDir(bestArea, bestDir, t0, silhouette, n3Mask); + tryCaliperDir(bestArea, bestDir, t0, vertices, count, n3Mask); if (nbl::hlsl::cross2D(S.xy, E.xy) < -1e-6f) { convexMask |= (1u << I); - tryCaliperDir(bestArea, bestDir, t1, silhouette, n3Mask); + tryCaliperDir(bestArea, bestDir, t1, vertices, count, n3Mask); if (dot(t0, t1) < 0.5f) { n3Mask |= (1u << I); float32_t2 tangentAtMid = evalCurveTangent(S, E, 0.5f); - tryCaliperDir(bestArea, bestDir, tangentAtMid, silhouette, n3Mask); + tryCaliperDir(bestArea, bestDir, tangentAtMid, vertices, count, n3Mask); } } } @@ -293,7 +316,7 @@ struct Parallelogram // Factory methods // ======================================================================== - static Parallelogram buildForAxis(const ClippedSilhouette silhouette, uint32_t convexMask, uint32_t n3Mask, float32_t2 dir) + static Parallelogram buildForAxis(float32_t3 vertices[MAX_SILHOUETTE_VERTICES], uint32_t count, uint32_t convexMask, uint32_t n3Mask, float32_t2 dir) { float32_t2 perpDir = 
float32_t2(-dir.y, dir.x); @@ -302,7 +325,7 @@ struct Parallelogram float32_t minPerp = 1e10f; float32_t maxPerp = -1e10f; - computeBoundsForAxis(minAlong, maxAlong, minPerp, maxPerp, silhouette, convexMask, n3Mask, dir, perpDir); + computeBoundsForAxis(minAlong, maxAlong, minPerp, maxPerp, vertices, count, convexMask, n3Mask, dir, perpDir); Parallelogram result; result.width = (float16_t)(maxAlong - minAlong); @@ -313,95 +336,158 @@ struct Parallelogram return result; } - // Silhouette vertices must be normalized before calling create() - static Parallelogram create(const ClippedSilhouette silhouette, out SilEdgeNormals precompSil) + // Real factory: takes a pre-materialized + pre-normalized vertex array. + // The (silhouette) overload below handles materialization. + static Parallelogram createFromVertices(float32_t3 vertices[MAX_SILHOUETTE_VERTICES], uint32_t count) { - precompSil = (SilEdgeNormals)0; + SilEdgeNormals precompSil = (SilEdgeNormals)0; uint32_t convexMask = 0; uint32_t n3Mask = 0; float32_t bestArea = 1e10f; float32_t2 bestDir = float32_t2(1.0f, 0.0f); - processEdge<0>(bestArea, bestDir, convexMask, n3Mask, silhouette, precompSil); - processEdge<1>(bestArea, bestDir, convexMask, n3Mask, silhouette, precompSil); - processEdge<2>(bestArea, bestDir, convexMask, n3Mask, silhouette, precompSil); - if (silhouette.count > 3) + processEdge<0>(bestArea, bestDir, convexMask, n3Mask, vertices, count, precompSil); + processEdge<1>(bestArea, bestDir, convexMask, n3Mask, vertices, count, precompSil); + processEdge<2>(bestArea, bestDir, convexMask, n3Mask, vertices, count, precompSil); + if (count > 3) { - processEdge<3>(bestArea, bestDir, convexMask, n3Mask, silhouette, precompSil); - if (silhouette.count > 4) + processEdge<3>(bestArea, bestDir, convexMask, n3Mask, vertices, count, precompSil); + if (count > 4) { - processEdge<4>(bestArea, bestDir, convexMask, n3Mask, silhouette, precompSil); - if (silhouette.count > 5) + processEdge<4>(bestArea, bestDir, 
convexMask, n3Mask, vertices, count, precompSil); + if (count > 5) { - processEdge<5>(bestArea, bestDir, convexMask, n3Mask, silhouette, precompSil); - if (silhouette.count > 6) + processEdge<5>(bestArea, bestDir, convexMask, n3Mask, vertices, count, precompSil); + if (count > 6) { - processEdge<6>(bestArea, bestDir, convexMask, n3Mask, silhouette, precompSil); + processEdge<6>(bestArea, bestDir, convexMask, n3Mask, vertices, count, precompSil); } } } } - tryCaliperDir(bestArea, bestDir, float32_t2(1.0f, 0.0f), silhouette, n3Mask); - tryCaliperDir(bestArea, bestDir, float32_t2(0.0f, 1.0f), silhouette, n3Mask); + tryCaliperDir(bestArea, bestDir, float32_t2(1.0f, 0.0f), vertices, count, n3Mask); + tryCaliperDir(bestArea, bestDir, float32_t2(0.0f, 1.0f), vertices, count, n3Mask); - Parallelogram best = buildForAxis(silhouette, convexMask, n3Mask, bestDir); + Parallelogram best = buildForAxis(vertices, count, convexMask, n3Mask, bestDir); - for (uint32_t i = 0; i < silhouette.count; i++) + // Apex-draw cascade: literal per edge so vertices[I] / vertices[J] + // accesses keep vertices SROA-promoted (a single dynamic-index access here + // would demote the entire SilhouetteVerts to Function memory and tank + // every cascade above this point). 
+ apexDrawEdge<0, 1>(vertices, convexMask, n3Mask); + apexDrawEdge<1, 2>(vertices, convexMask, n3Mask); + if (count == 3) + { + apexDrawEdge<2, 0>(vertices, convexMask, n3Mask); + } + else { - if (convexMask & (1u << i)) + apexDrawEdge<2, 3>(vertices, convexMask, n3Mask); + if (count == 4) { - uint32_t nextIdx = (i + 1) % silhouette.count; - float32_t2 p0 = GET_PROJ_VERT(i); - float32_t2 p1 = GET_PROJ_VERT(nextIdx); - - float32_t2 t0, endTangent; - getProjectedTangents(silhouette.vertices[i], silhouette.vertices[nextIdx], t0, endTangent); - - if (n3Mask & (1u << i)) + apexDrawEdge<3, 0>(vertices, convexMask, n3Mask); + } + else + { + apexDrawEdge<3, 4>(vertices, convexMask, n3Mask); + if (count == 5) { - float32_t2 tangentAtMid = evalCurveTangent(silhouette.vertices[i], silhouette.vertices[nextIdx], 0.5f); - float32_t2 midPoint = evalCurvePoint(silhouette.vertices[i], silhouette.vertices[nextIdx], 0.5f); - - float32_t2 apex0, apex1; - computeApexClamped(p0, midPoint, t0, tangentAtMid, apex0); - computeApexClamped(midPoint, p1, tangentAtMid, endTangent, apex1); - - VisContext::add(SphereDrawer::drawDot(float32_t3(apex0, 0.0f), 0.03, 0.0f, float32_t3(1, 0, 1))); - VisContext::add(SphereDrawer::drawDot(float32_t3(midPoint, 0.0f), 0.02, 0.0f, float32_t3(0, 1, 0))); - VisContext::add(SphereDrawer::drawDot(float32_t3(apex1, 0.0f), 0.03, 0.0f, float32_t3(1, 0.5, 0))); + apexDrawEdge<4, 0>(vertices, convexMask, n3Mask); } else { - float32_t2 apex; - computeApexClamped(p0, p1, t0, endTangent, apex); - VisContext::add(SphereDrawer::drawDot(float32_t3(apex, 0.0f), 0.03, 0.0f, float32_t3(1, 0, 1))); + apexDrawEdge<4, 5>(vertices, convexMask, n3Mask); + if (count == 6) + { + apexDrawEdge<5, 0>(vertices, convexMask, n3Mask); + } + else // count == 7 + { + apexDrawEdge<5, 6>(vertices, convexMask, n3Mask); + apexDrawEdge<6, 0>(vertices, convexMask, n3Mask); + } } } } DebugRecorder::recordParallelogram(float32_t(best.width) * float32_t(best.height), convexMask, n3Mask, 
float32_t2(best.corner), float32_t2(best.axisDir), float32_t(best.width), float32_t(best.height)); + best.normals = precompSil; return best; } - float32_t3 sample(NBL_CONST_REF_ARG(SilEdgeNormals) silhouette, float32_t2 xi, out float32_t pdf, out bool valid) + // Per-edge apex-draw helper. Templated so vertices[I] / vertices[J] are + // literal-index reads. Skipped at runtime when the edge isn't convex. + template + static void apexDrawEdge(float32_t3 vertices[MAX_SILHOUETTE_VERTICES], uint32_t convexMask, uint32_t n3Mask) + { + if ((convexMask & (1u << I)) == 0u) + return; + + const float32_t2 p0 = GET_PROJ_VERT(I); + const float32_t2 p1 = GET_PROJ_VERT(J); + + float32_t2 t0, endTangent; + getProjectedTangents(vertices[I], vertices[J], t0, endTangent); + + if (n3Mask & (1u << I)) + { + const float32_t2 tangentAtMid = evalCurveTangent(vertices[I], vertices[J], 0.5f); + const float32_t2 midPoint = evalCurvePoint(vertices[I], vertices[J], 0.5f); + + float32_t2 apex0, apex1; + computeApexClamped(p0, midPoint, t0, tangentAtMid, apex0); + computeApexClamped(midPoint, p1, tangentAtMid, endTangent, apex1); + + VisContext::add(SphereDrawer::drawDot(float32_t3(apex0, 0.0f), 0.03, 0.0f, float32_t3(1, 0, 1))); + VisContext::add(SphereDrawer::drawDot(float32_t3(midPoint, 0.0f), 0.02, 0.0f, float32_t3(0, 1, 0))); + VisContext::add(SphereDrawer::drawDot(float32_t3(apex1, 0.0f), 0.03, 0.0f, float32_t3(1, 0.5, 0))); + } + else + { + float32_t2 apex; + computeApexClamped(p0, p1, t0, endTangent, apex); + VisContext::add(SphereDrawer::drawDot(float32_t3(apex, 0.0f), 0.03, 0.0f, float32_t3(1, 0, 1))); + } + } + + // Convenience overload: materialize + normalize verts on the stack via the + // silhouette's +/- walk, then forward to the real factory. Local verts[7] + // dies when this function returns; the Parallelogram (with its embedded + // edge normals) is the only thing that outlives create(). 
+ static Parallelogram create(NBL_CONST_REF_ARG(ClippedSilhouette) silhouette, shapes::OBBView view) + { + float32_t3 vertices[MAX_SILHOUETTE_VERTICES]; + silhouette.materializeNormalized(view, vertices); + return createFromVertices(vertices, silhouette.count); + } + + // TractableSampler::generate. Maps u in [0,1]^2 to a unit direction on the + // sphere via the orthographically-projected parallelogram, registers the + // pdf in the cache for O(1) forwardPdf, and stamps selectedIdx = 0 (no + // subdivision -- the field exists only for the visualization code path). + codomain_type generate(domain_type u, NBL_REF_ARG(cache_type) cache) { float16_t2 perpDir = float16_t2(-axisDir.y, axisDir.x); float16_t2 circleXY = corner + - (float16_t)(xi.x) * width * axisDir + - (float16_t)(xi.y) * height * perpDir; + (float16_t)(u.x) * width * axisDir + + (float16_t)(u.y) * height * perpDir; - float32_t3 direction = circleToSphere(circleXY); + codomain_type direction = circleToSphere(circleXY); - valid = direction.z > 0.0f && silhouette.isInside(direction); + bool valid = direction.z > 0.0f && normals.isInside(direction); // PDF in solid angle measure: the rectangle is in circle-space (scaled by CIRCLE_RADIUS), // and the orthographic projection Jacobian is dA_circle/dω = CIRCLE_RADIUS^2 * z - pdf = valid ? (CIRCLE_RADIUS * CIRCLE_RADIUS * direction.z / (float32_t(width) * float32_t(height))) : 0.0f; + cache.pdf = valid ? 
(CIRCLE_RADIUS * CIRCLE_RADIUS * direction.z / (scalar_type(width) * scalar_type(height))) : 0.0f; return direction; } + + density_type forwardPdf(domain_type u, cache_type cache) NBL_CONST_MEMBER_FUNC { return cache.pdf; } + weight_type forwardWeight(domain_type u, cache_type cache) NBL_CONST_MEMBER_FUNC { return cache.pdf; } + uint32_t selectedIdx(cache_type cache) NBL_CONST_MEMBER_FUNC { return 0; } }; #undef MAX_CURVE_APEXES diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling.hlsl index 1c7f3aaba..8b73a8ae1 100644 --- a/73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling.hlsl +++ b/73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling.hlsl @@ -10,275 +10,428 @@ #include #include #include -#include // acos_csc_approx #include #include #include "silhouette.hlsl" #include "drawing.hlsl" +#include "pyramid_sampling/bilinear.hlsl" + +// Tag-dispatched inner sampler factory: overload selected by the type of the +// default-constructed `tag` arg. Avoids the per-inner adapter struct. 
+inline sampling::SphericalRectangle buildInner(float32_t3x3 basis, float32_t2 r0, float32_t2 ext, sampling::SphericalRectangle /*tag*/) +{ + return sampling::SphericalRectangle::create(basis, float32_t3(r0, 1.0f), ext); +} + +inline sampling::ProjectedSphericalRectangle buildInner(float32_t3x3 basis, float32_t2 r0, float32_t2 ext, sampling::ProjectedSphericalRectangle /*tag*/) +{ + shapes::CompressedSphericalRectangle compressed; + compressed.origin = basis[0] * r0.x + basis[1] * r0.y + basis[2]; + compressed.right = basis[0] * ext.x; + compressed.up = basis[1] * ext.y; + return sampling::ProjectedSphericalRectangle::create(compressed, float32_t3(0.0f, 0.0f, 0.0f), float32_t3(0.0f, 0.0f, 1.0f), false); +} + +inline BilinearSampler buildInner(float32_t3x3 basis, float32_t2 r0, float32_t2 ext, BilinearSampler /*tag*/) +{ + return BilinearSampler::create(basis, r0, ext); +} -// ============================================================================ // Spherical Pyramid: gnomonic bounding rectangle for silhouette sampling. // -// Algorithm (SphericalPyramid::create): -// 1. Pass 1: walk the silhouette CCW, accumulating -// unnormCentroid = sum(cross(v_i, v_{i+1}) * acos_csc_approx(dot(v_i, v_{i+1}))) -// which is the sum of normalized outward edge normals weighted by arc length -// (Kelvin-Stokes form). This is the true spherical centroid of the polygon -// and serves as a much better gnomonic-projection axis than blending the raw -// vertex centroid toward (0,0,1). The cross products are also written into -// silEdgeNormals.edgeNormals[i] (used later by the inside-polygon test). -// 2. axis3 = normalize(unnormCentroid). -// 3. Pass 2: Frisvad basis (u, v) orthogonal to axis3; project all silhouette -// vertices to 2D gnomonic coordinates in (u, v) once, up front. -// 4. Pass 3: "guesstimate" calipers: pick the longest 2D edge as axis1, do -// a single bound pass. O(N) edge-length compares + 1 bound pass, vs the old -// O(N^2) cascade. 
The bound is slightly looser than the true min-area rect -// but the rejection sampler tolerates that. -// 5. Reconstruct 3D axis1, axis2; sign-stabilize axis1 against a world ref. +// UseCaliper=false: axis1 picks the longest world-space silhouette edge +// (one compare per edge, no inner loop, blind to perpendicular spread). +// UseCaliper=true: spherical rotating-caliper. For each candidate edge (A, B), +// the extremal opposing vertex C is found via argmax_K dot(C_K, precross) +// where precross = cross(B-A, n0); this matches argmax dot(n0, cross(C+A, C+B)) +// by the cyclic scalar triple product. Score = cos(dihedral) between the +// AB-great-circle and the Lexell-circle plane through (-A, -B, C). The +// lune cosine is a heuristic; the post-search bound pass is exact regardless. +// +// Pipeline: axis3 = normalize(-unnormCentroid); axis1 = project bestEdge3d +// onto plane(axis3); axis2 = cross(axis3, axis1); computeBound3D yields +// (rectR0, rectExtents). axis3 is not stored, reconstructed via getAxis3(). // -// axis3 is not stored, reconstructed as cross(axis1, axis2). -// rectR0 is float2 (z is always 1.0 in the local gnomonic frame). -// ============================================================================ +// rectR0/rectExtents are returned out-params from createFromVertices and not +// stored on the pyramid (the inner sampler keeps its own copy). The local +// vertex array dies at end-of-create-scope; only the inner sampler persists. 
+template struct SphericalPyramid { - float32_t3 axis1; // edge-aligned, perpendicular to axis3 - float32_t3 axis2; // = cross(axis3, axis1); axis3 reconstructed via getAxis3() - float32_t2 rectR0; // gnomonic bounding rect corner (z=1 implicit) - float32_t2 rectExtents; + using scalar_type = float32_t; + using vector2_type = float32_t2; + using vector3_type = float32_t3; + using domain_type = vector2_type; + using codomain_type = vector3_type; + using density_type = scalar_type; + using weight_type = density_type; + + // Caches the inner sampler's cache plus a pre-computed `pdf` that bakes in + // the silhouette/horizon validity test from generate(). + struct cache_type + { + typename InnerSampler::cache_type inner; + density_type pdf; + }; + + float32_t3 axis1; + float32_t3 axis2; // axis3 reconstructed via getAxis3() = cross(axis1, axis2) + + // Per-edge cross products in world space. Populated during Pass 1's + // centroid accumulation (also cached for caliper scoring), used by + // isInside(dir) in generate(). + SilEdgeNormals silEdgeNormals; + + // Constructed by create(silhouette, view) via tag-dispatched buildInner. + // The synth-vertices path (createFromVertices direct) leaves it default-init. + InnerSampler inner; float32_t3 getAxis3() NBL_CONST_MEMBER_FUNC { return cross(axis1, axis2); } - // ======================================================================== - // Pass 1: per-edge cross + arc-length-weighted accumulate - // ======================================================================== - template - static void accumulateEdge(NBL_CONST_REF_ARG(ClippedSilhouette) silhouette, NBL_REF_ARG(float32_t3) unnormCentroid, NBL_REF_ARG(SilEdgeNormals) silEdgeNormals) + // Pass 1: per-edge cross + Stokes centroid; UseCaliper=false also tracks + // the longest world edge here. Out params exist in both modes so the + // per-count cascade has one signature; DCE drops the longest-edge body when + // UseCaliper=true. 
+ template + void processEdge(float32_t3 vertices[MAX_SILHOUETTE_VERTICES], NBL_REF_ARG(float32_t3) unnormCentroid, NBL_REF_ARG(float32_t) bestLenSq, NBL_REF_ARG(float32_t3) bestEdge3d, NBL_REF_ARG(uint32_t) bestEdge) { - const uint32_t j = CheckCount ? ((I + 1 < silhouette.count) ? I + 1 : 0) : I + 1; - float32_t3 vI = silhouette.vertices[I]; - float32_t3 vJ = silhouette.vertices[j]; - float32_t3 c = cross(vI, vJ); + const float32_t3 vI = vertices[I]; + const float32_t3 vJ = vertices[J]; + + const float32_t3 c = cross(vI, vJ); silEdgeNormals.edgeNormals[I] = c; - // |c| = sin(arc) since vI, vJ are unit; so c/|c| * arc = c * acos(dot)/sin(arc) = c * acos_csc(dot). - // Clamp away from -1: acos_csc_approx contains log2(1+arg), which goes -inf at arg=-1 and - // produces inf-inf = NaN inside the order-2 polynomial for near-antipodal edges (which can - // occur for "wide" silhouettes whose adjacent vertices sit far apart on the sphere). - // TODO: will be moved to it's own namespace - const float32_t cos_arc = max(dot(vI, vJ), -1.0f + 1e-5f); - unnormCentroid += c * nbl::hlsl::shapes::acos_csc_approx(cos_arc); - } + unnormCentroid += c; - // ======================================================================== - // Pass 2: gnomonic project a single silhouette vertex into the (u,v) plane. - // Skips the (w_dot > 0) guard, axis3 = normalize(unnormCentroid) is the - // polygon's interior direction so all vertices have w_dot > 0 by construction. 
- // ======================================================================== - template - static float32_t2 projectVertex2D(NBL_CONST_REF_ARG(ClippedSilhouette) silhouette, float32_t3 axis_u, float32_t3 axis_v, float32_t3 axis3) - { - float32_t3 vert = silhouette.vertices[I]; - float32_t rcpW = rcp(dot(vert, axis3)); - return float32_t2(dot(vert, axis_u), dot(vert, axis_v)) * rcpW; + if (!UseCaliper) + { + // Explicit nbl::hlsl::select so DXC emits scalar-conditional OpSelect + // for the vec3 update instead of a bool-broadcast v3bool. + const float32_t3 edge3d = vJ - vI; + const float32_t lenSq = dot(edge3d, edge3d); + const bool isBest = lenSq > bestLenSq; + bestLenSq = max(lenSq, bestLenSq); + bestEdge3d = nbl::hlsl::select(isBest, edge3d, bestEdge3d); + bestEdge = nbl::hlsl::select(isBest, I, bestEdge); + } } - // ======================================================================== - // Pass 3: 2D rotating-calipers helpers - // ======================================================================== - template - static void boundOne2D(const float32_t2 verts2d[MAX_SILHOUETTE_VERTICES], float32_t2 axis2d, float32_t2 perp2d, NBL_REF_ARG(float32_t4) bound) + // Caliper-only helpers (DCE'd when UseCaliper=false). + + // Track the silhouette vertex with max dot(vK, precross). SkipA/SkipB are + // the candidate edge's (I, J); compile-time skipped (drops the verts[K] + // read entirely). Assumes vertices are ~unit length so we can skip the + // per-K |vK| factor in the cosine. 
+ template + static void tryK(float32_t3 vertices[MAX_SILHOUETTE_VERTICES], float32_t3 precross, NBL_REF_ARG(float32_t) bestNum, NBL_REF_ARG(float32_t3) bestC) { - float32_t2 v2 = verts2d[K]; - float32_t x = dot(v2, axis2d); - float32_t y = dot(v2, perp2d); - bound.x = min(bound.x, x); - bound.y = min(bound.y, y); - bound.z = max(bound.z, x); - bound.w = max(bound.w, y); + if (K != SkipA && K != SkipB) + { + const float32_t3 vK = vertices[K]; + const float32_t num = dot(vK, precross); + const bool better = num > bestNum; + bestNum = max(num, bestNum); + bestC = nbl::hlsl::select(better, vK, bestC); + } } - static void computeBound2D(const float32_t2 verts2d[MAX_SILHOUETTE_VERTICES], uint32_t count, float32_t2 axis2d, float32_t2 perp2d, NBL_REF_ARG(float32_t4) bound) + // Cascade-on-count K scan with (I, J) as compile-time skips. bestNum seeds + // at -inf; bestC's placeholder is always overwritten (count >= 3). + template + static float32_t3 findExtremalC(float32_t3 vertices[MAX_SILHOUETTE_VERTICES], uint32_t count, float32_t3 precross) { - bound = float32_t4(1e10f, 1e10f, -1e10f, -1e10f); - boundOne2D<0>(verts2d, axis2d, perp2d, bound); - boundOne2D<1>(verts2d, axis2d, perp2d, bound); - boundOne2D<2>(verts2d, axis2d, perp2d, bound); + float32_t bestNum = -1e30f; + float32_t3 bestC = vertices[0]; + tryK<0, I, J>(vertices, precross, bestNum, bestC); + tryK<1, I, J>(vertices, precross, bestNum, bestC); + tryK<2, I, J>(vertices, precross, bestNum, bestC); if (count > 3) { - boundOne2D<3>(verts2d, axis2d, perp2d, bound); + tryK<3, I, J>(vertices, precross, bestNum, bestC); if (count > 4) { - boundOne2D<4>(verts2d, axis2d, perp2d, bound); + tryK<4, I, J>(vertices, precross, bestNum, bestC); if (count > 5) { - boundOne2D<5>(verts2d, axis2d, perp2d, bound); + tryK<5, I, J>(vertices, precross, bestNum, bestC); if (count > 6) - boundOne2D<6>(verts2d, axis2d, perp2d, bound); + tryK<6, I, J>(vertices, precross, bestNum, bestC); } } } + return bestC; } - // "Guesstimate" pass 
3: pick the longest 2D edge as axis1 and do ONE bound - // computation, instead of trying every edge as a caliper candidate. O(N) + - // one bound pass, vs old O(N^2) of bound passes. The bound is slightly - // looser than the true min-area rect (typically a few percent for OBB - // silhouettes), but the rejection sampler tolerates that. - template - static void considerEdge(const float32_t2 verts2d[MAX_SILHOUETTE_VERTICES], uint32_t count, NBL_REF_ARG(float32_t) bestLenSq, NBL_REF_ARG(float32_t2) bestEdge2d, NBL_REF_ARG(uint32_t) bestEdge) + // Score candidate edge (I, J) by cos(dihedral) between AB-great-circle + // and Lexell plane through (-A, -B, C_win). Identity used: + // cross(C+A, C+B) = n0 + cross(A, C) + cross(C, B) + // so we reuse cached n0. Larger score = smaller bounding lune. max(.,1e-30f) + // keeps rsqrt finite on collapsed edges (they lose on numerator anyway). + template + static void evalCandidate(float32_t3 vertices[MAX_SILHOUETTE_VERTICES], uint32_t count, NBL_CONST_REF_ARG(SilEdgeNormals) sen, NBL_REF_ARG(float32_t) bestScore, NBL_REF_ARG(float32_t3) bestEdge3d, NBL_REF_ARG(uint32_t) bestEdge) { - const uint32_t j = CheckCount ? ((I + 1 < count) ? I + 1 : 0) : I + 1; - float32_t2 edge2d = verts2d[j] - verts2d[I]; - float32_t lenSq = dot(edge2d, edge2d); - // Sticky 1% threshold (in lenSq, ~0.5% in length) prevents axis1 from flipping - // between two near-equal-length edges as the silhouette deforms. 
- if (lenSq > bestLenSq * (1.0f + 1e-2f)) - { - bestLenSq = lenSq; - bestEdge2d = edge2d; - bestEdge = I; - } + const float32_t3 vI = vertices[I]; + const float32_t3 vJ = vertices[J]; + const float32_t3 n0 = sen.edgeNormals[I]; + const float32_t3 edge3d = vJ - vI; + + const float32_t3 precross = cross(edge3d, n0); + const float32_t3 C = findExtremalC(vertices, count, precross); + + const float32_t3 lexell_n1 = n0 + cross(vI, C) + cross(C, vJ); + const float32_t numerator = dot(n0, lexell_n1); + const float32_t edgeDenomSq = dot(n0, n0) * dot(lexell_n1, lexell_n1); + const float32_t score = numerator * rsqrt(max(edgeDenomSq, 1e-30f)); + + const bool better = score > bestScore; + bestScore = max(score, bestScore); + bestEdge3d = nbl::hlsl::select(better, edge3d, bestEdge3d); + bestEdge = nbl::hlsl::select(better, I, bestEdge); } - // ======================================================================== - // Factory - // ======================================================================== + // Gnomonic-project each silhouette vertex into the (axis1, axis2, axis3) + // frame and accumulate the AABB. 
+ template + static void boundOne3D(float32_t3 vertices[MAX_SILHOUETTE_VERTICES], float32_t3 axis1, float32_t3 perp, float32_t3 axis3, NBL_REF_ARG(float32_t4) bound) + { + const float32_t3 vert = vertices[I]; + const float32_t rcpDp = rcp(dot(vert, axis3)); + const float32_t x = dot(vert, axis1) * rcpDp; + const float32_t y = dot(vert, perp) * rcpDp; + bound.x = min(bound.x, x); + bound.y = min(bound.y, y); + bound.z = max(bound.z, x); + bound.w = max(bound.w, y); + } - static SphericalPyramid create(NBL_CONST_REF_ARG(ClippedSilhouette) silhouette, NBL_REF_ARG(SilEdgeNormals) silEdgeNormals) + static void computeBound3D(float32_t3 vertices[MAX_SILHOUETTE_VERTICES], uint32_t count, float32_t3 axis1, float32_t3 perp, float32_t3 axis3, NBL_REF_ARG(float32_t4) bound) { - SphericalPyramid self; - silEdgeNormals = (SilEdgeNormals)0; - - // Pass 1: build unnormCentroid (true spherical centroid) and edgeNormals. - // Seed with a tiny scaled vertex centroid so symmetric / near-cancelling - // shapes don't degenerate to a zero direction on `normalize`. - float32_t3 unnormCentroid = silhouette.getUnnormalizedCenter() * 1e-6f; - - // Count-cascade: silhouette.vertices[I] for I >= count is uninitialized in some - // call sites (e.g. solid_angle_vis.frag.hlsl declares ClippedSilhouette without - // zero-init), so we must NOT read past count. I=2 needs the wrap check because - // count can be exactly 3 (j must wrap to 0). 
- accumulateEdge<0>(silhouette, unnormCentroid, silEdgeNormals); - accumulateEdge<1>(silhouette, unnormCentroid, silEdgeNormals); - accumulateEdge<2, true>(silhouette, unnormCentroid, silEdgeNormals); - if (silhouette.count > 3) + bound = float32_t4(1e10f, 1e10f, -1e10f, -1e10f); + boundOne3D<0>(vertices, axis1, perp, axis3, bound); + boundOne3D<1>(vertices, axis1, perp, axis3, bound); + boundOne3D<2>(vertices, axis1, perp, axis3, bound); + if (count > 3) { - accumulateEdge<3, true>(silhouette, unnormCentroid, silEdgeNormals); - if (silhouette.count > 4) + boundOne3D<3>(vertices, axis1, perp, axis3, bound); + if (count > 4) { - accumulateEdge<4, true>(silhouette, unnormCentroid, silEdgeNormals); - if (silhouette.count > 5) + boundOne3D<4>(vertices, axis1, perp, axis3, bound); + if (count > 5) { - accumulateEdge<5, true>(silhouette, unnormCentroid, silEdgeNormals); - if (silhouette.count > 6) - accumulateEdge<6, true>(silhouette, unnormCentroid, silEdgeNormals); + boundOne3D<5>(vertices, axis1, perp, axis3, bound); + if (count > 6) + boundOne3D<6>(vertices, axis1, perp, axis3, bound); } } } + } - const float32_t3 axis3 = normalize(-unnormCentroid); - - // Pass 2: Frisvad basis + 2D gnomonic projection (one-time, before calipers). - float32_t3 u, v; - nbl::hlsl::math::frisvad(axis3, u, v); - - // Project only the first `count` vertices; entries past `count` are unread by - // try2DCaliper since its cascade is also count-gated. - float32_t2 verts2d[MAX_SILHOUETTE_VERTICES]; - verts2d[0] = projectVertex2D<0>(silhouette, u, v, axis3); - verts2d[1] = projectVertex2D<1>(silhouette, u, v, axis3); - verts2d[2] = projectVertex2D<2>(silhouette, u, v, axis3); - if (silhouette.count > 3) + // Pyramid from pre-materialized verts; (rectR0, rectExtents) returned as + // out-params (not stored on the pyramid). 
+ static SphericalPyramid createFromVertices(float32_t3 vertices[MAX_SILHOUETTE_VERTICES], uint32_t count, NBL_REF_ARG(float32_t2) outRectR0, NBL_REF_ARG(float32_t2) outRectExtents) + { + SphericalPyramid self; + // Sentinel-init so unused slots (count..6) produce dot(dir,(0,0,-1)) < 0 + // for the sign-bit AND in SilEdgeNormals::isInside. + self.silEdgeNormals = SilEdgeNormals::initSentinel(); + + // Tiny z-bias seed so symmetric shapes don't normalize(0) to NaN; the + // cross sum dominates for any non-degenerate silhouette. + // verts past count are zero-init by materialize, so reading them is harmless. + float32_t3 unnormCentroid = float32_t3(0.0f, 0.0f, 1e-6f); + float32_t bestLenSq = 0.0f; + float32_t3 bestEdge3d = float32_t3(1.0f, 0.0f, 0.0f); + uint32_t bestEdge = 0; + + self.processEdge<0, 1>(vertices, unnormCentroid, bestLenSq, bestEdge3d, bestEdge); + self.processEdge<1, 2>(vertices, unnormCentroid, bestLenSq, bestEdge3d, bestEdge); + if (count == 3) + { + self.processEdge<2, 0>(vertices, unnormCentroid, bestLenSq, bestEdge3d, bestEdge); + } + else { - verts2d[3] = projectVertex2D<3>(silhouette, u, v, axis3); - if (silhouette.count > 4) + self.processEdge<2, 3>(vertices, unnormCentroid, bestLenSq, bestEdge3d, bestEdge); + if (count == 4) + { + self.processEdge<3, 0>(vertices, unnormCentroid, bestLenSq, bestEdge3d, bestEdge); + } + else { - verts2d[4] = projectVertex2D<4>(silhouette, u, v, axis3); - if (silhouette.count > 5) + self.processEdge<3, 4>(vertices, unnormCentroid, bestLenSq, bestEdge3d, bestEdge); + if (count == 5) { - verts2d[5] = projectVertex2D<5>(silhouette, u, v, axis3); - if (silhouette.count > 6) - verts2d[6] = projectVertex2D<6>(silhouette, u, v, axis3); + self.processEdge<4, 0>(vertices, unnormCentroid, bestLenSq, bestEdge3d, bestEdge); + } + else + { + self.processEdge<4, 5>(vertices, unnormCentroid, bestLenSq, bestEdge3d, bestEdge); + if (count == 6) + { + self.processEdge<5, 0>(vertices, unnormCentroid, bestLenSq, bestEdge3d, 
bestEdge); + } + else // count == 7 + { + self.processEdge<5, 6>(vertices, unnormCentroid, bestLenSq, bestEdge3d, bestEdge); + self.processEdge<6, 0>(vertices, unnormCentroid, bestLenSq, bestEdge3d, bestEdge); + } } } } - // Pass 3: pick longest 2D edge as axis1 ("guesstimate" rotating calipers). - // O(N) edge-length comparisons, then ONE bound pass after the winner is known. - float32_t bestLenSq = 0.0f; - float32_t2 bestEdge2d = float32_t2(1.0f, 0.0f); - uint32_t bestEdge = 0; + const float32_t3 axis3 = normalize(-unnormCentroid); - considerEdge<0>(verts2d, silhouette.count, bestLenSq, bestEdge2d, bestEdge); - considerEdge<1>(verts2d, silhouette.count, bestLenSq, bestEdge2d, bestEdge); - considerEdge<2, true>(verts2d, silhouette.count, bestLenSq, bestEdge2d, bestEdge); - if (silhouette.count > 3) + // Pass 2: caliper dihedral scan overwrites bestEdge3d. Skipped under + // UseCaliper=false (keeps Pass 1's longest edge). + if (UseCaliper) { - considerEdge<3, true>(verts2d, silhouette.count, bestLenSq, bestEdge2d, bestEdge); - if (silhouette.count > 4) + float32_t bestScore = -2.0f; + + evalCandidate<0, 1>(vertices, count, self.silEdgeNormals, bestScore, bestEdge3d, bestEdge); + evalCandidate<1, 2>(vertices, count, self.silEdgeNormals, bestScore, bestEdge3d, bestEdge); + if (count == 3) { - considerEdge<4, true>(verts2d, silhouette.count, bestLenSq, bestEdge2d, bestEdge); - if (silhouette.count > 5) + evalCandidate<2, 0>(vertices, count, self.silEdgeNormals, bestScore, bestEdge3d, bestEdge); + } + else + { + evalCandidate<2, 3>(vertices, count, self.silEdgeNormals, bestScore, bestEdge3d, bestEdge); + if (count == 4) + { + evalCandidate<3, 0>(vertices, count, self.silEdgeNormals, bestScore, bestEdge3d, bestEdge); + } + else { - considerEdge<5, true>(verts2d, silhouette.count, bestLenSq, bestEdge2d, bestEdge); - if (silhouette.count > 6) - considerEdge<6, true>(verts2d, silhouette.count, bestLenSq, bestEdge2d, bestEdge); + evalCandidate<3, 4>(vertices, count, 
self.silEdgeNormals, bestScore, bestEdge3d, bestEdge); + if (count == 5) + { + evalCandidate<4, 0>(vertices, count, self.silEdgeNormals, bestScore, bestEdge3d, bestEdge); + } + else + { + evalCandidate<4, 5>(vertices, count, self.silEdgeNormals, bestScore, bestEdge3d, bestEdge); + if (count == 6) + { + evalCandidate<5, 0>(vertices, count, self.silEdgeNormals, bestScore, bestEdge3d, bestEdge); + } + else // count == 7 + { + evalCandidate<5, 6>(vertices, count, self.silEdgeNormals, bestScore, bestEdge3d, bestEdge); + evalCandidate<6, 0>(vertices, count, self.silEdgeNormals, bestScore, bestEdge3d, bestEdge); + } + } } } } - // Single bound pass with the winning edge as axis1. Fall back to (1,0) if - // every edge degenerated (silhouette projects to a single point). - const float32_t2 bestAxis2d = bestLenSq > 1e-12f ? bestEdge2d * rsqrt(bestLenSq) : float32_t2(1.0f, 0.0f); - const float32_t2 bestPerp2d = float32_t2(-bestAxis2d.y, bestAxis2d.x); - float32_t4 bestBound; - computeBound2D(verts2d, silhouette.count, bestAxis2d, bestPerp2d, bestBound); - - // Pass 4: reconstruct 3D, sign-stabilize axis1 against a world reference. - // For right-handed (u, v, axis3) Frisvad basis, cross(axis3, u) = v and cross(axis3, v) = -u, - // so axis1 = u*a + v*b => axis2 = cross(axis3, axis1) = v*a - u*b. Skip the 3D `cross`. - const float32_t3 axis1Raw = u * bestAxis2d.x + v * bestAxis2d.y; - const float32_t3 axis2Raw = v * bestAxis2d.x - u * bestAxis2d.y; - { - // Sign-stabilize axis1 against a world reference, branchless. - // axis1 is already perpendicular to axis3, so dot(axis1, worldRef - axis3*dot(worldRef,axis3)) - // == dot(axis1, worldRef). Flipping axis1 also flips axis2 (both negate together since - // axis2 = cross(axis3, axis1)); mirror both x and y bounds simultaneously. 
- const float32_t3 worldRef = nbl::hlsl::select(abs(axis3.x) < 0.9f, float32_t3(1.0f, 0.0f, 0.0f), float32_t3(0.0f, 1.0f, 0.0f)); - const bool flip = dot(axis1Raw, worldRef) < 0.0f; - self.axis1 = nbl::hlsl::select(flip, -axis1Raw, axis1Raw); - self.axis2 = nbl::hlsl::select(flip, -axis2Raw, axis2Raw); - bestBound = nbl::hlsl::select(flip, float32_t4(-bestBound.z, -bestBound.w, -bestBound.x, -bestBound.y), bestBound); - } + // axis1 = winning chord projected onto plane(axis3) and normalized. + // max(lenSq, 1e-12) keeps rsqrt finite; degenerate select picks a stable + // axis perpendicular to axis3. + const float32_t3 inPlaneEdge = bestEdge3d - axis3 * dot(bestEdge3d, axis3); + const float32_t inPlaneLenSq = dot(inPlaneEdge, inPlaneEdge); + const bool useY = abs(axis3.x) >= 0.9f; + const float32_t scale = rsqrt(max(inPlaneLenSq, 1e-12f)); + + const bool degenerate = inPlaneLenSq <= 1e-12f; + const float32_t3 fallbackAxis1 = nbl::hlsl::select(useY, float32_t3(0.0f, 1.0f, 0.0f), float32_t3(1.0f, 0.0f, 0.0f)); + self.axis1 = nbl::hlsl::select(degenerate, fallbackAxis1, inPlaneEdge * scale); + self.axis2 = cross(axis3, self.axis1); + + float32_t4 bestBound; + computeBound3D(vertices, count, self.axis1, self.axis2, axis3, bestBound); + + // Per-axis degenerate clamp: each upper bound at least 1e-6 above lower. + // Independent per axis so a single collapsed axis doesn't kill the other. + bestBound.zw = max(bestBound.zw, bestBound.xy + 1e-6f); + + outRectR0 = bestBound.xy; + outRectExtents = float32_t2(bestBound.zw - bestBound.xy); + + // Pre-rotate edge normals into local frame so per-sample inside test + // can use the cheaper 2D form (2 muls + 2 adds + n.z per edge instead + // of 3 muls + 2 adds). Amortized once per build; saves 7 muls/sample. + self.silEdgeNormals.transformToLocal(self.axis1, self.axis2, axis3); + + // solidAngle for the debug overlay only. 
+ const float32_t4 denorm_n_z = float32_t4(-bestBound.y, bestBound.z, bestBound.w, -bestBound.x); + const float32_t4 n_z = denorm_n_z * rsqrt(float32_t4(1.0f, 1.0f, 1.0f, 1.0f) + denorm_n_z * denorm_n_z); + const float32_t4 cosGamma = float32_t4(-n_z[0] * n_z[1], -n_z[1] * n_z[2], -n_z[2] * n_z[3], -n_z[3] * n_z[0]); + math::sincos_accumulator acc = math::sincos_accumulator::create(cosGamma[0]); + acc.addCosine(cosGamma[1]); + acc.addCosine(cosGamma[2]); + acc.addCosine(cosGamma[3]); + const float32_t solidAngle = acc.getSumOfArccos() - 2.0f * numbers::pi; + + DebugRecorder::recordPyramid(self.axis1, self.axis2, -unnormCentroid, bestBound, solidAngle, bestEdge); + self.visualize(vertices, count, outRectR0, outRectExtents); + + return self; + } + + // Materialize verts from the silhouette, build the pyramid, then construct + // the InnerSampler via tag-dispatched buildInner. Local rect data dies at + // end-of-scope; only the inner sampler retains a copy. + static SphericalPyramid create(NBL_CONST_REF_ARG(ClippedSilhouette) silhouette, shapes::OBBView view) + { + float32_t3 vertices[MAX_SILHOUETTE_VERTICES]; + silhouette.materialize(view, vertices); - // Degenerate bounds fallback (branchless). 
- const bool degenerateBounds = bestBound.x >= bestBound.z || bestBound.y >= bestBound.w; - bestBound = nbl::hlsl::select(degenerateBounds, float32_t4(-0.1f, -0.1f, 0.1f, 0.1f), bestBound); - - self.rectR0 = bestBound.xy; - self.rectExtents = float32_t2(bestBound.zw - bestBound.xy); - - VisContext::add(SphereDrawer::drawDot(normalize(-unnormCentroid), 0.05f, 0.0f, float32_t3(1.0f, 0.0f, 1.0f))); - VisContext::add(SphereDrawer::visualizeBestCaliperEdge(silhouette, bestEdge)); - self.visualize(); - - // DCE - nbl::hlsl::sampling::SphericalRectangle rectSampler = nbl::hlsl::sampling::SphericalRectangle::create(float32_t3x3(self.axis1, self.axis2, axis3), float32_t3(self.rectR0, 1.0f), self.rectExtents); - DebugRecorder::recordPyramid(self.axis1, self.axis2, -unnormCentroid, bestBound, rectSampler.solidAngle, bestEdge); + float32_t2 rectR0, rectExtents; + SphericalPyramid self = createFromVertices(vertices, silhouette.count, rectR0, rectExtents); + // tag's value is unread; only its type selects the overload. + const float32_t3x3 basis = float32_t3x3(self.axis1, self.axis2, self.getAxis3()); + InnerSampler tag; + self.inner = buildInner(basis, rectR0, rectExtents, tag); return self; } - // ======================================================================== - // Visualization - // ======================================================================== + // Generate via inner.generateNormalizedLocal so we can recover gnomonic + // (localX, localY) for the 2D inside test. With rectR0.z == 1, localDir.z = + // 1/hitDist, so localDir.{x,y} * hitDist == gnomonic coords. Bake + // silhouette/horizon validity into cache.pdf so forwardPdf is O(1). 
+ codomain_type generate(domain_type u, NBL_REF_ARG(cache_type) cache) + { + scalar_type hitDist; + const codomain_type localDir = inner.generateNormalizedLocal(u, cache.inner, hitDist); + const codomain_type dir = localDir.x * axis1 + localDir.y * axis2 + localDir.z * getAxis3(); + const scalar_type localX = localDir.x * hitDist; + const scalar_type localY = localDir.y * hitDist; + const bool valid = dir.z > 0.0f && silEdgeNormals.isInsideLocal(localX, localY); + cache.pdf = hlsl::select(valid, inner.forwardPdf(u, cache.inner), 0.0f); + return dir; + } - void visualize() + density_type forwardPdf(domain_type u, cache_type cache) NBL_CONST_MEMBER_FUNC { return cache.pdf; } + weight_type forwardWeight(domain_type u, cache_type cache) NBL_CONST_MEMBER_FUNC { return cache.pdf; } + uint32_t selectedIdx(cache_type cache) NBL_CONST_MEMBER_FUNC { return 0u; } + + // Visualization (debug only). Takes verts + count to highlight the chosen + // edge; rectR0/rectExtents are passed in since the pyramid doesn't store them. 
+ uint32_t findChosenEdge(uint32_t count) NBL_CONST_MEMBER_FUNC + { + uint32_t bestI = 0; + float32_t bestAbs = abs(silEdgeNormals.edgeNormals[0].x); + + for (uint32_t i = 0; i < count; i++) + { + const float32_t v = abs(silEdgeNormals.edgeNormals[i].x); + const bool better = v < bestAbs; + bestAbs = nbl::hlsl::select(better, v, bestAbs); + bestI = nbl::hlsl::select(better, i, bestI); + } + + return bestI; + } + + void visualize(float32_t3 vertices[MAX_SILHOUETTE_VERTICES], uint32_t count, float32_t2 rectR0, float32_t2 rectExtents) NBL_CONST_MEMBER_FUNC { // Colors for visualization float32_t3 boundColor1 = float32_t3(1.0f, 0.5f, 0.5f); // Light red for axis1 bounds float32_t3 boundColor2 = float32_t3(0.5f, 0.5f, 1.0f); // Light blue for axis2 bounds float32_t3 centerColor = float32_t3(1.0f, 1.0f, 0.0f); // Yellow for center + float32_t3 chosenColor = float32_t3(1.0f, 0.65f, 0.0f); // Orange for chosen edge highlight + float32_t3 cornerColor = float32_t3(1.0f, 1.0f, 1.0f); // White for rect corners float32_t3 a3 = getAxis3(); float32_t x0 = rectR0.x; @@ -312,14 +465,18 @@ struct SphericalPyramid VisContext::add(SphereDrawer::drawGreatCircleHalf(leftNormal, a3, boundColor1, 0.004f)); VisContext::add(SphereDrawer::drawGreatCircleHalf(rightNormal, a3, boundColor1, 0.004f)); + // Highlight the chosen silhouette edge (recovered from cached silEdgeNormals). 
+ const uint32_t bestI = findChosenEdge(count); + const uint32_t bestJ = (bestI + 1u) % count; + const float32_t3 vBestI = vertices[bestI]; + const float32_t3 vBestJ = vertices[bestJ]; + float32_t3 chosen[2] = {vBestI, vBestJ}; + VisContext::add(SphereDrawer::drawEdge(8u, chosen, 0.012f)); // colorLUT[8] = orange + VisContext::add(SphereDrawer::drawDot(axis1, 0.025f, 0.0f, float32_t3(1.0f, 0.0f, 0.0f))); VisContext::add(SphereDrawer::drawDot(axis2, 0.025f, 0.0f, float32_t3(0.0f, 1.0f, 0.0f))); VisContext::add(SphereDrawer::drawDot(a3, 0.025f, 0.0f, float32_t3(0.0f, 0.0f, 1.0f))); } }; - -#include "pyramid_sampling/bilinear.hlsl" -#include "pyramid_sampling/biquadratic.hlsl" - #endif // _SOLID_ANGLE_VIS_EXAMPLE_PYRAMID_SAMPLING_HLSL_INCLUDED_ diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling/bilinear.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling/bilinear.hlsl index 4094e6bd3..4b0f85cbf 100644 --- a/73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling/bilinear.hlsl +++ b/73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling/bilinear.hlsl @@ -5,72 +5,98 @@ #define _SOLID_ANGLE_VIS_EXAMPLE_SAMPLING_BILINEAR_HLSL_INCLUDED_ #include -// ============================================================================ -// Bilinear Approximation Sampling (closed-form, faster than biquadratic) -// ============================================================================ -// +// Bilinear gnomonic-rect sampler. Stores the pyramid's basis so generate() +// returns world-space dirs (matching SphericalRectangle's contract). 
struct BilinearSampler { - nbl::hlsl::sampling::Bilinear sampler; + using scalar_type = float32_t; + using vector2_type = float32_t2; + using vector3_type = float32_t3; + using matrix3x3_type = float32_t3x3; + using domain_type = vector2_type; + using codomain_type = vector3_type; + using density_type = scalar_type; + using weight_type = density_type; - float32_t rcpRectArea; + nbl::hlsl::sampling::Bilinear sampler; + matrix3x3_type basis; + float32_t2 rectR0; + float32_t2 rectExtents; + float32_t rcpRectArea; - // Precompute bilinear sampler from pyramid - static BilinearSampler create(NBL_CONST_REF_ARG(SphericalPyramid) pyramid) - { - BilinearSampler self; + struct cache_type + { + nbl::hlsl::sampling::Bilinear::cache_type bilinearCache; + float32_t dist2; + float32_t rcpLen; + }; - // 4 corner positions on the rectangle - const float32_t x0 = pyramid.rectR0.x; - const float32_t x1 = x0 + pyramid.rectExtents.x; - const float32_t y0 = pyramid.rectR0.y; - const float32_t y1 = y0 + pyramid.rectExtents.y; + static BilinearSampler create(matrix3x3_type basis, float32_t2 rectR0, float32_t2 rectExtents) + { + BilinearSampler self; + self.basis = basis; - // dSA(x,y) = 1 / (x^2 + y^2 + 1)^(3/2) [z = 1.0 in local frame] - const float32_t xx0 = x0 * x0, xx1 = x1 * x1; - const float32_t yy0 = y0 * y0, yy1 = y1 * y1; + // 4 corner positions on the rectangle + const float32_t x0 = rectR0.x; + const float32_t x1 = x0 + rectExtents.x; + const float32_t y0 = rectR0.y; + const float32_t y1 = y0 + rectExtents.y; - // d^{-3/2} = rsqrt(d)^3: 1 rsqrt + 2 mul instead of 1 rsqrt + 1 div - float32_t r; - r = rsqrt(xx0 + yy0 + 1.0f); - const float32_t v00 = r * r * r; // x0y0 - r = rsqrt(xx1 + yy0 + 1.0f); - const float32_t v10 = r * r * r; // x1y0 - r = rsqrt(xx0 + yy1 + 1.0f); - const float32_t v01 = r * r * r; // x0y1 - r = rsqrt(xx1 + yy1 + 1.0f); - const float32_t v11 = r * r * r; // x1y1 + // dSA(x,y) = 1 / (x^2 + y^2 + 1)^(3/2) [z = 1.0 in local frame] + const float32_t xx0 = x0 * 
x0, xx1 = x1 * x1; + const float32_t yy0 = y0 * y0, yy1 = y1 * y1; - // Bilinear layout: (x0y0, x0y1, x1y0, x1y1) - self.sampler = nbl::hlsl::sampling::Bilinear::create(float32_t4(v00, v01, v10, v11)); - self.rcpRectArea = rcp(max(pyramid.rectExtents.x * pyramid.rectExtents.y, 1e-20f)); + // d^{-3/2} = rsqrt(d)^3: 1 rsqrt + 2 mul instead of 1 rsqrt + 1 div + float32_t r; + r = rsqrt(xx0 + yy0 + 1.0f); + const float32_t v00 = r * r * r; + r = rsqrt(xx1 + yy0 + 1.0f); + const float32_t v10 = r * r * r; + r = rsqrt(xx0 + yy1 + 1.0f); + const float32_t v01 = r * r * r; + r = rsqrt(xx1 + yy1 + 1.0f); + const float32_t v11 = r * r * r; - return self; - } + // Bilinear layout: (x0y0, x0y1, x1y0, x1y1) + self.sampler = nbl::hlsl::sampling::Bilinear::create(float32_t4(v00, v01, v10, v11)); + self.rectR0 = rectR0; + self.rectExtents = rectExtents; + self.rcpRectArea = rcp(max(rectExtents.x * rectExtents.y, 1e-20f)); - // Sample a direction on the spherical pyramid using bilinear importance sampling. - // Returns the world-space direction; outputs pdf in solid-angle space and validity flag. - float32_t3 sample(NBL_CONST_REF_ARG(SphericalPyramid) pyramid, NBL_CONST_REF_ARG(SilEdgeNormals) silEdgeNormals, float32_t2 xi, out float32_t pdf, out bool valid) - { - nbl::hlsl::sampling::Bilinear::cache_type cache; - float32_t2 uv = sampler.generate(xi, cache); + return self; + } - const float32_t localX = pyramid.rectR0.x + uv.x * pyramid.rectExtents.x; - const float32_t localY = pyramid.rectR0.y + uv.y * pyramid.rectExtents.y; + // Returns world-space unit direction; caches dist2 and rcpLen for forwardPdf. + // Returns local-frame unit direction; caches dist2/rcpLen for forwardPdf. + // hitDist == 1/rcpLen (the gnomonic ray length on the rect at z=1). 
+ codomain_type generateNormalizedLocal(domain_type u, NBL_REF_ARG(cache_type) cache, NBL_REF_ARG(scalar_type) hitDist) + { + const vector2_type uv = sampler.generate(u, cache.bilinearCache); + const scalar_type localX = rectR0.x + uv.x * rectExtents.x; + const scalar_type localY = rectR0.y + uv.y * rectExtents.y; + cache.dist2 = localX * localX + localY * localY + 1.0f; + cache.rcpLen = rsqrt(cache.dist2); + hitDist = 1.0f / cache.rcpLen; + return codomain_type(localX, localY, 1.0f) * cache.rcpLen; + } - const float32_t dist2 = localX * localX + localY * localY + 1.0f; - const float32_t rcpLen = rsqrt(dist2); - float32_t3 direction = (localX * pyramid.axis1 + - localY * pyramid.axis2 + - pyramid.getAxis3()) * rcpLen; + codomain_type generate(domain_type u, NBL_REF_ARG(cache_type) cache) + { + scalar_type dummy; + const vector3_type localDir = generateNormalizedLocal(u, cache, dummy); + return basis[0] * localDir.x + basis[1] * localDir.y + basis[2] * localDir.z; + } - valid = direction.z > 0.0f && silEdgeNormals.isInsideLocal(localX, localY); + // Solid-angle-measure pdf: bilinearPdf * dist2^{3/2} * rcpRectArea. 
+ density_type forwardPdf(domain_type u, cache_type cache) NBL_CONST_MEMBER_FUNC + { + return sampler.forwardPdf(u, cache.bilinearCache) * cache.dist2 * cache.dist2 * cache.rcpLen * rcpRectArea; + } - // PDF in solid angle space: pdfBilinear * dist2^{3/2} * rcpRectArea - pdf = sampler.forwardPdf(xi, cache) * dist2 * dist2 * rcpLen * rcpRectArea; - - return direction; - } + weight_type forwardWeight(domain_type u, cache_type cache) NBL_CONST_MEMBER_FUNC + { + return forwardPdf(u, cache); + } }; #endif // _SOLID_ANGLE_VIS_EXAMPLE_SAMPLING_BILINEAR_HLSL_INCLUDED_ diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling/biquadratic.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling/biquadratic.hlsl deleted file mode 100644 index fa9e391cc..000000000 --- a/73_SolidAngleVisualizer/app_resources/hlsl/pyramid_sampling/biquadratic.hlsl +++ /dev/null @@ -1,80 +0,0 @@ -//// Copyright (C) 2026 - DevSH Graphics Programming Sp. z O.O. -//// This file is part of the "Nabla Engine". 
-//// For conditions of distribution and use, see copyright notice in nabla.h -#ifndef _SOLID_ANGLE_VIS_EXAMPLE_SAMPLING_BIQUADRATIC_HLSL_INCLUDED_ -#define _SOLID_ANGLE_VIS_EXAMPLE_SAMPLING_BIQUADRATIC_HLSL_INCLUDED_ -#include // reuse basic structure - -// ============================================================================ -// Biquadratic Approximation Sampling (cheap solid-angle approximation) -// ============================================================================ -struct BiquadraticSampler -{ - nbl::hlsl::sampling::Bilinear baseSampler; // underlying bilinear generator - - float32_t rcpRectArea; - - // Precompute biquadratic sampler from pyramid - static BiquadraticSampler create(NBL_CONST_REF_ARG(SphericalPyramid) pyramid) - { - BiquadraticSampler self; - - // 4 corner positions on the rectangle - const float32_t x0 = pyramid.rectR0.x; - const float32_t x1 = x0 + pyramid.rectExtents.x; - const float32_t y0 = pyramid.rectR0.y; - const float32_t y1 = y0 + pyramid.rectExtents.y; - - // Compute solid-angle weights at corners: d^{-3/2} - const float32_t xx0 = x0 * x0, xx1 = x1 * x1; - const float32_t yy0 = y0 * y0, yy1 = y1 * y1; - - // d^{-3/2} = rsqrt(d)^3 - float32_t r; - r = rsqrt(xx0 + yy0 + 1.0f); - const float32_t v00 = r * r * r; - r = rsqrt(xx1 + yy0 + 1.0f); - const float32_t v10 = r * r * r; - r = rsqrt(xx0 + yy1 + 1.0f); - const float32_t v01 = r * r * r; - r = rsqrt(xx1 + yy1 + 1.0f); - const float32_t v11 = r * r * r; - - self.baseSampler = nbl::hlsl::sampling::Bilinear::create(float32_t4(v00, v01, v10, v11)); - self.rcpRectArea = rcp(max(pyramid.rectExtents.x * pyramid.rectExtents.y, 1e-20f)); - - return self; - } - - // Sample a direction on the spherical pyramid using biquadratic importance sampling. - // Applies a quadratic warp f(t) = t*(2-t) after bilinear sampling to redistribute - // samples. The warp Jacobian f'(t) = 2*(1-t) is accounted for in the PDF. 
- float32_t3 sample(NBL_CONST_REF_ARG(SphericalPyramid) pyramid, NBL_CONST_REF_ARG(SilEdgeNormals) silEdgeNormals, float32_t2 xi, out float32_t pdf, out bool valid) - { - nbl::hlsl::sampling::Bilinear::cache_type cache; - float32_t2 uv = baseSampler.generate(xi, cache); - - // Quadratic warp: f(t) = t * (2 - t), f'(t) = 2 * (1 - t) - const float32_t rcpWarpJacobian = rcp(4.0f * (1.0f - uv.x) * (1.0f - uv.y)); - uv = float32_t2(uv.x * (2.0f - uv.x), uv.y * (2.0f - uv.y)); - - const float32_t localX = pyramid.rectR0.x + uv.y * pyramid.rectExtents.x; - const float32_t localY = pyramid.rectR0.y + uv.x * pyramid.rectExtents.y; - - const float32_t dist2 = localX * localX + localY * localY + 1.0f; - const float32_t rcpLen = rsqrt(dist2); - float32_t3 direction = (localX * pyramid.axis1 + - localY * pyramid.axis2 + - pyramid.getAxis3()) * - rcpLen; - - valid = direction.z > 0.0f && silEdgeNormals.isInsideLocal(localX, localY); - - // PDF in solid-angle space, accounting for warp Jacobian - pdf = baseSampler.forwardPdf(xi, cache) * dist2 * dist2 * rcpLen * rcpRectArea * rcpWarpJacobian; - - return direction; - } -}; - -#endif // _SOLID_ANGLE_VIS_EXAMPLE_SAMPLING_BIQUADRATIC_HLSL_INCLUDED_ diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/silhouette.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/silhouette.hlsl index 58afa5345..3050f8425 100644 --- a/73_SolidAngleVisualizer/app_resources/hlsl/silhouette.hlsl +++ b/73_SolidAngleVisualizer/app_resources/hlsl/silhouette.hlsl @@ -11,6 +11,7 @@ #include #include +using namespace nbl; using namespace nbl::hlsl; // TODO: unused, remove later @@ -42,7 +43,7 @@ static const uint32_t silhouettes[27][7] = { {6, 0, 4, 6, 2, 3, 1}, // 23: Teal {6, 0, 2, 3, 7, 5, 4}, // 24: Brown {6, 0, 2, 3, 1, 5, 4}, // 25: Tan/Beige - {6, 1, 5, 4, 6, 2, 3} // 26: Dark Brown + {6, 1, 5, 4, 6, 2, 3}, // 26: Dark Brown }; // Binary packed silhouettes @@ -80,7 +81,7 @@ struct BinSilhouette { static BinSilhouette create(uint32_t configIndex) { - 
BinSilhouette s = (BinSilhouette)0; + BinSilhouette s; s.data = binSilhouettes[configIndex]; return s; } @@ -91,38 +92,11 @@ struct BinSilhouette } // Get silhouette size - uint32_t getSilhouetteSize() NBL_CONST_MEMBER_FUNC + uint32_t getVertexCount() NBL_CONST_MEMBER_FUNC { return (data >> 29u) & 0x7u; } - // Build a 12-bit mask of which cube edges are part of the silhouette. - // Edge enumeration: for axis in {0,1,2}, for each corner with axis-bit - // clear, edge = (corner, corner | (1<> (axis + 1u); - uint32_t compact = (above << axis) | below; - mask |= 1u << (axis * 4u + compact); - } - return mask; - } - void rotr(uint32_t shift, uint32_t size) { data = nbl::hlsl::rotr(data, shift, size); @@ -136,197 +110,269 @@ struct BinSilhouette uint32_t data; }; + +// Metadata-only descriptor of a clipped OBB silhouette (12 bytes). Vertex +// positions are NOT stored -- consumers call materialize(view, verts) to +// fill a local array on demand, keeping vec3 storage out of struct-passing. +// +// silData: bits 0-17 rotated 3-bit corner indices (positive-z corners first +// in CCW order, then negative-z), bits 24-28 configIndex, bits 29-31 silhouette size. +// positiveCount: positive-z corners surviving the clip. +// count: emitted vertex count (positiveCount + 2 on partial clip, 0 if fully clipped). 
struct ClippedSilhouette { + uint32_t silData; // rotated BinSilhouette data + size + uint32_t positiveCount; // # of positive-z OBB corners after rotation + uint32_t count; // total emitted vertex count consumers cascade on static ClippedSilhouette create(shapes::OBBView view) { uint32_t3 region; - uint32_t configIndex, vertexCount; - BinSilhouette sil = computeRegionAndConfig(view, region, configIndex, vertexCount); - ClippedSilhouette s = (ClippedSilhouette)0; - s.compute(view, vertexCount, sil); - return s; - } + uint32_t configIndex, vertexCount; + // OBB-local observer coord along axis i is -dot(col_i, minCorner); + // compare against [0, |col_i|^2] for branchless 27-config classify. + float32_t3 sqScales = float32_t3(dot(view.columns[0], view.columns[0]), dot(view.columns[1], view.columns[1]), dot(view.columns[2], view.columns[2])); + float32_t3 proj = -float32_t3(dot(view.columns[0], view.minCorner), dot(view.columns[1], view.minCorner), dot(view.columns[2], view.minCorner)); - // only used by projected parallelogram - void normalize() - { - vertices[0] = nbl::hlsl::normalize(vertices[0]); - vertices[1] = nbl::hlsl::normalize(vertices[1]); - vertices[2] = nbl::hlsl::normalize(vertices[2]); - if (count > 3) - { - vertices[3] = nbl::hlsl::normalize(vertices[3]); - if (count > 4) - { - vertices[4] = nbl::hlsl::normalize(vertices[4]); - if (count > 5) - { - vertices[5] = nbl::hlsl::normalize(vertices[5]); - if (count > 6) - { - vertices[6] = nbl::hlsl::normalize(vertices[6]); - } - } - } - } - } - - // Compute the silhouette centroid (average direction) - // Returns unnormalized centroid (sum of vertices). The direction is what - // matters for the adaptive axis3 blend, the magnitude cancels out after - // normalize(center * tBlend + (0,0,1)). just as small optimization. 
- float32_t3 getUnnormalizedCenter() - { - float32_t3 sum = float32_t3(0, 0, 0); - - NBL_UNROLL - for (uint32_t i = 0; i < MAX_SILHOUETTE_VERTICES; i++) - { - if (i < count) - sum += vertices[i]; - } - - return sum; - } - - static BinSilhouette computeRegionAndConfig(shapes::OBBView view, out uint32_t3 region, out uint32_t configIndex, out uint32_t vertexCount) - { - // With [0,1]^3 local space, the observer's unnormalized OBB-local - // coordinate along axis i is proj_i = -dot(col_i, minCorner). - // Compare against 0 and |col_i|^2 (the unnormalized [0,1] bounds) - // to classify into the 27-configuration LUT. - float32_t3 sqScales = float32_t3( - dot(view.columns[0], view.columns[0]), - dot(view.columns[1], view.columns[1]), - dot(view.columns[2], view.columns[2])); - - float32_t3 proj = -float32_t3( - dot(view.columns[0], view.minCorner), - dot(view.columns[1], view.minCorner), - dot(view.columns[2], view.minCorner)); - - region = uint32_t3( - proj.x < 0 ? 2 : (proj.x > sqScales.x ? 0 : 1), - proj.y < 0 ? 2 : (proj.y > sqScales.y ? 0 : 1), - proj.z < 0 ? 2 : (proj.z > sqScales.z ? 0 : 1)); + uint32_t3 below = uint32_t3(proj < float32_t3(0, 0, 0)); + uint32_t3 above = uint32_t3(proj > sqScales); + region = uint32_t3(uint32_t3(1u, 1u, 1u) + below - above); configIndex = region.x + region.y * 3u + region.z * 9u; BinSilhouette sil = BinSilhouette::create(configIndex); - vertexCount = sil.getSilhouetteSize(); - - return sil; - } + vertexCount = sil.getVertexCount(); - void compute(shapes::OBBView view, uint32_t vertexCount, BinSilhouette sil) - { - - // Build clip mask (z < 0) - uint32_t clipMask = 0u; + // Always evaluate all 6 slots so the loop unrolls without a runtime + // branch on vertexCount; high bits are masked off below. + uint32_t validMask = (1u << vertexCount) - 1u; + uint32_t clipMask = 0u; NBL_UNROLL - for (uint32_t i = 0; i < 4; i++) - clipMask |= (view.getVertexZ(sil.getVertexIndex(i)) < 0.0f ? 
1u : 0u) << i; - - if (vertexCount == 6) - { - NBL_UNROLL - for (uint32_t i = 4; i < 6; i++) - clipMask |= (view.getVertexZ(sil.getVertexIndex(i)) < 0.0f ? 1u : 0u) << i; - } + for (uint32_t i = 0; i < 6; i++) + clipMask |= (hlsl::select(view.getVertexZ(sil.getVertexIndex(i)) < 0.0f, 1u, 0u)) << i; + clipMask &= validMask; uint32_t clipCount = countbits(clipMask); + uint32_t invertedMask = ~clipMask & validMask; - // Invert clip mask to find first positive vertex - uint32_t invertedMask = ~clipMask & ((1u << vertexCount) - 1u); - - // Check if wrap-around is needed (first and last bits negative) - bool wrapAround = ((clipMask & 1u) != 0u) && ((clipMask & (1u << (vertexCount - 1))) != 0u); + // clipMask is masked to validMask, so the shift can't pull garbage into bit 0. + bool wrapAround = (clipMask & (clipMask >> (vertexCount - 1))) != 0u; - // Compute rotation amount uint32_t rotateAmount = nbl::hlsl::select(wrapAround, firstbitlow(invertedMask), // first positive firstbithigh(clipMask) + 1); // first vertex after last negative - // Rotate masks - uint32_t rotatedClipMask = nbl::hlsl::rotr(clipMask, rotateAmount, vertexCount); sil.rotr(rotateAmount * 3, vertexCount * 3); - uint32_t positiveCount = vertexCount - clipCount; - // Compute all 4 clip endpoints up front , independent obbVertex calls - // give the compiler maximum ILP alongside the positive-vertex loop. 
- uint32_t lastPosIdx = positiveCount - 1; - uint32_t firstNegIdx = positiveCount; - - float32_t3 vLastPos = view.getVertex(sil.getVertexIndex(lastPosIdx)); - float32_t3 vFirstNeg = view.getVertex(sil.getVertexIndex(firstNegIdx)); - float32_t t = vLastPos.z / (vLastPos.z - vFirstNeg.z); - float32_t3 clipA = lerp(vLastPos, vFirstNeg, t); - - float32_t3 vLastNeg = view.getVertex(sil.getVertexIndex(vertexCount - 1)); - float32_t3 vFirstPos = view.getVertex(sil.getVertexIndex(0)); - t = vLastNeg.z / (vLastNeg.z - vFirstPos.z); - float32_t3 clipB = lerp(vLastNeg, vFirstPos, t); + ClippedSilhouette self; + // rotr wipes bits above width, so re-inject vertexCount and pack configIndex. + self.silData = sil.data | (configIndex << 24u) | (vertexCount << 29u); + self.positiveCount = vertexCount - clipCount; + const bool fullyClipped = (clipCount == vertexCount); + const bool partialClip = (clipCount > 0) && !fullyClipped; + self.count = nbl::hlsl::select(fullyClipped, 0u, self.positiveCount + (partialClip ? 2u : 0u)); + + uint32_t rotatedClipMask = nbl::hlsl::rotr(clipMask, rotateAmount, vertexCount); // Debug only + DebugRecorder::recordClipResult(self.count, clipMask, clipCount, rotatedClipMask, rotateAmount, self.positiveCount, wrapAround, sil.data); + + return self; + } - count = 0; + uint32_t cornerIndex(uint32_t k) NBL_CONST_MEMBER_FUNC + { + return (silData >> (3u * k)) & 0x7u; + } + uint32_t getVertexCount() NBL_CONST_MEMBER_FUNC { return (silData >> 29u) & 0x7u; } + uint32_t getConfigIndex() NBL_CONST_MEMBER_FUNC { return (silData >> 24u) & 0x1Fu; } + uint32_t3 getRegion() NBL_CONST_MEMBER_FUNC + { + const uint32_t ci = getConfigIndex(); + return uint32_t3(ci % 3u, (ci / 3u) % 3u, ci / 9u); + } + BinSilhouette getOriginalBinSilhouette() NBL_CONST_MEMBER_FUNC { return BinSilhouette::create(getConfigIndex()); } + + // Fill `count` vertices into the caller's local array. 
Each vertex is + // view.getVertex(cornerIndex(K)) -- columns[0/1/2] indexed by literal so + // SROA keeps them in registers and the 3 conditional adds run in parallel. + // A +/- walk (one fmadd per vertex via view.columns[axis]) was tried and + // measured slower: dynamic-index access demotes view to Function memory + // and serializes the prev-chain. + // Cascade on count rather than for+break so every vertices[K] write uses + // a literal slot index, otherwise the array demotes to Function memory. + void materialize(shapes::OBBView view, out float32_t3 vertices[MAX_SILHOUETTE_VERTICES]) NBL_CONST_MEMBER_FUNC + { + // Zero the unused tail; some consumers (DCE sinks, debug paths) read + // the full 7-wide array. NBL_UNROLL - for (uint32_t i = 0; i < positiveCount; i++) + for (uint32_t init = 0; init < MAX_SILHOUETTE_VERTICES; init++) + vertices[init] = float32_t3(0.0f, 0.0f, 0.0f); + if (count == 0) + return; + + vertices[0] = view.getVertex(cornerIndex(0)); + if (positiveCount > 1) { - float32_t3 v0 = view.getVertex(sil.getVertexIndex(i)); - DebugRecorder::recordClippedVertex(count, v0, (i + rotateAmount) % vertexCount); - vertices[count++] = v0; + vertices[1] = view.getVertex(cornerIndex(1)); + if (positiveCount > 2) + { + vertices[2] = view.getVertex(cornerIndex(2)); + if (positiveCount > 3) + { + vertices[3] = view.getVertex(cornerIndex(3)); + if (positiveCount > 4) + { + vertices[4] = view.getVertex(cornerIndex(4)); + if (positiveCount > 5) + { + vertices[5] = view.getVertex(cornerIndex(5)); + if (positiveCount > 6) + vertices[6] = view.getVertex(cornerIndex(6)); + } + } + } + } } - if (clipCount > 0 && clipCount < vertexCount) + // Partial-clip: two extra getVertex calls for the negative-z endpoints + // around the positive run, lerped to z=0. Cascaded for literal slot indices. 
+ if (count > positiveCount) { - DebugRecorder::recordClippedVertex(count, clipA, 23); - vertices[count++] = clipA; + const uint32_t silSize = (silData >> 29u) & 0x7u; + const float32_t3 vFirstNeg = view.getVertex(cornerIndex(positiveCount)); + const float32_t3 vLastNeg = view.getVertex(cornerIndex(silSize - 1u)); + const float32_t3 vFirstPos = vertices[0]; - DebugRecorder::recordClippedVertex(count, clipB, 24); - vertices[count++] = clipB; + if (positiveCount == 1) + { + const float32_t3 vLastPos = vertices[0]; + const float32_t tA = vLastPos.z / (vLastPos.z - vFirstNeg.z); + vertices[1] = lerp(vLastPos, vFirstNeg, tA); + const float32_t tB = vLastNeg.z / (vLastNeg.z - vFirstPos.z); + vertices[2] = lerp(vLastNeg, vFirstPos, tB); + } + else if (positiveCount == 2) + { + const float32_t3 vLastPos = vertices[1]; + const float32_t tA = vLastPos.z / (vLastPos.z - vFirstNeg.z); + vertices[2] = lerp(vLastPos, vFirstNeg, tA); + const float32_t tB = vLastNeg.z / (vLastNeg.z - vFirstPos.z); + vertices[3] = lerp(vLastNeg, vFirstPos, tB); + } + else if (positiveCount == 3) + { + const float32_t3 vLastPos = vertices[2]; + const float32_t tA = vLastPos.z / (vLastPos.z - vFirstNeg.z); + vertices[3] = lerp(vLastPos, vFirstNeg, tA); + const float32_t tB = vLastNeg.z / (vLastNeg.z - vFirstPos.z); + vertices[4] = lerp(vLastNeg, vFirstPos, tB); + } + else if (positiveCount == 4) + { + const float32_t3 vLastPos = vertices[3]; + const float32_t tA = vLastPos.z / (vLastPos.z - vFirstNeg.z); + vertices[4] = lerp(vLastPos, vFirstNeg, tA); + const float32_t tB = vLastNeg.z / (vLastNeg.z - vFirstPos.z); + vertices[5] = lerp(vLastNeg, vFirstPos, tB); + } + else // positiveCount == 5; positiveCount == 6 -> count == 8 > 7, impossible + { + const float32_t3 vLastPos = vertices[4]; + const float32_t tA = vLastPos.z / (vLastPos.z - vFirstNeg.z); + vertices[5] = lerp(vLastPos, vFirstNeg, tA); + const float32_t tB = vLastNeg.z / (vLastNeg.z - vFirstPos.z); + vertices[6] = lerp(vLastNeg, vFirstPos, 
tB); + } } + } - DebugRecorder::recordClipResult(count, clipMask, clipCount, rotatedClipMask, - rotateAmount, positiveCount, wrapAround, sil.data); + // Originals tagged with their cube corner index; clip verts use sentinels 23/24. + // recordClippedVertex is a no-op in release. + void recordVertices(float32_t3 vertices[MAX_SILHOUETTE_VERTICES]) NBL_CONST_MEMBER_FUNC + { + for (uint32_t k = 0; k < positiveCount; k++) + DebugRecorder::recordClippedVertex(k, vertices[k], cornerIndex(k)); + if (count > positiveCount) + { + DebugRecorder::recordClippedVertex(positiveCount, vertices[positiveCount], 23u); + DebugRecorder::recordClippedVertex(positiveCount + 1u, vertices[positiveCount + 1u], 24u); + } } - float32_t3 vertices[MAX_SILHOUETTE_VERTICES]; // Max 7 vertices after clipping, unnormalized - uint32_t count; + // materialize + per-vertex normalize. Cascaded for literal slot indices. + void materializeNormalized(shapes::OBBView view, out float32_t3 vertices[MAX_SILHOUETTE_VERTICES]) NBL_CONST_MEMBER_FUNC + { + materialize(view, vertices); + vertices[0] = nbl::hlsl::normalize(vertices[0]); + if (count > 1) + { + vertices[1] = nbl::hlsl::normalize(vertices[1]); + if (count > 2) + { + vertices[2] = nbl::hlsl::normalize(vertices[2]); + if (count > 3) + { + vertices[3] = nbl::hlsl::normalize(vertices[3]); + if (count > 4) + { + vertices[4] = nbl::hlsl::normalize(vertices[4]); + if (count > 5) + { + vertices[5] = nbl::hlsl::normalize(vertices[5]); + if (count > 6) + vertices[6] = nbl::hlsl::normalize(vertices[6]); + } + } + } + } + } + } }; struct SilEdgeNormals { - // Better not use and calculate it while creating the sampler - static SilEdgeNormals create(NBL_CONST_REF_ARG(ClippedSilhouette) sil) + // Sentinel for unused edge slots: dot(dir, (0,0,-1)) = -dir.z. 
Callers + // gate isInside on dir.z > 0, so this dot is always negative for them + // -- its asuint has the sign bit set, which makes the bitwise-AND + // reduction in isInside() pass through the real sign bits unchanged. + static SilEdgeNormals initSentinel() { - SilEdgeNormals result = (SilEdgeNormals)0; + SilEdgeNormals result; + NBL_UNROLL + for (uint32_t i = 0; i < MAX_SILHOUETTE_VERTICES; i++) + result.edgeNormals[i] = float32_t3(0.0f, 0.0f, -1.0f); + return result; + } + + // Build per-edge cross products from a materialized vertex array. + static SilEdgeNormals create(float32_t3 vertices[MAX_SILHOUETTE_VERTICES], uint32_t count) + { + SilEdgeNormals result = initSentinel(); - float32_t3 v0 = sil.vertices[0]; - float32_t3 v1 = sil.vertices[1]; - float32_t3 v2 = sil.vertices[2]; + float32_t3 v0 = vertices[0]; + float32_t3 v1 = vertices[1]; + float32_t3 v2 = vertices[2]; result.edgeNormals[0] = cross(v0, v1); result.edgeNormals[1] = cross(v1, v2); - if (sil.count > 3) + if (count > 3) { - float32_t3 v3 = sil.vertices[3]; + float32_t3 v3 = vertices[3]; result.edgeNormals[2] = cross(v2, v3); - if (sil.count > 4) + if (count > 4) { - float32_t3 v4 = sil.vertices[4]; + float32_t3 v4 = vertices[4]; result.edgeNormals[3] = cross(v3, v4); - if (sil.count > 5) + if (count > 5) { - float32_t3 v5 = sil.vertices[5]; + float32_t3 v5 = vertices[5]; result.edgeNormals[4] = cross(v4, v5); - if (sil.count > 6) + if (count > 6) { - float32_t3 v6 = sil.vertices[6]; + float32_t3 v6 = vertices[6]; result.edgeNormals[5] = cross(v5, v6); result.edgeNormals[6] = cross(v6, v0); } @@ -353,16 +399,21 @@ struct SilEdgeNormals return result; } + // Sign-bit AND reduction: dot ≤ 0 ⟺ asuint(dot) sign bit set (modulo +0.0 + // exact-boundary samples, which never hit in practice). 6 ANDs on the INT + // pipe instead of 6 fmaxes on the FP pipe; lets the FP pipe stay busy with + // the 7 dot products on Ampere's split FP/INT scheduler. 
bool isInside(float32_t3 dir) { - float32_t maxDot = dot(dir, edgeNormals[0]); - maxDot = max(maxDot, dot(dir, edgeNormals[1])); - maxDot = max(maxDot, dot(dir, edgeNormals[2])); - maxDot = max(maxDot, dot(dir, edgeNormals[3])); - maxDot = max(maxDot, dot(dir, edgeNormals[4])); - maxDot = max(maxDot, dot(dir, edgeNormals[5])); - maxDot = max(maxDot, dot(dir, edgeNormals[6])); - return maxDot <= 0.0f; + const float32_t d0 = hlsl::dot(dir, edgeNormals[0]); + const float32_t d1 = hlsl::dot(dir, edgeNormals[1]); + const float32_t d2 = hlsl::dot(dir, edgeNormals[2]); + const float32_t d3 = hlsl::dot(dir, edgeNormals[3]); + const float32_t d4 = hlsl::dot(dir, edgeNormals[4]); + const float32_t d5 = hlsl::dot(dir, edgeNormals[5]); + const float32_t d6 = hlsl::dot(dir, edgeNormals[6]); + const uint32_t allNeg = asuint(d0) & asuint(d1) & asuint(d2) & asuint(d3) & asuint(d4) & asuint(d5) & asuint(d6); + return (allNeg & 0x80000000u) != 0u; } // Transform edge normals from world-space to the pyramid's local frame in-place. 
@@ -374,7 +425,7 @@ struct SilEdgeNormals NBL_UNROLL for (uint32_t i = 0; i < MAX_SILHOUETTE_VERTICES; i++) { - float32_t3 n = edgeNormals[i]; + float32_t3 n = edgeNormals[i]; edgeNormals[i] = float32_t3(dot(n, axis1), dot(n, axis2), dot(n, axis3)); } } @@ -384,12 +435,12 @@ struct SilEdgeNormals bool isInsideLocal(float32_t localX, float32_t localY) { float32_t maxDot = localX * edgeNormals[0].x + localY * edgeNormals[0].y + edgeNormals[0].z; - maxDot = max(maxDot, localX * edgeNormals[1].x + localY * edgeNormals[1].y + edgeNormals[1].z); - maxDot = max(maxDot, localX * edgeNormals[2].x + localY * edgeNormals[2].y + edgeNormals[2].z); - maxDot = max(maxDot, localX * edgeNormals[3].x + localY * edgeNormals[3].y + edgeNormals[3].z); - maxDot = max(maxDot, localX * edgeNormals[4].x + localY * edgeNormals[4].y + edgeNormals[4].z); - maxDot = max(maxDot, localX * edgeNormals[5].x + localY * edgeNormals[5].y + edgeNormals[5].z); - maxDot = max(maxDot, localX * edgeNormals[6].x + localY * edgeNormals[6].y + edgeNormals[6].z); + maxDot = hlsl::max(maxDot, localX * edgeNormals[1].x + localY * edgeNormals[1].y + edgeNormals[1].z); + maxDot = hlsl::max(maxDot, localX * edgeNormals[2].x + localY * edgeNormals[2].y + edgeNormals[2].z); + maxDot = hlsl::max(maxDot, localX * edgeNormals[3].x + localY * edgeNormals[3].y + edgeNormals[3].z); + maxDot = hlsl::max(maxDot, localX * edgeNormals[4].x + localY * edgeNormals[4].y + edgeNormals[4].z); + maxDot = hlsl::max(maxDot, localX * edgeNormals[5].x + localY * edgeNormals[5].y + edgeNormals[5].z); + maxDot = hlsl::max(maxDot, localX * edgeNormals[6].x + localY * edgeNormals[6].y + edgeNormals[6].z); return maxDot <= 0.0f; } diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/solid_angle_vis.frag.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/solid_angle_vis.frag.hlsl index 82728531c..cfa2e2969 100644 --- a/73_SolidAngleVisualizer/app_resources/hlsl/solid_angle_vis.frag.hlsl +++ 
b/73_SolidAngleVisualizer/app_resources/hlsl/solid_angle_vis.frag.hlsl @@ -13,21 +13,50 @@ using namespace ext::FullScreenTriangle; #include "utils.hlsl" #include "silhouette.hlsl" #include "triangle_sampling.hlsl" -#include "pyramid_sampling.hlsl" #include "parallelogram_sampling.hlsl" +#include "pyramid_sampling.hlsl" [[vk::push_constant]] struct PushConstants pc; -static const SAMPLING_MODE samplingMode = (SAMPLING_MODE)SAMPLING_MODE_CONST; +static const SAMPLING_MODE_FLAGS samplingMode = SAMPLING_MODE_FLAGS_CONST; + +// Mode -> Sampler type dispatch keyed on the dense ID (boost::wave can't +// evaluate enum-qualified `::` in #if, so we use a parallel numeric macro +// passed in by CMake). Dense IDs match the kAllModes ordering in common.hlsl: +// 0 SPH_RECT_FROM_CALIPER_PYRAMID 5 PROJECTED_PARALLELOGRAM_SOLID_ANGLE +// 1 SPH_RECT_FROM_PYRAMID 6 BILINEAR_FROM_PYRAMID +// 2 PROJ_SPH_RECT_FROM_PYRAMID 7 SILHOUETTE_CREATION_ONLY (early-exit) +// 3 TRIANGLE_SOLID_ANGLE 8 PYRAMID_CREATION_ONLY +// 4 TRIANGLE_PROJECTED_SOLID_ANGLE 9 CALIPER_PYRAMID_CREATION_ONLY +// PYRAMID_CREATION_ONLY / CALIPER_PYRAMID_CREATION_ONLY pick the sphrect inner +// so the bounding lunes still draw on screen; the inner is harmless extra work +// (those modes are timed in the compute benchmark, not the frag). 
+#if SAMPLING_MODE_DENSE_ID == 3 +typedef TriangleFanSampler SelectedSampler; +#elif SAMPLING_MODE_DENSE_ID == 4 +typedef TriangleFanSampler SelectedSampler; +#elif SAMPLING_MODE_DENSE_ID == 5 +typedef Parallelogram SelectedSampler; +#elif SAMPLING_MODE_DENSE_ID == 1 || SAMPLING_MODE_DENSE_ID == 8 +typedef SphericalPyramid > SelectedSampler; +#elif SAMPLING_MODE_DENSE_ID == 0 || SAMPLING_MODE_DENSE_ID == 9 +typedef SphericalPyramid > SelectedSampler; +#elif SAMPLING_MODE_DENSE_ID == 2 +typedef SphericalPyramid > SelectedSampler; +#elif SAMPLING_MODE_DENSE_ID == 6 +typedef SphericalPyramid SelectedSampler; +#elif SAMPLING_MODE_DENSE_ID == 7 // SILHOUETTE_CREATION_ONLY: alias any type so the +typedef Parallelogram SelectedSampler; // unreachable post-early-return code parses. +#endif void computeSpherePos(SVertexAttributes vx, out float32_t2 ndc, out float32_t3 spherePos) { - ndc = vx.uv * 2.0f - 1.0f; + ndc = vx.uv * 2.0f - 1.0f; float32_t aspect = pc.viewport.z / pc.viewport.w; ndc.x *= aspect; float32_t2 normalized = ndc / CIRCLE_RADIUS; - float32_t r2 = dot(normalized, normalized); + float32_t r2 = dot(normalized, normalized); if (r2 <= 1.0f) { @@ -36,182 +65,89 @@ void computeSpherePos(SVertexAttributes vx, out float32_t2 ndc, out float32_t3 s else { float32_t uv2Plus1 = r2 + 1.0f; - spherePos = float32_t3(normalized.x * 2.0f, normalized.y * 2.0f, 1.0f - r2) / uv2Plus1; + spherePos = float32_t3(normalized.x * 2.0f, normalized.y * 2.0f, 1.0f - r2) / uv2Plus1; } spherePos = normalize(spherePos); } -// Sample a direction from a pyramid-based rectangle sampler, returning validity -template -float32_t3 sampleFromPyramid(inout Sampler sampler, SphericalPyramid pyramid, SilEdgeNormals silEdgeNormals, float32_t2 xi, out float32_t pdf, out bool valid) -{ - typename Sampler::cache_type cache; - float32_t hitDist; - float32_t3 localDir = sampler.generateNormalizedLocal(xi, cache, hitDist); - float32_t3 dir = localDir.x * pyramid.axis1 + localDir.y * pyramid.axis2 + 
localDir.z * pyramid.getAxis3(); - float32_t localX = localDir.x / localDir.z; - float32_t localY = localDir.y / localDir.z; - valid = dir.z > 0.0f && silEdgeNormals.isInsideLocal(localX, localY); - pdf = sampler.forwardPdf(xi, cache); - return dir; -} - [[vk::location(0)]] float32_t4 main(SVertexAttributes vx) : SV_Target0 { - float32_t aaWidth = length(float32_t2(ddx(vx.uv.x), ddy(vx.uv.y))); + float32_t aaWidth = length(float32_t2(ddx(vx.uv.x), ddy(vx.uv.y))); float32_t3 spherePos; float32_t2 ndc; computeSpherePos(vx, ndc, spherePos); VisContext::begin(ndc, spherePos, aaWidth); - shapes::OBBView view = shapes::OBBView::create(pc.modelMatrix); - uint32_t3 region; - uint32_t configIndex; - uint32_t vertexCount; - BinSilhouette sil = ClippedSilhouette::computeRegionAndConfig(view, region, configIndex, vertexCount); - - ClippedSilhouette silhouette; - silhouette.compute(view, vertexCount, sil); + shapes::OBBView view = shapes::OBBView::create(pc.modelMatrix); + ClippedSilhouette silhouette = ClippedSilhouette::create(view); - if (samplingMode == SAMPLING_MODE::SILHOUETTE_CREATION_ONLY) + if (SAMPLING_MODE_DENSE_ID == 7) // SILHOUETTE_CREATION_ONLY { + // Sink that prevents DCE of the create+materialize cost. 
shapes::OBBView perturbedView = view; perturbedView.minCorner += float32_t3(ndc.x, ndc.y, 0.0f) * 1e-7f; ClippedSilhouette pSilhouette = ClippedSilhouette::create(perturbedView); + float32_t3 pVerts[MAX_SILHOUETTE_VERTICES]; + pSilhouette.materialize(perturbedView, pVerts); uint32_t sink = pSilhouette.count; NBL_UNROLL for (uint32_t i = 0; i < MAX_SILHOUETTE_VERTICES; i++) - sink ^= asuint(pSilhouette.vertices[i].x) ^ asuint(pSilhouette.vertices[i].y) ^ asuint(pSilhouette.vertices[i].z); + sink ^= asuint(pVerts[i].x) ^ asuint(pVerts[i].y) ^ asuint(pVerts[i].z); return (float32_t4)asfloat(sink); } - // Draw silhouette edges on the sphere - for (uint32_t ei = 0; ei < silhouette.count; ei++) - { - float32_t3 v0 = normalize(silhouette.vertices[ei]); - float32_t3 v1 = normalize(silhouette.vertices[(ei + 1) % silhouette.count]); - float32_t3 pts[2] = {v0, v1}; - VisContext::add(SphereDrawer::drawEdge(0, pts, aaWidth)); - } - - // ===================================================================== - // Build sampler - // ===================================================================== - TriangleFanSampler samplingData; - Parallelogram parallelogram; - SphericalPyramid pyramid; - sampling::SphericalRectangle rectSampler; - sampling::ProjectedSphericalRectangle projRectSampler; - BiquadraticSampler biquad; - BilinearSampler bilin; - SilEdgeNormals silEdgeNormals; - - if (samplingMode == SAMPLING_MODE::TRIANGLE_SOLID_ANGLE || - samplingMode == SAMPLING_MODE::TRIANGLE_PROJECTED_SOLID_ANGLE) - { - samplingData = TriangleFanSampler::create(silhouette, samplingMode); - } - else if (samplingMode == SAMPLING_MODE::PROJECTED_PARALLELOGRAM_SOLID_ANGLE) - { - silhouette.normalize(); - parallelogram = Parallelogram::create(silhouette, silEdgeNormals); - } - else if (samplingMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_RECTANGLE || - samplingMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_BIQUADRATIC || - samplingMode == 
SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_BILINEAR || - samplingMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_PROJECTED_SOLID_ANGLE_RECTANGLE || - samplingMode == SAMPLING_MODE::PYRAMID_CREATION_ONLY) - { - pyramid = SphericalPyramid::create(silhouette, silEdgeNormals); - silEdgeNormals.transformToLocal(pyramid.axis1, pyramid.axis2, pyramid.getAxis3()); + SelectedSampler sampler = SelectedSampler::create(silhouette, view); - if (samplingMode == SAMPLING_MODE::PYRAMID_CREATION_ONLY) - { - uint32_t sink = 0; - for (uint32_t j = 0; j < pc.sampleCount; j++) - { - ClippedSilhouette pertSil = silhouette; - float32_t pertScale = (float32_t(j) + ndc.x + ndc.y) * 0.001f; - NBL_UNROLL - for (uint32_t i = 0; i < MAX_SILHOUETTE_VERTICES; i++) - pertSil.vertices[i] = normalize(pertSil.vertices[i] + float32_t3(pertScale * float32_t(i + 1), pertScale * 0.7f, 0.0f)); - - SilEdgeNormals pertEdgeNormals; - SphericalPyramid pertPyramid = SphericalPyramid::create(pertSil, pertEdgeNormals); - sink ^= asuint(pertPyramid.axis1.x) ^ asuint(pertPyramid.axis2.x) ^ asuint(pertPyramid.rectR0.x) ^ asuint(pertPyramid.rectExtents.x) ^ asuint(float32_t(pertEdgeNormals.edgeNormals[0].x)); - } - return (float32_t4)asfloat(sink); - } - - if (samplingMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_RECTANGLE) - rectSampler = sampling::SphericalRectangle::create(float32_t3x3(pyramid.axis1, pyramid.axis2, pyramid.getAxis3()), float32_t3(pyramid.rectR0, 1.0f), pyramid.rectExtents); - else if (samplingMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_PROJECTED_SOLID_ANGLE_RECTANGLE) - { - shapes::CompressedSphericalRectangle compressed; - compressed.origin = pyramid.axis1 * pyramid.rectR0.x + pyramid.axis2 * pyramid.rectR0.y + pyramid.getAxis3(); - compressed.right = pyramid.axis1 * pyramid.rectExtents.x; - compressed.up = pyramid.axis2 * pyramid.rectExtents.y; - projRectSampler = sampling::ProjectedSphericalRectangle::create(compressed, float32_t3(0.0f, 0.0f, 0.0f), float32_t3(0.0f, 0.0f, 1.0f), false); - } - 
else if (samplingMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_BIQUADRATIC) - biquad = BiquadraticSampler::create(pyramid); - else if (samplingMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_BILINEAR) - bilin = BilinearSampler::create(pyramid); - } - - // ===================================================================== - // Sample loop - // ===================================================================== uint32_t validSampleCount = 0; - DebugRecorder::recordSampleCount(pc.sampleCount); - for (uint32_t i = 0; i < pc.sampleCount; i++) { float32_t2 xi = float32_t2( (float32_t(i & 7u) + 0.5) / sqrt(pc.sampleCount) + ndc.x * 1e-9f, (float32_t(i >> 3u) + 0.5) / sqrt(pc.sampleCount) + ndc.y * 1e-9f); - float32_t pdf; - uint32_t index = 0; - float32_t3 sampleDir; - bool valid; - - if (samplingMode == SAMPLING_MODE::TRIANGLE_SOLID_ANGLE || samplingMode == SAMPLING_MODE::TRIANGLE_PROJECTED_SOLID_ANGLE) - sampleDir = samplingData.sample(silhouette, xi, pdf, index); - else if (samplingMode == SAMPLING_MODE::PROJECTED_PARALLELOGRAM_SOLID_ANGLE) - sampleDir = parallelogram.sample(silEdgeNormals, xi, pdf, valid); - else if (samplingMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_RECTANGLE) - sampleDir = sampleFromPyramid(rectSampler, pyramid, silEdgeNormals, xi, pdf, valid); - else if (samplingMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_PROJECTED_SOLID_ANGLE_RECTANGLE) - sampleDir = sampleFromPyramid(projRectSampler, pyramid, silEdgeNormals, xi, pdf, valid); - else if (samplingMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_BIQUADRATIC) - sampleDir = biquad.sample(pyramid, silEdgeNormals, xi, pdf, valid); - else if (samplingMode == SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_BILINEAR) - sampleDir = bilin.sample(pyramid, silEdgeNormals, xi, pdf, valid); - - if (!valid) - pdf = 0.0f; - else - validSampleCount++; - - DebugRecorder::recordRay(i, sampleDir, pdf); + typename SelectedSampler::cache_type cache; + const float32_t3 sampleDir = 
sampler.generate(xi, cache); + const float32_t pdf = sampler.forwardPdf(xi, cache); - if (VisContext::enabled()) - VisContext::add(SphereDrawer::visualizeSample(sampleDir, xi, index, vx.uv)); - else if (pdf > 0.0f) - VisContext::add(float4(sampleDir * 0.02f / pdf, 1.0f)); + if (pdf > 0.0f) + { + validSampleCount++; + DebugRecorder::recordRay(i, sampleDir, pdf); + if (VisContext::enabled()) + VisContext::add(SphereDrawer::visualizeSample(sampleDir, xi, sampler.selectedIdx(cache), vx.uv)); + else + VisContext::add(float4(sampleDir * 0.02f / pdf, 1.0f)); + } } - VisContext::add(SphereDrawer::drawRing(ndc)); + // Silhouette edges + debug recording. Re-materialize verts here -- the + // sampler may have absorbed its own copy already, but `vertices` is local to + // this scope and dies at function end anyway. + { + float32_t3 vertices[MAX_SILHOUETTE_VERTICES]; + silhouette.materialize(view, vertices); + silhouette.recordVertices(vertices); - if (VisContext::enabled() && all(vx.uv >= float32_t2(0.f, 0.97f)) && all(vx.uv <= float32_t2(0.03f, 1.0f))) - return float32_t4(colorLUT[configIndex], 1.0f); + for (uint32_t i = 0; i < silhouette.count; i++) + { + const uint32_t j = (i + 1u < silhouette.count) ? 
i + 1u : 0u; + const float32_t3 e0 = normalize(vertices[i]); + const float32_t3 e1 = normalize(vertices[j]); + const float32_t3 ePts[2] = {e0, e1}; + VisContext::add(SphereDrawer::drawEdge(0, ePts, aaWidth)); + } - uint32_t vertexIndices[6]; - for (uint32_t i = 0; i < 6; i++) - vertexIndices[i] = uint32_t(sil.getVertexIndex(i)); - DebugRecorder::recordFrameEnd(region, configIndex, sil.getSilhouetteSize(), sil.data, vertexIndices, validSampleCount); + const uint32_t configIndex = silhouette.getConfigIndex(); + if (VisContext::enabled() && all(vx.uv >= float32_t2(0.f, 0.97f)) && all(vx.uv <= float32_t2(0.03f, 1.0f))) + return float32_t4(colorLUT[configIndex], 1.0f); + VisContext::add(SphereDrawer::drawRing(ndc)); + const BinSilhouette binSil = silhouette.getOriginalBinSilhouette(); + uint32_t vertexIndices[6]; + for (uint32_t i = 0; i < 6; i++) + vertexIndices[i] = uint32_t(binSil.getVertexIndex(i)); + DebugRecorder::recordFrameEnd(silhouette.getRegion(), configIndex, binSil.getVertexCount(), binSil.data, vertexIndices, validSampleCount, pc.sampleCount); + } return VisContext::flush(); } diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/triangle_sampling.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/triangle_sampling.hlsl index 9053807ca..d4fd9902e 100644 --- a/73_SolidAngleVisualizer/app_resources/hlsl/triangle_sampling.hlsl +++ b/73_SolidAngleVisualizer/app_resources/hlsl/triangle_sampling.hlsl @@ -20,165 +20,351 @@ using namespace nbl::hlsl; // With clipping, one more edge. 7 - 2 = 5 max triangles because fanning from one vertex #define MAX_TRIANGLES 5 +// ============================================================================ +// TriangleFanSampler: importance-sampled fan triangulation of the clipped +// silhouette. create() takes the silhouette plus its OBB view and materializes verts +// internally, storing them as a member so generate() has random access without +// the caller threading verts through. 
+// +// All loops over silCount/triangle-count are cascade-unrolled (instead of +// `for + break`) so every `self.verts[K]` / `cdf[K]` / `triangleSolidAngles[K]` +// access has a literal slot index. This keeps the local arrays in registers +// (SROA-promoted) instead of spilling to addressable Function memory -- a +// single dynamic-index access would demote the whole array and tank every +// subsequent read. +// ============================================================================ +template struct TriangleFanSampler { - uint32_t count; // Number of valid triangles - uint32_t samplingMode; // Mode used during build - float32_t totalWeight; // Sum of all triangle weights (for PDF computation) - float32_t3 faceNormal; // Face normal (only used for projected mode) - float32_t cdf[MAX_TRIANGLES]; // Normalized CDF: cdf[i] = sum(weight[0..i]) / totalWeight - float32_t triangleSolidAngles[MAX_TRIANGLES]; // Raw weight per triangle (for PDF after selection) - uint32_t triangleIndices[MAX_TRIANGLES]; // Vertex index i (forms triangle with v0, vi, vi+1) - - // Build fan triangulation, cache weights for triangle selection - static TriangleFanSampler create(ClippedSilhouette silhouette, uint32_t mode) + using scalar_type = float32_t; + using vector2_type = float32_t2; + using vector3_type = float32_t3; + using domain_type = vector2_type; + using codomain_type = vector3_type; + using density_type = scalar_type; + using weight_type = density_type; + + // Cache for the TractableSampler concept. Stores the per-triangle pdf + // (selectionProb * trianglePdf) so forwardPdf is an O(1) load, plus the + // selected fan-triangle index (used by the visualization code path to + // colour each triangle differently). 
+ struct cache_type { - TriangleFanSampler self; - self.count = 0; - self.totalWeight = 0.0f; - self.samplingMode = mode; - self.faceNormal = float32_t3(0, 0, 0); + density_type pdf; + uint32_t selectedIdx; + }; - if (silhouette.count < 3) - return self; + uint32_t count; // Number of valid triangles + float32_t totalWeight; // Sum of all triangle weights (for PDF computation) + float32_t3 faceNormal; // Face normal (only used for projected mode) + float32_t cdf[MAX_TRIANGLES]; // Normalized CDF: cdf[i] = sum(weight[0..i]) / totalWeight + float32_t triangleSolidAngles[MAX_TRIANGLES]; // Raw weight per triangle (for PDF after selection) + uint32_t triangleIndices[MAX_TRIANGLES]; // Vertex index i (forms triangle with v0, vi, vi+1) + float32_t3 verts[MAX_SILHOUETTE_VERTICES]; - const float32_t3 v0 = silhouette.vertices[0]; - const float32_t3 origin = float32_t3(0, 0, 0); + // Build fan triangulation, cache weights for triangle selection. + // Materializes silhouette verts internally (using the `view` argument + // passed to create()) and keeps them as a member for sample-time access. + static TriangleFanSampler create(NBL_CONST_REF_ARG(ClippedSilhouette) silhouette, shapes::OBBView view) + { + TriangleFanSampler self; + self.totalWeight = 0.0f; + self.faceNormal = float32_t3(0, 0, 0); + const uint32_t silCount = silhouette.count; + silhouette.materialize(view, self.verts); - // Compute face normal ONCE before the loop - silhouette is planar! - if (mode == SAMPLING_MODE::TRIANGLE_PROJECTED_SOLID_ANGLE) + // Pre-zero the per-triangle arrays so unused slots are well-defined -- + // the cascade below populates exactly silCount-2 slots and we don't + // want the tail to leak garbage into the CDF. 
+ NBL_UNROLL + for (uint32_t z = 0; z < MAX_TRIANGLES; z++) { - float32_t3 v1 = silhouette.vertices[1]; - float32_t3 v2 = silhouette.vertices[2]; - self.faceNormal = normalize(cross(v1 - v0, v2 - v0)); + self.triangleSolidAngles[z] = 0.0f; + self.triangleIndices[z] = 0u; + self.cdf[z] = 0.0f; } - // Build fan triangulation from v0 - NBL_UNROLL - for (uint32_t i = 1; i < silhouette.count - 1; i++) + if (silCount < 3) { - float32_t3 v1 = silhouette.vertices[i]; - float32_t3 v2 = silhouette.vertices[i + 1]; - - const float32_t3 triVerts[3] = {v0, v1, v2}; - shapes::SphericalTriangle shapeTri = shapes::SphericalTriangle::create(triVerts, origin); - - // Skip degenerate triangles - if (shapeTri.solid_angle <= 0.0f) - continue; + self.count = 0; + return self; + } - // Calculate triangle solid angle - float32_t solidAngle; - if (mode == SAMPLING_MODE::TRIANGLE_PROJECTED_SOLID_ANGLE) - solidAngle = shapeTri.projectedSolidAngle(self.faceNormal); - else - solidAngle = shapeTri.solid_angle; + const float32_t3 v0 = self.verts[0]; - if (solidAngle <= 0.0f) - continue; + // Compute face normal ONCE before the loop - silhouette is planar! + if (Projected) + { + const float32_t3 v1 = self.verts[1]; + const float32_t3 v2 = self.verts[2]; + self.faceNormal = normalize(cross(v1 - v0, v2 - v0)); + } - // Store only what's needed for weighted selection - self.triangleSolidAngles[self.count] = solidAngle; - self.triangleIndices[self.count] = i; - self.totalWeight += solidAngle; - self.count++; + // Fan triangulation: triangles (v0, self.verts[I], self.verts[I+1]) for I = 1..silCount-2. + // Cascade-on-silCount so each call site has literal I. 
+ processFanTri<1>(v0, self.faceNormal, self); + if (silCount > 3) + { + processFanTri<2>(v0, self.faceNormal, self); + if (silCount > 4) + { + processFanTri<3>(v0, self.faceNormal, self); + if (silCount > 5) + { + processFanTri<4>(v0, self.faceNormal, self); + if (silCount > 6) + processFanTri<5>(v0, self.faceNormal, self); + } + } } + // self.count = silCount - 2 (every triangle slot gets populated, possibly + // with zero weight for degenerates -- they're handled cleanly by the CDF). + self.count = silCount - 2u; - // Build normalized CDF from raw weights + // CDF build: cascade-on-count so cdf[K] / triangleSolidAngles[K] are + // literal-index accesses; otherwise the whole sampler struct's arrays + // would demote to Function memory. + const float32_t rcpTotal = (self.totalWeight > 0.0f) ? rcp(self.totalWeight) : 0.0f; + float32_t cumulative = 0.0f; + + cumulative += self.triangleSolidAngles[0]; + self.cdf[0] = cumulative * rcpTotal; + if (self.count > 1) { - float32_t rcpTotal = (self.totalWeight > 0.0f) ? (1.0f / self.totalWeight) : 0.0f; - float32_t cumulative = 0.0f; - for (uint32_t i = 0; i < self.count; i++) + cumulative += self.triangleSolidAngles[1]; + self.cdf[1] = cumulative * rcpTotal; + if (self.count > 2) { - cumulative += self.triangleSolidAngles[i]; - self.cdf[i] = cumulative * rcpTotal; + cumulative += self.triangleSolidAngles[2]; + self.cdf[2] = cumulative * rcpTotal; + if (self.count > 3) + { + cumulative += self.triangleSolidAngles[3]; + self.cdf[3] = cumulative * rcpTotal; + if (self.count > 4) + { + cumulative += self.triangleSolidAngles[4]; + self.cdf[4] = cumulative * rcpTotal; + } + } } } +#if DEBUG_DATA + // Debug-only closed-loop walk over silhouette edges. Released builds DCE + // both the loop (recordTriangleFan is a no-op stub) and luneDetected. 
bool luneDetected = false; - for (uint32_t i = 0; i < silhouette.count; i++) + for (uint32_t i = 0; i < silCount; i++) { - uint32_t j = (i + 1) % silhouette.count; - float32_t3 n1 = normalize(silhouette.vertices[i]); - float32_t3 n2 = normalize(silhouette.vertices[j]); - if (dot(n1, n2) < -0.99f) + const uint32_t j = (i + 1u < silCount) ? i + 1u : 0u; + const float32_t3 ni = nbl::hlsl::normalize(self.verts[i]); + const float32_t3 nj = nbl::hlsl::normalize(self.verts[j]); + if (dot(ni, nj) < -0.99f) { luneDetected = true; assert(false && "Spherical lune detected: antipodal silhouette edge"); } } DebugRecorder::recordTriangleFan(luneDetected, self.count, self.totalWeight, self.triangleSolidAngles); +#else + DebugRecorder::recordTriangleFan(false, self.count, self.totalWeight, self.triangleSolidAngles); +#endif return self; } - // Sample using cached selection weights, recompute geometry on-demand - float32_t3 sample(ClippedSilhouette silhouette, float32_t2 xi, out float32_t pdf, out uint32_t selectedIdx) + // TractableSampler::generate. Picks a fan triangle by xi.x via the cached + // CDF, samples within it, and registers (selectedIdx, pdf) in the cache so + // forwardPdf is an O(1) load. Geometry is reconstructed on-demand from + // `this->verts`. The CDF-select and triangle-reconstruct steps both use + // literal-index cascades on count / vertexIdx -- a single dynamic-index + // access into verts / cdf / triangleIndices would demote those arrays to + // Function memory and slow every call. 
+ codomain_type generate(domain_type xi, NBL_REF_ARG(cache_type) cache) { - selectedIdx = 0; - // Handle empty or invalid data if (count == 0 || totalWeight <= 0.0f) { - pdf = 0.0f; - return float32_t3(0, 0, 1); + cache.pdf = 0.0f; + cache.selectedIdx = 0; + return codomain_type(0, 0, 1); } - // Select triangle via precomputed normalized CDF - float32_t prevCdf = 0.0f; - NBL_UNROLL - for (uint32_t i = 0; i < count; i++) + // Use a local idx for all the cascade work; assign to the cache once at + // the end so the cache field doesn't get pessimised by repeated stores. + uint32_t idx = count - 1u; // fall-through default for numerical roundoff + scalar_type prevCdf = 0.0f; + if (xi.x <= cdf[0]) { - if (xi.x <= cdf[i]) - { - selectedIdx = i; - break; - } - prevCdf = cdf[i]; + idx = 0; + } + else if (count > 1 && xi.x <= cdf[1]) + { + idx = 1; + prevCdf = cdf[0]; + } + else if (count > 2 && xi.x <= cdf[2]) + { + idx = 2; + prevCdf = cdf[1]; + } + else if (count > 3 && xi.x <= cdf[3]) + { + idx = 3; + prevCdf = cdf[2]; } + else if (count > 4 && xi.x <= cdf[4]) + { + idx = 4; + prevCdf = cdf[3]; + } + else // fall-through to last valid triangle + { + if (count == 2) + prevCdf = cdf[0]; + else if (count == 3) + prevCdf = cdf[1]; + else if (count == 4) + prevCdf = cdf[2]; + else if (count == 5) + prevCdf = cdf[3]; + } + cache.selectedIdx = idx; - // Remap xi.x to [0,1] within selected triangle's CDF interval - float32_t cdfWidth = cdf[selectedIdx] - prevCdf; - float32_t u = (xi.x - prevCdf) / max(cdfWidth, 1e-7f); - float32_t triSolidAngle = triangleSolidAngles[selectedIdx]; + // cdf[idx] read also via cascade so the array stays SROA'd. 
+ scalar_type selectedCdf; + if (idx == 0) + selectedCdf = cdf[0]; + else if (idx == 1) + selectedCdf = cdf[1]; + else if (idx == 2) + selectedCdf = cdf[2]; + else if (idx == 3) + selectedCdf = cdf[3]; + else + selectedCdf = cdf[4]; - // Reconstruct the selected triangle geometry - uint32_t vertexIdx = triangleIndices[selectedIdx]; - float32_t3 v0 = silhouette.vertices[0]; - float32_t3 v1 = silhouette.vertices[vertexIdx]; - float32_t3 v2 = silhouette.vertices[vertexIdx + 1]; + const scalar_type cdfWidth = selectedCdf - prevCdf; + const scalar_type u = (xi.x - prevCdf) / max(cdfWidth, 1e-7f); - float32_t3 origin = float32_t3(0, 0, 0); + scalar_type triSolidAngle; + if (idx == 0) + triSolidAngle = triangleSolidAngles[0]; + else if (idx == 1) + triSolidAngle = triangleSolidAngles[1]; + else if (idx == 2) + triSolidAngle = triangleSolidAngles[2]; + else if (idx == 3) + triSolidAngle = triangleSolidAngles[3]; + else + triSolidAngle = triangleSolidAngles[4]; - const float32_t3 triVerts[3] = {v0, v1, v2}; - shapes::SphericalTriangle shapeTri = shapes::SphericalTriangle::create(triVerts, origin); + uint32_t vertexIdx; + if (idx == 0) + vertexIdx = triangleIndices[0]; + else if (idx == 1) + vertexIdx = triangleIndices[1]; + else if (idx == 2) + vertexIdx = triangleIndices[2]; + else if (idx == 3) + vertexIdx = triangleIndices[3]; + else + vertexIdx = triangleIndices[4]; + + // Reconstruct triangle geometry. vertexIdx is in [1, MAX_SILHOUETTE_VERTICES-2] + // and is data-dependent on xi -- cascade so verts[vertexIdx] / verts[vertexIdx+1] + // become literal-index reads. With our 7-vertex max, vertexIdx <= 5. 
+ const codomain_type v0 = verts[0]; + codomain_type v1, v2; + if (vertexIdx == 1) + { + v1 = verts[1]; + v2 = verts[2]; + } + else if (vertexIdx == 2) + { + v1 = verts[2]; + v2 = verts[3]; + } + else if (vertexIdx == 3) + { + v1 = verts[3]; + v2 = verts[4]; + } + else if (vertexIdx == 4) + { + v1 = verts[4]; + v2 = verts[5]; + } + else + { + v1 = verts[5]; + v2 = verts[6]; + } // vertexIdx == 5 + + const codomain_type origin = codomain_type(0, 0, 0); + + const codomain_type triVerts[3] = {v0, v1, v2}; + shapes::SphericalTriangle shapeTri = shapes::SphericalTriangle::create(triVerts, origin); // Sample based on mode - float32_t3 direction; - const float32_t2 u2 = float32_t2(u, xi.y); + codomain_type direction; + const domain_type u2 = domain_type(u, xi.y); - if (samplingMode == SAMPLING_MODE::TRIANGLE_PROJECTED_SOLID_ANGLE) + if (Projected) { - // faceNormal was precomputed during create() -- silhouette is planar - sampling::ProjectedSphericalTriangle samplingTri = sampling::ProjectedSphericalTriangle::create(shapeTri, faceNormal, false); - sampling::ProjectedSphericalTriangle::cache_type cache; - direction = samplingTri.generate(u2, cache); - triSolidAngle = 1.0f / samplingTri.forwardPdf(u2, cache); + // faceNormal was precomputed during create(), silhouette is planar + sampling::ProjectedSphericalTriangle samplingTri = sampling::ProjectedSphericalTriangle::create(shapeTri, faceNormal, false); + sampling::ProjectedSphericalTriangle::cache_type triCache; + direction = samplingTri.generate(u2, triCache); + triSolidAngle = 1.0f / samplingTri.forwardPdf(u2, triCache); } else { - sampling::SphericalTriangle samplingTri = sampling::SphericalTriangle::create(shapeTri); - sampling::SphericalTriangle::cache_type cache; - direction = samplingTri.generate(u2, cache); + sampling::SphericalTriangle samplingTri = sampling::SphericalTriangle::create(shapeTri); + sampling::SphericalTriangle::cache_type triCache; + direction = samplingTri.generate(u2, triCache); } - // Calculate 
PDF - float32_t trianglePdf = 1.0f / triSolidAngle; - float32_t selectionProb = triSolidAngle / totalWeight; - pdf = trianglePdf * selectionProb; + // Calculate PDF: trianglePdf * selectionProb where the per-triangle pdf + // is 1/triSolidAngle (uniform over the spherical triangle) and the + // selection probability is triSolidAngle / totalWeight. + cache.pdf = (1.0f / triSolidAngle) * (triSolidAngle / totalWeight); return normalize(direction); } + + density_type forwardPdf(domain_type u, cache_type cache) NBL_CONST_MEMBER_FUNC { return cache.pdf; } + weight_type forwardWeight(domain_type u, cache_type cache) NBL_CONST_MEMBER_FUNC { return cache.pdf; } + uint32_t selectedIdx(cache_type cache) NBL_CONST_MEMBER_FUNC { return cache.selectedIdx; } + + // Process one fan triangle (v0, self.verts[I], self.verts[I+1]) at the cascade level. + // I is a template constant so self.verts[I] / self.verts[I+1] / triangleSolidAngles[I-1] + // / triangleIndices[I-1] are all literal-index accesses; the body's + // append-to-slot-(I-1) only works because we treat degenerate triangles as + // zero-weight rather than skipping them. This is a behavior change from the + // old `count++ on non-degenerate` form: degenerate triangles now occupy a + // slot with zero weight, which contributes nothing to the CDF and has + // selection probability 0, so the sampling result is unchanged. + template + static void processFanTri(float32_t3 v0, float32_t3 faceNormal, NBL_REF_ARG(TriangleFanSampler) self) + { + const float32_t3 v1 = self.verts[I]; + const float32_t3 v2 = self.verts[I + 1]; + + const float32_t3 origin = float32_t3(0, 0, 0); + const float32_t3 triVerts[3] = {v0, v1, v2}; + shapes::SphericalTriangle shapeTri = shapes::SphericalTriangle::create(triVerts, origin); + + // Compute solid angle (or projected) and clamp to >= 0; degenerate + // triangles end up with zero weight and don't affect sampling. + float32_t sa = Projected ? 
shapeTri.projectedSolidAngle(faceNormal) : shapeTri.solid_angle; + sa = max(sa, 0.0f); + + self.triangleSolidAngles[I - 1u] = sa; + self.triangleIndices[I - 1u] = I; + self.totalWeight += sa; + } }; #endif // _SOLID_ANGLE_VIS_EXAMPLE_TRIANGLE_SAMPLING_HLSL_INCLUDED_ diff --git a/73_SolidAngleVisualizer/main.cpp b/73_SolidAngleVisualizer/main.cpp index ecc3694f5..78313e413 100644 --- a/73_SolidAngleVisualizer/main.cpp +++ b/73_SolidAngleVisualizer/main.cpp @@ -10,6 +10,40 @@ #include #include +//#include "app_resources/hlsl/silhouette.hlsl" +//#include "app_resources/hlsl/parallelogram_sampling.hlsl" +//#include "app_resources/hlsl/pyramid_sampling.hlsl" +//#include "app_resources/hlsl/triangle_sampling.hlsl" +//#include + +// ============================================================================ +// Compile-time concept verification (mirrors example 37 main.cpp). Each +// example sampler must satisfy TractableSampler: +// typedef domain_type, codomain_type, density_type, cache_type +// codomain_type generate(domain_type, ref cache_type) +// density_type forwardPdf(domain_type, cache_type) +// SphericalPyramid is checked across all four (UseCaliper, InnerSampler) +// pairs that the frag shader / benchmark actually instantiate. 
+// ============================================================================ + + //static_assert(nbl::hlsl::sampling::concepts::TractableSampler); + //static_assert(nbl::hlsl::sampling::concepts::TractableSampler>); + //static_assert(nbl::hlsl::sampling::concepts::TractableSampler>); + //static_assert(nbl::hlsl::sampling::concepts::TractableSampler); + //static_assert(nbl::hlsl::sampling::concepts::TractableSampler>>); + //static_assert(nbl::hlsl::sampling::concepts::TractableSampler>>); + //static_assert(nbl::hlsl::sampling::concepts::TractableSampler>>); + //static_assert(nbl::hlsl::sampling::concepts::TractableSampler>); + +// App execution mode -- pick at compile time via -DAPP_MODE=N +// APP_MODE_VISUALIZER (1) full visualization with debug + ImGui editor (default) +// APP_MODE_NSIGHT_BENCHMARKS(2) submits one dispatch per SAMPLING_MODE_FLAGS in a single capture, then exits +#define APP_MODE_VISUALIZER 1 +#define APP_MODE_NSIGHT_BENCHMARKS 2 +#ifndef APP_MODE +#define APP_MODE APP_MODE_VISUALIZER +#endif + /* Renders scene texture to an offscreen framebuffer whose color attachment is then sampled into a imgui window. 
@@ -56,6 +90,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR return logFail("Couldn't create Command Buffer!"); } +#if APP_MODE == APP_MODE_VISUALIZER const uint32_t addtionalBufferOwnershipFamilies[] = {getGraphicsQueue()->getFamilyIndex()}; m_scene = CGeometryCreatorScene::create( {.transferQueue = getTransferUpQueue(), @@ -63,6 +98,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR .logger = m_logger.get(), .addtionalBufferOwnershipFamilies = addtionalBufferOwnershipFamilies}, CSimpleDebugRenderer::DefaultPolygonGeometryPatch); +#endif // for the scene drawing pass { @@ -137,6 +173,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR return logFail("Failed to create Solid Angle Renderpass!"); } +#if APP_MODE == APP_MODE_VISUALIZER const auto& geometries = m_scene->getInitParams().geometries; m_renderer = CSimpleDebugRenderer::create(m_assetMgr.get(), m_solidAngleRenderpass.get(), 0, {&geometries.front().get(), geometries.size()}); // special case @@ -152,6 +189,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR } // we'll only display one thing at a time m_renderer->m_instances.resize(1); +#endif // Create graphics pipeline { @@ -181,28 +219,24 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR if (!fsTriProtoPPln) return logFail("Failed to create Full Screen Triangle protopipeline or load its vertex shader!"); - // Load pre-compiled fragment shaders (6 modes x 2 debug = 12 SolidAngleVis + 2 RayVis) - // Can't use string literal template args in a loop, so unroll manually - // Index: mode * 2 + debugFlag (0=release, 1=debug) - smart_refctd_ptr saVisShaders[SAMPLING_MODE::Count * DebugPermutations]; - saVisShaders[0] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_tri_sa">(m_device.get())); - saVisShaders[1] = 
loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_tri_sa_dbg">(m_device.get())); - saVisShaders[2] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_tri_psa">(m_device.get())); - saVisShaders[3] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_tri_psa_dbg">(m_device.get())); - saVisShaders[4] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_para">(m_device.get())); - saVisShaders[5] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_para_dbg">(m_device.get())); - saVisShaders[6] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_rectangle">(m_device.get())); - saVisShaders[7] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_rectangle_dbg">(m_device.get())); - saVisShaders[8] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_biquad">(m_device.get())); - saVisShaders[9] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_biquad_dbg">(m_device.get())); - saVisShaders[10] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_bilinear">(m_device.get())); - saVisShaders[11] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_bilinear_dbg">(m_device.get())); - saVisShaders[12] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_proj_rectangle">(m_device.get())); - saVisShaders[13] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_proj_rectangle_dbg">(m_device.get())); - saVisShaders[14] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_silhouette">(m_device.get())); - saVisShaders[15] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_silhouette_dbg">(m_device.get())); - saVisShaders[16] = 
loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_pyramid">(m_device.get())); - saVisShaders[17] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"sa_vis_pyramid_dbg">(m_device.get())); + smart_refctd_ptr saVisShaders[SAMPLING_MODE_FLAGS::Count * DebugPermutations]; + + auto addSaVis = [&](SAMPLING_MODE_FLAGS mode) + { + saVisShaders[denseIdOf(mode) * DebugPermutations + 0] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key(m_device.get())); + saVisShaders[denseIdOf(mode) * DebugPermutations + 1] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key(m_device.get())); + }; + + addSaVis.template operator()<"sa_vis_tri_sa", "sa_vis_tri_sa_dbg">(SAMPLING_MODE_FLAGS::TRIANGLE_SOLID_ANGLE); + addSaVis.template operator()<"sa_vis_tri_psa", "sa_vis_tri_psa_dbg">(SAMPLING_MODE_FLAGS::TRIANGLE_PROJECTED_SOLID_ANGLE); + addSaVis.template operator()<"sa_vis_para", "sa_vis_para_dbg">(SAMPLING_MODE_FLAGS::PROJECTED_PARALLELOGRAM_SOLID_ANGLE); + addSaVis.template operator()<"sa_vis_rectangle", "sa_vis_rectangle_dbg">(SAMPLING_MODE_FLAGS::SPH_RECT_FROM_PYRAMID); + addSaVis.template operator()<"sa_vis_bilinear", "sa_vis_bilinear_dbg">(SAMPLING_MODE_FLAGS::BILINEAR_FROM_PYRAMID); + addSaVis.template operator()<"sa_vis_proj_rectangle", "sa_vis_proj_rectangle_dbg">(SAMPLING_MODE_FLAGS::PROJ_SPH_RECT_FROM_PYRAMID); + addSaVis.template operator()<"sa_vis_silhouette", "sa_vis_silhouette_dbg">(SAMPLING_MODE_FLAGS::SILHOUETTE_CREATION_ONLY); + addSaVis.template operator()<"sa_vis_pyramid", "sa_vis_pyramid_dbg">(SAMPLING_MODE_FLAGS::PYRAMID_CREATION_ONLY); + addSaVis.template operator()<"sa_vis_caliper_pyramid", "sa_vis_caliper_pyramid_dbg">(SAMPLING_MODE_FLAGS::CALIPER_PYRAMID_CREATION_ONLY); + addSaVis.template operator()<"sa_vis_caliper_rectangle", "sa_vis_caliper_rectangle_dbg">(SAMPLING_MODE_FLAGS::SPH_RECT_FROM_CALIPER_PYRAMID); smart_refctd_ptr rayVisShaders[DebugPermutations]; 
rayVisShaders[0] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"ray_vis">(m_device.get())); @@ -230,7 +264,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR { // Create all SolidAngleVis pipeline variants - for (uint32_t i = 0; i < SAMPLING_MODE::Count * DebugPermutations; i++) + for (uint32_t i = 0; i < SAMPLING_MODE_FLAGS::Count * DebugPermutations; i++) { const IGPUPipelineBase::SShaderSpecInfo fragSpec = { .shader = saVisShaders[i].get(), @@ -298,6 +332,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR m_device->invalidateMappedMemoryRanges(1, &memoryRange); } +#if APP_MODE == APP_MODE_VISUALIZER // Create ImGUI { auto scRes = static_cast(m_surface->getSwapchainResources()); @@ -352,22 +387,84 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR } interface.camera.mapKeysToWASD(); - +#endif + +#if APP_MODE == APP_MODE_NSIGHT_BENCHMARKS + // The actual one-shot runs from inside the first renderFrame() so NSight's Shader Profiler has + // the same render-loop context as the working UI-button-triggered benchmark. Just seed the OBB + // matrix here from the default TRS so the bench shaders see sane inputs. 
+ ImGuizmo::RecomposeMatrixFromComponents(&interface.m_TRS.translation.x, &interface.m_TRS.rotation.x, &interface.m_TRS.scale.x, &interface.m_OBBModelMatrix[0][0]); +#endif onAppInitializedFinish(); return true; } + virtual inline bool keepRunning() override + { + if (!m_keepRunning) + return false; + return device_base_t::keepRunning(); + } + // virtual inline bool onAppTerminated() { +#if APP_MODE == APP_MODE_VISUALIZER SubAllocatedDescriptorSet::value_type fontAtlasDescIx = ext::imgui::UI::FontAtlasTexId; IGPUDescriptorSet::SDropDescriptorSet dummy[1]; interface.subAllocDS->multi_deallocate(dummy, TexturesImGUIBindingIndex, 1, &fontAtlasDescIx); +#endif return device_base_t::onAppTerminated(); } inline IQueue::SSubmitInfo::SSemaphoreInfo renderFrame(const std::chrono::microseconds nextPresentationTimestamp) override { +#if APP_MODE == APP_MODE_NSIGHT_BENCHMARKS + // Minimal frame: run the one-shot once (inside the render loop so NSight's Shader Profiler + // has the same context as the UI-triggered benchmark), then submit a bare swapchain clear + // to satisfy the framework's frame contract, and signal exit on the next loop iteration. 
+ if (!m_nsightBenchDone) + { + SamplingBenchmark(*this).runNSightOneShot(); + m_nsightBenchDone = true; + m_keepRunning = false; + } + + const auto resourceIx = m_realFrameIx % MaxFramesInFlight; + auto* const cb = m_cmdBufs.data()[resourceIx].get(); + cb->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); + cb->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + { + auto* scRes = static_cast(m_surface->getSwapchainResources()); + const IGPUCommandBuffer::SClearColorValue clearValue = {.float32 = {0.f, 0.f, 0.f, 1.f}}; + const IGPUCommandBuffer::SRenderpassBeginInfo renderpassInfo = + {.framebuffer = scRes->getFramebuffer(device_base_t::getCurrentAcquire().imageIndex), + .colorClearValues = &clearValue, + .depthStencilClearValues = nullptr, + .renderArea = {.offset = {0, 0}, .extent = {m_window->getWidth(), m_window->getHeight()}}}; + beginRenderpass(cb, renderpassInfo); + cb->endRenderPass(); + } + cb->end(); + + IQueue::SSubmitInfo::SSemaphoreInfo retval = + {.semaphore = m_semaphore.get(), + .value = ++m_realFrameIx, + .stageMask = PIPELINE_STAGE_FLAGS::ALL_GRAPHICS_BITS}; + const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[] = {{.cmdbuf = cb}}; + const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = { + {.semaphore = device_base_t::getCurrentAcquire().semaphore, + .value = device_base_t::getCurrentAcquire().acquireCount, + .stageMask = PIPELINE_STAGE_FLAGS::NONE}}; + const IQueue::SSubmitInfo infos[] = { + {.waitSemaphores = acquired, .commandBuffers = commandBuffers, .signalSemaphores = {&retval, 1}}}; + if (getGraphicsQueue()->submit(infos) != IQueue::RESULT::SUCCESS) + { + retval.semaphore = nullptr; + m_realFrameIx--; + } + return retval; +#else // CPU events update(nextPresentationTimestamp); @@ -419,7 +516,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR .sampleCount = static_cast(m_SampleCount), .frameIndex = lastFrameSeed}; const uint32_t debugIdx = m_debugVisualization ? 
1u : 0u; - auto pipeline = m_solidAngleVisPipelines[m_samplingMode * DebugPermutations + debugIdx]; + auto pipeline = m_solidAngleVisPipelines[denseIdOf(m_samplingMode) * DebugPermutations + debugIdx]; cb->bindGraphicsPipeline(pipeline.get()); cb->pushConstants(pipeline->getLayout(), hlsl::ShaderStage::ESS_FRAGMENT, 0, sizeof(pc), &pc); cb->bindDescriptorSets(nbl::asset::EPBP_GRAPHICS, pipeline->getLayout(), 0, 1, &m_ds.get()); @@ -567,6 +664,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR m_window->setCaption("[Nabla Engine] UI App Test Demo"); return retval; +#endif } protected: @@ -705,8 +803,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR const uint16_t2 mainViewRes = interface.mainViewTransformReturnInfo.sceneResolution; // detect window minimization - if (solidAngleViewRes.x < 0x4000 && solidAngleViewRes.y < 0x4000 || - mainViewRes.x < 0x4000 && mainViewRes.y < 0x4000) + if (solidAngleViewRes.x < 0x4000 && solidAngleViewRes.y < 0x4000 || mainViewRes.x < 0x4000 && mainViewRes.y < 0x4000) { solidAngleView = createImageAndView(solidAngleViewRes, finalSceneRenderFormat); auto solidAngleDepthView = createImageAndView(solidAngleViewRes, sceneRenderDepthFormat); @@ -783,12 +880,14 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR // we create the Descriptor Set with a few slots extra to spare, so we don't have to `waitIdle` the device whenever ImGUI virtual window resizes constexpr static inline auto MaxImGUITextures = 2u + MaxFramesInFlight; - static inline SAMPLING_MODE m_samplingMode = SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_RECTANGLE; + static inline SAMPLING_MODE_FLAGS m_samplingMode = SAMPLING_MODE_FLAGS::SPH_RECT_FROM_PYRAMID; static inline bool m_debugVisualization = true; static inline int m_SampleCount = 64; static inline int m_BenchmarkSampleCount = 128; static inline bool m_frameSeeding = true; static inline ResultData m_GPUOutResulData; + 
bool m_keepRunning = true; + bool m_nsightBenchDone = false; // smart_refctd_ptr m_scene; smart_refctd_ptr m_solidAngleRenderpass; @@ -798,7 +897,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR smart_refctd_ptr m_mainViewFramebuffer; // Pipeline variants: SolidAngleVis indexed by [mode * 2 + debugFlag], RayVis by [debugFlag] static constexpr uint32_t DebugPermutations = 2; - smart_refctd_ptr m_solidAngleVisPipelines[SAMPLING_MODE::Count * DebugPermutations]; + smart_refctd_ptr m_solidAngleVisPipelines[SAMPLING_MODE_FLAGS::Count * DebugPermutations]; smart_refctd_ptr m_rayVisPipelines[DebugPermutations]; // nbl::video::IDeviceMemoryAllocator::SAllocation m_allocation = {}; @@ -863,28 +962,25 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR ImGui::Text("Sampling Mode:"); ImGui::SameLine(); - const char* samplingModes[] = - { - "Triangle Solid Angle", - "Triangle Projected Solid Angle", - "Parallelogram Projected Solid Angle", - "Rectangle Pyramid Solid Angle", - "Biquadratic pyramid solid angle", - "Bilinear pyramid solid angle", - "Projected Rectangle Pyramid", - "Silhouette only (benchmark)", - "Pyramid only (benchmark)"}; - - int currentMode = static_cast(m_samplingMode); - - if (ImGui::Combo("##SamplingMode", ¤tMode, samplingModes, IM_ARRAYSIZE(samplingModes))) + const char* samplingModes[SAMPLING_MODE_FLAGS::Count - 3] = {}; + samplingModes[denseIdOf(SAMPLING_MODE_FLAGS::SPH_RECT_FROM_PYRAMID)] = "Spherical Rectangle From Pyramid"; + samplingModes[denseIdOf(SAMPLING_MODE_FLAGS::SPH_RECT_FROM_CALIPER_PYRAMID)] = "Caliper Rectangle From Pyramid"; + samplingModes[denseIdOf(SAMPLING_MODE_FLAGS::PROJ_SPH_RECT_FROM_PYRAMID)] = "Projected Spherical Rectangle From Pyramid"; + samplingModes[denseIdOf(SAMPLING_MODE_FLAGS::TRIANGLE_SOLID_ANGLE)] = "Spherical Triangle"; + samplingModes[denseIdOf(SAMPLING_MODE_FLAGS::TRIANGLE_PROJECTED_SOLID_ANGLE)] = "Projected Spherical Triangle"; + 
samplingModes[denseIdOf(SAMPLING_MODE_FLAGS::PROJECTED_PARALLELOGRAM_SOLID_ANGLE)] = "Projected Parallelogram"; + samplingModes[denseIdOf(SAMPLING_MODE_FLAGS::BILINEAR_FROM_PYRAMID)] = "Bilinear Pyramid"; + + int currentMode = static_cast(denseIdOf(m_samplingMode)); + + if (ImGui::Combo("##SamplingMode", ¤tMode, samplingModes, SAMPLING_MODE_FLAGS::Count - 3)) { - m_samplingMode = static_cast(currentMode); + m_samplingMode = kAllModes[currentMode]; } ImGui::Checkbox("Debug Visualization", &m_debugVisualization); ImGui::Text("Pipeline idx: SA=%d, Ray=%d", - static_cast(m_samplingMode) * DebugPermutations + (m_debugVisualization ? 1 : 0), + static_cast(denseIdOf(m_samplingMode)) * DebugPermutations + (m_debugVisualization ? 1 : 0), m_debugVisualization ? 1 : 0); ImGui::Checkbox("Frame seeding", &m_frameSeeding); @@ -1179,7 +1275,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR for (uint32_t i = 0; i < 4; i++) ImGui::Text("Corner[%u]: (%.3f, %.3f)", i, m_GPUOutResulData.parallelogram.corners[i].x, m_GPUOutResulData.parallelogram.corners[i].y); } - else if ((m_samplingMode == SYMMETRIC_PYRAMID_SOLID_ANGLE_RECTANGLE || m_samplingMode == SYMMETRIC_PYRAMID_SOLID_ANGLE_BIQUADRATIC || m_samplingMode == SYMMETRIC_PYRAMID_SOLID_ANGLE_BILINEAR) && ImGui::CollapsingHeader("Spherical Pyramid", ImGuiTreeNodeFlags_DefaultOpen)) + else if ((m_samplingMode == SPH_RECT_FROM_PYRAMID || m_samplingMode == PROJ_SPH_RECT_FROM_PYRAMID || m_samplingMode == BILINEAR_FROM_PYRAMID || m_samplingMode == SPH_RECT_FROM_CALIPER_PYRAMID) && ImGui::CollapsingHeader("Spherical Pyramid", ImGuiTreeNodeFlags_DefaultOpen)) { ImGui::Text("Best Caliper Edge: %u", m_GPUOutResulData.pyramid.bestEdge); ImGui::Separator(); @@ -1536,29 +1632,26 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR return shader; }; - const char* shaderNames[SAMPLING_MODE::Count] = { - "benchmark_tri_sa", - "benchmark_tri_psa", - "benchmark_para", - 
"benchmark_rectangle", - "benchmark_biquad", - "benchmark_bilinear", - "benchmark_proj_rectangle", - "benchmark_silhouette", - "benchmark_pyramid_creation", - }; - smart_refctd_ptr shaders[SAMPLING_MODE::Count] = { - loadShader(nbl::this_example::builtin::build::get_spirv_key<"benchmark_tri_sa">(m_device.get())), - loadShader(nbl::this_example::builtin::build::get_spirv_key<"benchmark_tri_psa">(m_device.get())), - loadShader(nbl::this_example::builtin::build::get_spirv_key<"benchmark_para">(m_device.get())), - loadShader(nbl::this_example::builtin::build::get_spirv_key<"benchmark_rectangle">(m_device.get())), - loadShader(nbl::this_example::builtin::build::get_spirv_key<"benchmark_biquad">(m_device.get())), - loadShader(nbl::this_example::builtin::build::get_spirv_key<"benchmark_bilinear">(m_device.get())), - loadShader(nbl::this_example::builtin::build::get_spirv_key<"benchmark_proj_rectangle">(m_device.get())), - loadShader(nbl::this_example::builtin::build::get_spirv_key<"benchmark_silhouette">(m_device.get())), - loadShader(nbl::this_example::builtin::build::get_spirv_key<"benchmark_pyramid_creation">(m_device.get())), + const char* shaderNames[SAMPLING_MODE_FLAGS::Count] = {}; + smart_refctd_ptr shaders[SAMPLING_MODE_FLAGS::Count]; + + auto addBench = [&](SAMPLING_MODE_FLAGS mode) + { + shaderNames[denseIdOf(mode)] = Key.value; + shaders[denseIdOf(mode)] = loadShader(nbl::this_example::builtin::build::get_spirv_key(m_device.get())); }; + addBench.template operator()<"benchmark_tri_sa">(SAMPLING_MODE_FLAGS::TRIANGLE_SOLID_ANGLE); + addBench.template operator()<"benchmark_tri_psa">(SAMPLING_MODE_FLAGS::TRIANGLE_PROJECTED_SOLID_ANGLE); + addBench.template operator()<"benchmark_para">(SAMPLING_MODE_FLAGS::PROJECTED_PARALLELOGRAM_SOLID_ANGLE); + addBench.template operator()<"benchmark_rectangle">(SAMPLING_MODE_FLAGS::SPH_RECT_FROM_PYRAMID); + addBench.template operator()<"benchmark_bilinear">(SAMPLING_MODE_FLAGS::BILINEAR_FROM_PYRAMID); + addBench.template 
operator()<"benchmark_proj_rectangle">(SAMPLING_MODE_FLAGS::PROJ_SPH_RECT_FROM_PYRAMID); + addBench.template operator()<"benchmark_silhouette">(SAMPLING_MODE_FLAGS::SILHOUETTE_CREATION_ONLY); + addBench.template operator()<"benchmark_pyramid_creation">(SAMPLING_MODE_FLAGS::PYRAMID_CREATION_ONLY); + addBench.template operator()<"benchmark_caliper_pyramid_creation">(SAMPLING_MODE_FLAGS::CALIPER_PYRAMID_CREATION_ONLY); + addBench.template operator()<"benchmark_caliper_rectangle">(SAMPLING_MODE_FLAGS::SPH_RECT_FROM_CALIPER_PYRAMID); + nbl::video::IGPUDescriptorSetLayout::SBinding bindings[1] = { {.binding = 0, .type = nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER, @@ -1577,7 +1670,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR if (!m_pplnLayout) base.logFail("Failed to create a Pipeline Layout!\n"); - for (uint32_t i = 0; i < SAMPLING_MODE::Count; i++) + for (uint32_t i = 0; i < SAMPLING_MODE_FLAGS::Count; i++) { IGPUComputePipeline::SCreationParams params = {}; params.layout = m_pplnLayout.get(); @@ -1645,7 +1738,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR void run() { // Pipeline executable reports first so the timings cluster at the bottom of the log. 
- for (uint32_t i = 0; i < SAMPLING_MODE::Count; i++) + for (uint32_t i = 0; i < SAMPLING_MODE_FLAGS::Count; i++) { if (!m_pipelineReports[i].empty()) m_logger->log("%s Pipeline Executable Report:\n%s", ILogger::ELL_PERFORMANCE, m_pipelineReportNames[i], m_pipelineReports[i].c_str()); @@ -1655,44 +1748,109 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR m_logger->log("\n\n=== GPU Sampler Benchmarks (%d dispatches, %llu threads/dispatch, %d samples/thread, ps/sample is per all GPU threads) ===", ILogger::ELL_PERFORMANCE, Dispatches, totalThreads, m_BenchmarkSampleCount); m_logger->log(" timestampPeriod = %.1f ps/tick", ILogger::ELL_PERFORMANCE, m_timestampPeriodNs * 1000.0); - m_logger->log("%-28s | %-12s | %9s | %10s | %10s", + m_logger->log("%-29s | %-12s | %9s | %10s | %10s", ILogger::ELL_PERFORMANCE, "Sampler", "Mode", "ps/sample", "GSamples/s", "ms total"); struct SamplerEntry { const char* name; - SAMPLING_MODE mode; + SAMPLING_MODE_FLAGS mode; }; const SamplerEntry samplers[] = { - {.name = "PYRAMID_RECTANGLE", .mode = SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_RECTANGLE}, - {.name = "PYRAMID_PROJ_RECTANGLE", .mode = SAMPLING_MODE::SYMMETRIC_PYRAMID_PROJECTED_SOLID_ANGLE_RECTANGLE}, - {.name = "PYRAMID_BIQUADRATIC", .mode = SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_BIQUADRATIC}, - {.name = "PYRAMID_BILINEAR", .mode = SAMPLING_MODE::SYMMETRIC_PYRAMID_SOLID_ANGLE_BILINEAR}, - {.name = "PARALLELOGRAM", .mode = SAMPLING_MODE::PROJECTED_PARALLELOGRAM_SOLID_ANGLE}, - {.name = "TRIANGLE_SA", .mode = SAMPLING_MODE::TRIANGLE_SOLID_ANGLE}, - {.name = "TRIANGLE_PSA", .mode = SAMPLING_MODE::TRIANGLE_PROJECTED_SOLID_ANGLE}, + {.name = "PYRAMID_RECTANGLE", .mode = SAMPLING_MODE_FLAGS::SPH_RECT_FROM_PYRAMID}, + {.name = "CALIPER_PYRAMID_RECTANGLE", .mode = SAMPLING_MODE_FLAGS::SPH_RECT_FROM_CALIPER_PYRAMID}, + {.name = "PYRAMID_PROJ_RECTANGLE", .mode = SAMPLING_MODE_FLAGS::PROJ_SPH_RECT_FROM_PYRAMID}, + {.name = "PYRAMID_BILINEAR", 
.mode = SAMPLING_MODE_FLAGS::BILINEAR_FROM_PYRAMID}, + {.name = "PARALLELOGRAM", .mode = SAMPLING_MODE_FLAGS::PROJECTED_PARALLELOGRAM_SOLID_ANGLE}, + {.name = "TRIANGLE_SA", .mode = SAMPLING_MODE_FLAGS::TRIANGLE_SOLID_ANGLE}, + {.name = "TRIANGLE_PSA", .mode = SAMPLING_MODE_FLAGS::TRIANGLE_PROJECTED_SOLID_ANGLE}, }; // Creation-only modes: report per-creation, not per-sample. - performBenchmark("SILHOUETTE_CREATION_ONLY", SAMPLING_MODE::SILHOUETTE_CREATION_ONLY, totalThreads, 0); - performBenchmark("PYRAMID_CREATION_ONLY", SAMPLING_MODE::PYRAMID_CREATION_ONLY, totalThreads, 0); + performBenchmark("SILHOUETTE_CREATION_ONLY", SAMPLING_MODE_FLAGS::SILHOUETTE_CREATION_ONLY, totalThreads, 0); + performBenchmark("PYRAMID_CREATION_ONLY", SAMPLING_MODE_FLAGS::PYRAMID_CREATION_ONLY, totalThreads, 0); + performBenchmark("CALIPER_PYRAMID_CREATION_ONLY", SAMPLING_MODE_FLAGS::CALIPER_PYRAMID_CREATION_ONLY, totalThreads, 0); // Modes per sampler: 1 creation per N samples. 1 = no amortization, sampleCount = full amortization. - const uint32_t modeRatios[] = {1u, 16u, uint32_t(m_BenchmarkSampleCount)}; + const uint32_t modeRatios[] = {1u, 16u, static_cast(m_BenchmarkSampleCount)}; for (uint32_t spc : modeRatios) for (const auto& s : samplers) performBenchmark(s.name, s.mode, totalThreads, spc); + } + + // Many dispatches per SAMPLING_MODE_FLAGS, all in a single capture. Intended for NSight submit-mode + // captures with the Shader Profiler -- each mode's range needs sustained execution so PC sampling + // can gather enough source-line hits. 
+ void runNSightOneShot() + { + const char* modeNames[SAMPLING_MODE_FLAGS::Count] = {}; + modeNames[denseIdOf(SAMPLING_MODE_FLAGS::SPH_RECT_FROM_CALIPER_PYRAMID)] = "CALIPER_PYRAMID_RECTANGLE"; + modeNames[denseIdOf(SAMPLING_MODE_FLAGS::SPH_RECT_FROM_PYRAMID)] = "PYRAMID_RECTANGLE"; + modeNames[denseIdOf(SAMPLING_MODE_FLAGS::PROJ_SPH_RECT_FROM_PYRAMID)] = "PYRAMID_PROJ_RECTANGLE"; + modeNames[denseIdOf(SAMPLING_MODE_FLAGS::TRIANGLE_SOLID_ANGLE)] = "TRIANGLE_SA"; + modeNames[denseIdOf(SAMPLING_MODE_FLAGS::TRIANGLE_PROJECTED_SOLID_ANGLE)] = "TRIANGLE_PSA"; + modeNames[denseIdOf(SAMPLING_MODE_FLAGS::PROJECTED_PARALLELOGRAM_SOLID_ANGLE)] = "PARALLELOGRAM"; + modeNames[denseIdOf(SAMPLING_MODE_FLAGS::BILINEAR_FROM_PYRAMID)] = "PYRAMID_BILINEAR"; + modeNames[denseIdOf(SAMPLING_MODE_FLAGS::SILHOUETTE_CREATION_ONLY)] = "SILHOUETTE_CREATION_ONLY"; + modeNames[denseIdOf(SAMPLING_MODE_FLAGS::PYRAMID_CREATION_ONLY)] = "PYRAMID_CREATION_ONLY"; + modeNames[denseIdOf(SAMPLING_MODE_FLAGS::CALIPER_PYRAMID_CREATION_ONLY)] = "CALIPER_PYRAMID_CREATION_ONLY"; + + m_pushConstants.modelMatrix = float32_t3x4(transpose(m_visualizer->interface.m_OBBModelMatrix)); + m_pushConstants.sampleCount = static_cast(m_BenchmarkSampleCount); + m_pushConstants.samplesPerCreation = m_pushConstants.sampleCount; // full amortization: 1 creation per dispatch + + m_cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); + m_cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + m_cmdbuf->bindDescriptorSets(nbl::asset::EPBP_COMPUTE, m_pplnLayout.get(), 0, 1, &m_ds.get()); + m_cmdbuf->pushConstants(m_pplnLayout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(BenchmarkPushConstants), &m_pushConstants); + + const asset::SMemoryBarrier serializeDispatch = { + .srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, + .srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS, + .dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, + .dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS, + }; + 
const IGPUCommandBuffer::SPipelineBarrierDependencyInfo barrierInfo = {.memBarriers = {&serializeDispatch, 1}}; + for (uint32_t mode = 0; mode < SAMPLING_MODE_FLAGS::Count; ++mode) + { + m_cmdbuf->beginDebugMarker(modeNames[mode], vectorSIMDf(0, 1, 0, 1)); + m_cmdbuf->bindComputePipeline(m_pipelines[mode].get()); + for (int i = 0; i < NSightDispatchesPerMode; ++i) + { + m_cmdbuf->dispatch(BENCHMARK_WORKGROUP_COUNT, 1, 1); + if (i + 1 < NSightDispatchesPerMode) + m_cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE, barrierInfo); + } + m_cmdbuf->endDebugMarker(); + if (mode + 1u < SAMPLING_MODE_FLAGS::Count) + m_cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS::EDF_NONE, barrierInfo); + } + m_cmdbuf->end(); + + smart_refctd_ptr done = m_device->createSemaphore(0); + const IQueue::SSubmitInfo::SSemaphoreInfo signals[] = {{.semaphore = done.get(), .value = 1, .stageMask = asset::PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS}}; + IQueue::SSubmitInfo submitInfos[1] = {}; + const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[] = {{.cmdbuf = m_cmdbuf.get()}}; + submitInfos[0].commandBuffers = cmdbufs; + submitInfos[0].signalSemaphores = signals; + + m_api->startCapture(); + m_computeQueue->submit(submitInfos); + const ISemaphore::SWaitInfo waitInfo[] = {{.semaphore = done.get(), .value = 1}}; + m_device->blockForSemaphores(waitInfo); + m_api->endCapture(); + m_logger->log("NSight benchmarks: dispatched %u sampling modes in one submit.", ILogger::ELL_INFO, static_cast(SAMPLING_MODE_FLAGS::Count)); } private: // samplesPerCreation: > 0 selects sampling mode with that 1:N ratio; 0 means create-only mode (label "create-only"). 
- void performBenchmark(const char* name, SAMPLING_MODE mode, uint64_t totalThreads, uint32_t samplesPerCreation) + void performBenchmark(const char* name, SAMPLING_MODE_FLAGS mode, uint64_t totalThreads, uint32_t samplesPerCreation) { m_device->waitIdle(); - const bool isCreationBenchmark = (mode == SAMPLING_MODE::SILHOUETTE_CREATION_ONLY || mode == SAMPLING_MODE::PYRAMID_CREATION_ONLY); + const bool isCreationBenchmark = (mode == SAMPLING_MODE_FLAGS::SILHOUETTE_CREATION_ONLY || mode == SAMPLING_MODE_FLAGS::PYRAMID_CREATION_ONLY || mode == SAMPLING_MODE_FLAGS::CALIPER_PYRAMID_CREATION_ONLY); m_pushConstants.modelMatrix = float32_t3x4(transpose(m_visualizer->interface.m_OBBModelMatrix)); m_pushConstants.sampleCount = m_BenchmarkSampleCount; @@ -1730,17 +1888,16 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR else snprintf(modeBuf, sizeof(modeBuf), "1:%u", samplesPerCreation); - m_logger->log("%-28s | %-12s | %9.2f | %10.2f | %10.3f", - ILogger::ELL_PERFORMANCE, name, modeBuf, ps_per_op, gops_per_s, elapsed_ms); + m_logger->log("%-29s | %-12s | %9.2f | %10.2f | %10.3f", ILogger::ELL_PERFORMANCE, name, modeBuf, ps_per_op, gops_per_s, elapsed_ms); } - void recordCmdBuff(SAMPLING_MODE mode) + void recordCmdBuff(SAMPLING_MODE_FLAGS mode) const { m_cmdbuf->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); m_cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); m_cmdbuf->resetQueryPool(m_queryPool.get(), 0, 2); m_cmdbuf->beginDebugMarker("sampling compute dispatch", vectorSIMDf(0, 1, 0, 1)); - m_cmdbuf->bindComputePipeline(m_pipelines[mode].get()); + m_cmdbuf->bindComputePipeline(m_pipelines[denseIdOf(mode)].get()); m_cmdbuf->bindDescriptorSets(nbl::asset::EPBP_COMPUTE, m_pplnLayout.get(), 0, 1, &m_ds.get()); m_cmdbuf->pushConstants(m_pplnLayout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0, sizeof(BenchmarkPushConstants), &m_pushConstants); @@ -1795,12 +1952,12 @@ class SolidAngleVisualizer final : public 
MonoWindowApplication, public BuiltinR smart_refctd_ptr m_ds = nullptr; smart_refctd_ptr m_pplnLayout = nullptr; BenchmarkPushConstants m_pushConstants; - smart_refctd_ptr m_pipelines[SAMPLING_MODE::Count]; + smart_refctd_ptr m_pipelines[SAMPLING_MODE_FLAGS::Count]; smart_refctd_ptr m_queryPool = nullptr; - std::string m_pipelineReports[SAMPLING_MODE::Count]; - const char* m_pipelineReportNames[SAMPLING_MODE::Count] = {}; + std::string m_pipelineReports[SAMPLING_MODE_FLAGS::Count]; + const char* m_pipelineReportNames[SAMPLING_MODE_FLAGS::Count] = {}; uint32_t m_queueFamily; IQueue* m_computeQueue; @@ -1808,6 +1965,8 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR float64_t m_timestampPeriodNs = 1.0; static constexpr int WarmupDispatches = 100; static constexpr int Dispatches = 1000; + // PC sampling needs sustained execution per range; one dispatch is too short. Tune up if NSight still reports too few samples. + static constexpr int NSightDispatchesPerMode = 16; }; template From 8a1ff7c82928783dc32b5fd0ae5eda99d650e8cf Mon Sep 17 00:00:00 2001 From: Karim Mohamed Date: Wed, 6 May 2026 20:51:11 +0300 Subject: [PATCH 26/26] direct rectangles from OBB sampler, no need for dense ID's in shaders (used NTTP instead) --- 73_SolidAngleVisualizer/CMakeLists.txt | 64 ++++--- .../hlsl/benchmark/benchmark.comp.hlsl | 33 +++- .../app_resources/hlsl/common.hlsl | 20 +- .../app_resources/hlsl/obb_face_sampling.hlsl | 178 ++++++++++++++++++ .../hlsl/solid_angle_vis.frag.hlsl | 59 ++---- 73_SolidAngleVisualizer/main.cpp | 95 +++++----- 6 files changed, 318 insertions(+), 131 deletions(-) create mode 100644 73_SolidAngleVisualizer/app_resources/hlsl/obb_face_sampling.hlsl diff --git a/73_SolidAngleVisualizer/CMakeLists.txt b/73_SolidAngleVisualizer/CMakeLists.txt index 8112efd1b..0709770be 100644 --- a/73_SolidAngleVisualizer/CMakeLists.txt +++ b/73_SolidAngleVisualizer/CMakeLists.txt @@ -48,6 +48,7 @@ if(NBL_BUILD_IMGUI) 
app_resources/hlsl/triangle_sampling.hlsl app_resources/hlsl/parallelogram_sampling.hlsl app_resources/hlsl/pyramid_sampling.hlsl + app_resources/hlsl/obb_face_sampling.hlsl app_resources/hlsl/pyramid_sampling/bilinear.hlsl @@ -67,40 +68,43 @@ if(NBL_BUILD_IMGUI) set(JSON [=[ [ - {"INPUT": "${SA_VIS}", "KEY": "sa_vis_tri_sa", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::TRIANGLE_SOLID_ANGLE", "-DSAMPLING_MODE_DENSE_ID=3", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, - {"INPUT": "${SA_VIS}", "KEY": "sa_vis_tri_sa_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::TRIANGLE_SOLID_ANGLE", "-DSAMPLING_MODE_DENSE_ID=3", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, - {"INPUT": "${SA_VIS}", "KEY": "sa_vis_tri_psa", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::TRIANGLE_PROJECTED_SOLID_ANGLE", "-DSAMPLING_MODE_DENSE_ID=4", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, - {"INPUT": "${SA_VIS}", "KEY": "sa_vis_tri_psa_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::TRIANGLE_PROJECTED_SOLID_ANGLE", "-DSAMPLING_MODE_DENSE_ID=4", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, - {"INPUT": "${SA_VIS}", "KEY": "sa_vis_para", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::PROJECTED_PARALLELOGRAM_SOLID_ANGLE", "-DSAMPLING_MODE_DENSE_ID=5", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, - {"INPUT": "${SA_VIS}", "KEY": "sa_vis_para_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::PROJECTED_PARALLELOGRAM_SOLID_ANGLE", "-DSAMPLING_MODE_DENSE_ID=5", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, - {"INPUT": "${SA_VIS}", "KEY": "sa_vis_rectangle", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::SPH_RECT_FROM_PYRAMID", "-DSAMPLING_MODE_DENSE_ID=1", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, - {"INPUT": 
"${SA_VIS}", "KEY": "sa_vis_rectangle_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::SPH_RECT_FROM_PYRAMID", "-DSAMPLING_MODE_DENSE_ID=1", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, - {"INPUT": "${SA_VIS}", "KEY": "sa_vis_bilinear", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::BILINEAR_FROM_PYRAMID", "-DSAMPLING_MODE_DENSE_ID=6", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, - {"INPUT": "${SA_VIS}", "KEY": "sa_vis_bilinear_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::BILINEAR_FROM_PYRAMID", "-DSAMPLING_MODE_DENSE_ID=6", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, - {"INPUT": "${SA_VIS}", "KEY": "sa_vis_proj_rectangle", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::PROJ_SPH_RECT_FROM_PYRAMID", "-DSAMPLING_MODE_DENSE_ID=2", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, - {"INPUT": "${SA_VIS}", "KEY": "sa_vis_proj_rectangle_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::PROJ_SPH_RECT_FROM_PYRAMID", "-DSAMPLING_MODE_DENSE_ID=2", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, - {"INPUT": "${SA_VIS}", "KEY": "sa_vis_silhouette", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::SILHOUETTE_CREATION_ONLY", "-DSAMPLING_MODE_DENSE_ID=7", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, - {"INPUT": "${SA_VIS}", "KEY": "sa_vis_silhouette_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::SILHOUETTE_CREATION_ONLY", "-DSAMPLING_MODE_DENSE_ID=7", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, - {"INPUT": "${SA_VIS}", "KEY": "sa_vis_pyramid", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::PYRAMID_CREATION_ONLY", "-DSAMPLING_MODE_DENSE_ID=8", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, - {"INPUT": "${SA_VIS}", "KEY": "sa_vis_pyramid_dbg", 
"COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::PYRAMID_CREATION_ONLY", "-DSAMPLING_MODE_DENSE_ID=8", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, - {"INPUT": "${SA_VIS}", "KEY": "sa_vis_caliper_pyramid", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::CALIPER_PYRAMID_CREATION_ONLY", "-DSAMPLING_MODE_DENSE_ID=9", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, - {"INPUT": "${SA_VIS}", "KEY": "sa_vis_caliper_pyramid_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::CALIPER_PYRAMID_CREATION_ONLY", "-DSAMPLING_MODE_DENSE_ID=9", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, - {"INPUT": "${SA_VIS}", "KEY": "sa_vis_caliper_rectangle", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::SPH_RECT_FROM_CALIPER_PYRAMID", "-DSAMPLING_MODE_DENSE_ID=0", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, - {"INPUT": "${SA_VIS}", "KEY": "sa_vis_caliper_rectangle_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::SPH_RECT_FROM_CALIPER_PYRAMID", "-DSAMPLING_MODE_DENSE_ID=0", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_tri_sa", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::TRIANGLE_SOLID_ANGLE", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_tri_sa_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::TRIANGLE_SOLID_ANGLE", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_tri_psa", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::TRIANGLE_PROJECTED_SOLID_ANGLE", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_tri_psa_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", 
"-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::TRIANGLE_PROJECTED_SOLID_ANGLE", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_para", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::PROJECTED_PARALLELOGRAM_SOLID_ANGLE", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_para_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::PROJECTED_PARALLELOGRAM_SOLID_ANGLE", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_rectangle", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::SPH_RECT_FROM_PYRAMID", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_rectangle_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::SPH_RECT_FROM_PYRAMID", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_bilinear", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::BILINEAR_FROM_PYRAMID", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_bilinear_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::BILINEAR_FROM_PYRAMID", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_proj_rectangle", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::PROJ_SPH_RECT_FROM_PYRAMID", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_proj_rectangle_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::PROJ_SPH_RECT_FROM_PYRAMID", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_silhouette", "COMPILE_OPTIONS": ["-T", "ps_${SM}", 
"-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::SILHOUETTE_CREATION_ONLY", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_silhouette_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::SILHOUETTE_CREATION_ONLY", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_pyramid", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::PYRAMID_CREATION_ONLY", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_pyramid_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::PYRAMID_CREATION_ONLY", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_caliper_pyramid", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::CALIPER_PYRAMID_CREATION_ONLY", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_caliper_pyramid_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::CALIPER_PYRAMID_CREATION_ONLY", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_caliper_rectangle", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::SPH_RECT_FROM_CALIPER_PYRAMID", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_caliper_rectangle_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::SPH_RECT_FROM_CALIPER_PYRAMID", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_obb_face", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::OBB_FACE_DIRECT", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, + {"INPUT": "${SA_VIS}", "KEY": "sa_vis_obb_face_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", 
"-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::OBB_FACE_DIRECT", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, {"INPUT": "${RAY_VIS}", "KEY": "ray_vis", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DDEBUG_DATA=0", "-DVISUALIZE_SAMPLES=0"]}, {"INPUT": "${RAY_VIS}", "KEY": "ray_vis_dbg", "COMPILE_OPTIONS": ["-T", "ps_${SM}", "-DDEBUG_DATA=1", "-DVISUALIZE_SAMPLES=1"]}, - {"INPUT": "${BENCH}", "KEY": "benchmark_tri_sa", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::TRIANGLE_SOLID_ANGLE", "-DSAMPLING_MODE_DENSE_ID=3"]}, - {"INPUT": "${BENCH}", "KEY": "benchmark_tri_psa", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::TRIANGLE_PROJECTED_SOLID_ANGLE", "-DSAMPLING_MODE_DENSE_ID=4"]}, - {"INPUT": "${BENCH}", "KEY": "benchmark_para", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::PROJECTED_PARALLELOGRAM_SOLID_ANGLE", "-DSAMPLING_MODE_DENSE_ID=5"]}, - {"INPUT": "${BENCH}", "KEY": "benchmark_bilinear", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::BILINEAR_FROM_PYRAMID", "-DSAMPLING_MODE_DENSE_ID=6"]}, - {"INPUT": "${BENCH}", "KEY": "benchmark_rectangle", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::SPH_RECT_FROM_PYRAMID", "-DSAMPLING_MODE_DENSE_ID=1"]}, - {"INPUT": "${BENCH}", "KEY": "benchmark_proj_rectangle", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::PROJ_SPH_RECT_FROM_PYRAMID", "-DSAMPLING_MODE_DENSE_ID=2"]}, - {"INPUT": "${BENCH}", "KEY": "benchmark_silhouette", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::SILHOUETTE_CREATION_ONLY", "-DSAMPLING_MODE_DENSE_ID=7"]}, - {"INPUT": "${BENCH}", "KEY": "benchmark_pyramid_creation", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::PYRAMID_CREATION_ONLY", "-DSAMPLING_MODE_DENSE_ID=8"]}, - {"INPUT": 
"${BENCH}", "KEY": "benchmark_caliper_pyramid_creation", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::CALIPER_PYRAMID_CREATION_ONLY", "-DSAMPLING_MODE_DENSE_ID=9"]}, - {"INPUT": "${BENCH}", "KEY": "benchmark_caliper_rectangle", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::SPH_RECT_FROM_CALIPER_PYRAMID", "-DSAMPLING_MODE_DENSE_ID=0"]}, + {"INPUT": "${BENCH}", "KEY": "benchmark_tri_sa", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::TRIANGLE_SOLID_ANGLE"]}, + {"INPUT": "${BENCH}", "KEY": "benchmark_tri_psa", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::TRIANGLE_PROJECTED_SOLID_ANGLE"]}, + {"INPUT": "${BENCH}", "KEY": "benchmark_para", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::PROJECTED_PARALLELOGRAM_SOLID_ANGLE"]}, + {"INPUT": "${BENCH}", "KEY": "benchmark_bilinear", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::BILINEAR_FROM_PYRAMID"]}, + {"INPUT": "${BENCH}", "KEY": "benchmark_rectangle", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::SPH_RECT_FROM_PYRAMID"]}, + {"INPUT": "${BENCH}", "KEY": "benchmark_proj_rectangle", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::PROJ_SPH_RECT_FROM_PYRAMID"]}, + {"INPUT": "${BENCH}", "KEY": "benchmark_silhouette", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::SILHOUETTE_CREATION_ONLY"]}, + {"INPUT": "${BENCH}", "KEY": "benchmark_pyramid_creation", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::PYRAMID_CREATION_ONLY"]}, + {"INPUT": "${BENCH}", "KEY": "benchmark_caliper_pyramid_creation", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::CALIPER_PYRAMID_CREATION_ONLY"]}, + 
{"INPUT": "${BENCH}", "KEY": "benchmark_caliper_rectangle", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::SPH_RECT_FROM_CALIPER_PYRAMID"]}, + {"INPUT": "${BENCH}", "KEY": "benchmark_obb_face_direct", "COMPILE_OPTIONS": ["-T", "cs_${SM}", "-DSAMPLING_MODE_FLAGS_CONST=SAMPLING_MODE_FLAGS::OBB_FACE_DIRECT"]}, ] ]=]) string(CONFIGURE "${JSON}" JSON) diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/benchmark/benchmark.comp.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/benchmark/benchmark.comp.hlsl index 8df778c34..fdc7a8197 100644 --- a/73_SolidAngleVisualizer/app_resources/hlsl/benchmark/benchmark.comp.hlsl +++ b/73_SolidAngleVisualizer/app_resources/hlsl/benchmark/benchmark.comp.hlsl @@ -9,6 +9,7 @@ #include "app_resources/hlsl/parallelogram_sampling.hlsl" #include "app_resources/hlsl/pyramid_sampling.hlsl" #include "app_resources/hlsl/triangle_sampling.hlsl" +#include "app_resources/hlsl/obb_face_sampling.hlsl" using namespace nbl::hlsl; @@ -66,6 +67,30 @@ uint32_t runCreateAndSample(uint32_t creations, NBL_REF_ARG(Xoroshiro64Star) rng return sink; } +// Variant for samplers whose `create(view)` works directly from the OBBView +// without needing a ClippedSilhouette upstream. Skips the ~25-30 ps silhouette +// build cost per creation. 
+template<typename SamplerT> +uint32_t runCreateAndSampleNoSilhouette(uint32_t creations, NBL_REF_ARG(Xoroshiro64Star) rng, float32_t rcpU32, uint32_t invocationID, float32_t3 rndOffset) +{ + uint32_t sink = 0; + for (uint32_t c = 0; c < creations; c++) + { + shapes::OBBView view = makePerturbedView(rndOffset, rng, rcpU32); + SamplerT sampler = SamplerT::create(view); + + for (uint32_t s = 0; s < pc.samplesPerCreation; s++) + { + float32_t2 xi = stratifiedXi(c * pc.samplesPerCreation + s, invocationID); + typename SamplerT::cache_type cache; + float32_t3 dir = sampler.generate(xi, cache); + float32_t pdf = sampler.forwardPdf(xi, cache); + sink ^= asuint(dir.x) ^ asuint(dir.y) ^ asuint(dir.z) ^ asuint(pdf) ^ sampler.selectedIdx(cache); + } + } + return sink; +} + // Pyramid-create-only benchmark using synthetic random vertices. Templated on // UseCaliper so PYRAMID_CREATION_ONLY and CALIPER_PYRAMID_CREATION_ONLY share // one body. Inner sampler is unused (no generate() calls), so default to SphRect. @@ -110,7 +135,8 @@ uint32_t runPyramidCreationOnly(NBL_REF_ARG(Xoroshiro64Star) rng, float32_t rcpU return sink; } -[numthreads(BENCHMARK_WORKGROUP_DIMENSION_SIZE_X, 1, 1)] void main() +[numthreads(BENCHMARK_WORKGROUP_DIMENSION_SIZE_X, 1, 1)] +void main() { const uint32_t invocationID = nbl::hlsl::glsl::gl_GlobalInvocationID().x; @@ -149,8 +175,7 @@ uint32_t runPyramidCreationOnly(NBL_REF_ARG(Xoroshiro64Star) rng, float32_t rcpU sink ^= asuint(iterVerts[j].x) ^ asuint(iterVerts[j].y) ^ asuint(iterVerts[j].z); } } - else if ((benchmarkMode & SAMPLING_MODE_FLAGS::FLAG_PYRAMID) != 0u - && (benchmarkMode & SAMPLING_MODE_FLAGS::FLAG_CREATE_ONLY) != 0u) + else if ((benchmarkMode & SAMPLING_MODE_FLAGS::FLAG_PYRAMID) != 0u && (benchmarkMode & SAMPLING_MODE_FLAGS::FLAG_CREATE_ONLY) != 0u) sink ^= runPyramidCreationOnly<(benchmarkMode & SAMPLING_MODE_FLAGS::FLAG_CALIPER) != 0u>(rng, rcpU32); // Caliper variant: tighter rect → different rejection rate, only interesting when samplesPerCreation > 1. 
else if (benchmarkMode == SAMPLING_MODE_FLAGS::SPH_RECT_FROM_CALIPER_PYRAMID) @@ -165,6 +190,8 @@ uint32_t runPyramidCreationOnly(NBL_REF_ARG(Xoroshiro64Star) rng, float32_t rcpU sink ^= runCreateAndSample(creations, rng, rcpU32, invocationID, rndOffset); else if (benchmarkMode == SAMPLING_MODE_FLAGS::BILINEAR_FROM_PYRAMID) sink ^= runCreateAndSample >(creations, rng, rcpU32, invocationID, rndOffset); + else if (benchmarkMode == SAMPLING_MODE_FLAGS::OBB_FACE_DIRECT) + sink ^= runCreateAndSampleNoSilhouette<OBBFaceSampler>(creations, rng, rcpU32, invocationID, rndOffset); else { assert(false); diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/common.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/common.hlsl index 632cd7856..d170660af 100644 --- a/73_SolidAngleVisualizer/app_resources/hlsl/common.hlsl +++ b/73_SolidAngleVisualizer/app_resources/hlsl/common.hlsl @@ -24,6 +24,8 @@ enum SAMPLING_MODE_FLAGS : uint32_t FLAG_TRIANGLE = 0x200, FLAG_PARALLELOGRAM = 0x400, FLAG_SILHOUETTE = 0x800, + FLAG_OBB_FACE = 0x10000, + FLAG_OBB_AXES = 0x20000, // ---- variant flags (modifiers on the family) ---- FLAG_CALIPER = 0x1000, @@ -46,11 +48,14 @@ enum SAMPLING_MODE_FLAGS : uint32_t BILINEAR_FROM_PYRAMID = 6 | FLAG_PYRAMID | FLAG_BILINEAR, - SILHOUETTE_CREATION_ONLY = 7 | FLAG_SILHOUETTE | FLAG_CREATE_ONLY, - PYRAMID_CREATION_ONLY = 8 | FLAG_PYRAMID | FLAG_CREATE_ONLY, - CALIPER_PYRAMID_CREATION_ONLY = 9 | FLAG_PYRAMID | FLAG_CALIPER | FLAG_CREATE_ONLY, + OBB_FACE_DIRECT = 7 | FLAG_OBB_FACE, - Count = 10 // count of distinct dense IDs + SILHOUETTE_CREATION_ONLY = 8 | FLAG_SILHOUETTE | FLAG_CREATE_ONLY, + PYRAMID_CREATION_ONLY = 9 | FLAG_PYRAMID | FLAG_CREATE_ONLY, + CALIPER_PYRAMID_CREATION_ONLY = 10 | FLAG_PYRAMID | FLAG_CALIPER | FLAG_CREATE_ONLY, + + Count = 11, // count of distinct dense IDs + CountWithoutCreateOnly = Count - 3 // count of modes that aren't "creation only" (i.e. 
that produce samples) }; #ifndef __HLSL_VERSION @@ -65,9 +70,10 @@ constexpr SAMPLING_MODE_FLAGS kAllModes[SAMPLING_MODE_FLAGS::Count] = { SAMPLING_MODE_FLAGS::TRIANGLE_PROJECTED_SOLID_ANGLE, // dense 4 SAMPLING_MODE_FLAGS::PROJECTED_PARALLELOGRAM_SOLID_ANGLE, // dense 5 SAMPLING_MODE_FLAGS::BILINEAR_FROM_PYRAMID, // dense 6 - SAMPLING_MODE_FLAGS::SILHOUETTE_CREATION_ONLY, // dense 7 - SAMPLING_MODE_FLAGS::PYRAMID_CREATION_ONLY, // dense 8 - SAMPLING_MODE_FLAGS::CALIPER_PYRAMID_CREATION_ONLY, // dense 9 + SAMPLING_MODE_FLAGS::OBB_FACE_DIRECT, // dense 7 + SAMPLING_MODE_FLAGS::SILHOUETTE_CREATION_ONLY, // dense 8 + SAMPLING_MODE_FLAGS::PYRAMID_CREATION_ONLY, // dense 9 + SAMPLING_MODE_FLAGS::CALIPER_PYRAMID_CREATION_ONLY, // dense 10 }; #endif diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/obb_face_sampling.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/obb_face_sampling.hlsl new file mode 100644 index 000000000..b11038364 --- /dev/null +++ b/73_SolidAngleVisualizer/app_resources/hlsl/obb_face_sampling.hlsl @@ -0,0 +1,178 @@ +//// Copyright (C) 2026 - DevSH Graphics Programming Sp. z O.O. +//// This file is part of the "Nabla Engine". +//// For conditions of distribution and use, see copyright notice in nabla.h +#ifndef _SOLID_ANGLE_VIS_EXAMPLE_OBB_FACE_SAMPLING_HLSL_INCLUDED_ +#define _SOLID_ANGLE_VIS_EXAMPLE_OBB_FACE_SAMPLING_HLSL_INCLUDED_ + +#include "common.hlsl" +#include "silhouette.hlsl" // for the (silhouette, view) overload's signature + +#include +#include +#include +#include +#include + +// Multi-face OBB sampler -- Matt's design with shared tip vertex T as origin +// and silhouette pipeline skipped entirely. NO horizon clipping (option A): +// samples below z=0 just get pdf=0, biased for OBBs near receiver horizon. +// +// This is the best OBB-faces variant we measured (~92 ps @ 1:1, ~22 ps @ 1:16, +// ~17 ps @ 1:128). Still slower than PYRAMID_RECTANGLE on this Ampere SM at +// every ratio. 
Kept around as a documented baseline for future experiments +// (e.g. Las Vegas resampling, different inner samplers, fp16 packing) where +// the no-clipping property might justify the per-sample overhead. +// +// See feedback memory: feedback_obb_faces_direct_loses.md +struct OBBFaceSampler +{ + using scalar_type = float32_t; + using vector2_type = float32_t2; + using vector3_type = float32_t3; + using domain_type = vector2_type; + using codomain_type = vector3_type; + using density_type = scalar_type; + using weight_type = density_type; + + struct cache_type + { + typename sampling::SphericalRectangle::cache_type inner; + density_type pdf; + }; + + sampling::SphericalRectangle rects[3]; + uint32_t numRects; + float32_t cumSA0; + float32_t cumSA1; + float32_t totalSolidAngle; + float32_t rcpTotalSolidAngle; + + // Build sphrect for face on `Axis`, using T as the shared world-space origin. + // T_idx encodes which OBB cube corner T is (bits 0/1/2 = axis sides). + // swap flips right/up for correct outward-normal direction; rule is + // popcount(T_idx) even => swap. + template + static sampling::SphericalRectangle makeRectFromTip(shapes::OBBView view, float32_t3 T_pos, uint32_t T_idx, bool swap) + { + const uint32_t a1 = (Axis + 1u) % 3u; + const uint32_t a2 = (Axis + 2u) % 3u; + + const float32_t s1 = ((T_idx & (1u << a1)) != 0u) ? -1.0f : 1.0f; + const float32_t s2 = ((T_idx & (1u << a2)) != 0u) ? 
-1.0f : 1.0f; + const float32_t3 rNatural = view.columns[a1] * s1; + const float32_t3 uNatural = view.columns[a2] * s2; + + shapes::CompressedSphericalRectangle compressed; + compressed.origin = T_pos; + if (swap) + { + compressed.right = uNatural; + compressed.up = rNatural; + } + else + { + compressed.right = rNatural; + compressed.up = uNatural; + } + + const shapes::SphericalRectangle shapeRect = shapes::SphericalRectangle::create(compressed); + return sampling::SphericalRectangle::create(shapeRect, float32_t3(0.0f, 0.0f, 0.0f)); + } + + // create(view) -- region derived inline from view, no silhouette pipeline. + static OBBFaceSampler create(shapes::OBBView view) + { + OBBFaceSampler self; + + // Region inline (mirrors silhouette.hlsl ClippedSilhouette::create). + const float32_t3 sqScales = float32_t3(dot(view.columns[0], view.columns[0]), dot(view.columns[1], view.columns[1]), dot(view.columns[2], view.columns[2])); + const float32_t3 proj = -float32_t3(dot(view.columns[0], view.minCorner), dot(view.columns[1], view.minCorner), dot(view.columns[2], view.minCorner)); + const uint32_t3 below = uint32_t3(proj < float32_t3(0, 0, 0)); + const uint32_t3 above = uint32_t3(proj > sqScales); + const uint32_t3 region = uint32_t3(uint32_t3(1u, 1u, 1u) + below - above); + + const bool xVis = (region.x != 1u); + const bool yVis = (region.y != 1u); + const bool zVis = (region.z != 1u); + self.numRects = uint32_t(xVis) + uint32_t(yVis) + uint32_t(zVis); + + // Tip T: bit i set iff observer past max on axis i (region[i] == 0). + const uint32_t T_idx = (uint32_t(region.x == 0u) << 0) + | (uint32_t(region.y == 0u) << 1) + | (uint32_t(region.z == 0u) << 2); + const float32_t3 T_pos = view.getVertex(T_idx); + + const bool swap = (countbits(T_idx) & 1u) == 0u; + + // Slot 0: first visible axis. Cascade keeps every rects[K] write at a + // literal slot index, every makeRectFromTip at literal Axis. 
+ if (xVis) + self.rects[0] = makeRectFromTip<0>(view, T_pos, T_idx, swap); + else if (yVis) + self.rects[0] = makeRectFromTip<1>(view, T_pos, T_idx, swap); + else + self.rects[0] = makeRectFromTip<2>(view, T_pos, T_idx, swap); + + // Slot 1: second visible. xVis && yVis -> y; otherwise z. + if (self.numRects >= 2u) + { + if (xVis && yVis) + self.rects[1] = makeRectFromTip<1>(view, T_pos, T_idx, swap); + else + self.rects[1] = makeRectFromTip<2>(view, T_pos, T_idx, swap); + } + + // Slot 2: only when all 3 visible -> axis z. + if (self.numRects == 3u) + self.rects[2] = makeRectFromTip<2>(view, T_pos, T_idx, swap); + + // CDF over face solid angles. + self.cumSA0 = self.rects[0].solidAngle; + self.cumSA1 = self.cumSA0 + ((self.numRects >= 2u) ? self.rects[1].solidAngle : 0.0f); + self.totalSolidAngle = self.cumSA1 + ((self.numRects == 3u) ? self.rects[2].solidAngle : 0.0f); + self.rcpTotalSolidAngle = 1.0f / self.totalSolidAngle; + + return self; + } + + // Uniform interface compatibility: ignores `silhouette` since region is + // derived inline from view. + static OBBFaceSampler create(NBL_CONST_REF_ARG(ClippedSilhouette) /*silhouette*/, shapes::OBBView view) + { + return create(view); + } + + codomain_type generate(domain_type u, NBL_REF_ARG(cache_type) cache) + { + const float32_t target = u.x * totalSolidAngle; + codomain_type dir; + + if (target < cumSA0) + { + const float32_t uPrime = target / cumSA0; + dir = rects[0].generate(float32_t2(uPrime, u.y), cache.inner); + } + else if (numRects == 2u || target < cumSA1) + { + const float32_t faceSA = (numRects == 2u) ? 
(totalSolidAngle - cumSA0) : (cumSA1 - cumSA0); + const float32_t uPrime = (target - cumSA0) / faceSA; + dir = rects[1].generate(float32_t2(uPrime, u.y), cache.inner); + } + else // numRects == 3 and target >= cumSA1 + { + const float32_t faceSA = totalSolidAngle - cumSA1; + const float32_t uPrime = (target - cumSA1) / faceSA; + dir = rects[2].generate(float32_t2(uPrime, u.y), cache.inner); + } + + const bool valid = dir.z > 0.0f; + cache.pdf = hlsl::select(valid, rcpTotalSolidAngle, 0.0f); + return dir; + } + + density_type forwardPdf(domain_type u, cache_type cache) NBL_CONST_MEMBER_FUNC { return cache.pdf; } + weight_type forwardWeight(domain_type u, cache_type cache) NBL_CONST_MEMBER_FUNC { return cache.pdf; } + uint32_t selectedIdx(cache_type cache) NBL_CONST_MEMBER_FUNC { return 0u; } +}; + +#endif // _SOLID_ANGLE_VIS_EXAMPLE_OBB_FACE_SAMPLING_HLSL_INCLUDED_ diff --git a/73_SolidAngleVisualizer/app_resources/hlsl/solid_angle_vis.frag.hlsl b/73_SolidAngleVisualizer/app_resources/hlsl/solid_angle_vis.frag.hlsl index cfa2e2969..feb3e63d3 100644 --- a/73_SolidAngleVisualizer/app_resources/hlsl/solid_angle_vis.frag.hlsl +++ b/73_SolidAngleVisualizer/app_resources/hlsl/solid_angle_vis.frag.hlsl @@ -15,39 +15,26 @@ using namespace ext::FullScreenTriangle; #include "triangle_sampling.hlsl" #include "parallelogram_sampling.hlsl" #include "pyramid_sampling.hlsl" +#include "obb_face_sampling.hlsl" [[vk::push_constant]] struct PushConstants pc; static const SAMPLING_MODE_FLAGS samplingMode = SAMPLING_MODE_FLAGS_CONST; -// Mode -> Sampler type dispatch keyed on the dense ID (boost::wave can't -// evaluate enum-qualified `::` in #if, so we use a parallel numeric macro -// passed in by CMake). 
Dense IDs match the kAllModes ordering in common.hlsl: -// 0 SPH_RECT_FROM_CALIPER_PYRAMID 5 PROJECTED_PARALLELOGRAM_SOLID_ANGLE -// 1 SPH_RECT_FROM_PYRAMID 6 BILINEAR_FROM_PYRAMID -// 2 PROJ_SPH_RECT_FROM_PYRAMID 7 SILHOUETTE_CREATION_ONLY (early-exit) -// 3 TRIANGLE_SOLID_ANGLE 8 PYRAMID_CREATION_ONLY -// 4 TRIANGLE_PROJECTED_SOLID_ANGLE 9 CALIPER_PYRAMID_CREATION_ONLY -// PYRAMID_CREATION_ONLY / CALIPER_PYRAMID_CREATION_ONLY pick the sphrect inner -// so the bounding lunes still draw on screen; the inner is harmless extra work -// (those modes are timed in the compute benchmark, not the frag). -#if SAMPLING_MODE_DENSE_ID == 3 -typedef TriangleFanSampler SelectedSampler; -#elif SAMPLING_MODE_DENSE_ID == 4 -typedef TriangleFanSampler SelectedSampler; -#elif SAMPLING_MODE_DENSE_ID == 5 -typedef Parallelogram SelectedSampler; -#elif SAMPLING_MODE_DENSE_ID == 1 || SAMPLING_MODE_DENSE_ID == 8 -typedef SphericalPyramid > SelectedSampler; -#elif SAMPLING_MODE_DENSE_ID == 0 || SAMPLING_MODE_DENSE_ID == 9 -typedef SphericalPyramid > SelectedSampler; -#elif SAMPLING_MODE_DENSE_ID == 2 -typedef SphericalPyramid > SelectedSampler; -#elif SAMPLING_MODE_DENSE_ID == 6 -typedef SphericalPyramid SelectedSampler; -#elif SAMPLING_MODE_DENSE_ID == 7 // SILHOUETTE_CREATION_ONLY: alias any type so the -typedef Parallelogram SelectedSampler; // unreachable post-early-return code parses. 
-#endif +template struct SelectSampler; +template<> struct SelectSampler { using type = TriangleFanSampler; }; +template<> struct SelectSampler { using type = TriangleFanSampler; }; +template<> struct SelectSampler { using type = Parallelogram; }; +template<> struct SelectSampler { using type = SphericalPyramid >; }; +template<> struct SelectSampler { using type = SphericalPyramid >; }; +template<> struct SelectSampler { using type = SphericalPyramid >; }; +template<> struct SelectSampler { using type = SphericalPyramid >; }; +template<> struct SelectSampler { using type = SphericalPyramid >; }; +template<> struct SelectSampler { using type = SphericalPyramid; }; +template<> struct SelectSampler { using type = OBBFaceSampler; }; +template<> struct SelectSampler { using type = Parallelogram; }; + +using SelectedSampler = typename SelectSampler::type; void computeSpherePos(SVertexAttributes vx, out float32_t2 ndc, out float32_t3 spherePos) { @@ -81,22 +68,6 @@ void computeSpherePos(SVertexAttributes vx, out float32_t2 ndc, out float32_t3 s shapes::OBBView view = shapes::OBBView::create(pc.modelMatrix); ClippedSilhouette silhouette = ClippedSilhouette::create(view); - if (SAMPLING_MODE_DENSE_ID == 7) // SILHOUETTE_CREATION_ONLY - { - // Sink that prevents DCE of the create+materialize cost. 
- shapes::OBBView perturbedView = view; - perturbedView.minCorner += float32_t3(ndc.x, ndc.y, 0.0f) * 1e-7f; - ClippedSilhouette pSilhouette = ClippedSilhouette::create(perturbedView); - float32_t3 pVerts[MAX_SILHOUETTE_VERTICES]; - pSilhouette.materialize(perturbedView, pVerts); - - uint32_t sink = pSilhouette.count; - NBL_UNROLL - for (uint32_t i = 0; i < MAX_SILHOUETTE_VERTICES; i++) - sink ^= asuint(pVerts[i].x) ^ asuint(pVerts[i].y) ^ asuint(pVerts[i].z); - return (float32_t4)asfloat(sink); - } - SelectedSampler sampler = SelectedSampler::create(silhouette, view); uint32_t validSampleCount = 0; diff --git a/73_SolidAngleVisualizer/main.cpp b/73_SolidAngleVisualizer/main.cpp index 78313e413..a0547c7ed 100644 --- a/73_SolidAngleVisualizer/main.cpp +++ b/73_SolidAngleVisualizer/main.cpp @@ -26,14 +26,14 @@ // pairs that the frag shader / benchmark actually instantiate. // ============================================================================ - //static_assert(nbl::hlsl::sampling::concepts::TractableSampler); - //static_assert(nbl::hlsl::sampling::concepts::TractableSampler>); - //static_assert(nbl::hlsl::sampling::concepts::TractableSampler>); - //static_assert(nbl::hlsl::sampling::concepts::TractableSampler); - //static_assert(nbl::hlsl::sampling::concepts::TractableSampler>>); - //static_assert(nbl::hlsl::sampling::concepts::TractableSampler>>); - //static_assert(nbl::hlsl::sampling::concepts::TractableSampler>>); - //static_assert(nbl::hlsl::sampling::concepts::TractableSampler>); +//static_assert(nbl::hlsl::sampling::concepts::TractableSampler); +//static_assert(nbl::hlsl::sampling::concepts::TractableSampler>); +//static_assert(nbl::hlsl::sampling::concepts::TractableSampler>); +//static_assert(nbl::hlsl::sampling::concepts::TractableSampler); +//static_assert(nbl::hlsl::sampling::concepts::TractableSampler>>); +//static_assert(nbl::hlsl::sampling::concepts::TractableSampler>>); +//static_assert(nbl::hlsl::sampling::concepts::TractableSampler>>); 
+//static_assert(nbl::hlsl::sampling::concepts::TractableSampler>); // App execution mode -- pick at compile time via -DAPP_MODE=N // APP_MODE_VISUALIZER (1) full visualization with debug + ImGui editor (default) @@ -237,6 +237,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR addSaVis.template operator()<"sa_vis_pyramid", "sa_vis_pyramid_dbg">(SAMPLING_MODE_FLAGS::PYRAMID_CREATION_ONLY); addSaVis.template operator()<"sa_vis_caliper_pyramid", "sa_vis_caliper_pyramid_dbg">(SAMPLING_MODE_FLAGS::CALIPER_PYRAMID_CREATION_ONLY); addSaVis.template operator()<"sa_vis_caliper_rectangle", "sa_vis_caliper_rectangle_dbg">(SAMPLING_MODE_FLAGS::SPH_RECT_FROM_CALIPER_PYRAMID); + addSaVis.template operator()<"sa_vis_obb_face", "sa_vis_obb_face_dbg">(SAMPLING_MODE_FLAGS::OBB_FACE_DIRECT); smart_refctd_ptr rayVisShaders[DebugPermutations]; rayVisShaders[0] = loadPrecompiledShader(nbl::this_example::builtin::build::get_spirv_key<"ray_vis">(m_device.get())); @@ -430,16 +431,16 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR m_keepRunning = false; } - const auto resourceIx = m_realFrameIx % MaxFramesInFlight; - auto* const cb = m_cmdBufs.data()[resourceIx].get(); + const auto resourceIx = m_realFrameIx % MaxFramesInFlight; + auto* const cb = m_cmdBufs.data()[resourceIx].get(); cb->reset(IGPUCommandBuffer::RESET_FLAGS::RELEASE_RESOURCES_BIT); cb->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); { auto* scRes = static_cast(m_surface->getSwapchainResources()); const IGPUCommandBuffer::SClearColorValue clearValue = {.float32 = {0.f, 0.f, 0.f, 1.f}}; const IGPUCommandBuffer::SRenderpassBeginInfo renderpassInfo = - {.framebuffer = scRes->getFramebuffer(device_base_t::getCurrentAcquire().imageIndex), - .colorClearValues = &clearValue, + {.framebuffer = scRes->getFramebuffer(device_base_t::getCurrentAcquire().imageIndex), + .colorClearValues = &clearValue, .depthStencilClearValues = nullptr, .renderArea = {.offset = 
{0, 0}, .extent = {m_window->getWidth(), m_window->getHeight()}}}; beginRenderpass(cb, renderpassInfo); @@ -448,14 +449,14 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR cb->end(); IQueue::SSubmitInfo::SSemaphoreInfo retval = - {.semaphore = m_semaphore.get(), - .value = ++m_realFrameIx, + {.semaphore = m_semaphore.get(), + .value = ++m_realFrameIx, .stageMask = PIPELINE_STAGE_FLAGS::ALL_GRAPHICS_BITS}; - const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[] = {{.cmdbuf = cb}}; + const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[] = {{.cmdbuf = cb }}; const IQueue::SSubmitInfo::SSemaphoreInfo acquired[] = { {.semaphore = device_base_t::getCurrentAcquire().semaphore, - .value = device_base_t::getCurrentAcquire().acquireCount, - .stageMask = PIPELINE_STAGE_FLAGS::NONE}}; + .value = device_base_t::getCurrentAcquire().acquireCount, + .stageMask = PIPELINE_STAGE_FLAGS::NONE }}; const IQueue::SSubmitInfo infos[] = { {.waitSemaphores = acquired, .commandBuffers = commandBuffers, .signalSemaphores = {&retval, 1}}}; if (getGraphicsQueue()->submit(infos) != IQueue::RESULT::SUCCESS) @@ -881,13 +882,13 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR constexpr static inline auto MaxImGUITextures = 2u + MaxFramesInFlight; static inline SAMPLING_MODE_FLAGS m_samplingMode = SAMPLING_MODE_FLAGS::SPH_RECT_FROM_PYRAMID; - static inline bool m_debugVisualization = true; - static inline int m_SampleCount = 64; - static inline int m_BenchmarkSampleCount = 128; - static inline bool m_frameSeeding = true; - static inline ResultData m_GPUOutResulData; - bool m_keepRunning = true; - bool m_nsightBenchDone = false; + static inline bool m_debugVisualization = true; + static inline int m_SampleCount = 64; + static inline int m_BenchmarkSampleCount = 128; + static inline bool m_frameSeeding = true; + static inline ResultData m_GPUOutResulData; + bool m_keepRunning = true; + bool m_nsightBenchDone = 
false; // smart_refctd_ptr m_scene; smart_refctd_ptr m_solidAngleRenderpass; @@ -962,26 +963,25 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR ImGui::Text("Sampling Mode:"); ImGui::SameLine(); - const char* samplingModes[SAMPLING_MODE_FLAGS::Count - 3] = {}; - samplingModes[denseIdOf(SAMPLING_MODE_FLAGS::SPH_RECT_FROM_PYRAMID)] = "Spherical Rectangle From Pyramid"; - samplingModes[denseIdOf(SAMPLING_MODE_FLAGS::SPH_RECT_FROM_CALIPER_PYRAMID)] = "Caliper Rectangle From Pyramid"; - samplingModes[denseIdOf(SAMPLING_MODE_FLAGS::PROJ_SPH_RECT_FROM_PYRAMID)] = "Projected Spherical Rectangle From Pyramid"; - samplingModes[denseIdOf(SAMPLING_MODE_FLAGS::TRIANGLE_SOLID_ANGLE)] = "Spherical Triangle"; - samplingModes[denseIdOf(SAMPLING_MODE_FLAGS::TRIANGLE_PROJECTED_SOLID_ANGLE)] = "Projected Spherical Triangle"; + const char* samplingModes[SAMPLING_MODE_FLAGS::CountWithoutCreateOnly] = {}; + samplingModes[denseIdOf(SAMPLING_MODE_FLAGS::SPH_RECT_FROM_PYRAMID)] = "Spherical Rectangle From Pyramid"; + samplingModes[denseIdOf(SAMPLING_MODE_FLAGS::SPH_RECT_FROM_CALIPER_PYRAMID)] = "Caliper Rectangle From Pyramid"; + samplingModes[denseIdOf(SAMPLING_MODE_FLAGS::PROJ_SPH_RECT_FROM_PYRAMID)] = "Projected Spherical Rectangle From Pyramid"; + samplingModes[denseIdOf(SAMPLING_MODE_FLAGS::TRIANGLE_SOLID_ANGLE)] = "Spherical Triangle"; + samplingModes[denseIdOf(SAMPLING_MODE_FLAGS::TRIANGLE_PROJECTED_SOLID_ANGLE)] = "Projected Spherical Triangle"; samplingModes[denseIdOf(SAMPLING_MODE_FLAGS::PROJECTED_PARALLELOGRAM_SOLID_ANGLE)] = "Projected Parallelogram"; - samplingModes[denseIdOf(SAMPLING_MODE_FLAGS::BILINEAR_FROM_PYRAMID)] = "Bilinear Pyramid"; + samplingModes[denseIdOf(SAMPLING_MODE_FLAGS::BILINEAR_FROM_PYRAMID)] = "Bilinear Pyramid"; + samplingModes[denseIdOf(SAMPLING_MODE_FLAGS::OBB_FACE_DIRECT)] = "OBB Face Direct"; int currentMode = static_cast(denseIdOf(m_samplingMode)); - if (ImGui::Combo("##SamplingMode", &currentMode, samplingModes, 
SAMPLING_MODE_FLAGS::Count - 3)) + if (ImGui::Combo("##SamplingMode", &currentMode, samplingModes, SAMPLING_MODE_FLAGS::CountWithoutCreateOnly)) { m_samplingMode = kAllModes[currentMode]; } ImGui::Checkbox("Debug Visualization", &m_debugVisualization); - ImGui::Text("Pipeline idx: SA=%d, Ray=%d", - static_cast(denseIdOf(m_samplingMode)) * DebugPermutations + (m_debugVisualization ? 1 : 0), - m_debugVisualization ? 1 : 0); + ImGui::Text("Pipeline idx: SA=%d, Ray=%d", static_cast(denseIdOf(m_samplingMode)) * DebugPermutations + (m_debugVisualization ? 1 : 0), m_debugVisualization ? 1 : 0); ImGui::Checkbox("Frame seeding", &m_frameSeeding); ImGui::SliderInt("Sample Count", &m_SampleCount, 0, 512); @@ -1260,7 +1260,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR } // Parallelogram - if (m_samplingMode == PROJECTED_PARALLELOGRAM_SOLID_ANGLE && ImGui::CollapsingHeader("Projected Parallelogram", ImGuiTreeNodeFlags_DefaultOpen)) + if (m_samplingMode & FLAG_PARALLELOGRAM && ImGui::CollapsingHeader("Projected Parallelogram", ImGuiTreeNodeFlags_DefaultOpen)) { ImGui::Text("Area: %.3f", m_GPUOutResulData.parallelogram.area); ImGui::Text("N3 Mask: 0x%02X", m_GPUOutResulData.parallelogram.n3Mask); @@ -1275,7 +1275,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR for (uint32_t i = 0; i < 4; i++) ImGui::Text("Corner[%u]: (%.3f, %.3f)", i, m_GPUOutResulData.parallelogram.corners[i].x, m_GPUOutResulData.parallelogram.corners[i].y); } - else if ((m_samplingMode == SPH_RECT_FROM_PYRAMID || m_samplingMode == PROJ_SPH_RECT_FROM_PYRAMID || m_samplingMode == BILINEAR_FROM_PYRAMID || m_samplingMode == SPH_RECT_FROM_CALIPER_PYRAMID) && ImGui::CollapsingHeader("Spherical Pyramid", ImGuiTreeNodeFlags_DefaultOpen)) + else if ((m_samplingMode & FLAG_PYRAMID) && ImGui::CollapsingHeader("Spherical Pyramid", ImGuiTreeNodeFlags_DefaultOpen)) { ImGui::Text("Best Caliper Edge: %u", m_GPUOutResulData.pyramid.bestEdge); 
ImGui::Separator(); @@ -1299,7 +1299,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR m_GPUOutResulData.pyramid.center.x, m_GPUOutResulData.pyramid.center.y, m_GPUOutResulData.pyramid.center.z); ImGui::Text("Solid Angle (bound): %.6f sr", m_GPUOutResulData.pyramid.solidAngle); } - else if (m_samplingMode == TRIANGLE_SOLID_ANGLE || m_samplingMode == TRIANGLE_PROJECTED_SOLID_ANGLE && ImGui::CollapsingHeader("Spherical Triangle", ImGuiTreeNodeFlags_DefaultOpen)) + else if (m_samplingMode & FLAG_TRIANGLE && ImGui::CollapsingHeader("Spherical Triangle", ImGuiTreeNodeFlags_DefaultOpen)) { ImGui::Text("Spherical Lune Detected: %s", m_GPUOutResulData.triangleFan.sphericalLuneDetected ? "true" : "false"); ImGui::Text("Triangle Count: %u", m_GPUOutResulData.triangleFan.triangleCount); @@ -1651,6 +1651,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR addBench.template operator()<"benchmark_pyramid_creation">(SAMPLING_MODE_FLAGS::PYRAMID_CREATION_ONLY); addBench.template operator()<"benchmark_caliper_pyramid_creation">(SAMPLING_MODE_FLAGS::CALIPER_PYRAMID_CREATION_ONLY); addBench.template operator()<"benchmark_caliper_rectangle">(SAMPLING_MODE_FLAGS::SPH_RECT_FROM_CALIPER_PYRAMID); + addBench.template operator()<"benchmark_obb_face_direct">(SAMPLING_MODE_FLAGS::OBB_FACE_DIRECT); nbl::video::IGPUDescriptorSetLayout::SBinding bindings[1] = { {.binding = 0, @@ -1753,7 +1754,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR struct SamplerEntry { - const char* name; + const char* name; SAMPLING_MODE_FLAGS mode; }; const SamplerEntry samplers[] = { @@ -1764,6 +1765,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR {.name = "PARALLELOGRAM", .mode = SAMPLING_MODE_FLAGS::PROJECTED_PARALLELOGRAM_SOLID_ANGLE}, {.name = "TRIANGLE_SA", .mode = SAMPLING_MODE_FLAGS::TRIANGLE_SOLID_ANGLE}, {.name = "TRIANGLE_PSA", .mode = 
SAMPLING_MODE_FLAGS::TRIANGLE_PROJECTED_SOLID_ANGLE}, + {.name = "OBB_FACE_DIRECT", .mode = SAMPLING_MODE_FLAGS::OBB_FACE_DIRECT}, }; // Creation-only modes: report per-creation, not per-sample. @@ -1783,7 +1785,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR // can gather enough source-line hits. void runNSightOneShot() { - const char* modeNames[SAMPLING_MODE_FLAGS::Count] = {}; + const char* modeNames[SAMPLING_MODE_FLAGS::Count] = {}; modeNames[denseIdOf(SAMPLING_MODE_FLAGS::SPH_RECT_FROM_CALIPER_PYRAMID)] = "CALIPER_PYRAMID_RECTANGLE"; modeNames[denseIdOf(SAMPLING_MODE_FLAGS::SPH_RECT_FROM_PYRAMID)] = "PYRAMID_RECTANGLE"; modeNames[denseIdOf(SAMPLING_MODE_FLAGS::PROJ_SPH_RECT_FROM_PYRAMID)] = "PYRAMID_PROJ_RECTANGLE"; @@ -1794,6 +1796,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR modeNames[denseIdOf(SAMPLING_MODE_FLAGS::SILHOUETTE_CREATION_ONLY)] = "SILHOUETTE_CREATION_ONLY"; modeNames[denseIdOf(SAMPLING_MODE_FLAGS::PYRAMID_CREATION_ONLY)] = "PYRAMID_CREATION_ONLY"; modeNames[denseIdOf(SAMPLING_MODE_FLAGS::CALIPER_PYRAMID_CREATION_ONLY)] = "CALIPER_PYRAMID_CREATION_ONLY"; + modeNames[denseIdOf(SAMPLING_MODE_FLAGS::OBB_FACE_DIRECT)] = "OBB_FACE_DIRECT"; m_pushConstants.modelMatrix = float32_t3x4(transpose(m_visualizer->interface.m_OBBModelMatrix)); m_pushConstants.sampleCount = static_cast(m_BenchmarkSampleCount); @@ -1828,8 +1831,8 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR } m_cmdbuf->end(); - smart_refctd_ptr done = m_device->createSemaphore(0); - const IQueue::SSubmitInfo::SSemaphoreInfo signals[] = {{.semaphore = done.get(), .value = 1, .stageMask = asset::PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS}}; + smart_refctd_ptr done = m_device->createSemaphore(0); + const IQueue::SSubmitInfo::SSemaphoreInfo signals[] = {{.semaphore = done.get(), .value = 1, .stageMask = asset::PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS}}; IQueue::SSubmitInfo 
submitInfos[1] = {}; const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufs[] = {{.cmdbuf = m_cmdbuf.get()}}; submitInfos[0].commandBuffers = cmdbufs; @@ -1850,12 +1853,10 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR { m_device->waitIdle(); - const bool isCreationBenchmark = (mode == SAMPLING_MODE_FLAGS::SILHOUETTE_CREATION_ONLY || mode == SAMPLING_MODE_FLAGS::PYRAMID_CREATION_ONLY || mode == SAMPLING_MODE_FLAGS::CALIPER_PYRAMID_CREATION_ONLY); - m_pushConstants.modelMatrix = float32_t3x4(transpose(m_visualizer->interface.m_OBBModelMatrix)); m_pushConstants.sampleCount = m_BenchmarkSampleCount; // For create-only modes the inner loop is unused; pick any divisor of sampleCount to keep the shader's `creations = sampleCount / samplesPerCreation` well-defined. - m_pushConstants.samplesPerCreation = isCreationBenchmark ? uint32_t(m_BenchmarkSampleCount) : samplesPerCreation; + m_pushConstants.samplesPerCreation = mode & FLAG_CREATE_ONLY ? uint32_t(m_BenchmarkSampleCount) : samplesPerCreation; recordCmdBuff(mode); // Nabla's IQueue::submit rejects submissions without a signal semaphore @@ -1883,7 +1884,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR const float64_t elapsed_ms = elapsed_ps * 1e-9; char modeBuf[16]; - if (isCreationBenchmark) + if (mode & FLAG_CREATE_ONLY) snprintf(modeBuf, sizeof(modeBuf), "create-only"); else snprintf(modeBuf, sizeof(modeBuf), "1:%u", samplesPerCreation); @@ -1966,7 +1967,7 @@ class SolidAngleVisualizer final : public MonoWindowApplication, public BuiltinR static constexpr int WarmupDispatches = 100; static constexpr int Dispatches = 1000; // PC sampling needs sustained execution per range; one dispatch is too short. Tune up if NSight still reports too few samples. - static constexpr int NSightDispatchesPerMode = 16; + static constexpr int NSightDispatchesPerMode = 16; }; template