Skip to content

Video failed to be rendered, problem with gsplat and CUDA platform. #89

Description

@applepie1026

system: Windows 11
CUDA Toolkit: NVIDIA GPU Computing Toolkit CUDA 12.8
PyTorch: torch 2.12.1+cu126
torchvision: 0.27.1+cu126
GPU : RTX 3060
| NVIDIA-SMI 610.62 KMD Version: 610.62 CUDA UMD Version: 13.3 |
+-----------------------------------------+------------------------+----------------------+
| GPU Name Driver-Model | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA GeForce RTX 3060 ... WDDM | 00000000:01:00.0 On | N/A |
| N/A 52C P0 23W / 140W | 1660MiB / 6144MiB | 0% Default |
| | | N/A |
+-----------------------------------------+------------------------+----------------------+

--------------------------------------------------------------------------------------------------------
ERROR when importing gsplat:
C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.8/include\cub/thread/thread_store.cuh(246): error: asm operand type size(8) does not match type/size implied by constraint 'r'
  template <> __declspec(__device__) __forceinline void ThreadStore<STORE_CG, uint4*, uint4>(uint4 * ptr, uint4 val) { asm volatile("st." "cg" ".v4.u32 [%0], {%1, %2, %3, %4};" : : "r"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w)); } template <> __declspec(__device__) __forceinline void ThreadStore<STORE_CG, ulonglong2*, ulonglong2>( ulonglong2 * ptr, ulonglong2 val) { asm volatile("st." "cg" ".v2.u64 [%0], {%1, %2};" : : "r"(ptr), "l"(val.x), "l"(val.y)); } template <> __declspec(__device__) __forceinline void ThreadStore<STORE_CG, ushort4*, ushort4>(ushort4 * ptr, ushort4 val) { asm volatile("st." "cg" ".v4.u16 [%0], {%1, %2, %3, %4};" : : "r"(ptr), "h"(val.x), "h"(val.y), "h"(val.z), "h"(val.w)); } template <> __declspec(__device__) __forceinline void ThreadStore<STORE_CG, uint2*, uint2>(uint2 * ptr, uint2 val) { asm volatile("st." "cg" ".v2.u32 [%0], {%1, %2};" : : "r"(ptr), "r"(val.x), "r"(val.y)); } template <> __declspec(__device__) __forceinline void ThreadStore<STORE_CG, unsigned long long*, unsigned long long>( unsigned long long* ptr, unsigned long long val) { asm volatile("st." "cg" ".u64 [%0], %1;" : : "r"(ptr), "l"(val)); } template <> __declspec(__device__) __forceinline void ThreadStore<STORE_CG, unsigned int*, unsigned int>( unsigned int* ptr, unsigned int val) { asm volatile("st." "cg" ".u32 [%0], %1;" : : "r"(ptr), "r"(val)); } template <> __declspec(__device__) __forceinline void ThreadStore<STORE_CG, unsigned short*, unsigned short>( unsigned short* ptr, unsigned short val) { asm volatile("st." "cg" ".u16 [%0], %1;" : : "r"(ptr), "h"(val)); } template <> __declspec(__device__) __forceinline void ThreadStore<STORE_CG, unsigned char*, unsigned char>( unsigned char* ptr, unsigned char val) { asm volatile( "{" "   .reg .u8 datum;" "   cvt.u8.u16 datum, %1;" "   st." "cg" ".u8 [%0], datum;" "}" : : "r"(ptr), "h"((unsigned short) val)); }
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          ^

C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v12.8/include\cub/thread/thread_store.cuh(246): error: asm operand type size(8) does not match type/size implied by constraint 'r'
  template <> __declspec(__device__) __forceinline void ThreadStore<STORE_CG, uint4*, uint4>(uint4 * ptr, uint4 val) { asm volatile("st." "cg" ".v4.u32 [%0], {%1, %2, %3, %4};" : : "r"(ptr), "r"(val.x), "r"(val.y), "r"(val.z), "r"(val.w)); } template <> __declspec(__device__) __forceinline void ThreadStore<STORE_CG, ulonglong2*, ulonglong2>( ulonglong2 * ptr, ulonglong2 val) { asm volatile("st." "cg" ".v2.u64 [%0], {%1, %2};" : : "r"(ptr), "l"(val.x), "l"(val.y)); } template <> __declspec(__device__) __forceinline void ThreadStore<STORE_CG, ushort4*, ushort4>(ushort4 * ptr, ushort4 val) { asm volatile("st." "cg" ".v4.u16 [%0], {%1, %2, %3, %4};" : : "r"(ptr), "h"(val.x), "h"(val.y), "h"(val.z), "h"(val.w)); } template <> __declspec(__device__) __forceinline void ThreadStore<STORE_CG, uint2*, uint2>(uint2 * ptr, uint2 val) { asm volatile("st." "cg" ".v2.u32 [%0], {%1, %2};" : : "r"(ptr), "r"(val.x), "r"(val.y)); } template <> __declspec(__device__) __forceinline void ThreadStore<STORE_CG, unsigned long long*, unsigned long long>( unsigned long long* ptr, unsigned long long val) { asm volatile("st." "cg" ".u64 [%0], %1;" : : "r"(ptr), "l"(val)); } template <> __declspec(__device__) __forceinline void ThreadStore<STORE_CG, unsigned int*, unsigned int>( unsigned int* ptr, unsigned int val) { asm volatile("st." "cg" ".u32 [%0], %1;" : : "r"(ptr), "r"(val)); } template <> __declspec(__device__) __forceinline void ThreadStore<STORE_CG, unsigned short*, unsigned short>( unsigned short* ptr, unsigned short val) { asm volatile("st." "cg" ".u16 [%0], %1;" : : "r"(ptr), "h"(val)); } template <> __declspec(__device__) __forceinline void ThreadStore<STORE_CG, unsigned char*, unsigned char>( unsigned char* ptr, unsigned char val) { asm volatile( "{" "   .reg .u8 datum;" "   cvt.u8.u16 datum, %1;" "   st." "cg" ".u8 [%0], datum;" "}" : : "r"(ptr), "h"((unsigned short) val)); }
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     ^

Error limit reached.
100 errors detected in the compilation of "C:/Users/Lenovo/.conda/envs/mlsharp/Lib/site-packages/gsplat/cuda/csrc/IntersectTile.cu".
Compilation terminated.
IntersectTile.cu
ninja: build stopped: subcommand failed.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Fields

    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions