From 6351386d33c402eef354e2556dcaf985c170a12a Mon Sep 17 00:00:00 2001
From: Keith Kraus <keith.j.kraus@gmail.com>
Date: Wed, 3 Jun 2026 15:18:49 -0400
Subject: [PATCH 1/5] Regenerate cuda.bindings docs markup

---
 .../cuda/bindings/_bindings/cydriver.pxd.in   |    3 +-
 .../cuda/bindings/_bindings/cydriver.pyx.in   |    2 +-
 .../cuda/bindings/_bindings/cyruntime.pxd.in  |    2 +-
 .../cuda/bindings/_bindings/cyruntime.pyx.in  |    2 +-
 .../bindings/_bindings/cyruntime_ptds.pxd.in  |    2 +-
 .../bindings/_bindings/cyruntime_ptds.pyx.in  |    2 +-
 .../cuda/bindings/_internal/nvrtc.pxd         |    2 +-
 .../cuda/bindings/_internal/nvrtc_linux.pyx   |    2 +-
 .../cuda/bindings/_internal/nvrtc_windows.pyx |    2 +-
 cuda_bindings/cuda/bindings/cydriver.pxd.in   |    4 +-
 cuda_bindings/cuda/bindings/cydriver.pyx.in   |    2 +-
 cuda_bindings/cuda/bindings/cynvrtc.pxd       |    2 +-
 cuda_bindings/cuda/bindings/cynvrtc.pyx       |    2 +-
 cuda_bindings/cuda/bindings/cyruntime.pxd.in  |    4 +-
 cuda_bindings/cuda/bindings/cyruntime.pyx.in  |    2 +-
 .../cuda/bindings/cyruntime_functions.pxi.in  |    3 +-
 .../cuda/bindings/cyruntime_types.pxi.in      |    2 +-
 cuda_bindings/cuda/bindings/driver.pxd.in     |  294 +-
 cuda_bindings/cuda/bindings/driver.pyx.in     | 5110 +++++++++--------
 cuda_bindings/cuda/bindings/nvrtc.pxd         |    2 +-
 cuda_bindings/cuda/bindings/nvrtc.pyx         |   79 +-
 cuda_bindings/cuda/bindings/runtime.pxd.in    |  233 +-
 cuda_bindings/cuda/bindings/runtime.pyx.in    | 3986 ++++++-------
 cuda_bindings/docs/source/module/driver.rst   |   58 +-
 cuda_bindings/docs/source/module/nvrtc.rst    |   11 +-
 cuda_bindings/docs/source/module/runtime.rst  |  150 +-
 26 files changed, 5023 insertions(+), 4940 deletions(-)

diff --git a/cuda_bindings/cuda/bindings/_bindings/cydriver.pxd.in b/cuda_bindings/cuda/bindings/_bindings/cydriver.pxd.in
index 85107bb0fe..1b643ba64b 100644
--- a/cuda_bindings/cuda/bindings/_bindings/cydriver.pxd.in
+++ b/cuda_bindings/cuda/bindings/_bindings/cydriver.pxd.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1711+g875fec45. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1725+g07a86fe3d.d20260603. Do not modify it directly.
 from cuda.bindings.cydriver cimport *
 
 {{if 'cuGetErrorString' in found_functions}}
@@ -2523,4 +2523,3 @@ cdef CUresult _cuGraphicsVDPAURegisterVideoSurface(CUgraphicsResource* pCudaReso
 
 cdef CUresult _cuGraphicsVDPAURegisterOutputSurface(CUgraphicsResource* pCudaResource, VdpOutputSurface vdpSurface, unsigned int flags) except ?CUDA_ERROR_NOT_FOUND nogil
 {{endif}}
-
diff --git a/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in b/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in
index 6cd5fd689b..f0c724a547 100644
--- a/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in
+++ b/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1711+g875fec45. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1725+g07a86fe3d.d20260603. Do not modify it directly.
 {{if 'Windows' == platform.system()}}
 import os
 cimport cuda.bindings._lib.windll as windll
diff --git a/cuda_bindings/cuda/bindings/_bindings/cyruntime.pxd.in b/cuda_bindings/cuda/bindings/_bindings/cyruntime.pxd.in
index 178ba2022a..aa0676a5bd 100644
--- a/cuda_bindings/cuda/bindings/_bindings/cyruntime.pxd.in
+++ b/cuda_bindings/cuda/bindings/_bindings/cyruntime.pxd.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1725+g07a86fe3d.d20260603. Do not modify it directly.
 include "../cyruntime_types.pxi"
 
 include "../_lib/cyruntime/cyruntime.pxd"
diff --git a/cuda_bindings/cuda/bindings/_bindings/cyruntime.pyx.in b/cuda_bindings/cuda/bindings/_bindings/cyruntime.pyx.in
index 0a7de77221..98bff296bf 100644
--- a/cuda_bindings/cuda/bindings/_bindings/cyruntime.pyx.in
+++ b/cuda_bindings/cuda/bindings/_bindings/cyruntime.pyx.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1725+g07a86fe3d.d20260603. Do not modify it directly.
 include "../cyruntime_functions.pxi"
 
 import os
diff --git a/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pxd.in b/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pxd.in
index 08e14a023d..2912d4b0d9 100644
--- a/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pxd.in
+++ b/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pxd.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1725+g07a86fe3d.d20260603. Do not modify it directly.
 cdef extern from "":
     """
     #define CUDA_API_PER_THREAD_DEFAULT_STREAM
diff --git a/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pyx.in b/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pyx.in
index c771cf89de..b268ac8470 100644
--- a/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pyx.in
+++ b/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pyx.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1725+g07a86fe3d.d20260603. Do not modify it directly.
 cdef extern from "":
     """
     #define CUDA_API_PER_THREAD_DEFAULT_STREAM
diff --git a/cuda_bindings/cuda/bindings/_internal/nvrtc.pxd b/cuda_bindings/cuda/bindings/_internal/nvrtc.pxd
index e27ff2c08d..45768a3979 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvrtc.pxd
+++ b/cuda_bindings/cuda/bindings/_internal/nvrtc.pxd
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1725+g07a86fe3d.d20260603. Do not modify it directly.
 
 from ..cynvrtc cimport *
 
diff --git a/cuda_bindings/cuda/bindings/_internal/nvrtc_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvrtc_linux.pyx
index ac9cbca550..5ddb84098f 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvrtc_linux.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvrtc_linux.pyx
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1725+g07a86fe3d.d20260603. Do not modify it directly.
 
 from libc.stdint cimport intptr_t, uintptr_t
 
diff --git a/cuda_bindings/cuda/bindings/_internal/nvrtc_windows.pyx b/cuda_bindings/cuda/bindings/_internal/nvrtc_windows.pyx
index 03427c3ef5..767e35d8aa 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvrtc_windows.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvrtc_windows.pyx
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1725+g07a86fe3d.d20260603. Do not modify it directly.
 
 from libc.stdint cimport intptr_t
 
diff --git a/cuda_bindings/cuda/bindings/cydriver.pxd.in b/cuda_bindings/cuda/bindings/cydriver.pxd.in
index 416f428b7b..9068460e22 100644
--- a/cuda_bindings/cuda/bindings/cydriver.pxd.in
+++ b/cuda_bindings/cuda/bindings/cydriver.pxd.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1711+g875fec45. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1725+g07a86fe3d.d20260603. Do not modify it directly.
 
 from libc.stdint cimport uint32_t, uint64_t
 
@@ -5550,4 +5550,4 @@ cdef enum: CU_DEVICE_INVALID = -2
 
 cdef enum: MAX_PLANES = 3
 
-cdef enum: CUDA_EGL_INFINITE_TIMEOUT = 4294967295
\ No newline at end of file
+cdef enum: CUDA_EGL_INFINITE_TIMEOUT = 4294967295
diff --git a/cuda_bindings/cuda/bindings/cydriver.pyx.in b/cuda_bindings/cuda/bindings/cydriver.pyx.in
index aa552f17f6..c4439868c7 100644
--- a/cuda_bindings/cuda/bindings/cydriver.pyx.in
+++ b/cuda_bindings/cuda/bindings/cydriver.pyx.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1711+g875fec45. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1725+g07a86fe3d.d20260603. Do not modify it directly.
 cimport cuda.bindings._bindings.cydriver as cydriver
 
 {{if 'cuGetErrorString' in found_functions}}
diff --git a/cuda_bindings/cuda/bindings/cynvrtc.pxd b/cuda_bindings/cuda/bindings/cynvrtc.pxd
index 9a4476c3ce..90fcd6517a 100644
--- a/cuda_bindings/cuda/bindings/cynvrtc.pxd
+++ b/cuda_bindings/cuda/bindings/cynvrtc.pxd
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1725+g07a86fe3d.d20260603. Do not modify it directly.
 
 from libc.stdint cimport uint32_t, uint64_t
 
diff --git a/cuda_bindings/cuda/bindings/cynvrtc.pyx b/cuda_bindings/cuda/bindings/cynvrtc.pyx
index d297aaa299..e69d1ffd51 100644
--- a/cuda_bindings/cuda/bindings/cynvrtc.pyx
+++ b/cuda_bindings/cuda/bindings/cynvrtc.pyx
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1725+g07a86fe3d.d20260603. Do not modify it directly.
 
 from ._internal cimport nvrtc as _nvrtc
 
diff --git a/cuda_bindings/cuda/bindings/cyruntime.pxd.in b/cuda_bindings/cuda/bindings/cyruntime.pxd.in
index 453011b2ba..2bc4c4f833 100644
--- a/cuda_bindings/cuda/bindings/cyruntime.pxd.in
+++ b/cuda_bindings/cuda/bindings/cyruntime.pxd.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1725+g07a86fe3d.d20260603. Do not modify it directly.
 
 from libc.stdint cimport uint32_t, uint64_t
 
@@ -2091,4 +2091,4 @@ cdef enum: CUDART_VERSION = 13030
 
 cdef enum: __CUDART_API_VERSION = 13030
 
-cdef enum: CUDA_EGL_MAX_PLANES = 3
\ No newline at end of file
+cdef enum: CUDA_EGL_MAX_PLANES = 3
diff --git a/cuda_bindings/cuda/bindings/cyruntime.pyx.in b/cuda_bindings/cuda/bindings/cyruntime.pyx.in
index 230b5d8f84..e3c471edf1 100644
--- a/cuda_bindings/cuda/bindings/cyruntime.pyx.in
+++ b/cuda_bindings/cuda/bindings/cyruntime.pyx.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1725+g07a86fe3d.d20260603. Do not modify it directly.
 cimport cuda.bindings._bindings.cyruntime as cyruntime
 cimport cython
 
diff --git a/cuda_bindings/cuda/bindings/cyruntime_functions.pxi.in b/cuda_bindings/cuda/bindings/cyruntime_functions.pxi.in
index 981b55fb29..f9e78ad2ad 100644
--- a/cuda_bindings/cuda/bindings/cyruntime_functions.pxi.in
+++ b/cuda_bindings/cuda/bindings/cyruntime_functions.pxi.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1725+g07a86fe3d.d20260603. Do not modify it directly.
 cdef extern from "cuda_runtime_api.h":
 
     {{if 'cudaDeviceReset' in found_functions}}
@@ -1615,4 +1615,3 @@ cdef extern from "cuda_profiler_api.h":
     cudaError_t cudaProfilerStop() nogil
 
     {{endif}}
-
diff --git a/cuda_bindings/cuda/bindings/cyruntime_types.pxi.in b/cuda_bindings/cuda/bindings/cyruntime_types.pxi.in
index a7ad5839ac..f151ce8321 100644
--- a/cuda_bindings/cuda/bindings/cyruntime_types.pxi.in
+++ b/cuda_bindings/cuda/bindings/cyruntime_types.pxi.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1725+g07a86fe3d.d20260603. Do not modify it directly.
 
 cdef extern from "vector_types.h":
 
diff --git a/cuda_bindings/cuda/bindings/driver.pxd.in b/cuda_bindings/cuda/bindings/driver.pxd.in
index 76997b5269..93c895702e 100644
--- a/cuda_bindings/cuda/bindings/driver.pxd.in
+++ b/cuda_bindings/cuda/bindings/driver.pxd.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1711+g875fec45. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1725+g07a86fe3d.d20260603. Do not modify it directly.
 cimport cuda.bindings.cydriver as cydriver
 
 include "_lib/utils.pxd"
@@ -1089,7 +1089,7 @@ cdef class CUasyncNotificationInfo_st:
     {{endif}}
     {{if 'CUasyncNotificationInfo_st.info' in found_struct}}
     info : anon_union2
-        Information about the notification. `typename` must be checked in
+        Information about the notification. ``typename`` must be checked in
         order to interpret this field.
     {{endif}}
 
@@ -1630,12 +1630,12 @@ cdef class CUDA_CONDITIONAL_NODE_PARAMS:
         graphs at any level, must belong to the same CUDA context.
         These graphs may be populated using graph node creation APIs or
         cuStreamBeginCaptureToGraph.  CU_GRAPH_COND_TYPE_IF: phGraph_out[0]
-        is executed when the condition is non-zero. If `size` == 2,
+        is executed when the condition is non-zero. If ``size`` == 2,
         phGraph_out[1] will be executed when the condition is zero.
         CU_GRAPH_COND_TYPE_WHILE: phGraph_out[0] is executed as long as the
         condition is non-zero. CU_GRAPH_COND_TYPE_SWITCH: phGraph_out[n] is
         executed when the condition is equal to n. If the condition >=
-        `size`, no body graph is executed.
+        ``size``, no body graph is executed.
     {{endif}}
     {{if 'CUDA_CONDITIONAL_NODE_PARAMS.ctx' in found_struct}}
     ctx : CUcontext
@@ -1678,7 +1678,7 @@ cdef class CUgraphEdgeData_st:
         node on the edge. The meaning is specfic to the node type. A value
         of 0 in all cases means full completion of the upstream node, with
         memory visibility to the downstream node or portion thereof
-        (indicated by `to_port`).   Only kernel nodes define non-zero
+        (indicated by ``to_port``).   Only kernel nodes define non-zero
         ports. A kernel node can use the following output port types:
         CU_GRAPH_KERNEL_NODE_PORT_DEFAULT,
         CU_GRAPH_KERNEL_NODE_PORT_PROGRAMMATIC, or
@@ -1687,7 +1687,7 @@ cdef class CUgraphEdgeData_st:
     {{if 'CUgraphEdgeData_st.to_port' in found_struct}}
     to_port : bytes
         This indicates what portion of the downstream node is dependent on
-        the upstream node or portion thereof (indicated by `from_port`).
+        the upstream node or portion thereof (indicated by ``from_port``).
         The meaning is specific to the node type. A value of 0 in all cases
         means the entirety of the downstream node is dependent on the
         upstream work.   Currently no node types define non-zero ports.
@@ -1759,7 +1759,7 @@ cdef class CUDA_GRAPH_INSTANTIATE_PARAMS_st:
 
 cdef class CUlaunchMemSyncDomainMap_st:
     """
-    Memory Synchronization Domain map  See ::cudaLaunchMemSyncDomain.
+    Memory Synchronization Domain map  See ``cudaLaunchMemSyncDomain``.
     By default, kernels are launched in domain 0. Kernel launched with
     CU_LAUNCH_MEM_SYNC_DOMAIN_REMOTE will have a different domain ID.
     User may also alter the domain ID with CUlaunchMemSyncDomainMap for
@@ -1948,11 +1948,11 @@ cdef class CUlaunchAttributeValue_union:
     clusterDim : anon_struct1
         Value of launch attribute CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
         that represents the desired cluster dimensions for the kernel.
-        Opaque type with the following fields: - `x` - The X dimension of
+        Opaque type with the following fields: - ``x`` - The X dimension of
         the cluster, in blocks. Must be a divisor of the grid X dimension.
-        - `y` - The Y dimension of the cluster, in blocks. Must be a
-        divisor of the grid Y dimension.    - `z` - The Z dimension of the
-        cluster, in blocks. Must be a divisor of the grid Z dimension.
+        - ``y`` - The Y dimension of the cluster, in blocks. Must be a
+        divisor of the grid Y dimension.    - ``z`` - The Z dimension of
+        the cluster, in blocks. Must be a divisor of the grid Z dimension.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.clusterSchedulingPolicyPreference' in found_struct}}
     clusterSchedulingPolicyPreference : CUclusterSchedulingPolicy
@@ -1968,18 +1968,18 @@ cdef class CUlaunchAttributeValue_union:
     {{if 'CUlaunchAttributeValue_union.programmaticEvent' in found_struct}}
     programmaticEvent : anon_struct2
         Value of launch attribute CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT
-        with the following fields: - `CUevent` event - Event to fire when
-        all blocks trigger it.    - `Event` record flags, see
+        with the following fields: - ``CUevent`` event - Event to fire when
+        all blocks trigger it.    - ``Event`` record flags, see
         cuEventRecordWithFlags. Does not accept :CU_EVENT_RECORD_EXTERNAL.
-        - `triggerAtBlockStart` - If this is set to non-0, each block
+        - ``triggerAtBlockStart`` - If this is set to non-0, each block
         launch will automatically trigger the event.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.launchCompletionEvent' in found_struct}}
     launchCompletionEvent : anon_struct3
         Value of launch attribute
         CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT with the following
-        fields: - `CUevent` event - Event to fire when the last block
-        launches    - `int` flags; - Event record flags, see
+        fields: - ``CUevent`` event - Event to fire when the last block
+        launches    - ``int`` flags; - Event record flags, see
         cuEventRecordWithFlags. Does not accept CU_EVENT_RECORD_EXTERNAL.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.priority' in found_struct}}
@@ -2002,23 +2002,23 @@ cdef class CUlaunchAttributeValue_union:
         Value of launch attribute
         CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION that represents the
         desired preferred cluster dimensions for the kernel. Opaque type
-        with the following fields: - `x` - The X dimension of the preferred
-        cluster, in blocks. Must be a divisor of the grid X dimension, and
-        must be a multiple of the `x` field of
-        CUlaunchAttributeValue::clusterDim.    - `y` - The Y dimension of
+        with the following fields: - ``x`` - The X dimension of the
+        preferred cluster, in blocks. Must be a divisor of the grid X
+        dimension, and must be a multiple of the ``x`` field of
+        CUlaunchAttributeValue::clusterDim.    - ``y`` - The Y dimension of
         the preferred cluster, in blocks. Must be a divisor of the grid Y
-        dimension, and must be a multiple of the `y` field of
-        CUlaunchAttributeValue::clusterDim.    - `z` - The Z dimension of
-        the preferred cluster, in blocks. Must be equal to the `z` field of
-        CUlaunchAttributeValue::clusterDim.
+        dimension, and must be a multiple of the ``y`` field of
+        CUlaunchAttributeValue::clusterDim.    - ``z`` - The Z dimension of
+        the preferred cluster, in blocks. Must be equal to the ``z`` field
+        of CUlaunchAttributeValue::clusterDim.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.deviceUpdatableKernelNode' in found_struct}}
     deviceUpdatableKernelNode : anon_struct5
         Value of launch attribute
         CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE. with the
-        following fields: - `int` deviceUpdatable - Whether or not the
+        following fields: - ``int`` deviceUpdatable - Whether or not the
         resulting kernel node should be device-updatable.    -
-        `CUgraphDeviceNode` devNode - Returns a handle to pass to the
+        ``CUgraphDeviceNode`` devNode - Returns a handle to pass to the
         various device-side update functions.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.sharedMemCarveout' in found_struct}}
@@ -4506,7 +4506,7 @@ cdef class CUmemAccessDesc_st:
     {{endif}}
     {{if 'CUmemAccessDesc_st.flags' in found_struct}}
     flags : CUmemAccess_flags
-        ::CUmemProt accessibility flags to set on the request
+        ``CUmemProt`` accessibility flags to set on the request
     {{endif}}
 
     Methods
@@ -4913,8 +4913,8 @@ cdef class CUDA_MEM_ALLOC_NODE_PARAMS_v1_st:
     {{if 'CUDA_MEM_ALLOC_NODE_PARAMS_v1_st.poolProps' in found_struct}}
     poolProps : CUmemPoolProps
         in: location where the allocation should reside (specified in
-        ::location). ::handleTypes must be CU_MEM_HANDLE_TYPE_NONE. IPC is
-        not supported.
+        ``location``). ``handleTypes`` must be CU_MEM_HANDLE_TYPE_NONE. IPC
+        is not supported.
     {{endif}}
     {{if 'CUDA_MEM_ALLOC_NODE_PARAMS_v1_st.accessDescs' in found_struct}}
     accessDescs : CUmemAccessDesc
@@ -4964,8 +4964,8 @@ cdef class CUDA_MEM_ALLOC_NODE_PARAMS_v2_st:
     {{if 'CUDA_MEM_ALLOC_NODE_PARAMS_v2_st.poolProps' in found_struct}}
     poolProps : CUmemPoolProps
         in: location where the allocation should reside (specified in
-        ::location). ::handleTypes must be CU_MEM_HANDLE_TYPE_NONE. IPC is
-        not supported.
+        ``location``). ``handleTypes`` must be CU_MEM_HANDLE_TYPE_NONE. IPC
+        is not supported.
     {{endif}}
     {{if 'CUDA_MEM_ALLOC_NODE_PARAMS_v2_st.accessDescs' in found_struct}}
     accessDescs : CUmemAccessDesc
@@ -5692,17 +5692,17 @@ cdef class CUdevResource_st:
     {{endif}}
     {{if 'CUdevResource_st.sm' in found_struct}}
     sm : CUdevSmResource
-        Resource corresponding to CU_DEV_RESOURCE_TYPE_SM `typename`.
+        Resource corresponding to CU_DEV_RESOURCE_TYPE_SM ``typename``.
     {{endif}}
     {{if 'CUdevResource_st.wqConfig' in found_struct}}
     wqConfig : CUdevWorkqueueConfigResource
         Resource corresponding to CU_DEV_RESOURCE_TYPE_WORKQUEUE_CONFIG
-        `typename`.
+        ``typename``.
     {{endif}}
     {{if 'CUdevResource_st.wq' in found_struct}}
     wq : CUdevWorkqueueResource
         Resource corresponding to CU_DEV_RESOURCE_TYPE_WORKQUEUE
-        `typename`.
+        ``typename``.
     {{endif}}
     {{if 'CUdevResource_st._oversize' in found_struct}}
     _oversize : bytes
@@ -6246,7 +6246,7 @@ cdef class CUasyncNotificationInfo(CUasyncNotificationInfo_st):
     {{endif}}
     {{if 'CUasyncNotificationInfo_st.info' in found_struct}}
     info : anon_union2
-        Information about the notification. `typename` must be checked in
+        Information about the notification. ``typename`` must be checked in
         order to interpret this field.
     {{endif}}
 
@@ -6936,7 +6936,7 @@ cdef class CUgraphEdgeData(CUgraphEdgeData_st):
         node on the edge. The meaning is specfic to the node type. A value
         of 0 in all cases means full completion of the upstream node, with
         memory visibility to the downstream node or portion thereof
-        (indicated by `to_port`).   Only kernel nodes define non-zero
+        (indicated by ``to_port``).   Only kernel nodes define non-zero
         ports. A kernel node can use the following output port types:
         CU_GRAPH_KERNEL_NODE_PORT_DEFAULT,
         CU_GRAPH_KERNEL_NODE_PORT_PROGRAMMATIC, or
@@ -6945,7 +6945,7 @@ cdef class CUgraphEdgeData(CUgraphEdgeData_st):
     {{if 'CUgraphEdgeData_st.to_port' in found_struct}}
     to_port : bytes
         This indicates what portion of the downstream node is dependent on
-        the upstream node or portion thereof (indicated by `from_port`).
+        the upstream node or portion thereof (indicated by ``from_port``).
         The meaning is specific to the node type. A value of 0 in all cases
         means the entirety of the downstream node is dependent on the
         upstream work.   Currently no node types define non-zero ports.
@@ -7006,7 +7006,7 @@ cdef class CUDA_GRAPH_INSTANTIATE_PARAMS(CUDA_GRAPH_INSTANTIATE_PARAMS_st):
 
 cdef class CUlaunchMemSyncDomainMap(CUlaunchMemSyncDomainMap_st):
     """
-    Memory Synchronization Domain map  See ::cudaLaunchMemSyncDomain.
+    Memory Synchronization Domain map  See ``cudaLaunchMemSyncDomain``.
     By default, kernels are launched in domain 0. Kernel launched with
     CU_LAUNCH_MEM_SYNC_DOMAIN_REMOTE will have a different domain ID.
     User may also alter the domain ID with CUlaunchMemSyncDomainMap for
@@ -7063,11 +7063,11 @@ cdef class CUlaunchAttributeValue(CUlaunchAttributeValue_union):
     clusterDim : anon_struct1
         Value of launch attribute CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
         that represents the desired cluster dimensions for the kernel.
-        Opaque type with the following fields: - `x` - The X dimension of
+        Opaque type with the following fields: - ``x`` - The X dimension of
         the cluster, in blocks. Must be a divisor of the grid X dimension.
-        - `y` - The Y dimension of the cluster, in blocks. Must be a
-        divisor of the grid Y dimension.    - `z` - The Z dimension of the
-        cluster, in blocks. Must be a divisor of the grid Z dimension.
+        - ``y`` - The Y dimension of the cluster, in blocks. Must be a
+        divisor of the grid Y dimension.    - ``z`` - The Z dimension of
+        the cluster, in blocks. Must be a divisor of the grid Z dimension.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.clusterSchedulingPolicyPreference' in found_struct}}
     clusterSchedulingPolicyPreference : CUclusterSchedulingPolicy
@@ -7083,18 +7083,18 @@ cdef class CUlaunchAttributeValue(CUlaunchAttributeValue_union):
     {{if 'CUlaunchAttributeValue_union.programmaticEvent' in found_struct}}
     programmaticEvent : anon_struct2
         Value of launch attribute CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT
-        with the following fields: - `CUevent` event - Event to fire when
-        all blocks trigger it.    - `Event` record flags, see
+        with the following fields: - ``CUevent`` event - Event to fire when
+        all blocks trigger it.    - ``Event`` record flags, see
         cuEventRecordWithFlags. Does not accept :CU_EVENT_RECORD_EXTERNAL.
-        - `triggerAtBlockStart` - If this is set to non-0, each block
+        - ``triggerAtBlockStart`` - If this is set to non-0, each block
         launch will automatically trigger the event.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.launchCompletionEvent' in found_struct}}
     launchCompletionEvent : anon_struct3
         Value of launch attribute
         CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT with the following
-        fields: - `CUevent` event - Event to fire when the last block
-        launches    - `int` flags; - Event record flags, see
+        fields: - ``CUevent`` event - Event to fire when the last block
+        launches    - ``int`` flags; - Event record flags, see
         cuEventRecordWithFlags. Does not accept CU_EVENT_RECORD_EXTERNAL.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.priority' in found_struct}}
@@ -7117,23 +7117,23 @@ cdef class CUlaunchAttributeValue(CUlaunchAttributeValue_union):
         Value of launch attribute
         CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION that represents the
         desired preferred cluster dimensions for the kernel. Opaque type
-        with the following fields: - `x` - The X dimension of the preferred
-        cluster, in blocks. Must be a divisor of the grid X dimension, and
-        must be a multiple of the `x` field of
-        CUlaunchAttributeValue::clusterDim.    - `y` - The Y dimension of
+        with the following fields: - ``x`` - The X dimension of the
+        preferred cluster, in blocks. Must be a divisor of the grid X
+        dimension, and must be a multiple of the ``x`` field of
+        CUlaunchAttributeValue::clusterDim.    - ``y`` - The Y dimension of
         the preferred cluster, in blocks. Must be a divisor of the grid Y
-        dimension, and must be a multiple of the `y` field of
-        CUlaunchAttributeValue::clusterDim.    - `z` - The Z dimension of
-        the preferred cluster, in blocks. Must be equal to the `z` field of
-        CUlaunchAttributeValue::clusterDim.
+        dimension, and must be a multiple of the ``y`` field of
+        CUlaunchAttributeValue::clusterDim.    - ``z`` - The Z dimension of
+        the preferred cluster, in blocks. Must be equal to the ``z`` field
+        of CUlaunchAttributeValue::clusterDim.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.deviceUpdatableKernelNode' in found_struct}}
     deviceUpdatableKernelNode : anon_struct5
         Value of launch attribute
         CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE. with the
-        following fields: - `int` deviceUpdatable - Whether or not the
+        following fields: - ``int`` deviceUpdatable - Whether or not the
         resulting kernel node should be device-updatable.    -
-        `CUgraphDeviceNode` devNode - Returns a handle to pass to the
+        ``CUgraphDeviceNode`` devNode - Returns a handle to pass to the
         various device-side update functions.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.sharedMemCarveout' in found_struct}}
@@ -7274,11 +7274,11 @@ cdef class CUkernelNodeAttrValue_v1(CUlaunchAttributeValue):
     clusterDim : anon_struct1
         Value of launch attribute CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
         that represents the desired cluster dimensions for the kernel.
-        Opaque type with the following fields: - `x` - The X dimension of
+        Opaque type with the following fields: - ``x`` - The X dimension of
         the cluster, in blocks. Must be a divisor of the grid X dimension.
-        - `y` - The Y dimension of the cluster, in blocks. Must be a
-        divisor of the grid Y dimension.    - `z` - The Z dimension of the
-        cluster, in blocks. Must be a divisor of the grid Z dimension.
+        - ``y`` - The Y dimension of the cluster, in blocks. Must be a
+        divisor of the grid Y dimension.    - ``z`` - The Z dimension of
+        the cluster, in blocks. Must be a divisor of the grid Z dimension.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.clusterSchedulingPolicyPreference' in found_struct}}
     clusterSchedulingPolicyPreference : CUclusterSchedulingPolicy
@@ -7294,18 +7294,18 @@ cdef class CUkernelNodeAttrValue_v1(CUlaunchAttributeValue):
     {{if 'CUlaunchAttributeValue_union.programmaticEvent' in found_struct}}
     programmaticEvent : anon_struct2
         Value of launch attribute CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT
-        with the following fields: - `CUevent` event - Event to fire when
-        all blocks trigger it.    - `Event` record flags, see
+        with the following fields: - ``CUevent`` event - Event to fire when
+        all blocks trigger it.    - ``Event`` record flags, see
         cuEventRecordWithFlags. Does not accept :CU_EVENT_RECORD_EXTERNAL.
-        - `triggerAtBlockStart` - If this is set to non-0, each block
+        - ``triggerAtBlockStart`` - If this is set to non-0, each block
         launch will automatically trigger the event.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.launchCompletionEvent' in found_struct}}
     launchCompletionEvent : anon_struct3
         Value of launch attribute
         CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT with the following
-        fields: - `CUevent` event - Event to fire when the last block
-        launches    - `int` flags; - Event record flags, see
+        fields: - ``CUevent`` event - Event to fire when the last block
+        launches    - ``int`` flags; - Event record flags, see
         cuEventRecordWithFlags. Does not accept CU_EVENT_RECORD_EXTERNAL.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.priority' in found_struct}}
@@ -7328,23 +7328,23 @@ cdef class CUkernelNodeAttrValue_v1(CUlaunchAttributeValue):
         Value of launch attribute
         CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION that represents the
         desired preferred cluster dimensions for the kernel. Opaque type
-        with the following fields: - `x` - The X dimension of the preferred
-        cluster, in blocks. Must be a divisor of the grid X dimension, and
-        must be a multiple of the `x` field of
-        CUlaunchAttributeValue::clusterDim.    - `y` - The Y dimension of
+        with the following fields: - ``x`` - The X dimension of the
+        preferred cluster, in blocks. Must be a divisor of the grid X
+        dimension, and must be a multiple of the ``x`` field of
+        CUlaunchAttributeValue::clusterDim.    - ``y`` - The Y dimension of
         the preferred cluster, in blocks. Must be a divisor of the grid Y
-        dimension, and must be a multiple of the `y` field of
-        CUlaunchAttributeValue::clusterDim.    - `z` - The Z dimension of
-        the preferred cluster, in blocks. Must be equal to the `z` field of
-        CUlaunchAttributeValue::clusterDim.
+        dimension, and must be a multiple of the ``y`` field of
+        CUlaunchAttributeValue::clusterDim.    - ``z`` - The Z dimension of
+        the preferred cluster, in blocks. Must be equal to the ``z`` field
+        of CUlaunchAttributeValue::clusterDim.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.deviceUpdatableKernelNode' in found_struct}}
     deviceUpdatableKernelNode : anon_struct5
         Value of launch attribute
         CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE. with the
-        following fields: - `int` deviceUpdatable - Whether or not the
+        following fields: - ``int`` deviceUpdatable - Whether or not the
         resulting kernel node should be device-updatable.    -
-        `CUgraphDeviceNode` devNode - Returns a handle to pass to the
+        ``CUgraphDeviceNode`` devNode - Returns a handle to pass to the
         various device-side update functions.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.sharedMemCarveout' in found_struct}}
@@ -7405,11 +7405,11 @@ cdef class CUkernelNodeAttrValue(CUkernelNodeAttrValue_v1):
     clusterDim : anon_struct1
         Value of launch attribute CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
         that represents the desired cluster dimensions for the kernel.
-        Opaque type with the following fields: - `x` - The X dimension of
+        Opaque type with the following fields: - ``x`` - The X dimension of
         the cluster, in blocks. Must be a divisor of the grid X dimension.
-        - `y` - The Y dimension of the cluster, in blocks. Must be a
-        divisor of the grid Y dimension.    - `z` - The Z dimension of the
-        cluster, in blocks. Must be a divisor of the grid Z dimension.
+        - ``y`` - The Y dimension of the cluster, in blocks. Must be a
+        divisor of the grid Y dimension.    - ``z`` - The Z dimension of
+        the cluster, in blocks. Must be a divisor of the grid Z dimension.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.clusterSchedulingPolicyPreference' in found_struct}}
     clusterSchedulingPolicyPreference : CUclusterSchedulingPolicy
@@ -7425,18 +7425,18 @@ cdef class CUkernelNodeAttrValue(CUkernelNodeAttrValue_v1):
     {{if 'CUlaunchAttributeValue_union.programmaticEvent' in found_struct}}
     programmaticEvent : anon_struct2
         Value of launch attribute CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT
-        with the following fields: - `CUevent` event - Event to fire when
-        all blocks trigger it.    - `Event` record flags, see
+        with the following fields: - ``CUevent`` event - Event to fire when
+        all blocks trigger it.    - ``Event`` record flags, see
         cuEventRecordWithFlags. Does not accept :CU_EVENT_RECORD_EXTERNAL.
-        - `triggerAtBlockStart` - If this is set to non-0, each block
+        - ``triggerAtBlockStart`` - If this is set to non-0, each block
         launch will automatically trigger the event.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.launchCompletionEvent' in found_struct}}
     launchCompletionEvent : anon_struct3
         Value of launch attribute
         CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT with the following
-        fields: - `CUevent` event - Event to fire when the last block
-        launches    - `int` flags; - Event record flags, see
+        fields: - ``CUevent`` event - Event to fire when the last block
+        launches    - ``int`` flags; - Event record flags, see
         cuEventRecordWithFlags. Does not accept CU_EVENT_RECORD_EXTERNAL.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.priority' in found_struct}}
@@ -7459,23 +7459,23 @@ cdef class CUkernelNodeAttrValue(CUkernelNodeAttrValue_v1):
         Value of launch attribute
         CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION that represents the
         desired preferred cluster dimensions for the kernel. Opaque type
-        with the following fields: - `x` - The X dimension of the preferred
-        cluster, in blocks. Must be a divisor of the grid X dimension, and
-        must be a multiple of the `x` field of
-        CUlaunchAttributeValue::clusterDim.    - `y` - The Y dimension of
+        with the following fields: - ``x`` - The X dimension of the
+        preferred cluster, in blocks. Must be a divisor of the grid X
+        dimension, and must be a multiple of the ``x`` field of
+        CUlaunchAttributeValue::clusterDim.    - ``y`` - The Y dimension of
         the preferred cluster, in blocks. Must be a divisor of the grid Y
-        dimension, and must be a multiple of the `y` field of
-        CUlaunchAttributeValue::clusterDim.    - `z` - The Z dimension of
-        the preferred cluster, in blocks. Must be equal to the `z` field of
-        CUlaunchAttributeValue::clusterDim.
+        dimension, and must be a multiple of the ``y`` field of
+        CUlaunchAttributeValue::clusterDim.    - ``z`` - The Z dimension of
+        the preferred cluster, in blocks. Must be equal to the ``z`` field
+        of CUlaunchAttributeValue::clusterDim.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.deviceUpdatableKernelNode' in found_struct}}
     deviceUpdatableKernelNode : anon_struct5
         Value of launch attribute
         CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE. with the
-        following fields: - `int` deviceUpdatable - Whether or not the
+        following fields: - ``int`` deviceUpdatable - Whether or not the
         resulting kernel node should be device-updatable.    -
-        `CUgraphDeviceNode` devNode - Returns a handle to pass to the
+        ``CUgraphDeviceNode`` devNode - Returns a handle to pass to the
         various device-side update functions.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.sharedMemCarveout' in found_struct}}
@@ -7536,11 +7536,11 @@ cdef class CUstreamAttrValue_v1(CUlaunchAttributeValue):
     clusterDim : anon_struct1
         Value of launch attribute CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
         that represents the desired cluster dimensions for the kernel.
-        Opaque type with the following fields: - `x` - The X dimension of
+        Opaque type with the following fields: - ``x`` - The X dimension of
         the cluster, in blocks. Must be a divisor of the grid X dimension.
-        - `y` - The Y dimension of the cluster, in blocks. Must be a
-        divisor of the grid Y dimension.    - `z` - The Z dimension of the
-        cluster, in blocks. Must be a divisor of the grid Z dimension.
+        - ``y`` - The Y dimension of the cluster, in blocks. Must be a
+        divisor of the grid Y dimension.    - ``z`` - The Z dimension of
+        the cluster, in blocks. Must be a divisor of the grid Z dimension.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.clusterSchedulingPolicyPreference' in found_struct}}
     clusterSchedulingPolicyPreference : CUclusterSchedulingPolicy
@@ -7556,18 +7556,18 @@ cdef class CUstreamAttrValue_v1(CUlaunchAttributeValue):
     {{if 'CUlaunchAttributeValue_union.programmaticEvent' in found_struct}}
     programmaticEvent : anon_struct2
         Value of launch attribute CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT
-        with the following fields: - `CUevent` event - Event to fire when
-        all blocks trigger it.    - `Event` record flags, see
+        with the following fields: - ``CUevent`` event - Event to fire when
+        all blocks trigger it.    - ``Event`` record flags, see
         cuEventRecordWithFlags. Does not accept :CU_EVENT_RECORD_EXTERNAL.
-        - `triggerAtBlockStart` - If this is set to non-0, each block
+        - ``triggerAtBlockStart`` - If this is set to non-0, each block
         launch will automatically trigger the event.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.launchCompletionEvent' in found_struct}}
     launchCompletionEvent : anon_struct3
         Value of launch attribute
         CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT with the following
-        fields: - `CUevent` event - Event to fire when the last block
-        launches    - `int` flags; - Event record flags, see
+        fields: - ``CUevent`` event - Event to fire when the last block
+        launches    - ``int`` flags; - Event record flags, see
         cuEventRecordWithFlags. Does not accept CU_EVENT_RECORD_EXTERNAL.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.priority' in found_struct}}
@@ -7590,23 +7590,23 @@ cdef class CUstreamAttrValue_v1(CUlaunchAttributeValue):
         Value of launch attribute
         CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION that represents the
         desired preferred cluster dimensions for the kernel. Opaque type
-        with the following fields: - `x` - The X dimension of the preferred
-        cluster, in blocks. Must be a divisor of the grid X dimension, and
-        must be a multiple of the `x` field of
-        CUlaunchAttributeValue::clusterDim.    - `y` - The Y dimension of
+        with the following fields: - ``x`` - The X dimension of the
+        preferred cluster, in blocks. Must be a divisor of the grid X
+        dimension, and must be a multiple of the ``x`` field of
+        CUlaunchAttributeValue::clusterDim.    - ``y`` - The Y dimension of
         the preferred cluster, in blocks. Must be a divisor of the grid Y
-        dimension, and must be a multiple of the `y` field of
-        CUlaunchAttributeValue::clusterDim.    - `z` - The Z dimension of
-        the preferred cluster, in blocks. Must be equal to the `z` field of
-        CUlaunchAttributeValue::clusterDim.
+        dimension, and must be a multiple of the ``y`` field of
+        CUlaunchAttributeValue::clusterDim.    - ``z`` - The Z dimension of
+        the preferred cluster, in blocks. Must be equal to the ``z`` field
+        of CUlaunchAttributeValue::clusterDim.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.deviceUpdatableKernelNode' in found_struct}}
     deviceUpdatableKernelNode : anon_struct5
         Value of launch attribute
         CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE. with the
-        following fields: - `int` deviceUpdatable - Whether or not the
+        following fields: - ``int`` deviceUpdatable - Whether or not the
         resulting kernel node should be device-updatable.    -
-        `CUgraphDeviceNode` devNode - Returns a handle to pass to the
+        ``CUgraphDeviceNode`` devNode - Returns a handle to pass to the
         various device-side update functions.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.sharedMemCarveout' in found_struct}}
@@ -7667,11 +7667,11 @@ cdef class CUstreamAttrValue(CUstreamAttrValue_v1):
     clusterDim : anon_struct1
         Value of launch attribute CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
         that represents the desired cluster dimensions for the kernel.
-        Opaque type with the following fields: - `x` - The X dimension of
+        Opaque type with the following fields: - ``x`` - The X dimension of
         the cluster, in blocks. Must be a divisor of the grid X dimension.
-        - `y` - The Y dimension of the cluster, in blocks. Must be a
-        divisor of the grid Y dimension.    - `z` - The Z dimension of the
-        cluster, in blocks. Must be a divisor of the grid Z dimension.
+        - ``y`` - The Y dimension of the cluster, in blocks. Must be a
+        divisor of the grid Y dimension.    - ``z`` - The Z dimension of
+        the cluster, in blocks. Must be a divisor of the grid Z dimension.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.clusterSchedulingPolicyPreference' in found_struct}}
     clusterSchedulingPolicyPreference : CUclusterSchedulingPolicy
@@ -7687,18 +7687,18 @@ cdef class CUstreamAttrValue(CUstreamAttrValue_v1):
     {{if 'CUlaunchAttributeValue_union.programmaticEvent' in found_struct}}
     programmaticEvent : anon_struct2
         Value of launch attribute CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT
-        with the following fields: - `CUevent` event - Event to fire when
-        all blocks trigger it.    - `Event` record flags, see
+        with the following fields: - ``CUevent`` event - Event to fire when
+        all blocks trigger it.    - ``Event`` record flags, see
         cuEventRecordWithFlags. Does not accept :CU_EVENT_RECORD_EXTERNAL.
-        - `triggerAtBlockStart` - If this is set to non-0, each block
+        - ``triggerAtBlockStart`` - If this is set to non-0, each block
         launch will automatically trigger the event.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.launchCompletionEvent' in found_struct}}
     launchCompletionEvent : anon_struct3
         Value of launch attribute
         CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT with the following
-        fields: - `CUevent` event - Event to fire when the last block
-        launches    - `int` flags; - Event record flags, see
+        fields: - ``CUevent`` event - Event to fire when the last block
+        launches    - ``int`` flags; - Event record flags, see
         cuEventRecordWithFlags. Does not accept CU_EVENT_RECORD_EXTERNAL.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.priority' in found_struct}}
@@ -7721,23 +7721,23 @@ cdef class CUstreamAttrValue(CUstreamAttrValue_v1):
         Value of launch attribute
         CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION that represents the
         desired preferred cluster dimensions for the kernel. Opaque type
-        with the following fields: - `x` - The X dimension of the preferred
-        cluster, in blocks. Must be a divisor of the grid X dimension, and
-        must be a multiple of the `x` field of
-        CUlaunchAttributeValue::clusterDim.    - `y` - The Y dimension of
+        with the following fields: - ``x`` - The X dimension of the
+        preferred cluster, in blocks. Must be a divisor of the grid X
+        dimension, and must be a multiple of the ``x`` field of
+        CUlaunchAttributeValue::clusterDim.    - ``y`` - The Y dimension of
         the preferred cluster, in blocks. Must be a divisor of the grid Y
-        dimension, and must be a multiple of the `y` field of
-        CUlaunchAttributeValue::clusterDim.    - `z` - The Z dimension of
-        the preferred cluster, in blocks. Must be equal to the `z` field of
-        CUlaunchAttributeValue::clusterDim.
+        dimension, and must be a multiple of the ``y`` field of
+        CUlaunchAttributeValue::clusterDim.    - ``z`` - The Z dimension of
+        the preferred cluster, in blocks. Must be equal to the ``z`` field
+        of CUlaunchAttributeValue::clusterDim.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.deviceUpdatableKernelNode' in found_struct}}
     deviceUpdatableKernelNode : anon_struct5
         Value of launch attribute
         CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE. with the
-        following fields: - `int` deviceUpdatable - Whether or not the
+        following fields: - ``int`` deviceUpdatable - Whether or not the
         resulting kernel node should be device-updatable.    -
-        `CUgraphDeviceNode` devNode - Returns a handle to pass to the
+        ``CUgraphDeviceNode`` devNode - Returns a handle to pass to the
         various device-side update functions.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.sharedMemCarveout' in found_struct}}
@@ -10313,7 +10313,7 @@ cdef class CUmemAccessDesc_v1(CUmemAccessDesc_st):
     {{endif}}
     {{if 'CUmemAccessDesc_st.flags' in found_struct}}
     flags : CUmemAccess_flags
-        ::CUmemProt accessibility flags to set on the request
+        ``CUmemProt`` accessibility flags to set on the request
     {{endif}}
 
     Methods
@@ -10337,7 +10337,7 @@ cdef class CUmemAccessDesc(CUmemAccessDesc_v1):
     {{endif}}
     {{if 'CUmemAccessDesc_st.flags' in found_struct}}
     flags : CUmemAccess_flags
-        ::CUmemProt accessibility flags to set on the request
+        ``CUmemProt`` accessibility flags to set on the request
     {{endif}}
 
     Methods
@@ -10862,8 +10862,8 @@ cdef class CUDA_MEM_ALLOC_NODE_PARAMS_v1(CUDA_MEM_ALLOC_NODE_PARAMS_v1_st):
     {{if 'CUDA_MEM_ALLOC_NODE_PARAMS_v1_st.poolProps' in found_struct}}
     poolProps : CUmemPoolProps
         in: location where the allocation should reside (specified in
-        ::location). ::handleTypes must be CU_MEM_HANDLE_TYPE_NONE. IPC is
-        not supported.
+        ``location``). ``handleTypes`` must be CU_MEM_HANDLE_TYPE_NONE. IPC
+        is not supported.
     {{endif}}
     {{if 'CUDA_MEM_ALLOC_NODE_PARAMS_v1_st.accessDescs' in found_struct}}
     accessDescs : CUmemAccessDesc
@@ -10902,8 +10902,8 @@ cdef class CUDA_MEM_ALLOC_NODE_PARAMS(CUDA_MEM_ALLOC_NODE_PARAMS_v1):
     {{if 'CUDA_MEM_ALLOC_NODE_PARAMS_v1_st.poolProps' in found_struct}}
     poolProps : CUmemPoolProps
         in: location where the allocation should reside (specified in
-        ::location). ::handleTypes must be CU_MEM_HANDLE_TYPE_NONE. IPC is
-        not supported.
+        ``location``). ``handleTypes`` must be CU_MEM_HANDLE_TYPE_NONE. IPC
+        is not supported.
     {{endif}}
     {{if 'CUDA_MEM_ALLOC_NODE_PARAMS_v1_st.accessDescs' in found_struct}}
     accessDescs : CUmemAccessDesc
@@ -10942,8 +10942,8 @@ cdef class CUDA_MEM_ALLOC_NODE_PARAMS_v2(CUDA_MEM_ALLOC_NODE_PARAMS_v2_st):
     {{if 'CUDA_MEM_ALLOC_NODE_PARAMS_v2_st.poolProps' in found_struct}}
     poolProps : CUmemPoolProps
         in: location where the allocation should reside (specified in
-        ::location). ::handleTypes must be CU_MEM_HANDLE_TYPE_NONE. IPC is
-        not supported.
+        ``location``). ``handleTypes`` must be CU_MEM_HANDLE_TYPE_NONE. IPC
+        is not supported.
     {{endif}}
     {{if 'CUDA_MEM_ALLOC_NODE_PARAMS_v2_st.accessDescs' in found_struct}}
     accessDescs : CUmemAccessDesc
@@ -11538,17 +11538,17 @@ cdef class CUdevResource_v1(CUdevResource_st):
     {{endif}}
     {{if 'CUdevResource_st.sm' in found_struct}}
     sm : CUdevSmResource
-        Resource corresponding to CU_DEV_RESOURCE_TYPE_SM `typename`.
+        Resource corresponding to CU_DEV_RESOURCE_TYPE_SM ``typename``.
     {{endif}}
     {{if 'CUdevResource_st.wqConfig' in found_struct}}
     wqConfig : CUdevWorkqueueConfigResource
         Resource corresponding to CU_DEV_RESOURCE_TYPE_WORKQUEUE_CONFIG
-        `typename`.
+        ``typename``.
     {{endif}}
     {{if 'CUdevResource_st.wq' in found_struct}}
     wq : CUdevWorkqueueResource
         Resource corresponding to CU_DEV_RESOURCE_TYPE_WORKQUEUE
-        `typename`.
+        ``typename``.
     {{endif}}
     {{if 'CUdevResource_st._oversize' in found_struct}}
     _oversize : bytes
@@ -11582,17 +11582,17 @@ cdef class CUdevResource(CUdevResource_v1):
     {{endif}}
     {{if 'CUdevResource_st.sm' in found_struct}}
     sm : CUdevSmResource
-        Resource corresponding to CU_DEV_RESOURCE_TYPE_SM `typename`.
+        Resource corresponding to CU_DEV_RESOURCE_TYPE_SM ``typename``.
     {{endif}}
     {{if 'CUdevResource_st.wqConfig' in found_struct}}
     wqConfig : CUdevWorkqueueConfigResource
         Resource corresponding to CU_DEV_RESOURCE_TYPE_WORKQUEUE_CONFIG
-        `typename`.
+        ``typename``.
     {{endif}}
     {{if 'CUdevResource_st.wq' in found_struct}}
     wq : CUdevWorkqueueResource
         Resource corresponding to CU_DEV_RESOURCE_TYPE_WORKQUEUE
-        `typename`.
+        ``typename``.
     {{endif}}
     {{if 'CUdevResource_st._oversize' in found_struct}}
     _oversize : bytes
diff --git a/cuda_bindings/cuda/bindings/driver.pyx.in b/cuda_bindings/cuda/bindings/driver.pyx.in
index 206c2557fc..07c4b5d4f7 100644
--- a/cuda_bindings/cuda/bindings/driver.pyx.in
+++ b/cuda_bindings/cuda/bindings/driver.pyx.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1711+g875fec45. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1725+g07a86fe3d.d20260603. Do not modify it directly.
 from typing import Any, Optional
 import cython
 import ctypes
@@ -173,7 +173,7 @@ CU_TENSOR_MAP_NUM_QWORDS = cydriver.CU_TENSOR_MAP_NUM_QWORDS
 #: Indicates that the external memory object is a dedicated resource
 CUDA_EXTERNAL_MEMORY_DEDICATED = cydriver.CUDA_EXTERNAL_MEMORY_DEDICATED
 
-#: When the `flags` parameter of
+#: When the ``flags`` parameter of
 #: :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS` contains this flag, it
 #: indicates that signaling an external semaphore object should skip
 #: performing appropriate memory synchronization operations over all the
@@ -183,7 +183,7 @@ CUDA_EXTERNAL_MEMORY_DEDICATED = cydriver.CUDA_EXTERNAL_MEMORY_DEDICATED
 #: the same NvSciBuf memory objects.
 CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC = cydriver.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC
 
-#: When the `flags` parameter of
+#: When the ``flags`` parameter of
 #: :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS` contains this flag, it
 #: indicates that waiting on an external semaphore object should skip
 #: performing appropriate memory synchronization operations over all the
@@ -193,13 +193,13 @@ CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC = cydriver.CUDA_EXTERNAL_SE
 #: the same NvSciBuf memory objects.
 CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC = cydriver.CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC
 
-#: When `flags` of :py:obj:`~.cuDeviceGetNvSciSyncAttributes` is set to
+#: When ``flags`` of :py:obj:`~.cuDeviceGetNvSciSyncAttributes` is set to
 #: this, it indicates that application needs signaler specific
 #: NvSciSyncAttr to be filled by
 #: :py:obj:`~.cuDeviceGetNvSciSyncAttributes`.
 CUDA_NVSCISYNC_ATTR_SIGNAL = cydriver.CUDA_NVSCISYNC_ATTR_SIGNAL
 
-#: When `flags` of :py:obj:`~.cuDeviceGetNvSciSyncAttributes` is set to
+#: When ``flags`` of :py:obj:`~.cuDeviceGetNvSciSyncAttributes` is set to
 #: this, it indicates that application needs waiter specific NvSciSyncAttr
 #: to be filled by :py:obj:`~.cuDeviceGetNvSciSyncAttributes`.
 CUDA_NVSCISYNC_ATTR_WAIT = cydriver.CUDA_NVSCISYNC_ATTR_WAIT
@@ -302,31 +302,31 @@ CU_LAUNCH_KERNEL_REQUIRED_BLOCK_DIM = cydriver.CU_LAUNCH_KERNEL_REQUIRED_BLOCK_D
 #: C++ compile time constant for CU_LAUNCH_PARAM_END
 CU_LAUNCH_PARAM_END_AS_INT = cydriver.CU_LAUNCH_PARAM_END_AS_INT
 
-#: End of array terminator for the `extra` parameter to
+#: End of array terminator for the ``extra`` parameter to
 #: :py:obj:`~.cuLaunchKernel`
 CU_LAUNCH_PARAM_END = cydriver.CU_LAUNCH_PARAM_END
 
 #: C++ compile time constant for CU_LAUNCH_PARAM_BUFFER_POINTER
 CU_LAUNCH_PARAM_BUFFER_POINTER_AS_INT = cydriver.CU_LAUNCH_PARAM_BUFFER_POINTER_AS_INT
 
-#: Indicator that the next value in the `extra` parameter to
+#: Indicator that the next value in the ``extra`` parameter to
 #: :py:obj:`~.cuLaunchKernel` will be a pointer to a buffer containing all
-#: kernel parameters used for launching kernel `f`. This buffer needs to
+#: kernel parameters used for launching kernel ``f``. This buffer needs to
 #: honor all alignment/padding requirements of the individual parameters.
 #: If :py:obj:`~.CU_LAUNCH_PARAM_BUFFER_SIZE` is not also specified in the
-#: `extra` array, then :py:obj:`~.CU_LAUNCH_PARAM_BUFFER_POINTER` will have
-#: no effect.
+#: ``extra`` array, then :py:obj:`~.CU_LAUNCH_PARAM_BUFFER_POINTER` will
+#: have no effect.
 CU_LAUNCH_PARAM_BUFFER_POINTER = cydriver.CU_LAUNCH_PARAM_BUFFER_POINTER
 
 #: C++ compile time constant for CU_LAUNCH_PARAM_BUFFER_SIZE
 CU_LAUNCH_PARAM_BUFFER_SIZE_AS_INT = cydriver.CU_LAUNCH_PARAM_BUFFER_SIZE_AS_INT
 
-#: Indicator that the next value in the `extra` parameter to
+#: Indicator that the next value in the ``extra`` parameter to
 #: :py:obj:`~.cuLaunchKernel` will be a pointer to a size_t which contains
 #: the size of the buffer specified with
 #: :py:obj:`~.CU_LAUNCH_PARAM_BUFFER_POINTER`. It is required that
 #: :py:obj:`~.CU_LAUNCH_PARAM_BUFFER_POINTER` also be specified in the
-#: `extra` array if the value associated with
+#: ``extra`` array if the value associated with
 #: :py:obj:`~.CU_LAUNCH_PARAM_BUFFER_SIZE` is not zero.
 CU_LAUNCH_PARAM_BUFFER_SIZE = cydriver.CU_LAUNCH_PARAM_BUFFER_SIZE
 
@@ -738,7 +738,7 @@ class CUstreamWaitValue_flags(_FastEnum):
 
     CU_STREAM_WAIT_VALUE_GEQ = (
         cydriver.CUstreamWaitValue_flags_enum.CU_STREAM_WAIT_VALUE_GEQ,
-        'Wait until (int32_t)(*addr - value) >= 0 (or int64_t for 64 bit values).\n'
+        'Wait until (int32_t)(\\*addr - value) >= 0 (or int64_t for 64 bit values).\n'
         'Note this is a cyclic comparison which ignores wraparound. (Default\n'
         'behavior.)\n'
     ){{endif}}
@@ -746,20 +746,20 @@ class CUstreamWaitValue_flags(_FastEnum):
 
     CU_STREAM_WAIT_VALUE_EQ = (
         cydriver.CUstreamWaitValue_flags_enum.CU_STREAM_WAIT_VALUE_EQ,
-        'Wait until *addr == value.\n'
+        'Wait until \\*addr == value.\n'
     ){{endif}}
     {{if 'CU_STREAM_WAIT_VALUE_AND' in found_values}}
 
     CU_STREAM_WAIT_VALUE_AND = (
         cydriver.CUstreamWaitValue_flags_enum.CU_STREAM_WAIT_VALUE_AND,
-        'Wait until (*addr & value) != 0.\n'
+        'Wait until (\\*addr & value) != 0.\n'
     ){{endif}}
     {{if 'CU_STREAM_WAIT_VALUE_NOR' in found_values}}
 
     CU_STREAM_WAIT_VALUE_NOR = (
         cydriver.CUstreamWaitValue_flags_enum.CU_STREAM_WAIT_VALUE_NOR,
-        'Wait until ~(*addr | value) != 0. Support for this operation can be queried\n'
-        'with :py:obj:`~.cuDeviceGetAttribute()` and\n'
+        'Wait until ~(\\*addr | value) != 0. Support for this operation can be\n'
+        'queried with :py:obj:`~.cuDeviceGetAttribute()` and\n'
         ':py:obj:`~.CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR`.\n'
     ){{endif}}
     {{if 'CU_STREAM_WAIT_VALUE_FLUSH' in found_values}}
@@ -885,19 +885,19 @@ class CUstreamAtomicReductionOpType(_FastEnum):
 
     CU_STREAM_ATOMIC_REDUCTION_OP_ADD = (
         cydriver.CUstreamAtomicReductionOpType_enum.CU_STREAM_ATOMIC_REDUCTION_OP_ADD,
-        'Performs an atomic ADD: *(address) = *(address) + value\n'
+        'Performs an atomic ADD: \\*(address) = \\*(address) + value\n'
     ){{endif}}
     {{if 'CU_STREAM_ATOMIC_REDUCTION_OP_AND' in found_values}}
 
     CU_STREAM_ATOMIC_REDUCTION_OP_AND = (
         cydriver.CUstreamAtomicReductionOpType_enum.CU_STREAM_ATOMIC_REDUCTION_OP_AND,
-        'Performs an atomic AND: *(address) = *(address) & value\n'
+        'Performs an atomic AND: \\*(address) = \\*(address) & value\n'
     ){{endif}}
     {{if 'CU_STREAM_ATOMIC_REDUCTION_OP_OR' in found_values}}
 
     CU_STREAM_ATOMIC_REDUCTION_OP_OR = (
         cydriver.CUstreamAtomicReductionOpType_enum.CU_STREAM_ATOMIC_REDUCTION_OP_OR,
-        'Performs an atomic OR: *(address) = *(address) | value\n'
+        'Performs an atomic OR: \\*(address) = \\*(address) | value\n'
     ){{endif}}
 
 {{endif}}
@@ -2061,7 +2061,7 @@ class CUdevice_attribute(_FastEnum):
         cydriver.CUdevice_attribute_enum.CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES,
         'The :py:obj:`~.CU_STREAM_WAIT_VALUE_FLUSH` flag and the\n'
         ':py:obj:`~.CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES` MemOp are supported on the\n'
-        'device. See :py:obj:`~.Stream Memory Operations` for additional details.\n'
+        'device. See Stream Memory Operations for additional details.\n'
     ){{endif}}
     {{if 'CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED' in found_values}}
 
@@ -2581,7 +2581,7 @@ class CUpointer_attribute(_FastEnum):
 
     CU_POINTER_ATTRIBUTE_IS_HW_DECOMPRESS_CAPABLE = (
         cydriver.CUpointer_attribute_enum.CU_POINTER_ATTRIBUTE_IS_HW_DECOMPRESS_CAPABLE,
-        'Returns in `*data` a boolean that indicates whether the pointer points to\n'
+        'Returns in ``*data`` a boolean that indicates whether the pointer points to\n'
         'memory that is capable to be used for hardware accelerated decompression.\n'
     ){{endif}}
 
@@ -3879,8 +3879,8 @@ class CUgraphConditionalNodeType(_FastEnum):
     CU_GRAPH_COND_TYPE_IF = (
         cydriver.CUgraphConditionalNodeType_enum.CU_GRAPH_COND_TYPE_IF,
         "Conditional 'if/else' Node. Body[0] executed if condition is non-zero. If\n"
-        '`size` == 2, an optional ELSE graph is created and this is executed if the\n'
-        'condition is zero.\n'
+        '``size`` == 2, an optional ELSE graph is created and this is executed if\n'
+        'the condition is zero.\n'
     ){{endif}}
     {{if 'CU_GRAPH_COND_TYPE_WHILE' in found_values}}
 
@@ -4037,7 +4037,7 @@ class CUgraphDependencyType(_FastEnum):
     CU_GRAPH_DEPENDENCY_TYPE_PROGRAMMATIC = (
         cydriver.CUgraphDependencyType_enum.CU_GRAPH_DEPENDENCY_TYPE_PROGRAMMATIC,
         'This dependency type allows the downstream node to use\n'
-        '`cudaGridDependencySynchronize()`. It may only be used between kernel\n'
+        '``cudaGridDependencySynchronize()``. It may only be used between kernel\n'
         'nodes, and must be used with either the\n'
         ':py:obj:`~.CU_GRAPH_KERNEL_NODE_PORT_PROGRAMMATIC` or\n'
         ':py:obj:`~.CU_GRAPH_KERNEL_NODE_PORT_LAUNCH_ORDER` outgoing port.\n'
@@ -4366,9 +4366,9 @@ class CUlaunchAttributeID(_FastEnum):
         ' This attribute will only take effect when a regular cluster dimension has\n'
         'been specified. The preferred substitute cluster dimension must be an\n'
         'integer multiple greater than zero of the regular cluster dimension and\n'
-        'must divide the grid. It must also be no more than `maxBlocksPerCluster`,\n'
-        "if it is set in the kernel's `__launch_bounds__`. Otherwise it must be less\n"
-        'than the maximum value the driver can support. Otherwise, setting this\n'
+        'must divide the grid. It must also be no more than ``maxBlocksPerCluster``,\n'
+        "if it is set in the kernel's ``__launch_bounds__``. Otherwise it must be\n"
+        'less than the maximum value the driver can support. Otherwise, setting this\n'
         'attribute to a value physically unable to fit on any particular device is\n'
         'permitted.\n'
     ){{endif}}
@@ -4387,8 +4387,8 @@ class CUlaunchAttributeID(_FastEnum):
         'on different GPUs) or if B is a higher priority than A. Exercise caution if\n'
         'such an ordering inversion could lead to deadlock.\n'
         ' A launch completion event is nominally similar to a programmatic event\n'
-        'with `triggerAtBlockStart` set except that it is not visible to\n'
-        '`cudaGridDependencySynchronize()` and can be used with compute capability\n'
+        'with ``triggerAtBlockStart`` set except that it is not visible to\n'
+        '``cudaGridDependencySynchronize()`` and can be used with compute capability\n'
         'less than 9.0.\n'
         ' The event supplied must not be an interprocess or interop event. The event\n'
         'must disable timing (i.e. must be created with the\n'
@@ -4642,14 +4642,14 @@ class CUlibraryOption(_FastEnum):
 
     CU_LIBRARY_BINARY_IS_PRESERVED = (
         cydriver.CUlibraryOption_enum.CU_LIBRARY_BINARY_IS_PRESERVED,
-        'Specifes that the argument `code` passed to :py:obj:`~.cuLibraryLoadData()`\n'
-        'will be preserved. Specifying this option will let the driver know that\n'
-        '`code` can be accessed at any point until :py:obj:`~.cuLibraryUnload()`.\n'
-        'The default behavior is for the driver to allocate and maintain its own\n'
-        'copy of `code`. Note that this is only a memory usage optimization hint and\n'
-        'the driver can choose to ignore it if required. Specifying this option with\n'
-        ':py:obj:`~.cuLibraryLoadFromFile()` is invalid and will return\n'
-        ':py:obj:`~.CUDA_ERROR_INVALID_VALUE`.\n'
+        'Specifies that the argument ``code`` passed to\n'
+        ':py:obj:`~.cuLibraryLoadData()` will be preserved. Specifying this option\n'
+        'will let the driver know that ``code`` can be accessed at any point until\n'
+        ':py:obj:`~.cuLibraryUnload()`. The default behavior is for the driver to\n'
+        'allocate and maintain its own copy of ``code``. Note that this is only a\n'
+        'memory usage optimization hint and the driver can choose to ignore it if\n'
+        'required. Specifying this option with :py:obj:`~.cuLibraryLoadFromFile()`\n'
+        'is invalid and will return :py:obj:`~.CUDA_ERROR_INVALID_VALUE`.\n'
     ){{endif}}
     {{if 'CU_LIBRARY_NUM_OPTIONS' in found_values}}
     CU_LIBRARY_NUM_OPTIONS = cydriver.CUlibraryOption_enum.CU_LIBRARY_NUM_OPTIONS{{endif}}
@@ -6846,7 +6846,7 @@ class CUgraphInstantiate_flags(_FastEnum):
         cydriver.CUgraphInstantiate_flags_enum.CUDA_GRAPH_INSTANTIATE_FLAG_UPLOAD,
         'Automatically upload the graph after instantiation. Only supported by\n'
         ':py:obj:`~.cuGraphInstantiateWithParams`. The upload will be performed\n'
-        'using the stream provided in `instantiateParams`.\n'
+        'using the stream provided in ``instantiateParams``.\n'
     ){{endif}}
     {{if 'CUDA_GRAPH_INSTANTIATE_FLAG_DEVICE_LAUNCH' in found_values}}
 
@@ -7136,7 +7136,7 @@ class CUgreenCtxCreate_flags(_FastEnum):
 
 class CUdevSmResourceGroup_flags(_FastEnum):
     """
-    Flags for a :py:obj:`~.CUdevSmResource` group
+    Flags for a CUdevSmResource group
     """
     {{if 'CU_DEV_SM_RESOURCE_GROUP_DEFAULT' in found_values}}
     CU_DEV_SM_RESOURCE_GROUP_DEFAULT = cydriver.CUdevSmResourceGroup_flags.CU_DEV_SM_RESOURCE_GROUP_DEFAULT{{endif}}
@@ -8414,9 +8414,9 @@ class CUkernelNodeAttrID(_FastEnum):
         ' This attribute will only take effect when a regular cluster dimension has\n'
         'been specified. The preferred substitute cluster dimension must be an\n'
         'integer multiple greater than zero of the regular cluster dimension and\n'
-        'must divide the grid. It must also be no more than `maxBlocksPerCluster`,\n'
-        "if it is set in the kernel's `__launch_bounds__`. Otherwise it must be less\n"
-        'than the maximum value the driver can support. Otherwise, setting this\n'
+        'must divide the grid. It must also be no more than ``maxBlocksPerCluster``,\n'
+        "if it is set in the kernel's ``__launch_bounds__``. Otherwise it must be\n"
+        'less than the maximum value the driver can support. Otherwise, setting this\n'
         'attribute to a value physically unable to fit on any particular device is\n'
         'permitted.\n'
     ){{endif}}
@@ -8435,8 +8435,8 @@ class CUkernelNodeAttrID(_FastEnum):
         'on different GPUs) or if B is a higher priority than A. Exercise caution if\n'
         'such an ordering inversion could lead to deadlock.\n'
         ' A launch completion event is nominally similar to a programmatic event\n'
-        'with `triggerAtBlockStart` set except that it is not visible to\n'
-        '`cudaGridDependencySynchronize()` and can be used with compute capability\n'
+        'with ``triggerAtBlockStart`` set except that it is not visible to\n'
+        '``cudaGridDependencySynchronize()`` and can be used with compute capability\n'
         'less than 9.0.\n'
         ' The event supplied must not be an interprocess or interop event. The event\n'
         'must disable timing (i.e. must be created with the\n'
@@ -8662,9 +8662,9 @@ class CUstreamAttrID(_FastEnum):
         ' This attribute will only take effect when a regular cluster dimension has\n'
         'been specified. The preferred substitute cluster dimension must be an\n'
         'integer multiple greater than zero of the regular cluster dimension and\n'
-        'must divide the grid. It must also be no more than `maxBlocksPerCluster`,\n'
-        "if it is set in the kernel's `__launch_bounds__`. Otherwise it must be less\n"
-        'than the maximum value the driver can support. Otherwise, setting this\n'
+        'must divide the grid. It must also be no more than ``maxBlocksPerCluster``,\n'
+        "if it is set in the kernel's ``__launch_bounds__``. Otherwise it must be\n"
+        'less than the maximum value the driver can support. Otherwise, setting this\n'
         'attribute to a value physically unable to fit on any particular device is\n'
         'permitted.\n'
     ){{endif}}
@@ -8683,8 +8683,8 @@ class CUstreamAttrID(_FastEnum):
         'on different GPUs) or if B is a higher priority than A. Exercise caution if\n'
         'such an ordering inversion could lead to deadlock.\n'
         ' A launch completion event is nominally similar to a programmatic event\n'
-        'with `triggerAtBlockStart` set except that it is not visible to\n'
-        '`cudaGridDependencySynchronize()` and can be used with compute capability\n'
+        'with ``triggerAtBlockStart`` set except that it is not visible to\n'
+        '``cudaGridDependencySynchronize()`` and can be used with compute capability\n'
         'less than 9.0.\n'
         ' The event supplied must not be an interprocess or interop event. The event\n'
         'must disable timing (i.e. must be created with the\n'
@@ -11619,7 +11619,7 @@ cdef class CUasyncNotificationInfo_st:
     {{endif}}
     {{if 'CUasyncNotificationInfo_st.info' in found_struct}}
     info : anon_union2
-        Information about the notification. `typename` must be checked in
+        Information about the notification. ``typename`` must be checked in
         order to interpret this field.
     {{endif}}
 
@@ -13376,12 +13376,12 @@ cdef class CUDA_CONDITIONAL_NODE_PARAMS:
         graphs at any level, must belong to the same CUDA context.
         These graphs may be populated using graph node creation APIs or
         cuStreamBeginCaptureToGraph.  CU_GRAPH_COND_TYPE_IF: phGraph_out[0]
-        is executed when the condition is non-zero. If `size` == 2,
+        is executed when the condition is non-zero. If ``size`` == 2,
         phGraph_out[1] will be executed when the condition is zero.
         CU_GRAPH_COND_TYPE_WHILE: phGraph_out[0] is executed as long as the
         condition is non-zero. CU_GRAPH_COND_TYPE_SWITCH: phGraph_out[n] is
         executed when the condition is equal to n. If the condition >=
-        `size`, no body graph is executed.
+        ``size``, no body graph is executed.
     {{endif}}
     {{if 'CUDA_CONDITIONAL_NODE_PARAMS.ctx' in found_struct}}
     ctx : CUcontext
@@ -13522,7 +13522,7 @@ cdef class CUgraphEdgeData_st:
         node on the edge. The meaning is specfic to the node type. A value
         of 0 in all cases means full completion of the upstream node, with
         memory visibility to the downstream node or portion thereof
-        (indicated by `to_port`).   Only kernel nodes define non-zero
+        (indicated by ``to_port``).   Only kernel nodes define non-zero
         ports. A kernel node can use the following output port types:
         CU_GRAPH_KERNEL_NODE_PORT_DEFAULT,
         CU_GRAPH_KERNEL_NODE_PORT_PROGRAMMATIC, or
@@ -13531,7 +13531,7 @@ cdef class CUgraphEdgeData_st:
     {{if 'CUgraphEdgeData_st.to_port' in found_struct}}
     to_port : bytes
         This indicates what portion of the downstream node is dependent on
-        the upstream node or portion thereof (indicated by `from_port`).
+        the upstream node or portion thereof (indicated by ``from_port``).
         The meaning is specific to the node type. A value of 0 in all cases
         means the entirety of the downstream node is dependent on the
         upstream work.   Currently no node types define non-zero ports.
@@ -13776,7 +13776,7 @@ cdef class CUDA_GRAPH_INSTANTIATE_PARAMS_st:
 
 cdef class CUlaunchMemSyncDomainMap_st:
     """
-    Memory Synchronization Domain map  See ::cudaLaunchMemSyncDomain.
+    Memory Synchronization Domain map  See ``cudaLaunchMemSyncDomain``.
     By default, kernels are launched in domain 0. Kernel launched with
     CU_LAUNCH_MEM_SYNC_DOMAIN_REMOTE will have a different domain ID.
     User may also alter the domain ID with CUlaunchMemSyncDomainMap for
@@ -14287,11 +14287,11 @@ cdef class CUlaunchAttributeValue_union:
     clusterDim : anon_struct1
         Value of launch attribute CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
         that represents the desired cluster dimensions for the kernel.
-        Opaque type with the following fields: - `x` - The X dimension of
+        Opaque type with the following fields: - ``x`` - The X dimension of
         the cluster, in blocks. Must be a divisor of the grid X dimension.
-        - `y` - The Y dimension of the cluster, in blocks. Must be a
-        divisor of the grid Y dimension.    - `z` - The Z dimension of the
-        cluster, in blocks. Must be a divisor of the grid Z dimension.
+        - ``y`` - The Y dimension of the cluster, in blocks. Must be a
+        divisor of the grid Y dimension.    - ``z`` - The Z dimension of
+        the cluster, in blocks. Must be a divisor of the grid Z dimension.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.clusterSchedulingPolicyPreference' in found_struct}}
     clusterSchedulingPolicyPreference : CUclusterSchedulingPolicy
@@ -14307,18 +14307,18 @@ cdef class CUlaunchAttributeValue_union:
     {{if 'CUlaunchAttributeValue_union.programmaticEvent' in found_struct}}
     programmaticEvent : anon_struct2
         Value of launch attribute CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT
-        with the following fields: - `CUevent` event - Event to fire when
-        all blocks trigger it.    - `Event` record flags, see
+        with the following fields: - ``CUevent`` event - Event to fire when
+        all blocks trigger it.    - ``Event`` record flags, see
         cuEventRecordWithFlags. Does not accept :CU_EVENT_RECORD_EXTERNAL.
-        - `triggerAtBlockStart` - If this is set to non-0, each block
+        - ``triggerAtBlockStart`` - If this is set to non-0, each block
         launch will automatically trigger the event.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.launchCompletionEvent' in found_struct}}
     launchCompletionEvent : anon_struct3
         Value of launch attribute
         CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT with the following
-        fields: - `CUevent` event - Event to fire when the last block
-        launches    - `int` flags; - Event record flags, see
+        fields: - ``CUevent`` event - Event to fire when the last block
+        launches    - ``int`` flags; - Event record flags, see
         cuEventRecordWithFlags. Does not accept CU_EVENT_RECORD_EXTERNAL.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.priority' in found_struct}}
@@ -14341,23 +14341,23 @@ cdef class CUlaunchAttributeValue_union:
         Value of launch attribute
         CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION that represents the
         desired preferred cluster dimensions for the kernel. Opaque type
-        with the following fields: - `x` - The X dimension of the preferred
-        cluster, in blocks. Must be a divisor of the grid X dimension, and
-        must be a multiple of the `x` field of
-        CUlaunchAttributeValue::clusterDim.    - `y` - The Y dimension of
+        with the following fields: - ``x`` - The X dimension of the
+        preferred cluster, in blocks. Must be a divisor of the grid X
+        dimension, and must be a multiple of the ``x`` field of
+        CUlaunchAttributeValue::clusterDim.    - ``y`` - The Y dimension of
         the preferred cluster, in blocks. Must be a divisor of the grid Y
-        dimension, and must be a multiple of the `y` field of
-        CUlaunchAttributeValue::clusterDim.    - `z` - The Z dimension of
-        the preferred cluster, in blocks. Must be equal to the `z` field of
-        CUlaunchAttributeValue::clusterDim.
+        dimension, and must be a multiple of the ``y`` field of
+        CUlaunchAttributeValue::clusterDim.    - ``z`` - The Z dimension of
+        the preferred cluster, in blocks. Must be equal to the ``z`` field
+        of CUlaunchAttributeValue::clusterDim.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.deviceUpdatableKernelNode' in found_struct}}
     deviceUpdatableKernelNode : anon_struct5
         Value of launch attribute
         CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE. with the
-        following fields: - `int` deviceUpdatable - Whether or not the
+        following fields: - ``int`` deviceUpdatable - Whether or not the
         resulting kernel node should be device-updatable.    -
-        `CUgraphDeviceNode` devNode - Returns a handle to pass to the
+        ``CUgraphDeviceNode`` devNode - Returns a handle to pass to the
         various device-side update functions.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.sharedMemCarveout' in found_struct}}
@@ -22316,7 +22316,7 @@ cdef class CUmemAccessDesc_st:
     {{endif}}
     {{if 'CUmemAccessDesc_st.flags' in found_struct}}
     flags : CUmemAccess_flags
-        ::CUmemProt accessibility flags to set on the request
+        ``CUmemProt`` accessibility flags to set on the request
     {{endif}}
 
     Methods
@@ -23475,8 +23475,8 @@ cdef class CUDA_MEM_ALLOC_NODE_PARAMS_v1_st:
     {{if 'CUDA_MEM_ALLOC_NODE_PARAMS_v1_st.poolProps' in found_struct}}
     poolProps : CUmemPoolProps
         in: location where the allocation should reside (specified in
-        ::location). ::handleTypes must be CU_MEM_HANDLE_TYPE_NONE. IPC is
-        not supported.
+        ``location``). ``handleTypes`` must be CU_MEM_HANDLE_TYPE_NONE. IPC
+        is not supported.
     {{endif}}
     {{if 'CUDA_MEM_ALLOC_NODE_PARAMS_v1_st.accessDescs' in found_struct}}
     accessDescs : CUmemAccessDesc
@@ -23638,8 +23638,8 @@ cdef class CUDA_MEM_ALLOC_NODE_PARAMS_v2_st:
     {{if 'CUDA_MEM_ALLOC_NODE_PARAMS_v2_st.poolProps' in found_struct}}
     poolProps : CUmemPoolProps
         in: location where the allocation should reside (specified in
-        ::location). ::handleTypes must be CU_MEM_HANDLE_TYPE_NONE. IPC is
-        not supported.
+        ``location``). ``handleTypes`` must be CU_MEM_HANDLE_TYPE_NONE. IPC
+        is not supported.
     {{endif}}
     {{if 'CUDA_MEM_ALLOC_NODE_PARAMS_v2_st.accessDescs' in found_struct}}
     accessDescs : CUmemAccessDesc
@@ -25767,17 +25767,17 @@ cdef class CUdevResource_st:
     {{endif}}
     {{if 'CUdevResource_st.sm' in found_struct}}
     sm : CUdevSmResource
-        Resource corresponding to CU_DEV_RESOURCE_TYPE_SM `typename`.
+        Resource corresponding to CU_DEV_RESOURCE_TYPE_SM ``typename``.
     {{endif}}
     {{if 'CUdevResource_st.wqConfig' in found_struct}}
     wqConfig : CUdevWorkqueueConfigResource
         Resource corresponding to CU_DEV_RESOURCE_TYPE_WORKQUEUE_CONFIG
-        `typename`.
+        ``typename``.
     {{endif}}
     {{if 'CUdevResource_st.wq' in found_struct}}
     wq : CUdevWorkqueueResource
         Resource corresponding to CU_DEV_RESOURCE_TYPE_WORKQUEUE
-        `typename`.
+        ``typename``.
     {{endif}}
     {{if 'CUdevResource_st._oversize' in found_struct}}
     _oversize : bytes
@@ -26673,10 +26673,10 @@ cdef class VdpOutputSurface:
 def cuGetErrorString(error not None : CUresult):
     """ Gets the string description of an error code.
 
-    Sets `*pStr` to the address of a NULL-terminated string description of
-    the error code `error`. If the error code is not recognized,
-    :py:obj:`~.CUDA_ERROR_INVALID_VALUE` will be returned and `*pStr` will
-    be set to the NULL address.
+    Sets ``*pStr`` to the address of a NULL-terminated string description
+    of the error code ``error``. If the error code is not recognized,
+    :py:obj:`~.CUDA_ERROR_INVALID_VALUE` will be returned and ``*pStr``
+    will be set to the NULL address.
 
     Parameters
     ----------
@@ -26709,10 +26709,10 @@ def cuGetErrorString(error not None : CUresult):
 def cuGetErrorName(error not None : CUresult):
     """ Gets the string representation of an error code enum name.
 
-    Sets `*pStr` to the address of a NULL-terminated string representation
-    of the name of the enum error code `error`. If the error code is not
-    recognized, :py:obj:`~.CUDA_ERROR_INVALID_VALUE` will be returned and
-    `*pStr` will be set to the NULL address.
+    Sets ``*pStr`` to the address of a NULL-terminated string
+    representation of the name of the enum error code ``error``. If the
+    error code is not recognized, :py:obj:`~.CUDA_ERROR_INVALID_VALUE` will
+    be returned and ``*pStr`` will be set to the NULL address.
 
     Parameters
     ----------
@@ -26743,7 +26743,7 @@ def cuGetErrorName(error not None : CUresult):
 
 @cython.embedsignature(True)
 def cuInit(unsigned int Flags):
-    """ Initialize the CUDA driver API Initializes the driver API and must be called before any other function from the driver API in the current process. Currently, the `Flags` parameter must be 0. If :py:obj:`~.cuInit()` has not been called, any function from the driver API will return :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`.
+    """ Initialize the CUDA driver API Initializes the driver API and must be called before any other function from the driver API in the current process. Currently, the ``Flags`` parameter must be 0. If :py:obj:`~.cuInit()` has not been called, any function from the driver API will return :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`.
 
     Note: cuInit preloads various libraries needed for JIT compilation. To
     opt-out of this behavior, set the environment variable
@@ -26772,12 +26772,12 @@ def cuInit(unsigned int Flags):
 def cuDriverGetVersion():
     """ Returns the latest CUDA version supported by driver.
 
-    Returns in `*driverVersion` the version of CUDA supported by the
+    Returns in ``*driverVersion`` the version of CUDA supported by the
     driver. The version is returned as (1000 * major + 10 * minor). For
     example, CUDA 9.2 would be represented by 9020.
 
     This function automatically returns
-    :py:obj:`~.CUDA_ERROR_INVALID_VALUE` if `driverVersion` is NULL.
+    :py:obj:`~.CUDA_ERROR_INVALID_VALUE` if ``driverVersion`` is NULL.
 
     Returns
     -------
@@ -26804,8 +26804,8 @@ def cuDriverGetVersion():
 def cuDeviceGet(int ordinal):
     """ Returns a handle to a compute device.
 
-    Returns in `*device` a device handle given an ordinal in the range [0,
-    :py:obj:`~.cuDeviceGetCount()`-1].
+    Returns in ``*device`` a device handle given an ordinal in the range
+    [0, :py:obj:`~.cuDeviceGetCount()`-1].
 
     Parameters
     ----------
@@ -26837,7 +26837,7 @@ def cuDeviceGet(int ordinal):
 def cuDeviceGetCount():
     """ Returns the number of compute-capable devices.
 
-    Returns in `*count` the number of devices with compute capability
+    Returns in ``*count`` the number of devices with compute capability
     greater than or equal to 2.0 that are available for execution. If there
     is no such device, :py:obj:`~.cuDeviceGetCount()` returns 0.
 
@@ -26866,15 +26866,16 @@ def cuDeviceGetCount():
 def cuDeviceGetName(int length, dev):
     """ Returns an identifier string for the device.
 
-    Returns an ASCII string identifying the device `dev` in the NULL-
-    terminated string pointed to by `name`. `length` specifies the maximum
-    length of the string that may be returned. `name` is shortened to the
-    specified `length`, if `length` is less than the device name
+    Returns an ASCII string identifying the device ``dev`` in the NULL-
+    terminated string pointed to by ``name``. ``length`` specifies the
+    maximum length of the string that may be returned. ``name`` is
+    shortened to the specified ``length``, if ``length`` is less than the
+    device name
 
     Parameters
     ----------
     length : int
-        Maximum length of string to store in `name`
+        Maximum length of string to store in ``name``
     dev : :py:obj:`~.CUdevice`
         Device to get identifier string for
 
@@ -26912,9 +26913,9 @@ def cuDeviceGetName(int length, dev):
 def cuDeviceGetUuid(dev):
     """ Return an UUID for the device.
 
-    Returns 16-octets identifying the device `dev` in the structure pointed
-    by the `uuid`. If the device is in MIG mode, returns its MIG UUID which
-    uniquely identifies the subscribed MIG compute instance.
+    Returns 16-octets identifying the device ``dev`` in the structure
+    pointed by the ``uuid``. If the device is in MIG mode, returns its MIG
+    UUID which uniquely identifies the subscribed MIG compute instance.
 
     Parameters
     ----------
@@ -26954,8 +26955,8 @@ def cuDeviceGetUuid(dev):
 def cuDeviceGetLuid(dev):
     """ Return an LUID and device node mask for the device.
 
-    Return identifying information (`luid` and `deviceNodeMask`) to allow
-    matching device with graphics APIs.
+    Return identifying information (``luid`` and ``deviceNodeMask``) to
+    allow matching device with graphics APIs.
 
     Parameters
     ----------
@@ -26998,8 +26999,8 @@ def cuDeviceGetLuid(dev):
 def cuDeviceTotalMem(dev):
     """ Returns the total amount of memory on the device.
 
-    Returns in `*bytes` the total amount of memory available on the device
-    `dev` in bytes.
+    Returns in ``*bytes`` the total amount of memory available on the
+    device ``dev`` in bytes.
 
     Parameters
     ----------
@@ -27039,9 +27040,9 @@ def cuDeviceTotalMem(dev):
 def cuDeviceGetTexture1DLinearMaxWidth(pformat not None : CUarray_format, unsigned numChannels, dev):
     """ Returns the maximum number of elements allocatable in a 1D linear texture for a given texture element size.
 
-    Returns in `maxWidthInElements` the maximum number of texture elements
-    allocatable in a 1D linear texture for given `pformat` and
-    `numChannels`.
+    Returns in ``maxWidthInElements`` the maximum number of texture
+    elements allocatable in a 1D linear texture for given ``pformat`` and
+    ``numChannels``.
 
     Parameters
     ----------
@@ -27058,7 +27059,7 @@ def cuDeviceGetTexture1DLinearMaxWidth(pformat not None : CUarray_format, unsign
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`
     maxWidthInElements : int
         Returned maximum number of texture elements allocatable for given
-        `pformat` and `numChannels`.
+        ``pformat`` and ``numChannels``.
 
     See Also
     --------
@@ -27087,8 +27088,8 @@ def cuDeviceGetTexture1DLinearMaxWidth(pformat not None : CUarray_format, unsign
 def cuDeviceGetAttribute(attrib not None : CUdevice_attribute, dev):
     """ Returns information about the device.
 
-    Returns in `*pi` the integer value of the attribute `attrib` on device
-    `dev`.
+    Returns in ``*pi`` the integer value of the attribute ``attrib`` on
+    device ``dev``.
 
     Parameters
     ----------
@@ -27131,20 +27132,21 @@ def cuDeviceGetAttribute(attrib not None : CUdevice_attribute, dev):
 def cuDeviceGetHostAtomicCapabilities(operations : Optional[tuple[CUatomicOperation] | list[CUatomicOperation]], unsigned int count, dev):
     """ Queries details about atomic operations supported between the device and host.
 
-    Returns in `*capabilities` the details about requested atomic
-    `*operations` over the the link between `dev` and the host. The
-    allocated size of `*operations` and `*capabilities` must be `count`.
+    Returns in ``*capabilities`` the details about requested atomic
+    ``*operations`` over the link between ``dev`` and the host. The
+    allocated size of ``*operations`` and ``*capabilities`` must be
+    ``count``.
 
-    For each :py:obj:`~.CUatomicOperation` in `*operations`, the
-    corresponding result in `*capabilities` will be a bitmask indicating
+    For each :py:obj:`~.CUatomicOperation` in ``*operations``, the
+    corresponding result in ``*capabilities`` will be a bitmask indicating
     which of :py:obj:`~.CUatomicOperationCapability` the link supports
     natively.
 
-    Returns :py:obj:`~.CUDA_ERROR_INVALID_DEVICE` if `dev` is not valid.
+    Returns :py:obj:`~.CUDA_ERROR_INVALID_DEVICE` if ``dev`` is not valid.
 
-    Returns :py:obj:`~.CUDA_ERROR_INVALID_VALUE` if `*capabilities` or
-    `*operations` is NULL, if `count` is 0, or if any of `*operations` is
-    not valid.
+    Returns :py:obj:`~.CUDA_ERROR_INVALID_VALUE` if ``*capabilities`` or
+    ``*operations`` is NULL, if ``count`` is 0, or if any of
+    ``*operations`` is not valid.
 
     Parameters
     ----------
@@ -27202,20 +27204,20 @@ def cuDeviceGetHostAtomicCapabilities(operations : Optional[tuple[CUatomicOperat
 def cuDeviceGetNvSciSyncAttributes(nvSciSyncAttrList, dev, int flags):
     """ Return NvSciSync attributes that this device can support.
 
-    Returns in `nvSciSyncAttrList`, the properties of NvSciSync that this
-    CUDA device, `dev` can support. The returned `nvSciSyncAttrList` can be
-    used to create an NvSciSync object that matches this device's
+    Returns in ``nvSciSyncAttrList``, the properties of NvSciSync that this
+    CUDA device, ``dev`` can support. The returned ``nvSciSyncAttrList``
+    can be used to create an NvSciSync object that matches this device's
     capabilities.
 
-    If NvSciSyncAttrKey_RequiredPerm field in `nvSciSyncAttrList` is
+    If NvSciSyncAttrKey_RequiredPerm field in ``nvSciSyncAttrList`` is
     already set this API will return :py:obj:`~.CUDA_ERROR_INVALID_VALUE`.
 
-    The applications should set `nvSciSyncAttrList` to a valid
+    The applications should set ``nvSciSyncAttrList`` to a valid
     NvSciSyncAttrList failing which this API will return
     :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`.
 
-    The `flags` controls how applications intends to use the NvSciSync
-    created from the `nvSciSyncAttrList`. The valid flags are:
+    The ``flags`` controls how applications intends to use the NvSciSync
+    created from the ``nvSciSyncAttrList``. The valid flags are:
 
     - :py:obj:`~.CUDA_NVSCISYNC_ATTR_SIGNAL`, specifies that the
       applications intends to signal an NvSciSync on this CUDA device.
@@ -27227,31 +27229,32 @@ def cuDeviceGetNvSciSyncAttributes(nvSciSyncAttrList, dev, int flags):
     :py:obj:`~.CUDA_ERROR_INVALID_VALUE`. Both the flags are orthogonal to
     one another: a developer may set both these flags that allows to set
     both wait and signal specific attributes in the same
-    `nvSciSyncAttrList`.
+    ``nvSciSyncAttrList``.
 
-    Note that this API updates the input `nvSciSyncAttrList` with values
+    Note that this API updates the input ``nvSciSyncAttrList`` with values
     equivalent to the following public attribute key-values:
     NvSciSyncAttrKey_RequiredPerm is set to
 
     - NvSciSyncAccessPerm_SignalOnly if
-      :py:obj:`~.CUDA_NVSCISYNC_ATTR_SIGNAL` is set in `flags`.
+      :py:obj:`~.CUDA_NVSCISYNC_ATTR_SIGNAL` is set in ``flags``.
 
     - NvSciSyncAccessPerm_WaitOnly if :py:obj:`~.CUDA_NVSCISYNC_ATTR_WAIT`
-      is set in `flags`.
+      is set in ``flags``.
 
     - NvSciSyncAccessPerm_WaitSignal if both
       :py:obj:`~.CUDA_NVSCISYNC_ATTR_WAIT` and
-      :py:obj:`~.CUDA_NVSCISYNC_ATTR_SIGNAL` are set in `flags`.
+      :py:obj:`~.CUDA_NVSCISYNC_ATTR_SIGNAL` are set in ``flags``.
       NvSciSyncAttrKey_PrimitiveInfo is set to
 
-    - NvSciSyncAttrValPrimitiveType_SysmemSemaphore on any valid `device`.
+    - NvSciSyncAttrValPrimitiveType_SysmemSemaphore on any valid
+      ``device``.
 
-    - NvSciSyncAttrValPrimitiveType_Syncpoint if `device` is a Tegra
+    - NvSciSyncAttrValPrimitiveType_Syncpoint if ``device`` is a Tegra
       device.
 
-    - NvSciSyncAttrValPrimitiveType_SysmemSemaphorePayload64b if `device`
+    - NvSciSyncAttrValPrimitiveType_SysmemSemaphorePayload64b if ``device``
       is GA10X+. NvSciSyncAttrKey_GpuId is set to the same UUID that is
-      returned for this `device` from :py:obj:`~.cuDeviceGetUuid`.
+      returned for this ``device`` from :py:obj:`~.cuDeviceGetUuid`.
 
     :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`,
     :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`,
@@ -27438,8 +27441,8 @@ def cuDeviceGetDefaultMemPool(dev):
 def cuDeviceGetExecAffinitySupport(typename not None : CUexecAffinityType, dev):
     """ Returns information about the execution affinity support of the device.
 
-    Returns in `*pi` whether execution affinity type `typename` is
-    supported by device `dev`. The supported types are:
+    Returns in ``*pi`` whether execution affinity type ``typename`` is
+    supported by device ``dev``. The supported types are:
 
     - :py:obj:`~.CU_EXEC_AFFINITY_TYPE_SM_COUNT`: 1 if context with limited
       SMs is supported by the device, or 0 if not;
@@ -27456,7 +27459,7 @@ def cuDeviceGetExecAffinitySupport(typename not None : CUexecAffinityType, dev):
     CUresult
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`
     pi : int
-        1 if the execution affinity type `typename` is supported by the
+        1 if the execution affinity type ``typename`` is supported by the
         device, or 0 if not
 
     See Also
@@ -27542,7 +27545,7 @@ def cuDeviceGetProperties(dev):
     This function was deprecated as of CUDA 5.0 and replaced by
     :py:obj:`~.cuDeviceGetAttribute()`.
 
-    Returns in `*prop` the properties of device `dev`. The
+    Returns in ``*prop`` the properties of device ``dev``. The
     :py:obj:`~.CUdevprop` structure is defined as:
 
     **View CUDA Toolkit Documentation for a C++ code example**
@@ -27552,11 +27555,11 @@ def cuDeviceGetProperties(dev):
     - :py:obj:`~.maxThreadsPerBlock` is the maximum number of threads per
       block;
 
-    - :py:obj:`~.maxThreadsDim`[3] is the maximum sizes of each dimension
-      of a block;
+    - :py:obj:`~.maxThreadsDim` ``[3]`` is the maximum sizes of each
+      dimension of a block;
 
-    - :py:obj:`~.maxGridSize`[3] is the maximum sizes of each dimension of
-      a grid;
+    - :py:obj:`~.maxGridSize` ``[3]`` is the maximum sizes of each
+      dimension of a grid;
 
     - :py:obj:`~.sharedMemPerBlock` is the total amount of shared memory
       available per block in bytes;
@@ -27622,8 +27625,8 @@ def cuDeviceComputeCapability(dev):
     This function was deprecated as of CUDA 5.0 and its functionality
     superseded by :py:obj:`~.cuDeviceGetAttribute()`.
 
-    Returns in `*major` and `*minor` the major and minor revision numbers
-    that define the compute capability of the device `dev`.
+    Returns in ``*major`` and ``*minor`` the major and minor revision
+    numbers that define the compute capability of the device ``dev``.
 
     Parameters
     ----------
@@ -27678,8 +27681,8 @@ def cuDevicePrimaryCtxRetain(dev):
     :py:obj:`~.CU_COMPUTEMODE_PROHIBITED`. The function
     :py:obj:`~.cuDeviceGetAttribute()` can be used with
     :py:obj:`~.CU_DEVICE_ATTRIBUTE_COMPUTE_MODE` to determine the compute
-    mode of the device. The `nvidia-smi` tool can be used to set the
-    compute mode for devices. Documentation for `nvidia-smi` can be
+    mode of the device. The ``nvidia-smi`` tool can be used to set the
+    compute mode for devices. Documentation for ``nvidia-smi`` can be
     obtained by passing a -h option to it.
 
     Please note that the primary context always supports pinned
@@ -27773,8 +27776,8 @@ def cuDevicePrimaryCtxSetFlags(dev, unsigned int flags):
     Sets the flags for the primary context on the device overwriting
     perviously set ones.
 
-    The three LSBs of the `flags` parameter can be used to control how the
-    OS thread, which owns the CUDA context at the time of an API call,
+    The three LSBs of the ``flags`` parameter can be used to control how
+    the OS thread, which owns the CUDA context at the time of an API call,
     interacts with the OS scheduler when waiting for results from the GPU.
     Only one of the scheduling flags can be set when creating a context.
 
@@ -27797,15 +27800,16 @@ def cuDevicePrimaryCtxSetFlags(dev, unsigned int flags):
       finish work.   Deprecated: This flag was deprecated as of CUDA 4.0
       and was replaced with :py:obj:`~.CU_CTX_SCHED_BLOCKING_SYNC`.
 
-    - :py:obj:`~.CU_CTX_SCHED_AUTO`: The default value if the `flags`
+    - :py:obj:`~.CU_CTX_SCHED_AUTO`: The default value if the ``flags``
       parameter is zero, uses a heuristic based on the number of active
-      CUDA contexts in the process `C` and the number of logical processors
-      in the system `P`. If `C` > `P`, then CUDA will yield to other OS
-      threads when waiting for the GPU (:py:obj:`~.CU_CTX_SCHED_YIELD`),
-      otherwise CUDA will not yield while waiting for results and actively
-      spin on the processor (:py:obj:`~.CU_CTX_SCHED_SPIN`). Additionally,
-      on Tegra devices, :py:obj:`~.CU_CTX_SCHED_AUTO` uses a heuristic
-      based on the power profile of the platform and may choose
+      CUDA contexts in the process ``C`` and the number of logical
+      processors in the system ``P``. If ``C`` > ``P``, then CUDA will
+      yield to other OS threads when waiting for the GPU
+      (:py:obj:`~.CU_CTX_SCHED_YIELD`), otherwise CUDA will not yield while
+      waiting for results and actively spin on the processor
+      (:py:obj:`~.CU_CTX_SCHED_SPIN`). Additionally, on Tegra devices,
+      :py:obj:`~.CU_CTX_SCHED_AUTO` uses a heuristic based on the power
+      profile of the platform and may choose
       :py:obj:`~.CU_CTX_SCHED_BLOCKING_SYNC` for low-powered devices.
 
     - :py:obj:`~.CU_CTX_LMEM_RESIZE_TO_MAX`: Instruct CUDA to not reduce
@@ -27834,7 +27838,7 @@ def cuDevicePrimaryCtxSetFlags(dev, unsigned int flags):
       create a coredump if data is written to a certain pipe that is
       present in the OS space. These environment variables are described in
       the CUDA-GDB user guide under the "GPU core dump support" section. It
-      is important to note that the pipe name `must` be set with
+      is important to note that the pipe name *must* be set with
       :py:obj:`~.cuCoredumpSetAttributeGlobal` before creating the context
       if this flag is used. Setting this flag implies that
       :py:obj:`~.CU_CTX_COREDUMP_ENABLE` is set. The initial settings will
@@ -27884,8 +27888,8 @@ def cuDevicePrimaryCtxSetFlags(dev, unsigned int flags):
 def cuDevicePrimaryCtxGetState(dev):
     """ Get the state of the primary context.
 
-    Returns in `*flags` the flags for the primary context of `dev`, and in
-    `*active` whether it is active. See
+    Returns in ``*flags`` the flags for the primary context of ``dev``, and
+    in ``*active`` whether it is active. See
     :py:obj:`~.cuDevicePrimaryCtxSetFlags` for flag values.
 
     Parameters
@@ -27975,50 +27979,50 @@ def cuCtxCreate(ctxCreateParams : Optional[CUctxCreateParams], unsigned int flag
     """ Create a CUDA context.
 
     Creates a new CUDA context and associates it with the calling thread.
-    The `flags` parameter is described below. The context is created with a
-    usage count of 1 and the caller of :py:obj:`~.cuCtxCreate()` must call
-    :py:obj:`~.cuCtxDestroy()` when done using the context. If a context is
-    already current to the thread, it is supplanted by the newly created
-    context and may be restored by a subsequent call to
+    The ``flags`` parameter is described below. The context is created with
+    a usage count of 1 and the caller of :py:obj:`~.cuCtxCreate()` must
+    call :py:obj:`~.cuCtxDestroy()` when done using the context. If a
+    context is already current to the thread, it is supplanted by the newly
+    created context and may be restored by a subsequent call to
     :py:obj:`~.cuCtxPopCurrent()`.
 
-    A regular CUDA context can be created by setting `ctxCreateParams` to
+    A regular CUDA context can be created by setting ``ctxCreateParams`` to
     NULL.
 
     A CUDA context can be created with execution affinity. The type and the
     amount of execution resource the context can use is limited by
-    `paramsArray` and `numExecAffinityParams` in `execAffinity`. The
-    `paramsArray` is an array of `CUexecAffinityParam` and the
-    `numExecAffinityParams` describes the size of the paramsArray. If two
-    `CUexecAffinityParam` in the array have the same type, the latter
+    ``paramsArray`` and ``numExecAffinityParams`` in ``execAffinity``. The
+    ``paramsArray`` is an array of ``CUexecAffinityParam`` and the
+    ``numExecAffinityParams`` describes the size of the paramsArray. If two
+    ``CUexecAffinityParam`` in the array have the same type, the latter
     execution affinity parameter overrides the former execution affinity
     parameter. The supported execution affinity types are:
 
     - :py:obj:`~.CU_EXEC_AFFINITY_TYPE_SM_COUNT` limits the portion of SMs
       that the context can use. The portion of SMs is specified as the
-      number of SMs via `CUexecAffinitySmCount`. This limit will be
+      number of SMs via ``CUexecAffinitySmCount``. This limit will be
       internally rounded up to the next hardware-supported amount. Hence,
       it is imperative to query the actual execution affinity of the
       context via :py:obj:`~.cuCtxGetExecAffinity` after context creation.
       Currently, this attribute is only supported under Volta+ MPS.
 
     A CUDA context can be created in CIG(CUDA in Graphics) mode by setting
-    `cigParams`. Data from graphics client is shared with CUDA via the
-    `sharedData` in `cigParams`. Support for D3D12 graphics client can be
+    ``cigParams``. Data from graphics client is shared with CUDA via the
+    ``sharedData`` in ``cigParams``. Support for D3D12 graphics client can
+    be determined using :py:obj:`~.cuDeviceGetAttribute()` with
+    :py:obj:`~.CU_DEVICE_ATTRIBUTE_D3D12_CIG_SUPPORTED`. ``sharedData`` is
+    an ID3D12CommandQueue handle. Support for Vulkan graphics client can be
     determined using :py:obj:`~.cuDeviceGetAttribute()` with
-    :py:obj:`~.CU_DEVICE_ATTRIBUTE_D3D12_CIG_SUPPORTED`. `sharedData` is a
-    ID3D12CommandQueue handle. Support for Vulkan graphics client can be
-    determined using :py:obj:`~.cuDeviceGetAttribute()` with
-    :py:obj:`~.CU_DEVICE_ATTRIBUTE_VULKAN_CIG_SUPPORTED`. `sharedData` is a
-    Nvidia specific data blob populated by calling
-    vkGetExternalComputeQueueDataNV(). `execAffinityParams` and `cigParams`
-    are mutually exclusive and cannot both be non-NULL. Setting both to
-    non-NULL values will result in undefined behavior. If both
-    `execAffinityParams` and `cigParams` are NULL, the context will be
-    created as a regular CUDA context.
-
-    The three LSBs of the `flags` parameter can be used to control how the
-    OS thread, which owns the CUDA context at the time of an API call,
+    :py:obj:`~.CU_DEVICE_ATTRIBUTE_VULKAN_CIG_SUPPORTED`. ``sharedData`` is
+    an NVIDIA-specific data blob populated by calling
+    vkGetExternalComputeQueueDataNV(). ``execAffinityParams`` and
+    ``cigParams`` are mutually exclusive and cannot both be non-NULL.
+    Setting both to non-NULL values will result in undefined behavior. If
+    both ``execAffinityParams`` and ``cigParams`` are NULL, the context
+    will be created as a regular CUDA context.
+
+    The three LSBs of the ``flags`` parameter can be used to control how
+    the OS thread, which owns the CUDA context at the time of an API call,
     interacts with the OS scheduler when waiting for results from the GPU.
     Only one of the scheduling flags can be set when creating a context.
 
@@ -28041,15 +28045,16 @@ def cuCtxCreate(ctxCreateParams : Optional[CUctxCreateParams], unsigned int flag
       finish work.   Deprecated: This flag was deprecated as of CUDA 4.0
       and was replaced with :py:obj:`~.CU_CTX_SCHED_BLOCKING_SYNC`.
 
-    - :py:obj:`~.CU_CTX_SCHED_AUTO`: The default value if the `flags`
+    - :py:obj:`~.CU_CTX_SCHED_AUTO`: The default value if the ``flags``
       parameter is zero, uses a heuristic based on the number of active
-      CUDA contexts in the process `C` and the number of logical processors
-      in the system `P`. If `C` > `P`, then CUDA will yield to other OS
-      threads when waiting for the GPU (:py:obj:`~.CU_CTX_SCHED_YIELD`),
-      otherwise CUDA will not yield while waiting for results and actively
-      spin on the processor (:py:obj:`~.CU_CTX_SCHED_SPIN`). Additionally,
-      on Tegra devices, :py:obj:`~.CU_CTX_SCHED_AUTO` uses a heuristic
-      based on the power profile of the platform and may choose
+      CUDA contexts in the process ``C`` and the number of logical
+      processors in the system ``P``. If ``C`` > ``P``, then CUDA will
+      yield to other OS threads when waiting for the GPU
+      (:py:obj:`~.CU_CTX_SCHED_YIELD`), otherwise CUDA will not yield while
+      waiting for results and actively spin on the processor
+      (:py:obj:`~.CU_CTX_SCHED_SPIN`). Additionally, on Tegra devices,
+      :py:obj:`~.CU_CTX_SCHED_AUTO` uses a heuristic based on the power
+      profile of the platform and may choose
       :py:obj:`~.CU_CTX_SCHED_BLOCKING_SYNC` for low-powered devices.
 
     - :py:obj:`~.CU_CTX_MAP_HOST`: Instruct CUDA to support mapped pinned
@@ -28085,7 +28090,7 @@ def cuCtxCreate(ctxCreateParams : Optional[CUctxCreateParams], unsigned int flag
       create a coredump if data is written to a certain pipe that is
       present in the OS space. These environment variables are described in
       the CUDA-GDB user guide under the "GPU core dump support" section. It
-      is important to note that the pipe name `must` be set with
+      is important to note that the pipe name *must* be set with
       :py:obj:`~.cuCoredumpSetAttributeGlobal` before creating the context
       if this flag is used. Setting this flag implies that
       :py:obj:`~.CU_CTX_COREDUMP_ENABLE` is set. The initial attributes
@@ -28094,7 +28099,7 @@ def cuCtxCreate(ctxCreateParams : Optional[CUctxCreateParams], unsigned int flag
       modified by calling :py:obj:`~.cuCoredumpSetAttribute` from the
       created context after it becomes current. Setting this flag on any
       context creation is equivalent to setting the
-      :py:obj:`~.CU_COREDUMP_ENABLE_USER_TRIGGER` attribute to `true`
+      :py:obj:`~.CU_COREDUMP_ENABLE_USER_TRIGGER` attribute to ``true``
       globally. This flag is not supported when CUDA context is created in
       CIG(CUDA in Graphics) mode.
 
@@ -28108,8 +28113,8 @@ def cuCtxCreate(ctxCreateParams : Optional[CUctxCreateParams], unsigned int flag
     compute mode of the device is :py:obj:`~.CU_COMPUTEMODE_PROHIBITED`.
     The function :py:obj:`~.cuDeviceGetAttribute()` can be used with
     :py:obj:`~.CU_DEVICE_ATTRIBUTE_COMPUTE_MODE` to determine the compute
-    mode of the device. The `nvidia-smi` tool can be used to set the
-    compute mode for * devices. Documentation for `nvidia-smi` can be
+    mode of the device. The ``nvidia-smi`` tool can be used to set the
+    compute mode for devices. Documentation for ``nvidia-smi`` can be
     obtained by passing a -h option to it.
 
     Context creation will fail with :py:obj:`~.CUDA_ERROR_INVALID_VALUE` if
@@ -28163,10 +28168,11 @@ def cuCtxCreate(ctxCreateParams : Optional[CUctxCreateParams], unsigned int flag
 def cuCtxDestroy(ctx):
     """ Destroy a CUDA context.
 
-    Destroys the CUDA context specified by `ctx`. The context `ctx` will be
-    destroyed regardless of how many threads it is current to. It is the
-    responsibility of the calling function to ensure that no API call
-    issues using `ctx` while :py:obj:`~.cuCtxDestroy()` is executing.
+    Destroys the CUDA context specified by ``ctx``. The context ``ctx``
+    will be destroyed regardless of how many threads it is current to. It
+    is the responsibility of the calling function to ensure that no API
+    call issues using ``ctx`` while :py:obj:`~.cuCtxDestroy()` is
+    executing.
 
     Destroys and cleans up all resources associated with the context. It is
     the caller's responsibility to ensure that the context or its resources
@@ -28182,12 +28188,12 @@ def cuCtxDestroy(ctx):
     :py:obj:`~.cuMemAllocHost()`, :py:obj:`~.cuMemAllocManaged()` and
     :py:obj:`~.cuMemAllocPitch()`.
 
-    If `ctx` is current to the calling thread then `ctx` will also be
+    If ``ctx`` is current to the calling thread then ``ctx`` will also be
     popped from the current thread's context stack (as though
-    :py:obj:`~.cuCtxPopCurrent()` were called). If `ctx` is current to
-    other threads, then `ctx` will remain current to those threads, and
-    attempting to access `ctx` from those threads will result in the error
-    :py:obj:`~.CUDA_ERROR_CONTEXT_IS_DESTROYED`.
+    :py:obj:`~.cuCtxPopCurrent()` were called). If ``ctx`` is current to
+    other threads, then ``ctx`` will remain current to those threads, and
+    attempting to access ``ctx`` from those threads will result in the
+    error :py:obj:`~.CUDA_ERROR_CONTEXT_IS_DESTROYED`.
 
     Parameters
     ----------
@@ -28226,7 +28232,7 @@ def cuCtxDestroy(ctx):
 def cuCtxPushCurrent(ctx):
     """ Pushes a context on the current CPU thread.
 
-    Pushes the given context `ctx` onto the CPU thread's stack of current
+    Pushes the given context ``ctx`` onto the CPU thread's stack of current
     contexts. The specified context becomes the CPU thread's current
     context, so all CUDA functions that operate on the current context are
     affected.
@@ -28268,8 +28274,8 @@ def cuCtxPopCurrent():
     """ Pops the current CUDA context from the current CPU thread.
 
     Pops the current CUDA context from the CPU thread and passes back the
-    old context handle in `*pctx`. That context may then be made current to
-    a different CPU thread by calling :py:obj:`~.cuCtxPushCurrent()`.
+    old context handle in ``*pctx``. That context may then be made current
+    to a different CPU thread by calling :py:obj:`~.cuCtxPushCurrent()`.
 
     If a context was current to the CPU thread before
     :py:obj:`~.cuCtxCreate()` or :py:obj:`~.cuCtxPushCurrent()` was called,
@@ -28300,15 +28306,15 @@ def cuCtxPopCurrent():
 def cuCtxSetCurrent(ctx):
     """ Binds the specified CUDA context to the calling CPU thread.
 
-    Binds the specified CUDA context to the calling CPU thread. If `ctx` is
-    NULL then the CUDA context previously bound to the calling CPU thread
-    is unbound and :py:obj:`~.CUDA_SUCCESS` is returned.
+    Binds the specified CUDA context to the calling CPU thread. If ``ctx``
+    is NULL then the CUDA context previously bound to the calling CPU
+    thread is unbound and :py:obj:`~.CUDA_SUCCESS` is returned.
 
     If there exists a CUDA context stack on the calling CPU thread, this
-    will replace the top of that stack with `ctx`. If `ctx` is NULL then
-    this will be equivalent to popping the top of the calling CPU thread's
-    CUDA context stack (or a no-op if the calling CPU thread's CUDA context
-    stack is empty).
+    will replace the top of that stack with ``ctx``. If ``ctx`` is NULL
+    then this will be equivalent to popping the top of the calling CPU
+    thread's CUDA context stack (or a no-op if the calling CPU thread's
+    CUDA context stack is empty).
 
     Parameters
     ----------
@@ -28343,9 +28349,9 @@ def cuCtxSetCurrent(ctx):
 def cuCtxGetCurrent():
     """ Returns the CUDA context bound to the calling CPU thread.
 
-    Returns in `*pctx` the CUDA context bound to the calling CPU thread. If
-    no context is bound to the calling CPU thread then `*pctx` is set to
-    NULL and :py:obj:`~.CUDA_SUCCESS` is returned.
+    Returns in ``*pctx`` the CUDA context bound to the calling CPU thread.
+    If no context is bound to the calling CPU thread then ``*pctx`` is set
+    to NULL and :py:obj:`~.CUDA_SUCCESS` is returned.
 
     Returns
     -------
@@ -28372,7 +28378,7 @@ def cuCtxGetCurrent():
 def cuCtxGetDevice():
     """ Returns the device handle for the current context.
 
-    Returns in `*device` the handle of the current context's device.
+    Returns in ``*device`` the handle of the current context's device.
 
     Returns
     -------
@@ -28399,7 +28405,7 @@ def cuCtxGetDevice():
 def cuCtxGetDevice_v2(ctx):
     """ Returns the device handle for the specified context.
 
-    Returns in `*device` the handle of the specified context's device. If
+    Returns in ``*device`` the handle of the specified context's device. If
     the specified context is NULL, the API will return the current
     context's device.
 
@@ -28441,7 +28447,7 @@ def cuCtxGetDevice_v2(ctx):
 def cuCtxGetFlags():
     """ Returns the flags for the current context.
 
-    Returns in `*flags` the flags of the current context. See
+    Returns in ``*flags`` the flags of the current context. See
     :py:obj:`~.cuCtxCreate` for flag values.
 
     Returns
@@ -28497,7 +28503,7 @@ def cuCtxSetFlags(unsigned int flags):
 def cuCtxGetId(ctx):
     """ Returns the unique Id associated with the context supplied.
 
-    Returns in `ctxId` the unique Id which is associated with a given
+    Returns in ``ctxId`` the unique Id which is associated with a given
     context. The Id is unique for the life of the program for this instance
     of CUDA. If context is supplied as NULL and there is one current, the
     Id of the current context is returned.
@@ -28613,9 +28619,9 @@ def cuCtxSynchronize_v2(ctx):
 def cuCtxSetLimit(limit not None : CUlimit, size_t value):
     """ Set resource limits.
 
-    Setting `limit` to `value` is a request by the application to update
-    the current limit maintained by the context. The driver is free to
-    modify the requested value to meet h/w requirements (this could be
+    Setting ``limit`` to ``value`` is a request by the application to
+    update the current limit maintained by the context. The driver is free
+    to modify the requested value to meet h/w requirements (this could be
     clamping to minimum or maximum values, rounding up to nearest element
     size, etc). The application can use :py:obj:`~.cuCtxGetLimit()` to find
     out exactly what the limit has been set to.
@@ -28720,7 +28726,7 @@ def cuCtxSetLimit(limit not None : CUlimit, size_t value):
 def cuCtxGetLimit(limit not None : CUlimit):
     """ Returns resource limits.
 
-    Returns in `*pvalue` the current size of `limit`. The supported
+    Returns in ``*pvalue`` the current size of ``limit``. The supported
     :py:obj:`~.CUlimit` values are:
 
     - :py:obj:`~.CU_LIMIT_STACK_SIZE`: stack size in bytes of each GPU
@@ -28776,12 +28782,13 @@ def cuCtxGetCacheConfig():
     """ Returns the preferred cache configuration for the current context.
 
     On devices where the L1 cache and shared memory use the same hardware
-    resources, this function returns through `pconfig` the preferred cache
-    configuration for the current context. This is only a preference. The
-    driver will use the requested configuration if possible, but it is free
-    to choose a different configuration if required to execute functions.
+    resources, this function returns through ``pconfig`` the preferred
+    cache configuration for the current context. This is only a preference.
+    The driver will use the requested configuration if possible, but it is
+    free to choose a different configuration if required to execute
+    functions.
 
-    This will return a `pconfig` of :py:obj:`~.CU_FUNC_CACHE_PREFER_NONE`
+    This will return a ``pconfig`` of :py:obj:`~.CU_FUNC_CACHE_PREFER_NONE`
     on devices where the size of the L1 cache and shared memory are fixed.
 
     The supported cache configurations are:
@@ -28824,11 +28831,12 @@ def cuCtxSetCacheConfig(config not None : CUfunc_cache):
     """ Sets the preferred cache configuration for the current context.
 
     On devices where the L1 cache and shared memory use the same hardware
-    resources, this sets through `config` the preferred cache configuration
-    for the current context. This is only a preference. The driver will use
-    the requested configuration if possible, but it is free to choose a
-    different configuration if required to execute the function. Any
-    function preference set via :py:obj:`~.cuFuncSetCacheConfig()` or
+    resources, this sets through ``config`` the preferred cache
+    configuration for the current context. This is only a preference. The
+    driver will use the requested configuration if possible, but it is free
+    to choose a different configuration if required to execute the
+    function. Any function preference set via
+    :py:obj:`~.cuFuncSetCacheConfig()` or
     :py:obj:`~.cuKernelSetCacheConfig()` will be preferred over this
     context-wide setting. Setting the context-wide cache configuration to
     :py:obj:`~.CU_FUNC_CACHE_PREFER_NONE` will cause subsequent kernel
@@ -28881,10 +28889,11 @@ def cuCtxSetCacheConfig(config not None : CUfunc_cache):
 def cuCtxGetApiVersion(ctx):
     """ Gets the context's API version.
 
-    Returns a version number in `version` corresponding to the capabilities
-    of the context (e.g. 3010 or 3020), which library developers can use to
-    direct callers to a specific API version. If `ctx` is NULL, returns the
-    API version used to create the currently bound context.
+    Returns a version number in ``version`` corresponding to the
+    capabilities of the context (e.g. 3010 or 3020), which library
+    developers can use to direct callers to a specific API version. If
+    ``ctx`` is NULL, returns the API version used to create the currently
+    bound context.
 
     Note that new API versions are only introduced when context
     capabilities are changed that break binary compatibility, so the API
@@ -28929,20 +28938,21 @@ def cuCtxGetApiVersion(ctx):
 def cuCtxGetStreamPriorityRange():
     """ Returns numerical values that correspond to the least and greatest stream priorities.
 
-    Returns in `*leastPriority` and `*greatestPriority` the numerical
+    Returns in ``*leastPriority`` and ``*greatestPriority`` the numerical
     values that correspond to the least and greatest stream priorities
     respectively. Stream priorities follow a convention where lower numbers
     imply greater priorities. The range of meaningful stream priorities is
-    given by [`*greatestPriority`, `*leastPriority`]. If the user attempts
-    to create a stream with a priority value that is outside the meaningful
-    range as specified by this API, the priority is automatically clamped
-    down or up to either `*leastPriority` or `*greatestPriority`
-    respectively. See :py:obj:`~.cuStreamCreateWithPriority` for details on
-    creating a priority stream. A NULL may be passed in for
-    `*leastPriority` or `*greatestPriority` if the value is not desired.
-
-    This function will return '0' in both `*leastPriority` and
-    `*greatestPriority` if the current context's device does not support
+    given by [``*greatestPriority``, ``*leastPriority``]. If the user
+    attempts to create a stream with a priority value that is outside the
+    meaningful range as specified by this API, the priority is
+    automatically clamped down or up to either ``*leastPriority`` or
+    ``*greatestPriority`` respectively. See
+    :py:obj:`~.cuStreamCreateWithPriority` for details on creating a
+    priority stream. A NULL may be passed in for ``*leastPriority`` or
+    ``*greatestPriority`` if the value is not desired.
+
+    This function will return '0' in both ``*leastPriority`` and
+    ``*greatestPriority`` if the current context's device does not support
     stream priorities (see :py:obj:`~.cuDeviceGetAttribute`).
 
     Returns
@@ -28998,7 +29008,7 @@ def cuCtxResetPersistingL2Cache():
 def cuCtxGetExecAffinity(typename not None : CUexecAffinityType):
     """ Returns the execution affinity setting for the current context.
 
-    Returns in `*pExecAffinity` the current value of `typename`. The
+    Returns in ``*pExecAffinity`` the current value of ``typename``. The
     supported :py:obj:`~.CUexecAffinityType` values are:
 
     - :py:obj:`~.CU_EXEC_AFFINITY_TYPE_SM_COUNT`: number of SMs the context
@@ -29035,17 +29045,17 @@ def cuCtxGetExecAffinity(typename not None : CUexecAffinityType):
 def cuCtxRecordEvent(hCtx, hEvent):
     """ Records an event.
 
-    Captures in `hEvent` all the activities of the context `hCtx` at the
-    time of this call. `hEvent` and `hCtx` must be from the same CUDA
-    context, otherwise :py:obj:`~.CUDA_ERROR_INVALID_HANDLE` will be
+    Captures in ``hEvent`` all the activities of the context ``hCtx`` at
+    the time of this call. ``hEvent`` and ``hCtx`` must be from the same
+    CUDA context, otherwise :py:obj:`~.CUDA_ERROR_INVALID_HANDLE` will be
     returned. Calls such as :py:obj:`~.cuEventQuery()` or
     :py:obj:`~.cuCtxWaitEvent()` will then examine or wait for completion
-    of the work that was captured. Uses of `hCtx` after this call do not
-    modify `hEvent`. If the context passed to `hCtx` is the primary
-    context, `hEvent` will capture all the activities of the primary
-    context and its green contexts. If the context passed to `hCtx` is a
+    of the work that was captured. Uses of ``hCtx`` after this call do not
+    modify ``hEvent``. If the context passed to ``hCtx`` is the primary
+    context, ``hEvent`` will capture all the activities of the primary
+    context and its green contexts. If the context passed to ``hCtx`` is a
     context converted from green context via
-    :py:obj:`~.cuCtxFromGreenCtx()`, `hEvent` will capture only the
+    :py:obj:`~.cuCtxFromGreenCtx()`, ``hEvent`` will capture only the
     activities of the green context.
 
     Parameters
@@ -29066,7 +29076,7 @@ def cuCtxRecordEvent(hCtx, hEvent):
 
     Notes
     -----
-    The API will return :py:obj:`~.CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED` if the specified context `hCtx` has a stream in the capture mode. In such a case, the call will invalidate all the conflicting captures.
+    The API will return :py:obj:`~.CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED` if the specified context ``hCtx`` has a stream in the capture mode. In such a case, the call will invalidate all the conflicting captures.
     """
     cdef cydriver.CUevent cyhEvent
     if hEvent is None:
@@ -29095,15 +29105,15 @@ def cuCtxRecordEvent(hCtx, hEvent):
 def cuCtxWaitEvent(hCtx, hEvent):
     """ Make a context wait on an event.
 
-    Makes all future work submitted to context `hCtx` wait for all work
-    captured in `hEvent`. The synchronization will be performed on the
+    Makes all future work submitted to context ``hCtx`` wait for all work
+    captured in ``hEvent``. The synchronization will be performed on the
     device and will not block the calling CPU thread. See
     :py:obj:`~.cuCtxRecordEvent()` for details on what is captured by an
-    event. If the context passed to `hCtx` is the primary context, the
-    primary context and its green contexts will wait for `hEvent`. If the
-    context passed to `hCtx` is a context converted from green context via
-    :py:obj:`~.cuCtxFromGreenCtx()`, the green context will wait for
-    `hEvent`.
+    event. If the context passed to ``hCtx`` is the primary context, the
+    primary context and its green contexts will wait for ``hEvent``. If the
+    context passed to ``hCtx`` is a context converted from green context
+    via :py:obj:`~.cuCtxFromGreenCtx()`, the green context will wait for
+    ``hEvent``.
 
     Parameters
     ----------
@@ -29123,9 +29133,9 @@ def cuCtxWaitEvent(hCtx, hEvent):
 
     Notes
     -----
-    `hEvent` may be from a different context or device than `hCtx`.
+    ``hEvent`` may be from a different context or device than ``hCtx``.
 
-    The API will return :py:obj:`~.CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED` and invalidate the capture if the specified event `hEvent` is part of an ongoing capture sequence or if the specified context `hCtx` has a stream in the capture mode.
+    The API will return :py:obj:`~.CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED` and invalidate the capture if the specified event ``hEvent`` is part of an ongoing capture sequence or if the specified context ``hCtx`` has a stream in the capture mode.
     """
     cdef cydriver.CUevent cyhEvent
     if hEvent is None:
@@ -29159,11 +29169,12 @@ def cuCtxAttach(unsigned int flags):
     Note that this function is deprecated and should not be used.
 
     Increments the usage count of the context and passes back a context
-    handle in `*pctx` that must be passed to :py:obj:`~.cuCtxDetach()` when
-    the application is done with the context. :py:obj:`~.cuCtxAttach()`
-    fails if there is no context current to the thread.
+    handle in ``*pctx`` that must be passed to :py:obj:`~.cuCtxDetach()`
+    when the application is done with the context.
+    :py:obj:`~.cuCtxAttach()` fails if there is no context current to the
+    thread.
 
-    Currently, the `flags` parameter must be 0.
+    Currently, the ``flags`` parameter must be 0.
 
     Parameters
     ----------
@@ -29199,7 +29210,7 @@ def cuCtxDetach(ctx):
 
     Note that this function is deprecated and should not be used.
 
-    Decrements the usage count of the context `ctx`, and destroys the
+    Decrements the usage count of the context ``ctx``, and destroys the
     context if the usage count goes to 0. The context must be a handle that
     was passed back by :py:obj:`~.cuCtxCreate()` or
     :py:obj:`~.cuCtxAttach()`, and must be current to the calling thread.
@@ -29239,7 +29250,7 @@ def cuCtxGetSharedMemConfig():
 
     [Deprecated]
 
-    This function will return in `pConfig` the current size of shared
+    This function will return in ``pConfig`` the current size of shared
     memory banks in the current context. On devices with configurable
     shared memory banks, :py:obj:`~.cuCtxSetSharedMemConfig` can be used to
     change this setting, so that all subsequent kernel launches will by
@@ -29335,14 +29346,14 @@ def cuCtxSetSharedMemConfig(config not None : CUsharedconfig):
 def cuModuleLoad(char* fname):
     """ Loads a compute module.
 
-    Takes a filename `fname` and loads the corresponding module `module`
-    into the current context. The CUDA driver API does not attempt to
-    lazily allocate the resources needed by a module; if the memory for
-    functions and data (constant and global) needed by the module cannot be
-    allocated, :py:obj:`~.cuModuleLoad()` fails. The file should be a
-    `cubin` file as output by nvcc, or a `PTX` file either as output by
-    nvcc or handwritten, or a `fatbin` file as output by nvcc from
-    toolchain 4.0 or later, or a `Tile` IR file.
+    Takes a filename ``fname`` and loads the corresponding module
+    ``module`` into the current context. The CUDA driver API does not
+    attempt to lazily allocate the resources needed by a module; if the
+    memory for functions and data (constant and global) needed by the
+    module cannot be allocated, :py:obj:`~.cuModuleLoad()` fails. The file
+    should be a ``cubin`` file as output by nvcc, or a ``PTX`` file either
+    as output by nvcc or handwritten, or a ``fatbin`` file as output by
+    nvcc from toolchain 4.0 or later, or a ``Tile`` IR file.
 
     Parameters
     ----------
@@ -29374,10 +29385,10 @@ def cuModuleLoad(char* fname):
 def cuModuleLoadData(image):
     """ Load a module's data.
 
-    Takes a pointer `image` and loads the corresponding module `module`
-    into the current context. The `image` may be a `cubin` or `fatbin` as
-    output by nvcc, or a NULL-terminated `PTX`, either as output by nvcc or
-    hand-written, or `Tile` IR data.
+    Takes a pointer ``image`` and loads the corresponding module ``module``
+    into the current context. The ``image`` may be a ``cubin`` or
+    ``fatbin`` as output by nvcc, or a NULL-terminated ``PTX``, either as
+    output by nvcc or hand-written, or ``Tile`` IR data.
 
     Parameters
     ----------
@@ -29412,10 +29423,10 @@ def cuModuleLoadData(image):
 def cuModuleLoadDataEx(image, unsigned int numOptions, options : Optional[tuple[CUjit_option] | list[CUjit_option]], optionValues : Optional[tuple[Any] | list[Any]]):
     """ Load a module's data with options.
 
-    Takes a pointer `image` and loads the corresponding module `module`
-    into the current context. The `image` may be a `cubin` or `fatbin` as
-    output by nvcc, or a NULL-terminated `PTX`, either as output by nvcc or
-    hand-written, or `Tile` IR data.
+    Takes a pointer ``image`` and loads the corresponding module ``module``
+    into the current context. The ``image`` may be a ``cubin`` or
+    ``fatbin`` as output by nvcc, or a NULL-terminated ``PTX``, either as
+    output by nvcc or hand-written, or ``Tile`` IR data.
 
     Parameters
     ----------
@@ -29466,16 +29477,16 @@ def cuModuleLoadDataEx(image, unsigned int numOptions, options : Optional[tuple[
 def cuModuleLoadFatBinary(fatCubin):
     """ Load a module's data.
 
-    Takes a pointer `fatCubin` and loads the corresponding module `module`
-    into the current context. The pointer represents a `fat binary` object,
-    which is a collection of different `cubin` and/or `PTX` files, all
-    representing the same device code, but compiled and optimized for
-    different architectures.
+    Takes a pointer ``fatCubin`` and loads the corresponding module
+    ``module`` into the current context. The pointer represents a ``fat
+    binary`` object, which is a collection of different ``cubin`` and/or
+    ``PTX`` files, all representing the same device code, but compiled and
+    optimized for different architectures.
 
     Prior to CUDA 4.0, there was no documented API for constructing and
     using fat binary objects by programmers. Starting with CUDA 4.0, fat
-    binary objects can be constructed by providing the `-fatbin option` to
-    nvcc. More information can be found in the nvcc document.
+    binary objects can be constructed by providing the ``-fatbin`` option
+    to nvcc. More information can be found in the nvcc document.
 
     Parameters
     ----------
@@ -29510,9 +29521,9 @@ def cuModuleLoadFatBinary(fatCubin):
 def cuModuleUnload(hmod):
     """ Unloads a module.
 
-    Unloads a module `hmod` from the current context. Attempting to unload
-    a module which was obtained from the Library Management API such as
-    :py:obj:`~.cuLibraryGetModule` will return
+    Unloads a module ``hmod`` from the current context. Attempting to
+    unload a module which was obtained from the Library Management API such
+    as :py:obj:`~.cuLibraryGetModule` will return
     :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`.
 
     Parameters
@@ -29576,8 +29587,8 @@ def cuModuleGetLoadingMode():
 def cuModuleGetFunction(hmod, char* name):
     """ Returns a function handle.
 
-    Returns in `*hfunc` the handle of the function of name `name` located
-    in module `hmod`. If no function of that name exists,
+    Returns in ``*hfunc`` the handle of the function of name ``name``
+    located in module ``hmod``. If no function of that name exists,
     :py:obj:`~.cuModuleGetFunction()` returns
     :py:obj:`~.CUDA_ERROR_NOT_FOUND`.
 
@@ -29621,7 +29632,7 @@ def cuModuleGetFunction(hmod, char* name):
 def cuModuleGetFunctionCount(mod):
     """ Returns the number of functions within a module.
 
-    Returns in `count` the number of functions in `mod`.
+    Returns in ``count`` the number of functions in ``mod``.
 
     Parameters
     ----------
@@ -29657,8 +29668,8 @@ def cuModuleGetFunctionCount(mod):
 def cuModuleEnumerateFunctions(unsigned int numFunctions, mod):
     """ Returns the function handles within a module.
 
-    Returns in `functions` a maximum number of `numFunctions` function
-    handles within `mod`. When function loading mode is set to LAZY the
+    Returns in ``functions`` a maximum number of ``numFunctions`` function
+    handles within ``mod``. When function loading mode is set to LAZY the
     function retrieved may be partially loaded. The loading state of a
     function can be queried using :py:obj:`~.cuFunctionIsLoaded`. CUDA APIs
     may load the function automatically when called with partially loaded
@@ -29716,11 +29727,11 @@ def cuModuleEnumerateFunctions(unsigned int numFunctions, mod):
 def cuModuleGetGlobal(hmod, char* name):
     """ Returns a global pointer from a module.
 
-    Returns in `*dptr` and `*bytes` the base pointer and size of the global
-    of name `name` located in module `hmod`. If no variable of that name
-    exists, :py:obj:`~.cuModuleGetGlobal()` returns
-    :py:obj:`~.CUDA_ERROR_NOT_FOUND`. One of the parameters `dptr` or
-    `numbytes` (not both) can be NULL in which case it is ignored.
+    Returns in ``*dptr`` and ``*bytes`` the base pointer and size of the
+    global of name ``name`` located in module ``hmod``. If no variable of
+    that name exists, :py:obj:`~.cuModuleGetGlobal()` returns
+    :py:obj:`~.CUDA_ERROR_NOT_FOUND`. One of the parameters ``dptr`` or
+    ``numbytes`` (not both) can be NULL in which case it is ignored.
 
     Parameters
     ----------
@@ -29783,7 +29794,7 @@ def cuLinkCreate(unsigned int numOptions, options : Optional[tuple[CUjit_option]
     and will have similar consequences as offline relocatable device code
     linking.
 
-    `optionValues` must remain valid for the life of the CUlinkState if
+    ``optionValues`` must remain valid for the life of the CUlinkState if
     output options are used. No other references to inputs are maintained
     after this call returns.
 
@@ -29839,8 +29850,8 @@ def cuLinkCreate(unsigned int numOptions, options : Optional[tuple[CUjit_option]
 def cuLinkAddData(state, typename not None : CUjitInputType, data, size_t size, char* name, unsigned int numOptions, options : Optional[tuple[CUjit_option] | list[CUjit_option]], optionValues : Optional[tuple[Any] | list[Any]]):
     """ Add an input to a pending linker invocation.
 
-    Ownership of `data` is retained by the caller. No reference is retained
-    to any inputs after this call returns.
+    Ownership of ``data`` is retained by the caller. No reference is
+    retained to any inputs after this call returns.
 
     This method accepts only compiler options, which are used if the data
     must be compiled from PTX, and does not accept any of
@@ -29986,9 +29997,9 @@ def cuLinkComplete(state):
 
     Completes the pending linker action and returns the cubin image for the
     linked device code, which can be used with
-    :py:obj:`~.cuModuleLoadData`. The cubin is owned by `state`, so it
-    should be loaded before `state` is destroyed via
-    :py:obj:`~.cuLinkDestroy`. This call does not destroy `state`.
+    :py:obj:`~.cuModuleLoadData`. The cubin is owned by ``state``, so it
+    should be loaded before ``state`` is destroyed via
+    :py:obj:`~.cuLinkDestroy`. This call does not destroy ``state``.
 
     Parameters
     ----------
@@ -30066,8 +30077,8 @@ def cuModuleGetTexRef(hmod, char* name):
 
     [Deprecated]
 
-    Returns in `*pTexRef` the handle of the texture reference of name
-    `name` in the module `hmod`. If no texture reference of that name
+    Returns in ``*pTexRef`` the handle of the texture reference of name
+    ``name`` in the module ``hmod``. If no texture reference of that name
     exists, :py:obj:`~.cuModuleGetTexRef()` returns
     :py:obj:`~.CUDA_ERROR_NOT_FOUND`. This texture reference handle should
     not be destroyed, since it will be destroyed when the module is
@@ -30115,8 +30126,8 @@ def cuModuleGetSurfRef(hmod, char* name):
 
     [Deprecated]
 
-    Returns in `*pSurfRef` the handle of the surface reference of name
-    `name` in the module `hmod`. If no surface reference of that name
+    Returns in ``*pSurfRef`` the handle of the surface reference of name
+    ``name`` in the module ``hmod``. If no surface reference of that name
     exists, :py:obj:`~.cuModuleGetSurfRef()` returns
     :py:obj:`~.CUDA_ERROR_NOT_FOUND`.
 
@@ -30160,36 +30171,36 @@ def cuModuleGetSurfRef(hmod, char* name):
 def cuLibraryLoadData(code, jitOptions : Optional[tuple[CUjit_option] | list[CUjit_option]], jitOptionsValues : Optional[tuple[Any] | list[Any]], unsigned int numJitOptions, libraryOptions : Optional[tuple[CUlibraryOption] | list[CUlibraryOption]], libraryOptionValues : Optional[tuple[Any] | list[Any]], unsigned int numLibraryOptions):
     """ Load a library with specified code and options.
 
-    Takes a pointer `code` and loads the corresponding library `library`
-    based on the application defined library loading mode:
+    Takes a pointer ``code`` and loads the corresponding library
+    ``library`` based on the application defined library loading mode:
 
     - If module loading is set to EAGER, via the environment variables
-      described in "Module loading", `library` is loaded eagerly into all
+      described in "Module loading", ``library`` is loaded eagerly into all
       contexts at the time of the call and future contexts at the time of
       creation until the library is unloaded with
       :py:obj:`~.cuLibraryUnload()`.
 
-    - If the environment variables are set to LAZY, `library` is not
+    - If the environment variables are set to LAZY, ``library`` is not
       immediately loaded onto all existent contexts and will only be loaded
       when a function is needed for that context, such as a kernel launch.
 
     These environment variables are described in the CUDA programming guide
     under the "CUDA environment variables" section.
 
-    The `code` may be a `cubin` or `fatbin` as output by nvcc, or a NULL-
-    terminated `PTX`, either as output by nvcc or hand-written, or `Tile`
-    IR data. A fatbin should also contain relocatable code when doing
-    separate compilation.
+    The ``code`` may be a ``cubin`` or ``fatbin`` as output by nvcc, or a
+    NULL-terminated ``PTX``, either as output by nvcc or hand-written, or
+    ``Tile`` IR data. A fatbin should also contain relocatable code when
+    doing separate compilation.
 
-    Options are passed as an array via `jitOptions` and any corresponding
-    parameters are passed in `jitOptionsValues`. The number of total JIT
-    options is supplied via `numJitOptions`. Any outputs will be returned
-    via `jitOptionsValues`.
+    Options are passed as an array via ``jitOptions`` and any corresponding
+    parameters are passed in ``jitOptionsValues``. The number of total JIT
+    options is supplied via ``numJitOptions``. Any outputs will be returned
+    via ``jitOptionsValues``.
 
-    Library load options are passed as an array via `libraryOptions` and
-    any corresponding parameters are passed in `libraryOptionValues`. The
+    Library load options are passed as an array via ``libraryOptions`` and
+    any corresponding parameters are passed in ``libraryOptionValues``. The
     number of total library load options is supplied via
-    `numLibraryOptions`.
+    ``numLibraryOptions``.
 
     Parameters
     ----------
@@ -30260,36 +30271,36 @@ def cuLibraryLoadData(code, jitOptions : Optional[tuple[CUjit_option] | list[CUj
 def cuLibraryLoadFromFile(char* fileName, jitOptions : Optional[tuple[CUjit_option] | list[CUjit_option]], jitOptionsValues : Optional[tuple[Any] | list[Any]], unsigned int numJitOptions, libraryOptions : Optional[tuple[CUlibraryOption] | list[CUlibraryOption]], libraryOptionValues : Optional[tuple[Any] | list[Any]], unsigned int numLibraryOptions):
     """ Load a library with specified file and options.
 
-    Takes a pointer `code` and loads the corresponding library `library`
-    based on the application defined library loading mode:
+    Takes a filename ``fileName`` and loads the corresponding library
+    ``library`` based on the application defined library loading mode:
 
     - If module loading is set to EAGER, via the environment variables
-      described in "Module loading", `library` is loaded eagerly into all
+      described in "Module loading", ``library`` is loaded eagerly into all
       contexts at the time of the call and future contexts at the time of
       creation until the library is unloaded with
       :py:obj:`~.cuLibraryUnload()`.
 
-    - If the environment variables are set to LAZY, `library` is not
+    - If the environment variables are set to LAZY, ``library`` is not
       immediately loaded onto all existent contexts and will only be loaded
       when a function is needed for that context, such as a kernel launch.
 
     These environment variables are described in the CUDA programming guide
     under the "CUDA environment variables" section.
 
-    The file should be a `cubin` file as output by nvcc, or a `PTX` file
-    either as output by nvcc or handwritten, or a `fatbin` file as output
-    by nvcc or hand-written, or `Tile` IR file. A fatbin should also
-    contain relocatable code when doing separate compilation.
+    The file should be a ``cubin`` file as output by nvcc, or a ``PTX``
+    file either as output by nvcc or handwritten, or a ``fatbin`` file as
+    output by nvcc or hand-written, or ``Tile`` IR file. A fatbin should
+    also contain relocatable code when doing separate compilation.
 
-    Options are passed as an array via `jitOptions` and any corresponding
-    parameters are passed in `jitOptionsValues`. The number of total
-    options is supplied via `numJitOptions`. Any outputs will be returned
-    via `jitOptionsValues`.
+    Options are passed as an array via ``jitOptions`` and any corresponding
+    parameters are passed in ``jitOptionsValues``. The number of total
+    options is supplied via ``numJitOptions``. Any outputs will be returned
+    via ``jitOptionsValues``.
 
-    Library load options are passed as an array via `libraryOptions` and
-    any corresponding parameters are passed in `libraryOptionValues`. The
+    Library load options are passed as an array via ``libraryOptions`` and
+    any corresponding parameters are passed in ``libraryOptionValues``. The
     number of total library load options is supplied via
-    `numLibraryOptions`.
+    ``numLibraryOptions``.
 
     Parameters
     ----------
@@ -30357,7 +30368,7 @@ def cuLibraryLoadFromFile(char* fileName, jitOptions : Optional[tuple[CUjit_opti
 def cuLibraryUnload(library):
     """ Unloads a library.
 
-    Unloads the library specified with `library`
+    Unloads the library specified with ``library``.
 
     Parameters
     ----------
@@ -30392,9 +30403,9 @@ def cuLibraryUnload(library):
 def cuLibraryGetKernel(library, char* name):
     """ Returns a kernel handle.
 
-    Returns in `pKernel` the handle of the kernel with name `name` located
-    in library `library`. If kernel handle is not found, the call returns
-    :py:obj:`~.CUDA_ERROR_NOT_FOUND`.
+    Returns in ``pKernel`` the handle of the kernel with name ``name``
+    located in library ``library``. If kernel handle is not found, the call
+    returns :py:obj:`~.CUDA_ERROR_NOT_FOUND`.
 
     Parameters
     ----------
@@ -30436,7 +30447,7 @@ def cuLibraryGetKernel(library, char* name):
 def cuLibraryGetKernelCount(lib):
     """ Returns the number of kernels within a library.
 
-    Returns in `count` the number of kernels in `lib`.
+    Returns in ``count`` the number of kernels in ``lib``.
 
     Parameters
     ----------
@@ -30472,9 +30483,9 @@ def cuLibraryGetKernelCount(lib):
 def cuLibraryEnumerateKernels(unsigned int numKernels, lib):
     """ Retrieve the kernel handles within a library.
 
-    Returns in `kernels` a maximum number of `numKernels` kernel handles
-    within `lib`. The returned kernel handle becomes invalid when the
-    library is unloaded.
+    Returns in ``kernels`` a maximum number of ``numKernels`` kernel
+    handles within ``lib``. The returned kernel handle becomes invalid when
+    the library is unloaded.
 
     Parameters
     ----------
@@ -30525,9 +30536,9 @@ def cuLibraryEnumerateKernels(unsigned int numKernels, lib):
 def cuLibraryGetModule(library):
     """ Returns a module handle.
 
-    Returns in `pMod` the module handle associated with the current context
-    located in library `library`. If module handle is not found, the call
-    returns :py:obj:`~.CUDA_ERROR_NOT_FOUND`.
+    Returns in ``pMod`` the module handle associated with the current
+    context located in library ``library``. If module handle is not found,
+    the call returns :py:obj:`~.CUDA_ERROR_NOT_FOUND`.
 
     Parameters
     ----------
@@ -30567,9 +30578,9 @@ def cuLibraryGetModule(library):
 def cuKernelGetFunction(kernel):
     """ Returns a function handle.
 
-    Returns in `pFunc` the handle of the function for the requested kernel
-    `kernel` and the current context. If function handle is not found, the
-    call returns :py:obj:`~.CUDA_ERROR_NOT_FOUND`.
+    Returns in ``pFunc`` the handle of the function for the requested
+    kernel ``kernel`` and the current context. If function handle is not
+    found, the call returns :py:obj:`~.CUDA_ERROR_NOT_FOUND`.
 
     Parameters
     ----------
@@ -30609,8 +30620,8 @@ def cuKernelGetFunction(kernel):
 def cuKernelGetLibrary(kernel):
     """ Returns a library handle.
 
-    Returns in `pLib` the handle of the library for the requested kernel
-    `kernel`
+    Returns in ``pLib`` the handle of the library for the requested kernel
+    ``kernel``.
 
     Parameters
     ----------
@@ -30650,11 +30661,12 @@ def cuKernelGetLibrary(kernel):
 def cuLibraryGetGlobal(library, char* name):
     """ Returns a global device pointer.
 
-    Returns in `*dptr` and `*bytes` the base pointer and size of the global
-    with name `name` for the requested library `library` and the current
-    context. If no global for the requested name `name` exists, the call
-    returns :py:obj:`~.CUDA_ERROR_NOT_FOUND`. One of the parameters `dptr`
-    or `numbytes` (not both) can be NULL in which case it is ignored.
+    Returns in ``*dptr`` and ``*bytes`` the base pointer and size of the
+    global with name ``name`` for the requested library ``library`` and the
+    current context. If no global for the requested name ``name`` exists,
+    the call returns :py:obj:`~.CUDA_ERROR_NOT_FOUND`. One of the
+    parameters ``dptr`` or ``numbytes`` (not both) can be NULL in which
+    case it is ignored.
 
     Parameters
     ----------
@@ -30699,13 +30711,14 @@ def cuLibraryGetGlobal(library, char* name):
 def cuLibraryGetManaged(library, char* name):
     """ Returns a pointer to managed memory.
 
-    Returns in `*dptr` and `*bytes` the base pointer and size of the
-    managed memory with name `name` for the requested library `library`. If
-    no managed memory with the requested name `name` exists, the call
-    returns :py:obj:`~.CUDA_ERROR_NOT_FOUND`. One of the parameters `dptr`
-    or `numbytes` (not both) can be NULL in which case it is ignored. Note
-    that managed memory for library `library` is shared across devices and
-    is registered when the library is loaded into atleast one context.
+    Returns in ``*dptr`` and ``*bytes`` the base pointer and size of the
+    managed memory with name ``name`` for the requested library
+    ``library``. If no managed memory with the requested name ``name``
+    exists, the call returns :py:obj:`~.CUDA_ERROR_NOT_FOUND`. One of the
+    parameters ``dptr`` or ``numbytes`` (not both) can be NULL in which
+    case it is ignored. Note that managed memory for library ``library`` is
+    shared across devices and is registered when the library is loaded into
+    at least one context.
 
     Parameters
     ----------
@@ -30750,12 +30763,12 @@ def cuLibraryGetManaged(library, char* name):
 def cuLibraryGetUnifiedFunction(library, char* symbol):
     """ Returns a pointer to a unified function.
 
-    Returns in `*fptr` the function pointer to a unified function denoted
-    by `symbol`. If no unified function with name `symbol` exists, the call
-    returns :py:obj:`~.CUDA_ERROR_NOT_FOUND`. If there is no device with
-    attribute :py:obj:`~.CU_DEVICE_ATTRIBUTE_UNIFIED_FUNCTION_POINTERS`
-    present in the system, the call may return
-    :py:obj:`~.CUDA_ERROR_NOT_FOUND`.
+    Returns in ``*fptr`` the function pointer to a unified function denoted
+    by ``symbol``. If no unified function with name ``symbol`` exists, the
+    call returns :py:obj:`~.CUDA_ERROR_NOT_FOUND`. If there is no device
+    with attribute
+    :py:obj:`~.CU_DEVICE_ATTRIBUTE_UNIFIED_FUNCTION_POINTERS` present in
+    the system, the call may return :py:obj:`~.CUDA_ERROR_NOT_FOUND`.
 
     Parameters
     ----------
@@ -30797,8 +30810,8 @@ def cuLibraryGetUnifiedFunction(library, char* symbol):
 def cuKernelGetAttribute(attrib not None : CUfunction_attribute, kernel, dev):
     """ Returns information about a kernel.
 
-    Returns in `*pi` the integer value of the attribute `attrib` for the
-    kernel `kernel` for the requested device `dev`. The supported
+    Returns in ``*pi`` the integer value of the attribute ``attrib`` for
+    the kernel ``kernel`` for the requested device ``dev``. The supported
     attributes are:
 
     - :py:obj:`~.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK`: The maximum
@@ -30932,11 +30945,11 @@ def cuKernelGetAttribute(attrib not None : CUfunction_attribute, kernel, dev):
 def cuKernelSetAttribute(attrib not None : CUfunction_attribute, int val, kernel, dev):
     """ Sets information about a kernel.
 
-    This call sets the value of a specified attribute `attrib` on the
-    kernel `kernel` for the requested device `dev` to an integer value
-    specified by `val`. This function returns CUDA_SUCCESS if the new value
-    of the attribute could be successfully set. If the set fails, this call
-    will return an error. Not all attributes can have values set.
+    This call sets the value of a specified attribute ``attrib`` on the
+    kernel ``kernel`` for the requested device ``dev`` to an integer value
+    specified by ``val``. This function returns CUDA_SUCCESS if the new
+    value of the attribute could be successfully set. If the set fails,
+    this call will return an error. Not all attributes can have values set.
     Attempting to set a value on a read-only attribute will result in an
     error (CUDA_ERROR_INVALID_VALUE)
 
@@ -31048,13 +31061,13 @@ def cuKernelSetCacheConfig(kernel, config not None : CUfunc_cache, dev):
     """ Sets the preferred cache configuration for a device kernel.
 
     On devices where the L1 cache and shared memory use the same hardware
-    resources, this sets through `config` the preferred cache configuration
-    for the device kernel `kernel` on the requested device `dev`. This is
-    only a preference. The driver will use the requested configuration if
-    possible, but it is free to choose a different configuration if
-    required to execute `kernel`. Any context-wide preference set via
-    :py:obj:`~.cuCtxSetCacheConfig()` will be overridden by this per-kernel
-    setting.
+    resources, this sets through ``config`` the preferred cache
+    configuration for the device kernel ``kernel`` on the requested device
+    ``dev``. This is only a preference. The driver will use the requested
+    configuration if possible, but it is free to choose a different
+    configuration if required to execute ``kernel``. Any context-wide
+    preference set via :py:obj:`~.cuCtxSetCacheConfig()` will be overridden
+    by this per-kernel setting.
 
     Note that attributes set using :py:obj:`~.cuFuncSetCacheConfig()` will
     override the attribute set by this API irrespective of whether the call
@@ -31131,13 +31144,13 @@ def cuKernelSetCacheConfig(kernel, config not None : CUfunc_cache, dev):
 def cuKernelGetName(hfunc):
     """ Returns the function name for a :py:obj:`~.CUkernel` handle.
 
-    Returns in `**name` the function name associated with the kernel handle
-    `hfunc` . The function name is returned as a null-terminated string.
-    The returned name is only valid when the kernel handle is valid. If the
-    library is unloaded or reloaded, one must call the API again to get the
-    updated name. This API may return a mangled name if the function is not
-    declared as having C linkage. If either `**name` or `hfunc` is NULL,
-    :py:obj:`~.CUDA_ERROR_INVALID_VALUE` is returned.
+    Returns in ``**name`` the function name associated with the kernel
+    handle ``hfunc``. The function name is returned as a null-terminated
+    string. The returned name is only valid when the kernel handle is
+    valid. If the library is unloaded or reloaded, one must call the API
+    again to get the updated name. This API may return a mangled name if
+    the function is not declared as having C linkage. If either ``**name``
+    or ``hfunc`` is NULL, :py:obj:`~.CUDA_ERROR_INVALID_VALUE` is returned.
 
     Parameters
     ----------
@@ -31173,15 +31186,15 @@ def cuKernelGetName(hfunc):
 def cuKernelGetParamInfo(kernel, size_t paramIndex):
     """ Returns the offset and size of a kernel parameter in the device-side parameter layout.
 
-    Queries the kernel parameter at `paramIndex` into `kernel's` list of
-    parameters, and returns in `paramOffset` and `paramSize` the offset and
-    size, respectively, where the parameter will reside in the device-side
-    parameter layout. This information can be used to update kernel node
-    parameters from the device via
+    Queries the kernel parameter at ``paramIndex`` into ``kernel``'s list
+    of parameters, and returns in ``paramOffset`` and ``paramSize`` the
+    offset and size, respectively, where the parameter will reside in the
+    device-side parameter layout. This information can be used to update
+    kernel node parameters from the device via
     :py:obj:`~.cudaGraphKernelNodeSetParam()` and
-    :py:obj:`~.cudaGraphKernelNodeUpdatesApply()`. `paramIndex` must be
-    less than the number of parameters that `kernel` takes. `paramSize` can
-    be set to NULL if only the parameter offset is desired.
+    :py:obj:`~.cudaGraphKernelNodeUpdatesApply()`. ``paramIndex`` must be
+    less than the number of parameters that ``kernel`` takes. ``paramSize``
+    can be set to NULL if only the parameter offset is desired.
 
     Parameters
     ----------
@@ -31228,8 +31241,8 @@ def cuKernelGetParamInfo(kernel, size_t paramIndex):
 def cuKernelGetParamCount(kernel):
     """ Returns the number of parameters used by the kernel.
 
-    Queries the number of kernel parameters used by `kernel` and returns it
-    in `paramCount`.
+    Queries the number of kernel parameters used by ``kernel`` and returns
+    it in ``paramCount``.
 
     Parameters
     ----------
@@ -31269,15 +31282,15 @@ def cuKernelGetParamCount(kernel):
 def cuMemGetInfo():
     """ Gets free and total memory.
 
-    Returns in `*total` the total amount of memory available to the the
-    current context. Returns in `*free` the amount of memory on the device
-    that is free according to the OS. CUDA is not guaranteed to be able to
-    allocate all of the memory that the OS reports as free. In a multi-
-    tenet situation, free estimate returned is prone to race condition
-    where a new allocation/free done by a different process or a different
-    thread in the same process between the time when free memory was
-    estimated and reported, will result in deviation in free value reported
-    and actual free memory.
+    Returns in ``*total`` the total amount of memory available to the
+    current context. Returns in ``*free`` the amount of memory on the
+    device that is free according to the OS. CUDA is not guaranteed to be
+    able to allocate all of the memory that the OS reports as free. In a
+    multi-tenant situation, free estimate returned is prone to race
+    condition where a new allocation/free done by a different process or a
+    different thread in the same process between the time when free memory
+    was estimated and reported, will result in deviation in free value
+    reported and actual free memory.
 
     The integrated GPU on Tegra shares memory with CPU and other component
     of the SoC. The free and total values returned by the API excludes the
@@ -31314,10 +31327,10 @@ def cuMemGetInfo():
 def cuMemAlloc(size_t bytesize):
     """ Allocates device memory.
 
-    Allocates `bytesize` bytes of linear memory on the device and returns
-    in `*dptr` a pointer to the allocated memory. The allocated memory is
+    Allocates ``bytesize`` bytes of linear memory on the device and returns
+    in ``*dptr`` a pointer to the allocated memory. The allocated memory is
     suitably aligned for any kind of variable. The memory is not cleared.
-    If `bytesize` is 0, :py:obj:`~.cuMemAlloc()` returns
+    If ``bytesize`` is 0, :py:obj:`~.cuMemAlloc()` returns
     :py:obj:`~.CUDA_ERROR_INVALID_VALUE`.
 
     Parameters
@@ -31350,22 +31363,22 @@ def cuMemAlloc(size_t bytesize):
 def cuMemAllocPitch(size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes):
     """ Allocates pitched device memory.
 
-    Allocates at least `WidthInBytes` * `Height` bytes of linear memory on
-    the device and returns in `*dptr` a pointer to the allocated memory.
-    The function may pad the allocation to ensure that corresponding
-    pointers in any given row will continue to meet the alignment
-    requirements for coalescing as the address is updated from row to row.
-    `ElementSizeBytes` specifies the size of the largest reads and writes
-    that will be performed on the memory range. `ElementSizeBytes` may be
-    4, 8 or 16 (since coalesced memory transactions are not possible on
-    other data sizes). If `ElementSizeBytes` is smaller than the actual
-    read/write size of a kernel, the kernel will run correctly, but
-    possibly at reduced speed. The pitch returned in `*pPitch` by
-    :py:obj:`~.cuMemAllocPitch()` is the width in bytes of the allocation.
-    The intended usage of pitch is as a separate parameter of the
-    allocation, used to compute addresses within the 2D array. Given the
-    row and column of an array element of type T, the address is computed
-    as:
+    Allocates at least ``WidthInBytes`` * ``Height`` bytes of linear memory
+    on the device and returns in ``*dptr`` a pointer to the allocated
+    memory. The function may pad the allocation to ensure that
+    corresponding pointers in any given row will continue to meet the
+    alignment requirements for coalescing as the address is updated from
+    row to row. ``ElementSizeBytes`` specifies the size of the largest
+    reads and writes that will be performed on the memory range.
+    ``ElementSizeBytes`` may be 4, 8 or 16 (since coalesced memory
+    transactions are not possible on other data sizes). If
+    ``ElementSizeBytes`` is smaller than the actual read/write size of a
+    kernel, the kernel will run correctly, but possibly at reduced speed.
+    The pitch returned in ``*pPitch`` by :py:obj:`~.cuMemAllocPitch()` is
+    the width in bytes of the allocation. The intended usage of pitch is as
+    a separate parameter of the allocation, used to compute addresses
+    within the 2D array. Given the row and column of an array element of
+    type T, the address is computed as:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
@@ -31419,7 +31432,7 @@ def cuMemAllocPitch(size_t WidthInBytes, size_t Height, unsigned int ElementSize
 def cuMemFree(dptr):
     """ Frees device memory.
 
-    Frees the memory space pointed to by `dptr`, which must have been
+    Frees the memory space pointed to by ``dptr``, which must have been
     returned by a previous call to one of the following memory allocation
     APIs - :py:obj:`~.cuMemAlloc()`, :py:obj:`~.cuMemAllocPitch()`,
     :py:obj:`~.cuMemAllocManaged()`, :py:obj:`~.cuMemAllocAsync()`,
@@ -31467,9 +31480,9 @@ def cuMemFree(dptr):
 def cuMemGetAddressRange(dptr):
     """ Get information on memory allocations.
 
-    Returns the base address in `*pbase` and size in `*psize` of the
-    allocation that contains the input pointer `dptr`. Both parameters
-    `pbase` and `psize` are optional. If one of them is NULL, it is
+    Returns the base address in ``*pbase`` and size in ``*psize`` of the
+    allocation that contains the input pointer ``dptr``. Both parameters
+    ``pbase`` and ``psize`` are optional. If one of them is NULL, it is
     ignored.
 
     Parameters
@@ -31513,7 +31526,7 @@ def cuMemGetAddressRange(dptr):
 def cuMemAllocHost(size_t bytesize):
     """ Allocates page-locked host memory.
 
-    Allocates `bytesize` bytes of host memory that is page-locked and
+    Allocates ``bytesize`` bytes of host memory that is page-locked and
     accessible to the device. The driver tracks the virtual memory ranges
     allocated with this function and automatically accelerates calls to
     functions such as :py:obj:`~.cuMemcpy()`. Since the memory can be
@@ -31537,8 +31550,8 @@ def cuMemAllocHost(size_t bytesize):
     which support unified addressing (as may be queried using
     :py:obj:`~.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING`). The device pointer
     that may be used to access this host memory from those contexts is
-    always equal to the returned host pointer `*pp`. See :py:obj:`~.Unified
-    Addressing` for additional details.
+    always equal to the returned host pointer ``*pp``. See Unified
+    Addressing for additional details.
 
     Parameters
     ----------
@@ -31570,8 +31583,8 @@ def cuMemAllocHost(size_t bytesize):
 def cuMemFreeHost(p):
     """ Frees page-locked host memory.
 
-    Frees the memory space pointed to by `p`, which must have been returned
-    by a previous call to :py:obj:`~.cuMemAllocHost()`.
+    Frees the memory space pointed to by ``p``, which must have been
+    returned by a previous call to :py:obj:`~.cuMemAllocHost()`.
 
     Parameters
     ----------
@@ -31601,7 +31614,7 @@ def cuMemFreeHost(p):
 def cuMemHostAlloc(size_t bytesize, unsigned int Flags):
     """ Allocates page-locked host memory.
 
-    Allocates `bytesize` bytes of host memory that is page-locked and
+    Allocates ``bytesize`` bytes of host memory that is page-locked and
     accessible to the device. The driver tracks the virtual memory ranges
     allocated with this function and automatically accelerates calls to
     functions such as :py:obj:`~.cuMemcpyHtoD()`. Since the memory can be
@@ -31619,7 +31632,7 @@ def cuMemHostAlloc(size_t bytesize, unsigned int Flags):
     system for paging. As a result, this function is best used sparingly to
     allocate staging areas for data exchange between host and device.
 
-    The `Flags` parameter enables different options to be specified that
+    The ``Flags`` parameter enables different options to be specified that
     affect the allocation, as follows.
 
     - :py:obj:`~.CU_MEMHOSTALLOC_PORTABLE`: The memory returned by this
@@ -31656,11 +31669,11 @@ def cuMemHostAlloc(size_t bytesize, unsigned int Flags):
     :py:obj:`~.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING`). Unless the flag
     :py:obj:`~.CU_MEMHOSTALLOC_WRITECOMBINED` is specified, the device
     pointer that may be used to access this host memory from those contexts
-    is always equal to the returned host pointer `*pp`. If the flag
+    is always equal to the returned host pointer ``*pp``. If the flag
     :py:obj:`~.CU_MEMHOSTALLOC_WRITECOMBINED` is specified, then the
     function :py:obj:`~.cuMemHostGetDevicePointer()` must be used to query
     the device pointer, even if the context supports unified addressing.
-    See :py:obj:`~.Unified Addressing` for additional details.
+    See Unified Addressing for additional details.
 
     Parameters
     ----------
@@ -31694,8 +31707,8 @@ def cuMemHostAlloc(size_t bytesize, unsigned int Flags):
 def cuMemHostGetDevicePointer(p, unsigned int Flags):
     """ Passes back device pointer of mapped pinned memory.
 
-    Passes back the device pointer `pdptr` corresponding to the mapped,
-    pinned host buffer `p` allocated by :py:obj:`~.cuMemHostAlloc`.
+    Passes back the device pointer ``pdptr`` corresponding to the mapped,
+    pinned host buffer ``p`` allocated by :py:obj:`~.cuMemHostAlloc`.
 
     :py:obj:`~.cuMemHostGetDevicePointer()` will fail if the
     :py:obj:`~.CU_MEMHOSTALLOC_DEVICEMAP` flag was not specified at the
@@ -31705,23 +31718,23 @@ def cuMemHostGetDevicePointer(p, unsigned int Flags):
     For devices that have a non-zero value for the device attribute
     :py:obj:`~.CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM`,
     the memory can also be accessed from the device using the host pointer
-    `p`. The device pointer returned by
+    ``p``. The device pointer returned by
     :py:obj:`~.cuMemHostGetDevicePointer()` may or may not match the
-    original host pointer `p` and depends on the devices visible to the
+    original host pointer ``p`` and depends on the devices visible to the
     application. If all devices visible to the application have a non-zero
     value for the device attribute, the device pointer returned by
     :py:obj:`~.cuMemHostGetDevicePointer()` will match the original pointer
-    `p`. If any device visible to the application has a zero value for the
-    device attribute, the device pointer returned by
+    ``p``. If any device visible to the application has a zero value for
+    the device attribute, the device pointer returned by
     :py:obj:`~.cuMemHostGetDevicePointer()` will not match the original
-    host pointer `p`, but it will be suitable for use on all devices
+    host pointer ``p``, but it will be suitable for use on all devices
     provided Unified Virtual Addressing is enabled. In such systems, it is
     valid to access the memory using either pointer on devices that have a
     non-zero value for the device attribute. Note however that such devices
     should access the memory using only one of the two pointers and not
     both.
 
-    `Flags` provides for future releases. For now, it must be set to 0.
+    ``Flags`` is reserved for future releases. For now, it must be set to 0.
 
     Parameters
     ----------
@@ -31758,8 +31771,8 @@ def cuMemHostGetDevicePointer(p, unsigned int Flags):
 def cuMemHostGetFlags(p):
     """ Passes back flags that were used for a pinned allocation.
 
-    Passes back the flags `pFlags` that were specified when allocating the
-    pinned host buffer `p` allocated by :py:obj:`~.cuMemHostAlloc`.
+    Passes back the flags ``pFlags`` that were specified when allocating
+    the pinned host buffer ``p`` allocated by :py:obj:`~.cuMemHostAlloc`.
 
     :py:obj:`~.cuMemHostGetFlags()` will fail if the pointer does not
     reside in an allocation performed by :py:obj:`~.cuMemAllocHost()` or
@@ -31798,19 +31811,20 @@ def cuMemHostGetFlags(p):
 def cuMemAllocManaged(size_t bytesize, unsigned int flags):
     """ Allocates memory that will be automatically managed by the Unified Memory system.
 
-    Allocates `bytesize` bytes of managed memory on the device and returns
-    in `*dptr` a pointer to the allocated memory. If the device doesn't
-    support allocating managed memory, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
-    is returned. Support for managed memory can be queried using the device
-    attribute :py:obj:`~.CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY`. The allocated
-    memory is suitably aligned for any kind of variable. The memory is not
-    cleared. If `bytesize` is 0, :py:obj:`~.cuMemAllocManaged` returns
+    Allocates ``bytesize`` bytes of managed memory on the device and
+    returns in ``*dptr`` a pointer to the allocated memory. If the device
+    doesn't support allocating managed memory,
+    :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED` is returned. Support for managed
+    memory can be queried using the device attribute
+    :py:obj:`~.CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY`. The allocated memory is
+    suitably aligned for any kind of variable. The memory is not cleared.
+    If ``bytesize`` is 0, :py:obj:`~.cuMemAllocManaged` returns
     :py:obj:`~.CUDA_ERROR_INVALID_VALUE`. The pointer is valid on the CPU
     and on all GPUs in the system that support managed memory. All accesses
     to this pointer must obey the Unified Memory programming model.
 
-    `flags` specifies the default stream association for this allocation.
-    `flags` must be one of :py:obj:`~.CU_MEM_ATTACH_GLOBAL` or
+    ``flags`` specifies the default stream association for this allocation.
+    ``flags`` must be one of :py:obj:`~.CU_MEM_ATTACH_GLOBAL` or
     :py:obj:`~.CU_MEM_ATTACH_HOST`. If :py:obj:`~.CU_MEM_ATTACH_GLOBAL` is
     specified, then this memory is accessible from any stream on any
     device. If :py:obj:`~.CU_MEM_ATTACH_HOST` is specified, then the
@@ -31947,10 +31961,10 @@ cdef void cuAsyncNotificationCallbackWrapper(cydriver.CUasyncNotificationInfo *i
 def cuDeviceRegisterAsyncNotification(device, callbackFunc, userData):
     """ Registers a callback function to receive async notifications.
 
-    Registers `callbackFunc` to receive async notifications.
+    Registers ``callbackFunc`` to receive async notifications.
 
-    The `userData` parameter is passed to the callback function at async
-    notification time.   Likewise, `callback` is also passed to the
+    The ``userData`` parameter is passed to the callback function at async
+    notification time.   Likewise, ``callback`` is also passed to the
     callback function to distinguish between multiple registered callbacks.
 
     The callback function being registered should be designed to return
@@ -31962,7 +31976,7 @@ def cuDeviceRegisterAsyncNotification(device, callbackFunc, userData):
     :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`. Async notification callbacks
     execute in an undefined order and may be serialized.
 
-    Returns in `*callback` a handle representing the registered callback
+    Returns in ``*callback`` a handle representing the registered callback
     instance.
 
     Parameters
@@ -32031,13 +32045,13 @@ def cuDeviceRegisterAsyncNotification(device, callbackFunc, userData):
 def cuDeviceUnregisterAsyncNotification(device, callback):
     """ Unregisters an async notification callback.
 
-    Unregisters `callback` so that the corresponding callback function will
-    stop receiving async notifications.
+    Unregisters ``callback`` so that the corresponding callback function
+    will stop receiving async notifications.
 
     Parameters
     ----------
     device : :py:obj:`~.CUdevice`
-        The device from which to remove `callback`.
+        The device from which to remove ``callback``.
     callback : :py:obj:`~.CUasyncCallbackHandle`
         The callback instance to unregister from receiving async
         notifications.
@@ -32081,10 +32095,10 @@ def cuDeviceUnregisterAsyncNotification(device, callback):
 def cuDeviceGetByPCIBusId(char* pciBusId):
     """ Returns a handle to a compute device.
 
-    Returns in `*device` a device handle given a PCI bus ID string.
+    Returns in ``*device`` a device handle given a PCI bus ID string.
 
-    where `domain`, `bus`, `device`, and `function` are all hexadecimal
-    values
+    where ``domain``, ``bus``, ``device``, and ``function`` are all
+    hexadecimal values
 
     Parameters
     ----------
@@ -32116,18 +32130,18 @@ def cuDeviceGetByPCIBusId(char* pciBusId):
 def cuDeviceGetPCIBusId(int length, dev):
     """ Returns a PCI Bus Id string for the device.
 
-    Returns an ASCII string identifying the device `dev` in the NULL-
-    terminated string pointed to by `pciBusId`. `length` specifies the
+    Returns an ASCII string identifying the device ``dev`` in the NULL-
+    terminated string pointed to by ``pciBusId``. ``length`` specifies the
     maximum length of the string that may be returned.
 
-    where `domain`, `bus`, `device`, and `function` are all hexadecimal
-    values. pciBusId should be large enough to store 13 characters
-    including the NULL-terminator.
+    where ``domain``, ``bus``, ``device``, and ``function`` are all
+    hexadecimal values. pciBusId should be large enough to store 13
+    characters including the NULL-terminator.
 
     Parameters
     ----------
     length : int
-        Maximum length of string to store in `name`
+        Maximum length of string to store in ``pciBusId``
     dev : :py:obj:`~.CUdevice`
         Device to get identifier string for
 
@@ -32382,7 +32396,7 @@ def cuIpcOpenMemHandle(handle not None : CUipcMemHandle, unsigned int Flags):
 
     Notes
     -----
-    No guarantees are made about the address returned in `*pdptr`. In particular, multiple processes may not receive the same address for the same `handle`.
+    No guarantees are made about the address returned in ``*pdptr``. In particular, multiple processes may not receive the same address for the same ``handle``.
     """
     cdef CUdeviceptr pdptr = CUdeviceptr()
     with nogil:
@@ -32447,24 +32461,24 @@ def cuIpcCloseMemHandle(dptr):
 def cuMemHostRegister(p, size_t bytesize, unsigned int Flags):
     """ Registers an existing host memory range for use by CUDA.
 
-    Page-locks the memory range specified by `p` and `bytesize` and maps it
-    for the device(s) as specified by `Flags`. This memory range also is
-    added to the same tracking mechanism as :py:obj:`~.cuMemHostAlloc` to
-    automatically accelerate calls to functions such as
-    :py:obj:`~.cuMemcpyHtoD()`. Since the memory can be accessed directly
-    by the device, it can be read or written with much higher bandwidth
-    than pageable memory that has not been registered. Page-locking
-    excessive amounts of memory may degrade system performance, since it
-    reduces the amount of memory available to the system for paging. As a
-    result, this function is best used sparingly to register staging areas
-    for data exchange between host and device.
+    Page-locks the memory range specified by ``p`` and ``bytesize`` and
+    maps it for the device(s) as specified by ``Flags``. This memory range
+    also is added to the same tracking mechanism as
+    :py:obj:`~.cuMemHostAlloc` to automatically accelerate calls to
+    functions such as :py:obj:`~.cuMemcpyHtoD()`. Since the memory can be
+    accessed directly by the device, it can be read or written with much
+    higher bandwidth than pageable memory that has not been registered.
+    Page-locking excessive amounts of memory may degrade system
+    performance, since it reduces the amount of memory available to the
+    system for paging. As a result, this function is best used sparingly to
+    register staging areas for data exchange between host and device.
 
     On systems where
     :py:obj:`~.CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES`
     is true, :py:obj:`~.cuMemHostRegister` will not page-lock the memory
-    range specified by `ptr` but only populate unpopulated pages.
+    range specified by ``ptr`` but only populate unpopulated pages.
 
-    The `Flags` parameter enables different options to be specified that
+    The ``Flags`` parameter enables different options to be specified that
     affect the allocation, as follows.
 
     - :py:obj:`~.CU_MEMHOSTREGISTER_PORTABLE`: The memory returned by this
@@ -32503,16 +32517,16 @@ def cuMemHostRegister(p, size_t bytesize, unsigned int Flags):
     For devices that have a non-zero value for the device attribute
     :py:obj:`~.CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM`,
     the memory can also be accessed from the device using the host pointer
-    `p`. The device pointer returned by
+    ``p``. The device pointer returned by
     :py:obj:`~.cuMemHostGetDevicePointer()` may or may not match the
-    original host pointer `ptr` and depends on the devices visible to the
+    original host pointer ``ptr`` and depends on the devices visible to the
     application. If all devices visible to the application have a non-zero
     value for the device attribute, the device pointer returned by
     :py:obj:`~.cuMemHostGetDevicePointer()` will match the original pointer
-    `ptr`. If any device visible to the application has a zero value for
+    ``ptr``. If any device visible to the application has a zero value for
     the device attribute, the device pointer returned by
     :py:obj:`~.cuMemHostGetDevicePointer()` will not match the original
-    host pointer `ptr`, but it will be suitable for use on all devices
+    host pointer ``ptr``, but it will be suitable for use on all devices
     provided Unified Virtual Addressing is enabled. In such systems, it is
     valid to access the memory using either pointer on devices that have a
     non-zero value for the device attribute. Note however that such devices
@@ -32553,7 +32567,7 @@ def cuMemHostRegister(p, size_t bytesize, unsigned int Flags):
 def cuMemHostUnregister(p):
     """ Unregisters a memory range that was registered with cuMemHostRegister.
 
-    Unmaps the memory range whose base address is specified by `p`, and
+    Unmaps the memory range whose base address is specified by ``p``, and
     makes it pageable again.
 
     The base address must be the same one specified to
@@ -32587,11 +32601,11 @@ def cuMemHostUnregister(p):
 def cuMemcpy(dst, src, size_t ByteCount):
     """ Copies memory.
 
-    Copies data between two pointers. `dst` and `src` are base pointers of
-    the destination and source, respectively. `ByteCount` specifies the
-    number of bytes to copy. Note that this function infers the type of the
-    transfer (host to host, host to device, device to device, or device to
-    host) from the pointer values. This function is only allowed in
+    Copies data between two pointers. ``dst`` and ``src`` are base pointers
+    of the destination and source, respectively. ``ByteCount`` specifies
+    the number of bytes to copy. Note that this function infers the type of
+    the transfer (host to host, host to device, device to device, or device
+    to host) from the pointer values. This function is only allowed in
     contexts which support unified addressing.
 
     Parameters
@@ -32640,10 +32654,10 @@ def cuMemcpyPeer(dstDevice, dstContext, srcDevice, srcContext, size_t ByteCount)
     """ Copies device memory between two contexts.
 
     Copies from device memory in one context to device memory in another
-    context. `dstDevice` is the base device pointer of the destination
-    memory and `dstContext` is the destination context. `srcDevice` is the
-    base device pointer of the source memory and `srcContext` is the source
-    pointer. `ByteCount` specifies the number of bytes to copy.
+    context. ``dstDevice`` is the base device pointer of the destination
+    memory and ``dstContext`` is the destination context. ``srcDevice`` is
+    the base device pointer of the source memory and ``srcContext`` is the
+    source context. ``ByteCount`` specifies the number of bytes to copy.
 
     Parameters
     ----------
@@ -32710,9 +32724,9 @@ def cuMemcpyPeer(dstDevice, dstContext, srcDevice, srcContext, size_t ByteCount)
 def cuMemcpyHtoD(dstDevice, srcHost, size_t ByteCount):
     """ Copies memory from Host to Device.
 
-    Copies from host memory to device memory. `dstDevice` and `srcHost` are
-    the base addresses of the destination and source, respectively.
-    `ByteCount` specifies the number of bytes to copy.
+    Copies from host memory to device memory. ``dstDevice`` and ``srcHost``
+    are the base addresses of the destination and source, respectively.
+    ``ByteCount`` specifies the number of bytes to copy.
 
     Parameters
     ----------
@@ -32754,9 +32768,9 @@ def cuMemcpyHtoD(dstDevice, srcHost, size_t ByteCount):
 def cuMemcpyDtoH(dstHost, srcDevice, size_t ByteCount):
     """ Copies memory from Device to Host.
 
-    Copies from device to host memory. `dstHost` and `srcDevice` specify
-    the base pointers of the destination and source, respectively.
-    `ByteCount` specifies the number of bytes to copy.
+    Copies from device to host memory. ``dstHost`` and ``srcDevice``
+    specify the base pointers of the destination and source, respectively.
+    ``ByteCount`` specifies the number of bytes to copy.
 
     Parameters
     ----------
@@ -32798,9 +32812,9 @@ def cuMemcpyDtoH(dstHost, srcDevice, size_t ByteCount):
 def cuMemcpyDtoD(dstDevice, srcDevice, size_t ByteCount):
     """ Copies memory from Device to Device.
 
-    Copies from device memory to device memory. `dstDevice` and `srcDevice`
-    are the base pointers of the destination and source, respectively.
-    `ByteCount` specifies the number of bytes to copy.
+    Copies from device memory to device memory. ``dstDevice`` and
+    ``srcDevice`` are the base pointers of the destination and source,
+    respectively. ``ByteCount`` specifies the number of bytes to copy.
 
     Parameters
     ----------
@@ -32847,10 +32861,10 @@ def cuMemcpyDtoD(dstDevice, srcDevice, size_t ByteCount):
 def cuMemcpyDtoA(dstArray, size_t dstOffset, srcDevice, size_t ByteCount):
     """ Copies memory from Device to Array.
 
-    Copies from device memory to a 1D CUDA array. `dstArray` and
-    `dstOffset` specify the CUDA array handle and starting index of the
-    destination data. `srcDevice` specifies the base pointer of the source.
-    `ByteCount` specifies the number of bytes to copy.
+    Copies from device memory to a 1D CUDA array. ``dstArray`` and
+    ``dstOffset`` specify the CUDA array handle and starting index of the
+    destination data. ``srcDevice`` specifies the base pointer of the
+    source. ``ByteCount`` specifies the number of bytes to copy.
 
     Parameters
     ----------
@@ -32899,12 +32913,12 @@ def cuMemcpyDtoA(dstArray, size_t dstOffset, srcDevice, size_t ByteCount):
 def cuMemcpyAtoD(dstDevice, srcArray, size_t srcOffset, size_t ByteCount):
     """ Copies memory from Array to Device.
 
-    Copies from one 1D CUDA array to device memory. `dstDevice` specifies
+    Copies from one 1D CUDA array to device memory. ``dstDevice`` specifies
     the base pointer of the destination and must be naturally aligned with
-    the CUDA array elements. `srcArray` and `srcOffset` specify the CUDA
-    array handle and the offset in bytes into the array where the copy is
-    to begin. `ByteCount` specifies the number of bytes to copy and must be
-    evenly divisible by the array element size.
+    the CUDA array elements. ``srcArray`` and ``srcOffset`` specify the
+    CUDA array handle and the offset in bytes into the array where the copy
+    is to begin. ``ByteCount`` specifies the number of bytes to copy and
+    must be evenly divisible by the array element size.
 
     Parameters
     ----------
@@ -32953,10 +32967,10 @@ def cuMemcpyAtoD(dstDevice, srcArray, size_t srcOffset, size_t ByteCount):
 def cuMemcpyHtoA(dstArray, size_t dstOffset, srcHost, size_t ByteCount):
     """ Copies memory from Host to Array.
 
-    Copies from host memory to a 1D CUDA array. `dstArray` and `dstOffset`
-    specify the CUDA array handle and starting offset in bytes of the
-    destination data. `pSrc` specifies the base address of the source.
-    `ByteCount` specifies the number of bytes to copy.
+    Copies from host memory to a 1D CUDA array. ``dstArray`` and
+    ``dstOffset`` specify the CUDA array handle and starting offset in
+    bytes of the destination data. ``srcHost`` specifies the base address of
+    the source. ``ByteCount`` specifies the number of bytes to copy.
 
     Parameters
     ----------
@@ -33000,10 +33014,10 @@ def cuMemcpyHtoA(dstArray, size_t dstOffset, srcHost, size_t ByteCount):
 def cuMemcpyAtoH(dstHost, srcArray, size_t srcOffset, size_t ByteCount):
     """ Copies memory from Array to Host.
 
-    Copies from one 1D CUDA array to host memory. `dstHost` specifies the
-    base pointer of the destination. `srcArray` and `srcOffset` specify the
-    CUDA array handle and starting offset in bytes of the source data.
-    `ByteCount` specifies the number of bytes to copy.
+    Copies from one 1D CUDA array to host memory. ``dstHost`` specifies the
+    base pointer of the destination. ``srcArray`` and ``srcOffset`` specify
+    the CUDA array handle and starting offset in bytes of the source data.
+    ``ByteCount`` specifies the number of bytes to copy.
 
     Parameters
     ----------
@@ -33047,13 +33061,14 @@ def cuMemcpyAtoH(dstHost, srcArray, size_t srcOffset, size_t ByteCount):
 def cuMemcpyAtoA(dstArray, size_t dstOffset, srcArray, size_t srcOffset, size_t ByteCount):
     """ Copies memory from Array to Array.
 
-    Copies from one 1D CUDA array to another. `dstArray` and `srcArray`
+    Copies from one 1D CUDA array to another. ``dstArray`` and ``srcArray``
     specify the handles of the destination and source CUDA arrays for the
-    copy, respectively. `dstOffset` and `srcOffset` specify the destination
-    and source offsets in bytes into the CUDA arrays. `ByteCount` is the
-    number of bytes to be copied. The size of the elements in the CUDA
-    arrays need not be the same format, but the elements must be the same
-    size; and count must be evenly divisible by that size.
+    copy, respectively. ``dstOffset`` and ``srcOffset`` specify the
+    destination and source offsets in bytes into the CUDA arrays.
+    ``ByteCount`` is the number of bytes to be copied. The size of the
+    elements in the CUDA arrays need not be the same format, but the
+    elements must be the same size; and count must be evenly divisible by
+    that size.
 
     Parameters
     ----------
@@ -33105,7 +33120,7 @@ def cuMemcpy2D(pCopy : Optional[CUDA_MEMCPY2D]):
     """ Copies memory for 2D arrays.
 
     Perform a 2D memory copy according to the parameters specified in
-    `pCopy`. The :py:obj:`~.CUDA_MEMCPY2D` structure is defined as:
+    ``pCopy``. The :py:obj:`~.CUDA_MEMCPY2D` structure is defined as:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
@@ -33232,7 +33247,7 @@ def cuMemcpy2DUnaligned(pCopy : Optional[CUDA_MEMCPY2D]):
     """ Copies memory for 2D arrays.
 
     Perform a 2D memory copy according to the parameters specified in
-    `pCopy`. The :py:obj:`~.CUDA_MEMCPY2D` structure is defined as:
+    ``pCopy``. The :py:obj:`~.CUDA_MEMCPY2D` structure is defined as:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
@@ -33359,7 +33374,7 @@ def cuMemcpy3D(pCopy : Optional[CUDA_MEMCPY3D]):
     """ Copies memory for 3D arrays.
 
     Perform a 3D memory copy according to the parameters specified in
-    `pCopy`. The :py:obj:`~.CUDA_MEMCPY3D` structure is defined as:
+    ``pCopy``. The :py:obj:`~.CUDA_MEMCPY3D` structure is defined as:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
@@ -33489,7 +33504,7 @@ def cuMemcpy3DPeer(pCopy : Optional[CUDA_MEMCPY3D_PEER]):
     """ Copies memory between contexts.
 
     Perform a 3D memory copy according to the parameters specified in
-    `pCopy`. See the definition of the :py:obj:`~.CUDA_MEMCPY3D_PEER`
+    ``pCopy``. See the definition of the :py:obj:`~.CUDA_MEMCPY3D_PEER`
     structure for documentation of its parameters.
 
     Parameters
@@ -33518,11 +33533,11 @@ def cuMemcpy3DPeer(pCopy : Optional[CUDA_MEMCPY3D_PEER]):
 def cuMemcpyAsync(dst, src, size_t ByteCount, hStream):
     """ Copies memory asynchronously.
 
-    Copies data between two pointers. `dst` and `src` are base pointers of
-    the destination and source, respectively. `ByteCount` specifies the
-    number of bytes to copy. Note that this function infers the type of the
-    transfer (host to host, host to device, device to device, or device to
-    host) from the pointer values. This function is only allowed in
+    Copies data between two pointers. ``dst`` and ``src`` are base pointers
+    of the destination and source, respectively. ``ByteCount`` specifies
+    the number of bytes to copy. Note that this function infers the type of
+    the transfer (host to host, host to device, device to device, or device
+    to host) from the pointer values. This function is only allowed in
     contexts which support unified addressing.
 
     Parameters
@@ -33581,10 +33596,10 @@ def cuMemcpyPeerAsync(dstDevice, dstContext, srcDevice, srcContext, size_t ByteC
     """ Copies device memory between two contexts asynchronously.
 
     Copies from device memory in one context to device memory in another
-    context. `dstDevice` is the base device pointer of the destination
-    memory and `dstContext` is the destination context. `srcDevice` is the
-    base device pointer of the source memory and `srcContext` is the source
-    pointer. `ByteCount` specifies the number of bytes to copy.
+    context. ``dstDevice`` is the base device pointer of the destination
+    memory and ``dstContext`` is the destination context. ``srcDevice`` is
+    the base device pointer of the source memory and ``srcContext`` is the
+    source context. ``ByteCount`` specifies the number of bytes to copy.
 
     Parameters
     ----------
@@ -33661,9 +33676,9 @@ def cuMemcpyPeerAsync(dstDevice, dstContext, srcDevice, srcContext, size_t ByteC
 def cuMemcpyHtoDAsync(dstDevice, srcHost, size_t ByteCount, hStream):
     """ Copies memory from Host to Device.
 
-    Copies from host memory to device memory. `dstDevice` and `srcHost` are
-    the base addresses of the destination and source, respectively.
-    `ByteCount` specifies the number of bytes to copy.
+    Copies from host memory to device memory. ``dstDevice`` and ``srcHost``
+    are the base addresses of the destination and source, respectively.
+    ``ByteCount`` specifies the number of bytes to copy.
 
     Parameters
     ----------
@@ -33715,9 +33730,9 @@ def cuMemcpyHtoDAsync(dstDevice, srcHost, size_t ByteCount, hStream):
 def cuMemcpyDtoHAsync(dstHost, srcDevice, size_t ByteCount, hStream):
     """ Copies memory from Device to Host.
 
-    Copies from device to host memory. `dstHost` and `srcDevice` specify
-    the base pointers of the destination and source, respectively.
-    `ByteCount` specifies the number of bytes to copy.
+    Copies from device to host memory. ``dstHost`` and ``srcDevice``
+    specify the base pointers of the destination and source, respectively.
+    ``ByteCount`` specifies the number of bytes to copy.
 
     Parameters
     ----------
@@ -33769,9 +33784,9 @@ def cuMemcpyDtoHAsync(dstHost, srcDevice, size_t ByteCount, hStream):
 def cuMemcpyDtoDAsync(dstDevice, srcDevice, size_t ByteCount, hStream):
     """ Copies memory from Device to Device.
 
-    Copies from device memory to device memory. `dstDevice` and `srcDevice`
-    are the base pointers of the destination and source, respectively.
-    `ByteCount` specifies the number of bytes to copy.
+    Copies from device memory to device memory. ``dstDevice`` and
+    ``srcDevice`` are the base pointers of the destination and source,
+    respectively. ``ByteCount`` specifies the number of bytes to copy.
 
     Parameters
     ----------
@@ -33828,10 +33843,10 @@ def cuMemcpyDtoDAsync(dstDevice, srcDevice, size_t ByteCount, hStream):
 def cuMemcpyHtoAAsync(dstArray, size_t dstOffset, srcHost, size_t ByteCount, hStream):
     """ Copies memory from Host to Array.
 
-    Copies from host memory to a 1D CUDA array. `dstArray` and `dstOffset`
-    specify the CUDA array handle and starting offset in bytes of the
-    destination data. `srcHost` specifies the base address of the source.
-    `ByteCount` specifies the number of bytes to copy.
+    Copies from host memory to a 1D CUDA array. ``dstArray`` and
+    ``dstOffset`` specify the CUDA array handle and starting offset in
+    bytes of the destination data. ``srcHost`` specifies the base address
+    of the source. ``ByteCount`` specifies the number of bytes to copy.
 
     Parameters
     ----------
@@ -33885,10 +33900,10 @@ def cuMemcpyHtoAAsync(dstArray, size_t dstOffset, srcHost, size_t ByteCount, hSt
 def cuMemcpyAtoHAsync(dstHost, srcArray, size_t srcOffset, size_t ByteCount, hStream):
     """ Copies memory from Array to Host.
 
-    Copies from one 1D CUDA array to host memory. `dstHost` specifies the
-    base pointer of the destination. `srcArray` and `srcOffset` specify the
-    CUDA array handle and starting offset in bytes of the source data.
-    `ByteCount` specifies the number of bytes to copy.
+    Copies from one 1D CUDA array to host memory. ``dstHost`` specifies the
+    base pointer of the destination. ``srcArray`` and ``srcOffset`` specify
+    the CUDA array handle and starting offset in bytes of the source data.
+    ``ByteCount`` specifies the number of bytes to copy.
 
     Parameters
     ----------
@@ -33943,7 +33958,7 @@ def cuMemcpy2DAsync(pCopy : Optional[CUDA_MEMCPY2D], hStream):
     """ Copies memory for 2D arrays.
 
     Perform a 2D memory copy according to the parameters specified in
-    `pCopy`. The :py:obj:`~.CUDA_MEMCPY2D` structure is defined as:
+    ``pCopy``. The :py:obj:`~.CUDA_MEMCPY2D` structure is defined as:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
@@ -34087,7 +34102,7 @@ def cuMemcpy3DAsync(pCopy : Optional[CUDA_MEMCPY3D], hStream):
     """ Copies memory for 3D arrays.
 
     Perform a 3D memory copy according to the parameters specified in
-    `pCopy`. The :py:obj:`~.CUDA_MEMCPY3D` structure is defined as:
+    ``pCopy``. The :py:obj:`~.CUDA_MEMCPY3D` structure is defined as:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
@@ -34227,7 +34242,7 @@ def cuMemcpy3DPeerAsync(pCopy : Optional[CUDA_MEMCPY3D_PEER], hStream):
     """ Copies memory between contexts asynchronously.
 
     Perform a 3D memory copy according to the parameters specified in
-    `pCopy`. See the definition of the :py:obj:`~.CUDA_MEMCPY3D_PEER`
+    ``pCopy``. See the definition of the :py:obj:`~.CUDA_MEMCPY3D_PEER`
     structure for documentation of its parameters.
 
     Parameters
@@ -34272,26 +34287,26 @@ def cuMemcpyBatchAsync(dsts : Optional[tuple[CUdeviceptr] | list[CUdeviceptr]],
     For copies involving CUDA arrays, please see
     :py:obj:`~.cuMemcpy3DBatchAsync`.
 
-    Performs memory copies from source buffers specified in `srcs` to
-    destination buffers specified in `dsts`. The size of each copy is
-    specified in `sizes`. All three arrays must be of the same length as
-    specified by `count`. Since there are no ordering guarantees for copies
-    within a batch, specifying any dependent copies within a batch will
-    result in undefined behavior.
+    Performs memory copies from source buffers specified in ``srcs`` to
+    destination buffers specified in ``dsts``. The size of each copy is
+    specified in ``sizes``. All three arrays must be of the same length as
+    specified by ``count``. Since there are no ordering guarantees for
+    copies within a batch, specifying any dependent copies within a batch
+    will result in undefined behavior.
 
     Every copy in the batch has to be associated with a set of attributes
-    specified in the `attrs` array. Each entry in this array can apply to
-    more than one copy. This can be done by specifying in the `attrsIdxs`
+    specified in the ``attrs`` array. Each entry in this array can apply to
+    more than one copy. This can be done by specifying in the ``attrsIdxs``
     array, the index of the first copy that the corresponding entry in the
-    `attrs` array applies to. Both `attrs` and `attrsIdxs` must be of the
-    same length as specified by `numAttrs`. For example, if a batch has 10
-    copies listed in dst/src/sizes, the first 6 of which have one set of
-    attributes and the remaining 4 another, then `numAttrs` will be 2,
-    `attrsIdxs` will be {0, 6} and `attrs` will contains the two sets of
-    attributes. Note that the first entry in `attrsIdxs` must always be 0.
-    Also, each entry must be greater than the previous entry and the last
-    entry should be less than `count`. Furthermore, `numAttrs` must be
-    lesser than or equal to `count`.
+    ``attrs`` array applies to. Both ``attrs`` and ``attrsIdxs`` must be of
+    the same length as specified by ``numAttrs``. For example, if a batch
+    has 10 copies listed in dst/src/sizes, the first 6 of which have one
+    set of attributes and the remaining 4 another, then ``numAttrs`` will
+    be 2, ``attrsIdxs`` will be {0, 6} and ``attrs`` will contain the two
+    sets of attributes. Note that the first entry in ``attrsIdxs`` must
+    always be 0. Also, each entry must be greater than the previous entry
+    and the last entry should be less than ``count``. Furthermore,
+    ``numAttrs`` must be less than or equal to ``count``.
 
     The :py:obj:`~.CUmemcpyAttributes.srcAccessOrder` indicates the source
     access ordering to be observed for copies associated with the
@@ -34344,17 +34359,17 @@ def cuMemcpyBatchAsync(dsts : Optional[tuple[CUdeviceptr] | list[CUdeviceptr]],
     sizes : list[int]
         Array of sizes for memcpy operations.
     count : size_t
-        Size of `dsts`, `srcs` and `sizes` arrays
+        Size of ``dsts``, ``srcs`` and ``sizes`` arrays
     attrs : list[:py:obj:`~.CUmemcpyAttributes`]
         Array of memcpy attributes.
     attrsIdxs : list[int]
-        Array of indices to specify which copies each entry in the `attrs`
-        array applies to. The attributes specified in attrs[k] will be
-        applied to copies starting from attrsIdxs[k] through attrsIdxs[k+1]
-        - 1. Also attrs[numAttrs-1] will apply to copies starting from
-        attrsIdxs[numAttrs-1] through count - 1.
+        Array of indices to specify which copies each entry in the
+        ``attrs`` array applies to. The attributes specified in attrs[k]
+        will be applied to copies starting from attrsIdxs[k] through
+        attrsIdxs[k+1] - 1. Also attrs[numAttrs-1] will apply to copies
+        starting from attrsIdxs[numAttrs-1] through count - 1.
     numAttrs : size_t
-        Size of `attrs` and `attrsIdxs` arrays.
+        Size of ``attrs`` and ``attrsIdxs`` arrays.
     hStream : :py:obj:`~.CUstream` or :py:obj:`~.cudaStream_t`
         The stream to enqueue the operations in. Must not be legacy NULL
         stream.
@@ -34443,8 +34458,8 @@ def cuMemcpy3DBatchAsync(size_t numOps, opList : Optional[tuple[CUDA_MEMCPY3D_BA
     any specific order. Note that this means specifying any dependent
     copies within a batch will result in undefined behavior.
 
-    Performs memory copies as specified in the `opList` array. The length
-    of this array is specified in `numOps`. Each entry in this array
+    Performs memory copies as specified in the ``opList`` array. The length
+    of this array is specified in ``numOps``. Each entry in this array
     describes a copy operation. This includes among other things, the
     source and destination operands for the copy as specified in
     :py:obj:`~.CUDA_MEMCPY3D_BATCH_OP.src` and
@@ -34506,8 +34521,8 @@ def cuMemcpy3DBatchAsync(size_t numOps, opList : Optional[tuple[CUDA_MEMCPY3D_BA
     suited for host pointers allocated outside CUDA (ex., via malloc) when
     it's known that no prior operations in the stream can be accessing the
     memory. Specifying this flag allows the driver to optimize the copy on
-    certain platforms. Each memcopy operation in `opList` must have a valid
-    srcAccessOrder setting, otherwise this API will return
+    certain platforms. Each memcopy operation in ``opList`` must have a
+    valid srcAccessOrder setting, otherwise this API will return
     :py:obj:`~.CUDA_ERROR_INVALID_VALUE`.
 
     The :py:obj:`~.CUmemcpyAttributes.flags` field can be used to specify
@@ -34522,7 +34537,7 @@ def cuMemcpy3DBatchAsync(size_t numOps, opList : Optional[tuple[CUDA_MEMCPY3D_BA
     numOps : size_t
         Total number of memcpy operations.
     opList : list[:py:obj:`~.CUDA_MEMCPY3D_BATCH_OP`]
-        Array of size `numOps` containing the actual memcpy operations.
+        Array of size ``numOps`` containing the actual memcpy operations.
     flags : unsigned long long
         Flags for future use, must be zero now.
     hStream : :py:obj:`~.CUstream` or :py:obj:`~.cudaStream_t`
@@ -34566,15 +34581,16 @@ def cuMemcpy3DBatchAsync(size_t numOps, opList : Optional[tuple[CUDA_MEMCPY3D_BA
 
 @cython.embedsignature(True)
 def cuMemcpyWithAttributesAsync(dst, src, size_t size, attr : Optional[CUmemcpyAttributes], hStream):
-    """ 
+    """
 
     Performs asynchronous memory copy operation with the specified
     attributes.
 
-    Performs asynchronous memory copy operation where `dst` and `src` are
-    the destination and source pointers respectively. `size` specifies the
-    number of bytes to copy. `attr` specifies the attributes for the copy
-    and `hStream` specifies the stream to enqueue the operation in.
+    Performs asynchronous memory copy operation where ``dst`` and ``src``
+    are the destination and source pointers respectively. ``size``
+    specifies the number of bytes to copy. ``attr`` specifies the
+    attributes for the copy and ``hStream`` specifies the stream to enqueue
+    the operation in.
 
     For more information regarding the attributes, please refer to
     :py:obj:`~.CUmemcpyAttributes` and it's usage desciption
@@ -34636,13 +34652,13 @@ def cuMemcpyWithAttributesAsync(dst, src, size_t size, attr : Optional[CUmemcpyA
 
 @cython.embedsignature(True)
 def cuMemcpy3DWithAttributesAsync(op : Optional[CUDA_MEMCPY3D_BATCH_OP], unsigned long long flags, hStream):
-    """ 
+    """
 
     Performs 3D memory copy with attributes asynchronously
 
-    Performs the copy operation specified in `op`. `flags` specifies the
-    flags for the copy and `hStream` specifies the stream to enqueue the
-    operation in.
+    Performs the copy operation specified in ``op``. ``flags`` specifies
+    the flags for the copy and ``hStream`` specifies the stream to enqueue
+    the operation in.
 
     For more information regarding the operation, please refer to
     :py:obj:`~.CUDA_MEMCPY3D_BATCH_OP` and it's usage desciption
@@ -34686,7 +34702,8 @@ def cuMemcpy3DWithAttributesAsync(op : Optional[CUDA_MEMCPY3D_BATCH_OP], unsigne
 def cuMemsetD8(dstDevice, unsigned char uc, size_t N):
     """ Initializes device memory.
 
-    Sets the memory range of `N` 8-bit values to the specified value `uc`.
+    Sets the memory range of ``N`` 8-bit values to the specified value
+    ``uc``.
 
     Parameters
     ----------
@@ -34725,8 +34742,8 @@ def cuMemsetD8(dstDevice, unsigned char uc, size_t N):
 def cuMemsetD16(dstDevice, unsigned short us, size_t N):
     """ Initializes device memory.
 
-    Sets the memory range of `N` 16-bit values to the specified value `us`.
-    The `dstDevice` pointer must be two byte aligned.
+    Sets the memory range of ``N`` 16-bit values to the specified value
+    ``us``. The ``dstDevice`` pointer must be two byte aligned.
 
     Parameters
     ----------
@@ -34765,8 +34782,8 @@ def cuMemsetD16(dstDevice, unsigned short us, size_t N):
 def cuMemsetD32(dstDevice, unsigned int ui, size_t N):
     """ Initializes device memory.
 
-    Sets the memory range of `N` 32-bit values to the specified value `ui`.
-    The `dstDevice` pointer must be four byte aligned.
+    Sets the memory range of ``N`` 32-bit values to the specified value
+    ``ui``. The ``dstDevice`` pointer must be four byte aligned.
 
     Parameters
     ----------
@@ -34805,18 +34822,18 @@ def cuMemsetD32(dstDevice, unsigned int ui, size_t N):
 def cuMemsetD2D8(dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height):
     """ Initializes device memory.
 
-    Sets the 2D memory range of `Width` 8-bit values to the specified value
-    `uc`. `Height` specifies the number of rows to set, and `dstPitch`
-    specifies the number of bytes between each row. This function performs
-    fastest when the pitch is one that has been passed back by
-    :py:obj:`~.cuMemAllocPitch()`.
+    Sets the 2D memory range of ``Width`` 8-bit values to the specified
+    value ``uc``. ``Height`` specifies the number of rows to set, and
+    ``dstPitch`` specifies the number of bytes between each row. This
+    function performs fastest when the pitch is one that has been passed
+    back by :py:obj:`~.cuMemAllocPitch()`.
 
     Parameters
     ----------
     dstDevice : :py:obj:`~.CUdeviceptr`
         Destination device pointer
     dstPitch : size_t
-        Pitch of destination device pointer(Unused if `Height` is 1)
+        Pitch of destination device pointer (Unused if ``Height`` is 1)
     uc : unsigned char
         Value to set
     Width : size_t
@@ -34852,10 +34869,10 @@ def cuMemsetD2D8(dstDevice, size_t dstPitch, unsigned char uc, size_t Width, siz
 def cuMemsetD2D16(dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height):
     """ Initializes device memory.
 
-    Sets the 2D memory range of `Width` 16-bit values to the specified
-    value `us`. `Height` specifies the number of rows to set, and
-    `dstPitch` specifies the number of bytes between each row. The
-    `dstDevice` pointer and `dstPitch` offset must be two byte aligned.
+    Sets the 2D memory range of ``Width`` 16-bit values to the specified
+    value ``us``. ``Height`` specifies the number of rows to set, and
+    ``dstPitch`` specifies the number of bytes between each row. The
+    ``dstDevice`` pointer and ``dstPitch`` offset must be two byte aligned.
     This function performs fastest when the pitch is one that has been
     passed back by :py:obj:`~.cuMemAllocPitch()`.
 
@@ -34864,7 +34881,7 @@ def cuMemsetD2D16(dstDevice, size_t dstPitch, unsigned short us, size_t Width, s
     dstDevice : :py:obj:`~.CUdeviceptr`
         Destination device pointer
     dstPitch : size_t
-        Pitch of destination device pointer(Unused if `Height` is 1)
+        Pitch of destination device pointer (Unused if ``Height`` is 1)
     us : unsigned short
         Value to set
     Width : size_t
@@ -34900,19 +34917,19 @@ def cuMemsetD2D16(dstDevice, size_t dstPitch, unsigned short us, size_t Width, s
 def cuMemsetD2D32(dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height):
     """ Initializes device memory.
 
-    Sets the 2D memory range of `Width` 32-bit values to the specified
-    value `ui`. `Height` specifies the number of rows to set, and
-    `dstPitch` specifies the number of bytes between each row. The
-    `dstDevice` pointer and `dstPitch` offset must be four byte aligned.
-    This function performs fastest when the pitch is one that has been
-    passed back by :py:obj:`~.cuMemAllocPitch()`.
+    Sets the 2D memory range of ``Width`` 32-bit values to the specified
+    value ``ui``. ``Height`` specifies the number of rows to set, and
+    ``dstPitch`` specifies the number of bytes between each row. The
+    ``dstDevice`` pointer and ``dstPitch`` offset must be four byte
+    aligned. This function performs fastest when the pitch is one that has
+    been passed back by :py:obj:`~.cuMemAllocPitch()`.
 
     Parameters
     ----------
     dstDevice : :py:obj:`~.CUdeviceptr`
         Destination device pointer
     dstPitch : size_t
-        Pitch of destination device pointer(Unused if `Height` is 1)
+        Pitch of destination device pointer (Unused if ``Height`` is 1)
     ui : unsigned int
         Value to set
     Width : size_t
@@ -34948,7 +34965,8 @@ def cuMemsetD2D32(dstDevice, size_t dstPitch, unsigned int ui, size_t Width, siz
 def cuMemsetD8Async(dstDevice, unsigned char uc, size_t N, hStream):
     """ Sets device memory.
 
-    Sets the memory range of `N` 8-bit values to the specified value `uc`.
+    Sets the memory range of ``N`` 8-bit values to the specified value
+    ``uc``.
 
     Parameters
     ----------
@@ -34997,8 +35015,8 @@ def cuMemsetD8Async(dstDevice, unsigned char uc, size_t N, hStream):
 def cuMemsetD16Async(dstDevice, unsigned short us, size_t N, hStream):
     """ Sets device memory.
 
-    Sets the memory range of `N` 16-bit values to the specified value `us`.
-    The `dstDevice` pointer must be two byte aligned.
+    Sets the memory range of ``N`` 16-bit values to the specified value
+    ``us``. The ``dstDevice`` pointer must be two byte aligned.
 
     Parameters
     ----------
@@ -35047,8 +35065,8 @@ def cuMemsetD16Async(dstDevice, unsigned short us, size_t N, hStream):
 def cuMemsetD32Async(dstDevice, unsigned int ui, size_t N, hStream):
     """ Sets device memory.
 
-    Sets the memory range of `N` 32-bit values to the specified value `ui`.
-    The `dstDevice` pointer must be four byte aligned.
+    Sets the memory range of ``N`` 32-bit values to the specified value
+    ``ui``. The ``dstDevice`` pointer must be four byte aligned.
 
     Parameters
     ----------
@@ -35097,18 +35115,18 @@ def cuMemsetD32Async(dstDevice, unsigned int ui, size_t N, hStream):
 def cuMemsetD2D8Async(dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, hStream):
     """ Sets device memory.
 
-    Sets the 2D memory range of `Width` 8-bit values to the specified value
-    `uc`. `Height` specifies the number of rows to set, and `dstPitch`
-    specifies the number of bytes between each row. This function performs
-    fastest when the pitch is one that has been passed back by
-    :py:obj:`~.cuMemAllocPitch()`.
+    Sets the 2D memory range of ``Width`` 8-bit values to the specified
+    value ``uc``. ``Height`` specifies the number of rows to set, and
+    ``dstPitch`` specifies the number of bytes between each row. This
+    function performs fastest when the pitch is one that has been passed
+    back by :py:obj:`~.cuMemAllocPitch()`.
 
     Parameters
     ----------
     dstDevice : :py:obj:`~.CUdeviceptr`
         Destination device pointer
     dstPitch : size_t
-        Pitch of destination device pointer(Unused if `Height` is 1)
+        Pitch of destination device pointer (Unused if ``Height`` is 1)
     uc : unsigned char
         Value to set
     Width : size_t
@@ -35154,10 +35172,10 @@ def cuMemsetD2D8Async(dstDevice, size_t dstPitch, unsigned char uc, size_t Width
 def cuMemsetD2D16Async(dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, hStream):
     """ Sets device memory.
 
-    Sets the 2D memory range of `Width` 16-bit values to the specified
-    value `us`. `Height` specifies the number of rows to set, and
-    `dstPitch` specifies the number of bytes between each row. The
-    `dstDevice` pointer and `dstPitch` offset must be two byte aligned.
+    Sets the 2D memory range of ``Width`` 16-bit values to the specified
+    value ``us``. ``Height`` specifies the number of rows to set, and
+    ``dstPitch`` specifies the number of bytes between each row. The
+    ``dstDevice`` pointer and ``dstPitch`` offset must be two byte aligned.
     This function performs fastest when the pitch is one that has been
     passed back by :py:obj:`~.cuMemAllocPitch()`.
 
@@ -35166,7 +35184,7 @@ def cuMemsetD2D16Async(dstDevice, size_t dstPitch, unsigned short us, size_t Wid
     dstDevice : :py:obj:`~.CUdeviceptr`
         Destination device pointer
     dstPitch : size_t
-        Pitch of destination device pointer(Unused if `Height` is 1)
+        Pitch of destination device pointer (Unused if ``Height`` is 1)
     us : unsigned short
         Value to set
     Width : size_t
@@ -35212,19 +35230,19 @@ def cuMemsetD2D16Async(dstDevice, size_t dstPitch, unsigned short us, size_t Wid
 def cuMemsetD2D32Async(dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, hStream):
     """ Sets device memory.
 
-    Sets the 2D memory range of `Width` 32-bit values to the specified
-    value `ui`. `Height` specifies the number of rows to set, and
-    `dstPitch` specifies the number of bytes between each row. The
-    `dstDevice` pointer and `dstPitch` offset must be four byte aligned.
-    This function performs fastest when the pitch is one that has been
-    passed back by :py:obj:`~.cuMemAllocPitch()`.
+    Sets the 2D memory range of ``Width`` 32-bit values to the specified
+    value ``ui``. ``Height`` specifies the number of rows to set, and
+    ``dstPitch`` specifies the number of bytes between each row. The
+    ``dstDevice`` pointer and ``dstPitch`` offset must be four byte
+    aligned. This function performs fastest when the pitch is one that has
+    been passed back by :py:obj:`~.cuMemAllocPitch()`.
 
     Parameters
     ----------
     dstDevice : :py:obj:`~.CUdeviceptr`
         Destination device pointer
     dstPitch : size_t
-        Pitch of destination device pointer(Unused if `Height` is 1)
+        Pitch of destination device pointer (Unused if ``Height`` is 1)
     ui : unsigned int
         Value to set
     Width : size_t
@@ -35271,15 +35289,15 @@ def cuArrayCreate(pAllocateArray : Optional[CUDA_ARRAY_DESCRIPTOR]):
     """ Creates a 1D or 2D CUDA array.
 
     Creates a CUDA array according to the :py:obj:`~.CUDA_ARRAY_DESCRIPTOR`
-    structure `pAllocateArray` and returns a handle to the new CUDA array
-    in `*pHandle`. The :py:obj:`~.CUDA_ARRAY_DESCRIPTOR` is defined as:
+    structure ``pAllocateArray`` and returns a handle to the new CUDA array
+    in ``*pHandle``. The :py:obj:`~.CUDA_ARRAY_DESCRIPTOR` is defined as:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
     where:
 
-    - `Width`, and `Height` are the width, and height of the CUDA array (in
-      elements); the CUDA array is one-dimensional if height is 0, two-
+    - ``Width`` and ``Height`` are the width and height of the CUDA array
+      (in elements); the CUDA array is one-dimensional if height is 0, two-
       dimensional otherwise;
 
     - :py:obj:`~.Format` specifies the format of the elements;
@@ -35287,7 +35305,7 @@ def cuArrayCreate(pAllocateArray : Optional[CUDA_ARRAY_DESCRIPTOR]):
 
     - **View CUDA Toolkit Documentation for a C++ code example**
 
-    - `NumChannels` specifies the number of packed components per CUDA
+    - ``NumChannels`` specifies the number of packed components per CUDA
       array element; it may be 1, 2, or 4;
 
     Here are examples of CUDA array descriptions:
@@ -35300,12 +35318,12 @@ def cuArrayCreate(pAllocateArray : Optional[CUDA_ARRAY_DESCRIPTOR]):
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
-    Description for a `width` x `height` CUDA array of 64-bit, 4x16-bit
+    Description for a ``width`` x ``height`` CUDA array of 64-bit, 4x16-bit
     float16's:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
-    Description for a `width` x `height` CUDA array of 16-bit elements,
+    Description for a ``width`` x ``height`` CUDA array of 16-bit elements,
     each of which is two 8-bit unsigned chars:
 
     **View CUDA Toolkit Documentation for a C++ code example**
@@ -35341,10 +35359,10 @@ def cuArrayCreate(pAllocateArray : Optional[CUDA_ARRAY_DESCRIPTOR]):
 def cuArrayGetDescriptor(hArray):
     """ Get a 1D or 2D CUDA array descriptor.
 
-    Returns in `*pArrayDescriptor` a descriptor containing information on
-    the format and dimensions of the CUDA array `hArray`. It is useful for
-    subroutines that have been passed a CUDA array, but need to know the
-    CUDA array parameters for validation or other purposes.
+    Returns in ``*pArrayDescriptor`` a descriptor containing information on
+    the format and dimensions of the CUDA array ``hArray``. It is useful
+    for subroutines that have been passed a CUDA array, but need to know
+    the CUDA array parameters for validation or other purposes.
 
     Parameters
     ----------
@@ -35385,7 +35403,7 @@ def cuArrayGetSparseProperties(array):
     """ Returns the layout properties of a sparse CUDA array.
 
     Returns the layout properties of a sparse CUDA array in
-    `sparseProperties` If the CUDA array is not allocated with flag
+    ``sparseProperties``. If the CUDA array is not allocated with flag
     :py:obj:`~.CUDA_ARRAY3D_SPARSE` :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     will be returned.
 
@@ -35394,13 +35412,13 @@ def cuArrayGetSparseProperties(array):
     :py:obj:`~.CUDA_ARRAY_SPARSE_PROPERTIES.miptailSize` represents the
     total size of the array. Otherwise, it will be zero. Also, the returned
     value in :py:obj:`~.CUDA_ARRAY_SPARSE_PROPERTIES.miptailFirstLevel` is
-    always zero. Note that the `array` must have been allocated using
+    always zero. Note that the ``array`` must have been allocated using
     :py:obj:`~.cuArrayCreate` or :py:obj:`~.cuArray3DCreate`. For CUDA
     arrays obtained using :py:obj:`~.cuMipmappedArrayGetLevel`,
     :py:obj:`~.CUDA_ERROR_INVALID_VALUE` will be returned. Instead,
     :py:obj:`~.cuMipmappedArrayGetSparseProperties` must be used to obtain
     the sparse properties of the entire CUDA mipmapped array to which
-    `array` belongs to.
+    ``array`` belongs.
 
     Parameters
     ----------
@@ -35440,8 +35458,8 @@ def cuArrayGetSparseProperties(array):
 def cuMipmappedArrayGetSparseProperties(mipmap):
     """ Returns the layout properties of a sparse CUDA mipmapped array.
 
-    Returns the sparse array layout properties in `sparseProperties` If the
-    CUDA mipmapped array is not allocated with flag
+    Returns the sparse array layout properties in ``sparseProperties``. If
+    the CUDA mipmapped array is not allocated with flag
     :py:obj:`~.CUDA_ARRAY3D_SPARSE` :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     will be returned.
 
@@ -35498,8 +35516,8 @@ def cuMipmappedArrayGetSparseProperties(mipmap):
 def cuArrayGetMemoryRequirements(array, device):
     """ Returns the memory requirements of a CUDA array.
 
-    Returns the memory requirements of a CUDA array in `memoryRequirements`
-    If the CUDA array is not allocated with flag
+    Returns the memory requirements of a CUDA array in
+    ``memoryRequirements``. If the CUDA array is not allocated with flag
     :py:obj:`~.CUDA_ARRAY3D_DEFERRED_MAPPING`
     :py:obj:`~.CUDA_ERROR_INVALID_VALUE` will be returned.
 
@@ -35557,8 +35575,8 @@ def cuMipmappedArrayGetMemoryRequirements(mipmap, device):
     """ Returns the memory requirements of a CUDA mipmapped array.
 
     Returns the memory requirements of a CUDA mipmapped array in
-    `memoryRequirements` If the CUDA mipmapped array is not allocated with
-    flag :py:obj:`~.CUDA_ARRAY3D_DEFERRED_MAPPING`
+    ``memoryRequirements``. If the CUDA mipmapped array is not allocated
+    with flag :py:obj:`~.CUDA_ARRAY3D_DEFERRED_MAPPING`
     :py:obj:`~.CUDA_ERROR_INVALID_VALUE` will be returned.
 
     The returned value in :py:obj:`~.CUDA_ARRAY_MEMORY_REQUIREMENTS.size`
@@ -35615,20 +35633,20 @@ def cuMipmappedArrayGetMemoryRequirements(mipmap, device):
 def cuArrayGetPlane(hArray, unsigned int planeIdx):
     """ Gets a CUDA array plane from a CUDA array.
 
-    Returns in `pPlaneArray` a CUDA array that represents a single format
-    plane of the CUDA array `hArray`.
+    Returns in ``pPlaneArray`` a CUDA array that represents a single format
+    plane of the CUDA array ``hArray``.
 
-    If `planeIdx` is greater than the maximum number of planes in this
+    If ``planeIdx`` is greater than the maximum number of planes in this
     array or if the array does not have a multi-planar format e.g:
     :py:obj:`~.CU_AD_FORMAT_NV12`, then
     :py:obj:`~.CUDA_ERROR_INVALID_VALUE` is returned.
 
-    Note that if the `hArray` has format :py:obj:`~.CU_AD_FORMAT_NV12`,
-    then passing in 0 for `planeIdx` returns a CUDA array of the same size
-    as `hArray` but with one channel and
+    Note that if the ``hArray`` has format :py:obj:`~.CU_AD_FORMAT_NV12`,
+    then passing in 0 for ``planeIdx`` returns a CUDA array of the same
+    size as ``hArray`` but with one channel and
     :py:obj:`~.CU_AD_FORMAT_UNSIGNED_INT8` as its format. If 1 is passed
-    for `planeIdx`, then the returned CUDA array has half the height and
-    width of `hArray` with two channels and
+    for ``planeIdx``, then the returned CUDA array has half the height and
+    width of ``hArray`` with two channels and
     :py:obj:`~.CU_AD_FORMAT_UNSIGNED_INT8` as its format.
 
     Parameters
@@ -35643,7 +35661,7 @@ def cuArrayGetPlane(hArray, unsigned int planeIdx):
     CUresult
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`
     pPlaneArray : :py:obj:`~.CUarray`
-        Returned CUDA array referenced by the `planeIdx`
+        Returned CUDA array referenced by the ``planeIdx``
 
     See Also
     --------
@@ -35671,7 +35689,7 @@ def cuArrayGetPlane(hArray, unsigned int planeIdx):
 def cuArrayDestroy(hArray):
     """ Destroys a CUDA array.
 
-    Destroys the CUDA array `hArray`.
+    Destroys the CUDA array ``hArray``.
 
     Parameters
     ----------
@@ -35707,26 +35725,26 @@ def cuArray3DCreate(pAllocateArray : Optional[CUDA_ARRAY3D_DESCRIPTOR]):
     """ Creates a 3D CUDA array.
 
     Creates a CUDA array according to the
-    :py:obj:`~.CUDA_ARRAY3D_DESCRIPTOR` structure `pAllocateArray` and
-    returns a handle to the new CUDA array in `*pHandle`. The
+    :py:obj:`~.CUDA_ARRAY3D_DESCRIPTOR` structure ``pAllocateArray`` and
+    returns a handle to the new CUDA array in ``*pHandle``. The
     :py:obj:`~.CUDA_ARRAY3D_DESCRIPTOR` is defined as:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
     where:
 
-    - `Width`, `Height`, and `Depth` are the width, height, and depth of
-      the CUDA array (in elements); the following types of CUDA arrays can
-      be allocated:
+    - ``Width``, ``Height``, and ``Depth`` are the width, height, and depth
+      of the CUDA array (in elements); the following types of CUDA arrays
+      can be allocated:
 
-      - A 1D array is allocated if `Height` and `Depth` extents are both
-        zero.
+      - A 1D array is allocated if ``Height`` and ``Depth`` extents are
+        both zero.
 
-      - A 2D array is allocated if only `Depth` extent is zero.
+      - A 2D array is allocated if only ``Depth`` extent is zero.
 
       - A 3D array is allocated if all three extents are non-zero.
 
-      - A 1D layered CUDA array is allocated if only `Height` is zero and
+      - A 1D layered CUDA array is allocated if only ``Height`` is zero and
         the :py:obj:`~.CUDA_ARRAY3D_LAYERED` flag is set. Each layer is a
         1D array. The number of layers is determined by the depth extent.
 
@@ -35736,34 +35754,34 @@ def cuArray3DCreate(pAllocateArray : Optional[CUDA_ARRAY3D_DESCRIPTOR]):
         depth extent.
 
       - A cubemap CUDA array is allocated if all three extents are non-zero
-        and the :py:obj:`~.CUDA_ARRAY3D_CUBEMAP` flag is set. `Width` must
-        be equal to `Height`, and `Depth` must be six. A cubemap is a
-        special type of 2D layered CUDA array, where the six layers
+        and the :py:obj:`~.CUDA_ARRAY3D_CUBEMAP` flag is set. ``Width``
+        must be equal to ``Height``, and ``Depth`` must be six. A cubemap
+        is a special type of 2D layered CUDA array, where the six layers
         represent the six faces of a cube. The order of the six layers in
         memory is the same as that listed in
         :py:obj:`~.CUarray_cubemap_face`.
 
       - A cubemap layered CUDA array is allocated if all three extents are
         non-zero, and both, :py:obj:`~.CUDA_ARRAY3D_CUBEMAP` and
-        :py:obj:`~.CUDA_ARRAY3D_LAYERED` flags are set. `Width` must be
-        equal to `Height`, and `Depth` must be a multiple of six. A cubemap
-        layered CUDA array is a special type of 2D layered CUDA array that
-        consists of a collection of cubemaps. The first six layers
-        represent the first cubemap, the next six layers form the second
-        cubemap, and so on.
+        :py:obj:`~.CUDA_ARRAY3D_LAYERED` flags are set. ``Width`` must be
+        equal to ``Height``, and ``Depth`` must be a multiple of six. A
+        cubemap layered CUDA array is a special type of 2D layered CUDA
+        array that consists of a collection of cubemaps. The first six
+        layers represent the first cubemap, the next six layers form the
+        second cubemap, and so on.
 
     - :py:obj:`~.Format` specifies the format of the elements;
       :py:obj:`~.CUarray_format` is defined as:
 
     - **View CUDA Toolkit Documentation for a C++ code example**
 
-    - `NumChannels` specifies the number of packed components per CUDA
+    - ``NumChannels`` specifies the number of packed components per CUDA
       array element; it may be 1, 2, or 4;
 
     - :py:obj:`~.Flags` may be set to
 
       - :py:obj:`~.CUDA_ARRAY3D_LAYERED` to enable creation of layered CUDA
-        arrays. If this flag is set, `Depth` specifies the number of
+        arrays. If this flag is set, ``Depth`` specifies the number of
         layers, not the depth of a 3D array.
 
       - :py:obj:`~.CUDA_ARRAY3D_SURFACE_LDST` to enable surface references
@@ -35772,23 +35790,23 @@ def cuArray3DCreate(pAllocateArray : Optional[CUDA_ARRAY3D_DESCRIPTOR]):
         CUDA array to a surface reference.
 
       - :py:obj:`~.CUDA_ARRAY3D_CUBEMAP` to enable creation of cubemaps. If
-        this flag is set, `Width` must be equal to `Height`, and `Depth`
-        must be six. If the :py:obj:`~.CUDA_ARRAY3D_LAYERED` flag is also
-        set, then `Depth` must be a multiple of six.
+        this flag is set, ``Width`` must be equal to ``Height``, and
+        ``Depth`` must be six. If the :py:obj:`~.CUDA_ARRAY3D_LAYERED` flag
+        is also set, then ``Depth`` must be a multiple of six.
 
       - :py:obj:`~.CUDA_ARRAY3D_TEXTURE_GATHER` to indicate that the CUDA
         array will be used for texture gather. Texture gather can only be
         performed on 2D CUDA arrays.
 
-    `Width`, `Height` and `Depth` must meet certain size requirements as
-    listed in the following table. All values are specified in elements.
+    ``Width``, ``Height`` and ``Depth`` must meet certain size requirements
+    as listed in the following table. All values are specified in elements.
     Note that for brevity's sake, the full name of the device attribute is
     not specified. For ex., TEXTURE1D_WIDTH refers to the device attribute
     :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH`.
 
     Note that 2D CUDA arrays have different size requirements if the
-    :py:obj:`~.CUDA_ARRAY3D_TEXTURE_GATHER` flag is set. `Width` and
-    `Height` must not be greater than
+    :py:obj:`~.CUDA_ARRAY3D_TEXTURE_GATHER` flag is set. ``Width`` and
+    ``Height`` must not be greater than
     :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH` and
     :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT`
     respectively, in that case.
@@ -35805,8 +35823,8 @@ def cuArray3DCreate(pAllocateArray : Optional[CUDA_ARRAY3D_DESCRIPTOR]):
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
-    Description for a `width` x `height` x `depth` CUDA array of 64-bit,
-    4x16-bit float16's:
+    Description for a ``width`` x ``height`` x ``depth`` CUDA array of
+    64-bit, 4x16-bit float16's:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
@@ -35841,14 +35859,14 @@ def cuArray3DCreate(pAllocateArray : Optional[CUDA_ARRAY3D_DESCRIPTOR]):
 def cuArray3DGetDescriptor(hArray):
     """ Get a 3D CUDA array descriptor.
 
-    Returns in `*pArrayDescriptor` a descriptor containing information on
-    the format and dimensions of the CUDA array `hArray`. It is useful for
-    subroutines that have been passed a CUDA array, but need to know the
-    CUDA array parameters for validation or other purposes.
+    Returns in ``*pArrayDescriptor`` a descriptor containing information on
+    the format and dimensions of the CUDA array ``hArray``. It is useful
+    for subroutines that have been passed a CUDA array, but need to know
+    the CUDA array parameters for validation or other purposes.
 
     This function may be called on 1D and 2D arrays, in which case the
-    `Height` and/or `Depth` members of the descriptor struct will be set to
-    0.
+    ``Height`` and/or ``Depth`` members of the descriptor struct will be
+    set to 0.
 
     Parameters
     ----------
@@ -35889,9 +35907,9 @@ def cuMipmappedArrayCreate(pMipmappedArrayDesc : Optional[CUDA_ARRAY3D_DESCRIPTO
     """ Creates a CUDA mipmapped array.
 
     Creates a CUDA mipmapped array according to the
-    :py:obj:`~.CUDA_ARRAY3D_DESCRIPTOR` structure `pMipmappedArrayDesc` and
-    returns a handle to the new CUDA mipmapped array in `*pHandle`.
-    `numMipmapLevels` specifies the number of mipmap levels to be
+    :py:obj:`~.CUDA_ARRAY3D_DESCRIPTOR` structure ``pMipmappedArrayDesc``
+    and returns a handle to the new CUDA mipmapped array in ``*pHandle``.
+    ``numMipmapLevels`` specifies the number of mipmap levels to be
     allocated. This value is clamped to the range [1, 1 +
     floor(log2(max(width, height, depth)))].
 
@@ -35901,20 +35919,20 @@ def cuMipmappedArrayCreate(pMipmappedArrayDesc : Optional[CUDA_ARRAY3D_DESCRIPTO
 
     where:
 
-    - `Width`, `Height`, and `Depth` are the width, height, and depth of
-      the CUDA array (in elements); the following types of CUDA arrays can
-      be allocated:
+    - ``Width``, ``Height``, and ``Depth`` are the width, height, and depth
+      of the CUDA array (in elements); the following types of CUDA arrays
+      can be allocated:
 
-      - A 1D mipmapped array is allocated if `Height` and `Depth` extents
-        are both zero.
+      - A 1D mipmapped array is allocated if ``Height`` and ``Depth``
+        extents are both zero.
 
-      - A 2D mipmapped array is allocated if only `Depth` extent is zero.
+      - A 2D mipmapped array is allocated if only ``Depth`` extent is zero.
 
       - A 3D mipmapped array is allocated if all three extents are non-
         zero.
 
-      - A 1D layered CUDA mipmapped array is allocated if only `Height` is
-        zero and the :py:obj:`~.CUDA_ARRAY3D_LAYERED` flag is set. Each
+      - A 1D layered CUDA mipmapped array is allocated if only ``Height``
+        is zero and the :py:obj:`~.CUDA_ARRAY3D_LAYERED` flag is set. Each
         layer is a 1D array. The number of layers is determined by the
         depth extent.
 
@@ -35925,7 +35943,7 @@ def cuMipmappedArrayCreate(pMipmappedArrayDesc : Optional[CUDA_ARRAY3D_DESCRIPTO
 
       - A cubemap CUDA mipmapped array is allocated if all three extents
         are non-zero and the :py:obj:`~.CUDA_ARRAY3D_CUBEMAP` flag is set.
-        `Width` must be equal to `Height`, and `Depth` must be six. A
+        ``Width`` must be equal to ``Height``, and ``Depth`` must be six. A
         cubemap is a special type of 2D layered CUDA array, where the six
         layers represent the six faces of a cube. The order of the six
         layers in memory is the same as that listed in
@@ -35933,26 +35951,26 @@ def cuMipmappedArrayCreate(pMipmappedArrayDesc : Optional[CUDA_ARRAY3D_DESCRIPTO
 
       - A cubemap layered CUDA mipmapped array is allocated if all three
         extents are non-zero, and both, :py:obj:`~.CUDA_ARRAY3D_CUBEMAP`
-        and :py:obj:`~.CUDA_ARRAY3D_LAYERED` flags are set. `Width` must be
-        equal to `Height`, and `Depth` must be a multiple of six. A cubemap
-        layered CUDA array is a special type of 2D layered CUDA array that
-        consists of a collection of cubemaps. The first six layers
-        represent the first cubemap, the next six layers form the second
-        cubemap, and so on.
+        and :py:obj:`~.CUDA_ARRAY3D_LAYERED` flags are set. ``Width`` must
+        be equal to ``Height``, and ``Depth`` must be a multiple of six. A
+        cubemap layered CUDA array is a special type of 2D layered CUDA
+        array that consists of a collection of cubemaps. The first six
+        layers represent the first cubemap, the next six layers form the
+        second cubemap, and so on.
 
     - :py:obj:`~.Format` specifies the format of the elements;
       :py:obj:`~.CUarray_format` is defined as:
 
     - **View CUDA Toolkit Documentation for a C++ code example**
 
-    - `NumChannels` specifies the number of packed components per CUDA
+    - ``NumChannels`` specifies the number of packed components per CUDA
       array element; it may be 1, 2, or 4;
 
     - :py:obj:`~.Flags` may be set to
 
       - :py:obj:`~.CUDA_ARRAY3D_LAYERED` to enable creation of layered CUDA
-        mipmapped arrays. If this flag is set, `Depth` specifies the number
-        of layers, not the depth of a 3D array.
+        mipmapped arrays. If this flag is set, ``Depth`` specifies the
+        number of layers, not the depth of a 3D array.
 
       - :py:obj:`~.CUDA_ARRAY3D_SURFACE_LDST` to enable surface references
         to be bound to individual mipmap levels of the CUDA mipmapped
@@ -35960,17 +35978,18 @@ def cuMipmappedArrayCreate(pMipmappedArrayDesc : Optional[CUDA_ARRAY3D_DESCRIPTO
         fail when attempting to bind a mipmap level of the CUDA mipmapped
         array to a surface reference.
 
-    - :py:obj:`~.CUDA_ARRAY3D_CUBEMAP` to enable creation of mipmapped
-    cubemaps. If this flag is set, `Width` must be equal to `Height`, and
-    `Depth` must be six. If the :py:obj:`~.CUDA_ARRAY3D_LAYERED` flag is
-    also set, then `Depth` must be a multiple of six.
+      - :py:obj:`~.CUDA_ARRAY3D_CUBEMAP` to enable creation of mipmapped
+        cubemaps. If this flag is set, ``Width`` must be equal to
+        ``Height``, and ``Depth`` must be six. If the
+        :py:obj:`~.CUDA_ARRAY3D_LAYERED` flag is also set, then ``Depth``
+        must be a multiple of six.
 
       - :py:obj:`~.CUDA_ARRAY3D_TEXTURE_GATHER` to indicate that the CUDA
         mipmapped array will be used for texture gather. Texture gather can
         only be performed on 2D CUDA mipmapped arrays.
 
-    `Width`, `Height` and `Depth` must meet certain size requirements as
-    listed in the following table. All values are specified in elements.
+    ``Width``, ``Height`` and ``Depth`` must meet certain size requirements
+    as listed in the following table. All values are specified in elements.
     Note that for brevity's sake, the full name of the device attribute is
     not specified. For ex., TEXTURE1D_MIPMAPPED_WIDTH refers to the device
     attribute
@@ -36011,10 +36030,10 @@ def cuMipmappedArrayCreate(pMipmappedArrayDesc : Optional[CUDA_ARRAY3D_DESCRIPTO
 def cuMipmappedArrayGetLevel(hMipmappedArray, unsigned int level):
     """ Gets a mipmap level of a CUDA mipmapped array.
 
-    Returns in `*pLevelArray` a CUDA array that represents a single mipmap
-    level of the CUDA mipmapped array `hMipmappedArray`.
+    Returns in ``*pLevelArray`` a CUDA array that represents a single
+    mipmap level of the CUDA mipmapped array ``hMipmappedArray``.
 
-    If `level` is greater than the maximum number of levels in this
+    If ``level`` is greater than the maximum number of levels in this
     mipmapped array, :py:obj:`~.CUDA_ERROR_INVALID_VALUE` is returned.
 
     Parameters
@@ -36057,7 +36076,7 @@ def cuMipmappedArrayGetLevel(hMipmappedArray, unsigned int level):
 def cuMipmappedArrayDestroy(hMipmappedArray):
     """ Destroys a CUDA mipmapped array.
 
-    Destroys the CUDA mipmapped array `hMipmappedArray`.
+    Destroys the CUDA mipmapped array ``hMipmappedArray``.
 
     Parameters
     ----------
@@ -36109,10 +36128,10 @@ def cuMemGetHandleForAddressRange(dptr, size_t size, handleType not None : CUmem
     :py:obj:`~.CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED` device attribute is
     deprecated.
 
-    Users must ensure the `dptr` and `size` are aligned to the host page
-    size.
+    Users must ensure the ``dptr`` and ``size`` are aligned to the host
+    page size.
 
-    The `handle` will be interpreted as a pointer to an integer to store
+    The ``handle`` will be interpreted as a pointer to an integer to store
     the dma_buf file descriptor. Users must ensure the entire address range
     is backed and mapped when the address range is allocated by
     :py:obj:`~.cuMemAddressReserve`. All the physical allocations backing
@@ -36145,7 +36164,7 @@ def cuMemGetHandleForAddressRange(dptr, size_t size, handleType not None : CUmem
     size : size_t
         Length of the address range. Must be aligned to host page size.
     handleType : :py:obj:`~.CUmemRangeHandleType`
-        Type of handle requested (defines type and size of the `handle`
+        Type of handle requested (defines type and size of the ``handle``
         output parameter)
     flags : unsigned long long
         When requesting
@@ -36182,10 +36201,10 @@ def cuMemGetHandleForAddressRange(dptr, size_t size, handleType not None : CUmem
 
 @cython.embedsignature(True)
 def cuMemBatchDecompressAsync(paramsArray : Optional[CUmemDecompressParams], size_t count, unsigned int flags, stream):
-    """ Submit a batch of `count` independent decompression operations.
+    """ Submit a batch of ``count`` independent decompression operations.
 
-    Each of the `count` decompression operations is described by a single
-    entry in the `paramsArray` array. Once the batch has been submitted,
+    Each of the ``count`` decompression operations is described by a single
+    entry in the ``paramsArray`` array. Once the batch has been submitted,
     the function will return, and decompression will happen asynchronously
     w.r.t. the CPU. To the work completion tracking mechanisms in the CUDA
     driver, the batch will be considered a single unit of work and
@@ -36213,7 +36232,7 @@ def cuMemBatchDecompressAsync(paramsArray : Optional[CUmemDecompressParams], siz
       :py:obj:`~.CUmemDecompressParams.src`,
       :py:obj:`~.CUmemDecompressParams.dst`, and
       :py:obj:`~.CUmemDecompressParams.dstActBytes`, must all be accessible
-      from the device associated with the context where `stream` was
+      from the device associated with the context where ``stream`` was
       created. For information on how to ensure this, see the documentation
       for the allocator of interest.
 
@@ -36223,7 +36242,7 @@ def cuMemBatchDecompressAsync(paramsArray : Optional[CUmemDecompressParams], siz
         The array of structures describing the independent decompression
         operations.
     count : size_t
-        The number of entries in `paramsArray` array.
+        The number of entries in ``paramsArray`` array.
     flags : unsigned int
         Must be 0.
     stream : :py:obj:`~.CUstream` or :py:obj:`~.cudaStream_t`
@@ -36234,13 +36253,13 @@ def cuMemBatchDecompressAsync(paramsArray : Optional[CUmemDecompressParams], siz
     CUresult
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`
     errorIndex : int
-        The index into `paramsArray` of the decompression operation for
-        which the error returned by this function pertains to. If `index`
+        The index into ``paramsArray`` of the decompression operation to
+        which the error returned by this function pertains. If ``index``
         is SIZE_MAX and the value returned is not :py:obj:`~.CUDA_SUCCESS`,
         then the error returned by this function should be considered a
         general error that does not pertain to a particular decompression
-        operation. May be `NULL`, in which case, no index will be recorded
-        in the event of error.
+        operation. May be ``NULL``, in which case, no index will be
+        recorded in the event of error.
 
     See Also
     --------
@@ -36270,13 +36289,13 @@ def cuMemAddressReserve(size_t size, size_t alignment, addr, unsigned long long
     """ Allocate an address range reservation.
 
     Reserves a virtual address range based on the given parameters, giving
-    the starting address of the range in `ptr`. This API requires a system
-    that supports UVA. The size and address parameters must be a multiple
-    of the host page size and the alignment must be a power of two or zero
-    for default alignment. If `addr` is 0, then the driver chooses the
-    address at which to place the start of the reservation whereas when it
-    is non-zero then the driver treats it as a hint about where to place
-    the reservation.
+    the starting address of the range in ``ptr``. This API requires a
+    system that supports UVA. The size and address parameters must be a
+    multiple of the host page size and the alignment must be a power of two
+    or zero for default alignment. If ``addr`` is 0, then the driver
+    chooses the address at which to place the start of the reservation
+    whereas when it is non-zero then the driver treats it as a hint about
+    where to place the reservation.
 
     Parameters
     ----------
@@ -36362,17 +36381,17 @@ def cuMemCreate(size_t size, prop : Optional[CUmemAllocationProp], unsigned long
     """ Create a CUDA memory handle representing a memory allocation of a given size described by the given properties.
 
     This creates a memory allocation on the target device specified through
-    the `prop` structure. The created allocation will not have any device
-    or host mappings. The generic memory `handle` for the allocation can be
-    mapped to the address space of calling process via
+    the ``prop`` structure. The created allocation will not have any device
+    or host mappings. The generic memory ``handle`` for the allocation can
+    be mapped to the address space of calling process via
     :py:obj:`~.cuMemMap`. This handle cannot be transmitted directly to
     other processes (see :py:obj:`~.cuMemExportToShareableHandle`). On
-    Windows, the caller must also pass an LPSECURITYATTRIBUTE in `prop` to
-    be associated with this handle which limits or allows access to this
+    Windows, the caller must also pass an LPSECURITYATTRIBUTE in ``prop``
+    to be associated with this handle which limits or allows access to this
     handle for a recipient process (see
     :py:obj:`~.CUmemAllocationProp.win32HandleMetaData` for more). The
-    `size` of this allocation must be a multiple of the the value given via
-    :py:obj:`~.cuMemGetAllocationGranularity` with the
+    ``size`` of this allocation must be a multiple of the value given
+    via :py:obj:`~.cuMemGetAllocationGranularity` with the
     :py:obj:`~.CU_MEM_ALLOC_GRANULARITY_MINIMUM` flag. To create a CPU
     allocation that doesn't target any specific NUMA nodes, applications
     must set :py:obj:`~.CUmemAllocationProp.CUmemLocation.type` to
@@ -36393,7 +36412,7 @@ def cuMemCreate(size_t size, prop : Optional[CUmemAllocationProp], unsigned long
     :py:obj:`~.CUDA_ERROR_INVALID_VALUE`.
 
     Applications that intend to use :py:obj:`~.CU_MEM_HANDLE_TYPE_FABRIC`
-    based memory sharing must ensure: (1) `nvidia-caps-imex-channels`
+    based memory sharing must ensure: (1) ``nvidia-caps-imex-channels``
     character device is created by the driver and is listed under
     /proc/devices (2) have at least one IMEX channel file accessible by the
     user launching the application.
@@ -36409,8 +36428,8 @@ def cuMemCreate(size_t size, prop : Optional[CUmemAllocationProp], unsigned long
     These channel files exist in /dev/nvidia-caps-imex-channels/channel*
     and can be created using standard OS native calls like mknod on Linux.
     For example: To create channel0 with the major number from
-    /proc/devices users can execute the following command: `mknod
-    /dev/nvidia-caps-imex-channels/channel0 c <major number> 0`
+    /proc/devices users can execute the following command: ``mknod
+    /dev/nvidia-caps-imex-channels/channel0 c <major number> 0``
 
     If :py:obj:`~.CUmemAllocationProp.allocFlags.usage` contains
     :py:obj:`~.CU_MEM_CREATE_USAGE_TILE_POOL` flag then the memory
@@ -36462,8 +36481,8 @@ def cuMemRelease(handle):
     generic memory handle can be freed when there are still outstanding
     mappings made with this handle. Each time a recipient process imports a
     shareable handle, it needs to pair it with :py:obj:`~.cuMemRelease` for
-    the handle to be freed. If `handle` is not a valid handle the behavior
-    is undefined.
+    the handle to be freed. If ``handle`` is not a valid handle the
+    behavior is undefined.
 
     Parameters
     ----------
@@ -36498,23 +36517,23 @@ def cuMemRelease(handle):
 def cuMemMap(ptr, size_t size, size_t offset, handle, unsigned long long flags):
     """ Maps an allocation handle to a reserved virtual address range.
 
-    Maps bytes of memory represented by `handle` starting from byte
-    `offset` to `size` to address range [`addr`, `addr` + `size`]. This
-    range must be an address reservation previously reserved with
-    :py:obj:`~.cuMemAddressReserve`, and `offset` + `size` must be less
-    than the size of the memory allocation. Both `ptr`, `size`, and
-    `offset` must be a multiple of the value given via
-    :py:obj:`~.cuMemGetAllocationGranularity` with the
-    :py:obj:`~.CU_MEM_ALLOC_GRANULARITY_MINIMUM` flag. If `handle`
-    represents a multicast object, `ptr`, `size` and `offset` must be
+    Maps bytes of memory represented by ``handle`` starting from byte
+    ``offset`` to ``size`` to address range [``addr``, ``addr`` +
+    ``size``]. This range must be an address reservation previously
+    reserved with :py:obj:`~.cuMemAddressReserve`, and ``offset`` +
+    ``size`` must be less than the size of the memory allocation. Both
+    ``ptr``, ``size``, and ``offset`` must be a multiple of the value given
+    via :py:obj:`~.cuMemGetAllocationGranularity` with the
+    :py:obj:`~.CU_MEM_ALLOC_GRANULARITY_MINIMUM` flag. If ``handle``
+    represents a multicast object, ``ptr``, ``size`` and ``offset`` must be
     aligned to the value returned by :py:obj:`~.cuMulticastGetGranularity`
     with the flag :py:obj:`~.CU_MULTICAST_MINIMUM_GRANULARITY`. For best
-    performance however, it is recommended that `ptr`, `size` and `offset`
-    be aligned to the value returned by
+    performance however, it is recommended that ``ptr``, ``size`` and
+    ``offset`` be aligned to the value returned by
     :py:obj:`~.cuMulticastGetGranularity` with the flag
     :py:obj:`~.CU_MULTICAST_RECOMMENDED_GRANULARITY`.
 
-    When `handle` represents a multicast object, this call may return
+    When ``handle`` represents a multicast object, this call may return
     CUDA_ERROR_ILLEGAL_STATE if the system configuration is in an illegal
     state. In such cases, to continue using multicast, verify that the
     system configuration is in a valid state and all required driver
@@ -36583,8 +36602,9 @@ def cuMemMapArrayAsync(mapInfoList : Optional[tuple[CUarrayMapInfo] | list[CUarr
 
     Performs map or unmap operations on subregions of sparse CUDA arrays
     and sparse CUDA mipmapped arrays. Each operation is specified by a
-    :py:obj:`~.CUarrayMapInfo` entry in the `mapInfoList` array of size
-    `count`. The structure :py:obj:`~.CUarrayMapInfo` is defined as follow:
+    :py:obj:`~.CUarrayMapInfo` entry in the ``mapInfoList`` array of size
+    ``count``. The structure :py:obj:`~.CUarrayMapInfo` is defined as
+    follows:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
@@ -36699,15 +36719,15 @@ def cuMemMapArrayAsync(mapInfoList : Optional[tuple[CUarrayMapInfo] | list[CUarr
     allocation as specified by :py:obj:`~.CUarrayMapInfo.memHandle`.
 
     :py:obj:`~.CUarrayMapInfo.flags` and
-    :py:obj:`~.CUarrayMapInfo.reserved`[] are unused and must be set to
-    zero.
+    :py:obj:`~.CUarrayMapInfo.reserved` ``[]`` are unused and must be set
+    to zero.
 
     Parameters
     ----------
     mapInfoList : list[:py:obj:`~.CUarrayMapInfo`]
         List of :py:obj:`~.CUarrayMapInfo`
     count : unsigned int
-        Count of :py:obj:`~.CUarrayMapInfo` in `mapInfoList`
+        Count of :py:obj:`~.CUarrayMapInfo` in ``mapInfoList``
     hStream : :py:obj:`~.CUstream` or :py:obj:`~.cudaStream_t`
         Stream identifier for the stream to use for map or unmap operations
 
@@ -36800,23 +36820,23 @@ def cuMemUnmap(ptr, size_t size):
 
 @cython.embedsignature(True)
 def cuMemSetAccess(ptr, size_t size, desc : Optional[tuple[CUmemAccessDesc] | list[CUmemAccessDesc]], size_t count):
-    """ Set the access flags for each location specified in `desc` for the given virtual address range.
+    """ Set the access flags for each location specified in ``desc`` for the given virtual address range.
 
-    Given the virtual address range via `ptr` and `size`, and the locations
-    in the array given by `desc` and `count`, set the access flags for the
-    target locations. The range must be a fully mapped address range
-    containing all allocations created by :py:obj:`~.cuMemMap` /
-    :py:obj:`~.cuMemCreate`. Users cannot specify
+    Given the virtual address range via ``ptr`` and ``size``, and the
+    locations in the array given by ``desc`` and ``count``, set the access
+    flags for the target locations. The range must be a fully mapped
+    address range containing all allocations created by
+    :py:obj:`~.cuMemMap` / :py:obj:`~.cuMemCreate`. Users cannot specify
     :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST_NUMA` accessibility for
     allocations created on with other location types. Note: When
     :py:obj:`~.CUmemAccessDesc.CUmemLocation.type` is
     :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST_NUMA`,
     :py:obj:`~.CUmemAccessDesc.CUmemLocation.id` is ignored. When setting
     the access flags for a virtual address range mapping a multicast
-    object, `ptr` and `size` must be aligned to the value returned by
+    object, ``ptr`` and ``size`` must be aligned to the value returned by
     :py:obj:`~.cuMulticastGetGranularity` with the flag
     :py:obj:`~.CU_MULTICAST_MINIMUM_GRANULARITY`. For best performance
-    however, it is recommended that `ptr` and `size` be aligned to the
+    however, it is recommended that ``ptr`` and ``size`` be aligned to the
     value returned by :py:obj:`~.cuMulticastGetGranularity` with the flag
     :py:obj:`~.CU_MULTICAST_RECOMMENDED_GRANULARITY`.
 
@@ -36830,7 +36850,7 @@ def cuMemSetAccess(ptr, size_t size, desc : Optional[tuple[CUmemAccessDesc] | li
         Array of :py:obj:`~.CUmemAccessDesc` that describe how to change
         the
     count : size_t
-        Number of :py:obj:`~.CUmemAccessDesc` in `desc`
+        Number of :py:obj:`~.CUmemAccessDesc` in ``desc``
 
     Returns
     -------
@@ -36873,7 +36893,7 @@ def cuMemSetAccess(ptr, size_t size, desc : Optional[tuple[CUmemAccessDesc] | li
 
 @cython.embedsignature(True)
 def cuMemGetAccess(location : Optional[CUmemLocation], ptr):
-    """ Get the access `flags` set for the given `location` and `ptr`.
+    """ Get the access ``flags`` set for the given ``location`` and ``ptr``.
 
     Parameters
     ----------
@@ -36922,7 +36942,7 @@ def cuMemExportToShareableHandle(handle, handleType not None : CUmemAllocationHa
     memory handle using :py:obj:`~.cuMemImportFromShareableHandle` and map
     it with :py:obj:`~.cuMemMap`. The implementation of what this handle is
     and how it can be transferred is defined by the requested handle type
-    in `handleType`
+    in ``handleType``
 
     Once all shareable handles are closed and the allocation is released,
     the allocated memory referenced will be released back to the OS and
@@ -36937,7 +36957,7 @@ def cuMemExportToShareableHandle(handle, handleType not None : CUmemAllocationHa
         CUDA handle for the memory allocation
     handleType : :py:obj:`~.CUmemAllocationHandleType`
         Type of shareable handle requested (defines type and size of the
-        `shareableHandle` output parameter)
+        ``shareableHandle`` output parameter)
     flags : unsigned long long
         Reserved, must be zero
 
@@ -36980,7 +37000,7 @@ def cuMemImportFromShareableHandle(osHandle, shHandleType not None : CUmemAlloca
     shareable handle, this API will error as
     :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`.
 
-    If `shHandleType` is :py:obj:`~.CU_MEM_HANDLE_TYPE_FABRIC` and the
+    If ``shHandleType`` is :py:obj:`~.CU_MEM_HANDLE_TYPE_FABRIC` and the
     importer process has not been granted access to the same IMEX channel
     as the exporter process, this API will error as
     :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`.
@@ -37007,7 +37027,7 @@ def cuMemImportFromShareableHandle(osHandle, shHandleType not None : CUmemAlloca
 
     Notes
     -----
-    Importing shareable handles exported from some graphics APIs(VUlkan, OpenGL, etc) created on devices under an SLI group may not be supported, and thus this API will return CUDA_ERROR_NOT_SUPPORTED. There is no guarantee that the contents of `handle` will be the same CUDA memory handle for the same given OS shareable handle, or the same underlying allocation.
+    Importing shareable handles exported from some graphics APIs (Vulkan, OpenGL, etc.) created on devices under an SLI group may not be supported, and thus this API will return CUDA_ERROR_NOT_SUPPORTED. There is no guarantee that the contents of ``handle`` will be the same CUDA memory handle for the same given OS shareable handle, or the same underlying allocation.
     """
     cdef CUmemGenericAllocationHandle handle = CUmemGenericAllocationHandle()
     cdef _HelperInputVoidPtrStruct cyosHandleHelper
@@ -37103,7 +37123,7 @@ def cuMemGetAllocationPropertiesFromHandle(handle):
 
 @cython.embedsignature(True)
 def cuMemRetainAllocationHandle(addr):
-    """ Given an address `addr`, returns the allocation handle of the backing memory allocation.
+    """ Given an address ``addr``, returns the allocation handle of the backing memory allocation.
 
     The handle is guaranteed to be the same handle value used to map the
     memory. If the address requested is not mapped, the function will fail.
@@ -37128,7 +37148,7 @@ def cuMemRetainAllocationHandle(addr):
 
     Notes
     -----
-    The address `addr`, can be any address in a range previously mapped by :py:obj:`~.cuMemMap`, and not necessarily the start address.
+    The address ``addr`` can be any address in a range previously mapped by :py:obj:`~.cuMemMap`, and not necessarily the start address.
     """
     cdef CUmemGenericAllocationHandle handle = CUmemGenericAllocationHandle()
     cdef _HelperInputVoidPtrStruct cyaddrHelper
@@ -37147,7 +37167,7 @@ def cuMemRetainAllocationHandle(addr):
 def cuMemFreeAsync(dptr, hStream):
     """ Frees memory with stream ordered semantics.
 
-    Inserts a free operation into `hStream`. The allocation must not be
+    Inserts a free operation into ``hStream``. The allocation must not be
     accessed after stream execution reaches the free. After this API
     returns, accessing the memory from any subsequent work launched on the
     GPU or querying its pointer attributes results in undefined behavior.
@@ -37195,8 +37215,8 @@ def cuMemFreeAsync(dptr, hStream):
 def cuMemAllocAsync(size_t bytesize, hStream):
     """ Allocates memory with stream ordered semantics.
 
-    Inserts an allocation operation into `hStream`. A pointer to the
-    allocated memory is returned immediately in *dptr. The allocation must
+    Inserts an allocation operation into ``hStream``. A pointer to the
+    allocated memory is returned immediately in \\*dptr. The allocation must
     not be accessed until the the allocation operation completes. The
     allocation comes from the memory pool current to the stream's device.
 
@@ -37582,8 +37602,8 @@ def cuMemPoolGetAccess(memPool, location : Optional[CUmemLocation]):
 def cuMemPoolCreate(poolProps : Optional[CUmemPoolProps]):
     """ Creates a memory pool.
 
-    Creates a CUDA memory pool and returns the handle in `pool`. The
-    `poolProps` determines the properties of the pool such as the backing
+    Creates a CUDA memory pool and returns the handle in ``pool``. The
+    ``poolProps`` determines the properties of the pool such as the backing
     device and IPC capabilities.
 
     To create a memory pool for HOST memory not targeting a specific NUMA
@@ -37613,7 +37633,7 @@ def cuMemPoolCreate(poolProps : Optional[CUmemPoolProps]):
     the pool will default to a system dependent value.
 
     Applications that intend to use :py:obj:`~.CU_MEM_HANDLE_TYPE_FABRIC`
-    based memory sharing must ensure: (1) `nvidia-caps-imex-channels`
+    based memory sharing must ensure: (1) ``nvidia-caps-imex-channels``
     character device is created by the driver and is listed under
     /proc/devices (2) have at least one IMEX channel file accessible by the
     user launching the application.
@@ -37629,8 +37649,8 @@ def cuMemPoolCreate(poolProps : Optional[CUmemPoolProps]):
     These channel files exist in /dev/nvidia-caps-imex-channels/channel*
     and can be created using standard OS native calls like mknod on Linux.
     For example: To create channel0 with the major number from
-    /proc/devices users can execute the following command: `mknod
-    /dev/nvidia-caps-imex-channels/channel0 c <major number> 0`
+    /proc/devices users can execute the following command: ``mknod
+    /dev/nvidia-caps-imex-channels/channel0 c <major number> 0``
 
     To create a managed memory pool, applications must set
     :py:obj:`~.CUmemPoolProps.CUmemAllocationType` to
@@ -37894,8 +37914,8 @@ def cuMemSetMemPool(location : Optional[CUmemLocation], typename not None : CUme
 def cuMemAllocFromPoolAsync(size_t bytesize, pool, hStream):
     """ Allocates memory from a specified pool with stream ordered semantics.
 
-    Inserts an allocation operation into `hStream`. A pointer to the
-    allocated memory is returned immediately in *dptr. The allocation must
+    Inserts an allocation operation into ``hStream``. A pointer to the
+    allocated memory is returned immediately in \\*dptr. The allocation must
     not be accessed until the the allocation operation completes. The
     allocation comes from the specified memory pool.
 
@@ -38013,7 +38033,7 @@ def cuMemPoolImportFromShareableHandle(handle, handleType not None : CUmemAlloca
     Specific allocations can be imported from the imported pool with
     cuMemPoolImportPointer.
 
-    If `handleType` is :py:obj:`~.CU_MEM_HANDLE_TYPE_FABRIC` and the
+    If ``handleType`` is :py:obj:`~.CU_MEM_HANDLE_TYPE_FABRIC` and the
     importer process has not been granted access to the same IMEX channel
     as the exporter process, this API will error as
     :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`.
@@ -38060,7 +38080,7 @@ def cuMemPoolImportFromShareableHandle(handle, handleType not None : CUmemAlloca
 def cuMemPoolExportPointer(ptr):
     """ Export data to share a memory pool allocation between processes.
 
-    Constructs `shareData_out` for sharing a specific allocation from an
+    Constructs ``shareData_out`` for sharing a specific allocation from an
     already shared memory pool. The recipient process can import the
     allocation with the :py:obj:`~.cuMemPoolImportPointer` api. The data is
     not a handle and may be shared through any IPC mechanism.
@@ -38103,7 +38123,7 @@ def cuMemPoolExportPointer(ptr):
 def cuMemPoolImportPointer(pool, shareData : Optional[CUmemPoolPtrExportData]):
     """ Import a memory pool allocation from another process.
 
-    Returns in `ptr_out` a pointer to the imported memory. The imported
+    Returns in ``ptr_out`` a pointer to the imported memory. The imported
     memory must not be accessed before the allocation operation completes
     in the exporting process. The imported memory must be freed from all
     importing processes before being freed in the exporting process. The
@@ -38156,7 +38176,7 @@ def cuMemPoolImportPointer(pool, shareData : Optional[CUmemPoolPtrExportData]):
 def cuMulticastCreate(prop : Optional[CUmulticastObjectProp]):
     """ Create a generic allocation handle representing a multicast object described by the given properties.
 
-    This creates a multicast object as described by `prop`. The number of
+    This creates a multicast object as described by ``prop``. The number of
     participating devices is specified by
     :py:obj:`~.CUmulticastObjectProp.numDevices`. Devices can be added to
     the multicast object via :py:obj:`~.cuMulticastAddDevice`. All
@@ -38276,22 +38296,23 @@ def cuMulticastAddDevice(mcHandle, dev):
 def cuMulticastBindMem(mcHandle, size_t mcOffset, memHandle, size_t memOffset, size_t size, unsigned long long flags):
     """ Bind a memory allocation represented by a handle to a multicast object.
 
-    Binds a memory allocation specified by `memHandle` and created via
-    :py:obj:`~.cuMemCreate` to a multicast object represented by `mcHandle`
-    and created via :py:obj:`~.cuMulticastCreate`. The intended `size` of
-    the bind, the offset in the multicast range `mcOffset` as well as the
-    offset in the memory `memOffset` must be a multiple of the value
-    returned by :py:obj:`~.cuMulticastGetGranularity` with the flag
+    Binds a memory allocation specified by ``memHandle`` and created via
+    :py:obj:`~.cuMemCreate` to a multicast object represented by
+    ``mcHandle`` and created via :py:obj:`~.cuMulticastCreate`. The
+    intended ``size`` of the bind, the offset in the multicast range
+    ``mcOffset`` as well as the offset in the memory ``memOffset`` must be
+    a multiple of the value returned by
+    :py:obj:`~.cuMulticastGetGranularity` with the flag
     :py:obj:`~.CU_MULTICAST_GRANULARITY_MINIMUM`. For best performance
-    however, `size`, `mcOffset` and `memOffset` should be aligned to the
-    granularity of the memory allocation(see
+    however, ``size``, ``mcOffset`` and ``memOffset`` should be aligned to
+    the granularity of the memory allocation(see
     :py:obj:`~.cuMemGetAllocationGranularity`) or to the value returned by
     :py:obj:`~.cuMulticastGetGranularity` with the flag
     :py:obj:`~.CU_MULTICAST_GRANULARITY_RECOMMENDED`.
 
-    The `size` + `memOffset` cannot be larger than the size of the
-    allocated memory. Similarly the `size` + `mcOffset` cannot be larger
-    than the size of the multicast object.
+    The ``size`` + ``memOffset`` cannot be larger than the size of the
+    allocated memory. Similarly the ``size`` + ``mcOffset`` cannot be
+    larger than the size of the multicast object.
 
     The memory allocation must have beeen created on one of the devices
     that was added to the multicast team via
@@ -38360,35 +38381,35 @@ def cuMulticastBindMem(mcHandle, size_t mcOffset, memHandle, size_t memOffset, s
 def cuMulticastBindMem_v2(mcHandle, dev, size_t mcOffset, memHandle, size_t memOffset, size_t size, unsigned long long flags):
     """ Bind a memory allocation represented by a handle to a multicast object.
 
-    Binds a memory allocation specified by `memHandle` and created via
-    :py:obj:`~.cuMemCreate` to a multicast object represented by `mcHandle`
-    and created via :py:obj:`~.cuMulticastCreate`. The binding will be
-    applicable for the device `dev`. The intended `size` of the bind, the
-    offset in the multicast range `mcOffset` as well as the offset in the
-    memory `memOffset` must be a multiple of the value returned by
-    :py:obj:`~.cuMulticastGetGranularity` with the flag
+    Binds a memory allocation specified by ``memHandle`` and created via
+    :py:obj:`~.cuMemCreate` to a multicast object represented by
+    ``mcHandle`` and created via :py:obj:`~.cuMulticastCreate`. The binding
+    will be applicable for the device ``dev``. The intended ``size`` of the
+    bind, the offset in the multicast range ``mcOffset`` as well as the
+    offset in the memory ``memOffset`` must be a multiple of the value
+    returned by :py:obj:`~.cuMulticastGetGranularity` with the flag
     :py:obj:`~.CU_MULTICAST_GRANULARITY_MINIMUM`. For best performance
-    however, `size`, `mcOffset` and `memOffset` should be aligned to the
-    granularity of the memory allocation(see
+    however, ``size``, ``mcOffset`` and ``memOffset`` should be aligned to
+    the granularity of the memory allocation(see
     :py:obj:`~.cuMemGetAllocationGranularity`) or to the value returned by
     :py:obj:`~.cuMulticastGetGranularity` with the flag
     :py:obj:`~.CU_MULTICAST_GRANULARITY_RECOMMENDED`.
 
-    The `size` + `memOffset` cannot be larger than the size of the
-    allocated memory. Similarly the `size` + `mcOffset` cannot be larger
-    than the size of the multicast object.
+    The ``size`` + ``memOffset`` cannot be larger than the size of the
+    allocated memory. Similarly the ``size`` + ``mcOffset`` cannot be
+    larger than the size of the multicast object.
 
     The memory allocation must have beeen created on one of the devices
     that was added to the multicast team via
     :py:obj:`~.cuMulticastAddDevice`. For device memory, i.e., type
     :py:obj:`~.CU_MEM_LOCATION_TYPE_DEVICE`, the memory allocation must
-    have been created on the device specified by `dev`. For host NUMA
+    have been created on the device specified by ``dev``. For host NUMA
     memory, i.e., type :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST_NUMA`, the
     memory allocation must have been created on the CPU NUMA node closest
-    to `dev`. That is, the value returned when querying
-    :py:obj:`~.CU_DEVICE_ATTRIBUTE_HOST_NUMA_ID` for `dev`, must be the CPU
-    NUMA node where the memory was allocated. In both cases, the device
-    named by `dev` must have been added to the multicast team via
+    to ``dev``. That is, the value returned when querying
+    :py:obj:`~.CU_DEVICE_ATTRIBUTE_HOST_NUMA_ID` for ``dev``, must be the
+    CPU NUMA node where the memory was allocated. In both cases, the device
+    named by ``dev`` must have been added to the multicast team via
     :py:obj:`~.cuMulticastAddDevice`. Externally shareable as well as
     imported multicast objects can be bound only to externally shareable
     memory. Note that this call will return CUDA_ERROR_OUT_OF_MEMORY if
@@ -38463,20 +38484,20 @@ def cuMulticastBindMem_v2(mcHandle, dev, size_t mcOffset, memHandle, size_t memO
 def cuMulticastBindAddr(mcHandle, size_t mcOffset, memptr, size_t size, unsigned long long flags):
     """ Bind a memory allocation represented by a virtual address to a multicast object.
 
-    Binds a memory allocation specified by its mapped address `memptr` to a
-    multicast object represented by `mcHandle`. The memory must have been
-    allocated via :py:obj:`~.cuMemCreate` or :py:obj:`~.cudaMallocAsync`.
-    The intended `size` of the bind, the offset in the multicast range
-    `mcOffset` and `memptr` must be a multiple of the value returned by
-    :py:obj:`~.cuMulticastGetGranularity` with the flag
-    :py:obj:`~.CU_MULTICAST_GRANULARITY_MINIMUM`. For best performance
-    however, `size`, `mcOffset` and `memptr` should be aligned to the value
-    returned by :py:obj:`~.cuMulticastGetGranularity` with the flag
-    :py:obj:`~.CU_MULTICAST_GRANULARITY_RECOMMENDED`.
+    Binds a memory allocation specified by its mapped address ``memptr`` to
+    a multicast object represented by ``mcHandle``. The memory must have
+    been allocated via :py:obj:`~.cuMemCreate` or
+    :py:obj:`~.cudaMallocAsync`. The intended ``size`` of the bind, the
+    offset in the multicast range ``mcOffset`` and ``memptr`` must be a
+    multiple of the value returned by :py:obj:`~.cuMulticastGetGranularity`
+    with the flag :py:obj:`~.CU_MULTICAST_GRANULARITY_MINIMUM`. For best
+    performance however, ``size``, ``mcOffset`` and ``memptr`` should be
+    aligned to the value returned by :py:obj:`~.cuMulticastGetGranularity`
+    with the flag :py:obj:`~.CU_MULTICAST_GRANULARITY_RECOMMENDED`.
 
-    The `size` cannot be larger than the size of the allocated memory.
-    Similarly the `size` + `mcOffset` cannot be larger than the total size
-    of the multicast object.
+    The ``size`` cannot be larger than the size of the allocated memory.
+    Similarly the ``size`` + ``mcOffset`` cannot be larger than the total
+    size of the multicast object.
 
     The memory allocation must have beeen created on one of the devices
     that was added to the multicast team via
@@ -38543,31 +38564,31 @@ def cuMulticastBindAddr(mcHandle, size_t mcOffset, memptr, size_t size, unsigned
 def cuMulticastBindAddr_v2(mcHandle, dev, size_t mcOffset, memptr, size_t size, unsigned long long flags):
     """ Bind a memory allocation represented by a virtual address to a multicast object.
 
-    Binds a memory allocation specified by its mapped address `memptr` to a
-    multicast object represented by `mcHandle`. The binding will be
-    applicable for the device `dev`. The memory must have been allocated
+    Binds a memory allocation specified by its mapped address ``memptr`` to
+    a multicast object represented by ``mcHandle``. The binding will be
+    applicable for the device ``dev``. The memory must have been allocated
     via :py:obj:`~.cuMemCreate` or :py:obj:`~.cudaMallocAsync`. The
-    intended `size` of the bind, the offset in the multicast range
-    `mcOffset` and `memptr` must be a multiple of the value returned by
+    intended ``size`` of the bind, the offset in the multicast range
+    ``mcOffset`` and ``memptr`` must be a multiple of the value returned by
     :py:obj:`~.cuMulticastGetGranularity` with the flag
     :py:obj:`~.CU_MULTICAST_GRANULARITY_MINIMUM`. For best performance
-    however, `size`, `mcOffset` and `memptr` should be aligned to the value
-    returned by :py:obj:`~.cuMulticastGetGranularity` with the flag
+    however, ``size``, ``mcOffset`` and ``memptr`` should be aligned to the
+    value returned by :py:obj:`~.cuMulticastGetGranularity` with the flag
     :py:obj:`~.CU_MULTICAST_GRANULARITY_RECOMMENDED`.
 
-    The `size` cannot be larger than the size of the allocated memory.
-    Similarly the `size` + `mcOffset` cannot be larger than the total size
-    of the multicast object.
+    The ``size`` cannot be larger than the size of the allocated memory.
+    Similarly the ``size`` + ``mcOffset`` cannot be larger than the total
+    size of the multicast object.
 
     For device memory, i.e., type :py:obj:`~.CU_MEM_LOCATION_TYPE_DEVICE`,
     the memory allocation must have been created on the device specified by
-    `dev`. For host NUMA memory, i.e., type
+    ``dev``. For host NUMA memory, i.e., type
     :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST_NUMA`, the memory allocation must
-    have been created on the CPU NUMA node closest to `dev`. That is, the
+    have been created on the CPU NUMA node closest to ``dev``. That is, the
     value returned when querying
-    :py:obj:`~.CU_DEVICE_ATTRIBUTE_HOST_NUMA_ID` for `dev`, must be the CPU
-    NUMA node where the memory was allocated. In both cases, the device
-    named by `dev` must have been added to the multicast team via
+    :py:obj:`~.CU_DEVICE_ATTRIBUTE_HOST_NUMA_ID` for ``dev``, must be the
+    CPU NUMA node where the memory was allocated. In both cases, the device
+    named by ``dev`` must have been added to the multicast team via
     :py:obj:`~.cuMulticastAddDevice`. Externally shareable as well as
     imported multicast objects can be bound only to externally shareable
     memory. Note that this call will return CUDA_ERROR_OUT_OF_MEMORY if
@@ -38640,13 +38661,14 @@ def cuMulticastBindAddr_v2(mcHandle, dev, size_t mcOffset, memptr, size_t size,
 def cuMulticastUnbind(mcHandle, dev, size_t mcOffset, size_t size):
     """ Unbind any memory allocations bound to a multicast object at a given offset and upto a given size.
 
-    Unbinds any memory allocations hosted on `dev` and bound to a multicast
-    object at `mcOffset` and upto a given `size`. The intended `size` of
-    the unbind and the offset in the multicast range ( `mcOffset` ) must be
-    a multiple of the value returned by
+    Unbinds any memory allocations hosted on ``dev`` and bound to a
+    multicast object at ``mcOffset`` and up to a given ``size``. The
+    intended ``size`` of the unbind and the offset in the multicast range
+    (``mcOffset``) must be a multiple of the value returned by
     :py:obj:`~.cuMulticastGetGranularity` flag
-    :py:obj:`~.CU_MULTICAST_GRANULARITY_MINIMUM`. The `size` + `mcOffset`
-    cannot be larger than the total size of the multicast object.
+    :py:obj:`~.CU_MULTICAST_GRANULARITY_MINIMUM`. The ``size`` +
+    ``mcOffset`` cannot be larger than the total size of the multicast
+    object.
 
     Parameters
     ----------
@@ -38672,7 +38694,7 @@ def cuMulticastUnbind(mcHandle, dev, size_t mcOffset, size_t size):
 
     Notes
     -----
-    Warning: The `mcOffset` and the `size` must match the corresponding values specified during the bind call. Any other values may result in undefined behavior.
+    Warning: The ``mcOffset`` and the ``size`` must match the corresponding values specified during the bind call. Any other values may result in undefined behavior.
     """
     cdef cydriver.CUdevice cydev
     if dev is None:
@@ -38742,9 +38764,9 @@ def cuMulticastGetGranularity(prop : Optional[CUmulticastObjectProp], option not
 def cuLogicalEndpointIdReserve(count):
     """ Reserves a range of logical endpoint ids.
 
-    Reserves a range of logical endpoint ids starting at `*baseLeId` and
-    extending for `count`. The reserved ids can be used to create or import
-    logical endpoints via :py:obj:`~.cuLogicalEndpointCreate` or
+    Reserves a range of logical endpoint ids starting at ``*baseLeId`` and
+    extending for ``count``. The reserved ids can be used to create or
+    import logical endpoints via :py:obj:`~.cuLogicalEndpointCreate` or
     :py:obj:`~.cuLogicalEndpointImport` respectively.
 
     Parameters
@@ -38758,7 +38780,7 @@ def cuLogicalEndpointIdReserve(count):
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`,
     baseLeId : :py:obj:`~.CUlogicalEndpointId`
         If :py:obj:`~.cuLogicalEndpointIdReserve` returns CUDA_SUCCESS,
-        *baseLeId contains the base logical endpoint id of the reserved
+        \\*baseLeId contains the base logical endpoint id of the reserved
         logical endpoint id range.
 
     See Also
@@ -38787,10 +38809,10 @@ def cuLogicalEndpointIdReserve(count):
 def cuLogicalEndpointIdRelease(baseLeId, count):
     """ Releases a range of logical endpoint ids.
 
-    Releases up to `count` logical endpoint ids starting at `baseLeId`. The
-    range of ids represented by [`baseLeId`, `baseLeId` + `count`) must all
-    be previously reserved. All logical endpoints in the range must be
-    destroyed before they can be released.
+    Releases up to ``count`` logical endpoint ids starting at ``baseLeId``.
+    The range of ids represented by [``baseLeId``, ``baseLeId`` +
+    ``count``) must all be previously reserved. All logical endpoints in
+    the range must be destroyed before they can be released.
 
     Parameters
     ----------
@@ -38835,7 +38857,7 @@ def cuLogicalEndpointIdRelease(baseLeId, count):
 def cuLogicalEndpointCreate(leId, prop : Optional[CUlogicalEndpointProp]):
     """ Creates a logical endpoint with the requested properties and associates it with the logical endpoint id.
 
-    This creates a logical endpoint as described by `prop`. The number of
+    This creates a logical endpoint as described by ``prop``. The number of
     participating devices is determined by the
     :py:obj:`~.CUlogicalEndpointProp.type`. If the type is
     :py:obj:`~.CU_LOGICAL_ENDPOINT_TYPE_UNICAST` then
@@ -38862,9 +38884,9 @@ def cuLogicalEndpointCreate(leId, prop : Optional[CUlogicalEndpointProp]):
     :py:obj:`~.cuLogicalEndpointUnbind`. The total amount of memory that
     can be bound per device is specified by
     :py:obj:`~.CUlogicalEndpointProp.size`. This size must be a multiple of
-    the value for `bindAlignment` as returned by
+    the value for ``bindAlignment`` as returned by
     :py:obj:`~.cuLogicalEndpointGetLimits`. The maximum size for the
-    logical endpoint cannot exceed the value for `maxSize` as returned by
+    logical endpoint cannot exceed the value for ``maxSize`` as returned by
     :py:obj:`~.cuLogicalEndpointGetLimits`. The bind alignment and maximum
     size depend on the properties of the logical endpoint.
 
@@ -38999,31 +39021,31 @@ def cuLogicalEndpointDestroy(leId):
 def cuLogicalEndpointBindAddr(leId, dev, offset, ptr, size, unsigned long long flags):
     """ Bind a memory allocation represented by a virtual address to a logical endpoint.
 
-    Binds the memory allocation specified by its mapped address `ptr` to a
-    logical endpoint represented by `leId` at the offset `offset`. The
-    memory must have been allocated via :py:obj:`~.cuMemCreate` or
-    :py:obj:`~.cudaMallocAsync`. The intended `size` of the bind, the
-    `offset` in the logical endpoint range and `ptr` must be multiples of
-    the value for `bindAlignment` as returned by
+    Binds the memory allocation specified by its mapped address ``ptr`` to
+    a logical endpoint represented by ``leId`` at the offset ``offset``.
+    The memory must have been allocated via :py:obj:`~.cuMemCreate` or
+    :py:obj:`~.cudaMallocAsync`. The intended ``size`` of the bind, the
+    ``offset`` in the logical endpoint range and ``ptr`` must be multiples
+    of the value for ``bindAlignment`` as returned by
     :py:obj:`~.cuLogicalEndpointGetLimits`.
 
-    The `size` cannot be larger than the size of the allocated memory.
-    Similarly the `size` + `offset` cannot be larger than the total size of
-    the logical endpoint.
+    The ``size`` cannot be larger than the size of the allocated memory.
+    Similarly the ``size`` + ``offset`` cannot be larger than the total
+    size of the logical endpoint.
 
     For device memory, i.e., type :py:obj:`~.CU_MEM_LOCATION_TYPE_DEVICE`,
     the memory allocation must have been created on the device specified by
-    `dev`. For host NUMA memory, i.e., type
+    ``dev``. For host NUMA memory, i.e., type
     :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST_NUMA`, the memory allocation must
-    have been created on the CPU NUMA node closest to `dev`. That is, the
+    have been created on the CPU NUMA node closest to ``dev``. That is, the
     value returned when querying
-    :py:obj:`~.CU_DEVICE_ATTRIBUTE_HOST_NUMA_ID` for `dev`, must be the CPU
-    NUMA node where the memory was allocated.
+    :py:obj:`~.CU_DEVICE_ATTRIBUTE_HOST_NUMA_ID` for ``dev``, must be the
+    CPU NUMA node where the memory was allocated.
 
-    For multicast endpoints, the device named by `dev` must have been added
-    to the multicast team via :py:obj:`~.cuLogicalEndpointAddDevice`.
+    For multicast endpoints, the device named by ``dev`` must have been
+    added to the multicast team via :py:obj:`~.cuLogicalEndpointAddDevice`.
 
-    For unicast endpoints the device named by `dev` must be the owner
+    For unicast endpoints the device named by ``dev`` must be the owner
     device specified during :py:obj:`~.cuLogicalEndpointCreate` via
     :py:obj:`~.CUlogicalEndpointProp.unicast.device`.
 
@@ -39114,31 +39136,31 @@ def cuLogicalEndpointBindAddr(leId, dev, offset, ptr, size, unsigned long long f
 def cuLogicalEndpointBindMem(leId, dev, offset, memHandle, memOffset, size, unsigned long long flags):
     """ Binds memory object represented by a handle to the logical endpoint.
 
-    Binds the memory allocation specified by `memHandle` to a logical
-    endpoint represented by `leId` at the offset `offset`. The memory must
-    have been allocated via :py:obj:`~.cuMemCreate`. The intended `size` of
-    the bind, the offset in the logical endpoint range `offset` and the
-    offset in the memory handle `memOffset` must be multiples of the value
-    for `bindAlignment` as returned by
+    Binds the memory allocation specified by ``memHandle`` to a logical
+    endpoint represented by ``leId`` at the offset ``offset``. The memory
+    must have been allocated via :py:obj:`~.cuMemCreate`. The intended
+    ``size`` of the bind, the offset in the logical endpoint range
+    ``offset`` and the offset in the memory handle ``memOffset`` must be
+    multiples of the value for ``bindAlignment`` as returned by
     :py:obj:`~.cuLogicalEndpointGetLimits`.
 
-    The `size` + `memOffset` cannot be larger than the size of the
-    allocated memory. Similarly the `size` + `offset` cannot be larger than
-    the total size of the logical endpoint.
+    The ``size`` + ``memOffset`` cannot be larger than the size of the
+    allocated memory. Similarly the ``size`` + ``offset`` cannot be larger
+    than the total size of the logical endpoint.
 
     For device memory, i.e., type :py:obj:`~.CU_MEM_LOCATION_TYPE_DEVICE`,
     the memory allocation must have been created on the device specified by
-    `dev`. For host NUMA memory, i.e., type
+    ``dev``. For host NUMA memory, i.e., type
     :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST_NUMA`, the memory allocation must
-    have been created on the CPU NUMA node closest to `dev`. That is, the
+    have been created on the CPU NUMA node closest to ``dev``. That is, the
     value returned when querying
-    :py:obj:`~.CU_DEVICE_ATTRIBUTE_HOST_NUMA_ID` for `dev`, must be the CPU
-    NUMA node where the memory was allocated.
+    :py:obj:`~.CU_DEVICE_ATTRIBUTE_HOST_NUMA_ID` for ``dev``, must be the
+    CPU NUMA node where the memory was allocated.
 
-    For multicast endpoints, the device named by `dev` must have been added
-    to the multicast team via :py:obj:`~.cuLogicalEndpointAddDevice`.
+    For multicast endpoints, the device named by ``dev`` must have been
+    added to the multicast team via :py:obj:`~.cuLogicalEndpointAddDevice`.
 
-    For unicast endpoints the device named by `dev` must be the owner
+    For unicast endpoints the device named by ``dev`` must be the owner
     device specified during :py:obj:`~.cuLogicalEndpointCreate` via
     :py:obj:`~.CUlogicalEndpointProp.unicast.device`.
 
@@ -39244,10 +39266,10 @@ def cuLogicalEndpointBindMem(leId, dev, offset, memHandle, memOffset, size, unsi
 def cuLogicalEndpointUnbind(leId, dev, offset, size):
     """ Unbinds any binding at offset from the logical endpoint.
 
-    Unbinds any memory allocations bound to the logical endpoint on `dev`
-    at `offset` and up to the given `size`. The intended `size` of the
-    unbind and the offset in the logical endpoint range `offset` must be
-    multiples of the value for `bindAlignment` as returned by
+    Unbinds any memory allocations bound to the logical endpoint on ``dev``
+    at ``offset`` and up to the given ``size``. The intended ``size`` of
+    the unbind and the offset in the logical endpoint range ``offset`` must
+    be multiples of the value for ``bindAlignment`` as returned by
     :py:obj:`~.cuLogicalEndpointGetLimits`.
 
     Parameters
@@ -39272,7 +39294,7 @@ def cuLogicalEndpointUnbind(leId, dev, offset, size):
 
     Notes
     -----
-    The `offset` must correspond to a value specified during a bind call. The `size` must either match the bind call of the offset or be the combined `size` of multiple bind calls. The `size` + `offset` must fully enclose all bindings that are covered.
+    The ``offset`` must correspond to a value specified during a bind call. The ``size`` must either match the bind call of the offset or be the combined ``size`` of multiple bind calls. The ``size`` + ``offset`` must fully enclose all bindings that are covered.
     """
     cdef cydriver.cuuint64_t cysize
     if size is None:
@@ -39317,12 +39339,12 @@ def cuLogicalEndpointUnbind(leId, dev, offset, size):
 def cuLogicalEndpointExport(leId, handleType not None : CUlogicalEndpointIpcHandleType):
     """ Exports a logical endpoint associated with leId to an IPC handle.
 
-    Given a logical endpoint id `leId`, create a shareable handle `handle`
-    that can be used to share the logical endpoint with other processes.
-    The recipient process can convert the shareable handle back into a
-    logical endpoint id using :py:obj:`~.cuLogicalEndpointImport`. The
-    implementation of what this `handle` is and how it can be transfered is
-    defined by the requested handle type in `handletype`.
+    Given a logical endpoint id ``leId``, create a shareable handle
+    ``handle`` that can be used to share the logical endpoint with other
+    processes. The recipient process can convert the shareable handle back
+    into a logical endpoint id using :py:obj:`~.cuLogicalEndpointImport`.
+    The implementation of what this ``handle`` is and how it can be
+    transferred is defined by the requested handle type in ``handleType``.
 
     Parameters
     ----------
@@ -39367,12 +39389,12 @@ def cuLogicalEndpointExport(leId, handleType not None : CUlogicalEndpointIpcHand
 def cuLogicalEndpointImport(leId, handle, handleType not None : CUlogicalEndpointIpcHandleType):
     """ Imports a logical endpoint from the given IPC handle and associates it with a logical endpoint id.
 
-    Imports a logical endpoint from the given IPC `handle` and associates
-    it with the logical endpoint id specified by `leId`.
+    Imports a logical endpoint from the given IPC ``handle`` and associates
+    it with the logical endpoint id specified by ``leId``.
 
     If the current process cannot support the logical endpoint described by
     the shareable handle, this API will error as
-    :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`. If `handle` is of type
+    :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`. If ``handle`` is of type
     :py:obj:`~.CU_LOGICAL_ENDPOINT_IPC_HANDLE_TYPE_FABRIC` and the importer
     process does not have access permissions, then
     :py:obj:`~.CUDA_ERROR_NOT_PERMITTED` will be returned
@@ -39420,10 +39442,11 @@ def cuLogicalEndpointImport(leId, handle, handleType not None : CUlogicalEndpoin
 def cuLogicalEndpointGetLimits(prop : Optional[CUlogicalEndpointProp]):
     """ Calculates the minimum alignment and the maximum size for the given logical endpoint properties.
 
-    The `bindAlignment` can be used as a multiple for size and bind offset
-    values. The `maxSize` is the maximum size of the logical endpoint. If
-    `maxSize` is less than :py:obj:`~.CUlogicalEndpointProp`:size the user
-    must adjust the request to the smaller value.
+    The ``bindAlignment`` can be used as a multiple for size and bind
+    offset values. The ``maxSize`` is the maximum size of the logical
+    endpoint. If ``maxSize`` is less than
+    :py:obj:`~.CUlogicalEndpointProp.size` the user must adjust the request
+    to the smaller value.
 
     Parameters
     ----------
@@ -39460,16 +39483,16 @@ def cuLogicalEndpointQuery(leId, count):
     """ Determines if all logical endpoints in the range have been successfully constructed.
 
     Queries the driver to determine if all logical endpoints in the given
-    range starting at `leId` and extending for `count` have been
+    range starting at ``leId`` and extending for ``count`` have been
     successfully constructed.
 
     Provides a mechanism to ensure that it is safe to begin using a logical
     endpoint ID. Using a logical endpoint ID before verifying that it is
     fully constructed can result in undefined behavior.
 
-    This is not a blocking API, it returns immediately with a `queryStatus`
-    of 0 if any logical endpoint ID in the given range is not fully
-    constructed, and a non-zero value otherwise.
+    This is not a blocking API, it returns immediately with a
+    ``queryStatus`` of 0 if any logical endpoint ID in the given range is
+    not fully constructed, and a non-zero value otherwise.
 
     Parameters
     ----------
@@ -39525,30 +39548,30 @@ def cuPointerGetAttribute(attribute not None : CUpointer_attribute, ptr):
 
     - :py:obj:`~.CU_POINTER_ATTRIBUTE_CONTEXT`:
 
-    - Returns in `*data` the :py:obj:`~.CUcontext` in which `ptr` was
-      allocated or registered. The type of `data` must be
+    - Returns in ``*data`` the :py:obj:`~.CUcontext` in which ``ptr`` was
+      allocated or registered. The type of ``data`` must be
       :py:obj:`~.CUcontext` *.
 
-    - If `ptr` was not allocated by, mapped by, or registered with a
+    - If ``ptr`` was not allocated by, mapped by, or registered with a
       :py:obj:`~.CUcontext` which uses unified virtual addressing then
       :py:obj:`~.CUDA_ERROR_INVALID_VALUE` is returned.
 
     - :py:obj:`~.CU_POINTER_ATTRIBUTE_MEMORY_TYPE`:
 
-    - Returns in `*data` the physical memory type of the memory that `ptr`
-      addresses as a :py:obj:`~.CUmemorytype` enumerated value. The type of
-      `data` must be unsigned int.
+    - Returns in ``*data`` the physical memory type of the memory that
+      ``ptr`` addresses as a :py:obj:`~.CUmemorytype` enumerated value. The
+      type of ``data`` must be unsigned int.
 
-    - If `ptr` addresses device memory then `*data` is set to
+    - If ``ptr`` addresses device memory then ``*data`` is set to
       :py:obj:`~.CU_MEMORYTYPE_DEVICE`. The particular :py:obj:`~.CUdevice`
       on which the memory resides is the :py:obj:`~.CUdevice` of the
       :py:obj:`~.CUcontext` returned by the
-      :py:obj:`~.CU_POINTER_ATTRIBUTE_CONTEXT` attribute of `ptr`.
+      :py:obj:`~.CU_POINTER_ATTRIBUTE_CONTEXT` attribute of ``ptr``.
 
-    - If `ptr` addresses host memory then `*data` is set to
+    - If ``ptr`` addresses host memory then ``*data`` is set to
       :py:obj:`~.CU_MEMORYTYPE_HOST`.
 
-    - If `ptr` was not allocated by, mapped by, or registered with a
+    - If ``ptr`` was not allocated by, mapped by, or registered with a
       :py:obj:`~.CUcontext` which uses unified virtual addressing then
       :py:obj:`~.CUDA_ERROR_INVALID_VALUE` is returned.
 
@@ -39557,60 +39580,60 @@ def cuPointerGetAttribute(attribute not None : CUpointer_attribute, ptr):
 
     - :py:obj:`~.CU_POINTER_ATTRIBUTE_DEVICE_POINTER`:
 
-    - Returns in `*data` the device pointer value through which `ptr` may
-      be accessed by kernels running in the current :py:obj:`~.CUcontext`.
-      The type of `data` must be CUdeviceptr *.
+    - Returns in ``*data`` the device pointer value through which ``ptr``
+      may be accessed by kernels running in the current
+      :py:obj:`~.CUcontext`. The type of ``data`` must be CUdeviceptr *.
 
     - If there exists no device pointer value through which kernels running
-      in the current :py:obj:`~.CUcontext` may access `ptr` then
+      in the current :py:obj:`~.CUcontext` may access ``ptr`` then
       :py:obj:`~.CUDA_ERROR_INVALID_VALUE` is returned.
 
     - If there is no current :py:obj:`~.CUcontext` then
       :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT` is returned.
 
     - Except in the exceptional disjoint addressing cases discussed below,
-      the value returned in `*data` will equal the input value `ptr`.
+      the value returned in ``*data`` will equal the input value ``ptr``.
 
     - :py:obj:`~.CU_POINTER_ATTRIBUTE_HOST_POINTER`:
 
-    - Returns in `*data` the host pointer value through which `ptr` may be
-      accessed by by the host program. The type of `data` must be void **.
-      If there exists no host pointer value through which the host program
-      may directly access `ptr` then :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
-      is returned.
+    - Returns in ``*data`` the host pointer value through which ``ptr`` may
+      be accessed by the host program. The type of ``data`` must be void
+      **. If there exists no host pointer value through which the host
+      program may directly access ``ptr`` then
+      :py:obj:`~.CUDA_ERROR_INVALID_VALUE` is returned.
 
     - Except in the exceptional disjoint addressing cases discussed below,
-      the value returned in `*data` will equal the input value `ptr`.
+      the value returned in ``*data`` will equal the input value ``ptr``.
 
     - :py:obj:`~.CU_POINTER_ATTRIBUTE_P2P_TOKENS`:
 
-    - Returns in `*data` two tokens for use with the nv-p2p.h Linux kernel
-      interface. `data` must be a struct of type
+    - Returns in ``*data`` two tokens for use with the nv-p2p.h Linux
+      kernel interface. ``data`` must be a struct of type
       CUDA_POINTER_ATTRIBUTE_P2P_TOKENS.
 
-    - `ptr` must be a pointer to memory obtained from
+    - ``ptr`` must be a pointer to memory obtained from
       :py:obj:`~.py`:obj:`~.cuMemAlloc()`. Note that p2pToken and
       vaSpaceToken are only valid for the lifetime of the source
       allocation. A subsequent allocation at the same address may return
       completely different tokens. Querying this attribute has a side
       effect of setting the attribute
       :py:obj:`~.CU_POINTER_ATTRIBUTE_SYNC_MEMOPS` for the region of memory
-      that `ptr` points to.
+      that ``ptr`` points to.
 
     - :py:obj:`~.CU_POINTER_ATTRIBUTE_SYNC_MEMOPS`:
 
     - A boolean attribute which when set, ensures that synchronous memory
-      operations initiated on the region of memory that `ptr` points to
+      operations initiated on the region of memory that ``ptr`` points to
       will always synchronize. See further documentation in the section
       titled "API synchronization behavior" to learn more about cases when
       synchronous memory operations can exhibit asynchronous behavior.
 
     - :py:obj:`~.CU_POINTER_ATTRIBUTE_BUFFER_ID`:
 
-    - Returns in `*data` a buffer ID which is guaranteed to be unique
-      within the process. `data` must point to an unsigned long long.
+    - Returns in ``*data`` a buffer ID which is guaranteed to be unique
+      within the process. ``data`` must point to an unsigned long long.
 
-    - `ptr` must be a pointer to memory obtained from a CUDA memory
+    - ``ptr`` must be a pointer to memory obtained from a CUDA memory
       allocation API. Every memory allocation from any of the CUDA memory
       allocation APIs will have a unique ID over a process lifetime.
       Subsequent allocations do not reuse IDs from previous freed
@@ -39618,40 +39641,41 @@ def cuPointerGetAttribute(attribute not None : CUpointer_attribute, ptr):
 
     - :py:obj:`~.CU_POINTER_ATTRIBUTE_IS_MANAGED`:
 
-    - Returns in `*data` a boolean that indicates whether the pointer
+    - Returns in ``*data`` a boolean that indicates whether the pointer
       points to managed memory or not.
 
-    - If `ptr` is not a valid CUDA pointer then
+    - If ``ptr`` is not a valid CUDA pointer then
       :py:obj:`~.CUDA_ERROR_INVALID_VALUE` is returned.
 
     - :py:obj:`~.CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL`:
 
-    - Returns in `*data` an integer representing a device ordinal of a
+    - Returns in ``*data`` an integer representing a device ordinal of a
       device against which the memory was allocated or registered.
 
     - :py:obj:`~.CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE`:
 
-    - Returns in `*data` a boolean that indicates if this pointer maps to
+    - Returns in ``*data`` a boolean that indicates if this pointer maps to
       an allocation that is suitable for :py:obj:`~.cudaIpcGetMemHandle`.
 
     - :py:obj:`~.CU_POINTER_ATTRIBUTE_RANGE_START_ADDR`:
 
-    - Returns in `*data` the starting address for the allocation referenced
-      by the device pointer `ptr`. Note that this is not necessarily the
-      address of the mapped region, but the address of the mappable address
-      range `ptr` references (e.g. from :py:obj:`~.cuMemAddressReserve`).
+    - Returns in ``*data`` the starting address for the allocation
+      referenced by the device pointer ``ptr``. Note that this is not
+      necessarily the address of the mapped region, but the address of the
+      mappable address range ``ptr`` references (e.g. from
+      :py:obj:`~.cuMemAddressReserve`).
 
     - :py:obj:`~.CU_POINTER_ATTRIBUTE_RANGE_SIZE`:
 
-    - Returns in `*data` the size for the allocation referenced by the
-      device pointer `ptr`. Note that this is not necessarily the size of
-      the mapped region, but the size of the mappable address range `ptr`
+    - Returns in ``*data`` the size for the allocation referenced by the
+      device pointer ``ptr``. Note that this is not necessarily the size of
+      the mapped region, but the size of the mappable address range ``ptr``
       references (e.g. from :py:obj:`~.cuMemAddressReserve`). To retrieve
       the size of the mapped region, see :py:obj:`~.cuMemGetAddressRange`
 
     - :py:obj:`~.CU_POINTER_ATTRIBUTE_MAPPED`:
 
-    - Returns in `*data` a boolean that indicates if this pointer is in a
+    - Returns in ``*data`` a boolean that indicates if this pointer is in a
       valid address range that is mapped to a backing allocation.
 
     - :py:obj:`~.CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES`:
@@ -39661,12 +39685,12 @@ def cuPointerGetAttribute(attribute not None : CUpointer_attribute, ptr):
 
     - :py:obj:`~.CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE`:
 
-    - Returns in `*data` the handle to the mempool that the allocation was
-      obtained from.
+    - Returns in ``*data`` the handle to the mempool that the allocation
+      was obtained from.
 
     - :py:obj:`~.CU_POINTER_ATTRIBUTE_IS_HW_DECOMPRESS_CAPABLE`:
 
-    - Returns in `*data` a boolean that indicates whether the pointer
+    - Returns in ``*data`` a boolean that indicates whether the pointer
       points to memory that is capable to be used for hardware accelerated
       decompression.
 
@@ -39734,10 +39758,10 @@ def cuPointerGetAttribute(attribute not None : CUpointer_attribute, ptr):
 def cuMemPrefetchAsync(devPtr, size_t count, location not None : CUmemLocation, unsigned int flags, hStream):
     """ Prefetches memory to the specified destination location.
 
-    Prefetches memory to the specified destination location. `devPtr` is
-    the base device pointer of the memory to be prefetched and `location`
-    specifies the destination location. `count` specifies the number of
-    bytes to copy. `hStream` is the stream in which the operation is
+    Prefetches memory to the specified destination location. ``devPtr`` is
+    the base device pointer of the memory to be prefetched and ``location``
+    specifies the destination location. ``count`` specifies the number of
+    bytes to copy. ``hStream`` is the stream in which the operation is
     enqueued. The memory range must refer to managed memory allocated via
     :py:obj:`~.cuMemAllocManaged`, via :py:obj:`~.cuMemAllocFromPool` from
     a managed memory pool or declared via managed variables.
@@ -39747,7 +39771,7 @@ def cuMemPrefetchAsync(devPtr, size_t count, location not None : CUmemLocation,
     device ordinal :py:obj:`~.CUmemLocation.id` which must have non-zero
     value for the device attribute
     :py:obj:`~.CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS`.
-    Additionally, `hStream` must be associated with a device that has a
+    Additionally, ``hStream`` must be associated with a device that has a
     non-zero value for the device attribute
     :py:obj:`~.CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS`. Specifying
     :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST` as :py:obj:`~.CUmemLocation.type`
@@ -39790,7 +39814,7 @@ def cuMemPrefetchAsync(devPtr, size_t count, location not None : CUmemLocation,
 
     If :py:obj:`~.CU_MEM_ADVISE_SET_PREFERRED_LOCATION` was called on any
     subset of this memory range, then the pages will be migrated to
-    `location` even if `location` is not the preferred location of any
+    ``location`` even if ``location`` is not the preferred location of any
     pages in the memory range.
 
     If :py:obj:`~.CU_MEM_ADVISE_SET_ACCESSED_BY` was called on any subset
@@ -39858,18 +39882,18 @@ def cuMemAdvise(devPtr, size_t count, advice not None : CUmem_advise, location n
     """ Advise about the usage of a given memory range.
 
     Advise the Unified Memory subsystem about the usage pattern for the
-    memory range starting at `devPtr` with a size of `count` bytes. The
+    memory range starting at ``devPtr`` with a size of ``count`` bytes. The
     start address and end address of the memory range will be rounded down
     and rounded up respectively to be aligned to CPU page size before the
     advice is applied. The memory range must refer to managed memory
     allocated via :py:obj:`~.cuMemAllocManaged` or declared via managed
     variables. The memory range could also refer to system-allocated
     pageable memory provided it represents a valid, host-accessible region
-    of memory and all additional constraints imposed by `advice` as
+    of memory and all additional constraints imposed by ``advice`` as
     outlined below are also satisfied. Specifying an invalid system-
     allocated pageable memory range results in an error being returned.
 
-    The `advice` parameter can take the following values:
+    The ``advice`` parameter can take the following values:
 
     - :py:obj:`~.CU_MEM_ADVISE_SET_READ_MOSTLY`: This implies that the data
       is mostly going to be read from and only occasionally written to. Any
@@ -39885,9 +39909,9 @@ def cuMemAdvise(devPtr, size_t count, advice not None : CUmem_advise, location n
       except for the one where the write occurred. If the writing processor
       is the CPU and the preferred location of the page is a host NUMA
       node, then the page will also be migrated to that host NUMA node. The
-      `location` argument is ignored for this advice. Note that for a page
-      to be read-duplicated, the accessing processor must either be the CPU
-      or a GPU that has a non-zero value for the device attribute
+      ``location`` argument is ignored for this advice. Note that for a
+      page to be read-duplicated, the accessing processor must either be
+      the CPU or a GPU that has a non-zero value for the device attribute
       :py:obj:`~.CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS`. Also, if a
       context is created on a device that does not have the device
       attribute :py:obj:`~.CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS`
@@ -39909,12 +39933,12 @@ def cuMemAdvise(devPtr, size_t count, advice not None : CUmem_advise, location n
       collapsed into a single copy. The location for the collapsed copy
       will be the preferred location if the page has a preferred location
       and one of the read-duplicated copies was resident at that location.
-      Otherwise, the location chosen is arbitrary. Note: The `location`
+      Otherwise, the location chosen is arbitrary. Note: The ``location``
       argument is ignored for this advice.
 
     - :py:obj:`~.CU_MEM_ADVISE_SET_PREFERRED_LOCATION`: This advice sets
       the preferred location for the data to be the memory belonging to
-      `location`. When :py:obj:`~.CUmemLocation.type` is
+      ``location``. When :py:obj:`~.CUmemLocation.type` is
       :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST`, :py:obj:`~.CUmemLocation.id`
       is ignored and the preferred location is set to be host memory. To
       set the preferred location to a specific host NUMA node, applications
@@ -39949,7 +39973,7 @@ def cuMemAdvise(devPtr, size_t count, advice not None : CUmem_advise, location n
       :py:obj:`~.CU_MEM_ADVISE_SET_READ_MOSTLY` is also set on this memory
       region or any subset of it, then the policies associated with that
       advice will override the policies of this advice, unless read
-      accesses from `location` will not result in a read-only copy being
+      accesses from ``location`` will not result in a read-only copy being
       created on that procesor as outlined in description for the advice
       :py:obj:`~.CU_MEM_ADVISE_SET_READ_MOSTLY`. If the memory region
       refers to valid system-allocated pageable memory, and
@@ -39960,11 +39984,11 @@ def cuMemAdvise(devPtr, size_t count, advice not None : CUmem_advise, location n
 
     - :py:obj:`~.CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION`: Undoes the effect
       of :py:obj:`~.CU_MEM_ADVISE_SET_PREFERRED_LOCATION` and changes the
-      preferred location to none. The `location` argument is ignored for
+      preferred location to none. The ``location`` argument is ignored for
       this advice.
 
     - :py:obj:`~.CU_MEM_ADVISE_SET_ACCESSED_BY`: This advice implies that
-      the data will be accessed by processor `location`. The
+      the data will be accessed by processor ``location``. The
       :py:obj:`~.CUmemLocation.type` must be either
       :py:obj:`~.CU_MEM_LOCATION_TYPE_DEVICE` with
       :py:obj:`~.CUmemLocation.id` representing a valid device ordinal or
@@ -39995,7 +40019,7 @@ def cuMemAdvise(devPtr, size_t count, advice not None : CUmem_advise, location n
       region or any subset of it, then the policies associated with that
       advice will override the policies of this advice. Additionally, if
       the preferred location of this memory region or any subset of it is
-      also `location`, then the policies associated with
+      also ``location``, then the policies associated with
       :py:obj:`~.CU_MEM_ADVISE_SET_PREFERRED_LOCATION` will override the
       policies of this advice. If the memory region refers to valid system-
       allocated pageable memory, and :py:obj:`~.CUmemLocation.type` is
@@ -40010,8 +40034,8 @@ def cuMemAdvise(devPtr, size_t count, advice not None : CUmem_advise, location n
 
     - :py:obj:`~.CU_MEM_ADVISE_UNSET_ACCESSED_BY`: Undoes the effect of
       :py:obj:`~.CU_MEM_ADVISE_SET_ACCESSED_BY`. Any mappings to the data
-      from `location` may be removed at any time causing accesses to result
-      in non-fatal page faults. If the memory region refers to valid
+      from ``location`` may be removed at any time causing accesses to
+      result in non-fatal page faults. If the memory region refers to valid
       system-allocated pageable memory, and :py:obj:`~.CUmemLocation.type`
       is :py:obj:`~.CU_MEM_LOCATION_TYPE_DEVICE` then device in
       :py:obj:`~.CUmemLocation.id` must have a non-zero value for the
@@ -40072,27 +40096,28 @@ def cuMemPrefetchBatchAsync(dptrs : Optional[tuple[CUdeviceptr] | list[CUdevicep
     The semantics of the individual prefetch operations are as described in
     :py:obj:`~.cuMemPrefetchAsync`.
 
-    Performs memory prefetch on address ranges specified in `dptrs` and
-    `sizes`. Both arrays must be of the same length as specified by
-    `count`. Each memory range specified must refer to managed memory
+    Performs memory prefetch on address ranges specified in ``dptrs`` and
+    ``sizes``. Both arrays must be of the same length as specified by
+    ``count``. Each memory range specified must refer to managed memory
     allocated via :py:obj:`~.cuMemAllocManaged` or declared via managed
     variables or it may also refer to system-allocated memory when all
     devices have a non-zero value for
     :py:obj:`~.CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS`. The prefetch
     location for every operation in the batch is specified in the
-    `prefetchLocs` array. Each entry in this array can apply to more than
-    one operation. This can be done by specifying in the `prefetchLocIdxs`
-    array, the index of the first prefetch operation that the corresponding
-    entry in the `prefetchLocs` array applies to. Both `prefetchLocs` and
-    `prefetchLocIdxs` must be of the same length as specified by
-    `numPrefetchLocs`. For example, if a batch has 10 prefetches listed in
-    dptrs/sizes, the first 4 of which are to be prefetched to one location
-    and the remaining 6 are to be prefetched to another, then
-    `numPrefetchLocs` will be 2, `prefetchLocIdxs` will be {0, 4} and
-    `prefetchLocs` will contain the two locations. Note the first entry in
-    `prefetchLocIdxs` must always be 0. Also, each entry must be greater
-    than the previous entry and the last entry should be less than `count`.
-    Furthermore, `numPrefetchLocs` must be lesser than or equal to `count`.
+    ``prefetchLocs`` array. Each entry in this array can apply to more than
+    one operation. This can be done by specifying in the
+    ``prefetchLocIdxs`` array, the index of the first prefetch operation
+    that the corresponding entry in the ``prefetchLocs`` array applies to.
+    Both ``prefetchLocs`` and ``prefetchLocIdxs`` must be of the same
+    length as specified by ``numPrefetchLocs``. For example, if a batch has
+    10 prefetches listed in dptrs/sizes, the first 4 of which are to be
+    prefetched to one location and the remaining 6 are to be prefetched to
+    another, then ``numPrefetchLocs`` will be 2, ``prefetchLocIdxs`` will
+    be {0, 4} and ``prefetchLocs`` will contain the two locations. Note the
+    first entry in ``prefetchLocIdxs`` must always be 0. Also, each entry
+    must be greater than the previous entry and the last entry should be
+    less than ``count``. Furthermore, ``numPrefetchLocs`` must be less
+    than or equal to ``count``.
 
     Parameters
     ----------
@@ -40101,18 +40126,18 @@ def cuMemPrefetchBatchAsync(dptrs : Optional[tuple[CUdeviceptr] | list[CUdevicep
     sizes : list[int]
         Array of sizes for memory prefetch operations.
     count : size_t
-        Size of `dptrs` and `sizes` arrays.
+        Size of ``dptrs`` and ``sizes`` arrays.
     prefetchLocs : list[:py:obj:`~.CUmemLocation`]
         Array of locations to prefetch to.
     prefetchLocIdxs : list[int]
         Array of indices to specify which operands each entry in the
-        `prefetchLocs` array applies to. The locations specified in
+        ``prefetchLocs`` array applies to. The locations specified in
         prefetchLocs[k] will be applied to copies starting from
         prefetchLocIdxs[k] through prefetchLocIdxs[k+1] - 1. Also
         prefetchLocs[numPrefetchLocs - 1] will apply to prefetches starting
         from prefetchLocIdxs[numPrefetchLocs - 1] through count - 1.
     numPrefetchLocs : size_t
-        Size of `prefetchLocs` and `prefetchLocIdxs` arrays.
+        Size of ``prefetchLocs`` and ``prefetchLocIdxs`` arrays.
     flags : unsigned long long
         Flags reserved for future use. Must be zero.
     hStream : :py:obj:`~.CUstream` or :py:obj:`~.cudaStream_t`
@@ -40201,9 +40226,9 @@ def cuMemDiscardBatchAsync(dptrs : Optional[tuple[CUdeviceptr] | list[CUdevicept
     that occur simultaneously with the discard operation result in
     undefined behavior.
 
-    Performs memory discard on address ranges specified in `dptrs` and
-    `sizes`. Both arrays must be of the same length as specified by
-    `count`. Each memory range specified must refer to managed memory
+    Performs memory discard on address ranges specified in ``dptrs`` and
+    ``sizes``. Both arrays must be of the same length as specified by
+    ``count``. Each memory range specified must refer to managed memory
     allocated via :py:obj:`~.cuMemAllocManaged` or declared via managed
     variables or it may also refer to system-allocated memory when all
     devices have a non-zero value for
@@ -40216,7 +40241,7 @@ def cuMemDiscardBatchAsync(dptrs : Optional[tuple[CUdeviceptr] | list[CUdevicept
     sizes : list[int]
         Array of sizes for memory discard operations.
     count : size_t
-        Size of `dptrs` and `sizes` arrays.
+        Size of ``dptrs`` and ``sizes`` arrays.
     flags : unsigned long long
         Flags reserved for future use. Must be zero.
     hStream : :py:obj:`~.CUstream` or :py:obj:`~.cudaStream_t`
@@ -40285,27 +40310,27 @@ def cuMemDiscardAndPrefetchBatchAsync(dptrs : Optional[tuple[CUdeviceptr] | list
     undefined behavior.
 
     Performs memory discard and prefetch on address ranges specified in
-    `dptrs` and `sizes`. Both arrays must be of the same length as
-    specified by `count`. Each memory range specified must refer to managed
-    memory allocated via :py:obj:`~.cuMemAllocManaged` or declared via
-    managed variables or it may also refer to system-allocated memory when
-    all devices have a non-zero value for
+    ``dptrs`` and ``sizes``. Both arrays must be of the same length as
+    specified by ``count``. Each memory range specified must refer to
+    managed memory allocated via :py:obj:`~.cuMemAllocManaged` or declared
+    via managed variables or it may also refer to system-allocated memory
+    when all devices have a non-zero value for
     :py:obj:`~.CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS`. Every operation
     in the batch has to be associated with a valid location to prefetch the
-    address range to and specified in the `prefetchLocs` array. Each entry
-    in this array can apply to more than one operation. This can be done by
-    specifying in the `prefetchLocIdxs` array, the index of the first
-    operation that the corresponding entry in the `prefetchLocs` array
-    applies to. Both `prefetchLocs` and `prefetchLocIdxs` must be of the
-    same length as specified by `numPrefetchLocs`. For example, if a batch
-    has 10 operations listed in dptrs/sizes, the first 6 of which are to be
-    prefetched to one location and the remaining 4 are to be prefetched to
-    another, then `numPrefetchLocs` will be 2, `prefetchLocIdxs` will be
-    {0, 6} and `prefetchLocs` will contain the two set of locations. Note
-    the first entry in `prefetchLocIdxs` must always be 0. Also, each entry
-    must be greater than the previous entry and the last entry should be
-    less than `count`. Furthermore, `numPrefetchLocs` must be lesser than
-    or equal to `count`.
+    address range to and specified in the ``prefetchLocs`` array. Each
+    entry in this array can apply to more than one operation. This can be
+    done by specifying in the ``prefetchLocIdxs`` array, the index of the
+    first operation that the corresponding entry in the ``prefetchLocs``
+    array applies to. Both ``prefetchLocs`` and ``prefetchLocIdxs`` must be
+    of the same length as specified by ``numPrefetchLocs``. For example, if
+    a batch has 10 operations listed in dptrs/sizes, the first 6 of which
+    are to be prefetched to one location and the remaining 4 are to be
+    prefetched to another, then ``numPrefetchLocs`` will be 2,
+    ``prefetchLocIdxs`` will be {0, 6} and ``prefetchLocs`` will contain
+    the two sets of locations. Note the first entry in ``prefetchLocIdxs``
+    must always be 0. Also, each entry must be greater than the previous
+    entry and the last entry should be less than ``count``. Furthermore,
+    ``numPrefetchLocs`` must be less than or equal to ``count``.
 
     Parameters
     ----------
@@ -40314,18 +40339,18 @@ def cuMemDiscardAndPrefetchBatchAsync(dptrs : Optional[tuple[CUdeviceptr] | list
     sizes : list[int]
         Array of sizes for memory discard operations.
     count : size_t
-        Size of `dptrs` and `sizes` arrays.
+        Size of ``dptrs`` and ``sizes`` arrays.
     prefetchLocs : list[:py:obj:`~.CUmemLocation`]
         Array of locations to prefetch to.
     prefetchLocIdxs : list[int]
         Array of indices to specify which operands each entry in the
-        `prefetchLocs` array applies to. The locations specified in
+        ``prefetchLocs`` array applies to. The locations specified in
         prefetchLocs[k] will be applied to operations starting from
         prefetchLocIdxs[k] through prefetchLocIdxs[k+1] - 1. Also
         prefetchLocs[numPrefetchLocs - 1] will apply to copies starting
         from prefetchLocIdxs[numPrefetchLocs - 1] through count - 1.
     numPrefetchLocs : size_t
-        Size of `prefetchLocs` and `prefetchLocIdxs` arrays.
+        Size of ``prefetchLocs`` and ``prefetchLocIdxs`` arrays.
     flags : unsigned long long
         Flags reserved for future use. Must be zero.
     hStream : :py:obj:`~.CUstream` or :py:obj:`~.cudaStream_t`
@@ -40395,22 +40420,22 @@ def cuMemDiscardAndPrefetchBatchAsync(dptrs : Optional[tuple[CUdeviceptr] | list
 def cuMemRangeGetAttribute(size_t dataSize, attribute not None : CUmem_range_attribute, devPtr, size_t count):
     """ Query an attribute of a given memory range.
 
-    Query an attribute about the memory range starting at `devPtr` with a
-    size of `count` bytes. The memory range must refer to managed memory
+    Query an attribute about the memory range starting at ``devPtr`` with a
+    size of ``count`` bytes. The memory range must refer to managed memory
     allocated via :py:obj:`~.cuMemAllocManaged` or declared via managed
     variables.
 
-    The `attribute` parameter can take the following values:
+    The ``attribute`` parameter can take the following values:
 
     - :py:obj:`~.CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY`: If this attribute is
-      specified, `data` will be interpreted as a 32-bit integer, and
-      `dataSize` must be 4. The result returned will be 1 if all pages in
+      specified, ``data`` will be interpreted as a 32-bit integer, and
+      ``dataSize`` must be 4. The result returned will be 1 if all pages in
       the given memory range have read-duplication enabled, or 0 otherwise.
 
     - :py:obj:`~.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION`: If this
-      attribute is specified, `data` will be interpreted as a 32-bit
-      integer, and `dataSize` must be 4. The result returned will be a GPU
-      device id if all pages in the memory range have that GPU as their
+      attribute is specified, ``data`` will be interpreted as a 32-bit
+      integer, and ``dataSize`` must be 4. The result returned will be a
+      GPU device id if all pages in the memory range have that GPU as their
       preferred location, or it will be CU_DEVICE_CPU if all pages in the
       memory range have the CPU as their preferred location, or it will be
       CU_DEVICE_INVALID if either all the pages don't have the same
@@ -40420,24 +40445,25 @@ def cuMemRangeGetAttribute(size_t dataSize, attribute not None : CUmem_range_att
       preferred location.
 
     - :py:obj:`~.CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY`: If this attribute is
-      specified, `data` will be interpreted as an array of 32-bit integers,
-      and `dataSize` must be a non-zero multiple of 4. The result returned
-      will be a list of device ids that had
+      specified, ``data`` will be interpreted as an array of 32-bit
+      integers, and ``dataSize`` must be a non-zero multiple of 4. The
+      result returned will be a list of device ids that had
       :py:obj:`~.CU_MEM_ADVISE_SET_ACCESSED_BY` set for that entire memory
       range. If any device does not have that advice set for the entire
-      memory range, that device will not be included. If `data` is larger
+      memory range, that device will not be included. If ``data`` is larger
       than the number of devices that have that advice set for that memory
       range, CU_DEVICE_INVALID will be returned in all the extra space
-      provided. For ex., if `dataSize` is 12 (i.e. `data` has 3 elements)
-      and only device 0 has the advice set, then the result returned will
-      be { 0, CU_DEVICE_INVALID, CU_DEVICE_INVALID }. If `data` is smaller
-      than the number of devices that have that advice set, then only as
-      many devices will be returned as can fit in the array. There is no
-      guarantee on which specific devices will be returned, however.
+      provided. For example, if ``dataSize`` is 12 (i.e. ``data`` has 3
+      elements) and only device 0 has the advice set, then the result
+      returned will be { 0, CU_DEVICE_INVALID, CU_DEVICE_INVALID }. If
+      ``data`` is smaller than the number of devices that have that advice
+      set, then only as many devices will be returned as can fit in the
+      array. There is no guarantee on which specific devices will be
+      returned, however.
 
     - :py:obj:`~.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION`: If this
-      attribute is specified, `data` will be interpreted as a 32-bit
-      integer, and `dataSize` must be 4. The result returned will be the
+      attribute is specified, ``data`` will be interpreted as a 32-bit
+      integer, and ``dataSize`` must be 4. The result returned will be the
       last location to which all pages in the memory range were prefetched
       explicitly via :py:obj:`~.cuMemPrefetchAsync`. This will either be a
       GPU id or CU_DEVICE_CPU depending on whether the last location for
@@ -40450,8 +40476,8 @@ def cuMemRangeGetAttribute(size_t dataSize, attribute not None : CUmem_range_att
       even begun.
 
     - :py:obj:`~.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_TYPE`: If this
-      attribute is specified, `data` will be interpreted as a
-      :py:obj:`~.CUmemLocationType`, and `dataSize` must be
+      attribute is specified, ``data`` will be interpreted as a
+      :py:obj:`~.CUmemLocationType`, and ``dataSize`` must be
       sizeof(CUmemLocationType). The :py:obj:`~.CUmemLocationType` returned
       will be :py:obj:`~.CU_MEM_LOCATION_TYPE_DEVICE` if all pages in the
       memory range have the same GPU as their preferred location, or
@@ -40467,8 +40493,8 @@ def cuMemRangeGetAttribute(size_t dataSize, attribute not None : CUmem_range_att
       the query may be different from the preferred location type.
 
       - :py:obj:`~.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_ID`: If this
-        attribute is specified, `data` will be interpreted as a 32-bit
-        integer, and `dataSize` must be 4. If the
+        attribute is specified, ``data`` will be interpreted as a 32-bit
+        integer, and ``dataSize`` must be 4. If the
         :py:obj:`~.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_TYPE` query
         for the same address range returns
         :py:obj:`~.CU_MEM_LOCATION_TYPE_DEVICE`, it will be a valid device
@@ -40478,8 +40504,8 @@ def cuMemRangeGetAttribute(size_t dataSize, attribute not None : CUmem_range_att
         should be ignored.
 
     - :py:obj:`~.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_TYPE`: If
-      this attribute is specified, `data` will be interpreted as a
-      :py:obj:`~.CUmemLocationType`, and `dataSize` must be
+      this attribute is specified, ``data`` will be interpreted as a
+      :py:obj:`~.CUmemLocationType`, and ``dataSize`` must be
       sizeof(CUmemLocationType). The result returned will be the last
       location to which all pages in the memory range were prefetched
       explicitly via :py:obj:`~.cuMemPrefetchAsync`. The
@@ -40496,8 +40522,8 @@ def cuMemRangeGetAttribute(size_t dataSize, attribute not None : CUmem_range_att
       the prefetch operation to that location has completed or even begun.
 
       - :py:obj:`~.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_ID`: If
-        this attribute is specified, `data` will be interpreted as a 32-bit
-        integer, and `dataSize` must be 4. If the
+        this attribute is specified, ``data`` will be interpreted as a
+        32-bit integer, and ``dataSize`` must be 4. If the
         :py:obj:`~.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_TYPE`
         query for the same address range returns
         :py:obj:`~.CU_MEM_LOCATION_TYPE_DEVICE`, it will be a valid device
@@ -40553,13 +40579,13 @@ def cuMemRangeGetAttribute(size_t dataSize, attribute not None : CUmem_range_att
 def cuMemRangeGetAttributes(dataSizes : tuple[int] | list[int], attributes : Optional[tuple[CUmem_range_attribute] | list[CUmem_range_attribute]], size_t numAttributes, devPtr, size_t count):
     """ Query attributes of a given memory range.
 
-    Query attributes of the memory range starting at `devPtr` with a size
-    of `count` bytes. The memory range must refer to managed memory
+    Query attributes of the memory range starting at ``devPtr`` with a size
+    of ``count`` bytes. The memory range must refer to managed memory
     allocated via :py:obj:`~.cuMemAllocManaged` or declared via managed
-    variables. The `attributes` array will be interpreted to have
-    `numAttributes` entries. The `dataSizes` array will also be interpreted
-    to have `numAttributes` entries. The results of the query will be
-    stored in `data`.
+    variables. The ``attributes`` array will be interpreted to have
+    ``numAttributes`` entries. The ``dataSizes`` array will also be
+    interpreted to have ``numAttributes`` entries. The results of the query
+    will be stored in ``data``.
 
     The list of supported attributes are given below. Please refer to
     :py:obj:`~.cuMemRangeGetAttribute` for attribute descriptions and
@@ -40645,15 +40671,15 @@ def cuPointerSetAttribute(value, attribute not None : CUpointer_attribute, ptr):
     - :py:obj:`~.CU_POINTER_ATTRIBUTE_SYNC_MEMOPS`:
 
     - A boolean attribute that can either be set (1) or unset (0). When
-      set, the region of memory that `ptr` points to is guaranteed to
+      set, the region of memory that ``ptr`` points to is guaranteed to
       always synchronize memory operations that are synchronous. If there
       are some previously initiated synchronous memory operations that are
       pending when this attribute is set, the function does not return
       until those memory operations are complete. See further documentation
       in the section titled "API synchronization behavior" to learn more
       about cases when synchronous memory operations can exhibit
-      asynchronous behavior. `value` will be considered as a pointer to an
-      unsigned integer to which this attribute is to be set.
+      asynchronous behavior. ``value`` will be considered as a pointer to
+      an unsigned integer to which this attribute is to be set.
 
     Parameters
     ----------
@@ -40731,11 +40757,11 @@ def cuPointerGetAttributes(unsigned int numAttributes, attributes : Optional[tup
     - :py:obj:`~.CU_POINTER_ATTRIBUTE_IS_HW_DECOMPRESS_CAPABLE`
 
     Unlike :py:obj:`~.cuPointerGetAttribute`, this function will not return
-    an error when the `ptr` encountered is not a valid CUDA pointer.
+    an error when the ``ptr`` encountered is not a valid CUDA pointer.
     Instead, the attributes are assigned default NULL values and
     CUDA_SUCCESS is returned.
 
-    If `ptr` was not allocated by, mapped by, or registered with a
+    If ``ptr`` was not allocated by, mapped by, or registered with a
     :py:obj:`~.CUcontext` which uses UVA (Unified Virtual Addressing),
     :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT` is returned.
 
@@ -40790,10 +40816,10 @@ def cuPointerGetAttributes(unsigned int numAttributes, attributes : Optional[tup
 def cuStreamCreate(unsigned int Flags):
     """ Create a stream.
 
-    Creates a stream and returns a handle in `phStream`. The `Flags`
+    Creates a stream and returns a handle in ``phStream``. The ``Flags``
     argument determines behaviors of the stream.
 
-    Valid values for `Flags` are:
+    Valid values for ``Flags`` are:
 
     - :py:obj:`~.CU_STREAM_DEFAULT`: Default stream creation flag.
 
@@ -40833,12 +40859,12 @@ def cuStreamCreateWithPriority(unsigned int flags, int priority):
     """ Create a stream with the given priority.
 
     Creates a stream with the specified priority and returns a handle in
-    `phStream`. This affects the scheduling priority of work in the stream.
-    Priorities provide a hint to preferentially run work with higher
-    priority when possible, but do not preempt already-running work or
-    provide any other functional guarantee on execution order.
+    ``phStream``. This affects the scheduling priority of work in the
+    stream. Priorities provide a hint to preferentially run work with
+    higher priority when possible, but do not preempt already-running work
+    or provide any other functional guarantee on execution order.
 
-    `priority` follows a convention where lower numbers represent higher
+    ``priority`` follows a convention where lower numbers represent higher
     priorities. '0' represents default priority. The range of meaningful
     numerical priorities can be queried using
     :py:obj:`~.cuCtxGetStreamPriorityRange`. If the specified priority is
@@ -40891,8 +40917,8 @@ def cuStreamBeginCaptureToCig(hStream, streamCigCaptureParams : Optional[CUstrea
     :py:obj:`~.cuDeviceGetAttribute()` with
     :py:obj:`~.CU_DEVICE_ATTRIBUTE_D3D12_CIG_STREAMS_SUPPORTED`.
 
-    Begin CIG (CUDA in Graphics) capture on `hStream` for the graphics API
-    as provided in `streamCigCaptureParams`. When a stream is in CIG
+    Begin CIG (CUDA in Graphics) capture on ``hStream`` for the graphics
+    API as provided in ``streamCigCaptureParams``. When a stream is in CIG
     capture mode, all operations pushed into the stream will not be
     executed, but will instead be captured into a graphics API command
     list/command buffer. All kernel launches and memory copy/memory set
@@ -40900,7 +40926,7 @@ def cuStreamBeginCaptureToCig(hStream, streamCigCaptureParams : Optional[CUstrea
     executed by the graphics API, all the stream's operations will execute
     in order along with other graphics API commands in the command list.
 
-    CIG stream capture may not be initiated if `stream` is
+    CIG stream capture may not be initiated if ``hStream`` is
     CU_STREAM_LEGACY. Capture must be ended on the same stream in which it
     was initiated, and it may only be initiated if the stream is not
     already in CIG capture mode.
@@ -40910,10 +40936,10 @@ def cuStreamBeginCaptureToCig(hStream, streamCigCaptureParams : Optional[CUstrea
     returned.
 
     Data from the graphics client can be shared with CUDA via the
-    `streamSharedData` in `streamCigCaptureParams`. The format of
-    `streamSharedData` is dependent on the type of the graphics client. For
-    D3D12, `streamSharedData` is an ID3D12CommandList object pointer. The
-    command list must be in ready state for recording commands whenever
+    ``streamSharedData`` in ``streamCigCaptureParams``. The format of
+    ``streamSharedData`` is dependent on the type of the graphics client.
+    For D3D12, ``streamSharedData`` is an ID3D12CommandList object pointer.
+    The command list must be in ready state for recording commands whenever
     kernels are launched on the stream. The command list provided must
     belong to the graphics API device that the CIG context was created
     with, otherwise the behavior will be undefined.
@@ -40932,8 +40958,8 @@ def cuStreamBeginCaptureToCig(hStream, streamCigCaptureParams : Optional[CUstrea
     CUDA work with regards to other CUDA work submitted under the same CIG
     context. Out-of-order execution can lead to device hangs or exceptions.
 
-    CIG capture mode operates similarly to `cuStreamBeginCapture` with the
-    `CU_STREAM_CAPTURE_MODE_RELAXED` option. There are additional
+    CIG capture mode operates similarly to ``cuStreamBeginCapture`` with
+    the ``CU_STREAM_CAPTURE_MODE_RELAXED`` option. There are additional
     limitations to streams in CIG capture mode. The following functions are
     not allowed for CIG streams whether directly or indirectly via a
     recorded graph launch: :py:obj:`~.cuLaunchHostFunc`
@@ -40979,12 +41005,12 @@ def cuStreamBeginCaptureToCig(hStream, streamCigCaptureParams : Optional[CUstrea
 def cuStreamEndCaptureToCig(hStream):
     """ Ends CIG capture on a stream.
 
-    End CIG capture on `hStream`. Capture must have been initiated on
-    `hStream` via a call to :py:obj:`~.cuStreamBeginCaptureToCig`. Once
-    this function is called, `hStream` will exit CIG capture mode and
+    End CIG capture on ``hStream``. Capture must have been initiated on
+    ``hStream`` via a call to :py:obj:`~.cuStreamBeginCaptureToCig`. Once
+    this function is called, ``hStream`` will exit CIG capture mode and
     return to its original state, thus removing all CIG stream
     restrictions. Also, the command list/command buffer that was associated
-    with `hStream` in the previous call to
+    with ``hStream`` in the previous call to
     :py:obj:`~.cuStreamBeginCaptureToCig` is now allowed to be submitted
     for execution on the graphics API. However, the stream may not be
     destroyed until execution of the command list is fully done on the GPU.
@@ -41027,8 +41053,8 @@ def cuStreamGetPriority(hStream):
     Query the priority of a stream created using
     :py:obj:`~.cuStreamCreate`, :py:obj:`~.cuStreamCreateWithPriority` or
     :py:obj:`~.cuGreenCtxStreamCreate` and return the priority in
-    `priority`. Note that if the stream was created with a priority outside
-    the numerical range returned by
+    ``priority``. Note that if the stream was created with a priority
+    outside the numerical range returned by
     :py:obj:`~.cuCtxGetStreamPriorityRange`, this function returns the
     clamped priority. See :py:obj:`~.cuStreamCreateWithPriority` for
     details about priority clamping.
@@ -41072,7 +41098,7 @@ def cuStreamGetPriority(hStream):
 def cuStreamGetDevice(hStream):
     """ Returns the device handle of the stream.
 
-    Returns in `*device` the device handle of the stream
+    Returns in ``*device`` the device handle of the stream.
 
     Parameters
     ----------
@@ -41114,7 +41140,7 @@ def cuStreamGetFlags(hStream):
 
     Query the flags of a stream created using :py:obj:`~.cuStreamCreate`,
     :py:obj:`~.cuStreamCreateWithPriority` or
-    :py:obj:`~.cuGreenCtxStreamCreate` and return the flags in `flags`.
+    :py:obj:`~.cuGreenCtxStreamCreate` and return the flags in ``flags``.
 
     Parameters
     ----------
@@ -41127,7 +41153,7 @@ def cuStreamGetFlags(hStream):
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`
     flags : unsigned int
         Pointer to an unsigned integer in which the stream's flags are
-        returned The value returned in `flags` is a logical 'OR' of all
+        returned. The value returned in ``flags`` is a logical 'OR' of all
         flags that were used while creating this stream. See
         :py:obj:`~.cuStreamCreate` for the list of valid flags
 
@@ -41157,10 +41183,10 @@ def cuStreamGetFlags(hStream):
 def cuStreamGetId(hStream):
     """ Returns the unique Id associated with the stream handle supplied.
 
-    Returns in `streamId` the unique Id which is associated with the given
-    stream handle. The Id is unique for the life of the program.
+    Returns in ``streamId`` the unique Id which is associated with the
+    given stream handle. The Id is unique for the life of the program.
 
-    The stream handle `hStream` can refer to any of the following:
+    The stream handle ``hStream`` can refer to any of the following:
 
     - a stream created via any of the CUDA driver APIs such as
       :py:obj:`~.cuStreamCreate` and
@@ -41221,7 +41247,7 @@ def cuStreamGetCtx(hStream):
     to the one returned by :py:obj:`~.cuCtxFromGreenCtx()` on the green
     context associated with the stream at creation time.
 
-    The stream handle `hStream` can refer to any of the following:
+    The stream handle ``hStream`` can refer to any of the following:
 
     - a stream created via any of the CUDA driver APIs such as
       :py:obj:`~.cuStreamCreate` and
@@ -41282,13 +41308,13 @@ def cuStreamGetCtx_v2(hStream):
     Returns the contexts that the stream is associated with.
 
     If the stream is associated with a green context, the API returns the
-    green context in `pGreenCtx` and the primary context of the associated
-    device in `pCtx`.
+    green context in ``pGreenCtx`` and the primary context of the
+    associated device in ``pCtx``.
 
     If the stream is associated with a regular context, the API returns the
-    regular context in `pCtx` and NULL in `pGreenCtx`.
+    regular context in ``pCtx`` and NULL in ``pGreenCtx``.
 
-    The stream handle `hStream` can refer to any of the following:
+    The stream handle ``hStream`` can refer to any of the following:
 
     - a stream created via any of the CUDA driver APIs such as
       :py:obj:`~.cuStreamCreate`, :py:obj:`~.cuStreamCreateWithPriority`
@@ -41307,13 +41333,14 @@ def cuStreamGetCtx_v2(hStream):
       the calling thread. If a green context (that was converted via
       :py:obj:`~.cuCtxFromGreenCtx()` before setting it current) is current
       to the calling thread, the API will return the green context in
-      `pGreenCtx` and the primary context of the associated device in
-      `pCtx`. If a regular context is current, the API returns the regular
-      context in `pCtx` and NULL in `pGreenCtx`. Note that specifying
-      :py:obj:`~.CU_STREAM_PER_THREAD` or :py:obj:`~.cudaStreamPerThread`
-      will return :py:obj:`~.CUDA_ERROR_INVALID_HANDLE` if a green context
-      is current to the calling thread. If no context is current to the
-      calling thread, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT` is returned.
+      ``pGreenCtx`` and the primary context of the associated device in
+      ``pCtx``. If a regular context is current, the API returns the
+      regular context in ``pCtx`` and NULL in ``pGreenCtx``. Note that
+      specifying :py:obj:`~.CU_STREAM_PER_THREAD` or
+      :py:obj:`~.cudaStreamPerThread` will return
+      :py:obj:`~.CUDA_ERROR_INVALID_HANDLE` if a green context is current
+      to the calling thread. If no context is current to the calling
+      thread, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT` is returned.
 
     Parameters
     ----------
@@ -41357,11 +41384,11 @@ def cuStreamGetCtx_v2(hStream):
 def cuStreamWaitEvent(hStream, hEvent, unsigned int Flags):
     """ Make a compute stream wait on an event.
 
-    Makes all future work submitted to `hStream` wait for all work captured
-    in `hEvent`. See :py:obj:`~.cuEventRecord()` for details on what is
-    captured by an event. The synchronization will be performed efficiently
-    on the device when applicable. `hEvent` may be from a different context
-    or device than `hStream`.
+    Makes all future work submitted to ``hStream`` wait for all work
+    captured in ``hEvent``. See :py:obj:`~.cuEventRecord()` for details on
+    what is captured by an event. The synchronization will be performed
+    efficiently on the device when applicable. ``hEvent`` may be from a
+    different context or device than ``hStream``.
 
     flags include:
 
@@ -41534,17 +41561,17 @@ def cuStreamAddCallback(hStream, callback, userData, unsigned int flags):
 def cuStreamBeginCapture(hStream, mode not None : CUstreamCaptureMode):
     """ Begins graph capture on a stream.
 
-    Begin graph capture on `hStream`. When a stream is in capture mode, all
-    operations pushed into the stream will not be executed, but will
+    Begin graph capture on ``hStream``. When a stream is in capture mode,
+    all operations pushed into the stream will not be executed, but will
     instead be captured into a graph, which will be returned via
     :py:obj:`~.cuStreamEndCapture`. Capture may not be initiated if
-    `stream` is CU_STREAM_LEGACY. Capture must be ended on the same stream
-    in which it was initiated, and it may only be initiated if the stream
-    is not already in capture mode. The capture mode may be queried via
-    :py:obj:`~.cuStreamIsCapturing`. A unique id representing the capture
-    sequence may be queried via :py:obj:`~.cuStreamGetCaptureInfo`.
+    ``hStream`` is CU_STREAM_LEGACY. Capture must be ended on the same
+    stream in which it was initiated, and it may only be initiated if the
+    stream is not already in capture mode. The capture mode may be queried
+    via :py:obj:`~.cuStreamIsCapturing`. A unique id representing the
+    capture sequence may be queried via :py:obj:`~.cuStreamGetCaptureInfo`.
 
-    If `mode` is not :py:obj:`~.CU_STREAM_CAPTURE_MODE_RELAXED`,
+    If ``mode`` is not :py:obj:`~.CU_STREAM_CAPTURE_MODE_RELAXED`,
     :py:obj:`~.cuStreamEndCapture` must be called on this stream from the
     same thread.
 
@@ -41590,7 +41617,7 @@ def cuStreamBeginCapture(hStream, mode not None : CUstreamCaptureMode):
 def cuStreamBeginRecaptureToGraph(hStream, mode not None : CUstreamCaptureMode, hGraph, callbackFunc, userData):
     """ Begin graph capture on a stream to an existing graph.
 
-    Begin graph capture on `hStream` to the existing `hGraph`. The node
+    Begin graph capture on ``hStream`` to the existing ``hGraph``. The node
     creation order while recapturing the graph must be identical to the
     original graph. The recapture will fail immediately for:
 
@@ -41600,11 +41627,11 @@ def cuStreamBeginRecaptureToGraph(hStream, mode not None : CUstreamCaptureMode,
     - Parameter mismatches for memory allocation or free nodes
 
     Any other node parameter mismatches during recapture can be configured
-    to call the function provided in `callbackFunc`. The recapture will
+    to call the function provided in ``callbackFunc``. The recapture will
     fail immediately if the callback returns anything other than
     CUDA_SUCCESS.
 
-    If the recapture fails for any reason, the `graph` will be in an
+    If the recapture fails for any reason, ``hGraph`` will be in an
     undefined state and should be destroyed.
 
     See cuStreamBeginCapture for additional detail on beginning the
@@ -41638,7 +41665,7 @@ def cuStreamBeginRecaptureToGraph(hStream, mode not None : CUstreamCaptureMode,
 
     Notes
     -----
-    Any user objects associated with `graph` will be released prior to the recapture.
+    Any user objects associated with ``hGraph`` will be released prior to the recapture.
     """
     cdef cydriver.CUgraphRecaptureCallback cycallbackFunc
     if callbackFunc is None:
@@ -41679,20 +41706,20 @@ def cuStreamBeginRecaptureToGraph(hStream, mode not None : CUstreamCaptureMode,
 def cuStreamBeginCaptureToGraph(hStream, hGraph, dependencies : Optional[tuple[CUgraphNode] | list[CUgraphNode]], dependencyData : Optional[tuple[CUgraphEdgeData] | list[CUgraphEdgeData]], size_t numDependencies, mode not None : CUstreamCaptureMode):
     """ Begins graph capture on a stream to an existing graph.
 
-    Begin graph capture on `hStream`, placing new nodes into an existing
+    Begin graph capture on ``hStream``, placing new nodes into an existing
     graph. When a stream is in capture mode, all operations pushed into the
     stream will not be executed, but will instead be captured into
-    `hGraph`. The graph will not be instantiable until the user calls
+    ``hGraph``. The graph will not be instantiable until the user calls
     :py:obj:`~.cuStreamEndCapture`.
 
-    Capture may not be initiated if `stream` is CU_STREAM_LEGACY. Capture
+    Capture may not be initiated if ``hStream`` is CU_STREAM_LEGACY. Capture
     must be ended on the same stream in which it was initiated, and it may
     only be initiated if the stream is not already in capture mode. The
     capture mode may be queried via :py:obj:`~.cuStreamIsCapturing`. A
     unique id representing the capture sequence may be queried via
     :py:obj:`~.cuStreamGetCaptureInfo`.
 
-    If `mode` is not :py:obj:`~.CU_STREAM_CAPTURE_MODE_RELAXED`,
+    If ``mode`` is not :py:obj:`~.CU_STREAM_CAPTURE_MODE_RELAXED`,
     :py:obj:`~.cuStreamEndCapture` must be called on this stream from the
     same thread.
 
@@ -41786,8 +41813,8 @@ def cuThreadExchangeStreamCaptureMode(mode not None : CUstreamCaptureMode):
     """ Swaps the stream capture interaction mode for a thread.
 
     Sets the calling thread's stream capture interaction mode to the value
-    contained in `*mode`, and overwrites `*mode` with the previous mode for
-    the thread. To facilitate deterministic behavior across function or
+    contained in ``*mode``, and overwrites ``*mode`` with the previous mode
+    for the thread. To facilitate deterministic behavior across function or
     module boundaries, callers are encouraged to use this API in a push-pop
     fashion:
 
@@ -41810,24 +41837,24 @@ def cuThreadExchangeStreamCaptureMode(mode not None : CUstreamCaptureMode):
 
     A thread's mode is one of the following:
 
-    - `CU_STREAM_CAPTURE_MODE_GLOBAL:` This is the default mode. If the
+    - ``CU_STREAM_CAPTURE_MODE_GLOBAL``: This is the default mode. If the
       local thread has an ongoing capture sequence that was not initiated
-      with `CU_STREAM_CAPTURE_MODE_RELAXED` at `cuStreamBeginCapture`, or
-      if any other thread has a concurrent capture sequence initiated with
-      `CU_STREAM_CAPTURE_MODE_GLOBAL`, this thread is prohibited from
-      potentially unsafe API calls.
+      with ``CU_STREAM_CAPTURE_MODE_RELAXED`` at ``cuStreamBeginCapture``,
+      or if any other thread has a concurrent capture sequence initiated
+      with ``CU_STREAM_CAPTURE_MODE_GLOBAL``, this thread is prohibited
+      from potentially unsafe API calls.
 
-    - `CU_STREAM_CAPTURE_MODE_THREAD_LOCAL:` If the local thread has an
+    - ``CU_STREAM_CAPTURE_MODE_THREAD_LOCAL``: If the local thread has an
       ongoing capture sequence not initiated with
-      `CU_STREAM_CAPTURE_MODE_RELAXED`, it is prohibited from potentially
+      ``CU_STREAM_CAPTURE_MODE_RELAXED``, it is prohibited from potentially
       unsafe API calls. Concurrent capture sequences in other threads are
       ignored.
 
+    - ``CU_STREAM_CAPTURE_MODE_RELAXED``: The local thread is not
-      from potentially unsafe API calls. Note that the thread is still
-      prohibited from API calls which necessarily conflict with stream
-      capture, for example, attempting :py:obj:`~.cuEventQuery` on an event
-      that was last recorded inside a capture sequence.
+    - ``CU_STREAM_CAPTURE_MODE_RELAXED:`` The local thread is not
+      prohibited from potentially unsafe API calls. Note that the thread is
+      still prohibited from API calls which necessarily conflict with
+      stream capture, for example, attempting :py:obj:`~.cuEventQuery` on
+      an event that was last recorded inside a capture sequence.
 
     Parameters
     ----------
@@ -41859,13 +41886,13 @@ def cuThreadExchangeStreamCaptureMode(mode not None : CUstreamCaptureMode):
 def cuStreamEndCapture(hStream):
     """ Ends capture on a stream, returning the captured graph.
 
-    End capture on `hStream`, returning the captured graph via `phGraph`.
-    Capture must have been initiated on `hStream` via a call to
-    :py:obj:`~.cuStreamBeginCapture`. If capture was invalidated, due to a
-    violation of the rules of stream capture, then a NULL graph will be
+    End capture on ``hStream``, returning the captured graph via
+    ``phGraph``. Capture must have been initiated on ``hStream`` via a call
+    to :py:obj:`~.cuStreamBeginCapture`. If capture was invalidated, due to
+    a violation of the rules of stream capture, then a NULL graph will be
     returned.
 
-    If the `mode` argument to :py:obj:`~.cuStreamBeginCapture` was not
+    If the ``mode`` argument to :py:obj:`~.cuStreamBeginCapture` was not
     :py:obj:`~.CU_STREAM_CAPTURE_MODE_RELAXED`, this call must be from the
     same thread as :py:obj:`~.cuStreamBeginCapture`.
 
@@ -41907,8 +41934,8 @@ def cuStreamEndCapture(hStream):
 def cuStreamIsCapturing(hStream):
     """ Returns a stream's capture status.
 
-    Return the capture status of `hStream` via `captureStatus`. After a
-    successful call, `*captureStatus` will contain one of the following:
+    Return the capture status of ``hStream`` via ``captureStatus``. After a
+    successful call, ``*captureStatus`` will contain one of the following:
 
     - :py:obj:`~.CU_STREAM_CAPTURE_STATUS_NONE`: The stream is not
       capturing.
@@ -41919,12 +41946,12 @@ def cuStreamIsCapturing(hStream):
       capturing but an error has invalidated the capture sequence. The
       capture sequence must be terminated with
       :py:obj:`~.cuStreamEndCapture` on the stream where it was initiated
-      in order to continue using `hStream`.
+      in order to continue using ``hStream``.
 
     Note that, if this is called on :py:obj:`~.CU_STREAM_LEGACY` (the "null
     stream") while a blocking stream in the same context is capturing, it
     will return :py:obj:`~.CUDA_ERROR_STREAM_CAPTURE_IMPLICIT` and
-    `*captureStatus` is unspecified after the call. The blocking stream
+    ``*captureStatus`` is unspecified after the call. The blocking stream
     capture is not invalidated.
 
     When a blocking stream is capturing, the legacy stream is in an
@@ -41984,9 +42011,9 @@ def cuStreamGetCaptureInfo(hStream):
     - the returned capture status is
       :py:obj:`~.CU_STREAM_CAPTURE_STATUS_ACTIVE`
 
-    If `edgeData_out` is non-NULL then `dependencies_out` must be as well.
-    If `dependencies_out` is non-NULL and `edgeData_out` is NULL, but there
-    is non-zero edge data for one or more of the current stream
+    If ``edgeData_out`` is non-NULL then ``dependencies_out`` must be as
+    well. If ``dependencies_out`` is non-NULL and ``edgeData_out`` is NULL,
+    but there is non-zero edge data for one or more of the current stream
     dependencies, the call will return :py:obj:`~.CUDA_ERROR_LOSSY_QUERY`.
 
     Parameters
@@ -42024,11 +42051,11 @@ def cuStreamGetCaptureInfo(hStream):
         operate on the graph (not the stream) without copying.
     edgeData_out : list[:py:obj:`~.CUgraphEdgeData`]
         Optional location to store a pointer to an array of graph edge
-        data. This array parallels `dependencies_out`; the next node to be
-        added has an edge to `dependencies_out`[i] with annotation
-        `edgeData_out`[i] for each `i`. The array pointer is valid until
-        the next API call which operates on the stream or until the capture
-        is terminated.
+        data. This array parallels ``dependencies_out``; the next node to
+        be added has an edge to ``dependencies_out[i]`` with annotation
+        ``edgeData_out[i]`` for each ``i``. The array pointer is valid
+        until the next API call which operates on the stream or until the
+        capture is terminated.
     numDependencies_out : int
         Optional location to store the size of the array returned in
         dependencies_out.
@@ -42161,13 +42188,13 @@ def cuStreamUpdateCaptureDependencies(hStream, dependencies : Optional[tuple[CUg
 def cuStreamAttachMemAsync(hStream, dptr, size_t length, unsigned int flags):
     """ Attach memory to a stream asynchronously.
 
-    Enqueues an operation in `hStream` to specify stream association of
-    `length` bytes of memory starting from `dptr`. This function is a
+    Enqueues an operation in ``hStream`` to specify stream association of
+    ``length`` bytes of memory starting from ``dptr``. This function is a
     stream-ordered operation, meaning that it is dependent on, and will
     only take effect when, previous work in stream has completed. Any
     previous association is automatically replaced.
 
-    `dptr` must point to one of the following types of memories:
+    ``dptr`` must point to one of the following types of memories:
 
     - managed memory declared using the managed keyword or allocated with
       :py:obj:`~.cuMemAllocManaged`.
@@ -42177,31 +42204,32 @@ def cuStreamAttachMemAsync(hStream, dptr, size_t length, unsigned int flags):
       with the stream reports a non-zero value for the device attribute
       :py:obj:`~.CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS`.
 
-    For managed allocations, `length` must be either zero or the entire
+    For managed allocations, ``length`` must be either zero or the entire
     allocation's size. Both indicate that the entire allocation's stream
     association is being changed. Currently, it is not possible to change
     stream association for a portion of a managed allocation.
 
-    For pageable host allocations, `length` must be non-zero.
-
-    The stream association is specified using `flags` which must be one of
-    :py:obj:`~.CUmemAttach_flags`. If the :py:obj:`~.CU_MEM_ATTACH_GLOBAL`
-    flag is specified, the memory can be accessed by any stream on any
-    device. If the :py:obj:`~.CU_MEM_ATTACH_HOST` flag is specified, the
-    program makes a guarantee that it won't access the memory on the device
-    from any stream on a device that has a zero value for the device
-    attribute :py:obj:`~.CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS`. If
-    the :py:obj:`~.CU_MEM_ATTACH_SINGLE` flag is specified and `hStream` is
+    For pageable host allocations, ``length`` must be non-zero.
+
+    The stream association is specified using ``flags`` which must be one
+    of :py:obj:`~.CUmemAttach_flags`. If the
+    :py:obj:`~.CU_MEM_ATTACH_GLOBAL` flag is specified, the memory can be
+    accessed by any stream on any device. If the
+    :py:obj:`~.CU_MEM_ATTACH_HOST` flag is specified, the program makes a
+    guarantee that it won't access the memory on the device from any stream
+    on a device that has a zero value for the device attribute
+    :py:obj:`~.CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS`. If the
+    :py:obj:`~.CU_MEM_ATTACH_SINGLE` flag is specified and ``hStream`` is
     associated with a device that has a zero value for the device attribute
     :py:obj:`~.CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS`, the program
     makes a guarantee that it will only access the memory on the device
-    from `hStream`. It is illegal to attach singly to the NULL stream,
+    from ``hStream``. It is illegal to attach singly to the NULL stream,
     because the NULL stream is a virtual global stream and not a specific
     stream. An error will be returned in this case.
 
     When memory is associated with a single stream, the Unified Memory
     system will allow CPU access to this memory region so long as all
-    operations in `hStream` have completed, regardless of whether other
+    operations in ``hStream`` have completed, regardless of whether other
     streams are active. In effect, this constrains exclusive ownership of
     the managed memory region by an active GPU to per-stream activity
     instead of whole-GPU activity.
@@ -42217,7 +42245,7 @@ def cuStreamAttachMemAsync(hStream, dptr, size_t length, unsigned int flags):
     and coherency will be changed appropriately for all kernels which
     follow a stream-association change.
 
-    If `hStream` is destroyed while data is associated with it, the
+    If ``hStream`` is destroyed while data is associated with it, the
     association is removed and the association reverts to the default
     visibility of the allocation as specified at
     :py:obj:`~.cuMemAllocManaged`. For managed variables, the default
@@ -42275,7 +42303,7 @@ def cuStreamQuery(hStream):
     """ Determine status of a compute stream.
 
     Returns :py:obj:`~.CUDA_SUCCESS` if all operations in the stream
-    specified by `hStream` have completed, or
+    specified by ``hStream`` have completed, or
     :py:obj:`~.CUDA_ERROR_NOT_READY` if not.
 
     For the purposes of Unified Memory, a return value of
@@ -42316,11 +42344,11 @@ def cuStreamSynchronize(hStream):
     """ Wait until a stream's tasks are completed.
 
     Waits until the device has completed all operations in the stream
-    specified by `hStream`. If the context was created with the
+    specified by ``hStream``. If the context was created with the
     :py:obj:`~.CU_CTX_SCHED_BLOCKING_SYNC` flag, the CPU thread will block
     until the stream is finished with all of its tasks.
 
-    \note_null_stream
+    Note: This function uses standard default stream semantics.
 
     Parameters
     ----------
@@ -42355,13 +42383,13 @@ def cuStreamSynchronize(hStream):
 def cuStreamDestroy(hStream):
     """ Destroys a stream.
 
-    Destroys the stream specified by `hStream`.
+    Destroys the stream specified by ``hStream``.
 
-    In case the device is still doing work in the stream `hStream` when
+    In case the device is still doing work in the stream ``hStream`` when
     :py:obj:`~.cuStreamDestroy()` is called, the function will return
-    immediately and the resources associated with `hStream` will be
+    immediately and the resources associated with ``hStream`` will be
     released automatically once the device has completed all work in
-    `hStream`.
+    ``hStream``.
 
     Parameters
     ----------
@@ -42396,8 +42424,8 @@ def cuStreamDestroy(hStream):
 def cuStreamCopyAttributes(dst, src):
     """ Copies attributes from source stream to destination stream.
 
-    Copies attributes from source stream `src` to destination stream `dst`.
-    Both streams must have the same context.
+    Copies attributes from source stream ``src`` to destination stream
+    ``dst``. Both streams must have the same context.
 
     Parameters
     ----------
@@ -42442,8 +42470,8 @@ def cuStreamCopyAttributes(dst, src):
 def cuStreamGetAttribute(hStream, attr not None : CUstreamAttrID):
     """ Queries stream attribute.
 
-    Queries attribute `attr` from `hStream` and stores it in corresponding
-    member of `value_out`.
+    Queries attribute ``attr`` from ``hStream`` and stores it in
+    corresponding member of ``value_out``.
 
     Parameters
     ----------
@@ -42486,8 +42514,8 @@ def cuStreamGetAttribute(hStream, attr not None : CUstreamAttrID):
 def cuStreamSetAttribute(hStream, attr not None : CUstreamAttrID, value : Optional[CUstreamAttrValue]):
     """ Sets stream attribute.
 
-    Sets attribute `attr` on `hStream` from corresponding attribute of
-    `value`. The updated attribute will be applied to subsequent work
+    Sets attribute ``attr`` on ``hStream`` from corresponding attribute of
+    ``value``. The updated attribute will be applied to subsequent work
     submitted to the stream. It will not affect previously submitted work.
 
     Parameters
@@ -42529,8 +42557,8 @@ def cuStreamSetAttribute(hStream, attr not None : CUstreamAttrID, value : Option
 def cuEventCreate(unsigned int Flags):
     """ Creates an event.
 
-    Creates an event *phEvent for the current context with the flags
-    specified via `Flags`. Valid flags include:
+    Creates an event ``*phEvent`` for the current context with the flags
+    specified via ``Flags``. Valid flags include:
 
     - :py:obj:`~.CU_EVENT_DEFAULT`: Default event creation flag.
 
@@ -42580,13 +42608,13 @@ def cuEventCreate(unsigned int Flags):
 def cuEventRecord(hEvent, hStream):
     """ Records an event.
 
-    Captures in `hEvent` the contents of `hStream` at the time of this
-    call. `hEvent` and `hStream` must be from the same context otherwise
-    :py:obj:`~.CUDA_ERROR_INVALID_HANDLE` is returned. Calls such as
-    :py:obj:`~.cuEventQuery()` or :py:obj:`~.cuStreamWaitEvent()` will then
-    examine or wait for completion of the work that was captured. Uses of
-    `hStream` after this call do not modify `hEvent`. See note on default
-    stream behavior for what is captured in the default case.
+    Captures in ``hEvent`` the contents of ``hStream`` at the time of this
+    call. ``hEvent`` and ``hStream`` must be from the same context
+    otherwise :py:obj:`~.CUDA_ERROR_INVALID_HANDLE` is returned. Calls such
+    as :py:obj:`~.cuEventQuery()` or :py:obj:`~.cuStreamWaitEvent()` will
+    then examine or wait for completion of the work that was captured. Uses
+    of ``hStream`` after this call do not modify ``hEvent``. See note on
+    default stream behavior for what is captured in the default case.
 
     :py:obj:`~.cuEventRecord()` can be called multiple times on the same
     event and will overwrite the previously captured state. Other APIs such
@@ -42640,13 +42668,13 @@ def cuEventRecord(hEvent, hStream):
 def cuEventRecordWithFlags(hEvent, hStream, unsigned int flags):
     """ Records an event.
 
-    Captures in `hEvent` the contents of `hStream` at the time of this
-    call. `hEvent` and `hStream` must be from the same context otherwise
-    :py:obj:`~.CUDA_ERROR_INVALID_HANDLE` is returned. Calls such as
-    :py:obj:`~.cuEventQuery()` or :py:obj:`~.cuStreamWaitEvent()` will then
-    examine or wait for completion of the work that was captured. Uses of
-    `hStream` after this call do not modify `hEvent`. See note on default
-    stream behavior for what is captured in the default case.
+    Captures in ``hEvent`` the contents of ``hStream`` at the time of this
+    call. ``hEvent`` and ``hStream`` must be from the same context
+    otherwise :py:obj:`~.CUDA_ERROR_INVALID_HANDLE` is returned. Calls such
+    as :py:obj:`~.cuEventQuery()` or :py:obj:`~.cuStreamWaitEvent()` will
+    then examine or wait for completion of the work that was captured. Uses
+    of ``hStream`` after this call do not modify ``hEvent``. See note on
+    default stream behavior for what is captured in the default case.
 
     :py:obj:`~.cuEventRecordWithFlags()` can be called multiple times on
     the same event and will overwrite the previously captured state. Other
@@ -42710,7 +42738,7 @@ def cuEventRecordWithFlags(hEvent, hStream, unsigned int flags):
 def cuEventQuery(hEvent):
     """ Queries an event's status.
 
-    Queries the status of all work currently captured by `hEvent`. See
+    Queries the status of all work currently captured by ``hEvent``. See
     :py:obj:`~.cuEventRecord()` for details on what is captured by an
     event.
 
@@ -42755,9 +42783,9 @@ def cuEventQuery(hEvent):
 def cuEventSynchronize(hEvent):
     """ Waits for an event to complete.
 
-    Waits until the completion of all work currently captured in `hEvent`.
-    See :py:obj:`~.cuEventRecord()` for details on what is captured by an
-    event.
+    Waits until the completion of all work currently captured in
+    ``hEvent``. See :py:obj:`~.cuEventRecord()` for details on what is
+    captured by an event.
 
     Waiting for an event that was created with the
     :py:obj:`~.CU_EVENT_BLOCKING_SYNC` flag will cause the calling CPU
@@ -42799,7 +42827,7 @@ def cuEventSynchronize(hEvent):
 def cuEventDestroy(hEvent):
     """ Destroys an event.
 
-    Destroys the event specified by `hEvent`.
+    Destroys the event specified by ``hEvent``.
 
     An event may be destroyed before it is complete (i.e., while
     :py:obj:`~.cuEventQuery()` would return
@@ -42877,7 +42905,7 @@ def cuEventElapsedTime(hStart, hEnd):
     CUresult
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_NOT_READY`, :py:obj:`~.CUDA_ERROR_UNKNOWN`
     pMilliseconds : float
-        Time between `hStart` and `hEnd` in ms
+        Time between ``hStart`` and ``hEnd`` in ms
 
     See Also
     --------
@@ -42914,10 +42942,10 @@ def cuImportExternalMemory(memHandleDesc : Optional[CUDA_EXTERNAL_MEMORY_HANDLE_
     """ Imports an external memory object.
 
     Imports an externally allocated memory object and returns a handle to
-    that in `extMem_out`.
+    that in ``extMem_out``.
 
     The properties of the handle being imported must be described in
-    `memHandleDesc`. The :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC`
+    ``memHandleDesc``. The :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC`
     structure is defined as follows:
 
     **View CUDA Toolkit Documentation for a C++ code example**
@@ -43075,10 +43103,10 @@ def cuExternalMemoryGetMappedBuffer(extMem, bufferDesc : Optional[CUDA_EXTERNAL_
     """ Maps a buffer onto an imported memory object.
 
     Maps a buffer onto an imported memory object and returns a device
-    pointer in `devPtr`.
+    pointer in ``devPtr``.
 
     The properties of the buffer being mapped must be described in
-    `bufferDesc`. The :py:obj:`~.CUDA_EXTERNAL_MEMORY_BUFFER_DESC`
+    ``bufferDesc``. The :py:obj:`~.CUDA_EXTERNAL_MEMORY_BUFFER_DESC`
     structure is defined as follows:
 
     **View CUDA Toolkit Documentation for a C++ code example**
@@ -43100,7 +43128,7 @@ def cuExternalMemoryGetMappedBuffer(extMem, bufferDesc : Optional[CUDA_EXTERNAL_
     separate buffers and then apply the appropriate offsets to the returned
     pointer to derive the individual buffers.
 
-    The returned pointer `devPtr` must be freed using
+    The returned pointer ``devPtr`` must be freed using
     :py:obj:`~.cuMemFree`.
 
     Parameters
@@ -43145,10 +43173,10 @@ def cuExternalMemoryGetMappedMipmappedArray(extMem, mipmapDesc : Optional[CUDA_E
     """ Maps a CUDA mipmapped array onto an external memory object.
 
     Maps a CUDA mipmapped array onto an external object and returns a
-    handle to it in `mipmap`.
+    handle to it in ``mipmap``.
 
     The properties of the CUDA mipmapped array being mapped must be
-    described in `mipmapDesc`. The structure
+    described in ``mipmapDesc``. The structure
     :py:obj:`~.CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC` is defined as
     follows:
 
@@ -43168,12 +43196,12 @@ def cuExternalMemoryGetMappedMipmappedArray(extMem, mipmapDesc : Optional[CUDA_E
     :py:obj:`~.CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC.numLevels`
     specifies the total number of levels in the mipmap chain.
 
-    If `extMem` was imported from a handle of type
+    If ``extMem`` was imported from a handle of type
     :py:obj:`~.CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF`, then
     :py:obj:`~.CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC.numLevels` must be
     equal to 1.
 
-    Mapping `extMem` imported from a handle of type
+    Mapping ``extMem`` imported from a handle of type
     :py:obj:`~.CU_EXTERNAL_MEMORY_HANDLE_TYPE_DMABUF_FD`, is not supported.
 
     The returned CUDA mipmapped array must be freed using
@@ -43259,11 +43287,11 @@ def cuImportExternalSemaphore(semHandleDesc : Optional[CUDA_EXTERNAL_SEMAPHORE_H
     """ Imports an external semaphore.
 
     Imports an externally allocated synchronization object and returns a
-    handle to that in `extSem_out`.
+    handle to that in ``extSem_out``.
 
     The properties of the handle being imported must be described in
-    `semHandleDesc`. The :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC` is
-    defined as follows:
+    ``semHandleDesc``. The :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC`
+    is defined as follows:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
@@ -43431,8 +43459,8 @@ def cuSignalExternalSemaphoresAsync(extSemArray : Optional[tuple[CUexternalSemap
     :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC` this API sets
     :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS.params.nvSciSync.fence`
     to a value that can be used by subsequent waiters of the same NvSciSync
-    object to order operations with those currently submitted in `stream`.
-    Such an update will overwrite previous contents of
+    object to order operations with those currently submitted in
+    ``stream``. Such an update will overwrite previous contents of
     :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS.params.nvSciSync.fence`.
     By default, signaling such an external semaphore object causes
     appropriate memory synchronization operations to be performed over all
@@ -43729,8 +43757,8 @@ def cuStreamWaitValue32(stream, addr, value, unsigned int flags):
     Enqueues a synchronization of the stream on the given memory location.
     Work ordered after the operation will block until the given condition
     on the memory is satisfied. By default, the condition is to wait for
-    (int32_t)(*addr - value) >= 0, a cyclic greater-or-equal. Other
-    condition types can be specified via `flags`.
+    ``(int32_t)(*addr - value) >= 0``, a cyclic greater-or-equal. Other
+    condition types can be specified via ``flags``.
 
     If the memory was registered via :py:obj:`~.cuMemHostRegister()`, the
     device pointer should be obtained with
@@ -43803,8 +43831,8 @@ def cuStreamWaitValue64(stream, addr, value, unsigned int flags):
     Enqueues a synchronization of the stream on the given memory location.
     Work ordered after the operation will block until the given condition
     on the memory is satisfied. By default, the condition is to wait for
-    (int64_t)(*addr - value) >= 0, a cyclic greater-or-equal. Other
-    condition types can be specified via `flags`.
+    ``(int64_t)(*addr - value) >= 0``, a cyclic greater-or-equal. Other
+    condition types can be specified via ``flags``.
 
     If the memory was registered via :py:obj:`~.cuMemHostRegister()`, the
     device pointer should be obtained with
@@ -44071,8 +44099,8 @@ def cuStreamBatchMemOp(stream, unsigned int count, paramArray : Optional[tuple[C
 def cuFuncGetAttribute(attrib not None : CUfunction_attribute, hfunc):
     """ Returns information about a function.
 
-    Returns in `*pi` the integer value of the attribute `attrib` on the
-    kernel given by `hfunc`. The supported attributes are:
+    Returns in ``*pi`` the integer value of the attribute ``attrib`` on the
+    kernel given by ``hfunc``. The supported attributes are:
 
     - :py:obj:`~.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK`: The maximum
       number of threads per block, beyond which a launch of the function
@@ -44207,8 +44235,8 @@ def cuFuncGetAttribute(attrib not None : CUfunction_attribute, hfunc):
 def cuFuncSetAttribute(hfunc, attrib not None : CUfunction_attribute, int value):
     """ Sets information about a function.
 
-    This call sets the value of a specified attribute `attrib` on the
-    kernel given by `hfunc` to an integer value specified by `val` This
+    This call sets the value of a specified attribute ``attrib`` on the
+    kernel given by ``hfunc`` to an integer value specified by ``val``. This
     function returns CUDA_SUCCESS if the new value of the attribute could
     be successfully set. If the set fails, this call will return an error.
     Not all attributes can have values set. Attempting to set a value on a
@@ -44302,14 +44330,15 @@ def cuFuncSetCacheConfig(hfunc, config not None : CUfunc_cache):
     """ Sets the preferred cache configuration for a device function.
 
     On devices where the L1 cache and shared memory use the same hardware
-    resources, this sets through `config` the preferred cache configuration
-    for the device function `hfunc`. This is only a preference. The driver
-    will use the requested configuration if possible, but it is free to
-    choose a different configuration if required to execute `hfunc`. Any
-    context-wide preference set via :py:obj:`~.cuCtxSetCacheConfig()` will
-    be overridden by this per-function setting unless the per-function
-    setting is :py:obj:`~.CU_FUNC_CACHE_PREFER_NONE`. In that case, the
-    current context-wide setting will be used.
+    resources, this sets through ``config`` the preferred cache
+    configuration for the device function ``hfunc``. This is only a
+    preference. The driver will use the requested configuration if
+    possible, but it is free to choose a different configuration if
+    required to execute ``hfunc``. Any context-wide preference set via
+    :py:obj:`~.cuCtxSetCacheConfig()` will be overridden by this
+    per-function setting unless the per-function setting is
+    :py:obj:`~.CU_FUNC_CACHE_PREFER_NONE`. In that case, the current
+    context-wide setting will be used.
 
     This setting does nothing on devices where the size of the L1 cache and
     shared memory are fixed.
@@ -44367,9 +44396,9 @@ def cuFuncSetCacheConfig(hfunc, config not None : CUfunc_cache):
 def cuFuncGetModule(hfunc):
     """ Returns a module handle.
 
-    Returns in `*hmod` the handle of the module that function `hfunc` is
-    located in. The lifetime of the module corresponds to the lifetime of
-    the context it was loaded in or until the module is explicitly
+    Returns in ``*hmod`` the handle of the module that function ``hfunc``
+    is located in. The lifetime of the module corresponds to the lifetime
+    of the context it was loaded in or until the module is explicitly
     unloaded.
 
     The CUDA runtime manages its own modules loaded into the primary
@@ -44411,13 +44440,13 @@ def cuFuncGetModule(hfunc):
 def cuFuncGetName(hfunc):
     """ Returns the function name for a :py:obj:`~.CUfunction` handle.
 
-    Returns in `**name` the function name associated with the function
-    handle `hfunc` . The function name is returned as a null-terminated
+    Returns in ``**name`` the function name associated with the function
+    handle ``hfunc``. The function name is returned as a null-terminated
     string. The returned name is only valid when the function handle is
     valid. If the module is unloaded or reloaded, one must call the API
     again to get the updated name. This API may return a mangled name if
-    the function is not declared as having C linkage. If either `**name` or
-    `hfunc` is NULL, :py:obj:`~.CUDA_ERROR_INVALID_VALUE` is returned.
+    the function is not declared as having C linkage. If either ``**name``
+    or ``hfunc`` is NULL, :py:obj:`~.CUDA_ERROR_INVALID_VALUE` is returned.
 
     Parameters
     ----------
@@ -44453,15 +44482,15 @@ def cuFuncGetName(hfunc):
 def cuFuncGetParamInfo(func, size_t paramIndex):
     """ Returns the offset and size of a kernel parameter in the device-side parameter layout.
 
-    Queries the kernel parameter at `paramIndex` into `func's` list of
-    parameters, and returns in `paramOffset` and `paramSize` the offset and
-    size, respectively, where the parameter will reside in the device-side
-    parameter layout. This information can be used to update kernel node
-    parameters from the device via
+    Queries the kernel parameter at ``paramIndex`` into ``func``'s list of
+    parameters, and returns in ``paramOffset`` and ``paramSize`` the offset
+    and size, respectively, where the parameter will reside in the
+    device-side parameter layout. This information can be used to update
+    kernel node parameters from the device via
     :py:obj:`~.cudaGraphKernelNodeSetParam()` and
-    :py:obj:`~.cudaGraphKernelNodeUpdatesApply()`. `paramIndex` must be
-    less than the number of parameters that `func` takes. `paramSize` can
-    be set to NULL if only the parameter offset is desired.
+    :py:obj:`~.cudaGraphKernelNodeUpdatesApply()`. ``paramIndex`` must be
+    less than the number of parameters that ``func`` takes. ``paramSize``
+    can be set to NULL if only the parameter offset is desired.
 
     Parameters
     ----------
@@ -44508,8 +44537,8 @@ def cuFuncGetParamInfo(func, size_t paramIndex):
 def cuFuncGetParamCount(func):
     """ Returns the number of parameters used by the function.
 
-    Queries the number of kernel parameters used by `func` and returns it
-    in `paramCount`.
+    Queries the number of kernel parameters used by ``func`` and returns it
+    in ``paramCount``.
 
     Parameters
     ----------
@@ -44549,7 +44578,7 @@ def cuFuncGetParamCount(func):
 def cuFuncIsLoaded(function):
     """ Returns if the function is loaded.
 
-    Returns in `state` the loading state of `function`.
+    Returns in ``state`` the loading state of ``function``.
 
     Parameters
     ----------
@@ -44589,7 +44618,7 @@ def cuFuncIsLoaded(function):
 def cuFuncLoad(function):
     """ Loads a function.
 
-    Finalizes function loading for `function`. Calling this API with a
+    Finalizes function loading for ``function``. Calling this API with a
     fully loaded function has no effect.
 
     Parameters
@@ -44626,53 +44655,53 @@ def cuLaunchKernel(f, unsigned int gridDimX, unsigned int gridDimY, unsigned int
     """ Launches a CUDA function :py:obj:`~.CUfunction` or a CUDA kernel :py:obj:`~.CUkernel`.
 
     Invokes the function :py:obj:`~.CUfunction` or the kernel
-    :py:obj:`~.CUkernel` `f` on a `gridDimX` x `gridDimY` x `gridDimZ` grid
-    of blocks. Each block contains `blockDimX` x `blockDimY` x `blockDimZ`
-    threads.
+    :py:obj:`~.CUkernel` ``f`` on a ``gridDimX`` x ``gridDimY`` x
+    ``gridDimZ`` grid of blocks. Each block contains ``blockDimX`` x
+    ``blockDimY`` x ``blockDimZ`` threads.
 
-    `sharedMemBytes` sets the amount of dynamic shared memory that will be
-    available to each thread block.
+    ``sharedMemBytes`` sets the amount of dynamic shared memory that will
+    be available to each thread block.
 
-    Kernel parameters to `f` can be specified in one of two ways:
+    Kernel parameters to ``f`` can be specified in one of two ways:
 
-    1) Kernel parameters can be specified via `kernelParams`. If `f` has N
-    parameters, then `kernelParams` needs to be an array of N pointers.
-    Each of `kernelParams`[0] through `kernelParams`[N-1] must point to a
-    region of memory from which the actual kernel parameter will be copied.
-    The number of kernel parameters and their offsets and sizes do not need
-    to be specified as that information is retrieved directly from the
-    kernel's image.
+    1) Kernel parameters can be specified via ``kernelParams``. If ``f``
+    has N parameters, then ``kernelParams`` needs to be an array of N
+    pointers. Each of ``kernelParams[0]`` through ``kernelParams[N-1]``
+    must point to a region of memory from which the actual kernel parameter
+    will be copied. The number of kernel parameters and their offsets and
+    sizes do not need to be specified as that information is retrieved
+    directly from the kernel's image.
 
     2) Kernel parameters can also be packaged by the application into a
-    single buffer that is passed in via the `extra` parameter. This places
-    the burden on the application of knowing each kernel parameter's size
-    and alignment/padding within the buffer. Here is an example of using
-    the `extra` parameter in this manner:
+    single buffer that is passed in via the ``extra`` parameter. This
+    places the burden on the application of knowing each kernel parameter's
+    size and alignment/padding within the buffer. Here is an example of
+    using the ``extra`` parameter in this manner:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
-    The `extra` parameter exists to allow :py:obj:`~.cuLaunchKernel` to
-    take additional less commonly used arguments. `extra` specifies a list
-    of names of extra settings and their corresponding values. Each extra
-    setting name is immediately followed by the corresponding value. The
-    list must be terminated with either NULL or
+    The ``extra`` parameter exists to allow :py:obj:`~.cuLaunchKernel` to
+    take additional less commonly used arguments. ``extra`` specifies a
+    list of names of extra settings and their corresponding values. Each
+    extra setting name is immediately followed by the corresponding value.
+    The list must be terminated with either NULL or
     :py:obj:`~.CU_LAUNCH_PARAM_END`.
 
     - :py:obj:`~.CU_LAUNCH_PARAM_END`, which indicates the end of the
-      `extra` array;
+      ``extra`` array;
 
     - :py:obj:`~.CU_LAUNCH_PARAM_BUFFER_POINTER`, which specifies that the
-      next value in `extra` will be a pointer to a buffer containing all
-      the kernel parameters for launching kernel `f`;
+      next value in ``extra`` will be a pointer to a buffer containing all
+      the kernel parameters for launching kernel ``f``;
 
     - :py:obj:`~.CU_LAUNCH_PARAM_BUFFER_SIZE`, which specifies that the
-      next value in `extra` will be a pointer to a size_t containing the
+      next value in ``extra`` will be a pointer to a size_t containing the
       size of the buffer specified with
       :py:obj:`~.CU_LAUNCH_PARAM_BUFFER_POINTER`;
 
     The error :py:obj:`~.CUDA_ERROR_INVALID_VALUE` will be returned if
-    kernel parameters are specified with both `kernelParams` and `extra`
-    (i.e. both `kernelParams` and `extra` are non-NULL).
+    kernel parameters are specified with both ``kernelParams`` and
+    ``extra`` (i.e. both ``kernelParams`` and ``extra`` are non-NULL).
 
     Calling :py:obj:`~.cuLaunchKernel()` invalidates the persistent
     function state set through the following deprecated APIs:
@@ -44680,7 +44709,7 @@ def cuLaunchKernel(f, unsigned int gridDimX, unsigned int gridDimY, unsigned int
     :py:obj:`~.cuParamSetSize()`, :py:obj:`~.cuParamSeti()`,
     :py:obj:`~.cuParamSetf()`, :py:obj:`~.cuParamSetv()`.
 
-    Note that to use :py:obj:`~.cuLaunchKernel()`, the kernel `f` must
+    Note that to use :py:obj:`~.cuLaunchKernel()`, the kernel ``f`` must
     either have been compiled with toolchain version 3.2 or later so that
     it will contain kernel parameter information, or have no kernel
     parameters. If either of these conditions is not met, then
@@ -44691,7 +44720,7 @@ def cuLaunchKernel(f, unsigned int gridDimX, unsigned int gridDimY, unsigned int
     :py:obj:`~.CUkernel` by querying the handle using
     :py:obj:`~.cuLibraryGetKernel()` and then passing it to the API by
     casting to :py:obj:`~.CUfunction`. Here, the context to launch the
-    kernel on will either be taken from the specified stream `hStream` or
+    kernel on will either be taken from the specified stream ``hStream`` or
     the current context in case of NULL stream.
 
     Parameters
@@ -44759,8 +44788,8 @@ def cuLaunchKernelEx(config : Optional[CUlaunchConfig], f, kernelParams, void_pt
     """ Launches a CUDA function :py:obj:`~.CUfunction` or a CUDA kernel :py:obj:`~.CUkernel` with launch-time configuration.
 
     Invokes the function :py:obj:`~.CUfunction` or the kernel
-    :py:obj:`~.CUkernel` `f` with the specified launch-time configuration
-    `config`.
+    :py:obj:`~.CUkernel` ``f`` with the specified launch-time configuration
+    ``config``.
 
     The :py:obj:`~.CUlaunchConfig` structure is defined as:
 
@@ -44819,7 +44848,7 @@ def cuLaunchKernelEx(config : Optional[CUlaunchConfig], f, kernelParams, void_pt
     - :py:obj:`~.CUlaunchAttribute.value` is a union that hold the
       attribute value.
 
-    An example of using the `config` parameter:
+    An example of using the ``config`` parameter:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
@@ -44925,8 +44954,8 @@ def cuLaunchKernelEx(config : Optional[CUlaunchConfig], f, kernelParams, void_pt
     has been specified. The preferred substitute The preferred substitute
     cluster dimension must be an integer multiple greater than zero of the
     regular cluster dimension and must divide the grid. It must also be no
-    more than `maxBlocksPerCluster`, if it is set in the kernel's
-    `__launch_bounds__`. Otherwise it must be less than the maximum value
+    more than ``maxBlocksPerCluster``, if it is set in the kernel's
+    ``__launch_bounds__``. Otherwise it must be less than the maximum value
     the driver can support. Otherwise, setting this attribute to a value
     physically unable to fit on any particular device is permitted.
 
@@ -44945,7 +44974,7 @@ def cuLaunchKernelEx(config : Optional[CUlaunchConfig], f, kernelParams, void_pt
 
     - :py:obj:`~.CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE`
 
-    Kernel parameters to `f` can be specified in the same ways that they
+    Kernel parameters to ``f`` can be specified in the same ways that they
     can be using :py:obj:`~.cuLaunchKernel`.
 
     Note that the API can also be used to launch context-less kernel
@@ -45000,12 +45029,12 @@ def cuLaunchCooperativeKernel(f, unsigned int gridDimX, unsigned int gridDimY, u
     """ Launches a CUDA function :py:obj:`~.CUfunction` or a CUDA kernel :py:obj:`~.CUkernel` where thread blocks can cooperate and synchronize as they execute.
 
     Invokes the function :py:obj:`~.CUfunction` or the kernel
-    :py:obj:`~.CUkernel` `f` on a `gridDimX` x `gridDimY` x `gridDimZ` grid
-    of blocks. Each block contains `blockDimX` x `blockDimY` x `blockDimZ`
-    threads.
+    :py:obj:`~.CUkernel` ``f`` on a ``gridDimX`` x ``gridDimY`` x
+    ``gridDimZ`` grid of blocks. Each block contains ``blockDimX`` x
+    ``blockDimY`` x ``blockDimZ`` threads.
 
-    `sharedMemBytes` sets the amount of dynamic shared memory that will be
-    available to each thread block.
+    ``sharedMemBytes`` sets the amount of dynamic shared memory that will
+    be available to each thread block.
 
     The device on which this kernel is invoked must have a non-zero value
     for the device attribute
@@ -45020,26 +45049,26 @@ def cuLaunchCooperativeKernel(f, unsigned int gridDimX, unsigned int gridDimY, u
 
     The kernel cannot make use of CUDA dynamic parallelism.
 
-    Kernel parameters must be specified via `kernelParams`. If `f` has N
-    parameters, then `kernelParams` needs to be an array of N pointers.
-    Each of `kernelParams`[0] through `kernelParams`[N-1] must point to a
-    region of memory from which the actual kernel parameter will be copied.
-    The number of kernel parameters and their offsets and sizes do not need
-    to be specified as that information is retrieved directly from the
-    kernel's image.
+    Kernel parameters must be specified via ``kernelParams``. If ``f`` has
+    N parameters, then ``kernelParams`` needs to be an array of N pointers.
+    Each of ``kernelParams[0]`` through ``kernelParams[N-1]`` must point to
+    a region of memory from which the actual kernel parameter will be
+    copied. The number of kernel parameters and their offsets and sizes do
+    not need to be specified as that information is retrieved directly from
+    the kernel's image.
 
     Calling :py:obj:`~.cuLaunchCooperativeKernel()` sets persistent
     function state that is the same as function state set through
     :py:obj:`~.cuLaunchKernel` API
 
-    When the kernel `f` is launched via
+    When the kernel ``f`` is launched via
     :py:obj:`~.cuLaunchCooperativeKernel()`, the previous block shape,
-    shared size and parameter info associated with `f` is overwritten.
+    shared size and parameter info associated with ``f`` is overwritten.
 
     Note that to use :py:obj:`~.cuLaunchCooperativeKernel()`, the kernel
-    `f` must either have been compiled with toolchain version 3.2 or later
-    so that it will contain kernel parameter information, or have no kernel
-    parameters. If either of these conditions is not met, then
+    ``f`` must either have been compiled with toolchain version 3.2 or
+    later so that it will contain kernel parameter information, or have no
+    kernel parameters. If either of these conditions is not met, then
     :py:obj:`~.cuLaunchCooperativeKernel()` will return
     :py:obj:`~.CUDA_ERROR_INVALID_IMAGE`.
 
@@ -45047,7 +45076,7 @@ def cuLaunchCooperativeKernel(f, unsigned int gridDimX, unsigned int gridDimY, u
     :py:obj:`~.CUkernel` by querying the handle using
     :py:obj:`~.cuLibraryGetKernel()` and then passing it to the API by
     casting to :py:obj:`~.CUfunction`. Here, the context to launch the
-    kernel on will either be taken from the specified stream `hStream` or
+    kernel on will either be taken from the specified stream ``hStream`` or
     the current context in case of NULL stream.
 
     Parameters
@@ -45313,8 +45342,8 @@ def cuFuncSetBlockShape(hfunc, int x, int y, int z):
 
     [Deprecated]
 
-    Specifies the `x`, `y`, and `z` dimensions of the thread blocks that
-    are created when the kernel given by `hfunc` is launched.
+    Specifies the ``x``, ``y``, and ``z`` dimensions of the thread blocks
+    that are created when the kernel given by ``hfunc`` is launched.
 
     Parameters
     ----------
@@ -45357,8 +45386,8 @@ def cuFuncSetSharedSize(hfunc, unsigned int numbytes):
 
     [Deprecated]
 
-    Sets through `numbytes` the amount of dynamic shared memory that will
-    be available to each thread block when the kernel given by `hfunc` is
+    Sets through ``numbytes`` the amount of dynamic shared memory that will
+    be available to each thread block when the kernel given by ``hfunc`` is
     launched.
 
     Parameters
@@ -45398,8 +45427,8 @@ def cuParamSetSize(hfunc, unsigned int numbytes):
 
     [Deprecated]
 
-    Sets through `numbytes` the total size in bytes needed by the function
-    parameters of the kernel corresponding to `hfunc`.
+    Sets through ``numbytes`` the total size in bytes needed by the
+    function parameters of the kernel corresponding to ``hfunc``.
 
     Parameters
     ----------
@@ -45439,7 +45468,7 @@ def cuParamSeti(hfunc, int offset, unsigned int value):
     [Deprecated]
 
     Sets an integer parameter that will be specified the next time the
-    kernel corresponding to `hfunc` will be invoked. `offset` is a byte
+    kernel corresponding to ``hfunc`` will be invoked. ``offset`` is a byte
     offset.
 
     Parameters
@@ -45482,8 +45511,8 @@ def cuParamSetf(hfunc, int offset, float value):
     [Deprecated]
 
     Sets a floating-point parameter that will be specified the next time
-    the kernel corresponding to `hfunc` will be invoked. `offset` is a byte
-    offset.
+    the kernel corresponding to ``hfunc`` will be invoked. ``offset`` is a
+    byte offset.
 
     Parameters
     ----------
@@ -45524,9 +45553,9 @@ def cuParamSetv(hfunc, int offset, ptr, unsigned int numbytes):
 
     [Deprecated]
 
-    Copies an arbitrary amount of data (specified in `numbytes`) from `ptr`
-    into the parameter space of the kernel corresponding to `hfunc`.
-    `offset` is a byte offset.
+    Copies an arbitrary amount of data (specified in ``numbytes``) from
+    ``ptr`` into the parameter space of the kernel corresponding to
+    ``hfunc``. ``offset`` is a byte offset.
 
     Parameters
     ----------
@@ -45572,7 +45601,7 @@ def cuLaunch(f):
 
     [Deprecated]
 
-    Invokes the kernel `f` on a 1 x 1 x 1 grid of blocks. The block
+    Invokes the kernel ``f`` on a 1 x 1 x 1 grid of blocks. The block
     contains the number of threads specified by a previous call to
     :py:obj:`~.cuFuncSetBlockShape()`.
 
@@ -45623,7 +45652,7 @@ def cuLaunchGrid(f, int grid_width, int grid_height):
 
     [Deprecated]
 
-    Invokes the kernel `f` on a `grid_width` x `grid_height` grid of
+    Invokes the kernel ``f`` on a ``grid_width`` x ``grid_height`` grid of
     blocks. Each block contains the number of threads specified by a
     previous call to :py:obj:`~.cuFuncSetBlockShape()`.
 
@@ -45678,7 +45707,7 @@ def cuLaunchGridAsync(f, int grid_width, int grid_height, hStream):
 
     [Deprecated]
 
-    Invokes the kernel `f` on a `grid_width` x `grid_height` grid of
+    Invokes the kernel ``f`` on a ``grid_width`` x ``grid_height`` grid of
     blocks. Each block contains the number of threads specified by a
     previous call to :py:obj:`~.cuFuncSetBlockShape()`.
 
@@ -45694,7 +45723,7 @@ def cuLaunchGridAsync(f, int grid_width, int grid_height, hStream):
     initialized prior to calling this function. Failure to do so results in
     undefined behavior.
 
-    \note_null_stream
+    This function uses standard default stream semantics.
 
     Parameters
     ----------
@@ -45718,7 +45747,7 @@ def cuLaunchGridAsync(f, int grid_width, int grid_height, hStream):
 
     Notes
     -----
-    In certain cases where cubins are created with no ABI (i.e., using `ptxas`  `no`), this function may serialize kernel launches. The CUDA driver retains asynchronous behavior by growing the per-thread stack as needed per launch and not shrinking it afterwards.
+    In certain cases where cubins are created with no ABI (i.e., using ``ptxas --abi-compile no``), this function may serialize kernel launches. The CUDA driver retains asynchronous behavior by growing the per-thread stack as needed per launch and not shrinking it afterwards.
     """
     cdef cydriver.CUstream cyhStream
     if hStream is None:
@@ -45749,10 +45778,11 @@ def cuLaunchCooperativeKernelMultiDevice(launchParamsList : Optional[tuple[CUDA_
 
     [Deprecated]
 
-    Invokes kernels as specified in the `launchParamsList` array where each
-    element of the array specifies all the parameters required to perform a
-    single kernel launch. These kernels can cooperate and synchronize as
-    they execute. The size of the array is specified by `numDevices`.
+    Invokes kernels as specified in the ``launchParamsList`` array where
+    each element of the array specifies all the parameters required to
+    perform a single kernel launch. These kernels can cooperate and
+    synchronize as they execute. The size of the array is specified by
+    ``numDevices``.
 
     No two kernels can be launched on the same device. All the devices
     targeted by this multi-device launch must be identical. All devices
@@ -45835,8 +45865,8 @@ def cuLaunchCooperativeKernelMultiDevice(launchParamsList : Optional[tuple[CUDA_
       to kernel parameters. If :py:obj:`~.CUDA_LAUNCH_PARAMS.function` has
       N parameters, then :py:obj:`~.CUDA_LAUNCH_PARAMS.kernelParams` needs
       to be an array of N pointers. Each of
-      :py:obj:`~.CUDA_LAUNCH_PARAMS.kernelParams`[0] through
-      :py:obj:`~.CUDA_LAUNCH_PARAMS.kernelParams`[N-1] must point to a
+      :py:obj:`~.CUDA_LAUNCH_PARAMS.kernelParams` ``[0]`` through
+      :py:obj:`~.CUDA_LAUNCH_PARAMS.kernelParams` ``[N-1]`` must point to a
       region of memory from which the actual kernel parameter will be
       copied. The number of kernel parameters and their offsets and sizes
       do not need to be specified as that information is retrieved directly
@@ -45862,12 +45892,12 @@ def cuLaunchCooperativeKernelMultiDevice(launchParamsList : Optional[tuple[CUDA_
     Calling :py:obj:`~.cuLaunchCooperativeKernelMultiDevice()` sets
     persistent function state that is the same as function state set
     through :py:obj:`~.cuLaunchKernel` API when called individually for
-    each element in `launchParamsList`.
+    each element in ``launchParamsList``.
 
     When kernels are launched via
     :py:obj:`~.cuLaunchCooperativeKernelMultiDevice()`, the previous block
     shape, shared size and parameter info associated with each
-    :py:obj:`~.CUDA_LAUNCH_PARAMS.function` in `launchParamsList` is
+    :py:obj:`~.CUDA_LAUNCH_PARAMS.function` in ``launchParamsList`` is
     overwritten.
 
     Note that to use :py:obj:`~.cuLaunchCooperativeKernelMultiDevice()`,
@@ -45882,7 +45912,7 @@ def cuLaunchCooperativeKernelMultiDevice(launchParamsList : Optional[tuple[CUDA_
     launchParamsList : list[:py:obj:`~.CUDA_LAUNCH_PARAMS`]
         List of launch parameters, one per device
     numDevices : unsigned int
-        Size of the `launchParamsList` array
+        Size of the ``launchParamsList`` array
     flags : unsigned int
         Flags to control launch behavior
 
@@ -45924,10 +45954,10 @@ def cuParamSetTexRef(hfunc, int texunit, hTexRef):
     [Deprecated]
 
     Makes the CUDA array or linear memory bound to the texture reference
-    `hTexRef` available to a device program as a texture. In this version
+    ``hTexRef`` available to a device program as a texture. In this version
     of CUDA, the texture-reference must be obtained via
-    :py:obj:`~.cuModuleGetTexRef()` and the `texunit` parameter must be set
-    to :py:obj:`~.CU_PARAM_TR_DEFAULT`.
+    :py:obj:`~.cuModuleGetTexRef()` and the ``texunit`` parameter must be
+    set to :py:obj:`~.CU_PARAM_TR_DEFAULT`.
 
     Parameters
     ----------
@@ -46043,7 +46073,7 @@ def cuFuncSetSharedMemConfig(hfunc, config not None : CUsharedconfig):
 def cuGraphCreate(unsigned int flags):
     """ Creates a graph.
 
-    Creates an empty graph, which is returned via `phGraph`.
+    Creates an empty graph, which is returned via ``phGraph``.
 
     Parameters
     ----------
@@ -46075,64 +46105,64 @@ def cuGraphCreate(unsigned int flags):
 def cuGraphAddKernelNode(hGraph, dependencies : Optional[tuple[CUgraphNode] | list[CUgraphNode]], size_t numDependencies, nodeParams : Optional[CUDA_KERNEL_NODE_PARAMS]):
     """ Creates a kernel execution node and adds it to a graph.
 
-    Creates a new kernel execution node and adds it to `hGraph` with
-    `numDependencies` dependencies specified via `dependencies` and
-    arguments specified in `nodeParams`. It is possible for
-    `numDependencies` to be 0, in which case the node will be placed at the
-    root of the graph. `dependencies` may not have any duplicate entries. A
-    handle to the new node will be returned in `phGraphNode`.
+    Creates a new kernel execution node and adds it to ``hGraph`` with
+    ``numDependencies`` dependencies specified via ``dependencies`` and
+    arguments specified in ``nodeParams``. It is possible for
+    ``numDependencies`` to be 0, in which case the node will be placed at
+    the root of the graph. ``dependencies`` may not have any duplicate
+    entries. A handle to the new node will be returned in ``phGraphNode``.
 
     The CUDA_KERNEL_NODE_PARAMS structure is defined as:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
-    When the graph is launched, the node will invoke kernel `func` on a
-    (`gridDimX` x `gridDimY` x `gridDimZ`) grid of blocks. Each block
-    contains (`blockDimX` x `blockDimY` x `blockDimZ`) threads.
+    When the graph is launched, the node will invoke kernel ``func`` on a
+    (``gridDimX`` x ``gridDimY`` x ``gridDimZ``) grid of blocks. Each block
+    contains (``blockDimX`` x ``blockDimY`` x ``blockDimZ``) threads.
 
-    `sharedMemBytes` sets the amount of dynamic shared memory that will be
-    available to each thread block.
+    ``sharedMemBytes`` sets the amount of dynamic shared memory that will
+    be available to each thread block.
 
-    Kernel parameters to `func` can be specified in one of two ways:
+    Kernel parameters to ``func`` can be specified in one of two ways:
 
-    1) Kernel parameters can be specified via `kernelParams`. If the kernel
-    has N parameters, then `kernelParams` needs to be an array of N
-    pointers. Each pointer, from `kernelParams`[0] to `kernelParams`[N-1],
-    points to the region of memory from which the actual parameter will be
-    copied. The number of kernel parameters and their offsets and sizes do
-    not need to be specified as that information is retrieved directly from
-    the kernel's image.
+    1) Kernel parameters can be specified via ``kernelParams``. If the
+    kernel has N parameters, then ``kernelParams`` needs to be an array of
+    N pointers. Each pointer, from ``kernelParams[0]`` to
+    ``kernelParams[N-1]``, points to the region of memory from which the
+    actual parameter will be copied. The number of kernel parameters and
+    their offsets and sizes do not need to be specified as that information
+    is retrieved directly from the kernel's image.
 
     2) Kernel parameters for non-cooperative kernels can also be packaged
-    by the application into a single buffer that is passed in via `extra`.
-    This places the burden on the application of knowing each kernel
-    parameter's size and alignment/padding within the buffer. The `extra`
-    parameter exists to allow this function to take additional less
-    commonly used arguments. `extra` specifies a list of names of extra
-    settings and their corresponding values. Each extra setting name is
-    immediately followed by the corresponding value. The list must be
+    by the application into a single buffer that is passed in via
+    ``extra``. This places the burden on the application of knowing each
+    kernel parameter's size and alignment/padding within the buffer. The
+    ``extra`` parameter exists to allow this function to take additional
+    less commonly used arguments. ``extra`` specifies a list of names of
+    extra settings and their corresponding values. Each extra setting name
+    is immediately followed by the corresponding value. The list must be
     terminated with either NULL or CU_LAUNCH_PARAM_END.
 
     - :py:obj:`~.CU_LAUNCH_PARAM_END`, which indicates the end of the
-      `extra` array;
+      ``extra`` array;
 
     - :py:obj:`~.CU_LAUNCH_PARAM_BUFFER_POINTER`, which specifies that the
-      next value in `extra` will be a pointer to a buffer containing all
-      the kernel parameters for launching kernel `func`;
+      next value in ``extra`` will be a pointer to a buffer containing all
+      the kernel parameters for launching kernel ``func``;
 
     - :py:obj:`~.CU_LAUNCH_PARAM_BUFFER_SIZE`, which specifies that the
-      next value in `extra` will be a pointer to a size_t containing the
+      next value in ``extra`` will be a pointer to a size_t containing the
       size of the buffer specified with
       :py:obj:`~.CU_LAUNCH_PARAM_BUFFER_POINTER`;
 
     The error :py:obj:`~.CUDA_ERROR_INVALID_VALUE` will be returned if
-    kernel parameters are specified with both `kernelParams` and `extra`
-    (i.e. both `kernelParams` and `extra` are non-NULL).
-    :py:obj:`~.CUDA_ERROR_INVALID_VALUE` will be returned if `extra` is
+    kernel parameters are specified with both ``kernelParams`` and
+    ``extra`` (i.e. both ``kernelParams`` and ``extra`` are non-NULL).
+    :py:obj:`~.CUDA_ERROR_INVALID_VALUE` will be returned if ``extra`` is
     used for a cooperative kernel.
 
-    The `kernelParams` or `extra` array, as well as the argument values it
-    points to, are copied during this call.
+    The ``kernelParams`` or ``extra`` array, as well as the argument values
+    it points to, are copied during this call.
 
     Parameters
     ----------
@@ -46199,16 +46229,16 @@ def cuGraphAddKernelNode(hGraph, dependencies : Optional[tuple[CUgraphNode] | li
 def cuGraphKernelNodeGetParams(hNode):
     """ Returns a kernel node's parameters.
 
-    Returns the parameters of kernel node `hNode` in `nodeParams`. The
-    `kernelParams` or `extra` array returned in `nodeParams`, as well as
-    the argument values it points to, are owned by the node. This memory
+    Returns the parameters of kernel node ``hNode`` in ``nodeParams``. The
+    ``kernelParams`` or ``extra`` array returned in ``nodeParams``, as well
+    as the argument values it points to, are owned by the node. This memory
     remains valid until the node is destroyed or its parameters are
     modified, and should not be modified directly. Use
     :py:obj:`~.cuGraphKernelNodeSetParams` to update the parameters of this
     node.
 
-    The params will contain either `kernelParams` or `extra`, according to
-    which of these was most recently set on the node.
+    The params will contain either ``kernelParams`` or ``extra``, according
+    to which of these was most recently set on the node.
 
     Parameters
     ----------
@@ -46248,7 +46278,7 @@ def cuGraphKernelNodeGetParams(hNode):
 def cuGraphKernelNodeSetParams(hNode, nodeParams : Optional[CUDA_KERNEL_NODE_PARAMS]):
     """ Sets a kernel node's parameters.
 
-    Sets the parameters of kernel node `hNode` to `nodeParams`.
+    Sets the parameters of kernel node ``hNode`` to ``nodeParams``.
 
     Parameters
     ----------
@@ -46286,16 +46316,16 @@ def cuGraphKernelNodeSetParams(hNode, nodeParams : Optional[CUDA_KERNEL_NODE_PAR
 def cuGraphAddMemcpyNode(hGraph, dependencies : Optional[tuple[CUgraphNode] | list[CUgraphNode]], size_t numDependencies, copyParams : Optional[CUDA_MEMCPY3D], ctx):
     """ Creates a memcpy node and adds it to a graph.
 
-    Creates a new memcpy node and adds it to `hGraph` with
-    `numDependencies` dependencies specified via `dependencies`. It is
-    possible for `numDependencies` to be 0, in which case the node will be
-    placed at the root of the graph. `dependencies` may not have any
+    Creates a new memcpy node and adds it to ``hGraph`` with
+    ``numDependencies`` dependencies specified via ``dependencies``. It is
+    possible for ``numDependencies`` to be 0, in which case the node will
+    be placed at the root of the graph. ``dependencies`` may not have any
     duplicate entries. A handle to the new node will be returned in
-    `phGraphNode`.
+    ``phGraphNode``.
 
     When the graph is launched, the node will perform the memcpy described
-    by `copyParams`. See :py:obj:`~.cuMemcpy3D()` for a description of the
-    structure and its restrictions.
+    by ``copyParams``. See :py:obj:`~.cuMemcpy3D()` for a description of
+    the structure and its restrictions.
 
     Memcpy nodes have some additional restrictions with regards to managed
     memory, if the system contains at least one device which has a zero
@@ -46377,7 +46407,7 @@ def cuGraphAddMemcpyNode(hGraph, dependencies : Optional[tuple[CUgraphNode] | li
 def cuGraphMemcpyNodeGetParams(hNode):
     """ Returns a memcpy node's parameters.
 
-    Returns the parameters of memcpy node `hNode` in `nodeParams`.
+    Returns the parameters of memcpy node ``hNode`` in ``nodeParams``.
 
     Parameters
     ----------
@@ -46417,7 +46447,7 @@ def cuGraphMemcpyNodeGetParams(hNode):
 def cuGraphMemcpyNodeSetParams(hNode, nodeParams : Optional[CUDA_MEMCPY3D]):
     """ Sets a memcpy node's parameters.
 
-    Sets the parameters of memcpy node `hNode` to `nodeParams`.
+    Sets the parameters of memcpy node ``hNode`` to ``nodeParams``.
 
     Parameters
     ----------
@@ -46455,15 +46485,15 @@ def cuGraphMemcpyNodeSetParams(hNode, nodeParams : Optional[CUDA_MEMCPY3D]):
 def cuGraphAddMemsetNode(hGraph, dependencies : Optional[tuple[CUgraphNode] | list[CUgraphNode]], size_t numDependencies, memsetParams : Optional[CUDA_MEMSET_NODE_PARAMS], ctx):
     """ Creates a memset node and adds it to a graph.
 
-    Creates a new memset node and adds it to `hGraph` with
-    `numDependencies` dependencies specified via `dependencies`. It is
-    possible for `numDependencies` to be 0, in which case the node will be
-    placed at the root of the graph. `dependencies` may not have any
+    Creates a new memset node and adds it to ``hGraph`` with
+    ``numDependencies`` dependencies specified via ``dependencies``. It is
+    possible for ``numDependencies`` to be 0, in which case the node will
+    be placed at the root of the graph. ``dependencies`` may not have any
     duplicate entries. A handle to the new node will be returned in
-    `phGraphNode`.
+    ``phGraphNode``.
 
     The element size must be 1, 2, or 4 bytes. When the graph is launched,
-    the node will perform the memset described by `memsetParams`.
+    the node will perform the memset described by ``memsetParams``.
 
     Parameters
     ----------
@@ -46536,7 +46566,7 @@ def cuGraphAddMemsetNode(hGraph, dependencies : Optional[tuple[CUgraphNode] | li
 def cuGraphMemsetNodeGetParams(hNode):
     """ Returns a memset node's parameters.
 
-    Returns the parameters of memset node `hNode` in `nodeParams`.
+    Returns the parameters of memset node ``hNode`` in ``nodeParams``.
 
     Parameters
     ----------
@@ -46576,7 +46606,7 @@ def cuGraphMemsetNodeGetParams(hNode):
 def cuGraphMemsetNodeSetParams(hNode, nodeParams : Optional[CUDA_MEMSET_NODE_PARAMS]):
     """ Sets a memset node's parameters.
 
-    Sets the parameters of memset node `hNode` to `nodeParams`.
+    Sets the parameters of memset node ``hNode`` to ``nodeParams``.
 
     Parameters
     ----------
@@ -46614,12 +46644,12 @@ def cuGraphMemsetNodeSetParams(hNode, nodeParams : Optional[CUDA_MEMSET_NODE_PAR
 def cuGraphAddHostNode(hGraph, dependencies : Optional[tuple[CUgraphNode] | list[CUgraphNode]], size_t numDependencies, nodeParams : Optional[CUDA_HOST_NODE_PARAMS]):
     """ Creates a host execution node and adds it to a graph.
 
-    Creates a new CPU execution node and adds it to `hGraph` with
-    `numDependencies` dependencies specified via `dependencies` and
-    arguments specified in `nodeParams`. It is possible for
-    `numDependencies` to be 0, in which case the node will be placed at the
-    root of the graph. `dependencies` may not have any duplicate entries. A
-    handle to the new node will be returned in `phGraphNode`.
+    Creates a new CPU execution node and adds it to ``hGraph`` with
+    ``numDependencies`` dependencies specified via ``dependencies`` and
+    arguments specified in ``nodeParams``. It is possible for
+    ``numDependencies`` to be 0, in which case the node will be placed at
+    the root of the graph. ``dependencies`` may not have any duplicate
+    entries. A handle to the new node will be returned in ``phGraphNode``.
 
     When the graph is launched, the node will invoke the specified CPU
     function. Host nodes are not supported under MPS with pre-Volta GPUs.
@@ -46685,7 +46715,7 @@ def cuGraphAddHostNode(hGraph, dependencies : Optional[tuple[CUgraphNode] | list
 def cuGraphHostNodeGetParams(hNode):
     """ Returns a host node's parameters.
 
-    Returns the parameters of host node `hNode` in `nodeParams`.
+    Returns the parameters of host node ``hNode`` in ``nodeParams``.
 
     Parameters
     ----------
@@ -46725,7 +46755,7 @@ def cuGraphHostNodeGetParams(hNode):
 def cuGraphHostNodeSetParams(hNode, nodeParams : Optional[CUDA_HOST_NODE_PARAMS]):
     """ Sets a host node's parameters.
 
-    Sets the parameters of host node `hNode` to `nodeParams`.
+    Sets the parameters of host node ``hNode`` to ``nodeParams``.
 
     Parameters
     ----------
@@ -46764,13 +46794,13 @@ def cuGraphAddChildGraphNode(hGraph, dependencies : Optional[tuple[CUgraphNode]
     """ Creates a child graph node and adds it to a graph.
 
     Creates a new node which executes an embedded graph, and adds it to
-    `hGraph` with `numDependencies` dependencies specified via
-    `dependencies`. It is possible for `numDependencies` to be 0, in which
-    case the node will be placed at the root of the graph. `dependencies`
-    may not have any duplicate entries. A handle to the new node will be
-    returned in `phGraphNode`.
+    ``hGraph`` with ``numDependencies`` dependencies specified via
+    ``dependencies``. It is possible for ``numDependencies`` to be 0, in
+    which case the node will be placed at the root of the graph.
+    ``dependencies`` may not have any duplicate entries. A handle to the
+    new node will be returned in ``phGraphNode``.
 
-    If `childGraph` contains allocation nodes, free nodes, or conditional
+    If ``childGraph`` contains allocation nodes, free nodes, or conditional
     nodes, this call will return an error.
 
     The node executes an embedded child graph. The child graph is cloned in
@@ -46889,17 +46919,17 @@ def cuGraphChildGraphNodeGetGraph(hNode):
 def cuGraphAddEmptyNode(hGraph, dependencies : Optional[tuple[CUgraphNode] | list[CUgraphNode]], size_t numDependencies):
     """ Creates an empty node and adds it to a graph.
 
-    Creates a new node which performs no operation, and adds it to `hGraph`
-    with `numDependencies` dependencies specified via `dependencies`. It is
-    possible for `numDependencies` to be 0, in which case the node will be
-    placed at the root of the graph. `dependencies` may not have any
-    duplicate entries. A handle to the new node will be returned in
-    `phGraphNode`.
+    Creates a new node which performs no operation, and adds it to
+    ``hGraph`` with ``numDependencies`` dependencies specified via
+    ``dependencies``. It is possible for ``numDependencies`` to be 0, in
+    which case the node will be placed at the root of the graph.
+    ``dependencies`` may not have any duplicate entries. A handle to the
+    new node will be returned in ``phGraphNode``.
 
     An empty node performs no operation during execution, but can be used
     for transitive ordering. For example, a phased execution graph with 2
     groups of n nodes with a barrier between them can be represented using
-    an empty node and 2*n dependency edges, rather than no empty node and
+    an empty node and 2\\*n dependency edges, rather than no empty node and
     n^2 dependency edges.
 
     Parameters
@@ -46960,14 +46990,14 @@ def cuGraphAddEmptyNode(hGraph, dependencies : Optional[tuple[CUgraphNode] | lis
 def cuGraphAddEventRecordNode(hGraph, dependencies : Optional[tuple[CUgraphNode] | list[CUgraphNode]], size_t numDependencies, event):
     """ Creates an event record node and adds it to a graph.
 
-    Creates a new event record node and adds it to `hGraph` with
-    `numDependencies` dependencies specified via `dependencies` and event
-    specified in `event`. It is possible for `numDependencies` to be 0, in
-    which case the node will be placed at the root of the graph.
-    `dependencies` may not have any duplicate entries. A handle to the new
-    node will be returned in `phGraphNode`.
+    Creates a new event record node and adds it to ``hGraph`` with
+    ``numDependencies`` dependencies specified via ``dependencies`` and
+    event specified in ``event``. It is possible for ``numDependencies`` to
+    be 0, in which case the node will be placed at the root of the graph.
+    ``dependencies`` may not have any duplicate entries. A handle to the
+    new node will be returned in ``phGraphNode``.
 
-    Each launch of the graph will record `event` to capture execution of
+    Each launch of the graph will record ``event`` to capture execution of
     the node's dependencies.
 
     Parameters
@@ -47038,7 +47068,7 @@ def cuGraphAddEventRecordNode(hGraph, dependencies : Optional[tuple[CUgraphNode]
 def cuGraphEventRecordNodeGetEvent(hNode):
     """ Returns the event associated with an event record node.
 
-    Returns the event of event record node `hNode` in `event_out`.
+    Returns the event of event record node ``hNode`` in ``event_out``.
 
     Parameters
     ----------
@@ -47078,7 +47108,7 @@ def cuGraphEventRecordNodeGetEvent(hNode):
 def cuGraphEventRecordNodeSetEvent(hNode, event):
     """ Sets an event record node's event.
 
-    Sets the event of event record node `hNode` to `event`.
+    Sets the event of event record node ``hNode`` to ``event``.
 
     Parameters
     ----------
@@ -47123,16 +47153,16 @@ def cuGraphEventRecordNodeSetEvent(hNode, event):
 def cuGraphAddEventWaitNode(hGraph, dependencies : Optional[tuple[CUgraphNode] | list[CUgraphNode]], size_t numDependencies, event):
     """ Creates an event wait node and adds it to a graph.
 
-    Creates a new event wait node and adds it to `hGraph` with
-    `numDependencies` dependencies specified via `dependencies` and event
-    specified in `event`. It is possible for `numDependencies` to be 0, in
-    which case the node will be placed at the root of the graph.
-    `dependencies` may not have any duplicate entries. A handle to the new
-    node will be returned in `phGraphNode`.
+    Creates a new event wait node and adds it to ``hGraph`` with
+    ``numDependencies`` dependencies specified via ``dependencies`` and
+    event specified in ``event``. It is possible for ``numDependencies`` to
+    be 0, in which case the node will be placed at the root of the graph.
+    ``dependencies`` may not have any duplicate entries. A handle to the
+    new node will be returned in ``phGraphNode``.
 
-    The graph node will wait for all work captured in `event`. See
+    The graph node will wait for all work captured in ``event``. See
     :py:obj:`~.cuEventRecord()` for details on what is captured by an
-    event. `event` may be from a different context or device than the
+    event. ``event`` may be from a different context or device than the
     launch stream.
 
     Parameters
@@ -47203,7 +47233,7 @@ def cuGraphAddEventWaitNode(hGraph, dependencies : Optional[tuple[CUgraphNode] |
 def cuGraphEventWaitNodeGetEvent(hNode):
     """ Returns the event associated with an event wait node.
 
-    Returns the event of event wait node `hNode` in `event_out`.
+    Returns the event of event wait node ``hNode`` in ``event_out``.
 
     Parameters
     ----------
@@ -47243,7 +47273,7 @@ def cuGraphEventWaitNodeGetEvent(hNode):
 def cuGraphEventWaitNodeSetEvent(hNode, event):
     """ Sets an event wait node's event.
 
-    Sets the event of event wait node `hNode` to `event`.
+    Sets the event of event wait node ``hNode`` to ``event``.
 
     Parameters
     ----------
@@ -47288,12 +47318,12 @@ def cuGraphEventWaitNodeSetEvent(hNode, event):
 def cuGraphAddExternalSemaphoresSignalNode(hGraph, dependencies : Optional[tuple[CUgraphNode] | list[CUgraphNode]], size_t numDependencies, nodeParams : Optional[CUDA_EXT_SEM_SIGNAL_NODE_PARAMS]):
     """ Creates an external semaphore signal node and adds it to a graph.
 
-    Creates a new external semaphore signal node and adds it to `hGraph`
-    with `numDependencies` dependencies specified via `dependencies` and
-    arguments specified in `nodeParams`. It is possible for
-    `numDependencies` to be 0, in which case the node will be placed at the
-    root of the graph. `dependencies` may not have any duplicate entries. A
-    handle to the new node will be returned in `phGraphNode`.
+    Creates a new external semaphore signal node and adds it to ``hGraph``
+    with ``numDependencies`` dependencies specified via ``dependencies``
+    and arguments specified in ``nodeParams``. It is possible for
+    ``numDependencies`` to be 0, in which case the node will be placed at
+    the root of the graph. ``dependencies`` may not have any duplicate
+    entries. A handle to the new node will be returned in ``phGraphNode``.
 
     Performs a signal operation on a set of externally allocated semaphore
     objects when the node is launched. The operation(s) will occur after
@@ -47360,9 +47390,9 @@ def cuGraphAddExternalSemaphoresSignalNode(hGraph, dependencies : Optional[tuple
 def cuGraphExternalSemaphoresSignalNodeGetParams(hNode):
     """ Returns an external semaphore signal node's parameters.
 
-    Returns the parameters of an external semaphore signal node `hNode` in
-    `params_out`. The `extSemArray` and `paramsArray` returned in
-    `params_out`, are owned by the node. This memory remains valid until
+    Returns the parameters of an external semaphore signal node ``hNode``
+    in ``params_out``. The ``extSemArray`` and ``paramsArray`` returned in
+    ``params_out`` are owned by the node. This memory remains valid until
     the node is destroyed or its parameters are modified, and should not be
     modified directly. Use
     :py:obj:`~.cuGraphExternalSemaphoresSignalNodeSetParams` to update the
@@ -47406,8 +47436,8 @@ def cuGraphExternalSemaphoresSignalNodeGetParams(hNode):
 def cuGraphExternalSemaphoresSignalNodeSetParams(hNode, nodeParams : Optional[CUDA_EXT_SEM_SIGNAL_NODE_PARAMS]):
     """ Sets an external semaphore signal node's parameters.
 
-    Sets the parameters of an external semaphore signal node `hNode` to
-    `nodeParams`.
+    Sets the parameters of an external semaphore signal node ``hNode`` to
+    ``nodeParams``.
 
     Parameters
     ----------
@@ -47445,12 +47475,12 @@ def cuGraphExternalSemaphoresSignalNodeSetParams(hNode, nodeParams : Optional[CU
 def cuGraphAddExternalSemaphoresWaitNode(hGraph, dependencies : Optional[tuple[CUgraphNode] | list[CUgraphNode]], size_t numDependencies, nodeParams : Optional[CUDA_EXT_SEM_WAIT_NODE_PARAMS]):
     """ Creates an external semaphore wait node and adds it to a graph.
 
-    Creates a new external semaphore wait node and adds it to `hGraph` with
-    `numDependencies` dependencies specified via `dependencies` and
-    arguments specified in `nodeParams`. It is possible for
-    `numDependencies` to be 0, in which case the node will be placed at the
-    root of the graph. `dependencies` may not have any duplicate entries. A
-    handle to the new node will be returned in `phGraphNode`.
+    Creates a new external semaphore wait node and adds it to ``hGraph``
+    with ``numDependencies`` dependencies specified via ``dependencies``
+    and arguments specified in ``nodeParams``. It is possible for
+    ``numDependencies`` to be 0, in which case the node will be placed at
+    the root of the graph. ``dependencies`` may not have any duplicate
+    entries. A handle to the new node will be returned in ``phGraphNode``.
 
     Performs a wait operation on a set of externally allocated semaphore
     objects when the node is launched. The node's dependencies will not be
@@ -47517,9 +47547,9 @@ def cuGraphAddExternalSemaphoresWaitNode(hGraph, dependencies : Optional[tuple[C
 def cuGraphExternalSemaphoresWaitNodeGetParams(hNode):
     """ Returns an external semaphore wait node's parameters.
 
-    Returns the parameters of an external semaphore wait node `hNode` in
-    `params_out`. The `extSemArray` and `paramsArray` returned in
-    `params_out`, are owned by the node. This memory remains valid until
+    Returns the parameters of an external semaphore wait node ``hNode`` in
+    ``params_out``. The ``extSemArray`` and ``paramsArray`` returned in
+    ``params_out`` are owned by the node. This memory remains valid until
     the node is destroyed or its parameters are modified, and should not be
     modified directly. Use
     :py:obj:`~.cuGraphExternalSemaphoresSignalNodeSetParams` to update the
@@ -47563,8 +47593,8 @@ def cuGraphExternalSemaphoresWaitNodeGetParams(hNode):
 def cuGraphExternalSemaphoresWaitNodeSetParams(hNode, nodeParams : Optional[CUDA_EXT_SEM_WAIT_NODE_PARAMS]):
     """ Sets an external semaphore wait node's parameters.
 
-    Sets the parameters of an external semaphore wait node `hNode` to
-    `nodeParams`.
+    Sets the parameters of an external semaphore wait node ``hNode`` to
+    ``nodeParams``.
 
     Parameters
     ----------
@@ -47602,14 +47632,14 @@ def cuGraphExternalSemaphoresWaitNodeSetParams(hNode, nodeParams : Optional[CUDA
 def cuGraphAddBatchMemOpNode(hGraph, dependencies : Optional[tuple[CUgraphNode] | list[CUgraphNode]], size_t numDependencies, nodeParams : Optional[CUDA_BATCH_MEM_OP_NODE_PARAMS]):
     """ Creates a batch memory operation node and adds it to a graph.
 
-    Creates a new batch memory operation node and adds it to `hGraph` with
-    `numDependencies` dependencies specified via `dependencies` and
-    arguments specified in `nodeParams`. It is possible for
-    `numDependencies` to be 0, in which case the node will be placed at the
-    root of the graph. `dependencies` may not have any duplicate entries. A
-    handle to the new node will be returned in `phGraphNode`.
+    Creates a new batch memory operation node and adds it to ``hGraph``
+    with ``numDependencies`` dependencies specified via ``dependencies``
+    and arguments specified in ``nodeParams``. It is possible for
+    ``numDependencies`` to be 0, in which case the node will be placed at
+    the root of the graph. ``dependencies`` may not have any duplicate
+    entries. A handle to the new node will be returned in ``phGraphNode``.
 
-    When the node is added, the paramArray inside `nodeParams` is copied
+    When the node is added, the paramArray inside ``nodeParams`` is copied
     and therefore it can be freed after the call returns.
 
     Parameters
@@ -47677,9 +47707,9 @@ def cuGraphAddBatchMemOpNode(hGraph, dependencies : Optional[tuple[CUgraphNode]
 def cuGraphBatchMemOpNodeGetParams(hNode):
     """ Returns a batch mem op node's parameters.
 
-    Returns the parameters of batch mem op node `hNode` in
-    `nodeParams_out`. The `paramArray` returned in `nodeParams_out` is
-    owned by the node. This memory remains valid until the node is
+    Returns the parameters of batch mem op node ``hNode`` in
+    ``nodeParams_out``. The ``paramArray`` returned in ``nodeParams_out``
+    is owned by the node. This memory remains valid until the node is
     destroyed or its parameters are modified, and should not be modified
     directly. Use :py:obj:`~.cuGraphBatchMemOpNodeSetParams` to update the
     parameters of this node.
@@ -47722,9 +47752,9 @@ def cuGraphBatchMemOpNodeGetParams(hNode):
 def cuGraphBatchMemOpNodeSetParams(hNode, nodeParams : Optional[CUDA_BATCH_MEM_OP_NODE_PARAMS]):
     """ Sets a batch mem op node's parameters.
 
-    Sets the parameters of batch mem op node `hNode` to `nodeParams`.
+    Sets the parameters of batch mem op node ``hNode`` to ``nodeParams``.
 
-    The paramArray inside `nodeParams` is copied and therefore it can be
+    The paramArray inside ``nodeParams`` is copied and therefore it can be
     freed after the call returns.
 
     Parameters
@@ -47764,9 +47794,9 @@ def cuGraphExecBatchMemOpNodeSetParams(hGraphExec, hNode, nodeParams : Optional[
     """ Sets the parameters for a batch mem op node in the given graphExec.
 
     Sets the parameters of a batch mem op node in an executable graph
-    `hGraphExec`. The node is identified by the corresponding node `hNode`
-    in the non-executable graph, from which the executable graph was
-    instantiated.
+    ``hGraphExec``. The node is identified by the corresponding node
+    ``hNode`` in the non-executable graph, from which the executable graph
+    was instantiated.
 
     The following fields on operations may be modified on an executable
     graph:
@@ -47778,13 +47808,13 @@ def cuGraphExecBatchMemOpNodeSetParams(hGraphExec, hNode, nodeParams : Optional[
     Other fields, such as the context, count or type of operations, and
     other types of operations such as membars, may not be modified.
 
-    `hNode` must not have been removed from the original graph.
+    ``hNode`` must not have been removed from the original graph.
 
-    The modifications only affect future launches of `hGraphExec`. Already
-    enqueued or running launches of `hGraphExec` are not affected by this
-    call. `hNode` is also not modified by this call.
+    The modifications only affect future launches of ``hGraphExec``.
+    Already enqueued or running launches of ``hGraphExec`` are not affected
+    by this call. ``hNode`` is also not modified by this call.
 
-    The paramArray inside `nodeParams` is copied and therefore it can be
+    The paramArray inside ``nodeParams`` is copied and therefore it can be
     freed after the call returns.
 
     Parameters
@@ -47834,15 +47864,15 @@ def cuGraphExecBatchMemOpNodeSetParams(hGraphExec, hNode, nodeParams : Optional[
 def cuGraphAddMemAllocNode(hGraph, dependencies : Optional[tuple[CUgraphNode] | list[CUgraphNode]], size_t numDependencies, nodeParams : Optional[CUDA_MEM_ALLOC_NODE_PARAMS]):
     """ Creates an allocation node and adds it to a graph.
 
-    Creates a new allocation node and adds it to `hGraph` with
-    `numDependencies` dependencies specified via `dependencies` and
-    arguments specified in `nodeParams`. It is possible for
-    `numDependencies` to be 0, in which case the node will be placed at the
-    root of the graph. `dependencies` may not have any duplicate entries. A
-    handle to the new node will be returned in `phGraphNode`.
+    Creates a new allocation node and adds it to ``hGraph`` with
+    ``numDependencies`` dependencies specified via ``dependencies`` and
+    arguments specified in ``nodeParams``. It is possible for
+    ``numDependencies`` to be 0, in which case the node will be placed at
+    the root of the graph. ``dependencies`` may not have any duplicate
+    entries. A handle to the new node will be returned in ``phGraphNode``.
 
     When :py:obj:`~.cuGraphAddMemAllocNode` creates an allocation node, it
-    returns the address of the allocation in `nodeParams.dptr`. The
+    returns the address of the allocation in ``nodeParams.dptr``. The
     allocation's address remains fixed across instantiations and launches.
 
     If the allocation is freed in the same graph, by creating a free node
@@ -47946,10 +47976,10 @@ def cuGraphAddMemAllocNode(hGraph, dependencies : Optional[tuple[CUgraphNode] |
 def cuGraphMemAllocNodeGetParams(hNode):
     """ Returns a memory alloc node's parameters.
 
-    Returns the parameters of a memory alloc node `hNode` in `params_out`.
-    The `poolProps` and `accessDescs` returned in `params_out`, are owned
-    by the node. This memory remains valid until the node is destroyed. The
-    returned parameters must not be modified.
+    Returns the parameters of a memory alloc node ``hNode`` in
+    ``params_out``. The ``poolProps`` and ``accessDescs`` returned in
+    ``params_out`` are owned by the node. This memory remains valid until
+    the node is destroyed. The returned parameters must not be modified.
 
     Parameters
     ----------
@@ -47989,12 +48019,12 @@ def cuGraphMemAllocNodeGetParams(hNode):
 def cuGraphAddMemFreeNode(hGraph, dependencies : Optional[tuple[CUgraphNode] | list[CUgraphNode]], size_t numDependencies, dptr):
     """ Creates a memory free node and adds it to a graph.
 
-    Creates a new memory free node and adds it to `hGraph` with
-    `numDependencies` dependencies specified via `dependencies` and
-    arguments specified in `nodeParams`. It is possible for
-    `numDependencies` to be 0, in which case the node will be placed at the
-    root of the graph. `dependencies` may not have any duplicate entries. A
-    handle to the new node will be returned in `phGraphNode`.
+    Creates a new memory free node and adds it to ``hGraph`` with
+    ``numDependencies`` dependencies specified via ``dependencies`` and
+    the address to free specified in ``dptr``. It is possible for
+    ``numDependencies`` to be 0, in which case the node will be placed at
+    the root of the graph. ``dependencies`` may not have any duplicate
+    entries. A handle to the new node will be returned in ``phGraphNode``.
 
     :py:obj:`~.cuGraphAddMemFreeNode` will return
     :py:obj:`~.CUDA_ERROR_INVALID_VALUE` if the user attempts to free:
@@ -48085,7 +48115,7 @@ def cuGraphAddMemFreeNode(hGraph, dependencies : Optional[tuple[CUgraphNode] | l
 def cuGraphMemFreeNodeGetParams(hNode):
     """ Returns a memory free node's parameters.
 
-    Returns the address of a memory free node `hNode` in `dptr_out`.
+    Returns the address of a memory free node ``hNode`` in ``dptr_out``.
 
     Parameters
     ----------
@@ -48271,8 +48301,8 @@ def cuDeviceSetGraphMemAttribute(device, attr not None : CUgraphMem_attribute, v
 def cuGraphClone(originalGraph):
     """ Clones a graph.
 
-    This function creates a copy of `originalGraph` and returns it in
-    `phGraphClone`. All parameters are copied into the cloned graph. The
+    This function creates a copy of ``originalGraph`` and returns it in
+    ``phGraphClone``. All parameters are copied into the cloned graph. The
     original graph may be modified after this call without affecting the
     clone.
 
@@ -48321,14 +48351,14 @@ def cuGraphClone(originalGraph):
 def cuGraphNodeFindInClone(hOriginalNode, hClonedGraph):
     """ Finds a cloned version of a node.
 
-    This function returns the node in `hClonedGraph` corresponding to
-    `hOriginalNode` in the original graph.
+    This function returns the node in ``hClonedGraph`` corresponding to
+    ``hOriginalNode`` in the original graph.
 
-    `hClonedGraph` must have been cloned from `hOriginalGraph` via
-    :py:obj:`~.cuGraphClone`. `hOriginalNode` must have been in
-    `hOriginalGraph` at the time of the call to :py:obj:`~.cuGraphClone`,
-    and the corresponding cloned node in `hClonedGraph` must not have been
-    removed. The cloned node is then returned via `phClonedNode`.
+    ``hClonedGraph`` must have been cloned from ``hOriginalGraph`` via
+    :py:obj:`~.cuGraphClone`. ``hOriginalNode`` must have been in
+    ``hOriginalGraph`` at the time of the call to :py:obj:`~.cuGraphClone`,
+    and the corresponding cloned node in ``hClonedGraph`` must not have
+    been removed. The cloned node is then returned via ``phClonedNode``.
 
     Parameters
     ----------
@@ -48378,7 +48408,7 @@ def cuGraphNodeFindInClone(hOriginalNode, hClonedGraph):
 def cuGraphNodeGetType(hNode):
     """ Returns a node's type.
 
-    Returns the node type of `hNode` in `typename`.
+    Returns the node type of ``hNode`` in ``typename``.
 
     Parameters
     ----------
@@ -48418,8 +48448,8 @@ def cuGraphNodeGetType(hNode):
 def cuGraphNodeGetContainingGraph(hNode):
     """ Returns the graph that contains a given graph node.
 
-    Returns the graph that contains `hNode` in `*phGraph`. If `hNode` is in
-    a child graph, the child graph it is in is returned.
+    Returns the graph that contains ``hNode`` in ``*phGraph``. If ``hNode``
+    is in a child graph, the child graph it is in is returned.
 
     Parameters
     ----------
@@ -48430,7 +48460,7 @@ def cuGraphNodeGetContainingGraph(hNode):
     -------
     CUresult
         :py:obj:`~.CUDA_SUCCESS` :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
-    *phGraph : :py:obj:`~.CUgraph`
+    \\*phGraph : :py:obj:`~.CUgraph`
         Pointer to return the containing graph
 
     See Also
@@ -48459,9 +48489,9 @@ def cuGraphNodeGetContainingGraph(hNode):
 def cuGraphNodeGetLocalId(hNode):
     """ Returns the local node id of a given graph node.
 
-    Returns the node id of `hNode` in `*nodeId`. The nodeId matches that
-    referenced by :py:obj:`~.cuGraphDebugDotPrint`. The local nodeId and
-    graphId together can uniquely identify the node.
+    Returns the node id of ``hNode`` in ``*nodeId``. The nodeId matches
+    that referenced by :py:obj:`~.cuGraphDebugDotPrint`. The local nodeId
+    and graphId together can uniquely identify the node.
 
     Parameters
     ----------
@@ -48510,7 +48540,7 @@ def cuGraphNodeGetToolsId(hNode):
     -------
     CUresult
         :py:obj:`~.CUDA_SUCCESS` :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
-    *toolsNodeId : unsigned long long
+    \\*toolsNodeId : unsigned long long
         Pointer to return the id used by tools
 
     See Also
@@ -48539,8 +48569,8 @@ def cuGraphNodeGetToolsId(hNode):
 def cuGraphGetId(hGraph):
     """ Returns the id of a given graph.
 
-    Returns the id of `hGraph` in `*graphId`. The value in `*graphId` will
-    match that referenced by :py:obj:`~.cuGraphDebugDotPrint`.
+    Returns the id of ``hGraph`` in ``*graphId``. The value in ``*graphId``
+    will match that referenced by :py:obj:`~.cuGraphDebugDotPrint`.
 
     Parameters
     ----------
@@ -48551,7 +48581,7 @@ def cuGraphGetId(hGraph):
     -------
     CUresult
         :py:obj:`~.CUDA_SUCCESS` :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
-    *graphId : unsigned int
+    \\*graphId : unsigned int
         Pointer to return the graphId
 
     See Also
@@ -48580,8 +48610,9 @@ def cuGraphGetId(hGraph):
 def cuGraphExecGetId(hGraphExec):
     """ Returns the id of a given graph exec.
 
-    Returns the id of `hGraphExec` in `*graphId`. The value in `*graphId`
-    will match that referenced by :py:obj:`~.cuGraphDebugDotPrint`.
+    Returns the id of ``hGraphExec`` in ``*graphId``. The value in
+    ``*graphId`` will match that referenced by
+    :py:obj:`~.cuGraphDebugDotPrint`.
 
     Parameters
     ----------
@@ -48592,7 +48623,7 @@ def cuGraphExecGetId(hGraphExec):
     -------
     CUresult
         :py:obj:`~.CUDA_SUCCESS` :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
-    *graphId : unsigned int
+    \\*graphId : unsigned int
         Pointer to return the graphId
 
     See Also
@@ -48621,12 +48652,12 @@ def cuGraphExecGetId(hGraphExec):
 def cuGraphGetNodes(hGraph, size_t numNodes = 0):
     """ Returns a graph's nodes.
 
-    Returns a list of `hGraph's` nodes. `nodes` may be NULL, in which case
-    this function will return the number of nodes in `numNodes`. Otherwise,
-    `numNodes` entries will be filled in. If `numNodes` is higher than the
-    actual number of nodes, the remaining entries in `nodes` will be set to
-    NULL, and the number of nodes actually obtained will be returned in
-    `numNodes`.
+    Returns a list of ``hGraph``'s nodes. ``nodes`` may be NULL, in which
+    case this function will return the number of nodes in ``numNodes``.
+    Otherwise, ``numNodes`` entries will be filled in. If ``numNodes`` is
+    higher than the actual number of nodes, the remaining entries in
+    ``nodes`` will be set to NULL, and the number of nodes actually
+    obtained will be returned in ``numNodes``.
 
     Parameters
     ----------
@@ -48682,12 +48713,12 @@ def cuGraphGetNodes(hGraph, size_t numNodes = 0):
 def cuGraphGetRootNodes(hGraph, size_t numRootNodes = 0):
     """ Returns a graph's root nodes.
 
-    Returns a list of `hGraph's` root nodes. `rootNodes` may be NULL, in
-    which case this function will return the number of root nodes in
-    `numRootNodes`. Otherwise, `numRootNodes` entries will be filled in. If
-    `numRootNodes` is higher than the actual number of root nodes, the
-    remaining entries in `rootNodes` will be set to NULL, and the number of
-    nodes actually obtained will be returned in `numRootNodes`.
+    Returns a list of ``hGraph``'s root nodes. ``rootNodes`` may be NULL,
+    in which case this function will return the number of root nodes in
+    ``numRootNodes``. Otherwise, ``numRootNodes`` entries will be filled
+    in. If ``numRootNodes`` is higher than the actual number of root nodes,
+    the remaining entries in ``rootNodes`` will be set to NULL, and the
+    number of nodes actually obtained will be returned in ``numRootNodes``.
 
     Parameters
     ----------
@@ -48743,19 +48774,19 @@ def cuGraphGetRootNodes(hGraph, size_t numRootNodes = 0):
 def cuGraphGetEdges(hGraph, size_t numEdges = 0):
     """ Returns a graph's dependency edges.
 
-    Returns a list of `hGraph's` dependency edges. Edges are returned via
-    corresponding indices in `from`, `to` and `edgeData`; that is, the node
-    in `to`[i] has a dependency on the node in `from`[i] with data
-    `edgeData`[i]. `from` and `to` may both be NULL, in which case this
-    function only returns the number of edges in `numEdges`. Otherwise,
-    `numEdges` entries will be filled in. If `numEdges` is higher than the
-    actual number of edges, the remaining entries in `from` and `to` will
-    be set to NULL, and the number of edges actually returned will be
-    written to `numEdges`. `edgeData` may alone be NULL, in which case the
-    edges must all have default (zeroed) edge data. Attempting a lossy
-    query via NULL `edgeData` will result in
-    :py:obj:`~.CUDA_ERROR_LOSSY_QUERY`. If `edgeData` is non-NULL then
-    `from` and `to` must be as well.
+    Returns a list of ``hGraph``'s dependency edges. Edges are returned via
+    corresponding indices in ``from``, ``to`` and ``edgeData``; that is,
+    the node in ``to[i]`` has a dependency on the node in ``from[i]`` with
+    data ``edgeData[i]``. ``from`` and ``to`` may both be NULL, in which
+    case this function only returns the number of edges in ``numEdges``.
+    Otherwise, ``numEdges`` entries will be filled in. If ``numEdges`` is
+    higher than the actual number of edges, the remaining entries in
+    ``from`` and ``to`` will be set to NULL, and the number of edges
+    actually returned will be written to ``numEdges``. ``edgeData`` may
+    alone be NULL, in which case the edges must all have default (zeroed)
+    edge data. Attempting a lossy query via NULL ``edgeData`` will result
+    in :py:obj:`~.CUDA_ERROR_LOSSY_QUERY`. If ``edgeData`` is non-NULL then
+    ``from`` and ``to`` must be as well.
 
     Parameters
     ----------
@@ -48839,18 +48870,18 @@ def cuGraphGetEdges(hGraph, size_t numEdges = 0):
 def cuGraphNodeGetDependencies(hNode, size_t numDependencies = 0):
     """ Returns a node's dependencies.
 
-    Returns a list of `node's` dependencies. `dependencies` may be NULL, in
-    which case this function will return the number of dependencies in
-    `numDependencies`. Otherwise, `numDependencies` entries will be filled
-    in. If `numDependencies` is higher than the actual number of
-    dependencies, the remaining entries in `dependencies` will be set to
-    NULL, and the number of nodes actually obtained will be returned in
-    `numDependencies`.
+    Returns a list of ``hNode``'s dependencies. ``dependencies`` may be
+    NULL, in which case this function will return the number of
+    dependencies in ``numDependencies``. Otherwise, ``numDependencies``
+    entries will be filled in. If ``numDependencies`` is higher than the
+    actual number of dependencies, the remaining entries in
+    ``dependencies`` will be set to NULL, and the number of nodes actually
+    obtained will be returned in ``numDependencies``.
 
     Note that if an edge has non-zero (non-default) edge data and
-    `edgeData` is NULL, this API will return
-    :py:obj:`~.CUDA_ERROR_LOSSY_QUERY`. If `edgeData` is non-NULL, then
-    `dependencies` must be as well.
+    ``edgeData`` is NULL, this API will return
+    :py:obj:`~.CUDA_ERROR_LOSSY_QUERY`. If ``edgeData`` is non-NULL, then
+    ``dependencies`` must be as well.
 
     Parameters
     ----------
@@ -48920,18 +48951,18 @@ def cuGraphNodeGetDependencies(hNode, size_t numDependencies = 0):
 def cuGraphNodeGetDependentNodes(hNode, size_t numDependentNodes = 0):
     """ Returns a node's dependent nodes.
 
-    Returns a list of `node's` dependent nodes. `dependentNodes` may be
+    Returns a list of ``hNode``'s dependent nodes. ``dependentNodes`` may be
     NULL, in which case this function will return the number of dependent
-    nodes in `numDependentNodes`. Otherwise, `numDependentNodes` entries
-    will be filled in. If `numDependentNodes` is higher than the actual
-    number of dependent nodes, the remaining entries in `dependentNodes`
-    will be set to NULL, and the number of nodes actually obtained will be
-    returned in `numDependentNodes`.
+    nodes in ``numDependentNodes``. Otherwise, ``numDependentNodes``
+    entries will be filled in. If ``numDependentNodes`` is higher than the
+    actual number of dependent nodes, the remaining entries in
+    ``dependentNodes`` will be set to NULL, and the number of nodes
+    actually obtained will be returned in ``numDependentNodes``.
 
     Note that if an edge has non-zero (non-default) edge data and
-    `edgeData` is NULL, this API will return
-    :py:obj:`~.CUDA_ERROR_LOSSY_QUERY`. If `edgeData` is non-NULL, then
-    `dependentNodes` must be as well.
+    ``edgeData`` is NULL, this API will return
+    :py:obj:`~.CUDA_ERROR_LOSSY_QUERY`. If ``edgeData`` is non-NULL, then
+    ``dependentNodes`` must be as well.
 
     Parameters
     ----------
@@ -49001,12 +49032,13 @@ def cuGraphNodeGetDependentNodes(hNode, size_t numDependentNodes = 0):
 def cuGraphAddDependencies(hGraph, from_ : Optional[tuple[CUgraphNode] | list[CUgraphNode]], to : Optional[tuple[CUgraphNode] | list[CUgraphNode]], edgeData : Optional[tuple[CUgraphEdgeData] | list[CUgraphEdgeData]], size_t numDependencies):
     """ Adds dependency edges to a graph.
 
-    The number of dependencies to be added is defined by `numDependencies`
-    Elements in `from` and `to` at corresponding indices define a
-    dependency. Each node in `from` and `to` must belong to `hGraph`.
+    The number of dependencies to be added is defined by
+    ``numDependencies``. Elements in ``from`` and ``to`` at corresponding
+    indices define a dependency. Each node in ``from`` and ``to`` must
+    belong to ``hGraph``.
 
-    If `numDependencies` is 0, elements in `from` and `to` will be ignored.
-    Specifying an existing dependency will return an error.
+    If ``numDependencies`` is 0, elements in ``from`` and ``to`` will be
+    ignored. Specifying an existing dependency will return an error.
 
     Parameters
     ----------
@@ -49094,15 +49126,15 @@ def cuGraphAddDependencies(hGraph, from_ : Optional[tuple[CUgraphNode] | list[CU
 def cuGraphRemoveDependencies(hGraph, from_ : Optional[tuple[CUgraphNode] | list[CUgraphNode]], to : Optional[tuple[CUgraphNode] | list[CUgraphNode]], edgeData : Optional[tuple[CUgraphEdgeData] | list[CUgraphEdgeData]], size_t numDependencies):
     """ Removes dependency edges from a graph.
 
-    The number of `dependencies` to be removed is defined by
-    `numDependencies`. Elements in `from` and `to` at corresponding indices
-    define a dependency. Each node in `from` and `to` must belong to
-    `hGraph`.
+    The number of dependencies to be removed is defined by
+    ``numDependencies``. Elements in ``from`` and ``to`` at corresponding
+    indices define a dependency. Each node in ``from`` and ``to`` must
+    belong to ``hGraph``.
 
-    If `numDependencies` is 0, elements in `from` and `to` will be ignored.
-    Specifying an edge that does not exist in the graph, with data matching
-    `edgeData`, results in an error. `edgeData` is nullable, which is
-    equivalent to passing default (zeroed) data for each edge.
+    If ``numDependencies`` is 0, elements in ``from`` and ``to`` will be
+    ignored. Specifying an edge that does not exist in the graph, with data
+    matching ``edgeData``, results in an error. ``edgeData`` is nullable,
+    which is equivalent to passing default (zeroed) data for each edge.
 
     Dependencies cannot be removed from graphs which contain allocation or
     free nodes. Any attempt to do so will return an error.
@@ -49193,8 +49225,8 @@ def cuGraphRemoveDependencies(hGraph, from_ : Optional[tuple[CUgraphNode] | list
 def cuGraphDestroyNode(hNode):
     """ Remove a node from the graph.
 
-    Removes `hNode` from its graph. This operation also severs any
-    dependencies of other nodes on `hNode` and vice versa.
+    Removes ``hNode`` from its graph. This operation also severs any
+    dependencies of other nodes on ``hNode`` and vice versa.
 
     Nodes which belong to a graph which contains allocation or free nodes
     cannot be destroyed. Any attempt to do so will return an error.
@@ -49232,12 +49264,12 @@ def cuGraphDestroyNode(hNode):
 def cuGraphInstantiate(hGraph, unsigned long long flags):
     """ Creates an executable graph from a graph.
 
-    Instantiates `hGraph` as an executable graph. The graph is validated
+    Instantiates ``hGraph`` as an executable graph. The graph is validated
     for any structural constraints or intra-node constraints which were not
     previously validated. If instantiation is successful, a handle to the
-    instantiated graph is returned in `phGraphExec`.
+    instantiated graph is returned in ``phGraphExec``.
 
-    The `flags` parameter controls the behavior of instantiation and
+    The ``flags`` parameter controls the behavior of instantiation and
     subsequent graph launches. Valid flags are:
 
     - :py:obj:`~.CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH`, which
@@ -49259,13 +49291,14 @@ def cuGraphInstantiate(hGraph, unsigned long long flags):
       that priorities are only available on kernel nodes, and are copied
       from stream priority during stream capture.
 
-    If `hGraph` contains any allocation or free nodes, there can be at most
-    one executable graph in existence for that graph at a time. An attempt
-    to instantiate a second executable graph before destroying the first
-    with :py:obj:`~.cuGraphExecDestroy` will result in an error. The same
-    also applies if `hGraph` contains any device-updatable kernel nodes.
+    If ``hGraph`` contains any allocation or free nodes, there can be at
+    most one executable graph in existence for that graph at a time. An
+    attempt to instantiate a second executable graph before destroying the
+    first with :py:obj:`~.cuGraphExecDestroy` will result in an error. The
+    same also applies if ``hGraph`` contains any device-updatable kernel
+    nodes.
 
-    If `hGraph` contains kernels which call device-side cudaGraphLaunch()
+    If ``hGraph`` contains kernels which call device-side cudaGraphLaunch()
     from multiple contexts, this will result in an error.
 
     Graphs instantiated for launch on the device have additional
@@ -49337,21 +49370,21 @@ def cuGraphInstantiate(hGraph, unsigned long long flags):
 def cuGraphInstantiateWithParams(hGraph, instantiateParams : Optional[CUDA_GRAPH_INSTANTIATE_PARAMS]):
     """ Creates an executable graph from a graph.
 
-    Instantiates `hGraph` as an executable graph according to the
-    `instantiateParams` structure. The graph is validated for any
+    Instantiates ``hGraph`` as an executable graph according to the
+    ``instantiateParams`` structure. The graph is validated for any
     structural constraints or intra-node constraints which were not
     previously validated. If instantiation is successful, a handle to the
-    instantiated graph is returned in `phGraphExec`.
+    instantiated graph is returned in ``phGraphExec``.
 
-    `instantiateParams` controls the behavior of instantiation and
+    ``instantiateParams`` controls the behavior of instantiation and
     subsequent graph launches, as well as returning more detailed
     information in the event of an error.
     :py:obj:`~.CUDA_GRAPH_INSTANTIATE_PARAMS` is defined as:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
-    The `flags` field controls the behavior of instantiation and subsequent
-    graph launches. Valid flags are:
+    The ``flags`` field controls the behavior of instantiation and
+    subsequent graph launches. Valid flags are:
 
     - :py:obj:`~.CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH`, which
       configures a graph containing memory allocation nodes to
@@ -49359,7 +49392,7 @@ def cuGraphInstantiateWithParams(hGraph, instantiateParams : Optional[CUDA_GRAPH
       relaunched.
 
     - :py:obj:`~.CUDA_GRAPH_INSTANTIATE_FLAG_UPLOAD`, which will perform an
-      upload of the graph into `hUploadStream` once the graph has been
+      upload of the graph into ``hUploadStream`` once the graph has been
       instantiated.
 
     - :py:obj:`~.CUDA_GRAPH_INSTANTIATE_FLAG_DEVICE_LAUNCH`, which
@@ -49376,13 +49409,14 @@ def cuGraphInstantiateWithParams(hGraph, instantiateParams : Optional[CUDA_GRAPH
       that priorities are only available on kernel nodes, and are copied
       from stream priority during stream capture.
 
-    If `hGraph` contains any allocation or free nodes, there can be at most
-    one executable graph in existence for that graph at a time. An attempt
-    to instantiate a second executable graph before destroying the first
-    with :py:obj:`~.cuGraphExecDestroy` will result in an error. The same
-    also applies if `hGraph` contains any device-updatable kernel nodes.
+    If ``hGraph`` contains any allocation or free nodes, there can be at
+    most one executable graph in existence for that graph at a time. An
+    attempt to instantiate a second executable graph before destroying the
+    first with :py:obj:`~.cuGraphExecDestroy` will result in an error. The
+    same also applies if ``hGraph`` contains any device-updatable kernel
+    nodes.
 
-    If `hGraph` contains kernels which call device-side cudaGraphLaunch()
+    If ``hGraph`` contains kernels which call device-side cudaGraphLaunch()
     from multiple contexts, this will result in an error.
 
     Graphs instantiated for launch on the device have additional
@@ -49413,34 +49447,34 @@ def cuGraphInstantiateWithParams(hGraph, instantiateParams : Optional[CUDA_GRAPH
       - Both operands must be accessible from the current context, and the
         current context must match the context of other nodes in the graph.
 
-    In the event of an error, the `result_out` and `hErrNode_out` fields
-    will contain more information about the nature of the error. Possible
-    error reporting includes:
+    In the event of an error, the ``result_out`` and ``hErrNode_out``
+    fields will contain more information about the nature of the error.
+    Possible error reporting includes:
 
     - :py:obj:`~.CUDA_GRAPH_INSTANTIATE_ERROR`, if passed an invalid value
       or if an unexpected error occurred which is described by the return
-      value of the function. `hErrNode_out` will be set to NULL.
+      value of the function. ``hErrNode_out`` will be set to NULL.
 
     - :py:obj:`~.CUDA_GRAPH_INSTANTIATE_INVALID_STRUCTURE`, if the graph
-      structure is invalid. `hErrNode_out` will be set to one of the
+      structure is invalid. ``hErrNode_out`` will be set to one of the
       offending nodes.
 
     - :py:obj:`~.CUDA_GRAPH_INSTANTIATE_NODE_OPERATION_NOT_SUPPORTED`, if
       the graph is instantiated for device launch but contains a node of an
       unsupported node type, or a node which performs unsupported
       operations, such as use of CUDA dynamic parallelism within a kernel
-      node. `hErrNode_out` will be set to this node.
+      node. ``hErrNode_out`` will be set to this node.
 
     - :py:obj:`~.CUDA_GRAPH_INSTANTIATE_MULTIPLE_CTXS_NOT_SUPPORTED`, if
       the graph is instantiated for device launch but a node’s context
       differs from that of another node. This error can also be returned if
       a graph is not instantiated for device launch and it contains kernels
       which call device-side cudaGraphLaunch() from multiple contexts.
-      `hErrNode_out` will be set to this node.
+      ``hErrNode_out`` will be set to this node.
 
-    If instantiation is successful, `result_out` will be set to
-    :py:obj:`~.CUDA_GRAPH_INSTANTIATE_SUCCESS`, and `hErrNode_out` will be
-    set to NULL.
+    If instantiation is successful, ``result_out`` will be set to
+    :py:obj:`~.CUDA_GRAPH_INSTANTIATE_SUCCESS`, and ``hErrNode_out`` will
+    be set to NULL.
 
     Parameters
     ----------
@@ -49527,13 +49561,13 @@ def cuGraphExecKernelNodeSetParams(hGraphExec, hNode, nodeParams : Optional[CUDA
     """ Sets the parameters for a kernel node in the given graphExec.
 
     Sets the parameters of a kernel node in an executable graph
-    `hGraphExec`. The node is identified by the corresponding node `hNode`
-    in the non-executable graph, from which the executable graph was
-    instantiated.
+    ``hGraphExec``. The node is identified by the corresponding node
+    ``hNode`` in the non-executable graph, from which the executable graph
+    was instantiated.
 
-    `hNode` must not have been removed from the original graph. All
-    `nodeParams` fields may change, but the following restrictions apply to
-    `func` updates:
+    ``hNode`` must not have been removed from the original graph. All
+    ``nodeParams`` fields may change, but the following restrictions apply
+    to ``func`` updates:
 
     - The owning context of the function cannot change.
 
@@ -49544,19 +49578,20 @@ def cuGraphExecKernelNodeSetParams(hGraphExec, hNode, nodeParams : Optional[CUDA
       calls cannot be updated to a function which makes device-side update
       calls.
 
-    - If `hGraphExec` was not instantiated for device launch, a node whose
-      function originally did not use device-side cudaGraphLaunch() cannot
-      be updated to a function which uses device-side cudaGraphLaunch()
-      unless the node resides on the same context as nodes which contained
-      such calls at instantiate-time. If no such calls were present at
-      instantiation, these updates cannot be performed at all.
+    - If ``hGraphExec`` was not instantiated for device launch, a node
+      whose function originally did not use device-side cudaGraphLaunch()
+      cannot be updated to a function which uses device-side
+      cudaGraphLaunch() unless the node resides on the same context as
+      nodes which contained such calls at instantiate-time. If no such
+      calls were present at instantiation, these updates cannot be
+      performed at all.
 
-    The modifications only affect future launches of `hGraphExec`. Already
-    enqueued or running launches of `hGraphExec` are not affected by this
-    call. `hNode` is also not modified by this call.
+    The modifications only affect future launches of ``hGraphExec``.
+    Already enqueued or running launches of ``hGraphExec`` are not affected
+    by this call. ``hNode`` is also not modified by this call.
 
-    If `hNode` is a device-updatable kernel node, the next upload/launch of
-    `hGraphExec` will overwrite any previous device-side updates.
+    If ``hNode`` is a device-updatable kernel node, the next upload/launch
+    of ``hGraphExec`` will overwrite any previous device-side updates.
     Additionally, applying host updates to a device-updatable kernel node
     while it is being updated from the device will result in undefined
     behavior.
@@ -49607,20 +49642,20 @@ def cuGraphExecKernelNodeSetParams(hGraphExec, hNode, nodeParams : Optional[CUDA
 def cuGraphExecMemcpyNodeSetParams(hGraphExec, hNode, copyParams : Optional[CUDA_MEMCPY3D], ctx):
     """ Sets the parameters for a memcpy node in the given graphExec.
 
-    Updates the work represented by `hNode` in `hGraphExec` as though
-    `hNode` had contained `copyParams` at instantiation. hNode must remain
-    in the graph which was used to instantiate `hGraphExec`. Changed edges
-    to and from hNode are ignored.
+    Updates the work represented by ``hNode`` in ``hGraphExec`` as though
+    ``hNode`` had contained ``copyParams`` at instantiation. hNode must
+    remain in the graph which was used to instantiate ``hGraphExec``.
+    Changed edges to and from hNode are ignored.
 
-    The source and destination memory in `copyParams` must be allocated
+    The source and destination memory in ``copyParams`` must be allocated
     from the same contexts as the original source and destination memory.
     Both the instantiation-time memory operands and the memory operands in
-    `copyParams` must be 1-dimensional. Zero-length operations are not
+    ``copyParams`` must be 1-dimensional. Zero-length operations are not
     supported.
 
-    The modifications only affect future launches of `hGraphExec`. Already
-    enqueued or running launches of `hGraphExec` are not affected by this
-    call. hNode is also not modified by this call.
+    The modifications only affect future launches of ``hGraphExec``.
+    Already enqueued or running launches of ``hGraphExec`` are not affected
+    by this call. hNode is also not modified by this call.
 
     Returns CUDA_ERROR_INVALID_VALUE if the memory operands' mappings
     changed or either the original or new memory operands are
@@ -49682,10 +49717,10 @@ def cuGraphExecMemcpyNodeSetParams(hGraphExec, hNode, copyParams : Optional[CUDA
 def cuGraphExecMemsetNodeSetParams(hGraphExec, hNode, memsetParams : Optional[CUDA_MEMSET_NODE_PARAMS], ctx):
     """ Sets the parameters for a memset node in the given graphExec.
 
-    Updates the work represented by `hNode` in `hGraphExec` as though
-    `hNode` had contained `memsetParams` at instantiation. hNode must
-    remain in the graph which was used to instantiate `hGraphExec`. Changed
-    edges to and from hNode are ignored.
+    Updates the work represented by ``hNode`` in ``hGraphExec`` as though
+    ``hNode`` had contained ``memsetParams`` at instantiation. hNode must
+    remain in the graph which was used to instantiate ``hGraphExec``.
+    Changed edges to and from hNode are ignored.
 
     Zero sized operations are not supported.
 
@@ -49702,9 +49737,9 @@ def cuGraphExecMemsetNodeSetParams(hGraphExec, hNode, memsetParams : Optional[CU
     resulting work maps onto the work resources already allocated for the
     node.
 
-    The modifications only affect future launches of `hGraphExec`. Already
-    enqueued or running launches of `hGraphExec` are not affected by this
-    call. hNode is also not modified by this call.
+    The modifications only affect future launches of ``hGraphExec``.
+    Already enqueued or running launches of ``hGraphExec`` are not affected
+    by this call. hNode is also not modified by this call.
 
     Parameters
     ----------
@@ -49762,14 +49797,14 @@ def cuGraphExecMemsetNodeSetParams(hGraphExec, hNode, memsetParams : Optional[CU
 def cuGraphExecHostNodeSetParams(hGraphExec, hNode, nodeParams : Optional[CUDA_HOST_NODE_PARAMS]):
     """ Sets the parameters for a host node in the given graphExec.
 
-    Updates the work represented by `hNode` in `hGraphExec` as though
-    `hNode` had contained `nodeParams` at instantiation. hNode must remain
-    in the graph which was used to instantiate `hGraphExec`. Changed edges
-    to and from hNode are ignored.
+    Updates the work represented by ``hNode`` in ``hGraphExec`` as though
+    ``hNode`` had contained ``nodeParams`` at instantiation. hNode must
+    remain in the graph which was used to instantiate ``hGraphExec``.
+    Changed edges to and from hNode are ignored.
 
-    The modifications only affect future launches of `hGraphExec`. Already
-    enqueued or running launches of `hGraphExec` are not affected by this
-    call. hNode is also not modified by this call.
+    The modifications only affect future launches of ``hGraphExec``.
+    Already enqueued or running launches of ``hGraphExec`` are not affected
+    by this call. hNode is also not modified by this call.
 
     Parameters
     ----------
@@ -49817,18 +49852,18 @@ def cuGraphExecHostNodeSetParams(hGraphExec, hNode, nodeParams : Optional[CUDA_H
 def cuGraphExecChildGraphNodeSetParams(hGraphExec, hNode, childGraph):
     """ Updates node parameters in the child graph node in the given graphExec.
 
-    Updates the work represented by `hNode` in `hGraphExec` as though the
-    nodes contained in `hNode's` graph had the parameters contained in
-    `childGraph's` nodes at instantiation. `hNode` must remain in the graph
-    which was used to instantiate `hGraphExec`. Changed edges to and from
-    `hNode` are ignored.
+    Updates the work represented by ``hNode`` in ``hGraphExec`` as though
+    the nodes contained in ``hNode``'s graph had the parameters contained
+    in ``childGraph``'s nodes at instantiation. ``hNode`` must remain in
+    the graph which was used to instantiate ``hGraphExec``. Changed edges
+    to and from ``hNode`` are ignored.
 
-    The modifications only affect future launches of `hGraphExec`. Already
-    enqueued or running launches of `hGraphExec` are not affected by this
-    call. `hNode` is also not modified by this call.
+    The modifications only affect future launches of ``hGraphExec``.
+    Already enqueued or running launches of ``hGraphExec`` are not affected
+    by this call. ``hNode`` is also not modified by this call.
 
-    The topology of `childGraph`, as well as the node insertion order, must
-    match that of the graph contained in `hNode`. See
+    The topology of ``childGraph``, as well as the node insertion order,
+    must match that of the graph contained in ``hNode``. See
     :py:obj:`~.cuGraphExecUpdate()` for a list of restrictions on what can
     be updated in an instantiated graph. The update is recursive, so child
     graph nodes contained within the top level child graph will also be
@@ -49888,13 +49923,13 @@ def cuGraphExecEventRecordNodeSetEvent(hGraphExec, hNode, event):
     """ Sets the event for an event record node in the given graphExec.
 
     Sets the event of an event record node in an executable graph
-    `hGraphExec`. The node is identified by the corresponding node `hNode`
-    in the non-executable graph, from which the executable graph was
-    instantiated.
+    ``hGraphExec``. The node is identified by the corresponding node
+    ``hNode`` in the non-executable graph, from which the executable graph
+    was instantiated.
 
-    The modifications only affect future launches of `hGraphExec`. Already
-    enqueued or running launches of `hGraphExec` are not affected by this
-    call. `hNode` is also not modified by this call.
+    The modifications only affect future launches of ``hGraphExec``.
+    Already enqueued or running launches of ``hGraphExec`` are not affected
+    by this call. ``hNode`` is also not modified by this call.
 
     Parameters
     ----------
@@ -49951,13 +49986,13 @@ def cuGraphExecEventWaitNodeSetEvent(hGraphExec, hNode, event):
     """ Sets the event for an event wait node in the given graphExec.
 
     Sets the event of an event wait node in an executable graph
-    `hGraphExec`. The node is identified by the corresponding node `hNode`
-    in the non-executable graph, from which the executable graph was
-    instantiated.
+    ``hGraphExec``. The node is identified by the corresponding node
+    ``hNode`` in the non-executable graph, from which the executable graph
+    was instantiated.
 
-    The modifications only affect future launches of `hGraphExec`. Already
-    enqueued or running launches of `hGraphExec` are not affected by this
-    call. `hNode` is also not modified by this call.
+    The modifications only affect future launches of ``hGraphExec``.
+    Already enqueued or running launches of ``hGraphExec`` are not affected
+    by this call. ``hNode`` is also not modified by this call.
 
     Parameters
     ----------
@@ -50014,17 +50049,17 @@ def cuGraphExecExternalSemaphoresSignalNodeSetParams(hGraphExec, hNode, nodePara
     """ Sets the parameters for an external semaphore signal node in the given graphExec.
 
     Sets the parameters of an external semaphore signal node in an
-    executable graph `hGraphExec`. The node is identified by the
-    corresponding node `hNode` in the non-executable graph, from which the
-    executable graph was instantiated.
+    executable graph ``hGraphExec``. The node is identified by the
+    corresponding node ``hNode`` in the non-executable graph, from which
+    the executable graph was instantiated.
 
-    `hNode` must not have been removed from the original graph.
+    ``hNode`` must not have been removed from the original graph.
 
-    The modifications only affect future launches of `hGraphExec`. Already
-    enqueued or running launches of `hGraphExec` are not affected by this
-    call. `hNode` is also not modified by this call.
+    The modifications only affect future launches of ``hGraphExec``.
+    Already enqueued or running launches of ``hGraphExec`` are not affected
+    by this call. ``hNode`` is also not modified by this call.
 
-    Changing `nodeParams->numExtSems` is not supported.
+    Changing ``nodeParams->numExtSems`` is not supported.
 
     Parameters
     ----------
@@ -50074,17 +50109,17 @@ def cuGraphExecExternalSemaphoresWaitNodeSetParams(hGraphExec, hNode, nodeParams
     """ Sets the parameters for an external semaphore wait node in the given graphExec.
 
     Sets the parameters of an external semaphore wait node in an executable
-    graph `hGraphExec`. The node is identified by the corresponding node
-    `hNode` in the non-executable graph, from which the executable graph
+    graph ``hGraphExec``. The node is identified by the corresponding node
+    ``hNode`` in the non-executable graph, from which the executable graph
     was instantiated.
 
-    `hNode` must not have been removed from the original graph.
+    ``hNode`` must not have been removed from the original graph.
 
-    The modifications only affect future launches of `hGraphExec`. Already
-    enqueued or running launches of `hGraphExec` are not affected by this
-    call. `hNode` is also not modified by this call.
+    The modifications only affect future launches of ``hGraphExec``.
+    Already enqueued or running launches of ``hGraphExec`` are not affected
+    by this call. ``hNode`` is also not modified by this call.
 
-    Changing `nodeParams->numExtSems` is not supported.
+    Changing ``nodeParams->numExtSems`` is not supported.
 
     Parameters
     ----------
@@ -50133,22 +50168,22 @@ def cuGraphExecExternalSemaphoresWaitNodeSetParams(hGraphExec, hNode, nodeParams
 def cuGraphNodeSetEnabled(hGraphExec, hNode, unsigned int isEnabled):
     """ Enables or disables the specified node in the given graphExec.
 
-    Sets `hNode` to be either enabled or disabled. Disabled nodes are
+    Sets ``hNode`` to be either enabled or disabled. Disabled nodes are
     functionally equivalent to empty nodes until they are reenabled.
     Existing node parameters are not affected by disabling/enabling the
     node.
 
-    The node is identified by the corresponding node `hNode` in the non-
+    The node is identified by the corresponding node ``hNode`` in the non-
     executable graph, from which the executable graph was instantiated.
 
-    `hNode` must not have been removed from the original graph.
+    ``hNode`` must not have been removed from the original graph.
 
-    The modifications only affect future launches of `hGraphExec`. Already
-    enqueued or running launches of `hGraphExec` are not affected by this
-    call. `hNode` is also not modified by this call.
+    The modifications only affect future launches of ``hGraphExec``.
+    Already enqueued or running launches of ``hGraphExec`` are not affected
+    by this call. ``hNode`` is also not modified by this call.
 
-    If `hNode` is a device-updatable kernel node, the next upload/launch of
-    `hGraphExec` will overwrite any previous device-side updates.
+    If ``hNode`` is a device-updatable kernel node, the next upload/launch
+    of ``hGraphExec`` will overwrite any previous device-side updates.
     Additionally, applying host updates to a device-updatable kernel node
     while it is being updated from the device will result in undefined
     behavior.
@@ -50202,12 +50237,13 @@ def cuGraphNodeSetEnabled(hGraphExec, hNode, unsigned int isEnabled):
 def cuGraphNodeGetEnabled(hGraphExec, hNode):
     """ Query whether a node in the given graphExec is enabled.
 
-    Sets isEnabled to 1 if `hNode` is enabled, or 0 if `hNode` is disabled.
+    Sets isEnabled to 1 if ``hNode`` is enabled, or 0 if ``hNode`` is
+    disabled.
 
-    The node is identified by the corresponding node `hNode` in the non-
+    The node is identified by the corresponding node ``hNode`` in the non-
     executable graph, from which the executable graph was instantiated.
 
-    `hNode` must not have been removed from the original graph.
+    ``hNode`` must not have been removed from the original graph.
 
     Parameters
     ----------
@@ -50263,11 +50299,11 @@ def cuGraphNodeGetEnabled(hGraphExec, hNode):
 def cuGraphUpload(hGraphExec, hStream):
     """ Uploads an executable graph in a stream.
 
-    Uploads `hGraphExec` to the device in `hStream` without executing it.
-    Uploads of the same `hGraphExec` will be serialized. Each upload is
-    ordered behind both any previous work in `hStream` and any previous
-    launches of `hGraphExec`. Uses memory cached by `stream` to back the
-    allocations owned by `hGraphExec`.
+    Uploads ``hGraphExec`` to the device in ``hStream`` without executing
+    it. Uploads of the same ``hGraphExec`` will be serialized. Each upload
+    is ordered behind both any previous work in ``hStream`` and any
+    previous launches of ``hGraphExec``. Uses memory cached by ``hStream``
+    to back the allocations owned by ``hGraphExec``.
 
     Parameters
     ----------
@@ -50312,14 +50348,14 @@ def cuGraphUpload(hGraphExec, hStream):
 def cuGraphLaunch(hGraphExec, hStream):
     """ Launches an executable graph in a stream.
 
-    Executes `hGraphExec` in `hStream`. Only one instance of `hGraphExec`
-    may be executing at a time. Each launch is ordered behind both any
-    previous work in `hStream` and any previous launches of `hGraphExec`.
-    To execute a graph concurrently, it must be instantiated multiple times
-    into multiple executable graphs.
+    Executes ``hGraphExec`` in ``hStream``. Only one instance of
+    ``hGraphExec`` may be executing at a time. Each launch is ordered
+    behind both any previous work in ``hStream`` and any previous launches
+    of ``hGraphExec``. To execute a graph concurrently, it must be
+    instantiated multiple times into multiple executable graphs.
 
-    If any allocations created by `hGraphExec` remain unfreed (from a
-    previous launch) and `hGraphExec` was not instantiated with
+    If any allocations created by ``hGraphExec`` remain unfreed (from a
+    previous launch) and ``hGraphExec`` was not instantiated with
     :py:obj:`~.CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH`, the launch
     will fail with :py:obj:`~.CUDA_ERROR_INVALID_VALUE`.
 
@@ -50366,9 +50402,9 @@ def cuGraphLaunch(hGraphExec, hStream):
 def cuGraphExecDestroy(hGraphExec):
     """ Destroys an executable graph.
 
-    Destroys the executable graph specified by `hGraphExec`, as well as all
-    of its executable nodes. If the executable graph is in-flight, it will
-    not be terminated, but rather freed asynchronously on completion.
+    Destroys the executable graph specified by ``hGraphExec``, as well as
+    all of its executable nodes. If the executable graph is in-flight, it
+    will not be terminated, but rather freed asynchronously on completion.
 
     Parameters
     ----------
@@ -50403,7 +50439,8 @@ def cuGraphExecDestroy(hGraphExec):
 def cuGraphDestroy(hGraph):
     """ Destroys a graph.
 
-    Destroys the graph specified by `hGraph`, as well as all of its nodes.
+    Destroys the graph specified by ``hGraph``, as well as all of its
+    nodes.
 
     Parameters
     ----------
@@ -50439,8 +50476,8 @@ def cuGraphExecUpdate(hGraphExec, hGraph):
     """ Check whether an executable graph can be updated with a graph and perform the update if possible.
 
     Updates the node parameters in the instantiated graph specified by
-    `hGraphExec` with the node parameters in a topologically identical
-    graph specified by `hGraph`.
+    ``hGraphExec`` with the node parameters in a topologically identical
+    graph specified by ``hGraph``.
 
     Limitations:
 
@@ -50464,7 +50501,7 @@ def cuGraphExecUpdate(hGraphExec, hGraph):
         requested priority values, before they are clamped to the device's
         supported range.
 
-      - If `hGraphExec` was not instantiated for device launch, a node
+      - If ``hGraphExec`` was not instantiated for device launch, a node
         whose function originally did not use device-side cudaGraphLaunch()
         cannot be updated to a function which uses device-side
         cudaGraphLaunch() unless the node resides on the same context as
@@ -50472,7 +50509,7 @@ def cuGraphExecUpdate(hGraphExec, hGraph):
         calls were present at instantiation, these updates cannot be
         performed at all.
 
-      - Neither `hGraph` nor `hGraphExec` may contain device-updatable
+      - Neither ``hGraph`` nor ``hGraphExec`` may contain device-updatable
         kernel nodes.
 
     - Memset and memcpy nodes:
@@ -50511,29 +50548,29 @@ def cuGraphExecUpdate(hGraphExec, hGraph):
     Note: The API may add further restrictions in future releases. The
     return code should always be checked.
 
-    cuGraphExecUpdate sets the result member of `resultInfo` to
+    cuGraphExecUpdate sets the result member of ``resultInfo`` to
     CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED under the following
     conditions:
 
-    - The count of nodes directly in `hGraphExec` and `hGraph` differ, in
-      which case resultInfo->errorNode is set to NULL.
+    - The count of nodes directly in ``hGraphExec`` and ``hGraph`` differ,
+      in which case resultInfo->errorNode is set to NULL.
 
-    - `hGraph` has more exit nodes than `hGraph`, in which case
+    - ``hGraph`` has more exit nodes than ``hGraphExec``, in which case
       resultInfo->errorNode is set to one of the exit nodes in hGraph.
 
-    - A node in `hGraph` has a different number of dependencies than the
-      node from `hGraphExec` it is paired with, in which case
-      resultInfo->errorNode is set to the node from `hGraph`.
+    - A node in ``hGraph`` has a different number of dependencies than the
+      node from ``hGraphExec`` it is paired with, in which case
+      resultInfo->errorNode is set to the node from ``hGraph``.
 
-    - A node in `hGraph` has a dependency that does not match with the
-      corresponding dependency of the paired node from `hGraphExec`.
-      resultInfo->errorNode will be set to the node from `hGraph`.
+    - A node in ``hGraph`` has a dependency that does not match with the
+      corresponding dependency of the paired node from ``hGraphExec``.
+      resultInfo->errorNode will be set to the node from ``hGraph``.
       resultInfo->errorFromNode will be set to the mismatched dependency.
       The dependencies are paired based on edge order and a dependency does
       not match when the nodes are already paired based on other edges
       examined in the graph.
 
-    cuGraphExecUpdate sets the result member of `resultInfo` to:
+    cuGraphExecUpdate sets the result member of ``resultInfo`` to:
 
     - CU_GRAPH_EXEC_UPDATE_ERROR if passed an invalid value.
 
@@ -50541,27 +50578,27 @@ def cuGraphExecUpdate(hGraphExec, hGraph):
       changed
 
     - CU_GRAPH_EXEC_UPDATE_ERROR_NODE_TYPE_CHANGED if the type of a node
-      changed, in which case `hErrorNode_out` is set to the node from
-      `hGraph`.
+      changed, in which case ``hErrorNode_out`` is set to the node from
+      ``hGraph``.
 
     - CU_GRAPH_EXEC_UPDATE_ERROR_UNSUPPORTED_FUNCTION_CHANGE if the
       function changed in an unsupported way(see note above), in which case
-      `hErrorNode_out` is set to the node from `hGraph`
+      ``hErrorNode_out`` is set to the node from ``hGraph``.
 
     - CU_GRAPH_EXEC_UPDATE_ERROR_PARAMETERS_CHANGED if any parameters to a
       node changed in a way that is not supported, in which case
-      `hErrorNode_out` is set to the node from `hGraph`.
+      ``hErrorNode_out`` is set to the node from ``hGraph``.
 
     - CU_GRAPH_EXEC_UPDATE_ERROR_ATTRIBUTES_CHANGED if any attributes of a
       node changed in a way that is not supported, in which case
-      `hErrorNode_out` is set to the node from `hGraph`.
+      ``hErrorNode_out`` is set to the node from ``hGraph``.
 
     - CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED if something about a node is
       unsupported, like the node's type or configuration, in which case
-      `hErrorNode_out` is set to the node from `hGraph`
+      ``hErrorNode_out`` is set to the node from ``hGraph``.
 
     If the update fails for a reason not listed above, the result member of
-    `resultInfo` will be set to CU_GRAPH_EXEC_UPDATE_ERROR. If the update
+    ``resultInfo`` will be set to CU_GRAPH_EXEC_UPDATE_ERROR. If the update
     succeeds, the result member will be set to
     CU_GRAPH_EXEC_UPDATE_SUCCESS.
 
@@ -50618,7 +50655,7 @@ def cuGraphExecUpdate(hGraphExec, hGraph):
 def cuGraphKernelNodeCopyAttributes(dst, src):
     """ Copies attributes from source node to destination node.
 
-    Copies attributes from source node `src` to destination node `dst`.
+    Copies attributes from source node ``src`` to destination node ``dst``.
     Both node must have the same context.
 
     Parameters
@@ -50665,8 +50702,8 @@ def cuGraphKernelNodeCopyAttributes(dst, src):
 def cuGraphKernelNodeGetAttribute(hNode, attr not None : CUkernelNodeAttrID):
     """ Queries node attribute.
 
-    Queries attribute `attr` from node `hNode` and stores it in
-    corresponding member of `value_out`.
+    Queries attribute ``attr`` from node ``hNode`` and stores it in
+    corresponding member of ``value_out``.
 
     Parameters
     ----------
@@ -50709,8 +50746,8 @@ def cuGraphKernelNodeGetAttribute(hNode, attr not None : CUkernelNodeAttrID):
 def cuGraphKernelNodeSetAttribute(hNode, attr not None : CUkernelNodeAttrID, value : Optional[CUkernelNodeAttrValue]):
     """ Sets node attribute.
 
-    Sets attribute `attr` on node `hNode` from corresponding attribute of
-    `value`.
+    Sets attribute ``attr`` on node ``hNode`` from corresponding attribute
+    of ``value``.
 
     Parameters
     ----------
@@ -50751,11 +50788,11 @@ def cuGraphKernelNodeSetAttribute(hNode, attr not None : CUkernelNodeAttrID, val
 def cuGraphDebugDotPrint(hGraph, char* path, unsigned int flags):
     """ Write a DOT file describing graph structure.
 
-    Using the provided `hGraph`, write to `path` a DOT formatted
+    Using the provided ``hGraph``, write to ``path`` a DOT formatted
     description of the graph. By default this includes the graph topology,
-    node types, node id, kernel names and memcpy direction. `flags` can be
-    specified to write more detailed information about each node type such
-    as parameter values, kernel attributes, node and function handles.
+    node types, node id, kernel names and memcpy direction. ``flags`` can
+    be specified to write more detailed information about each node type
+    such as parameter values, kernel attributes, node and function handles.
 
     Parameters
     ----------
@@ -51048,24 +51085,24 @@ def cuGraphReleaseUserObject(graph, object, unsigned int count):
 def cuGraphAddNode(hGraph, dependencies : Optional[tuple[CUgraphNode] | list[CUgraphNode]], dependencyData : Optional[tuple[CUgraphEdgeData] | list[CUgraphEdgeData]], size_t numDependencies, nodeParams : Optional[CUgraphNodeParams]):
     """ Adds a node of arbitrary type to a graph.
 
-    Creates a new node in `hGraph` described by `nodeParams` with
-    `numDependencies` dependencies specified via `dependencies`.
-    `numDependencies` may be 0. `dependencies` may be null if
-    `numDependencies` is 0. `dependencies` may not have any duplicate
+    Creates a new node in ``hGraph`` described by ``nodeParams`` with
+    ``numDependencies`` dependencies specified via ``dependencies``.
+    ``numDependencies`` may be 0. ``dependencies`` may be null if
+    ``numDependencies`` is 0. ``dependencies`` may not have any duplicate
     entries.
 
-    `nodeParams` is a tagged union. The node type should be specified in
-    the `typename` field, and type-specific parameters in the corresponding
-    union member. All unused bytes - that is, `reserved0` and all bytes
-    past the utilized union member - must be set to zero. It is recommended
-    to use brace initialization or memset to ensure all bytes are
-    initialized.
+    ``nodeParams`` is a tagged union. The node type should be specified in
+    the ``typename`` field, and type-specific parameters in the
+    corresponding union member. All unused bytes - that is, ``reserved0``
+    and all bytes past the utilized union member - must be set to zero. It
+    is recommended to use brace initialization or memset to ensure all
+    bytes are initialized.
 
-    Note that for some node types, `nodeParams` may contain "out
+    Note that for some node types, ``nodeParams`` may contain "out
     parameters" which are modified during the call, such as
-    `nodeParams->alloc.dptr`.
+    ``nodeParams->alloc.dptr``.
 
-    A handle to the new node will be returned in `phGraphNode`.
+    A handle to the new node will be returned in ``phGraphNode``.
 
     Parameters
     ----------
@@ -51144,10 +51181,10 @@ def cuGraphAddNode(hGraph, dependencies : Optional[tuple[CUgraphNode] | list[CUg
 def cuGraphNodeSetParams(hNode, nodeParams : Optional[CUgraphNodeParams]):
     """ Update a graph node's parameters.
 
-    Sets the parameters of graph node `hNode` to `nodeParams`. The node
-    type specified by `nodeParams->type` must match the type of `hNode`.
-    `nodeParams` must be fully initialized and all unused bytes (reserved,
-    padding) zeroed.
+    Sets the parameters of graph node ``hNode`` to ``nodeParams``. The node
+    type specified by ``nodeParams->type`` must match the type of
+    ``hNode``. ``nodeParams`` must be fully initialized and all unused
+    bytes (reserved, padding) zeroed.
 
     Modifying parameters is not supported for node types
     CU_GRAPH_NODE_TYPE_MEM_ALLOC and CU_GRAPH_NODE_TYPE_MEM_FREE.
@@ -51188,11 +51225,11 @@ def cuGraphNodeSetParams(hNode, nodeParams : Optional[CUgraphNodeParams]):
 def cuGraphNodeGetParams(hNode):
     """ Return a graph node's parameters.
 
-    Returns the parameters of graph node `hNode` in `*nodeParams`.
+    Returns the parameters of graph node ``hNode`` in ``*nodeParams``.
 
-    Any pointers returned in `*nodeParams` point to driver-owned memory
+    Any pointers returned in ``*nodeParams`` point to driver-owned memory
     associated with the node. This memory remains valid until the node is
-    destroyed. Any memory pointed to from `*nodeParams` must not be
+    destroyed. Any memory pointed to from ``*nodeParams`` must not be
     modified.
 
     The returned parameters are a description of the node, but may not be
@@ -51241,14 +51278,14 @@ def cuGraphNodeGetParams(hNode):
 def cuGraphExecNodeSetParams(hGraphExec, hNode, nodeParams : Optional[CUgraphNodeParams]):
     """ Update a graph node's parameters in an instantiated graph.
 
-    Sets the parameters of a node in an executable graph `hGraphExec`. The
-    node is identified by the corresponding node `hNode` in the non-
+    Sets the parameters of a node in an executable graph ``hGraphExec``.
+    The node is identified by the corresponding node ``hNode`` in the non-
     executable graph from which the executable graph was instantiated.
-    `hNode` must not have been removed from the original graph.
+    ``hNode`` must not have been removed from the original graph.
 
-    The modifications only affect future launches of `hGraphExec`. Already
-    enqueued or running launches of `hGraphExec` are not affected by this
-    call. `hNode` is also not modified by this call.
+    The modifications only affect future launches of ``hGraphExec``.
+    Already enqueued or running launches of ``hGraphExec`` are not affected
+    by this call. ``hNode`` is also not modified by this call.
 
     Allowed changes to parameters on executable graphs are as follows:
 
@@ -51301,7 +51338,7 @@ def cuGraphExecNodeSetParams(hGraphExec, hNode, nodeParams : Optional[CUgraphNod
 def cuGraphConditionalHandleCreate(hGraph, ctx, unsigned int defaultLaunchValue, unsigned int flags):
     """ Create a conditional handle.
 
-    Creates a conditional handle associated with `hGraph`.
+    Creates a conditional handle associated with ``hGraph``.
 
     The conditional handle must be associated with a conditional node in
     this graph or one of its children.
@@ -51321,7 +51358,7 @@ def cuGraphConditionalHandleCreate(hGraph, ctx, unsigned int defaultLaunchValue,
     defaultLaunchValue : unsigned int
         Optional initial value for the conditional variable. Applied at the
         beginning of each graph execution if CU_GRAPH_COND_ASSIGN_DEFAULT
-        is set in `flags`.
+        is set in ``flags``.
     flags : unsigned int
         Currently must be CU_GRAPH_COND_ASSIGN_DEFAULT or 0.
 
@@ -51366,7 +51403,7 @@ def cuGraphConditionalHandleCreate(hGraph, ctx, unsigned int defaultLaunchValue,
 def cuOccupancyMaxActiveBlocksPerMultiprocessor(func, int blockSize, size_t dynamicSMemSize):
     """ Returns occupancy of a function.
 
-    Returns in `*numBlocks` the number of the maximum active blocks per
+    Returns in ``*numBlocks`` the number of the maximum active blocks per
     streaming multiprocessor.
 
     Note that the API can also be used with context-less kernel
@@ -51417,11 +51454,11 @@ def cuOccupancyMaxActiveBlocksPerMultiprocessor(func, int blockSize, size_t dyna
 def cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(func, int blockSize, size_t dynamicSMemSize, unsigned int flags):
     """ Returns occupancy of a function.
 
-    Returns in `*numBlocks` the number of the maximum active blocks per
+    Returns in ``*numBlocks`` the number of the maximum active blocks per
     streaming multiprocessor.
 
-    The `Flags` parameter controls how special cases are handled. The valid
-    flags are:
+    The ``flags`` parameter controls how special cases are handled. The
+    valid flags are:
 
     - :py:obj:`~.CU_OCCUPANCY_DEFAULT`, which maintains the default
       behavior as :py:obj:`~.cuOccupancyMaxActiveBlocksPerMultiprocessor`;
@@ -51486,28 +51523,28 @@ def cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(func, int blockSize, si
 def cuOccupancyMaxPotentialBlockSize(func, blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit):
     """ Suggest a launch configuration with reasonable occupancy.
 
-    Returns in `*blockSize` a reasonable block size that can achieve the
+    Returns in ``*blockSize`` a reasonable block size that can achieve the
     maximum occupancy (or, the maximum number of active warps with the
-    fewest blocks per multiprocessor), and in `*minGridSize` the minimum
+    fewest blocks per multiprocessor), and in ``*minGridSize`` the minimum
     grid size to achieve the maximum occupancy.
 
-    If `blockSizeLimit` is 0, the configurator will use the maximum block
+    If ``blockSizeLimit`` is 0, the configurator will use the maximum block
     size permitted by the device / function instead.
 
     If per-block dynamic shared memory allocation is not needed, the user
-    should leave both `blockSizeToDynamicSMemSize` and `dynamicSMemSize` as
-    0.
+    should leave both ``blockSizeToDynamicSMemSize`` and
+    ``dynamicSMemSize`` as 0.
 
     If per-block dynamic shared memory allocation is needed, then if the
     dynamic shared memory size is constant regardless of block size, the
-    size should be passed through `dynamicSMemSize`, and
-    `blockSizeToDynamicSMemSize` should be NULL.
+    size should be passed through ``dynamicSMemSize``, and
+    ``blockSizeToDynamicSMemSize`` should be NULL.
 
     Otherwise, if the per-block dynamic shared memory size varies with
     different block sizes, the user needs to provide a unary function
-    through `blockSizeToDynamicSMemSize` that computes the dynamic shared
-    memory needed by `func` for any given block size. `dynamicSMemSize` is
-    ignored. An example signature is:
+    through ``blockSizeToDynamicSMemSize`` that computes the dynamic shared
+    memory needed by ``func`` for any given block size. ``dynamicSMemSize``
+    is ignored. An example signature is:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
@@ -51523,11 +51560,11 @@ def cuOccupancyMaxPotentialBlockSize(func, blockSizeToDynamicSMemSize, size_t dy
         Kernel for which launch configuration is calculated
     blockSizeToDynamicSMemSize : :py:obj:`~.CUoccupancyB2DSize`
         A function that calculates how much per-block dynamic shared memory
-        `func` uses based on the block size
+        ``func`` uses based on the block size
     dynamicSMemSize : size_t
         Dynamic shared memory usage intended, in bytes
     blockSizeLimit : int
-        The maximum block size `func` is designed to handle
+        The maximum block size ``func`` is designed to handle
 
     Returns
     -------
@@ -51577,10 +51614,10 @@ def cuOccupancyMaxPotentialBlockSizeWithFlags(func, blockSizeToDynamicSMemSize,
     addition to arguments passed to
     :py:obj:`~.cuOccupancyMaxPotentialBlockSize`,
     :py:obj:`~.cuOccupancyMaxPotentialBlockSizeWithFlags` also takes a
-    `Flags` parameter.
+    ``flags`` parameter.
 
-    The `Flags` parameter controls how special cases are handled. The valid
-    flags are:
+    The ``flags`` parameter controls how special cases are handled. The
+    valid flags are:
 
     - :py:obj:`~.CU_OCCUPANCY_DEFAULT`, which maintains the default
       behavior as :py:obj:`~.cuOccupancyMaxPotentialBlockSize`;
@@ -51607,11 +51644,11 @@ def cuOccupancyMaxPotentialBlockSizeWithFlags(func, blockSizeToDynamicSMemSize,
         Kernel for which launch configuration is calculated
     blockSizeToDynamicSMemSize : :py:obj:`~.CUoccupancyB2DSize`
         A function that calculates how much per-block dynamic shared memory
-        `func` uses based on the block size
+        ``func`` uses based on the block size
     dynamicSMemSize : size_t
         Dynamic shared memory usage intended, in bytes
     blockSizeLimit : int
-        The maximum block size `func` is designed to handle
+        The maximum block size ``func`` is designed to handle
     flags : unsigned int
         Options
 
@@ -51657,10 +51694,10 @@ def cuOccupancyMaxPotentialBlockSizeWithFlags(func, blockSizeToDynamicSMemSize,
 
 @cython.embedsignature(True)
 def cuOccupancyAvailableDynamicSMemPerBlock(func, int numBlocks, int blockSize):
-    """ Returns dynamic shared memory available per block when launching `numBlocks` blocks on SM.
+    """ Returns dynamic shared memory available per block when launching ``numBlocks`` blocks on SM.
 
-    Returns in `*dynamicSmemSize` the maximum size of dynamic shared memory
-    to allow `numBlocks` blocks per SM.
+    Returns in ``*dynamicSmemSize`` the maximum size of dynamic shared
+    memory to allow ``numBlocks`` blocks per SM.
 
     Note that the API can also be used with context-less kernel
     :py:obj:`~.CUkernel` by querying the handle using
@@ -51704,11 +51741,11 @@ def cuOccupancyAvailableDynamicSMemPerBlock(func, int numBlocks, int blockSize):
 
 @cython.embedsignature(True)
 def cuOccupancyMaxPotentialClusterSize(func, config : Optional[CUlaunchConfig]):
-    """ Given the kernel function (`func`) and launch configuration (`config`), return the maximum cluster size in `*clusterSize`.
+    """ Given the kernel function (``func``) and launch configuration (``config``), return the maximum cluster size in ``*clusterSize``.
 
-    The cluster dimensions in `config` are ignored. If func has a required
-    cluster size set (see :py:obj:`~.cudaFuncGetAttributes` /
-    :py:obj:`~.cuFuncGetAttribute`),`*clusterSize` will reflect the
+    The cluster dimensions in ``config`` are ignored. If func has a
+    required cluster size set (see :py:obj:`~.cudaFuncGetAttributes` /
+    :py:obj:`~.cuFuncGetAttribute`), ``*clusterSize`` will reflect the
     required cluster size.
 
     By default this function will always return a value that's portable on
@@ -51722,7 +51759,7 @@ def cuOccupancyMaxPotentialClusterSize(func, config : Optional[CUlaunchConfig]):
     :py:obj:`~.cuLibraryGetKernel()` and then passing it to the API by
     casting to :py:obj:`~.CUfunction`. Here, the context to use for
     calculations will either be taken from the specified stream
-    `config->hStream` or the current context in case of NULL stream.
+    ``config->hStream`` or the current context in case of NULL stream.
 
     Parameters
     ----------
@@ -51764,7 +51801,7 @@ def cuOccupancyMaxPotentialClusterSize(func, config : Optional[CUlaunchConfig]):
 
 @cython.embedsignature(True)
 def cuOccupancyMaxActiveClusters(func, config : Optional[CUlaunchConfig]):
-    """ Given the kernel function (`func`) and launch configuration (`config`), return the maximum number of clusters that could co-exist on the target device in `*numClusters`.
+    """ Given the kernel function (``func``) and launch configuration (``config``), return the maximum number of clusters that could co-exist on the target device in ``*numClusters``.
 
     If the function has required cluster size already set (see
     :py:obj:`~.cudaFuncGetAttributes` / :py:obj:`~.cuFuncGetAttribute`),
@@ -51782,7 +51819,7 @@ def cuOccupancyMaxActiveClusters(func, config : Optional[CUlaunchConfig]):
     :py:obj:`~.cuLibraryGetKernel()` and then passing it to the API by
     casting to :py:obj:`~.CUfunction`. Here, the context to use for
     calculations will either be taken from the specified stream
-    `config->hStream` or the current context in case of NULL stream.
+    ``config->hStream`` or the current context in case of NULL stream.
 
     Parameters
     ----------
@@ -51828,11 +51865,11 @@ def cuTexRefSetArray(hTexRef, hArray, unsigned int Flags):
 
     [Deprecated]
 
-    Binds the CUDA array `hArray` to the texture reference `hTexRef`. Any
-    previous address or CUDA array state associated with the texture
-    reference is superseded by this function. `Flags` must be set to
+    Binds the CUDA array ``hArray`` to the texture reference ``hTexRef``.
+    Any previous address or CUDA array state associated with the texture
+    reference is superseded by this function. ``Flags`` must be set to
     :py:obj:`~.CU_TRSA_OVERRIDE_FORMAT`. Any CUDA array previously bound to
-    `hTexRef` is unbound.
+    ``hTexRef`` is unbound.
 
     Parameters
     ----------
@@ -51881,11 +51918,11 @@ def cuTexRefSetMipmappedArray(hTexRef, hMipmappedArray, unsigned int Flags):
 
     [Deprecated]
 
-    Binds the CUDA mipmapped array `hMipmappedArray` to the texture
-    reference `hTexRef`. Any previous address or CUDA array state
+    Binds the CUDA mipmapped array ``hMipmappedArray`` to the texture
+    reference ``hTexRef``. Any previous address or CUDA array state
     associated with the texture reference is superseded by this function.
-    `Flags` must be set to :py:obj:`~.CU_TRSA_OVERRIDE_FORMAT`. Any CUDA
-    array previously bound to `hTexRef` is unbound.
+    ``Flags`` must be set to :py:obj:`~.CU_TRSA_OVERRIDE_FORMAT`. Any CUDA
+    array previously bound to ``hTexRef`` is unbound.
 
     Parameters
     ----------
@@ -51934,27 +51971,27 @@ def cuTexRefSetAddress(hTexRef, dptr, size_t numbytes):
 
     [Deprecated]
 
-    Binds a linear address range to the texture reference `hTexRef`. Any
+    Binds a linear address range to the texture reference ``hTexRef``. Any
     previous address or CUDA array state associated with the texture
     reference is superseded by this function. Any memory previously bound
-    to `hTexRef` is unbound.
+    to ``hTexRef`` is unbound.
 
     Since the hardware enforces an alignment requirement on texture base
     addresses, :py:obj:`~.cuTexRefSetAddress()` passes back a byte offset
-    in `*ByteOffset` that must be applied to texture fetches in order to
+    in ``*ByteOffset`` that must be applied to texture fetches in order to
     read from the desired memory. This offset must be divided by the texel
     size and passed to kernels that read from the texture so they can be
     applied to the :py:obj:`~.tex1Dfetch()` function.
 
     If the device memory pointer was returned from
     :py:obj:`~.cuMemAlloc()`, the offset is guaranteed to be 0 and NULL may
-    be passed as the `ByteOffset` parameter.
+    be passed as the ``ByteOffset`` parameter.
 
     The total number of elements (or texels) in the linear address range
     cannot exceed
     :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH`. The
-    number of elements is computed as (`numbytes` / bytesPerElement), where
-    bytesPerElement is determined from the data format and number of
+    number of elements is computed as (``numbytes`` / bytesPerElement),
+    where bytesPerElement is determined from the data format and number of
     components set using :py:obj:`~.cuTexRefSetFormat()`.
 
     Parameters
@@ -52009,10 +52046,10 @@ def cuTexRefSetAddress2D(hTexRef, desc : Optional[CUDA_ARRAY_DESCRIPTOR], dptr,
 
     [Deprecated]
 
-    Binds a linear address range to the texture reference `hTexRef`. Any
+    Binds a linear address range to the texture reference ``hTexRef``. Any
     previous address or CUDA array state associated with the texture
     reference is superseded by this function. Any memory previously bound
-    to `hTexRef` is unbound.
+    to ``hTexRef`` is unbound.
 
     Using a :py:obj:`~.tex2D()` function inside a kernel requires a call to
     either :py:obj:`~.cuTexRefSetArray()` to bind the corresponding texture
@@ -52022,22 +52059,22 @@ def cuTexRefSetAddress2D(hTexRef, desc : Optional[CUDA_ARRAY_DESCRIPTOR], dptr,
     Function calls to :py:obj:`~.cuTexRefSetFormat()` cannot follow calls
     to :py:obj:`~.cuTexRefSetAddress2D()` for the same texture reference.
 
-    It is required that `dptr` be aligned to the appropriate hardware-
+    It is required that ``dptr`` be aligned to the appropriate hardware-
     specific texture alignment. You can query this value using the device
     attribute :py:obj:`~.CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT`. If an
-    unaligned `dptr` is supplied, :py:obj:`~.CUDA_ERROR_INVALID_VALUE` is
+    unaligned ``dptr`` is supplied, :py:obj:`~.CUDA_ERROR_INVALID_VALUE` is
     returned.
 
-    `Pitch` has to be aligned to the hardware-specific texture pitch
+    ``Pitch`` has to be aligned to the hardware-specific texture pitch
     alignment. This value can be queried using the device attribute
     :py:obj:`~.CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT`. If an
-    unaligned `Pitch` is supplied, :py:obj:`~.CUDA_ERROR_INVALID_VALUE` is
-    returned.
+    unaligned ``Pitch`` is supplied, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
+    is returned.
 
     Width and Height, which are specified in elements (or texels), cannot
     exceed :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH`
     and :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT`
-    respectively. `Pitch`, which is specified in bytes, cannot exceed
+    respectively. ``Pitch``, which is specified in bytes, cannot exceed
     :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH`.
 
     Parameters
@@ -52091,8 +52128,8 @@ def cuTexRefSetFormat(hTexRef, fmt not None : CUarray_format, int NumPackedCompo
     [Deprecated]
 
     Specifies the format of the data to be read by the texture reference
-    `hTexRef`. `fmt` and `NumPackedComponents` are exactly analogous to the
-    :py:obj:`~.Format` and :py:obj:`~.NumChannels` members of the
+    ``hTexRef``. ``fmt`` and ``NumPackedComponents`` are exactly analogous
+    to the :py:obj:`~.Format` and :py:obj:`~.NumChannels` members of the
     :py:obj:`~.CUDA_ARRAY_DESCRIPTOR` structure: They specify the format of
     each component and the number of components per array element.
 
@@ -52136,15 +52173,15 @@ def cuTexRefSetAddressMode(hTexRef, int dim, am not None : CUaddress_mode):
 
     [Deprecated]
 
-    Specifies the addressing mode `am` for the given dimension `dim` of the
-    texture reference `hTexRef`. If `dim` is zero, the addressing mode is
-    applied to the first parameter of the functions used to fetch from the
-    texture; if `dim` is 1, the second, and so on.
+    Specifies the addressing mode ``am`` for the given dimension ``dim`` of
+    the texture reference ``hTexRef``. If ``dim`` is zero, the addressing
+    mode is applied to the first parameter of the functions used to fetch
+    from the texture; if ``dim`` is 1, the second, and so on.
     :py:obj:`~.CUaddress_mode` is defined as:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
-    Note that this call has no effect if `hTexRef` is bound to linear
+    Note that this call has no effect if ``hTexRef`` is bound to linear
     memory. Also, if the flag, :py:obj:`~.CU_TRSF_NORMALIZED_COORDINATES`,
     is not set, the only supported address mode is
     :py:obj:`~.CU_TR_ADDRESS_MODE_CLAMP`.
@@ -52189,13 +52226,13 @@ def cuTexRefSetFilterMode(hTexRef, fm not None : CUfilter_mode):
 
     [Deprecated]
 
-    Specifies the filtering mode `fm` to be used when reading memory
-    through the texture reference `hTexRef`. :py:obj:`~.CUfilter_mode_enum`
-    is defined as:
+    Specifies the filtering mode ``fm`` to be used when reading memory
+    through the texture reference ``hTexRef``.
+    :py:obj:`~.CUfilter_mode_enum` is defined as:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
-    Note that this call has no effect if `hTexRef` is bound to linear
+    Note that this call has no effect if ``hTexRef`` is bound to linear
     memory.
 
     Parameters
@@ -52236,13 +52273,13 @@ def cuTexRefSetMipmapFilterMode(hTexRef, fm not None : CUfilter_mode):
 
     [Deprecated]
 
-    Specifies the mipmap filtering mode `fm` to be used when reading memory
-    through the texture reference `hTexRef`. :py:obj:`~.CUfilter_mode_enum`
-    is defined as:
+    Specifies the mipmap filtering mode ``fm`` to be used when reading
+    memory through the texture reference ``hTexRef``.
+    :py:obj:`~.CUfilter_mode_enum` is defined as:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
-    Note that this call has no effect if `hTexRef` is not bound to a
+    Note that this call has no effect if ``hTexRef`` is not bound to a
     mipmapped array.
 
     Parameters
@@ -52283,11 +52320,11 @@ def cuTexRefSetMipmapLevelBias(hTexRef, float bias):
 
     [Deprecated]
 
-    Specifies the mipmap level bias `bias` to be added to the specified
+    Specifies the mipmap level bias ``bias`` to be added to the specified
     mipmap level when reading memory through the texture reference
-    `hTexRef`.
+    ``hTexRef``.
 
-    Note that this call has no effect if `hTexRef` is not bound to a
+    Note that this call has no effect if ``hTexRef`` is not bound to a
     mipmapped array.
 
     Parameters
@@ -52327,11 +52364,11 @@ def cuTexRefSetMipmapLevelClamp(hTexRef, float minMipmapLevelClamp, float maxMip
 
     [Deprecated]
 
-    Specifies the min/max mipmap level clamps, `minMipmapLevelClamp` and
-    `maxMipmapLevelClamp` respectively, to be used when reading memory
-    through the texture reference `hTexRef`.
+    Specifies the min/max mipmap level clamps, ``minMipmapLevelClamp`` and
+    ``maxMipmapLevelClamp`` respectively, to be used when reading memory
+    through the texture reference ``hTexRef``.
 
-    Note that this call has no effect if `hTexRef` is not bound to a
+    Note that this call has no effect if ``hTexRef`` is not bound to a
     mipmapped array.
 
     Parameters
@@ -52373,10 +52410,10 @@ def cuTexRefSetMaxAnisotropy(hTexRef, unsigned int maxAniso):
 
     [Deprecated]
 
-    Specifies the maximum anisotropy `maxAniso` to be used when reading
-    memory through the texture reference `hTexRef`.
+    Specifies the maximum anisotropy ``maxAniso`` to be used when reading
+    memory through the texture reference ``hTexRef``.
 
-    Note that this call has no effect if `hTexRef` is bound to linear
+    Note that this call has no effect if ``hTexRef`` is bound to linear
     memory.
 
     Parameters
@@ -52416,8 +52453,8 @@ def cuTexRefSetBorderColor(hTexRef, float pBorderColor):
 
     [Deprecated]
 
-    Specifies the value of the RGBA color via the `pBorderColor` to the
-    texture reference `hTexRef`. The color value supports only float type
+    Specifies the value of the RGBA color via the ``pBorderColor`` to the
+    texture reference ``hTexRef``. The color value supports only float type
     and holds color components in the following sequence: pBorderColor[0]
     holds 'R' component pBorderColor[1] holds 'G' component pBorderColor[2]
     holds 'B' component pBorderColor[3] holds 'A' component
@@ -52464,8 +52501,9 @@ def cuTexRefSetFlags(hTexRef, unsigned int Flags):
 
     [Deprecated]
 
-    Specifies optional flags via `Flags` to specify the behavior of data
-    returned through the texture reference `hTexRef`. The valid flags are:
+    Specifies optional flags via ``Flags`` to specify the behavior of data
+    returned through the texture reference ``hTexRef``. The valid flags
+    are:
 
     - :py:obj:`~.CU_TRSF_READ_AS_INTEGER`, which suppresses the default
       behavior of having the texture promote integer data to floating point
@@ -52522,8 +52560,8 @@ def cuTexRefGetAddress(hTexRef):
 
     [Deprecated]
 
-    Returns in `*pdptr` the base address bound to the texture reference
-    `hTexRef`, or returns :py:obj:`~.CUDA_ERROR_INVALID_VALUE` if the
+    Returns in ``*pdptr`` the base address bound to the texture reference
+    ``hTexRef``, or returns :py:obj:`~.CUDA_ERROR_INVALID_VALUE` if the
     texture reference is not bound to any device memory range.
 
     Parameters
@@ -52566,8 +52604,8 @@ def cuTexRefGetArray(hTexRef):
 
     [Deprecated]
 
-    Returns in `*phArray` the CUDA array bound to the texture reference
-    `hTexRef`, or returns :py:obj:`~.CUDA_ERROR_INVALID_VALUE` if the
+    Returns in ``*phArray`` the CUDA array bound to the texture reference
+    ``hTexRef``, or returns :py:obj:`~.CUDA_ERROR_INVALID_VALUE` if the
     texture reference is not bound to any CUDA array.
 
     Parameters
@@ -52610,8 +52648,8 @@ def cuTexRefGetMipmappedArray(hTexRef):
 
     [Deprecated]
 
-    Returns in `*phMipmappedArray` the CUDA mipmapped array bound to the
-    texture reference `hTexRef`, or returns
+    Returns in ``*phMipmappedArray`` the CUDA mipmapped array bound to the
+    texture reference ``hTexRef``, or returns
     :py:obj:`~.CUDA_ERROR_INVALID_VALUE` if the texture reference is not
     bound to any CUDA mipmapped array.
 
@@ -52655,9 +52693,9 @@ def cuTexRefGetAddressMode(hTexRef, int dim):
 
     [Deprecated]
 
-    Returns in `*pam` the addressing mode corresponding to the dimension
-    `dim` of the texture reference `hTexRef`. Currently, the only valid
-    value for `dim` are 0 and 1.
+    Returns in ``*pam`` the addressing mode corresponding to the dimension
+    ``dim`` of the texture reference ``hTexRef``. Currently, the only valid
+    values for ``dim`` are 0 and 1.
 
     Parameters
     ----------
@@ -52701,8 +52739,8 @@ def cuTexRefGetFilterMode(hTexRef):
 
     [Deprecated]
 
-    Returns in `*pfm` the filtering mode of the texture reference
-    `hTexRef`.
+    Returns in ``*pfm`` the filtering mode of the texture reference
+    ``hTexRef``.
 
     Parameters
     ----------
@@ -52744,9 +52782,10 @@ def cuTexRefGetFormat(hTexRef):
 
     [Deprecated]
 
-    Returns in `*pFormat` and `*pNumChannels` the format and number of
-    components of the CUDA array bound to the texture reference `hTexRef`.
-    If `pFormat` or `pNumChannels` is NULL, it will be ignored.
+    Returns in ``*pFormat`` and ``*pNumChannels`` the format and number of
+    components of the CUDA array bound to the texture reference
+    ``hTexRef``. If ``pFormat`` or ``pNumChannels`` is NULL, it will be
+    ignored.
 
     Parameters
     ----------
@@ -52791,8 +52830,8 @@ def cuTexRefGetMipmapFilterMode(hTexRef):
 
     [Deprecated]
 
-    Returns the mipmap filtering mode in `pfm` that's used when reading
-    memory through the texture reference `hTexRef`.
+    Returns the mipmap filtering mode in ``pfm`` that's used when reading
+    memory through the texture reference ``hTexRef``.
 
     Parameters
     ----------
@@ -52834,9 +52873,9 @@ def cuTexRefGetMipmapLevelBias(hTexRef):
 
     [Deprecated]
 
-    Returns the mipmap level bias in `pBias` that's added to the specified
-    mipmap level when reading memory through the texture reference
-    `hTexRef`.
+    Returns the mipmap level bias in ``pBias`` that's added to the
+    specified mipmap level when reading memory through the texture
+    reference ``hTexRef``.
 
     Parameters
     ----------
@@ -52878,9 +52917,9 @@ def cuTexRefGetMipmapLevelClamp(hTexRef):
 
     [Deprecated]
 
-    Returns the min/max mipmap level clamps in `pminMipmapLevelClamp` and
-    `pmaxMipmapLevelClamp` that's used when reading memory through the
-    texture reference `hTexRef`.
+    Returns the min/max mipmap level clamps in ``pminMipmapLevelClamp`` and
+    ``pmaxMipmapLevelClamp`` that are used when reading memory through the
+    texture reference ``hTexRef``.
 
     Parameters
     ----------
@@ -52925,8 +52964,8 @@ def cuTexRefGetMaxAnisotropy(hTexRef):
 
     [Deprecated]
 
-    Returns the maximum anisotropy in `pmaxAniso` that's used when reading
-    memory through the texture reference `hTexRef`.
+    Returns the maximum anisotropy in ``pmaxAniso`` that's used when
+    reading memory through the texture reference ``hTexRef``.
 
     Parameters
     ----------
@@ -52968,11 +53007,11 @@ def cuTexRefGetBorderColor(hTexRef):
 
     [Deprecated]
 
-    Returns in `pBorderColor`, values of the RGBA color used by the texture
-    reference `hTexRef`. The color value is of type float and holds color
-    components in the following sequence: pBorderColor[0] holds 'R'
-    component pBorderColor[1] holds 'G' component pBorderColor[2] holds 'B'
-    component pBorderColor[3] holds 'A' component
+    Returns in ``pBorderColor``, values of the RGBA color used by the
+    texture reference ``hTexRef``. The color value is of type float and
+    holds color components in the following sequence: pBorderColor[0] holds
+    'R' component, pBorderColor[1] holds 'G' component, pBorderColor[2]
+    holds 'B' component, pBorderColor[3] holds 'A' component.
 
     Parameters
     ----------
@@ -53014,7 +53053,7 @@ def cuTexRefGetFlags(hTexRef):
 
     [Deprecated]
 
-    Returns in `*pFlags` the flags of the texture reference `hTexRef`.
+    Returns in ``*pFlags`` the flags of the texture reference ``hTexRef``.
 
     Parameters
     ----------
@@ -53056,9 +53095,9 @@ def cuTexRefCreate():
 
     [Deprecated]
 
-    Creates a texture reference and returns its handle in `*pTexRef`. Once
-    created, the application must call :py:obj:`~.cuTexRefSetArray()` or
-    :py:obj:`~.cuTexRefSetAddress()` to associate the reference with
+    Creates a texture reference and returns its handle in ``*pTexRef``.
+    Once created, the application must call :py:obj:`~.cuTexRefSetArray()`
+    or :py:obj:`~.cuTexRefSetAddress()` to associate the reference with
     allocated memory. Other texture reference functions are used to specify
     the format and interpretation (addressing, filtering, etc.) to be used
     when the memory is read through this texture reference.
@@ -53090,7 +53129,7 @@ def cuTexRefDestroy(hTexRef):
 
     [Deprecated]
 
-    Destroys the texture reference specified by `hTexRef`.
+    Destroys the texture reference specified by ``hTexRef``.
 
     Parameters
     ----------
@@ -53127,12 +53166,12 @@ def cuSurfRefSetArray(hSurfRef, hArray, unsigned int Flags):
 
     [Deprecated]
 
-    Sets the CUDA array `hArray` to be read and written by the surface
-    reference `hSurfRef`. Any previous CUDA array state associated with the
-    surface reference is superseded by this function. `Flags` must be set
-    to 0. The :py:obj:`~.CUDA_ARRAY3D_SURFACE_LDST` flag must have been set
-    for the CUDA array. Any CUDA array previously bound to `hSurfRef` is
-    unbound.
+    Sets the CUDA array ``hArray`` to be read and written by the surface
+    reference ``hSurfRef``. Any previous CUDA array state associated with
+    the surface reference is superseded by this function. ``Flags`` must be
+    set to 0. The :py:obj:`~.CUDA_ARRAY3D_SURFACE_LDST` flag must have been
+    set for the CUDA array. Any CUDA array previously bound to ``hSurfRef``
+    is unbound.
 
     Parameters
     ----------
@@ -53181,8 +53220,8 @@ def cuSurfRefGetArray(hSurfRef):
 
     [Deprecated]
 
-    Returns in `*phArray` the CUDA array bound to the surface reference
-    `hSurfRef`, or returns :py:obj:`~.CUDA_ERROR_INVALID_VALUE` if the
+    Returns in ``*phArray`` the CUDA array bound to the surface reference
+    ``hSurfRef``, or returns :py:obj:`~.CUDA_ERROR_INVALID_VALUE` if the
     surface reference is not bound to any CUDA array.
 
     Parameters
@@ -53223,14 +53262,14 @@ def cuSurfRefGetArray(hSurfRef):
 def cuTexObjectCreate(pResDesc : Optional[CUDA_RESOURCE_DESC], pTexDesc : Optional[CUDA_TEXTURE_DESC], pResViewDesc : Optional[CUDA_RESOURCE_VIEW_DESC]):
     """ Creates a texture object.
 
-    Creates a texture object and returns it in `pTexObject`. `pResDesc`
-    describes the data to texture from. `pTexDesc` describes how the data
-    should be sampled. `pResViewDesc` is an optional argument that
-    specifies an alternate format for the data described by `pResDesc`, and
-    also describes the subresource region to restrict access to when
-    texturing. `pResViewDesc` can only be specified if the type of resource
-    is a CUDA array or a CUDA mipmapped array not in a block compressed
-    format.
+    Creates a texture object and returns it in ``pTexObject``. ``pResDesc``
+    describes the data to texture from. ``pTexDesc`` describes how the data
+    should be sampled. ``pResViewDesc`` is an optional argument that
+    specifies an alternate format for the data described by ``pResDesc``,
+    and also describes the subresource region to restrict access to when
+    texturing. ``pResViewDesc`` can only be specified if the type of
+    resource is a CUDA array or a CUDA mipmapped array not in a block
+    compressed format.
 
     Texture objects are only supported on devices of compute capability 3.0
     or higher. Additionally, a texture object is an opaque value, and, as
@@ -53462,7 +53501,7 @@ def cuTexObjectCreate(pResDesc : Optional[CUDA_RESOURCE_DESC], pTexDesc : Option
 def cuTexObjectDestroy(texObject):
     """ Destroys a texture object.
 
-    Destroys the texture object specified by `texObject`.
+    Destroys the texture object specified by ``texObject``.
 
     Parameters
     ----------
@@ -53498,7 +53537,7 @@ def cuTexObjectGetResourceDesc(texObject):
     """ Returns a texture object's resource descriptor.
 
     Returns the resource descriptor for the texture object specified by
-    `texObject`.
+    ``texObject``.
 
     Parameters
     ----------
@@ -53539,7 +53578,7 @@ def cuTexObjectGetTextureDesc(texObject):
     """ Returns a texture object's texture descriptor.
 
     Returns the texture descriptor for the texture object specified by
-    `texObject`.
+    ``texObject``.
 
     Parameters
     ----------
@@ -53580,7 +53619,7 @@ def cuTexObjectGetResourceViewDesc(texObject):
     """ Returns a texture object's resource view descriptor.
 
     Returns the resource view descriptor for the texture object specified
-    by `texObject`. If no resource view was set for `texObject`, the
+    by ``texObject``. If no resource view was set for ``texObject``, the
     :py:obj:`~.CUDA_ERROR_INVALID_VALUE` is returned.
 
     Parameters
@@ -53621,8 +53660,8 @@ def cuTexObjectGetResourceViewDesc(texObject):
 def cuSurfObjectCreate(pResDesc : Optional[CUDA_RESOURCE_DESC]):
     """ Creates a surface object.
 
-    Creates a surface object and returns it in `pSurfObject`. `pResDesc`
-    describes the data to perform surface load/stores on.
+    Creates a surface object and returns it in ``pSurfObject``.
+    ``pResDesc`` describes the data to perform surface load/stores on.
     :py:obj:`~.CUDA_RESOURCE_DESC.resType` must be
     :py:obj:`~.CU_RESOURCE_TYPE_ARRAY` and
     :py:obj:`~.CUDA_RESOURCE_DESC.res.array.hArray` must be set to a valid
@@ -53664,7 +53703,7 @@ def cuSurfObjectCreate(pResDesc : Optional[CUDA_RESOURCE_DESC]):
 def cuSurfObjectDestroy(surfObject):
     """ Destroys a surface object.
 
-    Destroys the surface object specified by `surfObject`.
+    Destroys the surface object specified by ``surfObject``.
 
     Parameters
     ----------
@@ -53700,7 +53739,7 @@ def cuSurfObjectGetResourceDesc(surfObject):
     """ Returns a surface object's resource descriptor.
 
     Returns the resource descriptor for the surface object specified by
-    `surfObject`.
+    ``surfObject``.
 
     Parameters
     ----------
@@ -53741,7 +53780,8 @@ def cuTensorMapEncodeTiled(tensorDataType not None : CUtensorMapDataType, tensor
     """ Create a tensor map descriptor object representing tiled memory region.
 
     Creates a descriptor for Tensor Memory Access (TMA) object specified by
-    the parameters describing a tiled region and returns it in `tensorMap`.
+    the parameters describing a tiled region and returns it in
+    ``tensorMap``.
 
     Tensor map objects are only supported on devices of compute capability
     9.0 or higher. Additionally, a tensor map object is an opaque value,
@@ -53749,9 +53789,9 @@ def cuTensorMapEncodeTiled(tensorDataType not None : CUtensorMapDataType, tensor
 
     The parameters passed are bound to the following requirements:
 
-    - `tensorMap` address must be aligned to 64 bytes.
+    - ``tensorMap`` address must be aligned to 64 bytes.
 
-    - `tensorDataType` has to be an enum from
+    - ``tensorDataType`` has to be an enum from
       :py:obj:`~.CUtensorMapDataType` which is defined as:
 
     - **View CUDA Toolkit Documentation for a C++ code example**
@@ -53765,49 +53805,49 @@ def cuTensorMapEncodeTiled(tensorDataType not None : CUtensorMapDataType, tensor
       packed values to memory aligned as 16 bytes. There are 4 byte gaps
       between every 12 byte chunk of packed values.
 
-    - `tensorRank` must be non-zero and less than or equal to the maximum
-      supported dimensionality of 5. If `interleave` is not
-      :py:obj:`~.CU_TENSOR_MAP_INTERLEAVE_NONE`, then `tensorRank` must
+    - ``tensorRank`` must be non-zero and less than or equal to the maximum
+      supported dimensionality of 5. If ``interleave`` is not
+      :py:obj:`~.CU_TENSOR_MAP_INTERLEAVE_NONE`, then ``tensorRank`` must
       additionally be greater than or equal to 3.
 
-    - `globalAddress`, which specifies the starting address of the memory
+    - ``globalAddress``, which specifies the starting address of the memory
       region described, must be 16 byte aligned. The following requirements
       need to also be met:
 
-      - When `interleave` is :py:obj:`~.CU_TENSOR_MAP_INTERLEAVE_32B`,
-        `globalAddress` must be 32 byte aligned.
+      - When ``interleave`` is :py:obj:`~.CU_TENSOR_MAP_INTERLEAVE_32B`,
+        ``globalAddress`` must be 32 byte aligned.
 
-      - When `tensorDataType` is
+      - When ``tensorDataType`` is
         :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B` or
-        :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B`, `globalAddress`
-        must be 32 byte aligned.
+        :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B`,
+        ``globalAddress`` must be 32 byte aligned.
 
-    `globalDim` array, which specifies tensor size of each of the
-    `tensorRank` dimensions, must be non-zero and less than or equal to
+    ``globalDim`` array, which specifies tensor size of each of the
+    ``tensorRank`` dimensions, must be non-zero and less than or equal to
     2^32. Additionally, the following requirements need to be met for the
     packed data types:
 
-    - When `tensorDataType` is
+    - When ``tensorDataType`` is
       :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B` or
       :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B`, globalDim[0] must
       be a multiple of 128.
 
-    - When `tensorDataType` is
-      :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B`, `globalDim`[0] must
-      be a multiple of 2.
+    - When ``tensorDataType`` is
+      :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B`, ``globalDim[0]``
+      must be a multiple of 2.
 
     - Dimension for the packed data types must reflect the number of
       individual U# values.
 
-    `globalStrides` array, which specifies tensor stride of each of the
-    lower `tensorRank` - 1 dimensions in bytes, must be a multiple of 16
+    ``globalStrides`` array, which specifies tensor stride of each of the
+    lower ``tensorRank`` - 1 dimensions in bytes, must be a multiple of 16
     and less than 2^40. Additionally, the following requirements need to be
     met:
 
-    - When `interleave` is :py:obj:`~.CU_TENSOR_MAP_INTERLEAVE_32B`, the
+    - When ``interleave`` is :py:obj:`~.CU_TENSOR_MAP_INTERLEAVE_32B`, the
       strides must be a multiple of 32.
 
-    - When `tensorDataType` is
+    - When ``tensorDataType`` is
       :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B` or
       :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B`, the strides must
       be a multiple of 32. Each following dimension specified includes
@@ -53815,45 +53855,45 @@ def cuTensorMapEncodeTiled(tensorDataType not None : CUtensorMapDataType, tensor
 
     - **View CUDA Toolkit Documentation for a C++ code example**
 
-    `boxDim` array, which specifies number of elements to be traversed
-    along each of the `tensorRank` dimensions, must be non-zero and less
+    ``boxDim`` array, which specifies number of elements to be traversed
+    along each of the ``tensorRank`` dimensions, must be non-zero and less
     than or equal to 256. Additionally, the following requirements need to
     be met:
 
-    - When `interleave` is :py:obj:`~.CU_TENSOR_MAP_INTERLEAVE_NONE`, {
-      `boxDim`[0] * elementSizeInBytes( `tensorDataType` ) } must be a
+    - When ``interleave`` is :py:obj:`~.CU_TENSOR_MAP_INTERLEAVE_NONE`, {
+      ``boxDim[0]`` * elementSizeInBytes( ``tensorDataType`` ) } must be a
       multiple of 16 bytes.
 
-    - When `tensorDataType` is
+    - When ``tensorDataType`` is
       :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B` or
       :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B`, boxDim[0] must be
       128.
 
-    `elementStrides` array, which specifies the iteration step along each
-    of the `tensorRank` dimensions, must be non-zero and less than or equal
-    to 8. Note that when `interleave` is
+    ``elementStrides`` array, which specifies the iteration step along each
+    of the ``tensorRank`` dimensions, must be non-zero and less than or
+    equal to 8. Note that when ``interleave`` is
     :py:obj:`~.CU_TENSOR_MAP_INTERLEAVE_NONE`, the first element of this
     array is ignored since TMA doesn’t support the stride for dimension
-    zero. When all elements of `elementStrides` array is one, `boxDim`
+    zero. When all elements of ``elementStrides`` array are one, ``boxDim``
     specifies the number of elements to load. However, if the
-    `elementStrides`[i] is not equal to one, then TMA loads ceil(
-    `boxDim`[i] / `elementStrides`[i]) number of elements along i-th
-    dimension. To load N elements along i-th dimension, `boxDim`[i] must be
-    set to N * `elementStrides`[i].
+    ``elementStrides[i]`` is not equal to one, then TMA loads ceil(
+    ``boxDim[i]`` / ``elementStrides[i]``) number of elements along i-th
+    dimension. To load N elements along i-th dimension, ``boxDim[i]`` must
+    be set to N * ``elementStrides[i]``.
 
-    - `interleave` specifies the interleaved layout of type
+    - ``interleave`` specifies the interleaved layout of type
       :py:obj:`~.CUtensorMapInterleave`, which is defined as:
 
     - **View CUDA Toolkit Documentation for a C++ code example**
 
     - TMA supports interleaved layouts like NC/8HWC8 where C8 utilizes 16
       bytes in memory assuming 2 byte per channel or NC/16HWC16 where C16
-      uses 32 bytes. When `interleave` is
-      :py:obj:`~.CU_TENSOR_MAP_INTERLEAVE_NONE` and `swizzle` is not
+      uses 32 bytes. When ``interleave`` is
+      :py:obj:`~.CU_TENSOR_MAP_INTERLEAVE_NONE` and ``swizzle`` is not
       :py:obj:`~.CU_TENSOR_MAP_SWIZZLE_NONE`, the bounding box inner
-      dimension (computed as `boxDim`[0] multiplied by element size derived
-      from `tensorDataType`) must be less than or equal to the swizzle
-      size.
+      dimension (computed as ``boxDim[0]`` multiplied by element size
+      derived from ``tensorDataType``) must be less than or equal to the
+      swizzle size.
 
       - CU_TENSOR_MAP_SWIZZLE_32B requires the bounding box inner dimension
         to be <= 32.
@@ -53862,12 +53902,13 @@ def cuTensorMapEncodeTiled(tensorDataType not None : CUtensorMapDataType, tensor
         to be <= 64.
 
       - CU_TENSOR_MAP_SWIZZLE_128B* require the bounding box inner
-        dimension to be <= 128. Additionally, `tensorDataType` of
+        dimension to be <= 128. Additionally, ``tensorDataType`` of
         :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B` requires
-        `interleave` to be :py:obj:`~.CU_TENSOR_MAP_INTERLEAVE_NONE`.
+        ``interleave`` to be :py:obj:`~.CU_TENSOR_MAP_INTERLEAVE_NONE`.
 
-    - `swizzle`, which specifies the shared memory bank swizzling pattern,
-      has to be of type :py:obj:`~.CUtensorMapSwizzle` which is defined as:
+    - ``swizzle``, which specifies the shared memory bank swizzling
+      pattern, has to be of type :py:obj:`~.CUtensorMapSwizzle` which is
+      defined as:
 
     - **View CUDA Toolkit Documentation for a C++ code example**
 
@@ -53876,10 +53917,10 @@ def cuTensorMapEncodeTiled(tensorDataType not None : CUtensorMapDataType, tensor
       in shared memory. This difference in data organization may cause bank
       conflicts when shared memory is accessed. In order to avoid this
       problem, data can be loaded to shared memory with shuffling across
-      shared memory banks. When `interleave` is
-      :py:obj:`~.CU_TENSOR_MAP_INTERLEAVE_32B`, `swizzle` must be
+      shared memory banks. When ``interleave`` is
+      :py:obj:`~.CU_TENSOR_MAP_INTERLEAVE_32B`, ``swizzle`` must be
       :py:obj:`~.CU_TENSOR_MAP_SWIZZLE_32B`. Other interleave modes can
-      have any swizzling pattern. When the `tensorDataType` is
+      have any swizzling pattern. When the ``tensorDataType`` is
       :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B`, only the following
       swizzle modes are supported:
 
@@ -53890,7 +53931,7 @@ def cuTensorMapEncodeTiled(tensorDataType not None : CUtensorMapDataType, tensor
       - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B (Load & Store)
 
       - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_64B (Store only) When the
-        `tensorDataType` is
+        ``tensorDataType`` is
         :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B`, only the
         following swizzle modes are supported:
 
@@ -53900,13 +53941,13 @@ def cuTensorMapEncodeTiled(tensorDataType not None : CUtensorMapDataType, tensor
 
       - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B (Load only)
 
-    - `l2Promotion` specifies L2 fetch size which indicates the byte
+    - ``l2Promotion`` specifies L2 fetch size which indicates the byte
       granurality at which L2 requests is filled from DRAM. It must be of
       type :py:obj:`~.CUtensorMapL2promotion`, which is defined as:
 
     - **View CUDA Toolkit Documentation for a C++ code example**
 
-    - `oobFill`, which indicates whether zero or a special NaN constant
+    - ``oobFill``, which indicates whether zero or a special NaN constant
       should be used to fill out-of-bound elements, must be of type
       :py:obj:`~.CUtensorMapFloatOOBfill` which is defined as:
 
@@ -53914,8 +53955,8 @@ def cuTensorMapEncodeTiled(tensorDataType not None : CUtensorMapDataType, tensor
 
     - Note that
       :py:obj:`~.CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA` can
-      only be used when `tensorDataType` represents a floating-point data
-      type, and when `tensorDataType` is not
+      only be used when ``tensorDataType`` represents a floating-point data
+      type, and when ``tensorDataType`` is not
       :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B`,
       :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B`, and
       :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B`.
@@ -53930,16 +53971,16 @@ def cuTensorMapEncodeTiled(tensorDataType not None : CUtensorMapDataType, tensor
         Starting address of memory region described by tensor
     globalDim : list[:py:obj:`~.cuuint64_t`]
         Array containing tensor size (number of elements) along each of the
-        `tensorRank` dimensions
+        ``tensorRank`` dimensions
     globalStrides : list[:py:obj:`~.cuuint64_t`]
         Array containing stride size (in bytes) along each of the
-        `tensorRank` - 1 dimensions
+        ``tensorRank`` - 1 dimensions
     boxDim : list[:py:obj:`~.cuuint32_t`]
         Array containing traversal box size (number of elments) along each
-        of the `tensorRank` dimensions. Specifies how many elements to be
+        of the ``tensorRank`` dimensions. Specifies how many elements to be
         traversed along each tensor dimension.
     elementStrides : list[:py:obj:`~.cuuint32_t`]
-        Array containing traversal stride in each of the `tensorRank`
+        Array containing traversal stride in each of the ``tensorRank``
         dimensions
     interleave : :py:obj:`~.CUtensorMapInterleave`
         Type of interleaved layout the tensor addresses
@@ -54050,7 +54091,7 @@ def cuTensorMapEncodeIm2col(tensorDataType not None : CUtensorMapDataType, tenso
 
     Creates a descriptor for Tensor Memory Access (TMA) object specified by
     the parameters describing a im2col memory layout and returns it in
-    `tensorMap`.
+    ``tensorMap``.
 
     Tensor map objects are only supported on devices of compute capability
     9.0 or higher. Additionally, a tensor map object is an opaque value,
@@ -54058,9 +54099,9 @@ def cuTensorMapEncodeIm2col(tensorDataType not None : CUtensorMapDataType, tenso
 
     The parameters passed are bound to the following requirements:
 
-    - `tensorMap` address must be aligned to 64 bytes.
+    - ``tensorMap`` address must be aligned to 64 bytes.
 
-    - `tensorDataType` has to be an enum from
+    - ``tensorDataType`` has to be an enum from
       :py:obj:`~.CUtensorMapDataType` which is defined as:
 
     - **View CUDA Toolkit Documentation for a C++ code example**
@@ -54074,47 +54115,47 @@ def cuTensorMapEncodeIm2col(tensorDataType not None : CUtensorMapDataType, tenso
       packed values to memory aligned as 16 bytes. There are 4 byte gaps
       between every 12 byte chunk of packed values.
 
-    - `tensorRank`, which specifies the number of tensor dimensions, must
+    - ``tensorRank``, which specifies the number of tensor dimensions, must
       be 3, 4, or 5.
 
-    - `globalAddress`, which specifies the starting address of the memory
+    - ``globalAddress``, which specifies the starting address of the memory
       region described, must be 16 byte aligned. The following requirements
       need to also be met:
 
-      - When `interleave` is :py:obj:`~.CU_TENSOR_MAP_INTERLEAVE_32B`,
-        `globalAddress` must be 32 byte aligned.
+      - When ``interleave`` is :py:obj:`~.CU_TENSOR_MAP_INTERLEAVE_32B`,
+        ``globalAddress`` must be 32 byte aligned.
 
-      - When `tensorDataType` is
+      - When ``tensorDataType`` is
         :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B` or
-        :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B`, `globalAddress`
-        must be 32 byte aligned.
+        :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B`,
+        ``globalAddress`` must be 32 byte aligned.
 
-    - `globalDim` array, which specifies tensor size of each of the
-      `tensorRank` dimensions, must be non-zero and less than or equal to
+    - ``globalDim`` array, which specifies tensor size of each of the
+      ``tensorRank`` dimensions, must be non-zero and less than or equal to
       2^32. Additionally, the following requirements need to be met for the
       packed data types:
 
-      - When `tensorDataType` is
+      - When ``tensorDataType`` is
         :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B` or
         :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B`, globalDim[0]
         must be a multiple of 128.
 
-      - When `tensorDataType` is
-        :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B`, `globalDim`[0]
+      - When ``tensorDataType`` is
+        :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B`, ``globalDim[0]``
         must be a multiple of 2.
 
       - Dimension for the packed data types must reflect the number of
         individual U# values.
 
-    - `globalStrides` array, which specifies tensor stride of each of the
-      lower `tensorRank` - 1 dimensions in bytes, must be a multiple of 16
-      and less than 2^40. Additionally, the following requirements need to
-      be met:
+    - ``globalStrides`` array, which specifies tensor stride of each of the
+      lower ``tensorRank`` - 1 dimensions in bytes, must be a multiple of
+      16 and less than 2^40. Additionally, the following requirements need
+      to be met:
 
-      - When `interleave` is :py:obj:`~.CU_TENSOR_MAP_INTERLEAVE_32B`, the
-        strides must be a multiple of 32.
+      - When ``interleave`` is :py:obj:`~.CU_TENSOR_MAP_INTERLEAVE_32B`,
+        the strides must be a multiple of 32.
 
-      - When `tensorDataType` is
+      - When ``tensorDataType`` is
         :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B` or
         :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B`, the strides must
         be a multiple of 32. Each following dimension specified includes
@@ -54122,69 +54163,70 @@ def cuTensorMapEncodeIm2col(tensorDataType not None : CUtensorMapDataType, tenso
 
       - **View CUDA Toolkit Documentation for a C++ code example**
 
-    - `pixelBoxLowerCorner` array specifies the coordinate offsets {D, H,
+    - ``pixelBoxLowerCorner`` array specifies the coordinate offsets {D, H,
       W} of the bounding box from top/left/front corner. The number of
       offsets and their precision depend on the tensor dimensionality:
 
-      - When `tensorRank` is 3, one signed offset within range [-32768,
+      - When ``tensorRank`` is 3, one signed offset within range [-32768,
         32767] is supported.
 
-      - When `tensorRank` is 4, two signed offsets each within range [-128,
-        127] are supported.
+      - When ``tensorRank`` is 4, two signed offsets each within range
+        [-128, 127] are supported.
 
-      - When `tensorRank` is 5, three offsets each within range [-16, 15]
+      - When ``tensorRank`` is 5, three offsets each within range [-16, 15]
         are supported.
 
-    - `pixelBoxUpperCorner` array specifies the coordinate offsets {D, H,
+    - ``pixelBoxUpperCorner`` array specifies the coordinate offsets {D, H,
       W} of the bounding box from bottom/right/back corner. The number of
       offsets and their precision depend on the tensor dimensionality:
 
-      - When `tensorRank` is 3, one signed offset within range [-32768,
+      - When ``tensorRank`` is 3, one signed offset within range [-32768,
         32767] is supported.
 
-      - When `tensorRank` is 4, two signed offsets each within range [-128,
-        127] are supported.
+      - When ``tensorRank`` is 4, two signed offsets each within range
+        [-128, 127] are supported.
 
-      - When `tensorRank` is 5, three offsets each within range [-16, 15]
-        are supported. The bounding box specified by `pixelBoxLowerCorner`
-        and `pixelBoxUpperCorner` must have non-zero area.
+      - When ``tensorRank`` is 5, three offsets each within range [-16, 15]
+        are supported. The bounding box specified by
+        ``pixelBoxLowerCorner`` and ``pixelBoxUpperCorner`` must have
+        non-zero area.
 
-    - `channelsPerPixel`, which specifies the number of elements which must
-      be accessed along C dimension, must be less than or equal to 256.
-      Additionally, when `tensorDataType` is
+    - ``channelsPerPixel``, which specifies the number of elements which
+      must be accessed along C dimension, must be less than or equal to
+      256. Additionally, when ``tensorDataType`` is
       :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B` or
-      :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B`, `channelsPerPixel`
-      must be 128.
+      :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B`,
+      ``channelsPerPixel`` must be 128.
 
-    - `pixelsPerColumn`, which specifies the number of elements that must
+    - ``pixelsPerColumn``, which specifies the number of elements that must
       be accessed along the {N, D, H, W} dimensions, must be less than or
       equal to 1024.
 
-    - `elementStrides` array, which specifies the iteration step along each
-      of the `tensorRank` dimensions, must be non-zero and less than or
-      equal to 8. Note that when `interleave` is
+    - ``elementStrides`` array, which specifies the iteration step along
+      each of the ``tensorRank`` dimensions, must be non-zero and less than
+      or equal to 8. Note that when ``interleave`` is
       :py:obj:`~.CU_TENSOR_MAP_INTERLEAVE_NONE`, the first element of this
       array is ignored since TMA doesn’t support the stride for dimension
-      zero. When all elements of the `elementStrides` array are one,
-      `boxDim` specifies the number of elements to load. However, if
-      `elementStrides`[i] is not equal to one for some `i`, then TMA loads
-      ceil( `boxDim`[i] / `elementStrides`[i]) number of elements along
-      i-th dimension. To load N elements along i-th dimension, `boxDim`[i]
-      must be set to N * `elementStrides`[i].
-
-    - `interleave` specifies the interleaved layout of type
+      zero. When all elements of the ``elementStrides`` array are one,
+      ``boxDim`` specifies the number of elements to load. However, if
+      ``elementStrides[i]`` is not equal to one for some ``i``, then TMA
+      loads ceil( ``boxDim[i]`` / ``elementStrides[i]``) number of elements
+      along i-th dimension. To load N elements along i-th dimension,
+      ``boxDim[i]`` must be set to N * ``elementStrides[i]``.
+
+    - ``interleave`` specifies the interleaved layout of type
       :py:obj:`~.CUtensorMapInterleave`, which is defined as:
 
     - **View CUDA Toolkit Documentation for a C++ code example**
 
     - TMA supports interleaved layouts like NC/8HWC8 where C8 utilizes 16
       bytes in memory assuming 2 byte per channel or NC/16HWC16 where C16
-      uses 32 bytes. When `interleave` is
-      :py:obj:`~.CU_TENSOR_MAP_INTERLEAVE_NONE` and `swizzle` is not
+      uses 32 bytes. When ``interleave`` is
+      :py:obj:`~.CU_TENSOR_MAP_INTERLEAVE_NONE` and ``swizzle`` is not
       :py:obj:`~.CU_TENSOR_MAP_SWIZZLE_NONE`, the bounding box inner
-      dimension (computed as `channelsPerPixel` multiplied by element size
-      in bytes derived from `tensorDataType`) must be less than or equal to
-      the swizzle size.
+      dimension (computed as ``channelsPerPixel`` multiplied by element
+      size in bytes derived from ``tensorDataType``) must be less than or
+      equal to the swizzle size.
 
       - CU_TENSOR_MAP_SWIZZLE_32B requires the bounding box inner dimension
         to be <= 32.
@@ -54193,12 +54235,13 @@ def cuTensorMapEncodeIm2col(tensorDataType not None : CUtensorMapDataType, tenso
         to be <= 64.
 
       - CU_TENSOR_MAP_SWIZZLE_128B* require the bounding box inner
-        dimension to be <= 128. Additionally, `tensorDataType` of
+        dimension to be <= 128. Additionally, ``tensorDataType`` of
         :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B` requires
-        `interleave` to be :py:obj:`~.CU_TENSOR_MAP_INTERLEAVE_NONE`.
+        ``interleave`` to be :py:obj:`~.CU_TENSOR_MAP_INTERLEAVE_NONE`.
 
-    - `swizzle`, which specifies the shared memory bank swizzling pattern,
-      has to be of type :py:obj:`~.CUtensorMapSwizzle` which is defined as:
+    - ``swizzle``, which specifies the shared memory bank swizzling
+      pattern, has to be of type :py:obj:`~.CUtensorMapSwizzle` which is
+      defined as:
 
     - **View CUDA Toolkit Documentation for a C++ code example**
 
@@ -54207,10 +54250,10 @@ def cuTensorMapEncodeIm2col(tensorDataType not None : CUtensorMapDataType, tenso
       in shared memory. This difference in data organization may cause bank
       conflicts when shared memory is accessed. In order to avoid this
       problem, data can be loaded to shared memory with shuffling across
-      shared memory banks. When `interleave` is
-      :py:obj:`~.CU_TENSOR_MAP_INTERLEAVE_32B`, `swizzle` must be
+      shared memory banks. When ``interleave`` is
+      :py:obj:`~.CU_TENSOR_MAP_INTERLEAVE_32B`, ``swizzle`` must be
       :py:obj:`~.CU_TENSOR_MAP_SWIZZLE_32B`. Other interleave modes can
-      have any swizzling pattern. When the `tensorDataType` is
+      have any swizzling pattern. When the ``tensorDataType`` is
       :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B`, only the following
       swizzle modes are supported:
 
@@ -54221,7 +54264,7 @@ def cuTensorMapEncodeIm2col(tensorDataType not None : CUtensorMapDataType, tenso
       - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B (Load & Store)
 
       - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_64B (Store only) When the
-        `tensorDataType` is
+        ``tensorDataType`` is
         :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B`, only the
         following swizzle modes are supported:
 
@@ -54231,13 +54274,13 @@ def cuTensorMapEncodeIm2col(tensorDataType not None : CUtensorMapDataType, tenso
 
       - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B (Load only)
 
-    - `l2Promotion` specifies L2 fetch size which indicates the byte
+    - ``l2Promotion`` specifies L2 fetch size which indicates the byte
       granularity at which L2 requests are filled from DRAM. It must be of
       type :py:obj:`~.CUtensorMapL2promotion`, which is defined as:
 
     - **View CUDA Toolkit Documentation for a C++ code example**
 
-    - `oobFill`, which indicates whether zero or a special NaN constant
+    - ``oobFill``, which indicates whether zero or a special NaN constant
       should be used to fill out-of-bound elements, must be of type
       :py:obj:`~.CUtensorMapFloatOOBfill` which is defined as:
 
@@ -54245,8 +54288,8 @@ def cuTensorMapEncodeIm2col(tensorDataType not None : CUtensorMapDataType, tenso
 
     - Note that
       :py:obj:`~.CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA` can
-      only be used when `tensorDataType` represents a floating-point data
-      type, and when `tensorDataType` is not
+      only be used when ``tensorDataType`` represents a floating-point data
+      type, and when ``tensorDataType`` is not
       :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B`,
       :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B`, and
       :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B`.
@@ -54261,10 +54304,10 @@ def cuTensorMapEncodeIm2col(tensorDataType not None : CUtensorMapDataType, tenso
         Starting address of memory region described by tensor
     globalDim : list[:py:obj:`~.cuuint64_t`]
         Array containing tensor size (number of elements) along each of the
-        `tensorRank` dimensions
+        ``tensorRank`` dimensions
     globalStrides : list[:py:obj:`~.cuuint64_t`]
         Array containing stride size (in bytes) along each of the
-        `tensorRank` - 1 dimensions
+        ``tensorRank`` - 1 dimensions
     pixelBoxLowerCorner : list[int]
         Array containing DHW dimensions of lower box corner
     pixelBoxUpperCorner : list[int]
@@ -54274,7 +54317,7 @@ def cuTensorMapEncodeIm2col(tensorDataType not None : CUtensorMapDataType, tenso
     pixelsPerColumn : Any
         Number of pixels per column
     elementStrides : list[:py:obj:`~.cuuint32_t`]
-        Array containing traversal stride in each of the `tensorRank`
+        Array containing traversal stride in each of the ``tensorRank``
         dimensions
     interleave : :py:obj:`~.CUtensorMapInterleave`
         Type of interleaved layout the tensor addresses
@@ -54395,8 +54438,8 @@ def cuTensorMapEncodeIm2colWide(tensorDataType not None : CUtensorMapDataType, t
 
     Creates a descriptor for Tensor Memory Access (TMA) object specified by
     the parameters describing a im2col memory layout and where the row is
-    always loaded along the W dimensuin and returns it in `tensorMap`. This
-    assumes the tensor layout in memory is either NDHWC, NHWC, or NWC.
+    always loaded along the W dimension and returns it in ``tensorMap``.
+    This assumes the tensor layout in memory is either NDHWC, NHWC, or NWC.
 
     This API is only supported on devices of compute capability 10.0 or
     higher. Additionally, a tensor map object is an opaque value, and, as
@@ -54404,9 +54447,9 @@ def cuTensorMapEncodeIm2colWide(tensorDataType not None : CUtensorMapDataType, t
 
     The parameters passed are bound to the following requirements:
 
-    - `tensorMap` address must be aligned to 64 bytes.
+    - ``tensorMap`` address must be aligned to 64 bytes.
 
-    - `tensorDataType` has to be an enum from
+    - ``tensorDataType`` has to be an enum from
       :py:obj:`~.CUtensorMapDataType` which is defined as:
 
     - **View CUDA Toolkit Documentation for a C++ code example**
@@ -54420,47 +54463,47 @@ def cuTensorMapEncodeIm2colWide(tensorDataType not None : CUtensorMapDataType, t
       packed values to memory aligned as 16 bytes. There are 4 byte gaps
       between every 12 byte chunk of packed values.
 
-    - `tensorRank`, which specifies the number of tensor dimensions, must
+    - ``tensorRank``, which specifies the number of tensor dimensions, must
       be 3, 4, or 5.
 
-    - `globalAddress`, which specifies the starting address of the memory
+    - ``globalAddress``, which specifies the starting address of the memory
       region described, must be 16 byte aligned. The following requirements
       need to also be met:
 
-      - When `interleave` is :py:obj:`~.CU_TENSOR_MAP_INTERLEAVE_32B`,
-        `globalAddress` must be 32 byte aligned.
+      - When ``interleave`` is :py:obj:`~.CU_TENSOR_MAP_INTERLEAVE_32B`,
+        ``globalAddress`` must be 32 byte aligned.
 
-      - When `tensorDataType` is
+      - When ``tensorDataType`` is
         :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B` or
-        :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B`, `globalAddress`
-        must be 32 byte aligned.
+        :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B`,
+        ``globalAddress`` must be 32 byte aligned.
 
-    `globalDim` array, which specifies tensor size of each of the
-    `tensorRank` dimensions, must be non-zero and less than or equal to
+    ``globalDim`` array, which specifies tensor size of each of the
+    ``tensorRank`` dimensions, must be non-zero and less than or equal to
     2^32. Additionally, the following requirements need to be met for the
     packed data types:
 
-    - When `tensorDataType` is
+    - When ``tensorDataType`` is
       :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B` or
       :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B`, globalDim[0] must
       be a multiple of 128.
 
-    - When `tensorDataType` is
-      :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B`, `globalDim`[0] must
-      be a multiple of 2.
+    - When ``tensorDataType`` is
+      :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B`, ``globalDim[0]``
+      must be a multiple of 2.
 
     - Dimension for the packed data types must reflect the number of
       individual U# values.
 
-    `globalStrides` array, which specifies tensor stride of each of the
-    lower `tensorRank` - 1 dimensions in bytes, must be a multiple of 16
+    ``globalStrides`` array, which specifies tensor stride of each of the
+    lower ``tensorRank`` - 1 dimensions in bytes, must be a multiple of 16
     and less than 2^40. Additionally, the following requirements need to be
     met:
 
-    - When `interleave` is :py:obj:`~.CU_TENSOR_MAP_INTERLEAVE_32B`, the
+    - When ``interleave`` is :py:obj:`~.CU_TENSOR_MAP_INTERLEAVE_32B`, the
       strides must be a multiple of 32.
 
-    - When `tensorDataType` is
+    - When ``tensorDataType`` is
       :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B` or
       :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B`, the strides must
       be a multiple of 32. Each following dimension specified includes
@@ -54468,64 +54511,64 @@ def cuTensorMapEncodeIm2colWide(tensorDataType not None : CUtensorMapDataType, t
 
     - **View CUDA Toolkit Documentation for a C++ code example**
 
-    `pixelBoxLowerCornerWidth` specifies the coordinate offset W of the
+    ``pixelBoxLowerCornerWidth`` specifies the coordinate offset W of the
     bounding box from left corner. The offset must be within range [-32768,
     32767].
 
-    - `pixelBoxUpperCornerWidth` specifies the coordinate offset W of the
+    - ``pixelBoxUpperCornerWidth`` specifies the coordinate offset W of the
       bounding box from right corner. The offset must be within range
       [-32768, 32767].
 
-    The bounding box specified by `pixelBoxLowerCornerWidth` and
-    `pixelBoxUpperCornerWidth` must have non-zero area. Note that the size
-    of the box along D and H dimensions is always equal to one.
+    The bounding box specified by ``pixelBoxLowerCornerWidth`` and
+    ``pixelBoxUpperCornerWidth`` must have non-zero area. Note that the
+    size of the box along D and H dimensions is always equal to one.
 
-    - `channelsPerPixel`, which specifies the number of elements which must
-      be accessed along C dimension, must be less than or equal to 256.
-      Additionally, when `tensorDataType` is
+    - ``channelsPerPixel``, which specifies the number of elements which
+      must be accessed along C dimension, must be less than or equal to
+      256. Additionally, when ``tensorDataType`` is
       :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B` or
-      :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B`, `channelsPerPixel`
-      must be 128.
+      :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B`,
+      ``channelsPerPixel`` must be 128.
 
-    - `pixelsPerColumn`, which specifies the number of elements that must
+    - ``pixelsPerColumn``, which specifies the number of elements that must
       be accessed along the W dimension, must be less than or equal to
-      1024. This field is ignored when `mode` is
+      1024. This field is ignored when ``mode`` is
       :py:obj:`~.CU_TENSOR_MAP_IM2COL_WIDE_MODE_W128`.
 
-    - `elementStrides` array, which specifies the iteration step along each
-      of the `tensorRank` dimensions, must be non-zero and less than or
-      equal to 8. Note that when `interleave` is
+    - ``elementStrides`` array, which specifies the iteration step along
+      each of the ``tensorRank`` dimensions, must be non-zero and less than
+      or equal to 8. Note that when ``interleave`` is
       :py:obj:`~.CU_TENSOR_MAP_INTERLEAVE_NONE`, the first element of this
       array is ignored since TMA doesn’t support the stride for dimension
-      zero. When all elements of the `elementStrides` array are one,
-      `boxDim` specifies the number of elements to load. However, if
-      `elementStrides`[i] is not equal to one for some `i`, then TMA loads
-      ceil( `boxDim`[i] / `elementStrides`[i]) number of elements along
-      i-th dimension. To load N elements along i-th dimension, `boxDim`[i]
-      must be set to N * `elementStrides`[i].
-
-    - `interleave` specifies the interleaved layout of type
+      zero. When all elements of the ``elementStrides`` array are one,
+      ``boxDim`` specifies the number of elements to load. However, if
+      ``elementStrides[i]`` is not equal to one for some ``i``, then TMA
+      loads ceil( ``boxDim[i]`` / ``elementStrides[i]``) number of elements
+      along i-th dimension. To load N elements along i-th dimension,
+      ``boxDim[i]`` must be set to N * ``elementStrides[i]``.
+
+    - ``interleave`` specifies the interleaved layout of type
       :py:obj:`~.CUtensorMapInterleave`, which is defined as:
 
     - **View CUDA Toolkit Documentation for a C++ code example**
 
     - TMA supports interleaved layouts like NC/8HWC8 where C8 utilizes 16
       bytes in memory assuming 2 byte per channel or NC/16HWC16 where C16
-      uses 32 bytes. When `interleave` is
+      uses 32 bytes. When ``interleave`` is
       :py:obj:`~.CU_TENSOR_MAP_INTERLEAVE_NONE`, the bounding box inner
-      dimension (computed as `channelsPerPixel` multiplied by element size
-      in bytes derived from `tensorDataType`) must be less than or equal to
-      the swizzle size.
+      dimension (computed as ``channelsPerPixel`` multiplied by element
+      size in bytes derived from ``tensorDataType``) must be less than or
+      equal to the swizzle size.
 
       - CU_TENSOR_MAP_SWIZZLE_64B requires the bounding box inner dimension
         to be <= 64.
 
       - CU_TENSOR_MAP_SWIZZLE_128B* require the bounding box inner
-        dimension to be <= 128. Additionally, `tensorDataType` of
+        dimension to be <= 128. Additionally, ``tensorDataType`` of
         :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B` requires
-        `interleave` to be :py:obj:`~.CU_TENSOR_MAP_INTERLEAVE_NONE`.
+        ``interleave`` to be :py:obj:`~.CU_TENSOR_MAP_INTERLEAVE_NONE`.
 
-    - `mode`, which describes loading of elements loaded along the W
+    - ``mode``, which describes loading of elements loaded along the W
       dimension, has to be one of the following
       :py:obj:`~.CUtensorMapIm2ColWideMode` types:
 
@@ -54533,11 +54576,11 @@ def cuTensorMapEncodeIm2colWide(tensorDataType not None : CUtensorMapDataType, t
 
     - :py:obj:`~.CU_TENSOR_MAP_IM2COL_WIDE_MODE_W` allows the number of
       elements loaded along the W dimension to be specified via the
-      `pixelsPerColumn` field.
+      ``pixelsPerColumn`` field.
 
-    - `swizzle`, which specifies the shared memory bank swizzling pattern,
-      must be one of the following :py:obj:`~.CUtensorMapSwizzle` modes
-      (other swizzle modes are not supported):
+    - ``swizzle``, which specifies the shared memory bank swizzling
+      pattern, must be one of the following :py:obj:`~.CUtensorMapSwizzle`
+      modes (other swizzle modes are not supported):
 
     - **View CUDA Toolkit Documentation for a C++ code example**
 
@@ -54546,7 +54589,7 @@ def cuTensorMapEncodeIm2colWide(tensorDataType not None : CUtensorMapDataType, t
       in shared memory. This difference in data organization may cause bank
       conflicts when shared memory is accessed. In order to avoid this
       problem, data can be loaded to shared memory with shuffling across
-      shared memory banks. When the `tensorDataType` is
+      shared memory banks. When the ``tensorDataType`` is
       :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B`, only the following
       swizzle modes are supported:
 
@@ -54555,7 +54598,7 @@ def cuTensorMapEncodeIm2colWide(tensorDataType not None : CUtensorMapDataType, t
       - CU_TENSOR_MAP_SWIZZLE_128B (Load & Store)
 
       - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B (Load & Store) When the
-        `tensorDataType` is
+        ``tensorDataType`` is
         :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B`, only the
         following swizzle modes are supported:
 
@@ -54564,15 +54607,15 @@ def cuTensorMapEncodeIm2colWide(tensorDataType not None : CUtensorMapDataType, t
       - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B (Load only)
 
     Additionally, :py:obj:`~.CU_TENSOR_MAP_SWIZZLE_96B` is supported only
-    when `mode` is :py:obj:`~.CU_TENSOR_MAP_IM2COL_WIDE_MODE_W`.
+    when ``mode`` is :py:obj:`~.CU_TENSOR_MAP_IM2COL_WIDE_MODE_W`.
 
-    - `l2Promotion` specifies L2 fetch size which indicates the byte
+    - ``l2Promotion`` specifies L2 fetch size which indicates the byte
       granularity at which L2 requests are filled from DRAM. It must be of
       type :py:obj:`~.CUtensorMapL2promotion`, which is defined as:
 
     - **View CUDA Toolkit Documentation for a C++ code example**
 
-    - `oobFill`, which indicates whether zero or a special NaN constant
+    - ``oobFill``, which indicates whether zero or a special NaN constant
       should be used to fill out-of-bound elements, must be of type
       :py:obj:`~.CUtensorMapFloatOOBfill` which is defined as:
 
@@ -54580,8 +54623,8 @@ def cuTensorMapEncodeIm2colWide(tensorDataType not None : CUtensorMapDataType, t
 
     - Note that
       :py:obj:`~.CU_TENSOR_MAP_FLOAT_OOB_FILL_NAN_REQUEST_ZERO_FMA` can
-      only be used when `tensorDataType` represents a floating-point data
-      type, and when `tensorDataType` is not
+      only be used when ``tensorDataType`` represents a floating-point data
+      type, and when ``tensorDataType`` is not
       :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B`,
       :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B`, and
       :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B`.
@@ -54596,10 +54639,10 @@ def cuTensorMapEncodeIm2colWide(tensorDataType not None : CUtensorMapDataType, t
         Starting address of memory region described by tensor
     globalDim : list[:py:obj:`~.cuuint64_t`]
         Array containing tensor size (number of elements) along each of the
-        `tensorRank` dimensions
+        ``tensorRank`` dimensions
     globalStrides : list[:py:obj:`~.cuuint64_t`]
         Array containing stride size (in bytes) along each of the
-        `tensorRank` - 1 dimensions
+        ``tensorRank`` - 1 dimensions
     pixelBoxLowerCornerWidth : int
         Width offset of left box corner
     pixelBoxUpperCornerWidth : int
@@ -54609,7 +54652,7 @@ def cuTensorMapEncodeIm2colWide(tensorDataType not None : CUtensorMapDataType, t
     pixelsPerColumn : Any
         Number of pixels per column
     elementStrides : list[:py:obj:`~.cuuint32_t`]
-        Array containing traversal stride in each of the `tensorRank`
+        Array containing traversal stride in each of the ``tensorRank``
         dimensions
     interleave : :py:obj:`~.CUtensorMapInterleave`
         Type of interleaved layout the tensor addresses
@@ -54724,7 +54767,7 @@ def cuTensorMapReplaceAddress(tensorMap : Optional[CUtensorMap], globalAddress):
     """ Modify an existing tensor map descriptor with an updated global address.
 
     Modifies the descriptor for Tensor Memory Access (TMA) object passed in
-    `tensorMap` with an updated `globalAddress`.
+    ``tensorMap`` with an updated ``globalAddress``.
 
     Tensor map objects are only supported on devices of compute capability
     9.0 or higher. Additionally, a tensor map object is an opaque value,
@@ -54762,19 +54805,19 @@ def cuTensorMapReplaceAddress(tensorMap : Optional[CUtensorMap], globalAddress):
 def cuDeviceCanAccessPeer(dev, peerDev):
     """ Queries if a device may directly access a peer device's memory.
 
-    Returns in `*canAccessPeer` a value of 1 if contexts on `dev` are
-    capable of directly accessing memory from contexts on `peerDev` and 0
-    otherwise. If direct access of `peerDev` from `dev` is possible, then
-    access may be enabled on two specific contexts by calling
+    Returns in ``*canAccessPeer`` a value of 1 if contexts on ``dev`` are
+    capable of directly accessing memory from contexts on ``peerDev`` and 0
+    otherwise. If direct access of ``peerDev`` from ``dev`` is possible,
+    then access may be enabled on two specific contexts by calling
     :py:obj:`~.cuCtxEnablePeerAccess()`.
 
     Parameters
     ----------
     dev : :py:obj:`~.CUdevice`
-        Device from which allocations on `peerDev` are to be directly
+        Device from which allocations on ``peerDev`` are to be directly
         accessed.
     peerDev : :py:obj:`~.CUdevice`
-        Device on which the allocations to be directly accessed by `dev`
+        Device on which the allocations to be directly accessed by ``dev``
         reside.
 
     Returns
@@ -54818,15 +54861,15 @@ def cuDeviceCanAccessPeer(dev, peerDev):
 def cuCtxEnablePeerAccess(peerContext, unsigned int Flags):
     """ Enables direct access to memory allocations in a peer context.
 
-    If both the current context and `peerContext` are on devices which
+    If both the current context and ``peerContext`` are on devices which
     support unified addressing (as may be queried using
     :py:obj:`~.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING`) and same major
-    compute capability, then on success all allocations from `peerContext`
-    will immediately be accessible by the current context. See
-    :py:obj:`~.Unified Addressing` for additional details.
+    compute capability, then on success all allocations from
+    ``peerContext`` will immediately be accessible by the current context.
+    See Unified Addressing for additional details.
 
     Note that access granted by this call is unidirectional and that in
-    order to access memory from the current context in `peerContext`, a
+    order to access memory from the current context in ``peerContext``, a
     separate symmetric call to :py:obj:`~.cuCtxEnablePeerAccess()` is
     required.
 
@@ -54837,10 +54880,10 @@ def cuCtxEnablePeerAccess(peerContext, unsigned int Flags):
     Returns :py:obj:`~.CUDA_ERROR_PEER_ACCESS_UNSUPPORTED` if
     :py:obj:`~.cuDeviceCanAccessPeer()` indicates that the
     :py:obj:`~.CUdevice` of the current context cannot directly access
-    memory from the :py:obj:`~.CUdevice` of `peerContext`.
+    memory from the :py:obj:`~.CUdevice` of ``peerContext``.
 
     Returns :py:obj:`~.CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED` if direct
-    access of `peerContext` from the current context has already been
+    access of ``peerContext`` from the current context has already been
     enabled.
 
     Returns :py:obj:`~.CUDA_ERROR_TOO_MANY_PEERS` if direct peer access is
@@ -54848,10 +54891,10 @@ def cuCtxEnablePeerAccess(peerContext, unsigned int Flags):
     been exhausted.
 
     Returns :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT` if there is no current
-    context, `peerContext` is not a valid context, or if the current
-    context is `peerContext`.
+    context, ``peerContext`` is not a valid context, or if the current
+    context is ``peerContext``.
 
-    Returns :py:obj:`~.CUDA_ERROR_INVALID_VALUE` if `Flags` is not 0.
+    Returns :py:obj:`~.CUDA_ERROR_INVALID_VALUE` if ``Flags`` is not 0.
 
     Parameters
     ----------
@@ -54889,11 +54932,11 @@ def cuCtxDisablePeerAccess(peerContext):
     """ Disables direct access to memory allocations in a peer context and unregisters any registered allocations.
 
     Returns :py:obj:`~.CUDA_ERROR_PEER_ACCESS_NOT_ENABLED` if direct peer
-    access has not yet been enabled from `peerContext` to the current
+    access has not yet been enabled from ``peerContext`` to the current
     context.
 
     Returns :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT` if there is no current
-    context, or if `peerContext` is not a valid context.
+    context, or if ``peerContext`` is not a valid context.
 
     Parameters
     ----------
@@ -54928,9 +54971,9 @@ def cuCtxDisablePeerAccess(peerContext):
 def cuDeviceGetP2PAttribute(attrib not None : CUdevice_P2PAttribute, srcDevice, dstDevice):
     """ Queries attributes of the link between two devices.
 
-    Returns in `*value` the value of the requested attribute `attrib` of
-    the link between `srcDevice` and `dstDevice`. The supported attributes
-    are:
+    Returns in ``*value`` the value of the requested attribute ``attrib``
+    of the link between ``srcDevice`` and ``dstDevice``. The supported
+    attributes are:
 
     - :py:obj:`~.CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK`: A relative
       value indicating the performance of the link between two devices.
@@ -54949,17 +54992,17 @@ def cuDeviceGetP2PAttribute(attrib not None : CUdevice_P2PAttribute, srcDevice,
       Information about specific operations can be retrieved with
       :py:obj:`~.cuDeviceGetP2PAtomicCapabilities`.
 
-    Returns :py:obj:`~.CUDA_ERROR_INVALID_DEVICE` if `srcDevice` or
-    `dstDevice` are not valid or if they represent the same device.
+    Returns :py:obj:`~.CUDA_ERROR_INVALID_DEVICE` if ``srcDevice`` or
+    ``dstDevice`` are not valid or if they represent the same device.
 
-    Returns :py:obj:`~.CUDA_ERROR_INVALID_VALUE` if `attrib` is not valid
-    or if `value` is a null pointer.
+    Returns :py:obj:`~.CUDA_ERROR_INVALID_VALUE` if ``attrib`` is not valid
+    or if ``value`` is a null pointer.
 
     Parameters
     ----------
     attrib : :py:obj:`~.CUdevice_P2PAttribute`
-        The requested attribute of the link between `srcDevice` and
-        `dstDevice`.
+        The requested attribute of the link between ``srcDevice`` and
+        ``dstDevice``.
     srcDevice : :py:obj:`~.CUdevice`
         The source device of the target link.
     dstDevice : :py:obj:`~.CUdevice`
@@ -55007,22 +55050,22 @@ def cuDeviceGetP2PAttribute(attrib not None : CUdevice_P2PAttribute, srcDevice,
 def cuDeviceGetP2PAtomicCapabilities(operations : Optional[tuple[CUatomicOperation] | list[CUatomicOperation]], unsigned int count, srcDevice, dstDevice):
     """ Queries details about atomic operations supported between two devices.
 
-    Returns in `*capabilities` the details about requested atomic
-    `*operations` over the the link between `srcDevice` and `dstDevice`.
-    The allocated size of `*operations` and `*capabilities` must be
-    `count`.
+    Returns in ``*capabilities`` the details about requested atomic
+    ``*operations`` over the link between ``srcDevice`` and
+    ``dstDevice``. The allocated size of ``*operations`` and
+    ``*capabilities`` must be ``count``.
 
-    For each :py:obj:`~.CUatomicOperation` in `*operations`, the
-    corresponding result in `*capabilities` will be a bitmask indicating
+    For each :py:obj:`~.CUatomicOperation` in ``*operations``, the
+    corresponding result in ``*capabilities`` will be a bitmask indicating
     which of :py:obj:`~.CUatomicOperationCapability` the link supports
     natively.
 
-    Returns :py:obj:`~.CUDA_ERROR_INVALID_DEVICE` if `srcDevice` or
-    `dstDevice` are not valid or if they represent the same device.
+    Returns :py:obj:`~.CUDA_ERROR_INVALID_DEVICE` if ``srcDevice`` or
+    ``dstDevice`` are not valid or if they represent the same device.
 
-    Returns :py:obj:`~.CUDA_ERROR_INVALID_VALUE` if `*capabilities` or
-    `*operations` is NULL, if `count` is 0, or if any of `*operations` is
-    not valid.
+    Returns :py:obj:`~.CUDA_ERROR_INVALID_VALUE` if ``*capabilities`` or
+    ``*operations`` is NULL, if ``count`` is 0, or if any of
+    ``*operations`` is not valid.
 
     Parameters
     ----------
@@ -55090,11 +55133,11 @@ def cuDeviceGetP2PAtomicCapabilities(operations : Optional[tuple[CUatomicOperati
 def cuGraphicsUnregisterResource(resource):
     """ Unregisters a graphics resource for access by CUDA.
 
-    Unregisters the graphics resource `resource` so it is not accessible by
-    CUDA unless registered again.
+    Unregisters the graphics resource ``resource`` so it is not accessible
+    by CUDA unless registered again.
 
-    If `resource` is invalid then :py:obj:`~.CUDA_ERROR_INVALID_HANDLE` is
-    returned.
+    If ``resource`` is invalid then :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`
+    is returned.
 
     Parameters
     ----------
@@ -55129,18 +55172,18 @@ def cuGraphicsUnregisterResource(resource):
 def cuGraphicsSubResourceGetMappedArray(resource, unsigned int arrayIndex, unsigned int mipLevel):
     """ Get an array through which to access a subresource of a mapped graphics resource.
 
-    Returns in `*pArray` an array through which the subresource of the
-    mapped graphics resource `resource` which corresponds to array index
-    `arrayIndex` and mipmap level `mipLevel` may be accessed. The value set
-    in `*pArray` may change every time that `resource` is mapped.
+    Returns in ``*pArray`` an array through which the subresource of the
+    mapped graphics resource ``resource`` which corresponds to array index
+    ``arrayIndex`` and mipmap level ``mipLevel`` may be accessed. The value
+    set in ``*pArray`` may change every time that ``resource`` is mapped.
 
-    If `resource` is not a texture then it cannot be accessed via an array
-    and :py:obj:`~.CUDA_ERROR_NOT_MAPPED_AS_ARRAY` is returned. If
-    `arrayIndex` is not a valid array index for `resource` then
-    :py:obj:`~.CUDA_ERROR_INVALID_VALUE` is returned. If `mipLevel` is not
-    a valid mipmap level for `resource` then
-    :py:obj:`~.CUDA_ERROR_INVALID_VALUE` is returned. If `resource` is not
-    mapped then :py:obj:`~.CUDA_ERROR_NOT_MAPPED` is returned.
+    If ``resource`` is not a texture then it cannot be accessed via an
+    array and :py:obj:`~.CUDA_ERROR_NOT_MAPPED_AS_ARRAY` is returned. If
+    ``arrayIndex`` is not a valid array index for ``resource`` then
+    :py:obj:`~.CUDA_ERROR_INVALID_VALUE` is returned. If ``mipLevel`` is
+    not a valid mipmap level for ``resource`` then
+    :py:obj:`~.CUDA_ERROR_INVALID_VALUE` is returned. If ``resource`` is
+    not mapped then :py:obj:`~.CUDA_ERROR_NOT_MAPPED` is returned.
 
     Parameters
     ----------
@@ -55158,7 +55201,7 @@ def cuGraphicsSubResourceGetMappedArray(resource, unsigned int arrayIndex, unsig
     CUresult
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_NOT_MAPPED`, :py:obj:`~.CUDA_ERROR_NOT_MAPPED_AS_ARRAY`
     pArray : :py:obj:`~.CUarray`
-        Returned array through which a subresource of `resource` may be
+        Returned array through which a subresource of ``resource`` may be
         accessed
 
     See Also
@@ -55187,13 +55230,13 @@ def cuGraphicsSubResourceGetMappedArray(resource, unsigned int arrayIndex, unsig
 def cuGraphicsResourceGetMappedMipmappedArray(resource):
     """ Get a mipmapped array through which to access a mapped graphics resource.
 
-    Returns in `*pMipmappedArray` a mipmapped array through which the
-    mapped graphics resource `resource`. The value set in
-    `*pMipmappedArray` may change every time that `resource` is mapped.
+    Returns in ``*pMipmappedArray`` a mipmapped array through which the
+    mapped graphics resource ``resource`` may be accessed. The value set in
+    ``*pMipmappedArray`` may change every time that ``resource`` is mapped.
 
-    If `resource` is not a texture then it cannot be accessed via a
+    If ``resource`` is not a texture then it cannot be accessed via a
     mipmapped array and :py:obj:`~.CUDA_ERROR_NOT_MAPPED_AS_ARRAY` is
-    returned. If `resource` is not mapped then
+    returned. If ``resource`` is not mapped then
     :py:obj:`~.CUDA_ERROR_NOT_MAPPED` is returned.
 
     Parameters
@@ -55206,7 +55249,7 @@ def cuGraphicsResourceGetMappedMipmappedArray(resource):
     CUresult
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_NOT_MAPPED`, :py:obj:`~.CUDA_ERROR_NOT_MAPPED_AS_ARRAY`
     pMipmappedArray : :py:obj:`~.CUmipmappedArray`
-        Returned mipmapped array through which `resource` may be accessed
+        Returned mipmapped array through which ``resource`` may be accessed
 
     See Also
     --------
@@ -55234,14 +55277,14 @@ def cuGraphicsResourceGetMappedMipmappedArray(resource):
 def cuGraphicsResourceGetMappedPointer(resource):
     """ Get a device pointer through which to access a mapped graphics resource.
 
-    Returns in `*pDevPtr` a pointer through which the mapped graphics
-    resource `resource` may be accessed. Returns in `pSize` the size of the
-    memory in bytes which may be accessed from that pointer. The value set
-    in `pPointer` may change every time that `resource` is mapped.
+    Returns in ``*pDevPtr`` a pointer through which the mapped graphics
+    resource ``resource`` may be accessed. Returns in ``pSize`` the size of
+    the memory in bytes which may be accessed from that pointer. The value
+    set in ``pPointer`` may change every time that ``resource`` is mapped.
 
-    If `resource` is not a buffer then it cannot be accessed via a pointer
-    and :py:obj:`~.CUDA_ERROR_NOT_MAPPED_AS_POINTER` is returned. If
-    `resource` is not mapped then :py:obj:`~.CUDA_ERROR_NOT_MAPPED` is
+    If ``resource`` is not a buffer then it cannot be accessed via a
+    pointer and :py:obj:`~.CUDA_ERROR_NOT_MAPPED_AS_POINTER` is returned.
+    If ``resource`` is not mapped then :py:obj:`~.CUDA_ERROR_NOT_MAPPED` is
     returned.
 
     Parameters
@@ -55281,10 +55324,10 @@ def cuGraphicsResourceGetMappedPointer(resource):
 def cuGraphicsResourceSetMapFlags(resource, unsigned int flags):
     """ Set usage flags for mapping a graphics resource.
 
-    Set `flags` for mapping the graphics resource `resource`.
+    Set ``flags`` for mapping the graphics resource ``resource``.
 
-    Changes to `flags` will take effect the next time `resource` is mapped.
-    The `flags` argument may be any of the following:
+    Changes to ``flags`` will take effect the next time ``resource`` is
+    mapped. The ``flags`` argument may be any of the following:
 
     - :py:obj:`~.CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE`: Specifies no hints
       about how this resource will be used. It is therefore assumed that
@@ -55300,8 +55343,8 @@ def cuGraphicsResourceSetMapFlags(resource, unsigned int flags):
       resource and will write over the entire contents of the resource, so
       none of the data previously stored in the resource will be preserved.
 
-    If `resource` is presently mapped for access by CUDA then
-    :py:obj:`~.CUDA_ERROR_ALREADY_MAPPED` is returned. If `flags` is not
+    If ``resource`` is presently mapped for access by CUDA then
+    :py:obj:`~.CUDA_ERROR_ALREADY_MAPPED` is returned. If ``flags`` is not
     one of the above values then :py:obj:`~.CUDA_ERROR_INVALID_VALUE` is
     returned.
 
@@ -55340,20 +55383,21 @@ def cuGraphicsResourceSetMapFlags(resource, unsigned int flags):
 def cuGraphicsMapResources(unsigned int count, resources, hStream):
     """ Map graphics resources for access by CUDA.
 
-    Maps the `count` graphics resources in `resources` for access by CUDA.
+    Maps the ``count`` graphics resources in ``resources`` for access by
+    CUDA.
 
-    The resources in `resources` may be accessed by CUDA until they are
-    unmapped. The graphics API from which `resources` were registered
+    The resources in ``resources`` may be accessed by CUDA until they are
+    unmapped. The graphics API from which ``resources`` were registered
     should not access any resources while they are mapped by CUDA. If an
     application does so, the results are undefined.
 
     This function provides the synchronization guarantee that any graphics
     calls issued before :py:obj:`~.cuGraphicsMapResources()` will complete
-    before any subsequent CUDA work issued in `stream` begins.
+    before any subsequent CUDA work issued in ``stream`` begins.
 
-    If `resources` includes any duplicate entries then
+    If ``resources`` includes any duplicate entries then
     :py:obj:`~.CUDA_ERROR_INVALID_HANDLE` is returned. If any of
-    `resources` are presently mapped for access by CUDA then
+    ``resources`` are presently mapped for access by CUDA then
     :py:obj:`~.CUDA_ERROR_ALREADY_MAPPED` is returned.
 
     Parameters
@@ -55403,18 +55447,18 @@ def cuGraphicsMapResources(unsigned int count, resources, hStream):
 def cuGraphicsUnmapResources(unsigned int count, resources, hStream):
     """ Unmap graphics resources.
 
-    Unmaps the `count` graphics resources in `resources`.
+    Unmaps the ``count`` graphics resources in ``resources``.
 
-    Once unmapped, the resources in `resources` may not be accessed by CUDA
-    until they are mapped again.
+    Once unmapped, the resources in ``resources`` may not be accessed by
+    CUDA until they are mapped again.
 
     This function provides the synchronization guarantee that any CUDA work
-    issued in `stream` before :py:obj:`~.cuGraphicsUnmapResources()` will
+    issued in ``stream`` before :py:obj:`~.cuGraphicsUnmapResources()` will
     complete before any subsequently issued graphics work begins.
 
-    If `resources` includes any duplicate entries then
+    If ``resources`` includes any duplicate entries then
     :py:obj:`~.CUDA_ERROR_INVALID_HANDLE` is returned. If any of
-    `resources` are not presently mapped for access by CUDA then
+    ``resources`` are not presently mapped for access by CUDA then
     :py:obj:`~.CUDA_ERROR_NOT_MAPPED` is returned.
 
     Parameters
@@ -55464,7 +55508,7 @@ def cuGraphicsUnmapResources(unsigned int count, resources, hStream):
 def cuGetProcAddress(char* symbol, int cudaVersion, flags):
     """ Returns the requested driver API function pointer.
 
-    Returns in `**pfn` the address of the CUDA driver function for the
+    Returns in ``**pfn`` the address of the CUDA driver function for the
     requested CUDA version and flags.
 
     The CUDA version is specified as (1000 * major + 10 * minor), so CUDA
@@ -55480,18 +55524,18 @@ def cuGetProcAddress(char* symbol, int cudaVersion, flags):
     file. The function pointer typedef can be picked up from the
     corresponding typedefs header file. For example, cudaTypedefs.h
     consists of function pointer typedefs for driver APIs defined in
-    :py:obj:`~.cuda.h`.
+    cuda.h.
 
-    The API will return :py:obj:`~.CUDA_SUCCESS` and set the returned `pfn`
-    to NULL if the requested driver function is not supported on the
-    platform, no ABI compatible driver function exists for the specified
-    `cudaVersion` or if the driver symbol is invalid.
+    The API will return :py:obj:`~.CUDA_SUCCESS` and set the returned
+    ``pfn`` to NULL if the requested driver function is not supported on
+    the platform, no ABI compatible driver function exists for the
+    specified ``cudaVersion`` or if the driver symbol is invalid.
 
-    It will also set the optional `symbolStatus` to one of the values in
+    It will also set the optional ``symbolStatus`` to one of the values in
     :py:obj:`~.CUdriverProcAddressQueryResult` with the following meanings:
 
     - :py:obj:`~.CU_GET_PROC_ADDRESS_SUCCESS` - The requested symbol was
-      succesfully found based on input arguments and `pfn` is valid
+      successfully found based on input arguments and ``pfn`` is valid
 
     - :py:obj:`~.CU_GET_PROC_ADDRESS_SYMBOL_NOT_FOUND` - The requested
       symbol was not found
@@ -55523,8 +55567,8 @@ def cuGetProcAddress(char* symbol, int cudaVersion, flags):
     ----------
     symbol : bytes
         The base name of the driver API function to look for. As an
-        example, for the driver API :py:obj:`~.cuMemAlloc_v2`, `symbol`
-        would be cuMemAlloc and `cudaVersion` would be the ABI compatible
+        example, for the driver API :py:obj:`~.cuMemAlloc_v2`, ``symbol``
+        would be cuMemAlloc and ``cudaVersion`` would be the ABI compatible
         CUDA version for the _v2 variant.
     cudaVersion : int
         The CUDA version to look for the requested driver symbol
@@ -55539,8 +55583,8 @@ def cuGetProcAddress(char* symbol, int cudaVersion, flags):
         Location to return the function pointer to the requested driver
         function
     symbolStatus : :py:obj:`~.CUdriverProcAddressQueryResult`
-        Optional location to store the status of the search for `symbol`
-        based on `cudaVersion`. See
+        Optional location to store the status of the search for ``symbol``
+        based on ``cudaVersion``. See
         :py:obj:`~.CUdriverProcAddressQueryResult` for possible values.
 
     See Also
@@ -55570,13 +55614,13 @@ def cuGetProcAddress(char* symbol, int cudaVersion, flags):
 def cuCoredumpGetAttribute(attrib not None : CUcoredumpSettings):
     """ Allows caller to fetch a coredump attribute value for the current context.
 
-    Returns in `*value` the requested value specified by `attrib`. It is up
-    to the caller to ensure that the data type and size of `*value` matches
-    the request.
+    Returns in ``*value`` the requested value specified by ``attrib``. It
+    is up to the caller to ensure that the data type and size of ``*value``
+    matches the request.
 
-    If the caller calls this function with `*value` equal to NULL, the size
-    of the memory region (in bytes) expected for `attrib` will be placed in
-    `size`.
+    If the caller calls this function with ``*value`` equal to NULL, the
+    size of the memory region (in bytes) expected for ``attrib`` will be
+    placed in ``size``.
 
     The supported attributes are:
 
@@ -55680,7 +55724,7 @@ def cuCoredumpGetAttribute(attrib not None : CUcoredumpSettings):
     attrib : :py:obj:`~.CUcoredumpSettings`
         The enum defining which value to fetch.
     size : int
-        The size of the memory region `value` points to.
+        The size of the memory region ``value`` points to.
 
     Returns
     -------
@@ -55689,7 +55733,7 @@ def cuCoredumpGetAttribute(attrib not None : CUcoredumpSettings):
     value : Any
         void* containing the requested data.
     size : int
-        The size of the memory region `value` points to.
+        The size of the memory region ``value`` points to.
 
     See Also
     --------
@@ -55712,13 +55756,13 @@ def cuCoredumpGetAttribute(attrib not None : CUcoredumpSettings):
 def cuCoredumpGetAttributeGlobal(attrib not None : CUcoredumpSettings):
     """ Allows caller to fetch a coredump attribute value for the entire application.
 
-    Returns in `*value` the requested value specified by `attrib`. It is up
-    to the caller to ensure that the data type and size of `*value` matches
-    the request.
+    Returns in ``*value`` the requested value specified by ``attrib``. It
+    is up to the caller to ensure that the data type and size of ``*value``
+    matches the request.
 
-    If the caller calls this function with `*value` equal to NULL, the size
-    of the memory region (in bytes) expected for `attrib` will be placed in
-    `size`.
+    If the caller calls this function with ``*value`` equal to NULL, the
+    size of the memory region (in bytes) expected for ``attrib`` will be
+    placed in ``size``.
 
     The supported attributes are:
 
@@ -55818,7 +55862,7 @@ def cuCoredumpGetAttributeGlobal(attrib not None : CUcoredumpSettings):
     attrib : :py:obj:`~.CUcoredumpSettings`
         The enum defining which value to fetch.
     size : int
-        The size of the memory region `value` points to.
+        The size of the memory region ``value`` points to.
 
     Returns
     -------
@@ -55827,7 +55871,7 @@ def cuCoredumpGetAttributeGlobal(attrib not None : CUcoredumpSettings):
     value : Any
         void* containing the requested data.
     size : int
-        The size of the memory region `value` points to.
+        The size of the memory region ``value`` points to.
 
     See Also
     --------
@@ -55860,13 +55904,13 @@ def cuCoredumpSetAttribute(attrib not None : CUcoredumpSettings, value):
     made to ensure no change in behavior for any users that may be
     currently using these variables to get coredumps.
 
-    `*value` shall contain the requested value specified by `set`. It is up
-    to the caller to ensure that the data type and size of `*value` matches
-    the request.
+    ``*value`` shall contain the requested value specified by ``set``. It
+    is up to the caller to ensure that the data type and size of ``*value``
+    matches the request.
 
-    If the caller calls this function with `*value` equal to NULL, the size
-    of the memory region (in bytes) expected for `set` will be placed in
-    `size`.
+    If the caller calls this function with ``*value`` equal to NULL, the
+    size of the memory region (in bytes) expected for ``set`` will be
+    placed in ``size``.
 
     /note This function will return :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED` if
     the caller attempts to set :py:obj:`~.CU_COREDUMP_ENABLE_ON_EXCEPTION`
@@ -55965,14 +56009,14 @@ def cuCoredumpSetAttribute(attrib not None : CUcoredumpSettings, value):
     value : Any
         void* containing the requested data.
     size : int
-        The size of the memory region `value` points to.
+        The size of the memory region ``value`` points to.
 
     Returns
     -------
     CUresult
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_CONTEXT_IS_DESTROYED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
     size : int
-        The size of the memory region `value` points to.
+        The size of the memory region ``value`` points to.
 
     See Also
     --------
@@ -56007,13 +56051,13 @@ def cuCoredumpSetAttributeGlobal(attrib not None : CUcoredumpSettings, value):
     made to ensure no change in behavior for any users that may be
     currently using these variables to get coredumps.
 
-    `*value` shall contain the requested value specified by `set`. It is up
-    to the caller to ensure that the data type and size of `*value` matches
-    the request.
+    ``*value`` shall contain the requested value specified by ``set``. It
+    is up to the caller to ensure that the data type and size of ``*value``
+    matches the request.
 
-    If the caller calls this function with `*value` equal to NULL, the size
-    of the memory region (in bytes) expected for `set` will be placed in
-    `size`.
+    If the caller calls this function with ``*value`` equal to NULL, the
+    size of the memory region (in bytes) expected for ``set`` will be
+    placed in ``size``.
 
     The supported attributes are:
 
@@ -56117,14 +56161,14 @@ def cuCoredumpSetAttributeGlobal(attrib not None : CUcoredumpSettings, value):
     value : Any
         void* containing the requested data.
     size : int
-        The size of the memory region `value` points to.
+        The size of the memory region ``value`` points to.
 
     Returns
     -------
     CUresult
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`
     size : int
-        The size of the memory region `value` points to.
+        The size of the memory region ``value`` points to.
 
     See Also
     --------
@@ -56335,7 +56379,7 @@ def cuCoredumpDeregisterCompleteCallback(callback):
 
 @cython.embedsignature(True)
 def cuGetExportTable(pExportTableId : Optional[CUuuid]):
-    """ 
+    """
 
     Parameters
     ----------
@@ -56365,12 +56409,12 @@ def cuGreenCtxCreate(desc, dev, unsigned int flags):
     """ Creates a green context with a specified set of resources.
 
     This API creates a green context with the resources specified in the
-    descriptor `desc` and returns it in the handle represented by `phCtx`.
-    This API will retain the primary context on device `dev`, which will is
-    released when the green context is destroyed. It is advised to have the
-    primary context active before calling this API to avoid the heavy cost
-    of triggering primary context initialization and deinitialization
-    multiple times.
+    descriptor ``desc`` and returns it in the handle represented by
+    ``phCtx``. This API will retain the primary context on device ``dev``,
+    which will be released when the green context is destroyed. It is
+    advised to have the primary context active before calling this API to
+    avoid the heavy cost of triggering primary context initialization and
+    deinitialization multiple times.
 
     The API does not set the green context current. In order to set it
     current, you need to explicitly set it current by first converting the
@@ -56385,9 +56429,9 @@ def cuGreenCtxCreate(desc, dev, unsigned int flags):
 
     The supported flags are:
 
-    - `CU_GREEN_CTX_NONE` : Default behavior.
+    - ``CU_GREEN_CTX_NONE`` : Default behavior.
 
-    - `CU_GREEN_CTX_DEFAULT_STREAM` : Creates a default stream to use
+    - ``CU_GREEN_CTX_DEFAULT_STREAM`` : Creates a default stream to use
       inside the green context.
 
     Parameters
@@ -56491,11 +56535,11 @@ def cuGreenCtxDestroy(hCtx):
 def cuCtxFromGreenCtx(hCtx):
     """ Returns a :py:obj:`~.CUcontext` handle for a green context.
 
-    This API returns in `pContext` a :py:obj:`~.CUcontext` handle that
-    represents the specified green context `hCtx`. The returned handle can
-    be passed to CUDA APIs that accept a :py:obj:`~.CUcontext` and will be
-    treated as if it were a primary context, while still honoring the
-    resources and configuration associated with `hCtx` as applicable.
+    This API returns in ``pContext`` a :py:obj:`~.CUcontext` handle that
+    represents the specified green context ``hCtx``. The returned handle
+    can be passed to CUDA APIs that accept a :py:obj:`~.CUcontext` and will
+    be treated as if it were a primary context, while still honoring the
+    resources and configuration associated with ``hCtx`` as applicable.
 
     Applications that wish to use a green context with CUDA APIs that
     require a :py:obj:`~.CUcontext` must use this API to obtain a handle to
@@ -56512,14 +56556,14 @@ def cuCtxFromGreenCtx(hCtx):
 
     - For APIs whose behavior depends on green context resources (for
       example, kernel launch), the operation is performed using the
-      resources and configuration of the specified green context `hCtx`.
+      resources and configuration of the specified green context ``hCtx``.
 
     This call does not create a new independent context and does not change
     the underlying context lifetime. The validity of the returned
-    `pContext` is tied to `hCtx`, and no additional destruction or release
-    is required beyond correctly managing `hCtx` with the green context
-    APIs. Destroying `pContext` via :py:obj:`~.cuCtxDestroy` is undefined
-    behavior.
+    ``pContext`` is tied to ``hCtx``, and no additional destruction or
+    release is required beyond correctly managing ``hCtx`` with the green
+    context APIs. Destroying ``pContext`` via :py:obj:`~.cuCtxDestroy` is
+    undefined behavior.
 
     Parameters
     ----------
@@ -56559,8 +56603,8 @@ def cuCtxFromGreenCtx(hCtx):
 def cuDeviceGetDevResource(device, typename not None : CUdevResourceType):
     """ Get device resources.
 
-    Get the `typename` resources available to the `device`. This may often
-    be the starting point for further partitioning or configuring of
+    Get the ``typename`` resources available to the ``device``. This may
+    often be the starting point for further partitioning or configuring of
     resources.
 
     Note: The API is not supported on 32-bit platforms.
@@ -56577,7 +56621,7 @@ def cuDeviceGetDevResource(device, typename not None : CUdevResourceType):
     CUresult
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_RESOURCE_TYPE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`
     resource : :py:obj:`~.CUdevResource`
-        Output pointer to a :py:obj:`~.CUdevResource` structure
+        Output pointer to a CUdevResource structure
 
     See Also
     --------
@@ -56606,8 +56650,8 @@ def cuDeviceGetDevResource(device, typename not None : CUdevResourceType):
 def cuCtxGetDevResource(hCtx, typename not None : CUdevResourceType):
     """ Get context resources.
 
-    Get the `typename` resources available to the context represented by
-    `hCtx`  Note: The API is not supported on 32-bit platforms.
+    Get the ``typename`` resources available to the context represented by
+    ``hCtx``. Note: The API is not supported on 32-bit platforms.
 
     Parameters
     ----------
@@ -56621,7 +56665,7 @@ def cuCtxGetDevResource(hCtx, typename not None : CUdevResourceType):
     CUresult
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_RESOURCE_TYPE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`
     resource : :py:obj:`~.CUdevResource`
-        Output pointer to a :py:obj:`~.CUdevResource` structure
+        Output pointer to a CUdevResource structure
 
     See Also
     --------
@@ -56650,8 +56694,8 @@ def cuCtxGetDevResource(hCtx, typename not None : CUdevResourceType):
 def cuGreenCtxGetDevResource(hCtx, typename not None : CUdevResourceType):
     """ Get green context resources.
 
-    Get the `typename` resources available to the green context represented
-    by `hCtx`
+    Get the ``typename`` resources available to the green context
+    represented by ``hCtx``
 
     Parameters
     ----------
@@ -56665,7 +56709,7 @@ def cuGreenCtxGetDevResource(hCtx, typename not None : CUdevResourceType):
     CUresult
         :py:obj:`~.CUDA_SUCCESS` :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_RESOURCE_TYPE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     resource : :py:obj:`~.CUdevResource`
-        Output pointer to a :py:obj:`~.CUdevResource` structure
+        Output pointer to a CUdevResource structure
 
     See Also
     --------
@@ -56692,17 +56736,17 @@ def cuGreenCtxGetDevResource(hCtx, typename not None : CUdevResourceType):
 
 @cython.embedsignature(True)
 def cuDevSmResourceSplitByCount(unsigned int nbGroups, input_ : Optional[CUdevResource], unsigned int flags, unsigned int minCount):
-    """ Splits `CU_DEV_RESOURCE_TYPE_SM` resources.
-
-    Splits `CU_DEV_RESOURCE_TYPE_SM` resources into `nbGroups`, adhering to
-    the minimum SM count specified in `minCount` and the usage flags in
-    `flags`. If `result` is NULL, the API simulates a split and provides
-    the amount of groups that would be created in `nbGroups`. Otherwise,
-    `nbGroups` must point to the amount of elements in `result` and on
-    return, the API will overwrite `nbGroups` with the amount actually
-    created. The groups are written to the array in `result`. `nbGroups`
-    can be less than the total amount if a smaller number of groups is
-    needed.
+    """ Splits ``CU_DEV_RESOURCE_TYPE_SM`` resources.
+
+    Splits ``CU_DEV_RESOURCE_TYPE_SM`` resources into ``nbGroups``,
+    adhering to the minimum SM count specified in ``minCount`` and the
+    usage flags in ``flags``. If ``result`` is NULL, the API simulates a
+    split and provides the amount of groups that would be created in
+    ``nbGroups``. Otherwise, ``nbGroups`` must point to the amount of
+    elements in ``result`` and on return, the API will overwrite
+    ``nbGroups`` with the amount actually created. The groups are written
+    to the array in ``result``. ``nbGroups`` can be less than the total
+    amount if a smaller number of groups is needed.
 
     This API is used to spatially partition the input resource. The input
     resource needs to come from one of :py:obj:`~.cuDeviceGetDevResource`,
@@ -56715,28 +56759,28 @@ def cuDevSmResourceSplitByCount(unsigned int nbGroups, input_ : Optional[CUdevRe
     performance and functional characteristics of the input resource, and
     guarantee a split that will create a disjoint set of symmetrical
     partitions. This may lead to fewer groups created than purely dividing
-    the total SM count by the `minCount` due to cluster requirements or
+    the total SM count by the ``minCount`` due to cluster requirements or
     alignment and granularity requirements for the minCount. These
     requirements can be queried with :py:obj:`~.cuDeviceGetDevResource`,
     :py:obj:`~.cuCtxGetDevResource`, and
     :py:obj:`~.cuGreenCtxGetDevResource` for
-    :py:obj:`~.CU_DEV_RESOURCE_TYPE_SM`, using the `minSmPartitionSize` and
-    `smCoscheduledAlignment` fields to determine minimum partition size and
-    alignment granularity, respectively.
+    :py:obj:`~.CU_DEV_RESOURCE_TYPE_SM`, using the ``minSmPartitionSize``
+    and ``smCoscheduledAlignment`` fields to determine minimum partition
+    size and alignment granularity, respectively.
 
-    The `remainder` set does not have the same functional or performance
-    guarantees as the groups in `result`. Its use should be carefully
-    planned and future partitions of the `remainder` set are discouraged.
+    The ``remainder`` set does not have the same functional or performance
+    guarantees as the groups in ``result``. Its use should be carefully
+    planned and future partitions of the ``remainder`` set are discouraged.
 
     The following flags are supported:
 
-    - `CU_DEV_SM_RESOURCE_SPLIT_IGNORE_SM_COSCHEDULING` : Lower the minimum
-      SM count and alignment, and treat each SM independent of its
+    - ``CU_DEV_SM_RESOURCE_SPLIT_IGNORE_SM_COSCHEDULING`` : Lower the
+      minimum SM count and alignment, and treat each SM independent of its
       hierarchy. This allows more fine grained partitions but at the cost
       of advanced features (such as large clusters on compute capability
       9.0+).
 
-    - `CU_DEV_SM_RESOURCE_SPLIT_MAX_POTENTIAL_CLUSTER_SIZE` : Compute
+    - ``CU_DEV_SM_RESOURCE_SPLIT_MAX_POTENTIAL_CLUSTER_SIZE`` : Compute
       Capability 9.0+ only. Attempt to create groups that may allow for
       maximally sized thread clusters. This can be queried post green
       context creation using
@@ -56744,16 +56788,16 @@ def cuDevSmResourceSplitByCount(unsigned int nbGroups, input_ : Optional[CUdevRe
 
     A successful API call must either have:
 
-    - A valid array of `result` pointers of size passed in `nbGroups`, with
-      `input` of type `CU_DEV_RESOURCE_TYPE_SM`. Value of `minCount` must
-      be between 0 and the SM count specified in `input`. `remainder` may
-      be NULL.
+    - A valid array of ``result`` pointers of size passed in ``nbGroups``,
+      with ``input`` of type ``CU_DEV_RESOURCE_TYPE_SM``. Value of
+      ``minCount`` must be between 0 and the SM count specified in
+      ``input``. ``remainder`` may be NULL.
 
-    - NULL passed in for `result`, with a valid integer pointer in
-      `nbGroups` and `input` of type `CU_DEV_RESOURCE_TYPE_SM`. Value of
-      `minCount` must be between 0 and the SM count specified in `input`.
-      `remainder` may be NULL. This queries the number of groups that would
-      be created by the API.
+    - NULL passed in for ``result``, with a valid integer pointer in
+      ``nbGroups`` and ``input`` of type ``CU_DEV_RESOURCE_TYPE_SM``. Value
+      of ``minCount`` must be between 0 and the SM count specified in
+      ``input``. ``remainder`` may be NULL. This queries the number of
+      groups that would be created by the API.
 
     Note: The API is not supported on 32-bit platforms.
 
@@ -56764,7 +56808,7 @@ def cuDevSmResourceSplitByCount(unsigned int nbGroups, input_ : Optional[CUdevRe
         should be created as described below.
     input : :py:obj:`~.CUdevResource`
         Input SM resource to be split. Must be a valid
-        `CU_DEV_RESOURCE_TYPE_SM` resource.
+        ``CU_DEV_RESOURCE_TYPE_SM`` resource.
     flags : unsigned int
         Flags specifying how these partitions are used or which constraints
         to abide by when splitting the input. Zero is valid for default
@@ -56777,14 +56821,14 @@ def cuDevSmResourceSplitByCount(unsigned int nbGroups, input_ : Optional[CUdevRe
     CUresult
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_RESOURCE_TYPE`, :py:obj:`~.CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION`
     result : list[:py:obj:`~.CUdevResource`]
-        Output array of `CUdevResource` resources. Can be NULL to query the
-        number of groups.
+        Output array of ``CUdevResource`` resources. Can be NULL to query
+        the number of groups.
     nbGroups : unsigned int
         This is a pointer, specifying the number of groups that would be or
         should be created as described below.
     remainder : :py:obj:`~.CUdevResource`
-        If the input resource cannot be cleanly split among `nbGroups`, the
-        remainder is placed in here. Can be ommitted (NULL) if the user
+        If the input resource cannot be cleanly split among ``nbGroups``,
+        the remainder is placed in here. Can be omitted (NULL) if the user
         does not need the remaining set.
 
     See Also
@@ -56816,83 +56860,84 @@ def cuDevSmResourceSplitByCount(unsigned int nbGroups, input_ : Optional[CUdevRe
 
 @cython.embedsignature(True)
 def cuDevSmResourceSplit(unsigned int nbGroups, input_ : Optional[CUdevResource], unsigned int flags, groupParams : Optional[tuple[CU_DEV_SM_RESOURCE_GROUP_PARAMS] | list[CU_DEV_SM_RESOURCE_GROUP_PARAMS]]):
-    """ Splits a `CU_DEV_RESOURCE_TYPE_SM` resource into structured groups.
+    """ Splits a ``CU_DEV_RESOURCE_TYPE_SM`` resource into structured groups.
 
     This API will split a resource of :py:obj:`~.CU_DEV_RESOURCE_TYPE_SM`
-    into `nbGroups` structured device resource groups (the `result` array),
-    as well as an optional `remainder`, according to a set of requirements
-    specified in the `groupParams` array. The term “structured” is a trait
-    that specifies the `result` has SMs that are co-scheduled together.
-    This co-scheduling can be specified via the `coscheduledSmCount` field
-    of the `groupParams` structure, while the `smCount` will specify how
-    many SMs are required in total for that result. The remainder is always
-    “unstructured”, it does not have any set guarantees with respect to co-
-    scheduling and those properties will need to either be queried via the
-    occupancy set of APIs or further split into structured groups by this
-    API.
+    into ``nbGroups`` structured device resource groups (the ``result``
+    array), as well as an optional ``remainder``, according to a set of
+    requirements specified in the ``groupParams`` array. The term
+    “structured” is a trait that specifies the ``result`` has SMs that are
+    co-scheduled together. This co-scheduling can be specified via the
+    ``coscheduledSmCount`` field of the ``groupParams`` structure, while
+    the ``smCount`` will specify how many SMs are required in total for
+    that result. The remainder is always “unstructured”, it does not have
+    any set guarantees with respect to co-scheduling and those properties
+    will need to either be queried via the occupancy set of APIs or further
+    split into structured groups by this API.
 
     The API has a discovery mode for use cases where it is difficult to
     know ahead of time what the SM count should be. Discovery happens when
-    the `smCount` field of a given `groupParams` array entry is set to 0 -
-    the smCount will be filled in by the API with the derived SM count
-    according to the provided `groupParams` fields and constraints.
+    the ``smCount`` field of a given ``groupParams`` array entry is set to
+    0 - the smCount will be filled in by the API with the derived SM count
+    according to the provided ``groupParams`` fields and constraints.
     Discovery can be used with both a valid result array and with a NULL
-    `result` pointer value. The latter is useful in situations where the
+    ``result`` pointer value. The latter is useful in situations where the
     smCount will end up being zero, which is an invalid value to create a
-    result entry with, but allowed for discovery purposes when the `result`
-    is NULL.
+    result entry with, but allowed for discovery purposes when the
+    ``result`` is NULL.
 
-    The `groupParams` array is evaluated from index 0 to `nbGroups` - 1.
-    For each index in the `groupParams` array, the API will evaluate which
-    SMs may be a good fit based on constraints and assign those SMs to
-    `result`. This evaluation order is important to consider when using
-    discovery mode, as it helps discover the remaining SMs.
+    The ``groupParams`` array is evaluated from index 0 to
+    ``nbGroups`` - 1. For each index in the ``groupParams`` array, the API
+    will evaluate which SMs may be a good fit based on constraints and
+    assign those SMs to ``result``. This evaluation order is important to
+    consider when using discovery mode, as it helps discover the remaining SMs.
 
     For a valid call:
 
-    - `result` should point to a `CUdevResource` array of size `nbGroups`,
-      or alternatively, may be NULL, if the developer wishes for only the
-      groupParams entries to be updated
+    - ``result`` should point to a ``CUdevResource`` array of size
+      ``nbGroups``, or alternatively, may be NULL, if the developer wishes
+      for only the groupParams entries to be updated
 
-    - `input` should be a valid :py:obj:`~.CU_DEV_RESOURCE_TYPE_SM`
+    - ``input`` should be a valid :py:obj:`~.CU_DEV_RESOURCE_TYPE_SM`
       resource that originates from querying the green context, device
       context, or device.
 
-    - The `remainder` group may be NULL.
+    - The ``remainder`` group may be NULL.
 
-    - There are no API `flags` at this time, so the value passed in should
-      be 0.
+    - There are no API ``flags`` at this time, so the value passed in
+      should be 0.
 
-    - A :py:obj:`~.CU_DEV_SM_RESOURCE_GROUP_PARAMS` array of size
-      `nbGroups`. Each entry must be zero-initialized.
+    - A CU_DEV_SM_RESOURCE_GROUP_PARAMS array of size ``nbGroups``. Each
+      entry must be zero-initialized.
 
-      - `smCount:` must be either 0 or in the range of [2,inputSmCount]
-        where inputSmCount is the amount of SMs the `input` resource has.
-        `smCount` must be a multiple of 2, as well as a multiple of
-        `coscheduledSmCount`. When assigning SMs to a group (and if results
-        are expected by having the `result` parameter set), `smCount`
-        cannot end up with 0 or a value less than `coscheduledSmCount`
-        otherwise CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION will be
-        returned.
+      - ``smCount:`` must be either 0 or in the range of [2,inputSmCount]
+        where inputSmCount is the amount of SMs the ``input`` resource has.
+        ``smCount`` must be a multiple of 2, as well as a multiple of
+        ``coscheduledSmCount``. When assigning SMs to a group (and if
+        results are expected by having the ``result`` parameter set),
+        ``smCount`` cannot end up with 0 or a value less than
+        ``coscheduledSmCount`` otherwise
+        CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION will be returned.
 
-      - `coscheduledSmCount:` allows grouping SMs together in order to be
+      - ``coscheduledSmCount:`` allows grouping SMs together in order to be
         able to launch clusters on Compute Architecture 9.0+. The default
         value may be queried from the device’s
         :py:obj:`~.CU_DEV_RESOURCE_TYPE_SM` resource (8 on Compute
         Architecture 9.0+ and 2 otherwise). The maximum is 32 on Compute
         Architecture 9.0+ and 2 otherwise.
 
-      - `preferredCoscheduledSmCount:` Attempts to merge
-        `coscheduledSmCount` groups into larger groups, in order to make
-        use of `preferredClusterDimensions` on Compute Architecture 10.0+.
-        The default value is set to `coscheduledSmCount`.
+      - ``preferredCoscheduledSmCount:`` Attempts to merge
+        ``coscheduledSmCount`` groups into larger groups, in order to make
+        use of ``preferredClusterDimensions`` on Compute Architecture
+        10.0+. The default value is set to ``coscheduledSmCount``.
 
-      - `flags:`
+      - ``flags:``
 
-    - `CU_DEV_SM_RESOURCE_GROUP_BACKFILL:` lets `smCount` be a non-multiple
-    of `coscheduledSmCount`, filling the difference between SM count and
-    already assigned co-scheduled groupings with other SMs. This lets any
-    resulting group behave similar to the `remainder` group for example.
+        - ``CU_DEV_SM_RESOURCE_GROUP_BACKFILL:`` lets ``smCount`` be a non-
+          multiple of ``coscheduledSmCount``, filling the difference
+          between SM count and already assigned co-scheduled groupings with
+          other SMs. This lets any resulting group behave similar to the
+          ``remainder`` group for example.
 
     Example params and their effect:
 
@@ -56916,7 +56961,7 @@ def cuDevSmResourceSplit(unsigned int nbGroups, input_ : Optional[CUdevResource]
       always need to adhere to a structure of coscheduledSmCount (even if
       its just 2), and therefore must always have enough coscheduled SMs to
       cover that requirement (even with the
-      `CU_DEV_SM_RESOURCE_GROUP_BACKFILL` flag enabled).
+      ``CU_DEV_SM_RESOURCE_GROUP_BACKFILL`` flag enabled).
 
     Splitting an input into N groups, can be accomplished by repeatedly
     splitting off 1 group and re-splitting the remainder (a bisect
@@ -56926,10 +56971,10 @@ def cuDevSmResourceSplit(unsigned int nbGroups, input_ : Optional[CUdevResource]
     Parameters
     ----------
     nbGroups : unsigned int
-        Specifies the number of groups in `result` and `groupParams`
+        Specifies the number of groups in ``result`` and ``groupParams``
     input : :py:obj:`~.CUdevResource`
         Input SM resource to be split. Must be a valid
-        `CU_DEV_RESOURCE_TYPE_SM` resource.
+        ``CU_DEV_RESOURCE_TYPE_SM`` resource.
     flags : unsigned int
         Flags specifying how the API should behave. The value should be 0
         for now.
@@ -56942,7 +56987,7 @@ def cuDevSmResourceSplit(unsigned int nbGroups, input_ : Optional[CUdevResource]
     CUresult
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_RESOURCE_TYPE`, :py:obj:`~.CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION`
     result : list[:py:obj:`~.CUdevResource`]
-        Output array of `CUdevResource` resources. Can be NULL, alongside
+        Output array of ``CUdevResource`` resources. Can be NULL, alongside
         an smCount of 0, for discovery purpose.
     remainder : :py:obj:`~.CUdevResource`
         If splitting the input resource leaves any SMs, the remainder is
@@ -56993,22 +57038,22 @@ def cuDevResourceGenerateDesc(resources : Optional[tuple[CUdevResource] | list[C
     """ Generate a resource descriptor.
 
     Generates a single resource descriptor with the set of resources
-    specified in `resources`. The generated resource descriptor is
+    specified in ``resources``. The generated resource descriptor is
     necessary for the creation of green contexts via the
     :py:obj:`~.cuGreenCtxCreate` API. Resources of the same type can be
     passed in, provided they meet the requirements as noted below.
 
     A successful API call must have:
 
-    - A valid output pointer for the `phDesc` descriptor as well as a valid
-      array of `resources` pointers, with the array size passed in
-      `nbResources`. If multiple resources are provided in `resources`, the
-      device they came from must be the same, otherwise
+    - A valid output pointer for the ``phDesc`` descriptor as well as a
+      valid array of ``resources`` pointers, with the array size passed in
+      ``nbResources``. If multiple resources are provided in ``resources``,
+      the device they came from must be the same, otherwise
       CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION is returned. If multiple
-      resources are provided in `resources` and they are of type
+      resources are provided in ``resources`` and they are of type
       :py:obj:`~.CU_DEV_RESOURCE_TYPE_SM`, they must be outputs (whether
-      `result` or `remaining`) from the same split API instance and have
-      the same smCoscheduledAlignment values, otherwise
+      ``result`` or ``remaining``) from the same split API instance and
+      have the same smCoscheduledAlignment values, otherwise
       CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION is returned.
 
     Note: The API is not supported on 32-bit platforms.
@@ -57018,7 +57063,7 @@ def cuDevResourceGenerateDesc(resources : Optional[tuple[CUdevResource] | list[C
     resources : list[:py:obj:`~.CUdevResource`]
         Array of resources to be included in the descriptor
     nbResources : unsigned int
-        Number of resources passed in `resources`
+        Number of resources passed in ``resources``
 
     Returns
     -------
@@ -57060,13 +57105,13 @@ def cuDevResourceGenerateDesc(resources : Optional[tuple[CUdevResource] | list[C
 def cuGreenCtxRecordEvent(hCtx, hEvent):
     """ Records an event.
 
-    Captures in `hEvent` all the activities of the green context of `hCtx`
-    at the time of this call. `hEvent` and `hCtx` must be from the same
-    primary context otherwise :py:obj:`~.CUDA_ERROR_INVALID_HANDLE` is
-    returned. Calls such as :py:obj:`~.cuEventQuery()` or
-    :py:obj:`~.cuGreenCtxWaitEvent()` will then examine or wait for
-    completion of the work that was captured. Uses of `hCtx` after this
-    call do not modify `hEvent`.
+    Captures in ``hEvent`` all the activities of the green context of
+    ``hCtx`` at the time of this call. ``hEvent`` and ``hCtx`` must be from
+    the same primary context otherwise
+    :py:obj:`~.CUDA_ERROR_INVALID_HANDLE` is returned. Calls such as
+    :py:obj:`~.cuEventQuery()` or :py:obj:`~.cuGreenCtxWaitEvent()` will
+    then examine or wait for completion of the work that was captured. Uses
+    of ``hCtx`` after this call do not modify ``hEvent``.
 
     Parameters
     ----------
@@ -57086,7 +57131,7 @@ def cuGreenCtxRecordEvent(hCtx, hEvent):
 
     Notes
     -----
-    The API will return :py:obj:`~.CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED` if the specified green context `hCtx` has a stream in the capture mode. In such a case, the call will invalidate all the conflicting captures.
+    The API will return :py:obj:`~.CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED` if the specified green context ``hCtx`` has a stream in the capture mode. In such a case, the call will invalidate all the conflicting captures.
     """
     cdef cydriver.CUevent cyhEvent
     if hEvent is None:
@@ -57115,9 +57160,9 @@ def cuGreenCtxRecordEvent(hCtx, hEvent):
 def cuGreenCtxWaitEvent(hCtx, hEvent):
     """ Make a green context wait on an event.
 
-    Makes all future work submitted to green context `hCtx` wait for all
-    work captured in `hEvent`. The synchronization will be performed on the
-    device and will not block the calling CPU thread. See
+    Makes all future work submitted to green context ``hCtx`` wait for all
+    work captured in ``hEvent``. The synchronization will be performed on
+    the device and will not block the calling CPU thread. See
     :py:obj:`~.cuGreenCtxRecordEvent()` or :py:obj:`~.cuEventRecord()`, for
     details on what is captured by an event.
 
@@ -57139,9 +57184,9 @@ def cuGreenCtxWaitEvent(hCtx, hEvent):
 
     Notes
     -----
-    `hEvent` may be from a different context or device than `hCtx`.
+    ``hEvent`` may be from a different context or device than ``hCtx``.
 
-    The API will return :py:obj:`~.CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED` and invalidate the capture if the specified event `hEvent` is part of an ongoing capture sequence or if the specified green context `hCtx` has a stream in the capture mode.
+    The API will return :py:obj:`~.CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED` and invalidate the capture if the specified event ``hEvent`` is part of an ongoing capture sequence or if the specified green context ``hCtx`` has a stream in the capture mode.
     """
     cdef cydriver.CUevent cyhEvent
     if hEvent is None:
@@ -57173,7 +57218,7 @@ def cuStreamGetGreenCtx(hStream):
     Returns the CUDA green context that the stream is associated with, or
     NULL if the stream is not associated with any green context.
 
-    The stream handle `hStream` can refer to any of the following:
+    The stream handle ``hStream`` can refer to any of the following:
 
     - a stream created via any of the CUDA driver APIs such as
       :py:obj:`~.cuStreamCreate`, :py:obj:`~.cuStreamCreateWithPriority`
@@ -57182,13 +57227,13 @@ def cuStreamGetGreenCtx(hStream):
       :py:obj:`~.cudaStreamCreateWithFlags` and
       :py:obj:`~.cudaStreamCreateWithPriority`. If during stream creation
       the context that was active in the calling thread was obtained with
-      cuCtxFromGreenCtx, that green context is returned in `phCtx`.
-      Otherwise, `*phCtx` is set to NULL instead.
+      cuCtxFromGreenCtx, that green context is returned in ``phCtx``.
+      Otherwise, ``*phCtx`` is set to NULL instead.
 
     - special stream such as the NULL stream or
       :py:obj:`~.CU_STREAM_LEGACY`. In that case if context that is active
       in the calling thread was obtained with cuCtxFromGreenCtx, that green
-      context is returned. Otherwise, `*phCtx` is set to NULL instead.
+      context is returned. Otherwise, ``*phCtx`` is set to NULL instead.
 
     Passing an invalid handle will result in undefined behavior.
 
@@ -57230,13 +57275,13 @@ def cuStreamGetGreenCtx(hStream):
 def cuGreenCtxStreamCreate(greenCtx, unsigned int flags, int priority):
     """ Create a stream for use in the green context.
 
-    Creates a stream for use in the specified green context `greenCtx` and
-    returns a handle in `phStream`. The stream can be destroyed by calling
-    :py:obj:`~.cuStreamDestroy()`. Note that the API ignores the context
-    that is current to the calling thread and creates a stream in the
-    specified green context `greenCtx`.
+    Creates a stream for use in the specified green context ``greenCtx``
+    and returns a handle in ``phStream``. The stream can be destroyed by
+    calling :py:obj:`~.cuStreamDestroy()`. Note that the API ignores the
+    context that is current to the calling thread and creates a stream in
+    the specified green context ``greenCtx``.
 
-    The supported values for `flags` are:
+    The supported values for ``flags`` are:
 
     - :py:obj:`~.CU_STREAM_NON_BLOCKING`: This must be specified. It
       indicates that work running in the created stream may run
@@ -57244,11 +57289,11 @@ def cuGreenCtxStreamCreate(greenCtx, unsigned int flags, int priority):
       stream should perform no implicit synchronization with the default
       stream.
 
-    Specifying `priority` affects the scheduling priority of work in the
+    Specifying ``priority`` affects the scheduling priority of work in the
     stream. Priorities provide a hint to preferentially run work with
     higher priority when possible, but do not preempt already-running work
     or provide any other functional guarantee on execution order.
-    `priority` follows a convention where lower numbers represent higher
+    ``priority`` follows a convention where lower numbers represent higher
     priorities. '0' represents default priority. The range of meaningful
     numerical priorities can be queried using
     :py:obj:`~.cuCtxGetStreamPriorityRange`. If the specified priority is
@@ -57261,7 +57306,7 @@ def cuGreenCtxStreamCreate(greenCtx, unsigned int flags, int priority):
     greenCtx : :py:obj:`~.CUgreenCtx`
         Green context for which to create the stream for
     flags : unsigned int
-        Flags for stream creation. `CU_STREAM_NON_BLOCKING` must be
+        Flags for stream creation. ``CU_STREAM_NON_BLOCKING`` must be
         specified.
     priority : int
         Stream priority. Lower numbers represent higher priorities. See
@@ -57305,11 +57350,11 @@ def cuGreenCtxStreamCreate(greenCtx, unsigned int flags, int priority):
 def cuGreenCtxGetId(greenCtx):
     """ Returns the unique Id associated with the green context supplied.
 
-    Returns in `greenCtxId` the unique Id which is associated with a given
-    green context. The Id is unique for the life of the program for this
-    instance of CUDA. If green context is supplied as NULL and the current
-    context is set to a green context, the Id of the current green context
-    is returned.
+    Returns in ``greenCtxId`` the unique Id which is associated with a
+    given green context. The Id is unique for the life of the program for
+    this instance of CUDA. If green context is supplied as NULL and the
+    current context is set to a green context, the Id of the current green
+    context is returned.
 
     Parameters
     ----------
@@ -57349,12 +57394,12 @@ def cuGreenCtxGetId(greenCtx):
 def cuStreamGetDevResource(hStream, typename not None : CUdevResourceType):
     """ Get stream resources.
 
-    Get the `typename` resources available to the `hStream` and store them
-    in `resource`.
+    Get the ``typename`` resources available to the ``hStream`` and store
+    them in ``resource``.
 
     Note: The API will return :py:obj:`~.CUDA_ERROR_INVALID_RESOURCE_TYPE`
-    is `typename` is `CU_DEV_RESOURCE_TYPE_WORKQUEUE_CONFIG` or
-    `CU_DEV_RESOURCE_TYPE_WORKQUEUE`.
+    if ``typename`` is ``CU_DEV_RESOURCE_TYPE_WORKQUEUE_CONFIG`` or
+    ``CU_DEV_RESOURCE_TYPE_WORKQUEUE``.
 
     Parameters
     ----------
@@ -57368,7 +57413,7 @@ def cuStreamGetDevResource(hStream, typename not None : CUdevResourceType):
     CUresult
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_RESOURCE_TYPE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`
     resource : :py:obj:`~.CUdevResource`
-        Output pointer to a :py:obj:`~.CUdevResource` structure
+        Output pointer to a CUdevResource structure
 
     See Also
     --------
@@ -57522,7 +57567,7 @@ def cuLogsDumpToFile(iterator : Optional[CUlogIterator], char* pathToFile, unsig
 
     Logs generated by the driver are stored in an internal buffer and can
     be copied out using this API. This API dumps all driver logs starting
-    from `iterator` into `pathToFile` provided.
+    from ``iterator`` into ``pathToFile`` provided.
 
     Parameters
     ----------
@@ -57544,7 +57589,7 @@ def cuLogsDumpToFile(iterator : Optional[CUlogIterator], char* pathToFile, unsig
 
     Notes
     -----
-    `iterator` is auto-advancing. Dumping logs will update the value of `iterator` to receive the next generated log.
+    ``iterator`` is auto-advancing. Dumping logs will update the value of ``iterator`` to receive the next generated log.
 
     The driver reserves limited memory for storing logs. The oldest logs may be overwritten and become unrecoverable. An indication will appear in the destination outupt if the logs have been truncated. Call dump after each failed API to mitigate this risk.
     """
@@ -57566,14 +57611,14 @@ def cuLogsDumpToMemory(iterator : Optional[CUlogIterator], char* buffer, size_t
 
     Logs generated by the driver are stored in an internal buffer and can
     be copied out using this API. This API dumps driver logs from
-    `iterator` into `buffer` up to the size specified in `*size`. The
+    ``iterator`` into ``buffer`` up to the size specified in ``*size``. The
     driver will always null terminate the buffer but there will not be a
-    null character between log entries, only a newline \n. The driver will
-    then return the actual number of bytes written in `*size`, excluding
-    the null terminator. If there are no messages to dump, `*size` will be
-    set to 0 and the function will return :py:obj:`~.CUDA_SUCCESS`. If the
-    provided `buffer` is not large enough to hold any messages, `*size`
-    will be set to 0 and the function will return
+    null character between log entries, only a newline ``\\n``. The driver will
+    then return the actual number of bytes written in ``*size``, excluding
+    the null terminator. If there are no messages to dump, ``*size`` will
+    be set to 0 and the function will return :py:obj:`~.CUDA_SUCCESS`. If
+    the provided ``buffer`` is not large enough to hold any messages,
+    ``*size`` will be set to 0 and the function will return
     :py:obj:`~.CUDA_ERROR_INVALID_VALUE`.
 
     Parameters
@@ -57600,11 +57645,11 @@ def cuLogsDumpToMemory(iterator : Optional[CUlogIterator], char* buffer, size_t
 
     Notes
     -----
-    `iterator` is auto-advancing. Dumping logs will update the value of `iterator` to receive the next generated log.
+    ``iterator`` is auto-advancing. Dumping logs will update the value of ``iterator`` to receive the next generated log.
 
     The driver reserves limited memory for storing logs. The maximum size of the buffer is 25600 bytes. The oldest logs may be overwritten and become unrecoverable. An indication will appear in the destination outupt if the logs have been truncated. Call dump after each failed API to mitigate this risk.
 
-    If the provided value in `*size` is not large enough to hold all buffered messages, a message will be added at the head of the buffer indicating this. The driver then computes the number of messages it is able to store in `buffer` and writes it out. The final message in `buffer` will always be the most recent log message as of when the API is called.
+    If the provided value in ``*size`` is not large enough to hold all buffered messages, a message will be added at the head of the buffer indicating this. The driver then computes the number of messages it is able to store in ``buffer`` and writes it out. The final message in ``buffer`` will always be the most recent log message as of when the API is called.
     """
     cdef cydriver.CUlogIterator* cyiterator = NULL
     if iterator is not None:
@@ -57622,8 +57667,8 @@ def cuLogsDumpToMemory(iterator : Optional[CUlogIterator], char* buffer, size_t
 def cuCheckpointProcessGetRestoreThreadId(int pid):
     """ Returns the restore thread ID for a CUDA process.
 
-    Returns in `*tid` the thread ID of the CUDA restore thread for the
-    process specified by `pid`.
+    Returns in ``*tid`` the thread ID of the CUDA restore thread for the
+    process specified by ``pid``.
 
     Parameters
     ----------
@@ -57651,8 +57696,8 @@ def cuCheckpointProcessGetRestoreThreadId(int pid):
 def cuCheckpointProcessGetState(int pid):
     """ Returns the process state of a CUDA process.
 
-    Returns in `*state` the current state of the CUDA process specified by
-    `pid`.
+    Returns in ``*state`` the current state of the CUDA process specified
+    by ``pid``.
 
     Parameters
     ----------
@@ -57680,8 +57725,8 @@ def cuCheckpointProcessGetState(int pid):
 def cuCheckpointProcessLock(int pid, args : Optional[CUcheckpointLockArgs]):
     """ Lock a running CUDA process.
 
-    Lock the CUDA process specified by `pid` which will block further CUDA
-    API calls. Process must be in the RUNNING state in order to lock.
+    Lock the CUDA process specified by ``pid`` which will block further
+    CUDA API calls. Process must be in the RUNNING state in order to lock.
 
     Upon successful return the process will be in the LOCKED state.
 
@@ -57712,7 +57757,7 @@ def cuCheckpointProcessLock(int pid, args : Optional[CUcheckpointLockArgs]):
 def cuCheckpointProcessCheckpoint(int pid, args : Optional[CUcheckpointCheckpointArgs]):
     """ Checkpoint a CUDA process's GPU memory contents.
 
-    Checkpoints a CUDA process specified by `pid` that is in the LOCKED
+    Checkpoints a CUDA process specified by ``pid`` that is in the LOCKED
     state. The GPU memory contents will be brought into host memory and all
     underlying references will be released. Process must be in the LOCKED
     state to checkpoint.
@@ -57743,13 +57788,13 @@ def cuCheckpointProcessCheckpoint(int pid, args : Optional[CUcheckpointCheckpoin
 def cuCheckpointProcessRestore(int pid, args : Optional[CUcheckpointRestoreArgs]):
     """ Restore a CUDA process's GPU memory contents from its last checkpoint.
 
-    Restores a CUDA process specified by `pid` from its last checkpoint.
+    Restores a CUDA process specified by ``pid`` from its last checkpoint.
     Process must be in the CHECKPOINTED state to restore.
 
-    GPU UUID pairs can be specified in `args` to remap the process old GPUs
-    onto new GPUs. The GPU to restore onto needs to have enough memory and
-    be of the same chip type as the old GPU. If an array of GPU UUID pairs
-    is specified, it must contain every checkpointed GPU.
+    GPU UUID pairs can be specified in ``args`` to remap the process old
+    GPUs onto new GPUs. The GPU to restore onto needs to have enough memory
+    and be of the same chip type as the old GPU. If an array of GPU UUID
+    pairs is specified, it must contain every checkpointed GPU.
 
     Upon successful return the process will be in the LOCKED state.
 
@@ -57784,8 +57829,8 @@ def cuCheckpointProcessRestore(int pid, args : Optional[CUcheckpointRestoreArgs]
 def cuCheckpointProcessUnlock(int pid, args : Optional[CUcheckpointUnlockArgs]):
     """ Unlock a CUDA process to allow CUDA API calls.
 
-    Unlocks a process specified by `pid` allowing it to resume making CUDA
-    API calls. Process must be in the LOCKED state.
+    Unlocks a process specified by ``pid`` allowing it to resume making
+    CUDA API calls. Process must be in the LOCKED state.
 
     Upon successful return the process will be in the RUNNING state.
 
@@ -57869,11 +57914,11 @@ def cuProfilerStop():
 def cuGraphicsEGLRegisterImage(image, unsigned int flags):
     """ Registers an EGL image.
 
-    Registers the EGLImageKHR specified by `image` for access by CUDA. A
-    handle to the registered object is returned as `pCudaResource`.
+    Registers the EGLImageKHR specified by ``image`` for access by CUDA. A
+    handle to the registered object is returned as ``pCudaResource``.
     Additional Mapping/Unmapping is not required for the registered
     resource and :py:obj:`~.cuGraphicsResourceGetMappedEglFrame` can be
-    directly called on the `pCudaResource`.
+    directly called on the ``pCudaResource``.
 
     The application will be responsible for synchronizing access to shared
     objects. The application must ensure that any pending operation which
@@ -57886,7 +57931,7 @@ def cuGraphicsEGLRegisterImage(image, unsigned int flags):
     accomplished by calling cuCtxSynchronize or cuEventSynchronize
     (preferably).
 
-    The surface's intended usage is specified using `flags`, as follows:
+    The surface's intended usage is specified using ``flags``, as follows:
 
     - :py:obj:`~.CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE`: Specifies no hints
       about how this resource will be used. It is therefore assumed that
@@ -57945,7 +57990,7 @@ def cuGraphicsEGLRegisterImage(image, unsigned int flags):
 def cuEGLStreamConsumerConnect(stream):
     """ Connect CUDA to EGLStream as a consumer.
 
-    Connect CUDA as a consumer to EGLStreamKHR specified by `stream`.
+    Connect CUDA as a consumer to EGLStreamKHR specified by ``stream``.
 
     The EGLStreamKHR is an EGL object that transfers a sequence of image
     frames from one API to another.
@@ -57988,8 +58033,8 @@ def cuEGLStreamConsumerConnect(stream):
 def cuEGLStreamConsumerConnectWithFlags(stream, unsigned int flags):
     """ Connect CUDA to EGLStream as a consumer with given flags.
 
-    Connect CUDA as a consumer to EGLStreamKHR specified by `stream` with
-    specified `flags` defined by CUeglResourceLocationFlags.
+    Connect CUDA as a consumer to EGLStreamKHR specified by ``stream`` with
+    specified ``flags`` defined by CUeglResourceLocationFlags.
 
     The flags specify whether the consumer wants to access frames from
     system memory or video memory. Default is
@@ -58077,7 +58122,7 @@ def cuEGLStreamConsumerAcquireFrame(conn, pCudaResource, pStream, unsigned int t
     setting EGL_SUPPORT_REUSE_NV flag to EGL_FALSE during stream
     initialization. By default, EGLStream is created with this flag set to
     EGL_TRUE. :py:obj:`~.cuGraphicsResourceGetMappedEglFrame` can be called
-    on `pCudaResource` to get :py:obj:`~.CUeglFrame`.
+    on ``pCudaResource`` to get :py:obj:`~.CUeglFrame`.
 
     Parameters
     ----------
@@ -58144,7 +58189,7 @@ def cuEGLStreamConsumerAcquireFrame(conn, pCudaResource, pStream, unsigned int t
 def cuEGLStreamConsumerReleaseFrame(conn, pCudaResource, pStream):
     """ Releases the last frame acquired from the EGLStream.
 
-    Release the acquired image frame specified by `pCudaResource` to
+    Release the acquired image frame specified by ``pCudaResource`` to
     EGLStreamKHR. If EGL_SUPPORT_REUSE_NV flag is set to EGL_TRUE, at the
     time of EGL creation this API doesn't release the last frame acquired
     on the EGLStream. By default, EGLStream is created with this flag set
@@ -58207,7 +58252,7 @@ def cuEGLStreamConsumerReleaseFrame(conn, pCudaResource, pStream):
 def cuEGLStreamProducerConnect(stream, width, height):
     """ Connect CUDA to EGLStream as a producer.
 
-    Connect CUDA as a producer to EGLStreamKHR specified by `stream`.
+    Connect CUDA as a producer to EGLStreamKHR specified by ``stream``.
 
     The EGLStreamKHR is an EGL object that transfers a sequence of image
     frames from one API to another.
@@ -58432,16 +58477,16 @@ def cuEGLStreamProducerReturnFrame(conn, eglframe : Optional[CUeglFrame], pStrea
 def cuGraphicsResourceGetMappedEglFrame(resource, unsigned int index, unsigned int mipLevel):
     """ Get an eglFrame through which to access a registered EGL graphics resource.
 
-    Returns in `*eglFrame` an eglFrame pointer through which the registered
-    graphics resource `resource` may be accessed. This API can only be
-    called for registered EGL graphics resources.
+    Returns in ``*eglFrame`` an eglFrame pointer through which the
+    registered graphics resource ``resource`` may be accessed. This API can
+    only be called for registered EGL graphics resources.
 
     The :py:obj:`~.CUeglFrame` is defined as:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
-    If `resource` is not registered then :py:obj:`~.CUDA_ERROR_NOT_MAPPED`
-    is returned.
+    If ``resource`` is not registered then
+    :py:obj:`~.CUDA_ERROR_NOT_MAPPED` is returned.
 
     Parameters
     ----------
@@ -58481,8 +58526,8 @@ def cuGraphicsResourceGetMappedEglFrame(resource, unsigned int index, unsigned i
 def cuEventCreateFromEGLSync(eglSync, unsigned int flags):
     """ Creates an event from EGLSync object.
 
-    Creates an event *phEvent from an EGLSyncKHR eglSync with the flags
-    specified via `flags`. Valid flags include:
+    Creates an event \\*phEvent from an EGLSyncKHR eglSync with the flags
+    specified via ``flags``. Valid flags include:
 
     - :py:obj:`~.CU_EVENT_DEFAULT`: Default event creation flag.
 
@@ -58491,7 +58536,7 @@ def cuEventCreateFromEGLSync(eglSync, unsigned int flags):
       :py:obj:`~.cuEventSynchronize()` to wait on an event created with
       this flag will block until the event has actually been completed.
 
-    Once the `eglSync` gets destroyed, :py:obj:`~.cuEventDestroy` is the
+    Once the ``eglSync`` gets destroyed, :py:obj:`~.cuEventDestroy` is the
     only API that can be invoked on the event.
 
     :py:obj:`~.cuEventRecord` and TimingData are not supported for events
@@ -58540,9 +58585,9 @@ def cuEventCreateFromEGLSync(eglSync, unsigned int flags):
 def cuGraphicsGLRegisterBuffer(buffer, unsigned int Flags):
     """ Registers an OpenGL buffer object.
 
-    Registers the buffer object specified by `buffer` for access by CUDA. A
-    handle to the registered object is returned as `pCudaResource`. The
-    register flags `Flags` specify the intended usage, as follows:
+    Registers the buffer object specified by ``buffer`` for access by CUDA.
+    A handle to the registered object is returned as ``pCudaResource``. The
+    register flags ``Flags`` specify the intended usage, as follows:
 
     - :py:obj:`~.CU_GRAPHICS_REGISTER_FLAGS_NONE`: Specifies no hints about
       how this resource will be used. It is therefore assumed that this
@@ -58597,16 +58642,16 @@ def cuGraphicsGLRegisterBuffer(buffer, unsigned int Flags):
 def cuGraphicsGLRegisterImage(image, target, unsigned int Flags):
     """ Register an OpenGL texture or renderbuffer object.
 
-    Registers the texture or renderbuffer object specified by `image` for
+    Registers the texture or renderbuffer object specified by ``image`` for
     access by CUDA.   A handle to the registered object is returned as
-    `pCudaResource`.
+    ``pCudaResource``.
 
-    `target` must match the type of the object, and must be one of
+    ``target`` must match the type of the object, and must be one of
     :py:obj:`~.GL_TEXTURE_2D`, :py:obj:`~.GL_TEXTURE_RECTANGLE`,
     :py:obj:`~.GL_TEXTURE_CUBE_MAP`, :py:obj:`~.GL_TEXTURE_3D`,
     :py:obj:`~.GL_TEXTURE_2D_ARRAY`, or :py:obj:`~.GL_RENDERBUFFER`.
 
-    The register flags `Flags` specify the intended usage, as follows:
+    The register flags ``Flags`` specify the intended usage, as follows:
 
     - :py:obj:`~.CU_GRAPHICS_REGISTER_FLAGS_NONE`: Specifies no hints about
       how this resource will be used. It is therefore assumed that this
@@ -58652,7 +58697,7 @@ def cuGraphicsGLRegisterImage(image, target, unsigned int Flags):
     image : :py:obj:`~.GLuint`
         name of texture or renderbuffer object to be registered
     target : :py:obj:`~.GLenum`
-        Identifies the type of object specified by `image`
+        Identifies the type of object specified by ``image``
     Flags : unsigned int
         Register flags
 
@@ -58697,14 +58742,14 @@ def cuGraphicsGLRegisterImage(image, target, unsigned int Flags):
 def cuGLGetDevices(unsigned int cudaDeviceCount, deviceList not None : CUGLDeviceList):
     """ Gets the CUDA devices associated with the current OpenGL context.
 
-    Returns in `*pCudaDeviceCount` the number of CUDA-compatible devices
+    Returns in ``*pCudaDeviceCount`` the number of CUDA-compatible devices
     corresponding to the current OpenGL context. Also returns in
-    `*pCudaDevices` at most cudaDeviceCount of the CUDA-compatible devices
+    ``*pCudaDevices`` at most cudaDeviceCount of the CUDA-compatible devices
     corresponding to the current OpenGL context. If any of the GPUs being
     used by the current OpenGL context are not CUDA capable then the call
     will return CUDA_ERROR_NO_DEVICE.
 
-    The `deviceList` argument may be any of the following:
+    The ``deviceList`` argument may be any of the following:
     CU_GL_DEVICE_LIST_ALL: Query all devices used by the current OpenGL
     context. CU_GL_DEVICE_LIST_CURRENT_FRAME: Query the devices used by the
     current OpenGL context to render the current frame (in SLI).
@@ -58766,8 +58811,8 @@ def cuGLGetDevices(unsigned int cudaDeviceCount, deviceList not None : CUGLDevic
 def cuVDPAUGetDevice(vdpDevice, vdpGetProcAddress):
     """ Gets the CUDA device associated with a VDPAU device.
 
-    Returns in `*pDevice` the CUDA device associated with a `vdpDevice`, if
-    applicable.
+    Returns in ``*pDevice`` the CUDA device associated with a
+    ``vdpDevice``, if applicable.
 
     Parameters
     ----------
@@ -58823,7 +58868,7 @@ def cuVDPAUCtxCreate(unsigned int flags, device, vdpDevice, vdpGetProcAddress):
     associates the CUDA context with the calling thread. It must be called
     before performing any other VDPAU interoperability operations. It may
     fail if the needed VDPAU driver facilities are not available. For usage
-    of the `flags` parameter, see :py:obj:`~.cuCtxCreate()`.
+    of the ``flags`` parameter, see :py:obj:`~.cuCtxCreate()`.
 
     Parameters
     ----------
@@ -58887,9 +58932,10 @@ def cuVDPAUCtxCreate(unsigned int flags, device, vdpDevice, vdpGetProcAddress):
 def cuGraphicsVDPAURegisterVideoSurface(vdpSurface, unsigned int flags):
     """ Registers a VDPAU VdpVideoSurface object.
 
-    Registers the VdpVideoSurface specified by `vdpSurface` for access by
-    CUDA. A handle to the registered object is returned as `pCudaResource`.
-    The surface's intended usage is specified using `flags`, as follows:
+    Registers the VdpVideoSurface specified by ``vdpSurface`` for access by
+    CUDA. A handle to the registered object is returned as
+    ``pCudaResource``. The surface's intended usage is specified using
+    ``flags``, as follows:
 
     - :py:obj:`~.CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE`: Specifies no hints
       about how this resource will be used. It is therefore assumed that
@@ -58907,8 +58953,8 @@ def cuGraphicsVDPAURegisterVideoSurface(vdpSurface, unsigned int flags):
     The VdpVideoSurface is presented as an array of subresources that may
     be accessed using pointers returned by
     :py:obj:`~.cuGraphicsSubResourceGetMappedArray`. The exact number of
-    valid `arrayIndex` values depends on the VDPAU surface format. The
-    mapping is shown in the table below. `mipLevel` must be 0.
+    valid ``arrayIndex`` values depends on the VDPAU surface format. The
+    mapping is shown in the table below. ``mipLevel`` must be 0.
 
     Parameters
     ----------
@@ -58950,9 +58996,10 @@ def cuGraphicsVDPAURegisterVideoSurface(vdpSurface, unsigned int flags):
 def cuGraphicsVDPAURegisterOutputSurface(vdpSurface, unsigned int flags):
     """ Registers a VDPAU VdpOutputSurface object.
 
-    Registers the VdpOutputSurface specified by `vdpSurface` for access by
-    CUDA. A handle to the registered object is returned as `pCudaResource`.
-    The surface's intended usage is specified using `flags`, as follows:
+    Registers the VdpOutputSurface specified by ``vdpSurface`` for access
+    by CUDA. A handle to the registered object is returned as
+    ``pCudaResource``. The surface's intended usage is specified using
+    ``flags``, as follows:
 
     - :py:obj:`~.CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE`: Specifies no hints
       about how this resource will be used. It is therefore assumed that
@@ -58970,8 +59017,8 @@ def cuGraphicsVDPAURegisterOutputSurface(vdpSurface, unsigned int flags):
     The VdpOutputSurface is presented as an array of subresources that may
     be accessed using pointers returned by
     :py:obj:`~.cuGraphicsSubResourceGetMappedArray`. The exact number of
-    valid `arrayIndex` values depends on the VDPAU surface format. The
-    mapping is shown in the table below. `mipLevel` must be 0.
+    valid ``arrayIndex`` values depends on the VDPAU surface format. The
+    mapping is shown in the table below. ``mipLevel`` must be 0.
 
     Parameters
     ----------
@@ -60023,4 +60070,3 @@ cdef int _add_native_handle_getters() except?-1:
     {{endif}}
     return 0
 _add_native_handle_getters()
-
diff --git a/cuda_bindings/cuda/bindings/nvrtc.pxd b/cuda_bindings/cuda/bindings/nvrtc.pxd
index 743c75f883..c4f57162eb 100644
--- a/cuda_bindings/cuda/bindings/nvrtc.pxd
+++ b/cuda_bindings/cuda/bindings/nvrtc.pxd
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1725+g07a86fe3d.d20260603. Do not modify it directly.
 cimport cuda.bindings.cynvrtc as cynvrtc
 
 include "_lib/utils.pxd"
diff --git a/cuda_bindings/cuda/bindings/nvrtc.pyx b/cuda_bindings/cuda/bindings/nvrtc.pyx
index e6d1dc6ad0..0e325058f8 100644
--- a/cuda_bindings/cuda/bindings/nvrtc.pyx
+++ b/cuda_bindings/cuda/bindings/nvrtc.pyx
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1725+g07a86fe3d.d20260603. Do not modify it directly.
 from typing import Any, Optional
 import cython
 import ctypes
@@ -316,7 +316,7 @@ cdef class nvrtcBundledHeadersInfo(anon_struct0):
 
 @cython.embedsignature(True)
 def nvrtcGetErrorString(result not None : nvrtcResult):
-    """ nvrtcGetErrorString is a helper function that returns a string describing the given nvrtcResult code, e.g., NVRTC_SUCCESS to `"NVRTC_SUCCESS"`. For unrecognized enumeration values, it returns `"NVRTC_ERROR unknown"`.
+    """ nvrtcGetErrorString is a helper function that returns a string describing the given nvrtcResult code, e.g., NVRTC_SUCCESS to ``"NVRTC_SUCCESS"``. For unrecognized enumeration values, it returns ``"NVRTC_ERROR unknown"``.
 
     Parameters
     ----------
@@ -337,7 +337,7 @@ def nvrtcGetErrorString(result not None : nvrtcResult):
 
 @cython.embedsignature(True)
 def nvrtcVersion():
-    """ nvrtcVersion sets the output parameters `major` and `minor` with the CUDA Runtime Compilation version number.
+    """ nvrtcVersion sets the output parameters ``major`` and ``minor`` with the CUDA Runtime Compilation version number.
 
     Returns
     -------
@@ -359,7 +359,7 @@ def nvrtcVersion():
 
 @cython.embedsignature(True)
 def nvrtcGetNumSupportedArchs():
-    """ nvrtcGetNumSupportedArchs sets the output parameter `numArchs` with the number of architectures supported by NVRTC. This can then be used to pass an array to :py:obj:`~.nvrtcGetSupportedArchs` to get the supported architectures.
+    """ nvrtcGetNumSupportedArchs sets the output parameter ``numArchs`` with the number of architectures supported by NVRTC. This can then be used to pass an array to :py:obj:`~.nvrtcGetSupportedArchs` to get the supported architectures.
 
     see :py:obj:`~.nvrtcGetSupportedArchs`
 
@@ -380,7 +380,7 @@ def nvrtcGetNumSupportedArchs():
 
 @cython.embedsignature(True)
 def nvrtcGetSupportedArchs():
-    """ nvrtcGetSupportedArchs populates the array passed via the output parameter `supportedArchs` with the architectures supported by NVRTC. The array is sorted in the ascending order. The size of the array to be passed can be determined using :py:obj:`~.nvrtcGetNumSupportedArchs`.
+    """ nvrtcGetSupportedArchs populates the array passed via the output parameter ``supportedArchs`` with the architectures supported by NVRTC. The array is sorted in the ascending order. The size of the array to be passed can be determined using :py:obj:`~.nvrtcGetNumSupportedArchs`.
 
     see :py:obj:`~.nvrtcGetNumSupportedArchs`
 
@@ -404,26 +404,26 @@ def nvrtcGetSupportedArchs():
 
 @cython.embedsignature(True)
 def nvrtcCreateProgram(char* src, char* name, int numHeaders, headers : Optional[tuple[bytes] | list[bytes]], includeNames : Optional[tuple[bytes] | list[bytes]]):
-    """ nvrtcCreateProgram creates an instance of nvrtcProgram with the given input parameters, and sets the output parameter `prog` with it.
+    """ nvrtcCreateProgram creates an instance of nvrtcProgram with the given input parameters, and sets the output parameter ``prog`` with it.
 
     Parameters
     ----------
     src : bytes
         CUDA program source.
     name : bytes
-        CUDA program name.  `name` can be `NULL`; `"default_program"` is
-        used when `name` is `NULL` or "".
+        CUDA program name.  ``name`` can be ``NULL``; ``"default_program"``
+        is used when ``name`` is ``NULL`` or "".
     numHeaders : int
-        Number of headers used.  `numHeaders` must be greater than or equal
-        to 0.
+        Number of headers used.  ``numHeaders`` must be greater than or
+        equal to 0.
     headers : list[bytes]
-        Sources of the headers.  `headers` can be `NULL` when `numHeaders`
-        is 0.
+        Sources of the headers.  ``headers`` can be ``NULL`` when
+        ``numHeaders`` is 0.
     includeNames : list[bytes]
         Name of each header by which they can be included in the CUDA
-        program source.  `includeNames` can be `NULL` when `numHeaders` is
-        0. These headers must be included with the exact names specified
-        here.
+        program source.  ``includeNames`` can be ``NULL`` when
+        ``numHeaders`` is 0. These headers must be included with the exact
+        names specified here.
 
     Returns
     -------
@@ -494,8 +494,7 @@ def nvrtcDestroyProgram(prog):
 def nvrtcCompileProgram(prog, int numOptions, options : Optional[tuple[bytes] | list[bytes]]):
     """ nvrtcCompileProgram compiles the given program.
 
-    It supports compile options listed in :py:obj:`~.Supported Compile
-    Options`.
+    It supports compile options listed in Supported Compile Options.
 
     Parameters
     ----------
@@ -504,8 +503,8 @@ def nvrtcCompileProgram(prog, int numOptions, options : Optional[tuple[bytes] |
     numOptions : int
         Number of compiler options passed.
     options : list[bytes]
-        Compiler options in the form of C string array.  `options` can be
-        `NULL` when `numOptions` is 0.
+        Compiler options in the form of C string array.  ``options`` can be
+        ``NULL`` when ``numOptions`` is 0.
 
     Returns
     -------
@@ -539,7 +538,7 @@ def nvrtcCompileProgram(prog, int numOptions, options : Optional[tuple[bytes] |
 
 @cython.embedsignature(True)
 def nvrtcGetPTXSize(prog):
-    """ nvrtcGetPTXSize sets the value of `ptxSizeRet` with the size of the PTX generated by the previous compilation of `prog` (including the trailing `NULL`).
+    """ nvrtcGetPTXSize sets the value of ``ptxSizeRet`` with the size of the PTX generated by the previous compilation of ``prog`` (including the trailing ``NULL``).
 
     Parameters
     ----------
@@ -553,7 +552,7 @@ def nvrtcGetPTXSize(prog):
         - :py:obj:`~.NVRTC_ERROR_INVALID_INPUT`
         - :py:obj:`~.NVRTC_ERROR_INVALID_PROGRAM`
     ptxSizeRet : int
-        Size of the generated PTX (including the trailing `NULL`).
+        Size of the generated PTX (including the trailing ``NULL``).
 
     See Also
     --------
@@ -576,7 +575,7 @@ def nvrtcGetPTXSize(prog):
 
 @cython.embedsignature(True)
 def nvrtcGetPTX(prog, char* ptx):
-    """ nvrtcGetPTX stores the PTX generated by the previous compilation of `prog` in the memory pointed by `ptx`.
+    """ nvrtcGetPTX stores the PTX generated by the previous compilation of ``prog`` in the memory pointed by ``ptx``.
 
     Parameters
     ----------
@@ -610,7 +609,7 @@ def nvrtcGetPTX(prog, char* ptx):
 
 @cython.embedsignature(True)
 def nvrtcGetCUBINSize(prog):
-    """ nvrtcGetCUBINSize sets the value of `cubinSizeRet` with the size of the cubin generated by the previous compilation of `prog`. The value of cubinSizeRet is set to 0 if the value specified to `-arch` is a virtual architecture instead of an actual architecture.
+    """ nvrtcGetCUBINSize sets the value of ``cubinSizeRet`` with the size of the cubin generated by the previous compilation of ``prog``. The value of cubinSizeRet is set to 0 if the value specified to ``-arch`` is a virtual architecture instead of an actual architecture.
 
     Parameters
     ----------
@@ -647,7 +646,7 @@ def nvrtcGetCUBINSize(prog):
 
 @cython.embedsignature(True)
 def nvrtcGetCUBIN(prog, char* cubin):
-    """ nvrtcGetCUBIN stores the cubin generated by the previous compilation of `prog` in the memory pointed by `cubin`. No cubin is available if the value specified to `-arch` is a virtual architecture instead of an actual architecture. The cubin does not contain code for the Tile functions (`__tile__` / `__tile_global__`) or variables (`__tile__`); use `nvrtcGetTileIR()` to extract the cuda_tile IR generated for Tile code.
+    """ nvrtcGetCUBIN stores the cubin generated by the previous compilation of ``prog`` in the memory pointed by ``cubin``. No cubin is available if the value specified to ``-arch`` is a virtual architecture instead of an actual architecture. The cubin does not contain code for the Tile functions (``__tile__`` / ``__tile_global__``) or variables (``__tile__``); use ``nvrtcGetTileIR()`` to extract the cuda_tile IR generated for Tile code.
 
     Parameters
     ----------
@@ -681,7 +680,7 @@ def nvrtcGetCUBIN(prog, char* cubin):
 
 @cython.embedsignature(True)
 def nvrtcGetLTOIRSize(prog):
-    """ nvrtcGetLTOIRSize sets the value of `LTOIRSizeRet` with the size of the LTO IR generated by the previous compilation of `prog`. The value of LTOIRSizeRet is set to 0 if the program was not compiled with `-dlto`.
+    """ nvrtcGetLTOIRSize sets the value of ``LTOIRSizeRet`` with the size of the LTO IR generated by the previous compilation of ``prog``. The value of LTOIRSizeRet is set to 0 if the program was not compiled with ``-dlto``.
 
     Parameters
     ----------
@@ -718,7 +717,7 @@ def nvrtcGetLTOIRSize(prog):
 
 @cython.embedsignature(True)
 def nvrtcGetLTOIR(prog, char* LTOIR):
-    """ nvrtcGetLTOIR stores the LTO IR generated by the previous compilation of `prog` in the memory pointed by `LTOIR`. No LTO IR is available if the program was compiled without `-dlto`.
+    """ nvrtcGetLTOIR stores the LTO IR generated by the previous compilation of ``prog`` in the memory pointed by ``LTOIR``. No LTO IR is available if the program was compiled without ``-dlto``.
 
     Parameters
     ----------
@@ -752,7 +751,7 @@ def nvrtcGetLTOIR(prog, char* LTOIR):
 
 @cython.embedsignature(True)
 def nvrtcGetOptiXIRSize(prog):
-    """ nvrtcGetOptiXIRSize sets the value of `optixirSizeRet` with the size of the OptiX IR generated by the previous compilation of `prog`. The value of nvrtcGetOptiXIRSize is set to 0 if the program was compiled with options incompatible with OptiX IR generation.
+    """ nvrtcGetOptiXIRSize sets the value of ``optixirSizeRet`` with the size of the OptiX IR generated by the previous compilation of ``prog``. The value of optixirSizeRet is set to 0 if the program was compiled with options incompatible with OptiX IR generation.
 
     Parameters
     ----------
@@ -789,7 +788,7 @@ def nvrtcGetOptiXIRSize(prog):
 
 @cython.embedsignature(True)
 def nvrtcGetOptiXIR(prog, char* optixir):
-    """ nvrtcGetOptiXIR stores the OptiX IR generated by the previous compilation of `prog` in the memory pointed by `optixir`. No OptiX IR is available if the program was compiled with options incompatible with OptiX IR generation.
+    """ nvrtcGetOptiXIR stores the OptiX IR generated by the previous compilation of ``prog`` in the memory pointed by ``optixir``. No OptiX IR is available if the program was compiled with options incompatible with OptiX IR generation.
 
     Parameters
     ----------
@@ -823,10 +822,10 @@ def nvrtcGetOptiXIR(prog, char* optixir):
 
 @cython.embedsignature(True)
 def nvrtcGetProgramLogSize(prog):
-    """ nvrtcGetProgramLogSize sets `logSizeRet` with the size of the log generated by the previous compilation of `prog` (including the trailing `NULL`).
+    """ nvrtcGetProgramLogSize sets ``logSizeRet`` with the size of the log generated by the previous compilation of ``prog`` (including the trailing ``NULL``).
 
     Note that compilation log may be generated with warnings and
-    informative messages, even when the compilation of `prog` succeeds.
+    informative messages, even when the compilation of ``prog`` succeeds.
 
     Parameters
     ----------
@@ -840,7 +839,7 @@ def nvrtcGetProgramLogSize(prog):
         - :py:obj:`~.NVRTC_ERROR_INVALID_INPUT`
         - :py:obj:`~.NVRTC_ERROR_INVALID_PROGRAM`
     logSizeRet : int
-        Size of the compilation log (including the trailing `NULL`).
+        Size of the compilation log (including the trailing ``NULL``).
 
     See Also
     --------
@@ -863,7 +862,7 @@ def nvrtcGetProgramLogSize(prog):
 
 @cython.embedsignature(True)
 def nvrtcGetProgramLog(prog, char* log):
-    """ nvrtcGetProgramLog stores the log generated by the previous compilation of `prog` in the memory pointed by `log`.
+    """ nvrtcGetProgramLog stores the log generated by the previous compilation of ``prog`` in the memory pointed by ``log``.
 
     Parameters
     ----------
@@ -1106,9 +1105,9 @@ def nvrtcSetFlowCallback(prog, callback, payload):
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
-    When invoking the callback, the compiler will always pass `payload` to
-    param1 so that the callback may make decisions based on `payload` .
-    It'll always pass NULL to param2 for now which is reserved for future
+    When invoking the callback, the compiler will always pass ``payload`` to
+    param1 so that the callback may make decisions based on ``payload``. It'll
+    always pass NULL to param2 for now which is reserved for future
     extensions.
 
     (2) It must return 1 to cancel compilation or 0 to continue. Other
@@ -1158,7 +1157,7 @@ def nvrtcSetFlowCallback(prog, callback, payload):
 
 @cython.embedsignature(True)
 def nvrtcGetTileIRSize(prog):
-    """ nvrtcGetTileIRSize sets the value of `TileIRSizeRet` with the size of the cuda_tile IR generated by the previous compilation of `prog`.
+    """ nvrtcGetTileIRSize sets the value of ``TileIRSizeRet`` with the size of the cuda_tile IR generated by the previous compilation of ``prog``.
 
     Parameters
     ----------
@@ -1195,7 +1194,7 @@ def nvrtcGetTileIRSize(prog):
 
 @cython.embedsignature(True)
 def nvrtcGetTileIR(prog, char* TileIR):
-    """ nvrtcGetTileIR stores the cuda_tile IR generated by the previous compilation of `prog` in the memory pointed by `TileIR`.
+    """ nvrtcGetTileIR stores the cuda_tile IR generated by the previous compilation of ``prog`` in the memory pointed by ``TileIR``.
 
     Parameters
     ----------
@@ -1267,7 +1266,7 @@ def nvrtcInstallBundledHeaders(char* installPath, unsigned int flags):
         - :py:obj:`~.NVRTC_ERROR_BUSY` (lock held by another process and NVRTC_INSTALL_HEADERS_NO_WAIT was specified)
     errorLog : bytes
         Optional pointer to receive detailed error message on failure. If
-        non-NULL, `*errorLog` will be set to point to a string describing
+        non-NULL, ``*errorLog`` will be set to point to a string describing
         the error cause. Note: subsequent API calls from the same thread
         may overwrite this message. May be NULL if error details are not
         needed.
@@ -1305,7 +1304,7 @@ def nvrtcGetBundledHeadersInfo():
         Pointer to structure to receive header information.
     errorLog : bytes
         Optional pointer to receive detailed error message on failure. If
-        non-NULL, `*errorLog` will be set to point to a string describing
+        non-NULL, ``*errorLog`` will be set to point to a string describing
         the error cause. Note: subsequent API calls from the same thread
         may overwrite this message. May be NULL if error details are not
         needed.
@@ -1341,7 +1340,7 @@ def nvrtcRemoveBundledHeaders(char* installPath):
         - :py:obj:`~.NVRTC_ERROR_BUILTIN_OPERATION_FAILURE` (removal failed)
     errorLog : bytes
         Optional pointer to receive detailed error message on failure. If
-        non-NULL, `*errorLog` will be set to point to a string describing
+        non-NULL, ``*errorLog`` will be set to point to a string describing
         the error cause. Note: subsequent API calls from the same thread
         may overwrite this message. May be NULL if error details are not
         needed.
diff --git a/cuda_bindings/cuda/bindings/runtime.pxd.in b/cuda_bindings/cuda/bindings/runtime.pxd.in
index 323fc99e46..6c36eb6434 100644
--- a/cuda_bindings/cuda/bindings/runtime.pxd.in
+++ b/cuda_bindings/cuda/bindings/runtime.pxd.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1725+g07a86fe3d.d20260603. Do not modify it directly.
 cimport cuda.bindings.cyruntime as cyruntime
 
 include "_lib/utils.pxd"
@@ -12,7 +12,7 @@ cimport cuda.bindings.driver as driver
 cdef class cudaDevResourceDesc_t:
     """
 
-    An opaque descriptor handle. The descriptor encapsulates multiple created and configured resources. Created via ::cudaDeviceResourceGenerateDesc
+    An opaque descriptor handle. The descriptor encapsulates multiple created and configured resources. Created via ``cudaDeviceResourceGenerateDesc``
 
     Methods
     -------
@@ -526,7 +526,7 @@ cdef class cudaArrayMemoryRequirements:
 
 cdef class cudaPitchedPtr:
     """
-    CUDA Pitched memory pointer  make_cudaPitchedPtr
+    CUDA Pitched memory pointer. See ``make_cudaPitchedPtr``.
 
     Attributes
     ----------
@@ -562,7 +562,7 @@ cdef class cudaPitchedPtr:
 
 cdef class cudaExtent:
     """
-    CUDA extent  make_cudaExtent
+    CUDA extent. See ``make_cudaExtent``.
 
     Attributes
     ----------
@@ -592,7 +592,7 @@ cdef class cudaExtent:
 
 cdef class cudaPos:
     """
-    CUDA 3D position  make_cudaPos
+    CUDA 3D position. See ``make_cudaPos``.
 
     Attributes
     ----------
@@ -929,11 +929,11 @@ cdef class cudaAccessPolicyWindow:
     {{endif}}
     {{if 'cudaAccessPolicyWindow.hitProp' in found_struct}}
     hitProp : cudaAccessProperty
-        ::CUaccessProperty set for hit.
+        ``CUaccessProperty`` set for hit.
     {{endif}}
     {{if 'cudaAccessPolicyWindow.missProp' in found_struct}}
     missProp : cudaAccessProperty
-        ::CUaccessProperty set for miss. Must be either NORMAL or
+        ``CUaccessProperty`` set for miss. Must be either NORMAL or
         STREAMING.
     {{endif}}
 
@@ -1510,7 +1510,7 @@ cdef class cudaMemAccessDesc:
     {{endif}}
     {{if 'cudaMemAccessDesc.flags' in found_struct}}
     flags : cudaMemAccessFlags
-        ::CUmemProt accessibility flags to set on the request
+        ``CUmemProt`` accessibility flags to set on the request
     {{endif}}
 
     Methods
@@ -1613,8 +1613,8 @@ cdef class cudaMemAllocNodeParams:
     {{if 'cudaMemAllocNodeParams.poolProps' in found_struct}}
     poolProps : cudaMemPoolProps
         in: location where the allocation should reside (specified in
-        ::location). ::handleTypes must be cudaMemHandleTypeNone. IPC is
-        not supported. in: array of memory access descriptors. Used to
+        ``location``). ``handleTypes`` must be cudaMemHandleTypeNone. IPC
+        is not supported. in: array of memory access descriptors. Used to
         describe peer GPU access
     {{endif}}
     {{if 'cudaMemAllocNodeParams.accessDescs' in found_struct}}
@@ -1624,7 +1624,7 @@ cdef class cudaMemAllocNodeParams:
     {{endif}}
     {{if 'cudaMemAllocNodeParams.accessDescCount' in found_struct}}
     accessDescCount : size_t
-        in: Number of `accessDescs`s
+        in: Number of ``accessDescs``
     {{endif}}
     {{if 'cudaMemAllocNodeParams.bytesize' in found_struct}}
     bytesize : size_t
@@ -1664,8 +1664,8 @@ cdef class cudaMemAllocNodeParamsV2:
     {{if 'cudaMemAllocNodeParamsV2.poolProps' in found_struct}}
     poolProps : cudaMemPoolProps
         in: location where the allocation should reside (specified in
-        ::location). ::handleTypes must be cudaMemHandleTypeNone. IPC is
-        not supported. in: array of memory access descriptors. Used to
+        ``location``). ``handleTypes`` must be cudaMemHandleTypeNone. IPC
+        is not supported. in: array of memory access descriptors. Used to
         describe peer GPU access
     {{endif}}
     {{if 'cudaMemAllocNodeParamsV2.accessDescs' in found_struct}}
@@ -1675,7 +1675,7 @@ cdef class cudaMemAllocNodeParamsV2:
     {{endif}}
     {{if 'cudaMemAllocNodeParamsV2.accessDescCount' in found_struct}}
     accessDescCount : size_t
-        in: Number of `accessDescs`s
+        in: Number of ``accessDescs``
     {{endif}}
     {{if 'cudaMemAllocNodeParamsV2.bytesize' in found_struct}}
     bytesize : size_t
@@ -3134,7 +3134,7 @@ cdef class cudaDevSmResourceGroupParams_st:
     {{endif}}
     {{if 'cudaDevSmResourceGroupParams_st.flags' in found_struct}}
     flags : unsigned int
-        Combination of `cudaDevSmResourceGroup_flags` values to indicate
+        Combination of ``cudaDevSmResourceGroup_flags`` values to indicate
         this this group is created.
     {{endif}}
     {{if 'cudaDevSmResourceGroupParams_st.reserved' in found_struct}}
@@ -3159,16 +3159,16 @@ cdef class cudaDevResource_st:
     of the API that created it. struct enumcudaDevResourceTypetype;
     union structcudaDevSmResourcesm;
     structcudaDevWorkqueueConfigResourcewqConfig;
-    structcudaDevWorkqueueResourcewq; ; ;  - If `typename` is
-    `cudaDevResourceTypeInvalid`, this resoure is not valid and cannot
-    be further accessed.    - If `typename` is `cudaDevResourceTypeSm`,
-    the cudaDevSmResource structure `sm` is filled in. For example,
-    `sm.smCount` will reflect the amount of streaming multiprocessors
-    available in this resource.    - If `typename` is
-    `cudaDevResourceTypeWorkqueueConfig`, the
-    cudaDevWorkqueueConfigResource structure `wqConfig` is filled in.
-    - If `typename` is `cudaDevResourceTypeWorkqueue`, the
-    cudaDevWorkqueueResource structure `wq` is filled in.
+    structcudaDevWorkqueueResourcewq; ; ;  - If ``typename`` is
+    ``cudaDevResourceTypeInvalid``, this resource is not valid and
+    cannot be further accessed.    - If ``typename`` is
+    ``cudaDevResourceTypeSm``, the cudaDevSmResource structure ``sm``
+    is filled in. For example, ``sm.smCount`` will reflect the amount
+    of streaming multiprocessors available in this resource.    - If
+    ``typename`` is ``cudaDevResourceTypeWorkqueueConfig``, the
+    cudaDevWorkqueueConfigResource structure ``wqConfig`` is filled in.
+    - If ``typename`` is ``cudaDevResourceTypeWorkqueue``, the
+    cudaDevWorkqueueResource structure ``wq`` is filled in.
 
     Attributes
     ----------
@@ -3182,16 +3182,17 @@ cdef class cudaDevResource_st:
     {{endif}}
     {{if 'cudaDevResource_st.sm' in found_struct}}
     sm : cudaDevSmResource
-        Resource corresponding to cudaDevResourceTypeSm `typename`.
+        Resource corresponding to cudaDevResourceTypeSm ``typename``.
     {{endif}}
     {{if 'cudaDevResource_st.wqConfig' in found_struct}}
     wqConfig : cudaDevWorkqueueConfigResource
         Resource corresponding to cudaDevResourceTypeWorkqueueConfig
-        `typename`.
+        ``typename``.
     {{endif}}
     {{if 'cudaDevResource_st.wq' in found_struct}}
     wq : cudaDevWorkqueueResource
-        Resource corresponding to cudaDevResourceTypeWorkqueue `typename`.
+        Resource corresponding to cudaDevResourceTypeWorkqueue
+        ``typename``.
     {{endif}}
     {{if 'cudaDevResource_st._oversize' in found_struct}}
     _oversize : bytes
@@ -3579,12 +3580,12 @@ cdef class cudaConditionalNodeParams:
         graphs at any level, must belong to the same CUDA context.
         These graphs may be populated using graph node creation APIs or
         cudaStreamBeginCaptureToGraph. cudaGraphCondTypeIf: phGraph_out[0]
-        is executed when the condition is non-zero. If `size` == 2,
+        is executed when the condition is non-zero. If ``size`` == 2,
         phGraph_out[1] will be executed when the condition is zero.
         cudaGraphCondTypeWhile: phGraph_out[0] is executed as long as the
         condition is non-zero. cudaGraphCondTypeSwitch: phGraph_out[n] is
         executed when the condition is equal to n. If the condition >=
-        `size`, no body graph is executed.
+        ``size``, no body graph is executed.
     {{endif}}
     {{if 'cudaConditionalNodeParams.ctx' in found_struct}}
     ctx : cudaExecutionContext_t
@@ -3823,7 +3824,7 @@ cdef class cudaGraphEdgeData_st:
         node on the edge. The meaning is specfic to the node type. A value
         of 0 in all cases means full completion of the upstream node, with
         memory visibility to the downstream node or portion thereof
-        (indicated by `to_port`).   Only kernel nodes define non-zero
+        (indicated by ``to_port``).   Only kernel nodes define non-zero
         ports. A kernel node can use the following output port types:
         cudaGraphKernelNodePortDefault,
         cudaGraphKernelNodePortProgrammatic, or
@@ -3832,7 +3833,7 @@ cdef class cudaGraphEdgeData_st:
     {{if 'cudaGraphEdgeData_st.to_port' in found_struct}}
     to_port : bytes
         This indicates what portion of the downstream node is dependent on
-        the upstream node or portion thereof (indicated by `from_port`).
+        the upstream node or portion thereof (indicated by ``from_port``).
         The meaning is specific to the node type. A value of 0 in all cases
         means the entirety of the downstream node is dependent on the
         upstream work.   Currently no node types define non-zero ports.
@@ -4001,7 +4002,7 @@ cdef class anon_union10:
 cdef class cudaGraphKernelNodeUpdate:
     """
     Struct to specify a single node update to pass as part of a larger
-    array to ::cudaGraphKernelNodeUpdatesApply
+    array to ``cudaGraphKernelNodeUpdatesApply``
 
     Attributes
     ----------
@@ -4225,11 +4226,11 @@ cdef class cudaLaunchAttributeValue:
     clusterDim : anon_struct17
         Value of launch attribute cudaLaunchAttributeClusterDimension that
         represents the desired cluster dimensions for the kernel. Opaque
-        type with the following fields: - `x` - The X dimension of the
+        type with the following fields: - ``x`` - The X dimension of the
         cluster, in blocks. Must be a divisor of the grid X dimension.    -
-        `y` - The Y dimension of the cluster, in blocks. Must be a divisor
-        of the grid Y dimension.    - `z` - The Z dimension of the cluster,
-        in blocks. Must be a divisor of the grid Z dimension.
+        ``y`` - The Y dimension of the cluster, in blocks. Must be a
+        divisor of the grid Y dimension.    - ``z`` - The Z dimension of
+        the cluster, in blocks. Must be a divisor of the grid Z dimension.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.clusterSchedulingPolicyPreference' in found_struct}}
     clusterSchedulingPolicyPreference : cudaClusterSchedulingPolicy
@@ -4245,11 +4246,12 @@ cdef class cudaLaunchAttributeValue:
     {{if 'cudaLaunchAttributeValue.programmaticEvent' in found_struct}}
     programmaticEvent : anon_struct18
         Value of launch attribute cudaLaunchAttributeProgrammaticEvent with
-        the following fields: - `cudaEvent_t` event - Event to fire when
-        all blocks trigger it.    - `int` flags; - Event record flags, see
-        cudaEventRecordWithFlags. Does not accept cudaEventRecordExternal.
-        - `int` triggerAtBlockStart - If this is set to non-0, each block
-        launch will automatically trigger the event.
+        the following fields: - ``cudaEvent_t`` event - Event to fire when
+        all blocks trigger it.    - ``int`` flags; - Event record flags,
+        see cudaEventRecordWithFlags. Does not accept
+        cudaEventRecordExternal.    - ``int`` triggerAtBlockStart - If this
+        is set to non-0, each block launch will automatically trigger the
+        event.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.priority' in found_struct}}
     priority : int
@@ -4271,21 +4273,21 @@ cdef class cudaLaunchAttributeValue:
         Value of launch attribute
         cudaLaunchAttributePreferredClusterDimension that represents the
         desired preferred cluster dimensions for the kernel. Opaque type
-        with the following fields: - `x` - The X dimension of the preferred
-        cluster, in blocks. Must be a divisor of the grid X dimension, and
-        must be a multiple of the `x` field of
-        ::cudaLaunchAttributeValue::clusterDim.    - `y` - The Y dimension
+        with the following fields: - ``x`` - The X dimension of the
+        preferred cluster, in blocks. Must be a divisor of the grid X
+        dimension, and must be a multiple of the ``x`` field of
+        cudaLaunchAttributeValue::clusterDim.    - ``y`` - The Y dimension
         of the preferred cluster, in blocks. Must be a divisor of the grid
-        Y dimension, and must be a multiple of the `y` field of
-        ::cudaLaunchAttributeValue::clusterDim.    - `z` - The Z dimension
-        of the preferred cluster, in blocks. Must be equal to the `z` field
-        of ::cudaLaunchAttributeValue::clusterDim.
+        Y dimension, and must be a multiple of the ``y`` field of
+        cudaLaunchAttributeValue::clusterDim.    - ``z`` - The Z dimension
+        of the preferred cluster, in blocks. Must be equal to the ``z``
+        field of cudaLaunchAttributeValue::clusterDim.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.launchCompletionEvent' in found_struct}}
     launchCompletionEvent : anon_struct20
         Value of launch attribute cudaLaunchAttributeLaunchCompletionEvent
-        with the following fields: - `cudaEvent_t` event - Event to fire
-        when the last block launches.    - `int` flags - Event record
+        with the following fields: - ``cudaEvent_t`` event - Event to fire
+        when the last block launches.    - ``int`` flags - Event record
         flags, see cudaEventRecordWithFlags. Does not accept
         cudaEventRecordExternal.
     {{endif}}
@@ -4293,9 +4295,9 @@ cdef class cudaLaunchAttributeValue:
     deviceUpdatableKernelNode : anon_struct21
         Value of launch attribute
         cudaLaunchAttributeDeviceUpdatableKernelNode with the following
-        fields: - `int` deviceUpdatable - Whether or not the resulting
+        fields: - ``int`` deviceUpdatable - Whether or not the resulting
         kernel node should be device-updatable.    -
-        `cudaGraphDeviceNode_t` devNode - Returns a handle to pass to the
+        ``cudaGraphDeviceNode_t`` devNode - Returns a handle to pass to the
         various device-side update functions.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.sharedMemCarveout' in found_struct}}
@@ -4429,7 +4431,7 @@ cdef class cudaAsyncNotificationInfo:
     {{endif}}
     {{if 'cudaAsyncNotificationInfo.info' in found_struct}}
     info : anon_union11
-        Information about the notification. `typename` must be checked in
+        Information about the notification. ``typename`` must be checked in
         order to interpret this field.
     {{endif}}
 
@@ -4781,7 +4783,7 @@ cdef class cudaDevSmResourceGroupParams(cudaDevSmResourceGroupParams_st):
     {{endif}}
     {{if 'cudaDevSmResourceGroupParams_st.flags' in found_struct}}
     flags : unsigned int
-        Combination of `cudaDevSmResourceGroup_flags` values to indicate
+        Combination of ``cudaDevSmResourceGroup_flags`` values to indicate
         this this group is created.
     {{endif}}
     {{if 'cudaDevSmResourceGroupParams_st.reserved' in found_struct}}
@@ -4805,16 +4807,16 @@ cdef class cudaDevResource(cudaDevResource_st):
     of the API that created it. struct enumcudaDevResourceTypetype;
     union structcudaDevSmResourcesm;
     structcudaDevWorkqueueConfigResourcewqConfig;
-    structcudaDevWorkqueueResourcewq; ; ;  - If `typename` is
-    `cudaDevResourceTypeInvalid`, this resoure is not valid and cannot
-    be further accessed.    - If `typename` is `cudaDevResourceTypeSm`,
-    the cudaDevSmResource structure `sm` is filled in. For example,
-    `sm.smCount` will reflect the amount of streaming multiprocessors
-    available in this resource.    - If `typename` is
-    `cudaDevResourceTypeWorkqueueConfig`, the
-    cudaDevWorkqueueConfigResource structure `wqConfig` is filled in.
-    - If `typename` is `cudaDevResourceTypeWorkqueue`, the
-    cudaDevWorkqueueResource structure `wq` is filled in.
+    structcudaDevWorkqueueResourcewq; ; ;  - If ``typename`` is
+    ``cudaDevResourceTypeInvalid``, this resource is not valid and
+    cannot be further accessed.    - If ``typename`` is
+    ``cudaDevResourceTypeSm``, the cudaDevSmResource structure ``sm``
+    is filled in. For example, ``sm.smCount`` will reflect the amount
+    of streaming multiprocessors available in this resource.    - If
+    ``typename`` is ``cudaDevResourceTypeWorkqueueConfig``, the
+    cudaDevWorkqueueConfigResource structure ``wqConfig`` is filled in.
+    - If ``typename`` is ``cudaDevResourceTypeWorkqueue``, the
+    cudaDevWorkqueueResource structure ``wq`` is filled in.
 
     Attributes
     ----------
@@ -4828,16 +4830,17 @@ cdef class cudaDevResource(cudaDevResource_st):
     {{endif}}
     {{if 'cudaDevResource_st.sm' in found_struct}}
     sm : cudaDevSmResource
-        Resource corresponding to cudaDevResourceTypeSm `typename`.
+        Resource corresponding to cudaDevResourceTypeSm ``typename``.
     {{endif}}
     {{if 'cudaDevResource_st.wqConfig' in found_struct}}
     wqConfig : cudaDevWorkqueueConfigResource
         Resource corresponding to cudaDevResourceTypeWorkqueueConfig
-        `typename`.
+        ``typename``.
     {{endif}}
     {{if 'cudaDevResource_st.wq' in found_struct}}
     wq : cudaDevWorkqueueResource
-        Resource corresponding to cudaDevResourceTypeWorkqueue `typename`.
+        Resource corresponding to cudaDevResourceTypeWorkqueue
+        ``typename``.
     {{endif}}
     {{if 'cudaDevResource_st._oversize' in found_struct}}
     _oversize : bytes
@@ -4872,7 +4875,7 @@ cdef class cudaGraphEdgeData(cudaGraphEdgeData_st):
         node on the edge. The meaning is specfic to the node type. A value
         of 0 in all cases means full completion of the upstream node, with
         memory visibility to the downstream node or portion thereof
-        (indicated by `to_port`).   Only kernel nodes define non-zero
+        (indicated by ``to_port``).   Only kernel nodes define non-zero
         ports. A kernel node can use the following output port types:
         cudaGraphKernelNodePortDefault,
         cudaGraphKernelNodePortProgrammatic, or
@@ -4881,7 +4884,7 @@ cdef class cudaGraphEdgeData(cudaGraphEdgeData_st):
     {{if 'cudaGraphEdgeData_st.to_port' in found_struct}}
     to_port : bytes
         This indicates what portion of the downstream node is dependent on
-        the upstream node or portion thereof (indicated by `from_port`).
+        the upstream node or portion thereof (indicated by ``from_port``).
         The meaning is specific to the node type. A value of 0 in all cases
         means the entirety of the downstream node is dependent on the
         upstream work.   Currently no node types define non-zero ports.
@@ -5037,7 +5040,7 @@ cdef class cudaAsyncNotificationInfo_t(cudaAsyncNotificationInfo):
     {{endif}}
     {{if 'cudaAsyncNotificationInfo.info' in found_struct}}
     info : anon_union11
-        Information about the notification. `typename` must be checked in
+        Information about the notification. ``typename`` must be checked in
         order to interpret this field.
     {{endif}}
 
@@ -5078,11 +5081,11 @@ cdef class cudaStreamAttrValue(cudaLaunchAttributeValue):
     clusterDim : anon_struct17
         Value of launch attribute cudaLaunchAttributeClusterDimension that
         represents the desired cluster dimensions for the kernel. Opaque
-        type with the following fields: - `x` - The X dimension of the
+        type with the following fields: - ``x`` - The X dimension of the
         cluster, in blocks. Must be a divisor of the grid X dimension.    -
-        `y` - The Y dimension of the cluster, in blocks. Must be a divisor
-        of the grid Y dimension.    - `z` - The Z dimension of the cluster,
-        in blocks. Must be a divisor of the grid Z dimension.
+        ``y`` - The Y dimension of the cluster, in blocks. Must be a
+        divisor of the grid Y dimension.    - ``z`` - The Z dimension of
+        the cluster, in blocks. Must be a divisor of the grid Z dimension.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.clusterSchedulingPolicyPreference' in found_struct}}
     clusterSchedulingPolicyPreference : cudaClusterSchedulingPolicy
@@ -5098,11 +5101,12 @@ cdef class cudaStreamAttrValue(cudaLaunchAttributeValue):
     {{if 'cudaLaunchAttributeValue.programmaticEvent' in found_struct}}
     programmaticEvent : anon_struct18
         Value of launch attribute cudaLaunchAttributeProgrammaticEvent with
-        the following fields: - `cudaEvent_t` event - Event to fire when
-        all blocks trigger it.    - `int` flags; - Event record flags, see
-        cudaEventRecordWithFlags. Does not accept cudaEventRecordExternal.
-        - `int` triggerAtBlockStart - If this is set to non-0, each block
-        launch will automatically trigger the event.
+        the following fields: - ``cudaEvent_t`` event - Event to fire when
+        all blocks trigger it.    - ``int`` flags; - Event record flags,
+        see cudaEventRecordWithFlags. Does not accept
+        cudaEventRecordExternal.    - ``int`` triggerAtBlockStart - If this
+        is set to non-0, each block launch will automatically trigger the
+        event.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.priority' in found_struct}}
     priority : int
@@ -5124,21 +5128,21 @@ cdef class cudaStreamAttrValue(cudaLaunchAttributeValue):
         Value of launch attribute
         cudaLaunchAttributePreferredClusterDimension that represents the
         desired preferred cluster dimensions for the kernel. Opaque type
-        with the following fields: - `x` - The X dimension of the preferred
-        cluster, in blocks. Must be a divisor of the grid X dimension, and
-        must be a multiple of the `x` field of
-        ::cudaLaunchAttributeValue::clusterDim.    - `y` - The Y dimension
+        with the following fields: - ``x`` - The X dimension of the
+        preferred cluster, in blocks. Must be a divisor of the grid X
+        dimension, and must be a multiple of the ``x`` field of
+        cudaLaunchAttributeValue::clusterDim.    - ``y`` - The Y dimension
         of the preferred cluster, in blocks. Must be a divisor of the grid
-        Y dimension, and must be a multiple of the `y` field of
-        ::cudaLaunchAttributeValue::clusterDim.    - `z` - The Z dimension
-        of the preferred cluster, in blocks. Must be equal to the `z` field
-        of ::cudaLaunchAttributeValue::clusterDim.
+        Y dimension, and must be a multiple of the ``y`` field of
+        cudaLaunchAttributeValue::clusterDim.    - ``z`` - The Z dimension
+        of the preferred cluster, in blocks. Must be equal to the ``z``
+        field of cudaLaunchAttributeValue::clusterDim.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.launchCompletionEvent' in found_struct}}
     launchCompletionEvent : anon_struct20
         Value of launch attribute cudaLaunchAttributeLaunchCompletionEvent
-        with the following fields: - `cudaEvent_t` event - Event to fire
-        when the last block launches.    - `int` flags - Event record
+        with the following fields: - ``cudaEvent_t`` event - Event to fire
+        when the last block launches.    - ``int`` flags - Event record
         flags, see cudaEventRecordWithFlags. Does not accept
         cudaEventRecordExternal.
     {{endif}}
@@ -5146,9 +5150,9 @@ cdef class cudaStreamAttrValue(cudaLaunchAttributeValue):
     deviceUpdatableKernelNode : anon_struct21
         Value of launch attribute
         cudaLaunchAttributeDeviceUpdatableKernelNode with the following
-        fields: - `int` deviceUpdatable - Whether or not the resulting
+        fields: - ``int`` deviceUpdatable - Whether or not the resulting
         kernel node should be device-updatable.    -
-        `cudaGraphDeviceNode_t` devNode - Returns a handle to pass to the
+        ``cudaGraphDeviceNode_t`` devNode - Returns a handle to pass to the
         various device-side update functions.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.sharedMemCarveout' in found_struct}}
@@ -5209,11 +5213,11 @@ cdef class cudaKernelNodeAttrValue(cudaLaunchAttributeValue):
     clusterDim : anon_struct17
         Value of launch attribute cudaLaunchAttributeClusterDimension that
         represents the desired cluster dimensions for the kernel. Opaque
-        type with the following fields: - `x` - The X dimension of the
+        type with the following fields: - ``x`` - The X dimension of the
         cluster, in blocks. Must be a divisor of the grid X dimension.    -
-        `y` - The Y dimension of the cluster, in blocks. Must be a divisor
-        of the grid Y dimension.    - `z` - The Z dimension of the cluster,
-        in blocks. Must be a divisor of the grid Z dimension.
+        ``y`` - The Y dimension of the cluster, in blocks. Must be a
+        divisor of the grid Y dimension.    - ``z`` - The Z dimension of
+        the cluster, in blocks. Must be a divisor of the grid Z dimension.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.clusterSchedulingPolicyPreference' in found_struct}}
     clusterSchedulingPolicyPreference : cudaClusterSchedulingPolicy
@@ -5229,11 +5233,12 @@ cdef class cudaKernelNodeAttrValue(cudaLaunchAttributeValue):
     {{if 'cudaLaunchAttributeValue.programmaticEvent' in found_struct}}
     programmaticEvent : anon_struct18
         Value of launch attribute cudaLaunchAttributeProgrammaticEvent with
-        the following fields: - `cudaEvent_t` event - Event to fire when
-        all blocks trigger it.    - `int` flags; - Event record flags, see
-        cudaEventRecordWithFlags. Does not accept cudaEventRecordExternal.
-        - `int` triggerAtBlockStart - If this is set to non-0, each block
-        launch will automatically trigger the event.
+        the following fields: - ``cudaEvent_t`` event - Event to fire when
+        all blocks trigger it.    - ``int`` flags; - Event record flags,
+        see cudaEventRecordWithFlags. Does not accept
+        cudaEventRecordExternal.    - ``int`` triggerAtBlockStart - If this
+        is set to non-0, each block launch will automatically trigger the
+        event.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.priority' in found_struct}}
     priority : int
@@ -5255,21 +5260,21 @@ cdef class cudaKernelNodeAttrValue(cudaLaunchAttributeValue):
         Value of launch attribute
         cudaLaunchAttributePreferredClusterDimension that represents the
         desired preferred cluster dimensions for the kernel. Opaque type
-        with the following fields: - `x` - The X dimension of the preferred
-        cluster, in blocks. Must be a divisor of the grid X dimension, and
-        must be a multiple of the `x` field of
-        ::cudaLaunchAttributeValue::clusterDim.    - `y` - The Y dimension
+        with the following fields: - ``x`` - The X dimension of the
+        preferred cluster, in blocks. Must be a divisor of the grid X
+        dimension, and must be a multiple of the ``x`` field of
+        cudaLaunchAttributeValue::clusterDim.    - ``y`` - The Y dimension
         of the preferred cluster, in blocks. Must be a divisor of the grid
-        Y dimension, and must be a multiple of the `y` field of
-        ::cudaLaunchAttributeValue::clusterDim.    - `z` - The Z dimension
-        of the preferred cluster, in blocks. Must be equal to the `z` field
-        of ::cudaLaunchAttributeValue::clusterDim.
+        Y dimension, and must be a multiple of the ``y`` field of
+        cudaLaunchAttributeValue::clusterDim.    - ``z`` - The Z dimension
+        of the preferred cluster, in blocks. Must be equal to the ``z``
+        field of cudaLaunchAttributeValue::clusterDim.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.launchCompletionEvent' in found_struct}}
     launchCompletionEvent : anon_struct20
         Value of launch attribute cudaLaunchAttributeLaunchCompletionEvent
-        with the following fields: - `cudaEvent_t` event - Event to fire
-        when the last block launches.    - `int` flags - Event record
+        with the following fields: - ``cudaEvent_t`` event - Event to fire
+        when the last block launches.    - ``int`` flags - Event record
         flags, see cudaEventRecordWithFlags. Does not accept
         cudaEventRecordExternal.
     {{endif}}
@@ -5277,9 +5282,9 @@ cdef class cudaKernelNodeAttrValue(cudaLaunchAttributeValue):
     deviceUpdatableKernelNode : anon_struct21
         Value of launch attribute
         cudaLaunchAttributeDeviceUpdatableKernelNode with the following
-        fields: - `int` deviceUpdatable - Whether or not the resulting
+        fields: - ``int`` deviceUpdatable - Whether or not the resulting
         kernel node should be device-updatable.    -
-        `cudaGraphDeviceNode_t` devNode - Returns a handle to pass to the
+        ``cudaGraphDeviceNode_t`` devNode - Returns a handle to pass to the
         various device-side update functions.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.sharedMemCarveout' in found_struct}}
diff --git a/cuda_bindings/cuda/bindings/runtime.pyx.in b/cuda_bindings/cuda/bindings/runtime.pyx.in
index 5c38d5c0a2..26a4e6ad67 100644
--- a/cuda_bindings/cuda/bindings/runtime.pyx.in
+++ b/cuda_bindings/cuda/bindings/runtime.pyx.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1719+g565f73f4e. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1725+g07a86fe3d.d20260603. Do not modify it directly.
 from typing import Any, Optional
 import cython
 import ctypes
@@ -228,21 +228,19 @@ CUDA_IPC_HANDLE_SIZE = cyruntime.CUDA_IPC_HANDLE_SIZE
 #: Indicates that the external memory object is a dedicated resource
 cudaExternalMemoryDedicated = cyruntime.cudaExternalMemoryDedicated
 
-#: When the /p flags parameter of
-#: :py:obj:`~.cudaExternalSemaphoreSignalParams` contains this flag, it
-#: indicates that signaling an external semaphore object should skip
-#: performing appropriate memory synchronization operations over all the
-#: external memory objects that are imported as
+#: When the ``flags`` parameter of cudaExternalSemaphoreSignalParams
+#: contains this flag, it indicates that signaling an external semaphore
+#: object should skip performing appropriate memory synchronization
+#: operations over all the external memory objects that are imported as
 #: :py:obj:`~.cudaExternalMemoryHandleTypeNvSciBuf`, which otherwise are
 #: performed by default to ensure data coherency with other importers of
 #: the same NvSciBuf memory objects.
 cudaExternalSemaphoreSignalSkipNvSciBufMemSync = cyruntime.cudaExternalSemaphoreSignalSkipNvSciBufMemSync
 
-#: When the /p flags parameter of
-#: :py:obj:`~.cudaExternalSemaphoreWaitParams` contains this flag, it
-#: indicates that waiting an external semaphore object should skip
-#: performing appropriate memory synchronization operations over all the
-#: external memory objects that are imported as
+#: When the ``flags`` parameter of cudaExternalSemaphoreWaitParams contains
+#: this flag, it indicates that waiting an external semaphore object should
+#: skip performing appropriate memory synchronization operations over all
+#: the external memory objects that are imported as
 #: :py:obj:`~.cudaExternalMemoryHandleTypeNvSciBuf`, which otherwise are
 #: performed by default to ensure data coherency with other importers of
 #: the same NvSciBuf memory objects.
@@ -413,8 +411,7 @@ class cudaError_t(_FastEnum):
         'This indicates that a kernel launch is requesting resources that can never\n'
         'be satisfied by the current device. Requesting more shared memory per block\n'
         'than the device supports will trigger this error, as will requesting too\n'
-        'many threads or blocks. See :py:obj:`~.cudaDeviceProp` for more device\n'
-        'limitations.\n'
+        'many threads or blocks. See cudaDeviceProp for more device limitations.\n'
     ){{endif}}
     {{if 'cudaErrorVersionTranslation' in found_values}}
 
@@ -618,7 +615,7 @@ class cudaError_t(_FastEnum):
         'Driver context was created using an older version of the API, because the\n'
         'Runtime API call expects a primary driver context and the Driver context is\n'
         'not primary, or because the Driver context has been destroyed. Please see\n'
-        ':py:obj:`~.Interactions`with the CUDA Driver API" for more information.\n'
+        'Interactions with the CUDA Driver API for more information.\n'
     ){{endif}}
     {{if 'cudaErrorMissingConfiguration' in found_values}}
 
@@ -1468,7 +1465,7 @@ class cudaGraphDependencyType(_FastEnum):
     cudaGraphDependencyTypeProgrammatic = (
         cyruntime.cudaGraphDependencyType_enum.cudaGraphDependencyTypeProgrammatic,
         'This dependency type allows the downstream node to use\n'
-        '`cudaGridDependencySynchronize()`. It may only be used between kernel\n'
+        '``cudaGridDependencySynchronize()``. It may only be used between kernel\n'
         'nodes, and must be used with either the\n'
         ':py:obj:`~.cudaGraphKernelNodePortProgrammatic` or\n'
         ':py:obj:`~.cudaGraphKernelNodePortLaunchCompletion` outgoing port.\n'
@@ -1606,41 +1603,41 @@ class cudaLaunchAttributeID(_FastEnum):
     cudaLaunchAttributeAccessPolicyWindow = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeAccessPolicyWindow,
         'Valid for streams, graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::accessPolicyWindow.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.accessPolicyWindow`.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeCooperative' in found_values}}
 
     cudaLaunchAttributeCooperative = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeCooperative,
         'Valid for graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::cooperative.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.cooperative`.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeSynchronizationPolicy' in found_values}}
 
     cudaLaunchAttributeSynchronizationPolicy = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeSynchronizationPolicy,
-        'Valid for streams. See :py:obj:`~.cudaLaunchAttributeValue`::syncPolicy.\n'
+        'Valid for streams. See :py:obj:`~.cudaLaunchAttributeValue.syncPolicy`.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeClusterDimension' in found_values}}
 
     cudaLaunchAttributeClusterDimension = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeClusterDimension,
         'Valid for graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::clusterDim.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.clusterDim`.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeClusterSchedulingPolicyPreference' in found_values}}
 
     cudaLaunchAttributeClusterSchedulingPolicyPreference = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeClusterSchedulingPolicyPreference,
         'Valid for graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::clusterSchedulingPolicyPreference.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.clusterSchedulingPolicyPreference`.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeProgrammaticStreamSerialization' in found_values}}
 
     cudaLaunchAttributeProgrammaticStreamSerialization = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeProgrammaticStreamSerialization,
         'Valid for launches. Setting\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::programmaticStreamSerializationAllowed\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.programmaticStreamSerializationAllowed`\n'
         'to non-0 signals that the kernel will use programmatic means to resolve its\n'
         'stream dependency, so that the CUDA runtime should opportunistically allow\n'
         "the grid's execution to overlap with the previous kernel in the stream, if\n"
@@ -1653,11 +1650,11 @@ class cudaLaunchAttributeID(_FastEnum):
     cudaLaunchAttributeProgrammaticEvent = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeProgrammaticEvent,
         'Valid for launches. Set\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::programmaticEvent to record the\n'
-        'event. Event recorded through this launch attribute is guaranteed to only\n'
-        'trigger after all block in the associated kernel trigger the event. A block\n'
-        'can trigger the event programmatically in a future CUDA release. A trigger\n'
-        "can also be inserted at the beginning of each block's execution if\n"
+        ':py:obj:`~.cudaLaunchAttributeValue.programmaticEvent` to record the event.\n'
+        'Event recorded through this launch attribute is guaranteed to only trigger\n'
+        'after all blocks in the associated kernel trigger the event. A block can\n'
+        'trigger the event programmatically in a future CUDA release. A trigger can\n'
+        "also be inserted at the beginning of each block's execution if\n"
         'triggerAtBlockStart is set to non-0. The dependent launches can choose to\n'
         'wait on the dependency using the programmatic sync\n'
         '(cudaGridDependencySynchronize() or equivalent PTX instructions). Note that\n'
@@ -1678,28 +1675,28 @@ class cudaLaunchAttributeID(_FastEnum):
     cudaLaunchAttributePriority = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributePriority,
         'Valid for streams, graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::priority.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.priority`.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeMemSyncDomainMap' in found_values}}
 
     cudaLaunchAttributeMemSyncDomainMap = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeMemSyncDomainMap,
         'Valid for streams, graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::memSyncDomainMap.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.memSyncDomainMap`.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeMemSyncDomain' in found_values}}
 
     cudaLaunchAttributeMemSyncDomain = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeMemSyncDomain,
         'Valid for streams, graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::memSyncDomain.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.memSyncDomain`.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributePreferredClusterDimension' in found_values}}
 
     cudaLaunchAttributePreferredClusterDimension = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributePreferredClusterDimension,
         'Valid for graph nodes and launches. Set\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::preferredClusterDim to allow the\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.preferredClusterDim` to allow the\n'
         'kernel launch to specify a preferred substitute cluster dimension. Blocks\n'
         'may be grouped according to either the dimensions specified with this\n'
         'attribute (grouped into a "preferred substitute cluster"), or the one\n'
@@ -1722,9 +1719,9 @@ class cudaLaunchAttributeID(_FastEnum):
         ' This attribute will only take effect when a regular cluster dimension has\n'
         'been specified. The preferred substitute cluster dimension must be an\n'
         'integer multiple greater than zero of the regular cluster dimension and\n'
-        'must divide the grid. It must also be no more than `maxBlocksPerCluster`,\n'
-        "if it is set in the kernel's `__launch_bounds__`. Otherwise it must be less\n"
-        'than the maximum value the driver can support. Otherwise, setting this\n'
+        'must divide the grid. It must also be no more than ``maxBlocksPerCluster``,\n'
+        "if it is set in the kernel's ``__launch_bounds__``. Otherwise it must be\n"
+        'less than the maximum value the driver can support. Otherwise, setting this\n'
         'attribute to a value physically unable to fit on any particular device is\n'
         'permitted.\n'
     ){{endif}}
@@ -1733,7 +1730,7 @@ class cudaLaunchAttributeID(_FastEnum):
     cudaLaunchAttributeLaunchCompletionEvent = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeLaunchCompletionEvent,
         'Valid for launches. Set\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::launchCompletionEvent to record the\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.launchCompletionEvent` to record the\n'
         'event.\n'
         ' Nominally, the event is triggered once all blocks of the kernel have begun\n'
         'execution. Currently this is a best effort. If a kernel B has a launch\n'
@@ -1743,8 +1740,8 @@ class cudaLaunchAttributeID(_FastEnum):
         'on different GPUs) or if B is a higher priority than A. Exercise caution if\n'
         'such an ordering inversion could lead to deadlock.\n'
         ' A launch completion event is nominally similar to a programmatic event\n'
-        'with `triggerAtBlockStart` set except that it is not visible to\n'
-        '`cudaGridDependencySynchronize()` and can be used with compute capability\n'
+        'with ``triggerAtBlockStart`` set except that it is not visible to\n'
+        '``cudaGridDependencySynchronize()`` and can be used with compute capability\n'
         'less than 9.0.\n'
         ' The event supplied must not be an interprocess or interop event. The event\n'
         'must disable timing (i.e. must be created with the\n'
@@ -1760,7 +1757,7 @@ class cudaLaunchAttributeID(_FastEnum):
         'only be set to 0 or 1. Setting the field to 1 indicates that the\n'
         'corresponding kernel node should be device-updatable. On success, a handle\n'
         'will be returned via\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::deviceUpdatableKernelNode::devNode\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.deviceUpdatableKernelNode.devNode`\n'
         'which can be passed to the various device-side update functions to update\n'
         "the node's kernel parameters from within another kernel. For more\n"
         'information on the types of device updates that can be made, as well as the\n'
@@ -1788,7 +1785,7 @@ class cudaLaunchAttributeID(_FastEnum):
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributePreferredSharedMemoryCarveout,
         'Valid for launches. On devices where the L1 cache and shared memory use the\n'
         'same hardware resources, setting\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::sharedMemCarveout to a percentage\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.sharedMemCarveout` to a percentage\n'
         'between 0-100 signals sets the shared memory carveout preference in percent\n'
         'of the total shared memory for that kernel launch. This attribute takes\n'
         'precedence over :py:obj:`~.cudaFuncAttributePreferredSharedMemoryCarveout`.\n'
@@ -1814,7 +1811,7 @@ class cudaLaunchAttributeID(_FastEnum):
         'not improve the performance of either the targeted kernel or the\n'
         'encapsulating application.\n'
         ' Valid values for\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::nvlinkUtilCentricScheduling are 0\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.nvlinkUtilCentricScheduling` are 0\n'
         '(disabled) and 1 (enabled).\n'
     ){{endif}}
     {{if 'cudaLaunchAttributePortableClusterSizeMode' in found_values}}
@@ -1823,8 +1820,8 @@ class cudaLaunchAttributeID(_FastEnum):
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributePortableClusterSizeMode,
         'Valid for graph nodes, launches. This indicates whether the kernel launch\n'
         'is allowed to use a non-portable cluster size. Valid values for\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::portableClusterSizeMode are values\n'
-        'for :py:obj:`~.cudaLaunchAttributePortableClusterMode` Any other value will\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.portableClusterSizeMode` are values for\n'
+        ':py:obj:`~.cudaLaunchAttributePortableClusterMode`. Any other value will\n'
         'return :py:obj:`~.cudaErrorInvalidValue`\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeSharedMemoryMode' in found_values}}
@@ -3220,8 +3217,8 @@ class cudaMemcpyKind(_FastEnum):
 
 class cudaAccessProperty(_FastEnum):
     """
-    Specifies performance hint with :py:obj:`~.cudaAccessPolicyWindow`
-    for hitProp and missProp members.
+    Specifies performance hint with cudaAccessPolicyWindow for hitProp
+    and missProp members.
     """
     {{if 'cudaAccessPropertyNormal' in found_values}}
 
@@ -5870,11 +5867,11 @@ class cudaLibraryOption(_FastEnum):
 
     cudaLibraryBinaryIsPreserved = (
         cyruntime.cudaLibraryOption.cudaLibraryBinaryIsPreserved,
-        'Specifes that the argument `code` passed to\n'
+        'Specifies that the argument ``code`` passed to\n'
         ':py:obj:`~.cudaLibraryLoadData()` will be preserved. Specifying this option\n'
-        'will let the driver know that `code` can be accessed at any point until\n'
+        'will let the driver know that ``code`` can be accessed at any point until\n'
         ':py:obj:`~.cudaLibraryUnload()`. The default behavior is for the driver to\n'
-        'allocate and maintain its own copy of `code`. Note that this is only a\n'
+        'allocate and maintain its own copy of ``code``. Note that this is only a\n'
         'memory usage optimization hint and the driver can choose to ignore it if\n'
         'required. Specifying this option with :py:obj:`~.cudaLibraryLoadFromFile()`\n'
         'is invalid and will return :py:obj:`~.cudaErrorInvalidValue`.\n'
@@ -6011,8 +6008,8 @@ class cudaGraphConditionalNodeType(_FastEnum):
     cudaGraphCondTypeIf = (
         cyruntime.cudaGraphConditionalNodeType.cudaGraphCondTypeIf,
         "Conditional 'if/else' Node. Body[0] executed if condition is non-zero. If\n"
-        '`size` == 2, an optional ELSE graph is created and this is executed if the\n'
-        'condition is zero.\n'
+        '``size`` == 2, an optional ELSE graph is created and this is executed if\n'
+        'the condition is zero.\n'
     ){{endif}}
     {{if 'cudaGraphCondTypeWhile' in found_values}}
 
@@ -6352,25 +6349,25 @@ class cudaGraphDebugDotFlags(_FastEnum):
 
     cudaGraphDebugDotFlagsKernelNodeParams = (
         cyruntime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsKernelNodeParams,
-        'Adds :py:obj:`~.cudaKernelNodeParams` to output\n'
+        'Adds cudaKernelNodeParams to output\n'
     ){{endif}}
     {{if 'cudaGraphDebugDotFlagsMemcpyNodeParams' in found_values}}
 
     cudaGraphDebugDotFlagsMemcpyNodeParams = (
         cyruntime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsMemcpyNodeParams,
-        'Adds :py:obj:`~.cudaMemcpy3DParms` to output\n'
+        'Adds cudaMemcpy3DParms to output\n'
     ){{endif}}
     {{if 'cudaGraphDebugDotFlagsMemsetNodeParams' in found_values}}
 
     cudaGraphDebugDotFlagsMemsetNodeParams = (
         cyruntime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsMemsetNodeParams,
-        'Adds :py:obj:`~.cudaMemsetParams` to output\n'
+        'Adds cudaMemsetParams to output\n'
     ){{endif}}
     {{if 'cudaGraphDebugDotFlagsHostNodeParams' in found_values}}
 
     cudaGraphDebugDotFlagsHostNodeParams = (
         cyruntime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsHostNodeParams,
-        'Adds :py:obj:`~.cudaHostNodeParams` to output\n'
+        'Adds cudaHostNodeParams to output\n'
     ){{endif}}
     {{if 'cudaGraphDebugDotFlagsEventNodeParams' in found_values}}
 
@@ -6382,13 +6379,13 @@ class cudaGraphDebugDotFlags(_FastEnum):
 
     cudaGraphDebugDotFlagsExtSemasSignalNodeParams = (
         cyruntime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsExtSemasSignalNodeParams,
-        'Adds :py:obj:`~.cudaExternalSemaphoreSignalNodeParams` values to output\n'
+        'Adds cudaExternalSemaphoreSignalNodeParams values to output\n'
     ){{endif}}
     {{if 'cudaGraphDebugDotFlagsExtSemasWaitNodeParams' in found_values}}
 
     cudaGraphDebugDotFlagsExtSemasWaitNodeParams = (
         cyruntime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsExtSemasWaitNodeParams,
-        'Adds :py:obj:`~.cudaExternalSemaphoreWaitNodeParams` to output\n'
+        'Adds cudaExternalSemaphoreWaitNodeParams to output\n'
     ){{endif}}
     {{if 'cudaGraphDebugDotFlagsKernelNodeAttributes' in found_values}}
 
@@ -6406,7 +6403,7 @@ class cudaGraphDebugDotFlags(_FastEnum):
 
     cudaGraphDebugDotFlagsConditionalNodeParams = (
         cyruntime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsConditionalNodeParams,
-        'Adds :py:obj:`~.cudaConditionalNodeParams` to output\n'
+        'Adds cudaConditionalNodeParams to output\n'
     ){{endif}}
 
 {{endif}}
@@ -6429,7 +6426,7 @@ class cudaGraphInstantiateFlags(_FastEnum):
         'Automatically upload the graph after instantiation. Only supported by\n'
         ' :py:obj:`~.cudaGraphInstantiateWithParams`. The upload will be performed\n'
         'using the\n'
-        ' stream provided in `instantiateParams`.\n'
+        ' stream provided in ``instantiateParams``.\n'
     ){{endif}}
     {{if 'cudaGraphInstantiateFlagDeviceLaunch' in found_values}}
 
@@ -6710,41 +6707,41 @@ class cudaStreamAttrID(_FastEnum):
     cudaLaunchAttributeAccessPolicyWindow = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeAccessPolicyWindow,
         'Valid for streams, graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::accessPolicyWindow.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.accessPolicyWindow`.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeCooperative' in found_values}}
 
     cudaLaunchAttributeCooperative = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeCooperative,
         'Valid for graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::cooperative.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.cooperative`.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeSynchronizationPolicy' in found_values}}
 
     cudaLaunchAttributeSynchronizationPolicy = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeSynchronizationPolicy,
-        'Valid for streams. See :py:obj:`~.cudaLaunchAttributeValue`::syncPolicy.\n'
+        'Valid for streams. See :py:obj:`~.cudaLaunchAttributeValue.syncPolicy`.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeClusterDimension' in found_values}}
 
     cudaLaunchAttributeClusterDimension = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeClusterDimension,
         'Valid for graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::clusterDim.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.clusterDim`.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeClusterSchedulingPolicyPreference' in found_values}}
 
     cudaLaunchAttributeClusterSchedulingPolicyPreference = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeClusterSchedulingPolicyPreference,
         'Valid for graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::clusterSchedulingPolicyPreference.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.clusterSchedulingPolicyPreference`.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeProgrammaticStreamSerialization' in found_values}}
 
     cudaLaunchAttributeProgrammaticStreamSerialization = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeProgrammaticStreamSerialization,
         'Valid for launches. Setting\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::programmaticStreamSerializationAllowed\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.programmaticStreamSerializationAllowed`\n'
         'to non-0 signals that the kernel will use programmatic means to resolve its\n'
         'stream dependency, so that the CUDA runtime should opportunistically allow\n'
         "the grid's execution to overlap with the previous kernel in the stream, if\n"
@@ -6757,11 +6754,11 @@ class cudaStreamAttrID(_FastEnum):
     cudaLaunchAttributeProgrammaticEvent = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeProgrammaticEvent,
         'Valid for launches. Set\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::programmaticEvent to record the\n'
-        'event. Event recorded through this launch attribute is guaranteed to only\n'
-        'trigger after all block in the associated kernel trigger the event. A block\n'
-        'can trigger the event programmatically in a future CUDA release. A trigger\n'
-        "can also be inserted at the beginning of each block's execution if\n"
+        ':py:obj:`~.cudaLaunchAttributeValue.programmaticEvent` to record the event.\n'
+        'Event recorded through this launch attribute is guaranteed to only trigger\n'
+        'after all blocks in the associated kernel trigger the event. A block can\n'
+        'trigger the event programmatically in a future CUDA release. A trigger can\n'
+        "also be inserted at the beginning of each block's execution if\n"
         'triggerAtBlockStart is set to non-0. The dependent launches can choose to\n'
         'wait on the dependency using the programmatic sync\n'
         '(cudaGridDependencySynchronize() or equivalent PTX instructions). Note that\n'
@@ -6782,28 +6779,28 @@ class cudaStreamAttrID(_FastEnum):
     cudaLaunchAttributePriority = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributePriority,
         'Valid for streams, graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::priority.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.priority`.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeMemSyncDomainMap' in found_values}}
 
     cudaLaunchAttributeMemSyncDomainMap = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeMemSyncDomainMap,
         'Valid for streams, graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::memSyncDomainMap.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.memSyncDomainMap`.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeMemSyncDomain' in found_values}}
 
     cudaLaunchAttributeMemSyncDomain = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeMemSyncDomain,
         'Valid for streams, graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::memSyncDomain.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.memSyncDomain`.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributePreferredClusterDimension' in found_values}}
 
     cudaLaunchAttributePreferredClusterDimension = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributePreferredClusterDimension,
         'Valid for graph nodes and launches. Set\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::preferredClusterDim to allow the\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.preferredClusterDim` to allow the\n'
         'kernel launch to specify a preferred substitute cluster dimension. Blocks\n'
         'may be grouped according to either the dimensions specified with this\n'
         'attribute (grouped into a "preferred substitute cluster"), or the one\n'
@@ -6826,9 +6823,9 @@ class cudaStreamAttrID(_FastEnum):
         ' This attribute will only take effect when a regular cluster dimension has\n'
         'been specified. The preferred substitute cluster dimension must be an\n'
         'integer multiple greater than zero of the regular cluster dimension and\n'
-        'must divide the grid. It must also be no more than `maxBlocksPerCluster`,\n'
-        "if it is set in the kernel's `__launch_bounds__`. Otherwise it must be less\n"
-        'than the maximum value the driver can support. Otherwise, setting this\n'
+        'must divide the grid. It must also be no more than ``maxBlocksPerCluster``,\n'
+        "if it is set in the kernel's ``__launch_bounds__``. Otherwise it must be\n"
+        'less than the maximum value the driver can support. Otherwise, setting this\n'
         'attribute to a value physically unable to fit on any particular device is\n'
         'permitted.\n'
     ){{endif}}
@@ -6837,7 +6834,7 @@ class cudaStreamAttrID(_FastEnum):
     cudaLaunchAttributeLaunchCompletionEvent = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeLaunchCompletionEvent,
         'Valid for launches. Set\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::launchCompletionEvent to record the\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.launchCompletionEvent` to record the\n'
         'event.\n'
         ' Nominally, the event is triggered once all blocks of the kernel have begun\n'
         'execution. Currently this is a best effort. If a kernel B has a launch\n'
@@ -6847,8 +6844,8 @@ class cudaStreamAttrID(_FastEnum):
         'on different GPUs) or if B is a higher priority than A. Exercise caution if\n'
         'such an ordering inversion could lead to deadlock.\n'
         ' A launch completion event is nominally similar to a programmatic event\n'
-        'with `triggerAtBlockStart` set except that it is not visible to\n'
-        '`cudaGridDependencySynchronize()` and can be used with compute capability\n'
+        'with ``triggerAtBlockStart`` set except that it is not visible to\n'
+        '``cudaGridDependencySynchronize()`` and can be used with compute capability\n'
         'less than 9.0.\n'
         ' The event supplied must not be an interprocess or interop event. The event\n'
         'must disable timing (i.e. must be created with the\n'
@@ -6864,7 +6861,7 @@ class cudaStreamAttrID(_FastEnum):
         'only be set to 0 or 1. Setting the field to 1 indicates that the\n'
         'corresponding kernel node should be device-updatable. On success, a handle\n'
         'will be returned via\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::deviceUpdatableKernelNode::devNode\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.deviceUpdatableKernelNode.devNode`\n'
         'which can be passed to the various device-side update functions to update\n'
         "the node's kernel parameters from within another kernel. For more\n"
         'information on the types of device updates that can be made, as well as the\n'
@@ -6892,7 +6889,7 @@ class cudaStreamAttrID(_FastEnum):
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributePreferredSharedMemoryCarveout,
         'Valid for launches. On devices where the L1 cache and shared memory use the\n'
         'same hardware resources, setting\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::sharedMemCarveout to a percentage\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.sharedMemCarveout` to a percentage\n'
         'between 0-100 signals sets the shared memory carveout preference in percent\n'
         'of the total shared memory for that kernel launch. This attribute takes\n'
         'precedence over :py:obj:`~.cudaFuncAttributePreferredSharedMemoryCarveout`.\n'
@@ -6918,7 +6915,7 @@ class cudaStreamAttrID(_FastEnum):
         'not improve the performance of either the targeted kernel or the\n'
         'encapsulating application.\n'
         ' Valid values for\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::nvlinkUtilCentricScheduling are 0\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.nvlinkUtilCentricScheduling` are 0\n'
         '(disabled) and 1 (enabled).\n'
     ){{endif}}
     {{if 'cudaLaunchAttributePortableClusterSizeMode' in found_values}}
@@ -6927,8 +6924,8 @@ class cudaStreamAttrID(_FastEnum):
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributePortableClusterSizeMode,
         'Valid for graph nodes, launches. This indicates whether the kernel launch\n'
         'is allowed to use a non-portable cluster size. Valid values for\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::portableClusterSizeMode are values\n'
-        'for :py:obj:`~.cudaLaunchAttributePortableClusterMode` Any other value will\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.portableClusterSizeMode` are values for\n'
+        ':py:obj:`~.cudaLaunchAttributePortableClusterMode`. Any other value will\n'
         'return :py:obj:`~.cudaErrorInvalidValue`\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeSharedMemoryMode' in found_values}}
@@ -6958,41 +6955,41 @@ class cudaKernelNodeAttrID(_FastEnum):
     cudaLaunchAttributeAccessPolicyWindow = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeAccessPolicyWindow,
         'Valid for streams, graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::accessPolicyWindow.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.accessPolicyWindow`.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeCooperative' in found_values}}
 
     cudaLaunchAttributeCooperative = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeCooperative,
         'Valid for graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::cooperative.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.cooperative`.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeSynchronizationPolicy' in found_values}}
 
     cudaLaunchAttributeSynchronizationPolicy = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeSynchronizationPolicy,
-        'Valid for streams. See :py:obj:`~.cudaLaunchAttributeValue`::syncPolicy.\n'
+        'Valid for streams. See :py:obj:`~.cudaLaunchAttributeValue.syncPolicy`.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeClusterDimension' in found_values}}
 
     cudaLaunchAttributeClusterDimension = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeClusterDimension,
         'Valid for graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::clusterDim.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.clusterDim`.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeClusterSchedulingPolicyPreference' in found_values}}
 
     cudaLaunchAttributeClusterSchedulingPolicyPreference = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeClusterSchedulingPolicyPreference,
         'Valid for graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::clusterSchedulingPolicyPreference.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.clusterSchedulingPolicyPreference`.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeProgrammaticStreamSerialization' in found_values}}
 
     cudaLaunchAttributeProgrammaticStreamSerialization = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeProgrammaticStreamSerialization,
         'Valid for launches. Setting\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::programmaticStreamSerializationAllowed\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.programmaticStreamSerializationAllowed`\n'
         'to non-0 signals that the kernel will use programmatic means to resolve its\n'
         'stream dependency, so that the CUDA runtime should opportunistically allow\n'
         "the grid's execution to overlap with the previous kernel in the stream, if\n"
@@ -7005,11 +7002,11 @@ class cudaKernelNodeAttrID(_FastEnum):
     cudaLaunchAttributeProgrammaticEvent = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeProgrammaticEvent,
         'Valid for launches. Set\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::programmaticEvent to record the\n'
-        'event. Event recorded through this launch attribute is guaranteed to only\n'
-        'trigger after all block in the associated kernel trigger the event. A block\n'
-        'can trigger the event programmatically in a future CUDA release. A trigger\n'
-        "can also be inserted at the beginning of each block's execution if\n"
+        ':py:obj:`~.cudaLaunchAttributeValue.programmaticEvent` to record the event.\n'
+        'Event recorded through this launch attribute is guaranteed to only trigger\n'
+        'after all blocks in the associated kernel trigger the event. A block can\n'
+        'trigger the event programmatically in a future CUDA release. A trigger can\n'
+        "also be inserted at the beginning of each block's execution if\n"
         'triggerAtBlockStart is set to non-0. The dependent launches can choose to\n'
         'wait on the dependency using the programmatic sync\n'
         '(cudaGridDependencySynchronize() or equivalent PTX instructions). Note that\n'
@@ -7030,28 +7027,28 @@ class cudaKernelNodeAttrID(_FastEnum):
     cudaLaunchAttributePriority = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributePriority,
         'Valid for streams, graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::priority.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.priority`.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeMemSyncDomainMap' in found_values}}
 
     cudaLaunchAttributeMemSyncDomainMap = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeMemSyncDomainMap,
         'Valid for streams, graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::memSyncDomainMap.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.memSyncDomainMap`.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeMemSyncDomain' in found_values}}
 
     cudaLaunchAttributeMemSyncDomain = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeMemSyncDomain,
         'Valid for streams, graph nodes, launches. See\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::memSyncDomain.\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.memSyncDomain`.\n'
     ){{endif}}
     {{if 'cudaLaunchAttributePreferredClusterDimension' in found_values}}
 
     cudaLaunchAttributePreferredClusterDimension = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributePreferredClusterDimension,
         'Valid for graph nodes and launches. Set\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::preferredClusterDim to allow the\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.preferredClusterDim` to allow the\n'
         'kernel launch to specify a preferred substitute cluster dimension. Blocks\n'
         'may be grouped according to either the dimensions specified with this\n'
         'attribute (grouped into a "preferred substitute cluster"), or the one\n'
@@ -7074,9 +7071,9 @@ class cudaKernelNodeAttrID(_FastEnum):
         ' This attribute will only take effect when a regular cluster dimension has\n'
         'been specified. The preferred substitute cluster dimension must be an\n'
         'integer multiple greater than zero of the regular cluster dimension and\n'
-        'must divide the grid. It must also be no more than `maxBlocksPerCluster`,\n'
-        "if it is set in the kernel's `__launch_bounds__`. Otherwise it must be less\n"
-        'than the maximum value the driver can support. Otherwise, setting this\n'
+        'must divide the grid. It must also be no more than ``maxBlocksPerCluster``,\n'
+        "if it is set in the kernel's ``__launch_bounds__``. Otherwise it must be\n"
+        'less than the maximum value the driver can support. Otherwise, setting this\n'
         'attribute to a value physically unable to fit on any particular device is\n'
         'permitted.\n'
     ){{endif}}
@@ -7085,7 +7082,7 @@ class cudaKernelNodeAttrID(_FastEnum):
     cudaLaunchAttributeLaunchCompletionEvent = (
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeLaunchCompletionEvent,
         'Valid for launches. Set\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::launchCompletionEvent to record the\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.launchCompletionEvent` to record the\n'
         'event.\n'
         ' Nominally, the event is triggered once all blocks of the kernel have begun\n'
         'execution. Currently this is a best effort. If a kernel B has a launch\n'
@@ -7095,8 +7092,8 @@ class cudaKernelNodeAttrID(_FastEnum):
         'on different GPUs) or if B is a higher priority than A. Exercise caution if\n'
         'such an ordering inversion could lead to deadlock.\n'
         ' A launch completion event is nominally similar to a programmatic event\n'
-        'with `triggerAtBlockStart` set except that it is not visible to\n'
-        '`cudaGridDependencySynchronize()` and can be used with compute capability\n'
+        'with ``triggerAtBlockStart`` set except that it is not visible to\n'
+        '``cudaGridDependencySynchronize()`` and can be used with compute capability\n'
         'less than 9.0.\n'
         ' The event supplied must not be an interprocess or interop event. The event\n'
         'must disable timing (i.e. must be created with the\n'
@@ -7112,7 +7109,7 @@ class cudaKernelNodeAttrID(_FastEnum):
         'only be set to 0 or 1. Setting the field to 1 indicates that the\n'
         'corresponding kernel node should be device-updatable. On success, a handle\n'
         'will be returned via\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::deviceUpdatableKernelNode::devNode\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.deviceUpdatableKernelNode.devNode`\n'
         'which can be passed to the various device-side update functions to update\n'
         "the node's kernel parameters from within another kernel. For more\n"
         'information on the types of device updates that can be made, as well as the\n'
@@ -7140,7 +7137,7 @@ class cudaKernelNodeAttrID(_FastEnum):
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributePreferredSharedMemoryCarveout,
         'Valid for launches. On devices where the L1 cache and shared memory use the\n'
         'same hardware resources, setting\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::sharedMemCarveout to a percentage\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.sharedMemCarveout` to a percentage\n'
         'between 0-100 signals sets the shared memory carveout preference in percent\n'
         'of the total shared memory for that kernel launch. This attribute takes\n'
         'precedence over :py:obj:`~.cudaFuncAttributePreferredSharedMemoryCarveout`.\n'
@@ -7166,7 +7163,7 @@ class cudaKernelNodeAttrID(_FastEnum):
         'not improve the performance of either the targeted kernel or the\n'
         'encapsulating application.\n'
         ' Valid values for\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::nvlinkUtilCentricScheduling are 0\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.nvlinkUtilCentricScheduling` are 0\n'
         '(disabled) and 1 (enabled).\n'
     ){{endif}}
     {{if 'cudaLaunchAttributePortableClusterSizeMode' in found_values}}
@@ -7175,8 +7172,8 @@ class cudaKernelNodeAttrID(_FastEnum):
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributePortableClusterSizeMode,
         'Valid for graph nodes, launches. This indicates whether the kernel launch\n'
         'is allowed to use a non-portable cluster size. Valid values for\n'
-        ':py:obj:`~.cudaLaunchAttributeValue`::portableClusterSizeMode are values\n'
-        'for :py:obj:`~.cudaLaunchAttributePortableClusterMode` Any other value will\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.portableClusterSizeMode` are values for\n'
+        ':py:obj:`~.cudaLaunchAttributePortableClusterMode` Any other value will\n'
         'return :py:obj:`~.cudaErrorInvalidValue`\n'
     ){{endif}}
     {{if 'cudaLaunchAttributeSharedMemoryMode' in found_values}}
@@ -7193,7 +7190,7 @@ class cudaKernelNodeAttrID(_FastEnum):
 cdef class cudaDevResourceDesc_t:
     """
 
-    An opaque descriptor handle. The descriptor encapsulates multiple created and configured resources. Created via ::cudaDeviceResourceGenerateDesc
+    An opaque descriptor handle. The descriptor encapsulates multiple created and configured resources. Created via ``cudaDeviceResourceGenerateDesc``
 
     Methods
     -------
@@ -8457,7 +8454,7 @@ cdef class cudaArrayMemoryRequirements:
 
 cdef class cudaPitchedPtr:
     """
-    CUDA Pitched memory pointer  make_cudaPitchedPtr
+    CUDA Pitched memory pointer  ``make_cudaPitchedPtr``
 
     Attributes
     ----------
@@ -8562,7 +8559,7 @@ cdef class cudaPitchedPtr:
 
 cdef class cudaExtent:
     """
-    CUDA extent  make_cudaExtent
+    CUDA extent  ``make_cudaExtent``
 
     Attributes
     ----------
@@ -8649,7 +8646,7 @@ cdef class cudaExtent:
 
 cdef class cudaPos:
     """
-    CUDA 3D position  make_cudaPos
+    CUDA 3D position  ``make_cudaPos``
 
     Attributes
     ----------
@@ -9644,11 +9641,11 @@ cdef class cudaAccessPolicyWindow:
     {{endif}}
     {{if 'cudaAccessPolicyWindow.hitProp' in found_struct}}
     hitProp : cudaAccessProperty
-        ::CUaccessProperty set for hit.
+        ``CUaccessProperty`` set for hit.
     {{endif}}
     {{if 'cudaAccessPolicyWindow.missProp' in found_struct}}
     missProp : cudaAccessProperty
-        ::CUaccessProperty set for miss. Must be either NORMAL or
+        ``CUaccessProperty`` set for miss. Must be either NORMAL or
         STREAMING.
     {{endif}}
 
@@ -11351,7 +11348,7 @@ cdef class cudaMemAccessDesc:
     {{endif}}
     {{if 'cudaMemAccessDesc.flags' in found_struct}}
     flags : cudaMemAccessFlags
-        ::CUmemProt accessibility flags to set on the request
+        ``CUmemProt`` accessibility flags to set on the request
     {{endif}}
 
     Methods
@@ -11643,8 +11640,8 @@ cdef class cudaMemAllocNodeParams:
     {{if 'cudaMemAllocNodeParams.poolProps' in found_struct}}
     poolProps : cudaMemPoolProps
         in: location where the allocation should reside (specified in
-        ::location). ::handleTypes must be cudaMemHandleTypeNone. IPC is
-        not supported. in: array of memory access descriptors. Used to
+        ``location``). ``handleTypes`` must be cudaMemHandleTypeNone. IPC
+        is not supported. in: array of memory access descriptors. Used to
         describe peer GPU access
     {{endif}}
     {{if 'cudaMemAllocNodeParams.accessDescs' in found_struct}}
@@ -11654,7 +11651,7 @@ cdef class cudaMemAllocNodeParams:
     {{endif}}
     {{if 'cudaMemAllocNodeParams.accessDescCount' in found_struct}}
     accessDescCount : size_t
-        in: Number of `accessDescs`s
+        in: Number of ``accessDescs``
     {{endif}}
     {{if 'cudaMemAllocNodeParams.bytesize' in found_struct}}
     bytesize : size_t
@@ -11794,8 +11791,8 @@ cdef class cudaMemAllocNodeParamsV2:
     {{if 'cudaMemAllocNodeParamsV2.poolProps' in found_struct}}
     poolProps : cudaMemPoolProps
         in: location where the allocation should reside (specified in
-        ::location). ::handleTypes must be cudaMemHandleTypeNone. IPC is
-        not supported. in: array of memory access descriptors. Used to
+        ``location``). ``handleTypes`` must be cudaMemHandleTypeNone. IPC
+        is not supported. in: array of memory access descriptors. Used to
         describe peer GPU access
     {{endif}}
     {{if 'cudaMemAllocNodeParamsV2.accessDescs' in found_struct}}
@@ -11805,7 +11802,7 @@ cdef class cudaMemAllocNodeParamsV2:
     {{endif}}
     {{if 'cudaMemAllocNodeParamsV2.accessDescCount' in found_struct}}
     accessDescCount : size_t
-        in: Number of `accessDescs`s
+        in: Number of ``accessDescs``
     {{endif}}
     {{if 'cudaMemAllocNodeParamsV2.bytesize' in found_struct}}
     bytesize : size_t
@@ -16419,7 +16416,7 @@ cdef class cudaDevSmResourceGroupParams_st:
     {{endif}}
     {{if 'cudaDevSmResourceGroupParams_st.flags' in found_struct}}
     flags : unsigned int
-        Combination of `cudaDevSmResourceGroup_flags` values to indicate
+        Combination of ``cudaDevSmResourceGroup_flags`` values to indicate
         this this group is created.
     {{endif}}
     {{if 'cudaDevSmResourceGroupParams_st.reserved' in found_struct}}
@@ -16529,16 +16526,16 @@ cdef class cudaDevResource_st:
     of the API that created it. struct enumcudaDevResourceTypetype;
     union structcudaDevSmResourcesm;
     structcudaDevWorkqueueConfigResourcewqConfig;
-    structcudaDevWorkqueueResourcewq; ; ;  - If `typename` is
-    `cudaDevResourceTypeInvalid`, this resoure is not valid and cannot
-    be further accessed.    - If `typename` is `cudaDevResourceTypeSm`,
-    the cudaDevSmResource structure `sm` is filled in. For example,
-    `sm.smCount` will reflect the amount of streaming multiprocessors
-    available in this resource.    - If `typename` is
-    `cudaDevResourceTypeWorkqueueConfig`, the
-    cudaDevWorkqueueConfigResource structure `wqConfig` is filled in.
-    - If `typename` is `cudaDevResourceTypeWorkqueue`, the
-    cudaDevWorkqueueResource structure `wq` is filled in.
+    structcudaDevWorkqueueResourcewq; ; ;  - If ``typename`` is
+    ``cudaDevResourceTypeInvalid``, this resource is not valid and
+    cannot be further accessed.    - If ``typename`` is
+    ``cudaDevResourceTypeSm``, the cudaDevSmResource structure ``sm``
+    is filled in. For example, ``sm.smCount`` will reflect the amount
+    of streaming multiprocessors available in this resource.    - If
+    ``typename`` is ``cudaDevResourceTypeWorkqueueConfig``, the
+    cudaDevWorkqueueConfigResource structure ``wqConfig`` is filled in.
+    - If ``typename`` is ``cudaDevResourceTypeWorkqueue``, the
+    cudaDevWorkqueueResource structure ``wq`` is filled in.
 
     Attributes
     ----------
@@ -16552,16 +16549,17 @@ cdef class cudaDevResource_st:
     {{endif}}
     {{if 'cudaDevResource_st.sm' in found_struct}}
     sm : cudaDevSmResource
-        Resource corresponding to cudaDevResourceTypeSm `typename`.
+        Resource corresponding to cudaDevResourceTypeSm ``typename``.
     {{endif}}
     {{if 'cudaDevResource_st.wqConfig' in found_struct}}
     wqConfig : cudaDevWorkqueueConfigResource
         Resource corresponding to cudaDevResourceTypeWorkqueueConfig
-        `typename`.
+        ``typename``.
     {{endif}}
     {{if 'cudaDevResource_st.wq' in found_struct}}
     wq : cudaDevWorkqueueResource
-        Resource corresponding to cudaDevResourceTypeWorkqueue `typename`.
+        Resource corresponding to cudaDevResourceTypeWorkqueue
+        ``typename``.
     {{endif}}
     {{if 'cudaDevResource_st._oversize' in found_struct}}
     _oversize : bytes
@@ -17793,12 +17791,12 @@ cdef class cudaConditionalNodeParams:
         graphs at any level, must belong to the same CUDA context.
         These graphs may be populated using graph node creation APIs or
         cudaStreamBeginCaptureToGraph. cudaGraphCondTypeIf: phGraph_out[0]
-        is executed when the condition is non-zero. If `size` == 2,
+        is executed when the condition is non-zero. If ``size`` == 2,
         phGraph_out[1] will be executed when the condition is zero.
         cudaGraphCondTypeWhile: phGraph_out[0] is executed as long as the
         condition is non-zero. cudaGraphCondTypeSwitch: phGraph_out[n] is
         executed when the condition is equal to n. If the condition >=
-        `size`, no body graph is executed.
+        ``size``, no body graph is executed.
     {{endif}}
     {{if 'cudaConditionalNodeParams.ctx' in found_struct}}
     ctx : cudaExecutionContext_t
@@ -18504,7 +18502,7 @@ cdef class cudaGraphEdgeData_st:
         node on the edge. The meaning is specfic to the node type. A value
         of 0 in all cases means full completion of the upstream node, with
         memory visibility to the downstream node or portion thereof
-        (indicated by `to_port`).   Only kernel nodes define non-zero
+        (indicated by ``to_port``).   Only kernel nodes define non-zero
         ports. A kernel node can use the following output port types:
         cudaGraphKernelNodePortDefault,
         cudaGraphKernelNodePortProgrammatic, or
@@ -18513,7 +18511,7 @@ cdef class cudaGraphEdgeData_st:
     {{if 'cudaGraphEdgeData_st.to_port' in found_struct}}
     to_port : bytes
         This indicates what portion of the downstream node is dependent on
-        the upstream node or portion thereof (indicated by `from_port`).
+        the upstream node or portion thereof (indicated by ``from_port``).
         The meaning is specific to the node type. A value of 0 in all cases
         means the entirety of the downstream node is dependent on the
         upstream work.   Currently no node types define non-zero ports.
@@ -19030,7 +19028,7 @@ cdef class anon_union10:
 cdef class cudaGraphKernelNodeUpdate:
     """
     Struct to specify a single node update to pass as part of a larger
-    array to ::cudaGraphKernelNodeUpdatesApply
+    array to ``cudaGraphKernelNodeUpdatesApply``
 
     Attributes
     ----------
@@ -19644,11 +19642,11 @@ cdef class cudaLaunchAttributeValue:
     clusterDim : anon_struct17
         Value of launch attribute cudaLaunchAttributeClusterDimension that
         represents the desired cluster dimensions for the kernel. Opaque
-        type with the following fields: - `x` - The X dimension of the
+        type with the following fields: - ``x`` - The X dimension of the
         cluster, in blocks. Must be a divisor of the grid X dimension.    -
-        `y` - The Y dimension of the cluster, in blocks. Must be a divisor
-        of the grid Y dimension.    - `z` - The Z dimension of the cluster,
-        in blocks. Must be a divisor of the grid Z dimension.
+        ``y`` - The Y dimension of the cluster, in blocks. Must be a
+        divisor of the grid Y dimension.    - ``z`` - The Z dimension of
+        the cluster, in blocks. Must be a divisor of the grid Z dimension.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.clusterSchedulingPolicyPreference' in found_struct}}
     clusterSchedulingPolicyPreference : cudaClusterSchedulingPolicy
@@ -19664,11 +19662,12 @@ cdef class cudaLaunchAttributeValue:
     {{if 'cudaLaunchAttributeValue.programmaticEvent' in found_struct}}
     programmaticEvent : anon_struct18
         Value of launch attribute cudaLaunchAttributeProgrammaticEvent with
-        the following fields: - `cudaEvent_t` event - Event to fire when
-        all blocks trigger it.    - `int` flags; - Event record flags, see
-        cudaEventRecordWithFlags. Does not accept cudaEventRecordExternal.
-        - `int` triggerAtBlockStart - If this is set to non-0, each block
-        launch will automatically trigger the event.
+        the following fields: - ``cudaEvent_t`` event - Event to fire when
+        all blocks trigger it.    - ``int`` flags; - Event record flags,
+        see cudaEventRecordWithFlags. Does not accept
+        cudaEventRecordExternal.    - ``int`` triggerAtBlockStart - If this
+        is set to non-0, each block launch will automatically trigger the
+        event.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.priority' in found_struct}}
     priority : int
@@ -19690,21 +19689,21 @@ cdef class cudaLaunchAttributeValue:
         Value of launch attribute
         cudaLaunchAttributePreferredClusterDimension that represents the
         desired preferred cluster dimensions for the kernel. Opaque type
-        with the following fields: - `x` - The X dimension of the preferred
-        cluster, in blocks. Must be a divisor of the grid X dimension, and
-        must be a multiple of the `x` field of
-        ::cudaLaunchAttributeValue::clusterDim.    - `y` - The Y dimension
+        with the following fields: - ``x`` - The X dimension of the
+        preferred cluster, in blocks. Must be a divisor of the grid X
+        dimension, and must be a multiple of the ``x`` field of
+        cudaLaunchAttributeValue::clusterDim.    - ``y`` - The Y dimension
         of the preferred cluster, in blocks. Must be a divisor of the grid
-        Y dimension, and must be a multiple of the `y` field of
-        ::cudaLaunchAttributeValue::clusterDim.    - `z` - The Z dimension
-        of the preferred cluster, in blocks. Must be equal to the `z` field
-        of ::cudaLaunchAttributeValue::clusterDim.
+        Y dimension, and must be a multiple of the ``y`` field of
+        cudaLaunchAttributeValue::clusterDim.    - ``z`` - The Z dimension
+        of the preferred cluster, in blocks. Must be equal to the ``z``
+        field of cudaLaunchAttributeValue::clusterDim.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.launchCompletionEvent' in found_struct}}
     launchCompletionEvent : anon_struct20
         Value of launch attribute cudaLaunchAttributeLaunchCompletionEvent
-        with the following fields: - `cudaEvent_t` event - Event to fire
-        when the last block launches.    - `int` flags - Event record
+        with the following fields: - ``cudaEvent_t`` event - Event to fire
+        when the last block launches.    - ``int`` flags - Event record
         flags, see cudaEventRecordWithFlags. Does not accept
         cudaEventRecordExternal.
     {{endif}}
@@ -19712,9 +19711,9 @@ cdef class cudaLaunchAttributeValue:
     deviceUpdatableKernelNode : anon_struct21
         Value of launch attribute
         cudaLaunchAttributeDeviceUpdatableKernelNode with the following
-        fields: - `int` deviceUpdatable - Whether or not the resulting
+        fields: - ``int`` deviceUpdatable - Whether or not the resulting
         kernel node should be device-updatable.    -
-        `cudaGraphDeviceNode_t` devNode - Returns a handle to pass to the
+        ``cudaGraphDeviceNode_t`` devNode - Returns a handle to pass to the
         various device-side update functions.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.sharedMemCarveout' in found_struct}}
@@ -20225,7 +20224,7 @@ cdef class cudaAsyncNotificationInfo:
     {{endif}}
     {{if 'cudaAsyncNotificationInfo.info' in found_struct}}
     info : anon_union11
-        Information about the notification. `typename` must be checked in
+        Information about the notification. ``typename`` must be checked in
         order to interpret this field.
     {{endif}}
 
@@ -21338,7 +21337,7 @@ def cudaDeviceReset():
     in subsequent API calls and doing so will result in undefined behavior.
     These resources include CUDA types :py:obj:`~.cudaStream_t`,
     :py:obj:`~.cudaEvent_t`, :py:obj:`~.cudaArray_t`,
-    :py:obj:`~.cudaMipmappedArray_t`, :py:obj:`~.cudaPitchedPtr`,
+    :py:obj:`~.cudaMipmappedArray_t`, cudaPitchedPtr,
     :py:obj:`~.cudaTextureObject_t`, :py:obj:`~.cudaSurfaceObject_t`,
     :py:obj:`~.textureReference`, :py:obj:`~.surfaceReference`,
     :py:obj:`~.cudaExternalMemory_t`, :py:obj:`~.cudaExternalSemaphore_t`
@@ -21406,9 +21405,9 @@ def cudaDeviceSynchronize():
 def cudaDeviceSetLimit(limit not None : cudaLimit, size_t value):
     """ Set resource limits.
 
-    Setting `limit` to `value` is a request by the application to update
-    the current limit maintained by the device. The driver is free to
-    modify the requested value to meet h/w requirements (this could be
+    Setting ``limit`` to ``value`` is a request by the application to
+    update the current limit maintained by the device. The driver is free
+    to modify the requested value to meet h/w requirements (this could be
     clamping to minimum or maximum values, rounding up to nearest element
     size, etc). The application can use :py:obj:`~.cudaDeviceGetLimit()` to
     find out exactly what the limit has been set to.
@@ -21509,7 +21508,7 @@ def cudaDeviceSetLimit(limit not None : cudaLimit, size_t value):
 def cudaDeviceGetLimit(limit not None : cudaLimit):
     """ Return resource limits.
 
-    Returns in `*pValue` the current size of `limit`. The following
+    Returns in ``*pValue`` the current size of ``limit``. The following
     :py:obj:`~.cudaLimit` values are supported.
 
     - :py:obj:`~.cudaLimitStackSize` is the stack size in bytes of each GPU
@@ -21569,9 +21568,9 @@ def cudaDeviceGetLimit(limit not None : cudaLimit):
 def cudaDeviceGetTexture1DLinearMaxWidth(fmtDesc : Optional[cudaChannelFormatDesc], int device):
     """ Returns the maximum number of elements allocatable in a 1D linear texture for a given element size.
 
-    Returns in `maxWidthInElements` the maximum number of elements
+    Returns in ``maxWidthInElements`` the maximum number of elements
     allocatable in a 1D linear texture for given format descriptor
-    `fmtDesc`.
+    ``fmtDesc``.
 
     Parameters
     ----------
@@ -21586,7 +21585,7 @@ def cudaDeviceGetTexture1DLinearMaxWidth(fmtDesc : Optional[cudaChannelFormatDes
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorUnsupportedLimit`, :py:obj:`~.cudaErrorInvalidValue`
     maxWidthInElements : int
         Returns maximum number of texture elements allocatable for given
-        `fmtDesc`.
+        ``fmtDesc``.
 
     See Also
     --------
@@ -21608,13 +21607,13 @@ def cudaDeviceGetCacheConfig():
     """ Returns the preferred cache configuration for the current device.
 
     On devices where the L1 cache and shared memory use the same hardware
-    resources, this returns through `pCacheConfig` the preferred cache
+    resources, this returns through ``pCacheConfig`` the preferred cache
     configuration for the current device. This is only a preference. The
     runtime will use the requested configuration if possible, but it is
     free to choose a different configuration if required to execute
     functions.
 
-    This will return a `pCacheConfig` of
+    This will return a ``pCacheConfig`` of
     :py:obj:`~.cudaFuncCachePreferNone` on devices where the size of the L1
     cache and shared memory are fixed.
 
@@ -21657,21 +21656,21 @@ def cudaDeviceGetCacheConfig():
 def cudaDeviceGetStreamPriorityRange():
     """ Returns numerical values that correspond to the least and greatest stream priorities.
 
-    Returns in `*leastPriority` and `*greatestPriority` the numerical
+    Returns in ``*leastPriority`` and ``*greatestPriority`` the numerical
     values that correspond to the least and greatest stream priorities
     respectively. Stream priorities follow a convention where lower numbers
     imply greater priorities. The range of meaningful stream priorities is
-    given by [`*greatestPriority`, `*leastPriority`]. If the user attempts
-    to create a stream with a priority value that is outside the the
-    meaningful range as specified by this API, the priority is
-    automatically clamped down or up to either `*leastPriority` or
-    `*greatestPriority` respectively. See
+    given by [``*greatestPriority``, ``*leastPriority``]. If the user
+    attempts to create a stream with a priority value that is outside the
+    meaningful range as specified by this API, the priority is
+    automatically clamped down or up to either ``*leastPriority`` or
+    ``*greatestPriority`` respectively. See
     :py:obj:`~.cudaStreamCreateWithPriority` for details on creating a
-    priority stream. A NULL may be passed in for `*leastPriority` or
-    `*greatestPriority` if the value is not desired.
+    priority stream. A NULL may be passed in for ``*leastPriority`` or
+    ``*greatestPriority`` if the value is not desired.
 
-    This function will return '0' in both `*leastPriority` and
-    `*greatestPriority` if the current context's device does not support
+    This function will return '0' in both ``*leastPriority`` and
+    ``*greatestPriority`` if the current context's device does not support
     stream priorities (see :py:obj:`~.cudaDeviceGetAttribute`).
 
     Returns
@@ -21705,7 +21704,7 @@ def cudaDeviceSetCacheConfig(cacheConfig not None : cudaFuncCache):
     """ Sets the preferred cache configuration for the current device.
 
     On devices where the L1 cache and shared memory use the same hardware
-    resources, this sets through `cacheConfig` the preferred cache
+    resources, this sets through ``cacheConfig`` the preferred cache
     configuration for the current device. This is only a preference. The
     runtime will use the requested configuration if possible, but it is
     free to choose a different configuration if required to execute the
@@ -21762,10 +21761,10 @@ def cudaDeviceSetCacheConfig(cacheConfig not None : cudaFuncCache):
 def cudaDeviceGetByPCIBusId(char* pciBusId):
     """ Returns a handle to a compute device.
 
-    Returns in `*device` a device ordinal given a PCI bus ID string.
+    Returns in ``*device`` a device ordinal given a PCI bus ID string.
 
-    where `domain`, `bus`, `device`, and `function` are all hexadecimal
-    values
+    where ``domain``, ``bus``, ``device``, and ``function`` are all
+    hexadecimal values
 
     Parameters
     ----------
@@ -21797,18 +21796,18 @@ def cudaDeviceGetByPCIBusId(char* pciBusId):
 def cudaDeviceGetPCIBusId(int length, int device):
     """ Returns a PCI Bus Id string for the device.
 
-    Returns an ASCII string identifying the device `dev` in the NULL-
-    terminated string pointed to by `pciBusId`. `length` specifies the
+    Returns an ASCII string identifying the device ``dev`` in the NULL-
+    terminated string pointed to by ``pciBusId``. ``length`` specifies the
     maximum length of the string that may be returned.
 
-    where `domain`, `bus`, `device`, and `function` are all hexadecimal
-    values. pciBusId should be large enough to store 13 characters
-    including the NULL-terminator.
+    where ``domain``, ``bus``, ``device``, and ``function`` are all
+    hexadecimal values. pciBusId should be large enough to store 13
+    characters including the NULL-terminator.
 
     Parameters
     ----------
     length : int
-        Maximum length of string to store in `name`
+        Maximum length of string to store in ``name``
     device : int
         Device to get identifier string for
 
@@ -22053,8 +22052,8 @@ def cudaIpcOpenMemHandle(handle not None : cudaIpcMemHandle_t, unsigned int flag
 
     Notes
     -----
-    No guarantees are made about the address returned in `*devPtr`. 
-     In particular, multiple processes may not receive the same address for the same `handle`.
+    No guarantees are made about the address returned in ``*devPtr``.
+     In particular, multiple processes may not receive the same address for the same ``handle``.
     """
     cdef void_ptr devPtr = 0
     with nogil:
@@ -22169,11 +22168,11 @@ cdef void cudaAsyncNotificationCallbackWrapper(cyruntime.cudaAsyncNotificationIn
 def cudaDeviceRegisterAsyncNotification(int device, callbackFunc, userData):
     """ Registers a callback function to receive async notifications.
 
-    Registers `callbackFunc` to receive async notifications.
+    Registers ``callbackFunc`` to receive async notifications.
 
-    The `userData` parameter is passed to the callback function at async
-    notification time. Likewise, `callback` is also passed to the callback
-    function to distinguish between multiple registered callbacks.
+    The ``userData`` parameter is passed to the callback function at async
+    notification time. Likewise, ``callback`` is also passed to the
+    callback function to distinguish between multiple registered callbacks.
 
     The callback function being registered should be designed to return
     quickly (~10ms). Any long running tasks should be queued for execution
@@ -22184,7 +22183,7 @@ def cudaDeviceRegisterAsyncNotification(int device, callbackFunc, userData):
     :py:obj:`~.cudaErrorNotPermitted`. Async notification callbacks execute
     in an undefined order and may be serialized.
 
-    Returns in `*callback` a handle representing the registered callback
+    Returns in ``*callback`` a handle representing the registered callback
     instance.
 
     Parameters
@@ -22246,13 +22245,13 @@ def cudaDeviceRegisterAsyncNotification(int device, callbackFunc, userData):
 def cudaDeviceUnregisterAsyncNotification(int device, callback):
     """ Unregisters an async notification callback.
 
-    Unregisters `callback` so that the corresponding callback function will
-    stop receiving async notifications.
+    Unregisters ``callback`` so that the corresponding callback function
+    will stop receiving async notifications.
 
     Parameters
     ----------
     device : int
-        The device from which to remove `callback`.
+        The device from which to remove ``callback``.
     callback : :py:obj:`~.cudaAsyncCallbackHandle_t`
         The callback instance to unregister from receiving async
         notifications.
@@ -22290,7 +22289,7 @@ def cudaDeviceGetSharedMemConfig():
 
     [Deprecated]
 
-    This function will return in `pConfig` the current size of shared
+    This function will return in ``pConfig`` the current size of shared
     memory banks on the current device. On devices with configurable shared
     memory banks, :py:obj:`~.cudaDeviceSetSharedMemConfig` can be used to
     change this setting, so that all subsequent kernel launches will by
@@ -22460,7 +22459,7 @@ def cudaGetErrorName(error not None : cudaError_t):
     cudaError_t.cudaSuccess
         cudaError_t.cudaSuccess
     bytes
-        `char*` pointer to a NULL-terminated string
+        ``char*`` pointer to a NULL-terminated string
 
     See Also
     --------
@@ -22491,7 +22490,7 @@ def cudaGetErrorString(error not None : cudaError_t):
     cudaError_t.cudaSuccess
         cudaError_t.cudaSuccess
     bytes
-        `char*` pointer to a NULL-terminated string
+        ``char*`` pointer to a NULL-terminated string
 
     See Also
     --------
@@ -22509,7 +22508,7 @@ def cudaGetErrorString(error not None : cudaError_t):
 def cudaGetDeviceCount():
     """ Returns the number of compute-capable devices.
 
-    Returns in `*count` the number of devices with compute capability
+    Returns in ``*count`` the number of devices with compute capability
     greater or equal to 2.0 that are available for execution.
 
     Returns
@@ -22538,7 +22537,7 @@ def cudaGetDeviceCount():
 def cudaGetDeviceProperties(int device):
     """ Returns information about the compute-device.
 
-    Returns in `*prop` the properties of device `dev`.
+    Returns in ``*prop`` the properties of device ``dev``.
 
     Parameters
     ----------
@@ -22570,8 +22569,8 @@ def cudaGetDeviceProperties(int device):
 def cudaDeviceGetAttribute(attr not None : cudaDeviceAttr, int device):
     """ Returns information about the device.
 
-    Returns in `*value` the integer value of the attribute `attr` on device
-    `device`.
+    Returns in ``*value`` the integer value of the attribute ``attr`` on
+    device ``device``.
 
     Parameters
     ----------
@@ -22606,20 +22605,21 @@ def cudaDeviceGetAttribute(attr not None : cudaDeviceAttr, int device):
 def cudaDeviceGetHostAtomicCapabilities(operations : Optional[tuple[cudaAtomicOperation] | list[cudaAtomicOperation]], unsigned int count, int device):
     """ Queries details about atomic operations supported between the device and host.
 
-    Returns in `*capabilities` the details about requested atomic
-    `*operations` over the the link between `dev` and the host. The
-    allocated size of `*operations` and `*capabilities` must be `count`.
+    Returns in ``*capabilities`` the details about requested atomic
+    ``*operations`` over the link between ``dev`` and the host. The
+    allocated size of ``*operations`` and ``*capabilities`` must be
+    ``count``.
 
-    For each :py:obj:`~.cudaAtomicOperation` in `*operations`, the
-    corresponding result in `*capabilities` will be a bitmask indicating
+    For each :py:obj:`~.cudaAtomicOperation` in ``*operations``, the
+    corresponding result in ``*capabilities`` will be a bitmask indicating
     which of :py:obj:`~.cudaAtomicOperationCapability` the link supports
     natively.
 
-    Returns :py:obj:`~.cudaErrorInvalidDevice` if `dev` is not valid.
+    Returns :py:obj:`~.cudaErrorInvalidDevice` if ``dev`` is not valid.
 
-    Returns :py:obj:`~.cudaErrorInvalidValue` if `*capabilities` or
-    `*operations` is NULL, if `count` is 0, or if any of `*operations` is
-    not valid.
+    Returns :py:obj:`~.cudaErrorInvalidValue` if ``*capabilities`` or
+    ``*operations`` is NULL, if ``count`` is 0, or if any of
+    ``*operations`` is not valid.
 
     Parameters
     ----------
@@ -22784,19 +22784,20 @@ def cudaDeviceGetMemPool(int device):
 def cudaDeviceGetNvSciSyncAttributes(nvSciSyncAttrList, int device, int flags):
     """ Return NvSciSync attributes that this device can support.
 
-    Returns in `nvSciSyncAttrList`, the properties of NvSciSync that this
-    CUDA device, `dev` can support. The returned `nvSciSyncAttrList` can be
-    used to create an NvSciSync that matches this device's capabilities.
+    Returns in ``nvSciSyncAttrList``, the properties of NvSciSync that this
+    CUDA device, ``dev`` can support. The returned ``nvSciSyncAttrList``
+    can be used to create an NvSciSync that matches this device's
+    capabilities.
 
-    If NvSciSyncAttrKey_RequiredPerm field in `nvSciSyncAttrList` is
+    If NvSciSyncAttrKey_RequiredPerm field in ``nvSciSyncAttrList`` is
     already set this API will return :py:obj:`~.cudaErrorInvalidValue`.
 
-    The applications should set `nvSciSyncAttrList` to a valid
+    The applications should set ``nvSciSyncAttrList`` to a valid
     NvSciSyncAttrList failing which this API will return
     :py:obj:`~.cudaErrorInvalidHandle`.
 
-    The `flags` controls how applications intends to use the NvSciSync
-    created from the `nvSciSyncAttrList`. The valid flags are:
+    The ``flags`` controls how applications intend to use the NvSciSync
+    created from the ``nvSciSyncAttrList``. The valid flags are:
 
     - :py:obj:`~.cudaNvSciSyncAttrSignal`, specifies that the applications
       intends to signal an NvSciSync on this CUDA device.
@@ -22807,32 +22808,33 @@ def cudaDeviceGetNvSciSyncAttributes(nvSciSyncAttrList, int device, int flags):
     At least one of these flags must be set, failing which the API returns
     :py:obj:`~.cudaErrorInvalidValue`. Both the flags are orthogonal to one
     another: a developer may set both these flags that allows to set both
-    wait and signal specific attributes in the same `nvSciSyncAttrList`.
+    wait and signal specific attributes in the same ``nvSciSyncAttrList``.
 
-    Note that this API updates the input `nvSciSyncAttrList` with values
+    Note that this API updates the input ``nvSciSyncAttrList`` with values
     equivalent to the following public attribute key-values:
     NvSciSyncAttrKey_RequiredPerm is set to
 
     - NvSciSyncAccessPerm_SignalOnly if :py:obj:`~.cudaNvSciSyncAttrSignal`
-      is set in `flags`.
+      is set in ``flags``.
 
     - NvSciSyncAccessPerm_WaitOnly if :py:obj:`~.cudaNvSciSyncAttrWait` is
-      set in `flags`.
+      set in ``flags``.
 
     - NvSciSyncAccessPerm_WaitSignal if both
       :py:obj:`~.cudaNvSciSyncAttrWait` and
-      :py:obj:`~.cudaNvSciSyncAttrSignal` are set in `flags`.
+      :py:obj:`~.cudaNvSciSyncAttrSignal` are set in ``flags``.
       NvSciSyncAttrKey_PrimitiveInfo is set to
 
-    - NvSciSyncAttrValPrimitiveType_SysmemSemaphore on any valid `device`.
+    - NvSciSyncAttrValPrimitiveType_SysmemSemaphore on any valid
+      ``device``.
 
-    - NvSciSyncAttrValPrimitiveType_Syncpoint if `device` is a Tegra
+    - NvSciSyncAttrValPrimitiveType_Syncpoint if ``device`` is a Tegra
       device.
 
-    - NvSciSyncAttrValPrimitiveType_SysmemSemaphorePayload64b if `device`
+    - NvSciSyncAttrValPrimitiveType_SysmemSemaphorePayload64b if ``device``
       is GA10X+. NvSciSyncAttrKey_GpuId is set to the same UUID that is
-      returned in `cudaDeviceProp.uuid` from
-      :py:obj:`~.cudaDeviceGetProperties` for this `device`.
+      returned in ``cudaDeviceProp.uuid`` from
+      :py:obj:`~.cudaDeviceGetProperties` for this ``device``.
 
     :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorDeviceUninitialized`,
     :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidHandle`,
@@ -22871,9 +22873,9 @@ def cudaDeviceGetNvSciSyncAttributes(nvSciSyncAttrList, int device, int flags):
 def cudaDeviceGetP2PAttribute(attr not None : cudaDeviceP2PAttr, int srcDevice, int dstDevice):
     """ Queries attributes of the link between two devices.
 
-    Returns in `*value` the value of the requested attribute `attrib` of
-    the link between `srcDevice` and `dstDevice`. The supported attributes
-    are:
+    Returns in ``*value`` the value of the requested attribute ``attrib``
+    of the link between ``srcDevice`` and ``dstDevice``. The supported
+    attributes are:
 
     - :py:obj:`~.cudaDevP2PAttrPerformanceRank`: A relative value
       indicating the performance of the link between two devices. Lower
@@ -22894,17 +22896,17 @@ def cudaDeviceGetP2PAttribute(attr not None : cudaDeviceP2PAttr, int srcDevice,
       about specific operations can be retrieved with
       :py:obj:`~.cudaDeviceGetP2PAtomicCapabilities`.
 
-    Returns :py:obj:`~.cudaErrorInvalidDevice` if `srcDevice` or
-    `dstDevice` are not valid or if they represent the same device.
+    Returns :py:obj:`~.cudaErrorInvalidDevice` if ``srcDevice`` or
+    ``dstDevice`` are not valid or if they represent the same device.
 
-    Returns :py:obj:`~.cudaErrorInvalidValue` if `attrib` is not valid or
-    if `value` is a null pointer.
+    Returns :py:obj:`~.cudaErrorInvalidValue` if ``attrib`` is not valid or
+    if ``value`` is a null pointer.
 
     Parameters
     ----------
     attrib : :py:obj:`~.cudaDeviceP2PAttr`
-        The requested attribute of the link between `srcDevice` and
-        `dstDevice`.
+        The requested attribute of the link between ``srcDevice`` and
+        ``dstDevice``.
     srcDevice : int
         The source device of the target link.
     dstDevice : int
@@ -22936,22 +22938,22 @@ def cudaDeviceGetP2PAttribute(attr not None : cudaDeviceP2PAttr, int srcDevice,
 def cudaDeviceGetP2PAtomicCapabilities(operations : Optional[tuple[cudaAtomicOperation] | list[cudaAtomicOperation]], unsigned int count, int srcDevice, int dstDevice):
     """ Queries details about atomic operations supported between two devices.
 
-    Returns in `*capabilities` the details about requested atomic
-    `*operations` over the the link between `srcDevice` and `dstDevice`.
-    The allocated size of `*operations` and `*capabilities` must be
-    `count`.
+    Returns in ``*capabilities`` the details about requested atomic
+    ``*operations`` over the link between ``srcDevice`` and
+    ``dstDevice``. The allocated size of ``*operations`` and
+    ``*capabilities`` must be ``count``.
 
-    For each :py:obj:`~.cudaAtomicOperation` in `*operations`, the
-    corresponding result in `*capabilities` will be a bitmask indicating
+    For each :py:obj:`~.cudaAtomicOperation` in ``*operations``, the
+    corresponding result in ``*capabilities`` will be a bitmask indicating
     which of :py:obj:`~.cudaAtomicOperationCapability` the link supports
     natively.
 
-    Returns :py:obj:`~.cudaErrorInvalidDevice` if `srcDevice` or
-    `dstDevice` are not valid or if they represent the same device.
+    Returns :py:obj:`~.cudaErrorInvalidDevice` if ``srcDevice`` or
+    ``dstDevice`` are not valid or if they represent the same device.
 
-    Returns :py:obj:`~.cudaErrorInvalidValue` if `*capabilities` or
-    `*operations` is NULL, if `count` is 0, or if any of `*operations` is
-    not valid.
+    Returns :py:obj:`~.cudaErrorInvalidValue` if ``*capabilities`` or
+    ``*operations`` is NULL, if ``count`` is 0, or if any of
+    ``*operations`` is not valid.
 
     Parameters
     ----------
@@ -23003,8 +23005,8 @@ def cudaDeviceGetP2PAtomicCapabilities(operations : Optional[tuple[cudaAtomicOpe
 def cudaChooseDevice(prop : Optional[cudaDeviceProp]):
     """ Select compute-device which best matches criteria.
 
-    Returns in `*device` the device which has properties that best match
-    `*prop`.
+    Returns in ``*device`` the device which has properties that best match
+    ``*prop``.
 
     Parameters
     ----------
@@ -23038,10 +23040,10 @@ def cudaInitDevice(int device, unsigned int deviceFlags, unsigned int flags):
     """ Initialize device to be used for GPU executions.
 
     This function will initialize the CUDA Runtime structures and primary
-    context on `device` when called, but the context will not be made
-    current to `device`.
+    context on ``device`` when called, but the context will not be made
+    current to ``device``.
 
-    When :py:obj:`~.cudaInitDeviceFlagsAreValid` is set in `flags`,
+    When :py:obj:`~.cudaInitDeviceFlagsAreValid` is set in ``flags``,
     deviceFlags are applied to the requested device. The values of
     deviceFlags match those of the flags parameters in
     :py:obj:`~.cudaSetDeviceFlags`. The effect may be verified by
@@ -23080,19 +23082,19 @@ def cudaInitDevice(int device, unsigned int deviceFlags, unsigned int flags):
 def cudaSetDevice(int device):
     """ Set device to be used for GPU executions.
 
-    Sets `device` as the current device for the calling host thread. Valid
-    device id's are 0 to (:py:obj:`~.cudaGetDeviceCount()` - 1).
+    Sets ``device`` as the current device for the calling host thread.
+    Valid device IDs are 0 to (:py:obj:`~.cudaGetDeviceCount()` - 1).
 
     Any device memory subsequently allocated from this host thread using
     :py:obj:`~.cudaMalloc()`, :py:obj:`~.cudaMallocPitch()` or
-    :py:obj:`~.cudaMallocArray()` will be physically resident on `device`.
-    Any host memory allocated from this host thread using
+    :py:obj:`~.cudaMallocArray()` will be physically resident on
+    ``device``. Any host memory allocated from this host thread using
     :py:obj:`~.cudaMallocHost()` or :py:obj:`~.cudaHostAlloc()` or
     :py:obj:`~.cudaHostRegister()` will have its lifetime associated with
-    `device`. Any streams or events created from this host thread will be
-    associated with `device`. Any kernels launched from this host thread
+    ``device``. Any streams or events created from this host thread will be
+    associated with ``device``. Any kernels launched from this host thread
     using the <<<>>> operator or :py:obj:`~.cudaLaunchKernel()` will be
-    executed on `device`.
+    executed on ``device``.
 
     This call may be made from any host thread, to any device, and at any
     time. This function will do no synchronization with the previous or new
@@ -23102,8 +23104,8 @@ def cudaSetDevice(int device):
     allocations, stream and event creations, and kernel launches will be
     associated with the primary context. This function will also
     immediately initialize the runtime state on the primary context, and
-    the context will be current on `device` immediately. This function will
-    return an error if the device is in
+    the context will be current on ``device`` immediately. This function
+    will return an error if the device is in
     :py:obj:`~.cudaComputeModeExclusiveProcess` and is occupied by another
     process or if the device is in :py:obj:`~.cudaComputeModeProhibited`.
 
@@ -23136,7 +23138,7 @@ def cudaSetDevice(int device):
 def cudaGetDevice():
     """ Returns which device is currently being used.
 
-    Returns in `*device` the current device for the calling host thread.
+    Returns in ``*device`` the current device for the calling host thread.
 
     Returns
     -------
@@ -23164,27 +23166,27 @@ def cudaGetDevice():
 def cudaSetDeviceFlags(unsigned int flags):
     """ Sets flags to be used for device executions.
 
-    Records `flags` as the flags for the current device. If the current
+    Records ``flags`` as the flags for the current device. If the current
     device has been set and that device has already been initialized, the
     previous flags are overwritten. If the current device has not been
     initialized, it is initialized with the provided flags. If no device
     has been made current to the calling thread, a default device is
     selected and initialized with the provided flags.
 
-    The three LSBs of the `flags` parameter can be used to control how the
-    CPU thread interacts with the OS scheduler when waiting for results
+    The three LSBs of the ``flags`` parameter can be used to control how
+    the CPU thread interacts with the OS scheduler when waiting for results
     from the device.
 
-    - :py:obj:`~.cudaDeviceScheduleAuto`: The default value if the `flags`
-      parameter is zero, uses a heuristic based on the number of active
-      CUDA contexts in the process `C` and the number of logical processors
-      in the system `P`. If `C` > `P`, then CUDA will yield to other OS
-      threads when waiting for the device, otherwise CUDA will not yield
-      while waiting for results and actively spin on the processor.
-      Additionally, on Tegra devices, :py:obj:`~.cudaDeviceScheduleAuto`
-      uses a heuristic based on the power profile of the platform and may
-      choose :py:obj:`~.cudaDeviceScheduleBlockingSync` for low-powered
-      devices.
+    - :py:obj:`~.cudaDeviceScheduleAuto`: The default value if the
+      ``flags`` parameter is zero, uses a heuristic based on the number of
+      active CUDA contexts in the process ``C`` and the number of logical
+      processors in the system ``P``. If ``C`` > ``P``, then CUDA will
+      yield to other OS threads when waiting for the device, otherwise CUDA
+      will not yield while waiting for results and actively spin on the
+      processor. Additionally, on Tegra devices,
+      :py:obj:`~.cudaDeviceScheduleAuto` uses a heuristic based on the
+      power profile of the platform and may choose
+      :py:obj:`~.cudaDeviceScheduleBlockingSync` for low-powered devices.
 
     - :py:obj:`~.cudaDeviceScheduleSpin`: Instruct CUDA to actively spin
       when waiting for results from the device. This can decrease latency
@@ -23202,9 +23204,8 @@ def cudaSetDeviceFlags(unsigned int flags):
 
     - :py:obj:`~.cudaDeviceBlockingSync`: Instruct CUDA to block the CPU
       thread on a synchronization primitive when waiting for the device to
-      finish work.   :py:obj:`~.Deprecated:` This flag was deprecated as of
-      CUDA 4.0 and replaced with
-      :py:obj:`~.cudaDeviceScheduleBlockingSync`.
+      finish work.   Deprecated: This flag was deprecated as of CUDA 4.0
+      and replaced with :py:obj:`~.cudaDeviceScheduleBlockingSync`.
 
     - :py:obj:`~.cudaDeviceMapHost`: This flag enables allocating pinned
       host memory that is accessible to the device. It is implicit for the
@@ -23216,9 +23217,9 @@ def cudaSetDeviceFlags(unsigned int flags):
       local memory after resizing local memory for a kernel. This can
       prevent thrashing by local memory allocations when launching many
       kernels with high local memory usage at the cost of potentially
-      increased memory usage.   :py:obj:`~.Deprecated:` This flag is
-      deprecated and the behavior enabled by this flag is now the default
-      and cannot be disabled.
+      increased memory usage.   Deprecated: This flag is deprecated and the
+      behavior enabled by this flag is now the default and cannot be
+      disabled.
 
     - :py:obj:`~.cudaDeviceSyncMemops`: Ensures that synchronous memory
       operations initiated on this context will always synchronize. See
@@ -23251,7 +23252,7 @@ def cudaSetDeviceFlags(unsigned int flags):
 def cudaGetDeviceFlags():
     """ Gets the flags for the current device.
 
-    Returns in `flags` the flags for the current device. If there is a
+    Returns in ``flags`` the flags for the current device. If there is a
     current device for the calling thread, the flags for the device are
     returned. If there is no current device, the flags for the first device
     are returned, which may be the default flags. Compare to the behavior
@@ -23335,8 +23336,8 @@ def cudaStreamCreateWithFlags(unsigned int flags):
     calling host thread. If no context is current to the calling host
     thread, then the primary context for a device is selected, made current
     to the calling thread, and initialized before creating a stream on it.
-    The `flags` argument determines the behaviors of the stream. Valid
-    values for `flags` are
+    The ``flags`` argument determines the behaviors of the stream. Valid
+    values for ``flags`` are
 
     - :py:obj:`~.cudaStreamDefault`: Default stream creation flag.
 
@@ -23376,8 +23377,8 @@ def cudaStreamCreateWithPriority(unsigned int flags, int priority):
     """ Create an asynchronous stream with the specified priority.
 
     Creates a stream with the specified priority and returns a handle in
-    `pStream`. The stream is created on the context that is current to the
-    calling host thread. If no context is current to the calling host
+    ``pStream``. The stream is created on the context that is current to
+    the calling host thread. If no context is current to the calling host
     thread, then the primary context for a device is selected, made current
     to the calling thread, and initialized before creating a stream on it.
     This affects the scheduling priority of work in the stream. Priorities
@@ -23385,7 +23386,7 @@ def cudaStreamCreateWithPriority(unsigned int flags, int priority):
     possible, but do not preempt already-running work or provide any other
     functional guarantee on execution order.
 
-    `priority` follows a convention where lower numbers represent higher
+    ``priority`` follows a convention where lower numbers represent higher
     priorities. '0' represents default priority. The range of meaningful
     numerical priorities can be queried using
     :py:obj:`~.cudaDeviceGetStreamPriorityRange`. If the specified priority
@@ -23437,8 +23438,8 @@ def cudaStreamGetPriority(hStream):
     """ Query the priority of a stream.
 
     Query the priority of a stream. The priority is returned in in
-    `priority`. Note that if the stream was created with a priority outside
-    the meaningful numerical range returned by
+    ``priority``. Note that if the stream was created with a priority
+    outside the meaningful numerical range returned by
     :py:obj:`~.cudaDeviceGetStreamPriorityRange`, this function returns the
     clamped priority. See :py:obj:`~.cudaStreamCreateWithPriority` for
     details about priority clamping.
@@ -23482,7 +23483,7 @@ def cudaStreamGetPriority(hStream):
 def cudaStreamGetFlags(hStream):
     """ Query the flags of a stream.
 
-    Query the flags of a stream. The flags are returned in `flags`. See
+    Query the flags of a stream. The flags are returned in ``flags``. See
     :py:obj:`~.cudaStreamCreateWithFlags` for a list of valid flags.
 
     Parameters
@@ -23524,10 +23525,10 @@ def cudaStreamGetFlags(hStream):
 def cudaStreamGetId(hStream):
     """ Query the Id of a stream.
 
-    Query the Id of a stream. The Id is returned in `streamId`. The Id is
+    Query the Id of a stream. The Id is returned in ``streamId``. The Id is
     unique for the life of the program.
 
-    The stream handle `hStream` can refer to any of the following:
+    The stream handle ``hStream`` can refer to any of the following:
 
     - a stream created via any of the CUDA runtime APIs such as
       :py:obj:`~.cudaStreamCreate`, :py:obj:`~.cudaStreamCreateWithFlags`
@@ -23580,7 +23581,7 @@ def cudaStreamGetId(hStream):
 def cudaStreamGetDevice(hStream):
     """ Query the device of a stream.
 
-    Returns in `*device` the device of the stream.
+    Returns in ``*device`` the device of the stream.
 
     Parameters
     ----------
@@ -23630,7 +23631,7 @@ def cudaCtxResetPersistingL2Cache():
 
     See Also
     --------
-    :py:obj:`~.cudaAccessPolicyWindow`
+    cudaAccessPolicyWindow
     """
     with nogil:
         err = cyruntime.cudaCtxResetPersistingL2Cache()
@@ -23643,8 +23644,8 @@ def cudaCtxResetPersistingL2Cache():
 def cudaStreamCopyAttributes(dst, src):
     """ Copies attributes from source stream to destination stream.
 
-    Copies attributes from source stream `src` to destination stream `dst`.
-    Both streams must have the same context.
+    Copies attributes from source stream ``src`` to destination stream
+    ``dst``. Both streams must have the same context.
 
     Parameters
     ----------
@@ -23660,7 +23661,7 @@ def cudaStreamCopyAttributes(dst, src):
 
     See Also
     --------
-    :py:obj:`~.cudaAccessPolicyWindow`
+    cudaAccessPolicyWindow
     """
     cdef cyruntime.cudaStream_t cysrc
     if src is None:
@@ -23689,8 +23690,8 @@ def cudaStreamCopyAttributes(dst, src):
 def cudaStreamGetAttribute(hStream, attr not None : cudaStreamAttrID):
     """ Queries stream attribute.
 
-    Queries attribute `attr` from `hStream` and stores it in corresponding
-    member of `value_out`.
+    Queries attribute ``attr`` from ``hStream`` and stores it in
+    corresponding member of ``value_out``.
 
     Parameters
     ----------
@@ -23708,7 +23709,7 @@ def cudaStreamGetAttribute(hStream, attr not None : cudaStreamAttrID):
 
     See Also
     --------
-    :py:obj:`~.cudaAccessPolicyWindow`
+    cudaAccessPolicyWindow
     """
     cdef cyruntime.cudaStream_t cyhStream
     if hStream is None:
@@ -23733,8 +23734,8 @@ def cudaStreamGetAttribute(hStream, attr not None : cudaStreamAttrID):
 def cudaStreamSetAttribute(hStream, attr not None : cudaStreamAttrID, value : Optional[cudaStreamAttrValue]):
     """ Sets stream attribute.
 
-    Sets attribute `attr` on `hStream` from corresponding attribute of
-    `value`. The updated attribute will be applied to subsequent work
+    Sets attribute ``attr`` on ``hStream`` from corresponding attribute of
+    ``value``. The updated attribute will be applied to subsequent work
     submitted to the stream. It will not affect previously submitted work.
 
     Parameters
@@ -23753,7 +23754,7 @@ def cudaStreamSetAttribute(hStream, attr not None : cudaStreamAttrID, value : Op
 
     See Also
     --------
-    :py:obj:`~.cudaAccessPolicyWindow`
+    cudaAccessPolicyWindow
     """
     cdef cyruntime.cudaStream_t cyhStream
     if hStream is None:
@@ -23776,12 +23777,13 @@ def cudaStreamSetAttribute(hStream, attr not None : cudaStreamAttrID, value : Op
 def cudaStreamDestroy(stream):
     """ Destroys and cleans up an asynchronous stream.
 
-    Destroys and cleans up the asynchronous stream specified by `stream`.
+    Destroys and cleans up the asynchronous stream specified by ``stream``.
 
-    In case the device is still doing work in the stream `stream` when
+    In case the device is still doing work in the stream ``stream`` when
     :py:obj:`~.cudaStreamDestroy()` is called, the function will return
-    immediately and the resources associated with `stream` will be released
-    automatically once the device has completed all work in `stream`.
+    immediately and the resources associated with ``stream`` will be
+    released automatically once the device has completed all work in
+    ``stream``.
 
     Parameters
     ----------
@@ -23816,11 +23818,11 @@ def cudaStreamDestroy(stream):
 def cudaStreamWaitEvent(stream, event, unsigned int flags):
     """ Make a compute stream wait on an event.
 
-    Makes all future work submitted to `stream` wait for all work captured
-    in `event`. See :py:obj:`~.cudaEventRecord()` for details on what is
-    captured by an event. The synchronization will be performed efficiently
-    on the device when applicable. `event` may be from a different device
-    than `stream`.
+    Makes all future work submitted to ``stream`` wait for all work
+    captured in ``event``. See :py:obj:`~.cudaEventRecord()` for details on
+    what is captured by an event. The synchronization will be performed
+    efficiently on the device when applicable. ``event`` may be from a
+    different device than ``stream``.
 
     flags include:
 
@@ -23993,7 +23995,7 @@ def cudaStreamAddCallback(stream, callback, userData, unsigned int flags):
 def cudaStreamSynchronize(stream):
     """ Waits for stream tasks to complete.
 
-    Blocks until `stream` has completed all operations. If the
+    Blocks until ``stream`` has completed all operations. If the
     :py:obj:`~.cudaDeviceScheduleBlockingSync` flag was set for this
     device, the host thread will block until the stream is finished with
     all of its tasks.
@@ -24031,7 +24033,7 @@ def cudaStreamSynchronize(stream):
 def cudaStreamQuery(stream):
     """ Queries an asynchronous stream for completion status.
 
-    Returns :py:obj:`~.cudaSuccess` if all operations in `stream` have
+    Returns :py:obj:`~.cudaSuccess` if all operations in ``stream`` have
     completed, or :py:obj:`~.cudaErrorNotReady` if not.
 
     For the purposes of Unified Memory, a return value of
@@ -24071,13 +24073,13 @@ def cudaStreamQuery(stream):
 def cudaStreamAttachMemAsync(stream, devPtr, size_t length, unsigned int flags):
     """ Attach memory to a stream asynchronously.
 
-    Enqueues an operation in `stream` to specify stream association of
-    `length` bytes of memory starting from `devPtr`. This function is a
+    Enqueues an operation in ``stream`` to specify stream association of
+    ``length`` bytes of memory starting from ``devPtr``. This function is a
     stream-ordered operation, meaning that it is dependent on, and will
     only take effect when, previous work in stream has completed. Any
     previous association is automatically replaced.
 
-    `devPtr` must point to an one of the following types of memories:
+    ``devPtr`` must point to one of the following types of memory:
 
     - managed memory declared using the managed keyword or allocated with
       :py:obj:`~.cudaMallocManaged`.
@@ -24087,33 +24089,33 @@ def cudaStreamAttachMemAsync(stream, devPtr, size_t length, unsigned int flags):
       with the stream reports a non-zero value for the device attribute
       :py:obj:`~.cudaDevAttrPageableMemoryAccess`.
 
-    For managed allocations, `length` must be either zero or the entire
+    For managed allocations, ``length`` must be either zero or the entire
     allocation's size. Both indicate that the entire allocation's stream
     association is being changed. Currently, it is not possible to change
     stream association for a portion of a managed allocation.
 
-    For pageable allocations, `length` must be non-zero.
+    For pageable allocations, ``length`` must be non-zero.
 
-    The stream association is specified using `flags` which must be one of
-    :py:obj:`~.cudaMemAttachGlobal`, :py:obj:`~.cudaMemAttachHost` or
-    :py:obj:`~.cudaMemAttachSingle`. The default value for `flags` is
+    The stream association is specified using ``flags`` which must be one
+    of :py:obj:`~.cudaMemAttachGlobal`, :py:obj:`~.cudaMemAttachHost` or
+    :py:obj:`~.cudaMemAttachSingle`. The default value for ``flags`` is
     :py:obj:`~.cudaMemAttachSingle` If the :py:obj:`~.cudaMemAttachGlobal`
     flag is specified, the memory can be accessed by any stream on any
     device. If the :py:obj:`~.cudaMemAttachHost` flag is specified, the
     program makes a guarantee that it won't access the memory on the device
     from any stream on a device that has a zero value for the device
     attribute :py:obj:`~.cudaDevAttrConcurrentManagedAccess`. If the
-    :py:obj:`~.cudaMemAttachSingle` flag is specified and `stream` is
+    :py:obj:`~.cudaMemAttachSingle` flag is specified and ``stream`` is
     associated with a device that has a zero value for the device attribute
     :py:obj:`~.cudaDevAttrConcurrentManagedAccess`, the program makes a
     guarantee that it will only access the memory on the device from
-    `stream`. It is illegal to attach singly to the NULL stream, because
+    ``stream``. It is illegal to attach singly to the NULL stream, because
     the NULL stream is a virtual global stream and not a specific stream.
     An error will be returned in this case.
 
     When memory is associated with a single stream, the Unified Memory
     system will allow CPU access to this memory region so long as all
-    operations in `stream` have completed, regardless of whether other
+    operations in ``stream`` have completed, regardless of whether other
     streams are active. In effect, this constrains exclusive ownership of
     the managed memory region by an active GPU to per-stream activity
     instead of whole-GPU activity.
@@ -24129,7 +24131,7 @@ def cudaStreamAttachMemAsync(stream, devPtr, size_t length, unsigned int flags):
     visibility and coherency will be changed appropriately for all kernels
     which follow a stream-association change.
 
-    If `stream` is destroyed while data is associated with it, the
+    If ``stream`` is destroyed while data is associated with it, the
     association is removed and the association reverts to the default
     visibility of the allocation as specified at
     :py:obj:`~.cudaMallocManaged`. For managed variables, the default
@@ -24183,18 +24185,18 @@ def cudaStreamAttachMemAsync(stream, devPtr, size_t length, unsigned int flags):
 def cudaStreamBeginCapture(stream, mode not None : cudaStreamCaptureMode):
     """ Begins graph capture on a stream.
 
-    Begin graph capture on `stream`. When a stream is in capture mode, all
-    operations pushed into the stream will not be executed, but will
+    Begin graph capture on ``stream``. When a stream is in capture mode,
+    all operations pushed into the stream will not be executed, but will
     instead be captured into a graph, which will be returned via
     :py:obj:`~.cudaStreamEndCapture`. Capture may not be initiated if
-    `stream` is :py:obj:`~.cudaStreamLegacy`. Capture must be ended on the
-    same stream in which it was initiated, and it may only be initiated if
-    the stream is not already in capture mode. The capture mode may be
+    ``stream`` is :py:obj:`~.cudaStreamLegacy`. Capture must be ended on
+    the same stream in which it was initiated, and it may only be initiated
+    if the stream is not already in capture mode. The capture mode may be
     queried via :py:obj:`~.cudaStreamIsCapturing`. A unique id representing
     the capture sequence may be queried via
     :py:obj:`~.cudaStreamGetCaptureInfo`.
 
-    If `mode` is not :py:obj:`~.cudaStreamCaptureModeRelaxed`,
+    If ``mode`` is not :py:obj:`~.cudaStreamCaptureModeRelaxed`,
     :py:obj:`~.cudaStreamEndCapture` must be called on this stream from the
     same thread.
 
@@ -24240,7 +24242,7 @@ def cudaStreamBeginCapture(stream, mode not None : cudaStreamCaptureMode):
 def cudaStreamBeginRecaptureToGraph(stream, mode not None : cudaStreamCaptureMode, graph, callbackData : Optional[cudaGraphRecaptureCallbackData]):
     """ Begin graph capture on a stream to an existing graph.
 
-    Begin graph capture on `stream` to the existing `graph`. The node
+    Begin graph capture on ``stream`` to the existing ``graph``. The node
     creation order while recapturing the graph must be identical to the
     original graph. The recapture will fail immediately for:
 
@@ -24250,11 +24252,11 @@ def cudaStreamBeginRecaptureToGraph(stream, mode not None : cudaStreamCaptureMod
     - Parameter mismatches for memory allocation or free nodes
 
     Any other node parameter mismatches during recapture can be configured
-    to call the function provided in `callbackFunc`. The recapture will
+    to call the function provided in ``callbackFunc``. The recapture will
     fail immediately if the callback returns anything other than
     cudaSuccess.
 
-    If the recapture fails for any reason, the `graph` will be in an
+    If the recapture fails for any reason, the ``graph`` will be in an
     undefined state and should be destroyed.
 
     See cudaStreamBeginCapture for additional detail on beginning the
@@ -24285,7 +24287,7 @@ def cudaStreamBeginRecaptureToGraph(stream, mode not None : cudaStreamCaptureMod
 
     Notes
     -----
-    Any user objects associated with `graph` will be released prior to the recapture.
+    Any user objects associated with ``graph`` will be released prior to the recapture.
     """
     cdef cyruntime.cudaGraph_t cygraph
     if graph is None:
@@ -24316,19 +24318,19 @@ def cudaStreamBeginRecaptureToGraph(stream, mode not None : cudaStreamCaptureMod
 def cudaStreamBeginCaptureToGraph(stream, graph, dependencies : Optional[tuple[cudaGraphNode_t] | list[cudaGraphNode_t]], dependencyData : Optional[tuple[cudaGraphEdgeData] | list[cudaGraphEdgeData]], size_t numDependencies, mode not None : cudaStreamCaptureMode):
     """ Begins graph capture on a stream to an existing graph.
 
-    Begin graph capture on `stream`. When a stream is in capture mode, all
-    operations pushed into the stream will not be executed, but will
-    instead be captured into `graph`, which will be returned via
+    Begin graph capture on ``stream``. When a stream is in capture mode,
+    all operations pushed into the stream will not be executed, but will
+    instead be captured into ``graph``, which will be returned via
     :py:obj:`~.cudaStreamEndCapture`.
 
-    Capture may not be initiated if `stream` is
+    Capture may not be initiated if ``stream`` is
     :py:obj:`~.cudaStreamLegacy`. Capture must be ended on the same stream
     in which it was initiated, and it may only be initiated if the stream
     is not already in capture mode. The capture mode may be queried via
     :py:obj:`~.cudaStreamIsCapturing`. A unique id representing the capture
     sequence may be queried via :py:obj:`~.cudaStreamGetCaptureInfo`.
 
-    If `mode` is not :py:obj:`~.cudaStreamCaptureModeRelaxed`,
+    If ``mode`` is not :py:obj:`~.cudaStreamCaptureModeRelaxed`,
     :py:obj:`~.cudaStreamEndCapture` must be called on this stream from the
     same thread.
 
@@ -24422,8 +24424,8 @@ def cudaThreadExchangeStreamCaptureMode(mode not None : cudaStreamCaptureMode):
     """ Swaps the stream capture interaction mode for a thread.
 
     Sets the calling thread's stream capture interaction mode to the value
-    contained in `*mode`, and overwrites `*mode` with the previous mode for
-    the thread. To facilitate deterministic behavior across function or
+    contained in ``*mode``, and overwrites ``*mode`` with the previous mode
+    for the thread. To facilitate deterministic behavior across function or
     module boundaries, callers are encouraged to use this API in a push-pop
     fashion:
 
@@ -24446,20 +24448,20 @@ def cudaThreadExchangeStreamCaptureMode(mode not None : cudaStreamCaptureMode):
 
     A thread's mode is one of the following:
 
-    - `cudaStreamCaptureModeGlobal:` This is the default mode. If the local
-      thread has an ongoing capture sequence that was not initiated with
-      `cudaStreamCaptureModeRelaxed` at `cuStreamBeginCapture`, or if any
-      other thread has a concurrent capture sequence initiated with
-      `cudaStreamCaptureModeGlobal`, this thread is prohibited from
+    - ``cudaStreamCaptureModeGlobal:`` This is the default mode. If the
+      local thread has an ongoing capture sequence that was not initiated
+      with ``cudaStreamCaptureModeRelaxed`` at ``cuStreamBeginCapture``, or
+      if any other thread has a concurrent capture sequence initiated with
+      ``cudaStreamCaptureModeGlobal``, this thread is prohibited from
       potentially unsafe API calls.
 
-    - `cudaStreamCaptureModeThreadLocal:` If the local thread has an
+    - ``cudaStreamCaptureModeThreadLocal:`` If the local thread has an
       ongoing capture sequence not initiated with
-      `cudaStreamCaptureModeRelaxed`, it is prohibited from potentially
+      ``cudaStreamCaptureModeRelaxed``, it is prohibited from potentially
       unsafe API calls. Concurrent capture sequences in other threads are
       ignored.
 
-    - `cudaStreamCaptureModeRelaxed:` The local thread is not prohibited
+    - ``cudaStreamCaptureModeRelaxed:`` The local thread is not prohibited
       from potentially unsafe API calls. Note that the thread is still
       prohibited from API calls which necessarily conflict with stream
       capture, for example, attempting :py:obj:`~.cudaEventQuery` on an
@@ -24495,13 +24497,13 @@ def cudaThreadExchangeStreamCaptureMode(mode not None : cudaStreamCaptureMode):
 def cudaStreamEndCapture(stream):
     """ Ends capture on a stream, returning the captured graph.
 
-    End capture on `stream`, returning the captured graph via `pGraph`.
-    Capture must have been initiated on `stream` via a call to
+    End capture on ``stream``, returning the captured graph via ``pGraph``.
+    Capture must have been initiated on ``stream`` via a call to
     :py:obj:`~.cudaStreamBeginCapture`. If capture was invalidated, due to
     a violation of the rules of stream capture, then a NULL graph will be
     returned.
 
-    If the `mode` argument to :py:obj:`~.cudaStreamBeginCapture` was not
+    If the ``mode`` argument to :py:obj:`~.cudaStreamBeginCapture` was not
     :py:obj:`~.cudaStreamCaptureModeRelaxed`, this call must be from the
     same thread as :py:obj:`~.cudaStreamBeginCapture`.
 
@@ -24543,8 +24545,8 @@ def cudaStreamEndCapture(stream):
 def cudaStreamIsCapturing(stream):
     """ Returns a stream's capture status.
 
-    Return the capture status of `stream` via `pCaptureStatus`. After a
-    successful call, `*pCaptureStatus` will contain one of the following:
+    Return the capture status of ``stream`` via ``pCaptureStatus``. After a
+    successful call, ``*pCaptureStatus`` will contain one of the following:
 
     - :py:obj:`~.cudaStreamCaptureStatusNone`: The stream is not capturing.
 
@@ -24554,12 +24556,12 @@ def cudaStreamIsCapturing(stream):
       capturing but an error has invalidated the capture sequence. The
       capture sequence must be terminated with
       :py:obj:`~.cudaStreamEndCapture` on the stream where it was initiated
-      in order to continue using `stream`.
+      in order to continue using ``stream``.
 
     Note that, if this is called on :py:obj:`~.cudaStreamLegacy` (the "null
     stream") while a blocking stream on the same device is capturing, it
     will return :py:obj:`~.cudaErrorStreamCaptureImplicit` and
-    `*pCaptureStatus` is unspecified after the call. The blocking stream
+    ``*pCaptureStatus`` is unspecified after the call. The blocking stream
     capture is not invalidated.
 
     When a blocking stream is capturing, the legacy stream is in an
@@ -24619,9 +24621,9 @@ def cudaStreamGetCaptureInfo(stream):
     - the returned capture status is
       :py:obj:`~.cudaStreamCaptureStatusActive`
 
-    If `edgeData_out` is non-NULL then `dependencies_out` must be as well.
-    If `dependencies_out` is non-NULL and `edgeData_out` is NULL, but there
-    is non-zero edge data for one or more of the current stream
+    If ``edgeData_out`` is non-NULL then ``dependencies_out`` must be as
+    well. If ``dependencies_out`` is non-NULL and ``edgeData_out`` is NULL,
+    but there is non-zero edge data for one or more of the current stream
     dependencies, the call will return :py:obj:`~.cudaErrorLossyQuery`.
 
     Parameters
@@ -24659,11 +24661,11 @@ def cudaStreamGetCaptureInfo(stream):
         operate on the graph (not the stream) without copying.
     edgeData_out : list[:py:obj:`~.cudaGraphEdgeData`]
         Optional location to store a pointer to an array of graph edge
-        data. This array parallels `dependencies_out`; the next node to be
-        added has an edge to `dependencies_out`[i] with annotation
-        `edgeData_out`[i] for each `i`. The array pointer is valid until
-        the next API call which operates on the stream or until the capture
-        is terminated.
+        data. This array parallels ``dependencies_out``; the next node to
+        be added has an edge to ``dependencies_out[i]`` with annotation
+        ``edgeData_out[i]`` for each ``i``. The array pointer is valid
+        until the next API call which operates on the stream or until the
+        capture is terminated.
     numDependencies_out : int
         Optional location to store the size of the array returned in
         dependencies_out.
@@ -24876,12 +24878,13 @@ def cudaEventCreateWithFlags(unsigned int flags):
 def cudaEventRecord(event, stream):
     """ Records an event.
 
-    Captures in `event` the contents of `stream` at the time of this call.
-    `event` and `stream` must be on the same CUDA context. Calls such as
-    :py:obj:`~.cudaEventQuery()` or :py:obj:`~.cudaStreamWaitEvent()` will
-    then examine or wait for completion of the work that was captured. Uses
-    of `stream` after this call do not modify `event`. See note on default
-    stream behavior for what is captured in the default case.
+    Captures in ``event`` the contents of ``stream`` at the time of this
+    call. ``event`` and ``stream`` must be on the same CUDA context. Calls
+    such as :py:obj:`~.cudaEventQuery()` or
+    :py:obj:`~.cudaStreamWaitEvent()` will then examine or wait for
+    completion of the work that was captured. Uses of ``stream`` after this
+    call do not modify ``event``. See note on default stream behavior for
+    what is captured in the default case.
 
     :py:obj:`~.cudaEventRecord()` can be called multiple times on the same
     event and will overwrite the previously captured state. Other APIs such
@@ -24935,12 +24938,13 @@ def cudaEventRecord(event, stream):
 def cudaEventRecordWithFlags(event, stream, unsigned int flags):
     """ Records an event.
 
-    Captures in `event` the contents of `stream` at the time of this call.
-    `event` and `stream` must be on the same CUDA context. Calls such as
-    :py:obj:`~.cudaEventQuery()` or :py:obj:`~.cudaStreamWaitEvent()` will
-    then examine or wait for completion of the work that was captured. Uses
-    of `stream` after this call do not modify `event`. See note on default
-    stream behavior for what is captured in the default case.
+    Captures in ``event`` the contents of ``stream`` at the time of this
+    call. ``event`` and ``stream`` must be on the same CUDA context. Calls
+    such as :py:obj:`~.cudaEventQuery()` or
+    :py:obj:`~.cudaStreamWaitEvent()` will then examine or wait for
+    completion of the work that was captured. Uses of ``stream`` after this
+    call do not modify ``event``. See note on default stream behavior for
+    what is captured in the default case.
 
     :py:obj:`~.cudaEventRecordWithFlags()` can be called multiple times on
     the same event and will overwrite the previously captured state. Other
@@ -25003,7 +25007,7 @@ def cudaEventRecordWithFlags(event, stream, unsigned int flags):
 def cudaEventQuery(event):
     """ Queries an event's status.
 
-    Queries the status of all work currently captured by `event`. See
+    Queries the status of all work currently captured by ``event``. See
     :py:obj:`~.cudaEventRecord()` for details on what is captured by an
     event.
 
@@ -25048,7 +25052,7 @@ def cudaEventQuery(event):
 def cudaEventSynchronize(event):
     """ Waits for an event to complete.
 
-    Waits until the completion of all work currently captured in `event`.
+    Waits until the completion of all work currently captured in ``event``.
     See :py:obj:`~.cudaEventRecord()` for details on what is captured by an
     event.
 
@@ -25092,7 +25096,7 @@ def cudaEventSynchronize(event):
 def cudaEventDestroy(event):
     """ Destroys an event object.
 
-    Destroys the event specified by `event`.
+    Destroys the event specified by ``event``.
 
     An event may be destroyed before it is complete (i.e., while
     :py:obj:`~.cudaEventQuery()` would return
@@ -25170,7 +25174,7 @@ def cudaEventElapsedTime(start, end):
     cudaError_t
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorNotReady`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidResourceHandle`, :py:obj:`~.cudaErrorLaunchFailure`, :py:obj:`~.cudaErrorUnknown`
     ms : float
-        Time between `start` and `end` in ms
+        Time between ``start`` and ``end`` in ms
 
     See Also
     --------
@@ -25207,11 +25211,11 @@ def cudaImportExternalMemory(memHandleDesc : Optional[cudaExternalMemoryHandleDe
     """ Imports an external memory object.
 
     Imports an externally allocated memory object and returns a handle to
-    that in `extMem_out`.
+    that in ``extMem_out``.
 
     The properties of the handle being imported must be described in
-    `memHandleDesc`. The :py:obj:`~.cudaExternalMemoryHandleDesc` structure
-    is defined as follows:
+    ``memHandleDesc``. The cudaExternalMemoryHandleDesc structure is
+    defined as follows:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
@@ -25223,88 +25227,83 @@ def cudaImportExternalMemory(memHandleDesc : Optional[cudaExternalMemoryHandleDe
 
     If :py:obj:`~.cudaExternalMemoryHandleDesc.type` is
     :py:obj:`~.cudaExternalMemoryHandleTypeOpaqueFd`, then
-    :py:obj:`~.cudaExternalMemoryHandleDesc`::handle::fd must be a valid
-    file descriptor referencing a memory object. Ownership of the file
+    :py:obj:`~.cudaExternalMemoryHandleDesc.handle.fd` must be a valid file
+    descriptor referencing a memory object. Ownership of the file
     descriptor is transferred to the CUDA driver when the handle is
     imported successfully. Performing any operations on the file descriptor
     after it is imported results in undefined behavior.
 
     If :py:obj:`~.cudaExternalMemoryHandleDesc.type` is
     :py:obj:`~.cudaExternalMemoryHandleTypeOpaqueWin32`, then exactly one
-    of :py:obj:`~.cudaExternalMemoryHandleDesc`::handle::win32::handle and
-    :py:obj:`~.cudaExternalMemoryHandleDesc`::handle::win32::name must not
-    be NULL. If
-    :py:obj:`~.cudaExternalMemoryHandleDesc`::handle::win32::handle is not
-    NULL, then it must represent a valid shared NT handle that references a
-    memory object. Ownership of this handle is not transferred to CUDA
-    after the import operation, so the application must release the handle
-    using the appropriate system call. If
-    :py:obj:`~.cudaExternalMemoryHandleDesc`::handle::win32::name is not
-    NULL, then it must point to a NULL-terminated array of UTF-16
-    characters that refers to a memory object.
+    of :py:obj:`~.cudaExternalMemoryHandleDesc.handle.win32.handle` and
+    :py:obj:`~.cudaExternalMemoryHandleDesc.handle.win32.name` must not be
+    NULL. If :py:obj:`~.cudaExternalMemoryHandleDesc.handle.win32.handle`
+    is not NULL, then it must represent a valid shared NT handle that
+    references a memory object. Ownership of this handle is not transferred
+    to CUDA after the import operation, so the application must release the
+    handle using the appropriate system call. If
+    :py:obj:`~.cudaExternalMemoryHandleDesc.handle.win32.name` is not NULL,
+    then it must point to a NULL-terminated array of UTF-16 characters that
+    refers to a memory object.
 
     If :py:obj:`~.cudaExternalMemoryHandleDesc.type` is
     :py:obj:`~.cudaExternalMemoryHandleTypeOpaqueWin32Kmt`, then
-    :py:obj:`~.cudaExternalMemoryHandleDesc`::handle::win32::handle must be
-    non-NULL and
-    :py:obj:`~.cudaExternalMemoryHandleDesc`::handle::win32::name must be
-    NULL. The handle specified must be a globally shared KMT handle. This
-    handle does not hold a reference to the underlying object, and thus
-    will be invalid when all references to the memory object are destroyed.
+    :py:obj:`~.cudaExternalMemoryHandleDesc.handle.win32.handle` must be
+    non-NULL and :py:obj:`~.cudaExternalMemoryHandleDesc.handle.win32.name`
+    must be NULL. The handle specified must be a globally shared KMT
+    handle. This handle does not hold a reference to the underlying object,
+    and thus will be invalid when all references to the memory object are
+    destroyed.
 
     If :py:obj:`~.cudaExternalMemoryHandleDesc.type` is
     :py:obj:`~.cudaExternalMemoryHandleTypeD3D12Heap`, then exactly one of
-    :py:obj:`~.cudaExternalMemoryHandleDesc`::handle::win32::handle and
-    :py:obj:`~.cudaExternalMemoryHandleDesc`::handle::win32::name must not
-    be NULL. If
-    :py:obj:`~.cudaExternalMemoryHandleDesc`::handle::win32::handle is not
-    NULL, then it must represent a valid shared NT handle that is returned
-    by ID3D12Device::CreateSharedHandle when referring to a ID3D12Heap
-    object. This handle holds a reference to the underlying object. If
-    :py:obj:`~.cudaExternalMemoryHandleDesc`::handle::win32::name is not
-    NULL, then it must point to a NULL-terminated array of UTF-16
+    :py:obj:`~.cudaExternalMemoryHandleDesc.handle.win32.handle` and
+    :py:obj:`~.cudaExternalMemoryHandleDesc.handle.win32.name` must not be
+    NULL. If :py:obj:`~.cudaExternalMemoryHandleDesc.handle.win32.handle`
+    is not NULL, then it must represent a valid shared NT handle that is
+    returned by ID3D12Device::CreateSharedHandle when referring to a
+    ID3D12Heap object. This handle holds a reference to the underlying
+    object. If :py:obj:`~.cudaExternalMemoryHandleDesc.handle.win32.name`
+    is not NULL, then it must point to a NULL-terminated array of UTF-16
     characters that refers to a ID3D12Heap object.
 
     If :py:obj:`~.cudaExternalMemoryHandleDesc.type` is
     :py:obj:`~.cudaExternalMemoryHandleTypeD3D12Resource`, then exactly one
-    of :py:obj:`~.cudaExternalMemoryHandleDesc`::handle::win32::handle and
-    :py:obj:`~.cudaExternalMemoryHandleDesc`::handle::win32::name must not
-    be NULL. If
-    :py:obj:`~.cudaExternalMemoryHandleDesc`::handle::win32::handle is not
-    NULL, then it must represent a valid shared NT handle that is returned
-    by ID3D12Device::CreateSharedHandle when referring to a ID3D12Resource
-    object. This handle holds a reference to the underlying object. If
-    :py:obj:`~.cudaExternalMemoryHandleDesc`::handle::win32::name is not
-    NULL, then it must point to a NULL-terminated array of UTF-16
+    of :py:obj:`~.cudaExternalMemoryHandleDesc.handle.win32.handle` and
+    :py:obj:`~.cudaExternalMemoryHandleDesc.handle.win32.name` must not be
+    NULL. If :py:obj:`~.cudaExternalMemoryHandleDesc.handle.win32.handle`
+    is not NULL, then it must represent a valid shared NT handle that is
+    returned by ID3D12Device::CreateSharedHandle when referring to a
+    ID3D12Resource object. This handle holds a reference to the underlying
+    object. If :py:obj:`~.cudaExternalMemoryHandleDesc.handle.win32.name`
+    is not NULL, then it must point to a NULL-terminated array of UTF-16
     characters that refers to a ID3D12Resource object.
 
     If :py:obj:`~.cudaExternalMemoryHandleDesc.type` is
     :py:obj:`~.cudaExternalMemoryHandleTypeD3D11Resource`,then exactly one
-    of :py:obj:`~.cudaExternalMemoryHandleDesc`::handle::win32::handle and
-    :py:obj:`~.cudaExternalMemoryHandleDesc`::handle::win32::name must not
-    be NULL. If
-    :py:obj:`~.cudaExternalMemoryHandleDesc`::handle::win32::handle is
-    not NULL, then it must represent a valid shared NT handle that is
+    of :py:obj:`~.cudaExternalMemoryHandleDesc.handle.win32.handle` and
+    :py:obj:`~.cudaExternalMemoryHandleDesc.handle.win32.name` must not be
+    NULL. If :py:obj:`~.cudaExternalMemoryHandleDesc.handle.win32.handle`
+    is not NULL, then it must represent a valid shared NT handle that is
     returned by IDXGIResource1::CreateSharedHandle when referring to a
     ID3D11Resource object. If
-    :py:obj:`~.cudaExternalMemoryHandleDesc`::handle::win32::name is not
-    NULL, then it must point to a NULL-terminated array of UTF-16
-    characters that refers to a ID3D11Resource object.
+    :py:obj:`~.cudaExternalMemoryHandleDesc.handle.win32.name` is not NULL,
+    then it must point to a NULL-terminated array of UTF-16 characters that
+    refers to a ID3D11Resource object.
 
     If :py:obj:`~.cudaExternalMemoryHandleDesc.type` is
     :py:obj:`~.cudaExternalMemoryHandleTypeD3D11ResourceKmt`, then
-    :py:obj:`~.cudaExternalMemoryHandleDesc`::handle::win32::handle must be
-    non-NULL and
-    :py:obj:`~.cudaExternalMemoryHandleDesc`::handle::win32::name must be
-    NULL. The handle specified must be a valid shared KMT handle that is
-    returned by IDXGIResource::GetSharedHandle when referring to a
+    :py:obj:`~.cudaExternalMemoryHandleDesc.handle.win32.handle` must be
+    non-NULL and :py:obj:`~.cudaExternalMemoryHandleDesc.handle.win32.name`
+    must be NULL. The handle specified must be a valid shared KMT handle
+    that is returned by IDXGIResource::GetSharedHandle when referring to a
     ID3D11Resource object.
 
     If :py:obj:`~.cudaExternalMemoryHandleDesc.type` is
     :py:obj:`~.cudaExternalMemoryHandleTypeNvSciBuf`, then
-    :py:obj:`~.cudaExternalMemoryHandleDesc`::handle::nvSciBufObject must
-    be NON-NULL and reference a valid NvSciBuf object. If the NvSciBuf
-    object imported into CUDA is also mapped by other drivers, then the
+    :py:obj:`~.cudaExternalMemoryHandleDesc.handle.nvSciBufObject` must be
+    NON-NULL and reference a valid NvSciBuf object. If the NvSciBuf object
+    imported into CUDA is also mapped by other drivers, then the
     application must use :py:obj:`~.cudaWaitExternalSemaphoresAsync` or
     :py:obj:`~.cudaSignalExternalSemaphoresAsync` as approprriate barriers
     to maintain coherence between CUDA and the other drivers. See
@@ -25361,11 +25360,11 @@ def cudaExternalMemoryGetMappedBuffer(extMem, bufferDesc : Optional[cudaExternal
     """ Maps a buffer onto an imported memory object.
 
     Maps a buffer onto an imported memory object and returns a device
-    pointer in `devPtr`.
+    pointer in ``devPtr``.
 
     The properties of the buffer being mapped must be described in
-    `bufferDesc`. The :py:obj:`~.cudaExternalMemoryBufferDesc` structure is
-    defined as follows:
+    ``bufferDesc``. The cudaExternalMemoryBufferDesc structure is defined
+    as follows:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
@@ -25385,7 +25384,8 @@ def cudaExternalMemoryGetMappedBuffer(extMem, bufferDesc : Optional[cudaExternal
     separate buffers and then apply the appropriate offsets to the returned
     pointer to derive the individual buffers.
 
-    The returned pointer `devPtr` must be freed using :py:obj:`~.cudaFree`.
+    The returned pointer ``devPtr`` must be freed using
+    :py:obj:`~.cudaFree`.
 
     Parameters
     ----------
@@ -25429,11 +25429,11 @@ def cudaExternalMemoryGetMappedMipmappedArray(extMem, mipmapDesc : Optional[cuda
     """ Maps a CUDA mipmapped array onto an external memory object.
 
     Maps a CUDA mipmapped array onto an external object and returns a
-    handle to it in `mipmap`.
+    handle to it in ``mipmap``.
 
     The properties of the CUDA mipmapped array being mapped must be
-    described in `mipmapDesc`. The structure
-    :py:obj:`~.cudaExternalMemoryMipmappedArrayDesc` is defined as follows:
+    described in ``mipmapDesc``. The structure
+    cudaExternalMemoryMipmappedArrayDesc is defined as follows:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
@@ -25539,11 +25539,11 @@ def cudaImportExternalSemaphore(semHandleDesc : Optional[cudaExternalSemaphoreHa
     """ Imports an external semaphore.
 
     Imports an externally allocated synchronization object and returns a
-    handle to that in `extSem_out`.
+    handle to that in ``extSem_out``.
 
     The properties of the handle being imported must be described in
-    `semHandleDesc`. The :py:obj:`~.cudaExternalSemaphoreHandleDesc` is
-    defined as follows:
+    ``semHandleDesc``. The cudaExternalSemaphoreHandleDesc is defined as
+    follows:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
@@ -25555,7 +25555,7 @@ def cudaImportExternalSemaphore(semHandleDesc : Optional[cudaExternalSemaphoreHa
 
     If :py:obj:`~.cudaExternalSemaphoreHandleDesc.type` is
     :py:obj:`~.cudaExternalSemaphoreHandleTypeOpaqueFd`, then
-    :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::fd must be a valid
+    :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.fd` must be a valid
     file descriptor referencing a synchronization object. Ownership of the
     file descriptor is transferred to the CUDA driver when the handle is
     imported successfully. Performing any operations on the file descriptor
@@ -25563,80 +25563,78 @@ def cudaImportExternalSemaphore(semHandleDesc : Optional[cudaExternalSemaphoreHa
 
     If :py:obj:`~.cudaExternalSemaphoreHandleDesc.type` is
     :py:obj:`~.cudaExternalSemaphoreHandleTypeOpaqueWin32`, then exactly
-    one of
-    :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::win32::handle and
-    :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::win32::name must
+    one of :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.win32.handle`
+    and :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.win32.name` must
     not be NULL. If
-    :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::win32::handle is
-    not NULL, then it must represent a valid shared NT handle that
-    references a synchronization object. Ownership of this handle is not
-    transferred to CUDA after the import operation, so the application must
-    release the handle using the appropriate system call. If
-    :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::win32::name is not
+    :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.win32.handle` is not
+    NULL, then it must represent a valid shared NT handle that references a
+    synchronization object. Ownership of this handle is not transferred to
+    CUDA after the import operation, so the application must release the
+    handle using the appropriate system call. If
+    :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.win32.name` is not
     NULL, then it must name a valid synchronization object.
 
     If :py:obj:`~.cudaExternalSemaphoreHandleDesc.type` is
     :py:obj:`~.cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt`, then
-    :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::win32::handle must
-    be non-NULL and
-    :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::win32::name must
-    be NULL. The handle specified must be a globally shared KMT handle.
-    This handle does not hold a reference to the underlying object, and
-    thus will be invalid when all references to the synchronization object
-    are destroyed.
+    :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.win32.handle` must be
+    non-NULL and
+    :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.win32.name` must be
+    NULL. The handle specified must be a globally shared KMT handle. This
+    handle does not hold a reference to the underlying object, and thus
+    will be invalid when all references to the synchronization object are
+    destroyed.
 
     If :py:obj:`~.cudaExternalSemaphoreHandleDesc.type` is
     :py:obj:`~.cudaExternalSemaphoreHandleTypeD3D12Fence`, then exactly one
-    of :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::win32::handle
-    and :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::win32::name
-    must not be NULL. If
-    :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::win32::handle is
-    not NULL, then it must represent a valid shared NT handle that is
-    returned by ID3D12Device::CreateSharedHandle when referring to a
-    ID3D12Fence object. This handle holds a reference to the underlying
-    object. If
-    :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::win32::name is not
+    of :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.win32.handle` and
+    :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.win32.name` must not
+    be NULL. If
+    :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.win32.handle` is not
+    NULL, then it must represent a valid shared NT handle that is returned
+    by ID3D12Device::CreateSharedHandle when referring to a ID3D12Fence
+    object. This handle holds a reference to the underlying object. If
+    :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.win32.name` is not
     NULL, then it must name a valid synchronization object that refers to a
     valid ID3D12Fence object.
 
     If :py:obj:`~.cudaExternalSemaphoreHandleDesc.type` is
     :py:obj:`~.cudaExternalSemaphoreHandleTypeD3D11Fence`, then exactly one
-    of :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::win32::handle
-    and :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::win32::name
-    must not be NULL. If
-    :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::win32::handle is
-    not NULL, then it must represent a valid shared NT handle that is
-    returned by ID3D11Fence::CreateSharedHandle. If
-    :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::win32::name is not
+    of :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.win32.handle` and
+    :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.win32.name` must not
+    be NULL. If
+    :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.win32.handle` is not
+    NULL, then it must represent a valid shared NT handle that is returned
+    by ID3D11Fence::CreateSharedHandle. If
+    :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.win32.name` is not
     NULL, then it must name a valid synchronization object that refers to a
     valid ID3D11Fence object.
 
     If :py:obj:`~.cudaExternalSemaphoreHandleDesc.type` is
     :py:obj:`~.cudaExternalSemaphoreHandleTypeNvSciSync`, then
-    :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::nvSciSyncObj
+    :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.nvSciSyncObj`
     represents a valid NvSciSyncObj.
 
     :py:obj:`~.cudaExternalSemaphoreHandleTypeKeyedMutex`, then exactly one
-    of :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::win32::handle
-    and :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::win32::name
-    must not be NULL. If
-    :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::win32::handle is
-    not NULL, then it represent a valid shared NT handle that is returned
-    by IDXGIResource1::CreateSharedHandle when referring to a
-    IDXGIKeyedMutex object.
+    of :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.win32.handle` and
+    :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.win32.name` must not
+    be NULL. If
+    :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.win32.handle` is not
+    NULL, then it represents a valid shared NT handle that is returned by
+    IDXGIResource1::CreateSharedHandle when referring to a IDXGIKeyedMutex
+    object.
 
     If :py:obj:`~.cudaExternalSemaphoreHandleDesc.type` is
     :py:obj:`~.cudaExternalSemaphoreHandleTypeKeyedMutexKmt`, then
-    :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::win32::handle must
-    be non-NULL and
-    :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::win32::name must
-    be NULL. The handle specified must represent a valid KMT handle that is
+    :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.win32.handle` must be
+    non-NULL and
+    :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.win32.name` must be
+    NULL. The handle specified must represent a valid KMT handle that is
     returned by IDXGIResource::GetSharedHandle when referring to a
     IDXGIKeyedMutex object.
 
     If :py:obj:`~.cudaExternalSemaphoreHandleDesc.type` is
     :py:obj:`~.cudaExternalSemaphoreHandleTypeTimelineSemaphoreFd`, then
-    :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::fd must be a valid
+    :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.fd` must be a valid
     file descriptor referencing a synchronization object. Ownership of the
     file descriptor is transferred to the CUDA driver when the handle is
     imported successfully. Performing any operations on the file descriptor
@@ -25645,15 +25643,15 @@ def cudaImportExternalSemaphore(semHandleDesc : Optional[cudaExternalSemaphoreHa
     If :py:obj:`~.cudaExternalSemaphoreHandleDesc.type` is
     :py:obj:`~.cudaExternalSemaphoreHandleTypeTimelineSemaphoreWin32`, then
     exactly one of
-    :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::win32::handle and
-    :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::win32::name must
-    not be NULL. If
-    :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::win32::handle is
-    not NULL, then it must represent a valid shared NT handle that
-    references a synchronization object. Ownership of this handle is not
-    transferred to CUDA after the import operation, so the application must
-    release the handle using the appropriate system call. If
-    :py:obj:`~.cudaExternalSemaphoreHandleDesc`::handle::win32::name is not
+    :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.win32.handle` and
+    :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.win32.name` must not
+    be NULL. If
+    :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.win32.handle` is not
+    NULL, then it must represent a valid shared NT handle that references a
+    synchronization object. Ownership of this handle is not transferred to
+    CUDA after the import operation, so the application must release the
+    handle using the appropriate system call. If
+    :py:obj:`~.cudaExternalSemaphoreHandleDesc.handle.win32.name` is not
     NULL, then it must name a valid synchronization object.
 
     Parameters
@@ -25706,15 +25704,15 @@ def cudaSignalExternalSemaphoresAsync(extSemArray : Optional[tuple[cudaExternalS
     :py:obj:`~.cudaExternalSemaphoreHandleTypeTimelineSemaphoreFd`,
     :py:obj:`~.cudaExternalSemaphoreHandleTypeTimelineSemaphoreWin32` then
     the semaphore will be set to the value specified in
-    :py:obj:`~.cudaExternalSemaphoreSignalParams`::params::fence::value.
+    :py:obj:`~.cudaExternalSemaphoreSignalParams.params.fence.value`.
 
     If the semaphore object is of the type
     :py:obj:`~.cudaExternalSemaphoreHandleTypeNvSciSync` this API sets
-    :py:obj:`~.cudaExternalSemaphoreSignalParams`::params::nvSciSync::fence
-    to a value that can be used by subsequent waiters of the same NvSciSync
-    object to order operations with those currently submitted in `stream`.
-    Such an update will overwrite previous contents of
-    :py:obj:`~.cudaExternalSemaphoreSignalParams`::params::nvSciSync::fence.
+    :py:obj:`~.cudaExternalSemaphoreSignalParams.params.nvSciSync.fence` to
+    a value that can be used by subsequent waiters of the same NvSciSync
+    object to order operations with those currently submitted in
+    ``stream``. Such an update will overwrite previous contents of
+    :py:obj:`~.cudaExternalSemaphoreSignalParams.params.nvSciSync.fence`.
     By default, signaling such an external semaphore object causes
     appropriate memory synchronization operations to be performed over all
     the external memory objects that are imported as
@@ -25731,7 +25729,7 @@ def cudaSignalExternalSemaphoresAsync(extSemArray : Optional[tuple[cudaExternalS
     in :py:obj:`~.cudaDeviceGetNvSciSyncAttributes` to
     cudaNvSciSyncAttrSignal, this API will return cudaErrorNotSupported.
 
-    :py:obj:`~.cudaExternalSemaphoreSignalParams`::params::nvSciSync::fence
+    :py:obj:`~.cudaExternalSemaphoreSignalParams.params.nvSciSync.fence`
     associated with semaphore object of the type
     :py:obj:`~.cudaExternalSemaphoreHandleTypeNvSciSync` can be
     deterministic. For this the NvSciSyncAttrList used to create the
@@ -25750,7 +25748,7 @@ def cudaSignalExternalSemaphoresAsync(extSemArray : Optional[tuple[cudaExternalS
     with deterministic fence support enabled in different streams or by
     adding explicit dependency amongst such streams so that the semaphore
     is signaled in order.
-    :py:obj:`~.cudaExternalSemaphoreSignalParams`::params::nvSciSync::fence
+    :py:obj:`~.cudaExternalSemaphoreSignalParams.params.nvSciSync.fence`
     associated with semaphore object of the type
     :py:obj:`~.cudaExternalSemaphoreHandleTypeNvSciSync` can be timestamp
     enabled. For this the NvSciSyncAttrList used to create the object must
@@ -25771,7 +25769,7 @@ def cudaSignalExternalSemaphoresAsync(extSemArray : Optional[tuple[cudaExternalS
     :py:obj:`~.cudaExternalSemaphoreHandleTypeKeyedMutex`,
     :py:obj:`~.cudaExternalSemaphoreHandleTypeKeyedMutexKmt`, then the
     keyed mutex will be released with the key specified in
-    :py:obj:`~.cudaExternalSemaphoreSignalParams`::params::keyedmutex::key.
+    :py:obj:`~.cudaExternalSemaphoreSignalParams.params.keyedmutex.key`.
 
     Parameters
     ----------
@@ -25866,14 +25864,14 @@ def cudaWaitExternalSemaphoresAsync(extSemArray : Optional[tuple[cudaExternalSem
     :py:obj:`~.cudaExternalSemaphoreHandleTypeTimelineSemaphoreWin32` then
     waiting on the semaphore will wait until the value of the semaphore is
     greater than or equal to
-    :py:obj:`~.cudaExternalSemaphoreWaitParams`::params::fence::value.
+    :py:obj:`~.cudaExternalSemaphoreWaitParams.params.fence.value`.
 
     If the semaphore object is of the type
     :py:obj:`~.cudaExternalSemaphoreHandleTypeNvSciSync` then, waiting on
     the semaphore will wait until the
-    :py:obj:`~.cudaExternalSemaphoreSignalParams`::params::nvSciSync::fence
-    is signaled by the signaler of the NvSciSyncObj that was associated
-    with this semaphore object. By default, waiting on such an external
+    :py:obj:`~.cudaExternalSemaphoreSignalParams.params.nvSciSync.fence` is
+    signaled by the signaler of the NvSciSyncObj that was associated with
+    this semaphore object. By default, waiting on such an external
     semaphore object causes appropriate memory synchronization operations
     to be performed over all external memory objects that are imported as
     :py:obj:`~.cudaExternalMemoryHandleTypeNvSciBuf`. This ensures that any
@@ -25893,10 +25891,9 @@ def cudaWaitExternalSemaphoresAsync(extSemArray : Optional[tuple[cudaExternalSem
     :py:obj:`~.cudaExternalSemaphoreHandleTypeKeyedMutex`,
     :py:obj:`~.cudaExternalSemaphoreHandleTypeKeyedMutexKmt`, then the
     keyed mutex will be acquired when it is released with the key specified
-    in
-    :py:obj:`~.cudaExternalSemaphoreSignalParams`::params::keyedmutex::key
+    in :py:obj:`~.cudaExternalSemaphoreSignalParams.params.keyedmutex.key`
     or until the timeout specified by
-    :py:obj:`~.cudaExternalSemaphoreSignalParams`::params::keyedmutex::timeoutMs
+    :py:obj:`~.cudaExternalSemaphoreSignalParams.params.keyedmutex.timeoutMs`
     has lapsed. The timeout interval can either be a finite value specified
     in milliseconds or an infinite value. In case an infinite value is
     specified the timeout never elapses. The windows INFINITE macro must be
@@ -26010,14 +26007,14 @@ def cudaFuncSetCacheConfig(func, cacheConfig not None : cudaFuncCache):
     """ Sets the preferred cache configuration for a device function.
 
     On devices where the L1 cache and shared memory use the same hardware
-    resources, this sets through `cacheConfig` the preferred cache
-    configuration for the function specified via `func`. This is only a
+    resources, this sets through ``cacheConfig`` the preferred cache
+    configuration for the function specified via ``func``. This is only a
     preference. The runtime will use the requested configuration if
     possible, but it is free to choose a different configuration if
-    required to execute `func`.
+    required to execute ``func``.
 
-    `func` is a device function symbol and must be declared as a
-    `__global__` function. If the specified function does not exist, then
+    ``func`` is a device function symbol and must be declared as a
+    ``__global__`` function. If the specified function does not exist, then
     :py:obj:`~.cudaErrorInvalidDeviceFunction` is returned. For templated
     functions, pass the function symbol as follows:
     func_name<template_arg_0,...,template_arg_N>
@@ -26052,7 +26049,7 @@ def cudaFuncSetCacheConfig(func, cacheConfig not None : cudaFuncCache):
     Returns
     -------
     cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidDeviceFunction`2
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidDeviceFunction`
 
     See Also
     --------
@@ -26078,11 +26075,11 @@ def cudaFuncGetAttributes(func):
     """ Find out attributes for a given function.
 
     This function obtains the attributes of a function specified via
-    `func`. `func` is a device function symbol and must be declared as a
-    `__global__` function. The fetched attributes are placed in `attr`. If
-    the specified function does not exist, then it is assumed to be a
-    :py:obj:`~.cudaKernel_t` and used as is. For templated functions, pass
-    the function symbol as follows:
+    ``func``. ``func`` is a device function symbol and must be declared as
+    a ``__global__`` function. The fetched attributes are placed in
+    ``attr``. If the specified function does not exist, then it is assumed
+    to be a :py:obj:`~.cudaKernel_t` and used as is. For templated
+    functions, pass the function symbol as follows:
     func_name<template_arg_0,...,template_arg_N>
 
     Note that some function attributes such as
@@ -26097,7 +26094,7 @@ def cudaFuncGetAttributes(func):
     Returns
     -------
     cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidDeviceFunction`2
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidDeviceFunction`
     attr : :py:obj:`~.cudaFuncAttributes`
         Return pointer to function's attributes
 
@@ -26122,16 +26119,16 @@ def cudaFuncGetAttributes(func):
 def cudaFuncSetAttribute(func, attr not None : cudaFuncAttribute, int value):
     """ Set attributes for a given function.
 
-    This function sets the attributes of a function specified via `func`.
-    The parameter `func` must be a pointer to a function that executes on
-    the device. The parameter specified by `func` must be declared as a
-    `__global__` function. The enumeration defined by `attr` is set to the
-    value defined by `value`. If the specified function does not exist,
-    then it is assumed to be a :py:obj:`~.cudaKernel_t` and used as is. If
-    the specified attribute cannot be written, or if the value is
+    This function sets the attributes of a function specified via ``func``.
+    The parameter ``func`` must be a pointer to a function that executes on
+    the device. The parameter specified by ``func`` must be declared as a
+    ``__global__`` function. The enumeration defined by ``attr`` is set to
+    the value defined by ``value``. If the specified function does not
+    exist, then it is assumed to be a :py:obj:`~.cudaKernel_t` and used as
+    is. If the specified attribute cannot be written, or if the value is
     incorrect, then :py:obj:`~.cudaErrorInvalidValue` is returned.
 
-    Valid values for `attr` are:
+    Valid values for ``attr`` are:
 
     - :py:obj:`~.cudaFuncAttributeMaxDynamicSharedMemorySize` - The
       requested maximum size in bytes of dynamically-allocated shared
@@ -26209,8 +26206,8 @@ def cudaFuncSetAttribute(func, attr not None : cudaFuncAttribute, int value):
 def cudaFuncGetParamCount(func):
     """ Returns the number of parameters used by the function.
 
-    Queries the number of kernel parameters used by `func` and returns it
-    in `paramCount`.
+    Queries the number of kernel parameters used by ``func`` and returns it
+    in ``paramCount``.
 
     Parameters
     ----------
@@ -26512,7 +26509,7 @@ def cudaFuncSetSharedMemConfig(func, config not None : cudaSharedMemConfig):
 def cudaOccupancyMaxActiveBlocksPerMultiprocessor(func, int blockSize, size_t dynamicSMemSize):
     """ Returns occupancy for a device function.
 
-    Returns in `*numBlocks` the maximum number of active blocks per
+    Returns in ``*numBlocks`` the maximum number of active blocks per
     streaming multiprocessor for the device function.
 
     Parameters
@@ -26550,10 +26547,10 @@ def cudaOccupancyMaxActiveBlocksPerMultiprocessor(func, int blockSize, size_t dy
 
 @cython.embedsignature(True)
 def cudaOccupancyAvailableDynamicSMemPerBlock(func, int numBlocks, int blockSize):
-    """ Returns dynamic shared memory available per block when launching `numBlocks` blocks on SM.
+    """ Returns dynamic shared memory available per block when launching ``numBlocks`` blocks on SM.
 
-    Returns in `*dynamicSmemSize` the maximum size of dynamic shared memory
-    to allow `numBlocks` blocks per SM.
+    Returns in ``*dynamicSmemSize`` the maximum size of dynamic shared
+    memory to allow ``numBlocks`` blocks per SM.
 
     Parameters
     ----------
@@ -26592,10 +26589,10 @@ def cudaOccupancyAvailableDynamicSMemPerBlock(func, int numBlocks, int blockSize
 def cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(func, int blockSize, size_t dynamicSMemSize, unsigned int flags):
     """ Returns occupancy for a device function with the specified flags.
 
-    Returns in `*numBlocks` the maximum number of active blocks per
+    Returns in ``*numBlocks`` the maximum number of active blocks per
     streaming multiprocessor for the device function.
 
-    The `flags` parameter controls how special cases are handled. Valid
+    The ``flags`` parameter controls how special cases are handled. Valid
     flags include:
 
     - :py:obj:`~.cudaOccupancyDefault`: keeps the default behavior as
@@ -26649,20 +26646,20 @@ def cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(func, int blockSize,
 def cudaMallocManaged(size_t size, unsigned int flags):
     """ Allocates memory that will be automatically managed by the Unified Memory system.
 
-    Allocates `size` bytes of managed memory on the device and returns in
-    `*devPtr` a pointer to the allocated memory. If the device doesn't
+    Allocates ``size`` bytes of managed memory on the device and returns in
+    ``*devPtr`` a pointer to the allocated memory. If the device doesn't
     support allocating managed memory, :py:obj:`~.cudaErrorNotSupported` is
     returned. Support for managed memory can be queried using the device
     attribute :py:obj:`~.cudaDevAttrManagedMemory`. The allocated memory is
     suitably aligned for any kind of variable. The memory is not cleared.
-    If `size` is 0, :py:obj:`~.cudaMallocManaged` returns
+    If ``size`` is 0, :py:obj:`~.cudaMallocManaged` returns
     :py:obj:`~.cudaErrorInvalidValue`. The pointer is valid on the CPU and
     on all GPUs in the system that support managed memory. All accesses to
     this pointer must obey the Unified Memory programming model.
 
-    `flags` specifies the default stream association for this allocation.
-    `flags` must be one of :py:obj:`~.cudaMemAttachGlobal` or
-    :py:obj:`~.cudaMemAttachHost`. The default value for `flags` is
+    ``flags`` specifies the default stream association for this allocation.
+    ``flags`` must be one of :py:obj:`~.cudaMemAttachGlobal` or
+    :py:obj:`~.cudaMemAttachHost`. The default value for ``flags`` is
     :py:obj:`~.cudaMemAttachGlobal`. If :py:obj:`~.cudaMemAttachGlobal` is
     specified, then this memory is accessible from any stream on any
     device. If :py:obj:`~.cudaMemAttachHost` is specified, then the
@@ -26783,14 +26780,14 @@ def cudaMallocManaged(size_t size, unsigned int flags):
 def cudaMalloc(size_t size):
     """ Allocate memory on the device.
 
-    Allocates `size` bytes of linear memory on the device and returns in
-    `*devPtr` a pointer to the allocated memory. The allocated memory is
+    Allocates ``size`` bytes of linear memory on the device and returns in
+    ``*devPtr`` a pointer to the allocated memory. The allocated memory is
     suitably aligned for any kind of variable. The memory is not cleared.
     :py:obj:`~.cudaMalloc()` returns :py:obj:`~.cudaErrorMemoryAllocation`
     in case of failure.
 
     The device version of :py:obj:`~.cudaFree` cannot be used with a
-    `*devPtr` allocated using the host API, and vice versa.
+    ``*devPtr`` allocated using the host API, and vice versa.
 
     Parameters
     ----------
@@ -26822,10 +26819,10 @@ def cudaMalloc(size_t size):
 def cudaMallocHost(size_t size):
     """ Allocates page-locked memory on the host.
 
-    Allocates `size` bytes of host memory that is page-locked and
+    Allocates ``size`` bytes of host memory that is page-locked and
     accessible to the device. The driver tracks the virtual memory ranges
     allocated with this function and automatically accelerates calls to
-    functions such as :py:obj:`~.cudaMemcpy`*(). Since the memory can be
+    functions such as :py:obj:`~.cudaMemcpy`\\*(). Since the memory can be
     accessed directly by the device, it can be read or written with much
     higher bandwidth than pageable memory obtained with functions such as
     :py:obj:`~.malloc()`.
@@ -26870,17 +26867,17 @@ def cudaMallocHost(size_t size):
 def cudaMallocPitch(size_t width, size_t height):
     """ Allocates pitched memory on the device.
 
-    Allocates at least `width` (in bytes) * `height` bytes of linear memory
-    on the device and returns in `*devPtr` a pointer to the allocated
-    memory. The function may pad the allocation to ensure that
+    Allocates at least ``width`` (in bytes) * ``height`` bytes of linear
+    memory on the device and returns in ``*devPtr`` a pointer to the
+    allocated memory. The function may pad the allocation to ensure that
     corresponding pointers in any given row will continue to meet the
     alignment requirements for coalescing as the address is updated from
-    row to row. The pitch returned in `*pitch` by
+    row to row. The pitch returned in ``*pitch`` by
     :py:obj:`~.cudaMallocPitch()` is the width in bytes of the allocation.
-    The intended usage of `pitch` is as a separate parameter of the
+    The intended usage of ``pitch`` is as a separate parameter of the
     allocation, used to compute addresses within the 2D array. Given the
-    row and column of an array element of type `T`, the address is computed
-    as:
+    row and column of an array element of type ``T``, the address is
+    computed as:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
@@ -26926,11 +26923,10 @@ def cudaMallocPitch(size_t width, size_t height):
 def cudaMallocArray(desc : Optional[cudaChannelFormatDesc], size_t width, size_t height, unsigned int flags):
     """ Allocate an array on the device.
 
-    Allocates a CUDA array according to the
-    :py:obj:`~.cudaChannelFormatDesc` structure `desc` and returns a handle
-    to the new CUDA array in `*array`.
+    Allocates a CUDA array according to the cudaChannelFormatDesc structure
+    ``desc`` and returns a handle to the new CUDA array in ``*array``.
 
-    The :py:obj:`~.cudaChannelFormatDesc` is defined as:
+    The cudaChannelFormatDesc is defined as:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
@@ -26939,7 +26935,7 @@ def cudaMallocArray(desc : Optional[cudaChannelFormatDesc], size_t width, size_t
     :py:obj:`~.cudaChannelFormatKindUnsigned`, or
     :py:obj:`~.cudaChannelFormatKindFloat`.
 
-    The `flags` parameter enables different options to be specified that
+    The ``flags`` parameter enables different options to be specified that
     affect the allocation, as follows.
 
     - :py:obj:`~.cudaArrayDefault`: This flag's value is defined to be 0
@@ -26963,7 +26959,7 @@ def cudaMallocArray(desc : Optional[cudaChannelFormatDesc], size_t width, size_t
       The physical backing memory must be allocated via
       :py:obj:`~.cuMemCreate`.
 
-    `width` and `height` must meet certain size requirements. See
+    ``width`` and ``height`` must meet certain size requirements. See
     :py:obj:`~.cudaMalloc3DArray()` for more details.
 
     Parameters
@@ -27003,7 +26999,7 @@ def cudaMallocArray(desc : Optional[cudaChannelFormatDesc], size_t width, size_t
 def cudaFree(devPtr):
     """ Frees memory on the device.
 
-    Frees the memory space pointed to by `devPtr`, which must have been
+    Frees the memory space pointed to by ``devPtr``, which must have been
     returned by a previous call to one of the following memory allocation
     APIs - :py:obj:`~.cudaMalloc()`, :py:obj:`~.cudaMallocPitch()`,
     :py:obj:`~.cudaMallocManaged()`, :py:obj:`~.cudaMallocAsync()`,
@@ -27018,13 +27014,13 @@ def cudaFree(devPtr):
     stream ordered memory allocator. For all other pointers, this API may
     perform implicit synchronization.
 
-    If :py:obj:`~.cudaFree`(`devPtr`) has already been called before, an
-    error is returned. If `devPtr` is 0, no operation is performed.
+    If :py:obj:`~.cudaFree`\\ (``devPtr``) has already been called before, an
+    error is returned. If ``devPtr`` is 0, no operation is performed.
     :py:obj:`~.cudaFree()` returns :py:obj:`~.cudaErrorValue` in case of
     failure.
 
     The device version of :py:obj:`~.cudaFree` cannot be used with a
-    `*devPtr` allocated using the host API, and vice versa.
+    ``*devPtr`` allocated using the host API, and vice versa.
 
     Parameters
     ----------
@@ -27054,7 +27050,7 @@ def cudaFree(devPtr):
 def cudaFreeHost(ptr):
     """ Frees page-locked memory.
 
-    Frees the memory space pointed to by `hostPtr`, which must have been
+    Frees the memory space pointed to by ``hostPtr``, which must have been
     returned by a previous call to :py:obj:`~.cudaMallocHost()` or
     :py:obj:`~.cudaHostAlloc()`.
 
@@ -27086,8 +27082,8 @@ def cudaFreeHost(ptr):
 def cudaFreeArray(array):
     """ Frees an array on the device.
 
-    Frees the CUDA array `array`, which must have been returned by a
-    previous call to :py:obj:`~.cudaMallocArray()`. If `devPtr` is 0, no
+    Frees the CUDA array ``array``, which must have been returned by a
+    previous call to :py:obj:`~.cudaMallocArray()`. If ``devPtr`` is 0, no
     operation is performed.
 
     Parameters
@@ -27123,9 +27119,9 @@ def cudaFreeArray(array):
 def cudaFreeMipmappedArray(mipmappedArray):
     """ Frees a mipmapped array on the device.
 
-    Frees the CUDA mipmapped array `mipmappedArray`, which must have been
+    Frees the CUDA mipmapped array ``mipmappedArray``, which must have been
     returned by a previous call to :py:obj:`~.cudaMallocMipmappedArray()`.
-    If `devPtr` is 0, no operation is performed.
+    If ``devPtr`` is 0, no operation is performed.
 
     Parameters
     ----------
@@ -27160,7 +27156,7 @@ def cudaFreeMipmappedArray(mipmappedArray):
 def cudaHostAlloc(size_t size, unsigned int flags):
     """ Allocates page-locked memory on the host.
 
-    Allocates `size` bytes of host memory that is page-locked and
+    Allocates ``size`` bytes of host memory that is page-locked and
     accessible to the device. The driver tracks the virtual memory ranges
     allocated with this function and automatically accelerates calls to
     functions such as :py:obj:`~.cudaMemcpy()`. Since the memory can be
@@ -27172,7 +27168,7 @@ def cudaHostAlloc(size_t size, unsigned int flags):
     used sparingly to allocate staging areas for data exchange between host
     and device.
 
-    The `flags` parameter enables different options to be specified that
+    The ``flags`` parameter enables different options to be specified that
     affect the allocation, as follows.
 
     - :py:obj:`~.cudaHostAllocDefault`: This flag's value is defined to be
@@ -27245,10 +27241,10 @@ def cudaHostAlloc(size_t size, unsigned int flags):
 def cudaHostRegister(ptr, size_t size, unsigned int flags):
     """ Registers an existing host memory range for use by CUDA.
 
-    Page-locks the memory range specified by `ptr` and `size` and maps it
-    for the device(s) as specified by `flags`. This memory range also is
-    added to the same tracking mechanism as :py:obj:`~.cudaHostAlloc()` to
-    automatically accelerate calls to functions such as
+    Page-locks the memory range specified by ``ptr`` and ``size`` and maps
+    it for the device(s) as specified by ``flags``. This memory range also
+    is added to the same tracking mechanism as :py:obj:`~.cudaHostAlloc()`
+    to automatically accelerate calls to functions such as
     :py:obj:`~.cudaMemcpy()`. Since the memory can be accessed directly by
     the device, it can be read or written with much higher bandwidth than
     pageable memory that has not been registered. Page-locking excessive
@@ -27259,13 +27255,13 @@ def cudaHostRegister(ptr, size_t size, unsigned int flags):
 
     On systems where :py:obj:`~.pageableMemoryAccessUsesHostPageTables` is
     true, :py:obj:`~.cudaHostRegister` will not page-lock the memory range
-    specified by `ptr` but only populate unpopulated pages.
+    specified by ``ptr`` but only populate unpopulated pages.
 
     :py:obj:`~.cudaHostRegister` is supported only on I/O coherent devices
     that have a non-zero value for the device attribute
     :py:obj:`~.cudaDevAttrHostRegisterSupported`.
 
-    The `flags` parameter enables different options to be specified that
+    The ``flags`` parameter enables different options to be specified that
     affect the allocation, as follows.
 
     - :py:obj:`~.cudaHostRegisterDefault`: On a system with unified virtual
@@ -27313,16 +27309,16 @@ def cudaHostRegister(ptr, size_t size, unsigned int flags):
 
     For devices that have a non-zero value for the device attribute
     :py:obj:`~.cudaDevAttrCanUseHostPointerForRegisteredMem`, the memory
-    can also be accessed from the device using the host pointer `ptr`. The
-    device pointer returned by :py:obj:`~.cudaHostGetDevicePointer()` may
-    or may not match the original host pointer `ptr` and depends on the
-    devices visible to the application. If all devices visible to the
+    can also be accessed from the device using the host pointer ``ptr``.
+    The device pointer returned by :py:obj:`~.cudaHostGetDevicePointer()`
+    may or may not match the original host pointer ``ptr`` and depends on
+    the devices visible to the application. If all devices visible to the
     application have a non-zero value for the device attribute, the device
     pointer returned by :py:obj:`~.cudaHostGetDevicePointer()` will match
-    the original pointer `ptr`. If any device visible to the application
+    the original pointer ``ptr``. If any device visible to the application
     has a zero value for the device attribute, the device pointer returned
     by :py:obj:`~.cudaHostGetDevicePointer()` will not match the original
-    host pointer `ptr`, but it will be suitable for use on all devices
+    host pointer ``ptr``, but it will be suitable for use on all devices
     provided Unified Virtual Addressing is enabled. In such systems, it is
     valid to access the memory using either pointer on devices that have a
     non-zero value for the device attribute. Note however that such devices
@@ -27363,7 +27359,7 @@ def cudaHostRegister(ptr, size_t size, unsigned int flags):
 def cudaHostUnregister(ptr):
     """ Unregisters a memory range that was registered with cudaHostRegister.
 
-    Unmaps the memory range whose base address is specified by `ptr`, and
+    Unmaps the memory range whose base address is specified by ``ptr``, and
     makes it pageable again.
 
     The base address must be the same one specified to
@@ -27408,22 +27404,23 @@ def cudaHostGetDevicePointer(pHost, unsigned int flags):
 
     For devices that have a non-zero value for the device attribute
     :py:obj:`~.cudaDevAttrCanUseHostPointerForRegisteredMem`, the memory
-    can also be accessed from the device using the host pointer `pHost`.
+    can also be accessed from the device using the host pointer ``pHost``.
     The device pointer returned by :py:obj:`~.cudaHostGetDevicePointer()`
-    may or may not match the original host pointer `pHost` and depends on
+    may or may not match the original host pointer ``pHost`` and depends on
     the devices visible to the application. If all devices visible to the
     application have a non-zero value for the device attribute, the device
     pointer returned by :py:obj:`~.cudaHostGetDevicePointer()` will match
-    the original pointer `pHost`. If any device visible to the application
-    has a zero value for the device attribute, the device pointer returned
-    by :py:obj:`~.cudaHostGetDevicePointer()` will not match the original
-    host pointer `pHost`, but it will be suitable for use on all devices
-    provided Unified Virtual Addressing is enabled. In such systems, it is
-    valid to access the memory using either pointer on devices that have a
-    non-zero value for the device attribute. Note however that such devices
-    should access the memory using only of the two pointers and not both.
+    the original pointer ``pHost``. If any device visible to the
+    application has a zero value for the device attribute, the device
+    pointer returned by :py:obj:`~.cudaHostGetDevicePointer()` will not
+    match the original host pointer ``pHost``, but it will be suitable for
+    use on all devices provided Unified Virtual Addressing is enabled. In
+    such systems, it is valid to access the memory using either pointer on
+    devices that have a non-zero value for the device attribute. Note
+    however that such devices should access the memory using only one of
+    the two pointers and not both.
 
-    `flags` provides for future releases. For now, it must be set to 0.
+    ``flags`` is reserved for future releases. For now, it must be set to 0.
 
     Parameters
     ----------
@@ -27496,16 +27493,16 @@ def cudaHostGetFlags(pHost):
 def cudaMalloc3D(extent not None : cudaExtent):
     """ Allocates logical 1D, 2D, or 3D memory objects on the device.
 
-    Allocates at least `width` * `height` * `depth` bytes of linear memory
-    on the device and returns a :py:obj:`~.cudaPitchedPtr` in which `ptr`
-    is a pointer to the allocated memory. The function may pad the
-    allocation to ensure hardware alignment requirements are met. The pitch
-    returned in the `pitch` field of `pitchedDevPtr` is the width in bytes
-    of the allocation.
+    Allocates at least ``width`` * ``height`` * ``depth`` bytes of linear
+    memory on the device and returns a cudaPitchedPtr in which ``ptr`` is a
+    pointer to the allocated memory. The function may pad the allocation to
+    ensure hardware alignment requirements are met. The pitch returned in
+    the ``pitch`` field of ``pitchedDevPtr`` is the width in bytes of the
+    allocation.
 
-    The returned :py:obj:`~.cudaPitchedPtr` contains additional fields
-    `xsize` and `ysize`, the logical width and height of the allocation,
-    which are equivalent to the `width` and `height` `extent` parameters
+    The returned cudaPitchedPtr contains additional fields ``xsize`` and
+    ``ysize``, the logical width and height of the allocation, which are
+    equivalent to the ``width`` and ``height`` ``extent`` parameters
     provided by the programmer during allocation.
 
     For allocations of 2D and 3D objects, it is highly recommended that
@@ -27518,7 +27515,7 @@ def cudaMalloc3D(extent not None : cudaExtent):
     Parameters
     ----------
     extent : :py:obj:`~.cudaExtent`
-        Requested allocation size (`width` field in bytes)
+        Requested allocation size (``width`` field in bytes)
 
     Returns
     -------
@@ -27529,7 +27526,7 @@ def cudaMalloc3D(extent not None : cudaExtent):
 
     See Also
     --------
-    :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaMemcpy3D`, :py:obj:`~.cudaMemset3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaMallocArray`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, make_cudaPitchedPtr, make_cudaExtent, :py:obj:`~.cuMemAllocPitch`
+    :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaMemcpy3D`, :py:obj:`~.cudaMemset3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaMallocArray`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, :py:obj:`~.make_cudaPitchedPtr`, :py:obj:`~.make_cudaExtent`, :py:obj:`~.cuMemAllocPitch`
     """
     cdef cudaPitchedPtr pitchedDevPtr = cudaPitchedPtr()
     with nogil:
@@ -27545,11 +27542,10 @@ def cudaMalloc3D(extent not None : cudaExtent):
 def cudaMalloc3DArray(desc : Optional[cudaChannelFormatDesc], extent not None : cudaExtent, unsigned int flags):
     """ Allocate an array on the device.
 
-    Allocates a CUDA array according to the
-    :py:obj:`~.cudaChannelFormatDesc` structure `desc` and returns a handle
-    to the new CUDA array in `*array`.
+    Allocates a CUDA array according to the cudaChannelFormatDesc structure
+    ``desc`` and returns a handle to the new CUDA array in ``*array``.
 
-    The :py:obj:`~.cudaChannelFormatDesc` is defined as:
+    The cudaChannelFormatDesc is defined as:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
@@ -27590,7 +27586,7 @@ def cudaMalloc3DArray(desc : Optional[cudaChannelFormatDesc], extent not None :
       layers represent the first cubemap, the next six layers form the
       second cubemap, and so on.
 
-    The `flags` parameter enables different options to be specified that
+    The ``flags`` parameter enables different options to be specified that
     affect the allocation, as follows.
 
     - :py:obj:`~.cudaArrayDefault`: This flag's value is defined to be 0
@@ -27638,7 +27634,7 @@ def cudaMalloc3DArray(desc : Optional[cudaChannelFormatDesc], extent not None :
     desc : :py:obj:`~.cudaChannelFormatDesc`
         Requested channel format
     extent : :py:obj:`~.cudaExtent`
-        Requested allocation size (`width` field in elements)
+        Requested allocation size (``width`` field in elements)
     flags : unsigned int
         Flags for extensions
 
@@ -27651,7 +27647,7 @@ def cudaMalloc3DArray(desc : Optional[cudaChannelFormatDesc], extent not None :
 
     See Also
     --------
-    :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, make_cudaExtent, :py:obj:`~.cuArray3DCreate`
+    :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, :py:obj:`~.make_cudaExtent`, :py:obj:`~.cuArray3DCreate`
     """
     cdef cudaArray_t array = cudaArray_t()
     cdef cyruntime.cudaChannelFormatDesc* cydesc_ptr = <cyruntime.cudaChannelFormatDesc*>desc._pvt_ptr if desc is not None else NULL
@@ -27668,13 +27664,13 @@ def cudaMalloc3DArray(desc : Optional[cudaChannelFormatDesc], extent not None :
 def cudaMallocMipmappedArray(desc : Optional[cudaChannelFormatDesc], extent not None : cudaExtent, unsigned int numLevels, unsigned int flags):
     """ Allocate a mipmapped array on the device.
 
-    Allocates a CUDA mipmapped array according to the
-    :py:obj:`~.cudaChannelFormatDesc` structure `desc` and returns a handle
-    to the new CUDA mipmapped array in `*mipmappedArray`. `numLevels`
-    specifies the number of mipmap levels to be allocated. This value is
-    clamped to the range [1, 1 + floor(log2(max(width, height, depth)))].
+    Allocates a CUDA mipmapped array according to the cudaChannelFormatDesc
+    structure ``desc`` and returns a handle to the new CUDA mipmapped array
+    in ``*mipmappedArray``. ``numLevels`` specifies the number of mipmap
+    levels to be allocated. This value is clamped to the range [1, 1 +
+    floor(log2(max(width, height, depth)))].
 
-    The :py:obj:`~.cudaChannelFormatDesc` is defined as:
+    The cudaChannelFormatDesc is defined as:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
@@ -27716,7 +27712,7 @@ def cudaMallocMipmappedArray(desc : Optional[cudaChannelFormatDesc], extent not
       cubemap mipmapped array, the next six layers form the second cubemap
       mipmapped array, and so on.
 
-    The `flags` parameter enables different options to be specified that
+    The ``flags`` parameter enables different options to be specified that
     affect the allocation, as follows.
 
     - :py:obj:`~.cudaArrayDefault`: This flag's value is defined to be 0
@@ -27762,7 +27758,7 @@ def cudaMallocMipmappedArray(desc : Optional[cudaChannelFormatDesc], extent not
     desc : :py:obj:`~.cudaChannelFormatDesc`
         Requested channel format
     extent : :py:obj:`~.cudaExtent`
-        Requested allocation size (`width` field in elements)
+        Requested allocation size (``width`` field in elements)
     numLevels : unsigned int
         Number of mipmap levels to allocate
     flags : unsigned int
@@ -27777,7 +27773,7 @@ def cudaMallocMipmappedArray(desc : Optional[cudaChannelFormatDesc], extent not
 
     See Also
     --------
-    :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, make_cudaExtent, :py:obj:`~.cuMipmappedArrayCreate`
+    :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, :py:obj:`~.make_cudaExtent`, :py:obj:`~.cuMipmappedArrayCreate`
     """
     cdef cudaMipmappedArray_t mipmappedArray = cudaMipmappedArray_t()
     cdef cyruntime.cudaChannelFormatDesc* cydesc_ptr = <cyruntime.cudaChannelFormatDesc*>desc._pvt_ptr if desc is not None else NULL
@@ -27794,14 +27790,14 @@ def cudaMallocMipmappedArray(desc : Optional[cudaChannelFormatDesc], extent not
 def cudaGetMipmappedArrayLevel(mipmappedArray, unsigned int level):
     """ Gets a mipmap level of a CUDA mipmapped array.
 
-    Returns in `*levelArray` a CUDA array that represents a single mipmap
-    level of the CUDA mipmapped array `mipmappedArray`.
+    Returns in ``*levelArray`` a CUDA array that represents a single mipmap
+    level of the CUDA mipmapped array ``mipmappedArray``.
 
-    If `level` is greater than the maximum number of levels in this
+    If ``level`` is greater than the maximum number of levels in this
     mipmapped array, :py:obj:`~.cudaErrorInvalidValue` is returned.
 
-    If `mipmappedArray` is NULL, :py:obj:`~.cudaErrorInvalidResourceHandle`
-    is returned.
+    If ``mipmappedArray`` is NULL,
+    :py:obj:`~.cudaErrorInvalidResourceHandle` is returned.
 
     Parameters
     ----------
@@ -27819,7 +27815,7 @@ def cudaGetMipmappedArrayLevel(mipmappedArray, unsigned int level):
 
     See Also
     --------
-    :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, make_cudaExtent, :py:obj:`~.cuMipmappedArrayGetLevel`
+    :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, :py:obj:`~.make_cudaExtent`, :py:obj:`~.cuMipmappedArrayGetLevel`
     """
     cdef cyruntime.cudaMipmappedArray_const_t cymipmappedArray
     if mipmappedArray is None:
@@ -27848,28 +27844,28 @@ def cudaMemcpy3D(p : Optional[cudaMemcpy3DParms]):
     :py:obj:`~.cudaMemcpy3D()` copies data betwen two 3D objects. The
     source and destination objects may be in either host memory, device
     memory, or a CUDA array. The source, destination, extent, and kind of
-    copy performed is specified by the :py:obj:`~.cudaMemcpy3DParms` struct
-    which should be initialized to zero before use:
+    copy performed is specified by the cudaMemcpy3DParms struct which
+    should be initialized to zero before use:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
     The struct passed to :py:obj:`~.cudaMemcpy3D()` must specify one of
-    `srcArray` or `srcPtr` and one of `dstArray` or `dstPtr`. Passing more
-    than one non-zero source or destination will cause
+    ``srcArray`` or ``srcPtr`` and one of ``dstArray`` or ``dstPtr``.
+    Passing more than one non-zero source or destination will cause
     :py:obj:`~.cudaMemcpy3D()` to return an error.
 
-    The `srcPos` and `dstPos` fields are optional offsets into the source
-    and destination objects and are defined in units of each object's
-    elements. The element for a host or device pointer is assumed to be
-    unsigned char.
+    The ``srcPos`` and ``dstPos`` fields are optional offsets into the
+    source and destination objects and are defined in units of each
+    object's elements. The element for a host or device pointer is assumed
+    to be unsigned char.
 
-    The `extent` field defines the dimensions of the transferred area in
+    The ``extent`` field defines the dimensions of the transferred area in
     elements. If a CUDA array is participating in the copy, the extent is
     defined in terms of that array's elements. If no CUDA array is
     participating in the copy then the extents are defined in elements of
     unsigned char.
 
-    The `kind` field defines the direction of the copy. It must be one of
+    The ``kind`` field defines the direction of the copy. It must be one of
     :py:obj:`~.cudaMemcpyHostToHost`, :py:obj:`~.cudaMemcpyHostToDevice`,
     :py:obj:`~.cudaMemcpyDeviceToHost`,
     :py:obj:`~.cudaMemcpyDeviceToDevice`, or :py:obj:`~.cudaMemcpyDefault`.
@@ -27892,14 +27888,14 @@ def cudaMemcpy3D(p : Optional[cudaMemcpy3DParms]):
     source and destination objects are specified, undefined behavior will
     result.
 
-    The source object must entirely contain the region defined by `srcPos`
-    and `extent`. The destination object must entirely contain the region
-    defined by `dstPos` and `extent`.
+    The source object must entirely contain the region defined by
+    ``srcPos`` and ``extent``. The destination object must entirely contain
+    the region defined by ``dstPos`` and ``extent``.
 
-    :py:obj:`~.cudaMemcpy3D()` returns an error if the pitch of `srcPtr` or
-    `dstPtr` exceeds the maximum allowed. The pitch of a
-    :py:obj:`~.cudaPitchedPtr` allocated with :py:obj:`~.cudaMalloc3D()`
-    will always be valid.
+    :py:obj:`~.cudaMemcpy3D()` returns an error if the pitch of ``srcPtr``
+    or ``dstPtr`` exceeds the maximum allowed. The pitch of a
+    cudaPitchedPtr allocated with :py:obj:`~.cudaMalloc3D()` will always be
+    valid.
 
     Parameters
     ----------
@@ -27913,7 +27909,7 @@ def cudaMemcpy3D(p : Optional[cudaMemcpy3DParms]):
 
     See Also
     --------
-    :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaMemset3D`, :py:obj:`~.cudaMemcpy3DAsync`, :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpy2DFromArray`, :py:obj:`~.cudaMemcpy2DArrayToArray`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy2DAsync`, :py:obj:`~.cudaMemcpy2DToArrayAsync`, :py:obj:`~.cudaMemcpy2DFromArrayAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`, make_cudaExtent, make_cudaPos, :py:obj:`~.cuMemcpy3D`
+    :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaMemset3D`, :py:obj:`~.cudaMemcpy3DAsync`, :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpy2DFromArray`, :py:obj:`~.cudaMemcpy2DArrayToArray`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy2DAsync`, :py:obj:`~.cudaMemcpy2DToArrayAsync`, :py:obj:`~.cudaMemcpy2DFromArrayAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`, :py:obj:`~.make_cudaExtent`, :py:obj:`~.make_cudaPos`, :py:obj:`~.cuMemcpy3D`
     """
     cdef cyruntime.cudaMemcpy3DParms* cyp_ptr = <cyruntime.cudaMemcpy3DParms*>p._pvt_ptr if p is not None else NULL
     with nogil:
@@ -27927,9 +27923,9 @@ def cudaMemcpy3D(p : Optional[cudaMemcpy3DParms]):
 def cudaMemcpy3DPeer(p : Optional[cudaMemcpy3DPeerParms]):
     """ Copies memory between devices.
 
-    Perform a 3D memory copy according to the parameters specified in `p`.
-    See the definition of the :py:obj:`~.cudaMemcpy3DPeerParms` structure
-    for documentation of its parameters.
+    Perform a 3D memory copy according to the parameters specified in
+    ``p``. See the definition of the cudaMemcpy3DPeerParms structure for
+    documentation of its parameters.
 
     Note that this function is synchronous with respect to the host only if
     the source or destination of the transfer is host memory. Note also
@@ -27969,29 +27965,29 @@ def cudaMemcpy3DAsync(p : Optional[cudaMemcpy3DParms], stream):
     :py:obj:`~.cudaMemcpy3DAsync()` copies data betwen two 3D objects. The
     source and destination objects may be in either host memory, device
     memory, or a CUDA array. The source, destination, extent, and kind of
-    copy performed is specified by the :py:obj:`~.cudaMemcpy3DParms` struct
-    which should be initialized to zero before use:
+    copy performed is specified by the cudaMemcpy3DParms struct which
+    should be initialized to zero before use:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
     The struct passed to :py:obj:`~.cudaMemcpy3DAsync()` must specify one
-    of `srcArray` or `srcPtr` and one of `dstArray` or `dstPtr`. Passing
-    more than one non-zero source or destination will cause
+    of ``srcArray`` or ``srcPtr`` and one of ``dstArray`` or ``dstPtr``.
+    Passing more than one non-zero source or destination will cause
     :py:obj:`~.cudaMemcpy3DAsync()` to return an error.
 
-    The `srcPos` and `dstPos` fields are optional offsets into the source
-    and destination objects and are defined in units of each object's
-    elements. The element for a host or device pointer is assumed to be
-    unsigned char. For CUDA arrays, positions must be in the range [0,
-    2048) for any dimension.
+    The ``srcPos`` and ``dstPos`` fields are optional offsets into the
+    source and destination objects and are defined in units of each
+    object's elements. The element for a host or device pointer is assumed
+    to be unsigned char. For CUDA arrays, positions must be in the range
+    [0, 2048) for any dimension.
 
-    The `extent` field defines the dimensions of the transferred area in
+    The ``extent`` field defines the dimensions of the transferred area in
     elements. If a CUDA array is participating in the copy, the extent is
     defined in terms of that array's elements. If no CUDA array is
     participating in the copy then the extents are defined in elements of
     unsigned char.
 
-    The `kind` field defines the direction of the copy. It must be one of
+    The ``kind`` field defines the direction of the copy. It must be one of
     :py:obj:`~.cudaMemcpyHostToHost`, :py:obj:`~.cudaMemcpyHostToDevice`,
     :py:obj:`~.cudaMemcpyDeviceToHost`,
     :py:obj:`~.cudaMemcpyDeviceToDevice`, or :py:obj:`~.cudaMemcpyDefault`.
@@ -28015,19 +28011,19 @@ def cudaMemcpy3DAsync(p : Optional[cudaMemcpy3DParms], stream):
     result.
 
     The source object must lie entirely within the region defined by
-    `srcPos` and `extent`. The destination object must lie entirely within
-    the region defined by `dstPos` and `extent`.
+    ``srcPos`` and ``extent``. The destination object must lie entirely
+    within the region defined by ``dstPos`` and ``extent``.
 
     :py:obj:`~.cudaMemcpy3DAsync()` returns an error if the pitch of
-    `srcPtr` or `dstPtr` exceeds the maximum allowed. The pitch of a
-    :py:obj:`~.cudaPitchedPtr` allocated with :py:obj:`~.cudaMalloc3D()`
-    will always be valid.
+    ``srcPtr`` or ``dstPtr`` exceeds the maximum allowed. The pitch of a
+    cudaPitchedPtr allocated with :py:obj:`~.cudaMalloc3D()` will always be
+    valid.
 
     :py:obj:`~.cudaMemcpy3DAsync()` is asynchronous with respect to the
     host, so the call may return before the copy is complete. The copy can
-    optionally be associated to a stream by passing a non-zero `stream`
-    argument. If `kind` is :py:obj:`~.cudaMemcpyHostToDevice` or
-    :py:obj:`~.cudaMemcpyDeviceToHost` and `stream` is non-zero, the copy
+    optionally be associated to a stream by passing a non-zero ``stream``
+    argument. If ``kind`` is :py:obj:`~.cudaMemcpyHostToDevice` or
+    :py:obj:`~.cudaMemcpyDeviceToHost` and ``stream`` is non-zero, the copy
     may overlap with operations in other streams.
 
     The device version of this function only handles device to device
@@ -28047,7 +28043,7 @@ def cudaMemcpy3DAsync(p : Optional[cudaMemcpy3DParms], stream):
 
     See Also
     --------
-    :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaMemset3D`, :py:obj:`~.cudaMemcpy3D`, :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpy2DToArray`, ::::py:obj:`~.cudaMemcpy2DFromArray`, :py:obj:`~.cudaMemcpy2DArrayToArray`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy2DAsync`, :py:obj:`~.cudaMemcpy2DToArrayAsync`, :py:obj:`~.cudaMemcpy2DFromArrayAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`, make_cudaExtent, make_cudaPos, :py:obj:`~.cuMemcpy3DAsync`
+    :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaMemset3D`, :py:obj:`~.cudaMemcpy3D`, :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpy2DFromArray`, :py:obj:`~.cudaMemcpy2DArrayToArray`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy2DAsync`, :py:obj:`~.cudaMemcpy2DToArrayAsync`, :py:obj:`~.cudaMemcpy2DFromArrayAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`, :py:obj:`~.make_cudaExtent`, :py:obj:`~.make_cudaPos`, :py:obj:`~.cuMemcpy3DAsync`
     """
     cdef cyruntime.cudaStream_t cystream
     if stream is None:
@@ -28069,9 +28065,9 @@ def cudaMemcpy3DAsync(p : Optional[cudaMemcpy3DParms], stream):
 def cudaMemcpy3DPeerAsync(p : Optional[cudaMemcpy3DPeerParms], stream):
     """ Copies memory between devices asynchronously.
 
-    Perform a 3D memory copy according to the parameters specified in `p`.
-    See the definition of the :py:obj:`~.cudaMemcpy3DPeerParms` structure
-    for documentation of its parameters.
+    Perform a 3D memory copy according to the parameters specified in
+    ``p``. See the definition of the cudaMemcpy3DPeerParms structure for
+    documentation of its parameters.
 
     Parameters
     ----------
@@ -28109,15 +28105,15 @@ def cudaMemcpy3DPeerAsync(p : Optional[cudaMemcpy3DPeerParms], stream):
 def cudaMemGetInfo():
     """ Gets free and total device memory.
 
-    Returns in `*total` the total amount of memory available to the the
-    current context. Returns in `*free` the amount of memory on the device
-    that is free according to the OS. CUDA is not guaranteed to be able to
-    allocate all of the memory that the OS reports as free. In a multi-
-    tenet situation, free estimate returned is prone to race condition
-    where a new allocation/free done by a different process or a different
-    thread in the same process between the time when free memory was
-    estimated and reported, will result in deviation in free value reported
-    and actual free memory.
+    Returns in ``*total`` the total amount of memory available to the
+    current context. Returns in ``*free`` the amount of memory on the
+    device that is free according to the OS. CUDA is not guaranteed to be
+    able to allocate all of the memory that the OS reports as free. In a
+    multi-tenant situation, free estimate returned is prone to race
+    condition where a new allocation/free done by a different process or a
+    different thread in the same process between the time when free memory
+    was estimated and reported, will result in deviation in free value
+    reported and actual free memory.
 
     The integrated GPU on Tegra shares memory with CPU and other component
     of the SoC. The free and total values returned by the API excludes the
@@ -28154,10 +28150,10 @@ def cudaMemGetInfo():
 def cudaArrayGetInfo(array):
     """ Gets info about the specified cudaArray.
 
-    Returns in `*desc`, `*extent` and `*flags` respectively, the type,
-    shape and flags of `array`.
+    Returns in ``*desc``, ``*extent`` and ``*flags`` respectively, the
+    type, shape and flags of ``array``.
 
-    Any of `*desc`, `*extent` and `*flags` may be specified as NULL.
+    Any of ``*desc``, ``*extent`` and ``*flags`` may be specified as NULL.
 
     Parameters
     ----------
@@ -28203,21 +28199,22 @@ def cudaArrayGetInfo(array):
 def cudaArrayGetPlane(hArray, unsigned int planeIdx):
     """ Gets a CUDA array plane from a CUDA array.
 
-    Returns in `pPlaneArray` a CUDA array that represents a single format
-    plane of the CUDA array `hArray`.
+    Returns in ``pPlaneArray`` a CUDA array that represents a single format
+    plane of the CUDA array ``hArray``.
 
-    If `planeIdx` is greater than the maximum number of planes in this
+    If ``planeIdx`` is greater than the maximum number of planes in this
     array or if the array does not have a multi-planar format e.g:
     :py:obj:`~.cudaChannelFormatKindNV12`, then
     :py:obj:`~.cudaErrorInvalidValue` is returned.
 
-    Note that if the `hArray` has format
-    :py:obj:`~.cudaChannelFormatKindNV12`, then passing in 0 for `planeIdx`
-    returns a CUDA array of the same size as `hArray` but with one 8-bit
-    channel and :py:obj:`~.cudaChannelFormatKindUnsigned` as its format
-    kind. If 1 is passed for `planeIdx`, then the returned CUDA array has
-    half the height and width of `hArray` with two 8-bit channels and
-    :py:obj:`~.cudaChannelFormatKindUnsigned` as its format kind.
+    Note that if the ``hArray`` has format
+    :py:obj:`~.cudaChannelFormatKindNV12`, then passing in 0 for
+    ``planeIdx`` returns a CUDA array of the same size as ``hArray`` but
+    with one 8-bit channel and :py:obj:`~.cudaChannelFormatKindUnsigned` as
+    its format kind. If 1 is passed for ``planeIdx``, then the returned
+    CUDA array has half the height and width of ``hArray`` with two 8-bit
+    channels and :py:obj:`~.cudaChannelFormatKindUnsigned` as its format
+    kind.
 
     Parameters
     ----------
@@ -28231,7 +28228,7 @@ def cudaArrayGetPlane(hArray, unsigned int planeIdx):
     cudaError_t
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` :py:obj:`~.cudaErrorInvalidResourceHandle`
     pPlaneArray : :py:obj:`~.cudaArray_t`
-        Returned CUDA array referenced by the `planeIdx`
+        Returned CUDA array referenced by the ``planeIdx``
 
     See Also
     --------
@@ -28259,8 +28256,8 @@ def cudaArrayGetPlane(hArray, unsigned int planeIdx):
 def cudaArrayGetMemoryRequirements(array, int device):
     """ Returns the memory requirements of a CUDA array.
 
-    Returns the memory requirements of a CUDA array in `memoryRequirements`
-    If the CUDA array is not allocated with flag
+    Returns the memory requirements of a CUDA array in
+    ``memoryRequirements``. If the CUDA array is not allocated with flag
     :py:obj:`~.cudaArrayDeferredMapping` :py:obj:`~.cudaErrorInvalidValue`
     will be returned.
 
@@ -28281,7 +28278,7 @@ def cudaArrayGetMemoryRequirements(array, int device):
     cudaError_t
         :py:obj:`~.cudaSuccess` :py:obj:`~.cudaErrorInvalidValue`
     memoryRequirements : :py:obj:`~.cudaArrayMemoryRequirements`
-        Pointer to :py:obj:`~.cudaArrayMemoryRequirements`
+        Pointer to cudaArrayMemoryRequirements
 
     See Also
     --------
@@ -28310,8 +28307,8 @@ def cudaMipmappedArrayGetMemoryRequirements(mipmap, int device):
     """ Returns the memory requirements of a CUDA mipmapped array.
 
     Returns the memory requirements of a CUDA mipmapped array in
-    `memoryRequirements` If the CUDA mipmapped array is not allocated with
-    flag :py:obj:`~.cudaArrayDeferredMapping`
+    ``memoryRequirements``. If the CUDA mipmapped array is not allocated
+    with flag :py:obj:`~.cudaArrayDeferredMapping`
     :py:obj:`~.cudaErrorInvalidValue` will be returned.
 
     The returned value in :py:obj:`~.cudaArrayMemoryRequirements.size`
@@ -28331,7 +28328,7 @@ def cudaMipmappedArrayGetMemoryRequirements(mipmap, int device):
     cudaError_t
         :py:obj:`~.cudaSuccess` :py:obj:`~.cudaErrorInvalidValue`
     memoryRequirements : :py:obj:`~.cudaArrayMemoryRequirements`
-        Pointer to :py:obj:`~.cudaArrayMemoryRequirements`
+        Pointer to cudaArrayMemoryRequirements
 
     See Also
     --------
@@ -28360,7 +28357,7 @@ def cudaArrayGetSparseProperties(array):
     """ Returns the layout properties of a sparse CUDA array.
 
     Returns the layout properties of a sparse CUDA array in
-    `sparseProperties`. If the CUDA array is not allocated with flag
+    ``sparseProperties``. If the CUDA array is not allocated with flag
     :py:obj:`~.cudaArraySparse` :py:obj:`~.cudaErrorInvalidValue` will be
     returned.
 
@@ -28369,13 +28366,13 @@ def cudaArrayGetSparseProperties(array):
     :py:obj:`~.cudaArraySparseProperties.miptailSize` represents the total
     size of the array. Otherwise, it will be zero. Also, the returned value
     in :py:obj:`~.cudaArraySparseProperties.miptailFirstLevel` is always
-    zero. Note that the `array` must have been allocated using
+    zero. Note that the ``array`` must have been allocated using
     :py:obj:`~.cudaMallocArray` or :py:obj:`~.cudaMalloc3DArray`. For CUDA
     arrays obtained using :py:obj:`~.cudaMipmappedArrayGetLevel`,
     :py:obj:`~.cudaErrorInvalidValue` will be returned. Instead,
     :py:obj:`~.cudaMipmappedArrayGetSparseProperties` must be used to
     obtain the sparse properties of the entire CUDA mipmapped array to
-    which `array` belongs to.
+    which ``array`` belongs to.
 
     Parameters
     ----------
@@ -28387,7 +28384,7 @@ def cudaArrayGetSparseProperties(array):
     cudaError_t
         :py:obj:`~.cudaSuccess` :py:obj:`~.cudaErrorInvalidValue`
     sparseProperties : :py:obj:`~.cudaArraySparseProperties`
-        Pointer to return the :py:obj:`~.cudaArraySparseProperties`
+        Pointer to return the cudaArraySparseProperties
 
     See Also
     --------
@@ -28415,7 +28412,7 @@ def cudaArrayGetSparseProperties(array):
 def cudaMipmappedArrayGetSparseProperties(mipmap):
     """ Returns the layout properties of a sparse CUDA mipmapped array.
 
-    Returns the sparse array layout properties in `sparseProperties`. If
+    Returns the sparse array layout properties in ``sparseProperties``. If
     the CUDA mipmapped array is not allocated with flag
     :py:obj:`~.cudaArraySparse` :py:obj:`~.cudaErrorInvalidValue` will be
     returned.
@@ -28443,7 +28440,7 @@ def cudaMipmappedArrayGetSparseProperties(mipmap):
     cudaError_t
         :py:obj:`~.cudaSuccess` :py:obj:`~.cudaErrorInvalidValue`
     sparseProperties : :py:obj:`~.cudaArraySparseProperties`
-        Pointer to return :py:obj:`~.cudaArraySparseProperties`
+        Pointer to return cudaArraySparseProperties
 
     See Also
     --------
@@ -28471,10 +28468,11 @@ def cudaMipmappedArrayGetSparseProperties(mipmap):
 def cudaMemcpy(dst, src, size_t count, kind not None : cudaMemcpyKind):
     """ Copies data between host and device.
 
-    Copies `count` bytes from the memory area pointed to by `src` to the
-    memory area pointed to by `dst`, where `kind` specifies the direction
-    of the copy, and must be one of :py:obj:`~.cudaMemcpyHostToHost`,
-    :py:obj:`~.cudaMemcpyHostToDevice`, :py:obj:`~.cudaMemcpyDeviceToHost`,
+    Copies ``count`` bytes from the memory area pointed to by ``src`` to
+    the memory area pointed to by ``dst``, where ``kind`` specifies the
+    direction of the copy, and must be one of
+    :py:obj:`~.cudaMemcpyHostToHost`, :py:obj:`~.cudaMemcpyHostToDevice`,
+    :py:obj:`~.cudaMemcpyDeviceToHost`,
     :py:obj:`~.cudaMemcpyDeviceToDevice`, or :py:obj:`~.cudaMemcpyDefault`.
     Passing :py:obj:`~.cudaMemcpyDefault` is recommended, in which case the
     type of transfer is inferred from the pointer values. However,
@@ -28483,7 +28481,7 @@ def cudaMemcpy(dst, src, size_t count, kind not None : cudaMemcpyKind):
     and src pointers that do not match the direction of the copy results in
     an undefined behavior.
 
-    \note_sync
+    \\note_sync
 
     Parameters
     ----------
@@ -28523,15 +28521,15 @@ def cudaMemcpy(dst, src, size_t count, kind not None : cudaMemcpyKind):
 def cudaMemcpyPeer(dst, int dstDevice, src, int srcDevice, size_t count):
     """ Copies memory between two devices.
 
-    Copies memory from one device to memory on another device. `dst` is the
-    base device pointer of the destination memory and `dstDevice` is the
-    destination device. `src` is the base device pointer of the source
-    memory and `srcDevice` is the source device. `count` specifies the
-    number of bytes to copy.
+    Copies memory from one device to memory on another device. ``dst`` is
+    the base device pointer of the destination memory and ``dstDevice`` is
+    the destination device. ``src`` is the base device pointer of the
+    source memory and ``srcDevice`` is the source device. ``count``
+    specifies the number of bytes to copy.
 
     Note that this function is asynchronous with respect to the host, but
     serialized with respect all pending and future asynchronous work in to
-    the current device, `srcDevice`, and `dstDevice` (use
+    the current device, ``srcDevice``, and ``dstDevice`` (use
     :py:obj:`~.cudaMemcpyPeerAsync` to avoid this synchronization).
 
     Parameters
@@ -28573,23 +28571,23 @@ def cudaMemcpyPeer(dst, int dstDevice, src, int srcDevice, size_t count):
 def cudaMemcpy2D(dst, size_t dpitch, src, size_t spitch, size_t width, size_t height, kind not None : cudaMemcpyKind):
     """ Copies data between host and device.
 
-    Copies a matrix (`height` rows of `width` bytes each) from the memory
-    area pointed to by `src` to the memory area pointed to by `dst`, where
-    `kind` specifies the direction of the copy, and must be one of
-    :py:obj:`~.cudaMemcpyHostToHost`, :py:obj:`~.cudaMemcpyHostToDevice`,
-    :py:obj:`~.cudaMemcpyDeviceToHost`,
+    Copies a matrix (``height`` rows of ``width`` bytes each) from the
+    memory area pointed to by ``src`` to the memory area pointed to by
+    ``dst``, where ``kind`` specifies the direction of the copy, and must
+    be one of :py:obj:`~.cudaMemcpyHostToHost`,
+    :py:obj:`~.cudaMemcpyHostToDevice`, :py:obj:`~.cudaMemcpyDeviceToHost`,
     :py:obj:`~.cudaMemcpyDeviceToDevice`, or :py:obj:`~.cudaMemcpyDefault`.
     Passing :py:obj:`~.cudaMemcpyDefault` is recommended, in which case the
     type of transfer is inferred from the pointer values. However,
     :py:obj:`~.cudaMemcpyDefault` is only allowed on systems that support
-    unified virtual addressing. `dpitch` and `spitch` are the widths in
-    memory in bytes of the 2D arrays pointed to by `dst` and `src`,
+    unified virtual addressing. ``dpitch`` and ``spitch`` are the widths in
+    memory in bytes of the 2D arrays pointed to by ``dst`` and ``src``,
     including any padding added to the end of each row. The memory areas
-    may not overlap. `width` must not exceed either `dpitch` or `spitch`.
-    Calling :py:obj:`~.cudaMemcpy2D()` with `dst` and `src` pointers that
-    do not match the direction of the copy results in an undefined
-    behavior. :py:obj:`~.cudaMemcpy2D()` returns an error if `dpitch` or
-    `spitch` exceeds the maximum allowed.
+    may not overlap. ``width`` must not exceed either ``dpitch`` or
+    ``spitch``. Calling :py:obj:`~.cudaMemcpy2D()` with ``dst`` and ``src``
+    pointers that do not match the direction of the copy results in an
+    undefined behavior. :py:obj:`~.cudaMemcpy2D()` returns an error if
+    ``dpitch`` or ``spitch`` exceeds the maximum allowed.
 
     Parameters
     ----------
@@ -28635,22 +28633,22 @@ def cudaMemcpy2D(dst, size_t dpitch, src, size_t spitch, size_t width, size_t he
 def cudaMemcpy2DToArray(dst, size_t wOffset, size_t hOffset, src, size_t spitch, size_t width, size_t height, kind not None : cudaMemcpyKind):
     """ Copies data between host and device.
 
-    Copies a matrix (`height` rows of `width` bytes each) from the memory
-    area pointed to by `src` to the CUDA array `dst` starting at `hOffset`
-    rows and `wOffset` bytes from the upper left corner, where `kind`
-    specifies the direction of the copy, and must be one of
+    Copies a matrix (``height`` rows of ``width`` bytes each) from the
+    memory area pointed to by ``src`` to the CUDA array ``dst`` starting at
+    ``hOffset`` rows and ``wOffset`` bytes from the upper left corner,
+    where ``kind`` specifies the direction of the copy, and must be one of
     :py:obj:`~.cudaMemcpyHostToHost`, :py:obj:`~.cudaMemcpyHostToDevice`,
     :py:obj:`~.cudaMemcpyDeviceToHost`,
     :py:obj:`~.cudaMemcpyDeviceToDevice`, or :py:obj:`~.cudaMemcpyDefault`.
     Passing :py:obj:`~.cudaMemcpyDefault` is recommended, in which case the
     type of transfer is inferred from the pointer values. However,
     :py:obj:`~.cudaMemcpyDefault` is only allowed on systems that support
-    unified virtual addressing. `spitch` is the width in memory in bytes of
-    the 2D array pointed to by `src`, including any padding added to the
-    end of each row. `wOffset` + `width` must not exceed the width of the
-    CUDA array `dst`. `width` must not exceed `spitch`.
-    :py:obj:`~.cudaMemcpy2DToArray()` returns an error if `spitch` exceeds
-    the maximum allowed.
+    unified virtual addressing. ``spitch`` is the width in memory in bytes
+    of the 2D array pointed to by ``src``, including any padding added to
+    the end of each row. ``wOffset`` + ``width`` must not exceed the width
+    of the CUDA array ``dst``. ``width`` must not exceed ``spitch``.
+    :py:obj:`~.cudaMemcpy2DToArray()` returns an error if ``spitch``
+    exceeds the maximum allowed.
 
     Parameters
     ----------
@@ -28703,21 +28701,21 @@ def cudaMemcpy2DToArray(dst, size_t wOffset, size_t hOffset, src, size_t spitch,
 def cudaMemcpy2DFromArray(dst, size_t dpitch, src, size_t wOffset, size_t hOffset, size_t width, size_t height, kind not None : cudaMemcpyKind):
     """ Copies data between host and device.
 
-    Copies a matrix (`height` rows of `width` bytes each) from the CUDA
-    array `src` starting at `hOffset` rows and `wOffset` bytes from the
-    upper left corner to the memory area pointed to by `dst`, where `kind`
-    specifies the direction of the copy, and must be one of
+    Copies a matrix (``height`` rows of ``width`` bytes each) from the CUDA
+    array ``src`` starting at ``hOffset`` rows and ``wOffset`` bytes from
+    the upper left corner to the memory area pointed to by ``dst``, where
+    ``kind`` specifies the direction of the copy, and must be one of
     :py:obj:`~.cudaMemcpyHostToHost`, :py:obj:`~.cudaMemcpyHostToDevice`,
     :py:obj:`~.cudaMemcpyDeviceToHost`,
     :py:obj:`~.cudaMemcpyDeviceToDevice`, or :py:obj:`~.cudaMemcpyDefault`.
     Passing :py:obj:`~.cudaMemcpyDefault` is recommended, in which case the
     type of transfer is inferred from the pointer values. However,
     :py:obj:`~.cudaMemcpyDefault` is only allowed on systems that support
-    unified virtual addressing. `dpitch` is the width in memory in bytes of
-    the 2D array pointed to by `dst`, including any padding added to the
-    end of each row. `wOffset` + `width` must not exceed the width of the
-    CUDA array `src`. `width` must not exceed `dpitch`.
-    :py:obj:`~.cudaMemcpy2DFromArray()` returns an error if `dpitch`
+    unified virtual addressing. ``dpitch`` is the width in memory in bytes
+    of the 2D array pointed to by ``dst``, including any padding added to
+    the end of each row. ``wOffset`` + ``width`` must not exceed the width
+    of the CUDA array ``src``. ``width`` must not exceed ``dpitch``.
+    :py:obj:`~.cudaMemcpy2DFromArray()` returns an error if ``dpitch``
     exceeds the maximum allowed.
 
     Parameters
@@ -28771,20 +28769,20 @@ def cudaMemcpy2DFromArray(dst, size_t dpitch, src, size_t wOffset, size_t hOffse
 def cudaMemcpy2DArrayToArray(dst, size_t wOffsetDst, size_t hOffsetDst, src, size_t wOffsetSrc, size_t hOffsetSrc, size_t width, size_t height, kind not None : cudaMemcpyKind):
     """ Copies data between host and device.
 
-    Copies a matrix (`height` rows of `width` bytes each) from the CUDA
-    array `src` starting at `hOffsetSrc` rows and `wOffsetSrc` bytes from
-    the upper left corner to the CUDA array `dst` starting at `hOffsetDst`
-    rows and `wOffsetDst` bytes from the upper left corner, where `kind`
-    specifies the direction of the copy, and must be one of
-    :py:obj:`~.cudaMemcpyHostToHost`, :py:obj:`~.cudaMemcpyHostToDevice`,
-    :py:obj:`~.cudaMemcpyDeviceToHost`,
+    Copies a matrix (``height`` rows of ``width`` bytes each) from the CUDA
+    array ``src`` starting at ``hOffsetSrc`` rows and ``wOffsetSrc`` bytes
+    from the upper left corner to the CUDA array ``dst`` starting at
+    ``hOffsetDst`` rows and ``wOffsetDst`` bytes from the upper left
+    corner, where ``kind`` specifies the direction of the copy, and must be
+    one of :py:obj:`~.cudaMemcpyHostToHost`,
+    :py:obj:`~.cudaMemcpyHostToDevice`, :py:obj:`~.cudaMemcpyDeviceToHost`,
     :py:obj:`~.cudaMemcpyDeviceToDevice`, or :py:obj:`~.cudaMemcpyDefault`.
     Passing :py:obj:`~.cudaMemcpyDefault` is recommended, in which case the
     type of transfer is inferred from the pointer values. However,
     :py:obj:`~.cudaMemcpyDefault` is only allowed on systems that support
-    unified virtual addressing. `wOffsetDst` + `width` must not exceed the
-    width of the CUDA array `dst`. `wOffsetSrc` + `width` must not exceed
-    the width of the CUDA array `src`.
+    unified virtual addressing. ``wOffsetDst`` + ``width`` must not exceed
+    the width of the CUDA array ``dst``. ``wOffsetSrc`` + ``width`` must
+    not exceed the width of the CUDA array ``src``.
 
     Parameters
     ----------
@@ -28844,10 +28842,11 @@ def cudaMemcpy2DArrayToArray(dst, size_t wOffsetDst, size_t hOffsetDst, src, siz
 def cudaMemcpyAsync(dst, src, size_t count, kind not None : cudaMemcpyKind, stream):
     """ Copies data between host and device.
 
-    Copies `count` bytes from the memory area pointed to by `src` to the
-    memory area pointed to by `dst`, where `kind` specifies the direction
-    of the copy, and must be one of :py:obj:`~.cudaMemcpyHostToHost`,
-    :py:obj:`~.cudaMemcpyHostToDevice`, :py:obj:`~.cudaMemcpyDeviceToHost`,
+    Copies ``count`` bytes from the memory area pointed to by ``src`` to
+    the memory area pointed to by ``dst``, where ``kind`` specifies the
+    direction of the copy, and must be one of
+    :py:obj:`~.cudaMemcpyHostToHost`, :py:obj:`~.cudaMemcpyHostToDevice`,
+    :py:obj:`~.cudaMemcpyDeviceToHost`,
     :py:obj:`~.cudaMemcpyDeviceToDevice`, or :py:obj:`~.cudaMemcpyDefault`.
     Passing :py:obj:`~.cudaMemcpyDefault` is recommended, in which case the
     type of transfer is inferred from the pointer values. However,
@@ -28855,14 +28854,14 @@ def cudaMemcpyAsync(dst, src, size_t count, kind not None : cudaMemcpyKind, stre
     unified virtual addressing.
 
     The memory areas may not overlap. Calling :py:obj:`~.cudaMemcpyAsync()`
-    with `dst` and `src` pointers that do not match the direction of the
-    copy results in an undefined behavior.
+    with ``dst`` and ``src`` pointers that do not match the direction of
+    the copy results in an undefined behavior.
 
     :py:obj:`~.cudaMemcpyAsync()` is asynchronous with respect to the host,
     so the call may return before the copy is complete. The copy can
-    optionally be associated to a stream by passing a non-zero `stream`
-    argument. If `kind` is :py:obj:`~.cudaMemcpyHostToDevice` or
-    :py:obj:`~.cudaMemcpyDeviceToHost` and the `stream` is non-zero, the
+    optionally be associated to a stream by passing a non-zero ``stream``
+    argument. If ``kind`` is :py:obj:`~.cudaMemcpyHostToDevice` or
+    :py:obj:`~.cudaMemcpyDeviceToHost` and the ``stream`` is non-zero, the
     copy may overlap with operations in other streams.
 
     The device version of this function only handles device to device
@@ -28916,11 +28915,11 @@ def cudaMemcpyAsync(dst, src, size_t count, kind not None : cudaMemcpyKind, stre
 def cudaMemcpyPeerAsync(dst, int dstDevice, src, int srcDevice, size_t count, stream):
     """ Copies memory between two devices asynchronously.
 
-    Copies memory from one device to memory on another device. `dst` is the
-    base device pointer of the destination memory and `dstDevice` is the
-    destination device. `src` is the base device pointer of the source
-    memory and `srcDevice` is the source device. `count` specifies the
-    number of bytes to copy.
+    Copies memory from one device to memory on another device. ``dst`` is
+    the base device pointer of the destination memory and ``dstDevice`` is
+    the destination device. ``src`` is the base device pointer of the
+    source memory and ``srcDevice`` is the source device. ``count``
+    specifies the number of bytes to copy.
 
     Note that this function is asynchronous with respect to the host and
     all work on other devices.
@@ -28980,26 +28979,26 @@ def cudaMemcpyBatchAsync(dsts : Optional[tuple[Any] | list[Any]], srcs : Optiona
     For copies involving CUDA arrays, please see
     :py:obj:`~.cudaMemcpy3DBatchAsync`.
 
-    Performs memory copies from source buffers specified in `srcs` to
-    destination buffers specified in `dsts`. The size of each copy is
-    specified in `sizes`. All three arrays must be of the same length as
-    specified by `count`. Since there are no ordering guarantees for copies
-    within a batch, specifying any dependent copies within a batch will
-    result in undefined behavior.
+    Performs memory copies from source buffers specified in ``srcs`` to
+    destination buffers specified in ``dsts``. The size of each copy is
+    specified in ``sizes``. All three arrays must be of the same length as
+    specified by ``count``. Since there are no ordering guarantees for
+    copies within a batch, specifying any dependent copies within a batch
+    will result in undefined behavior.
 
     Every copy in the batch has to be associated with a set of attributes
-    specified in the `attrs` array. Each entry in this array can apply to
-    more than one copy. This can be done by specifying in the `attrsIdxs`
+    specified in the ``attrs`` array. Each entry in this array can apply to
+    more than one copy. This can be done by specifying in the ``attrsIdxs``
     array, the index of the first copy that the corresponding entry in the
-    `attrs` array applies to. Both `attrs` and `attrsIdxs` must be of the
-    same length as specified by `numAttrs`. For example, if a batch has 10
-    copies listed in dst/src/sizes, the first 6 of which have one set of
-    attributes and the remaining 4 another, then `numAttrs` will be 2,
-    `attrsIdxs` will be {0, 6} and `attrs` will contains the two sets of
-    attributes. Note that the first entry in `attrsIdxs` must always be 0.
-    Also, each entry must be greater than the previous entry and the last
-    entry should be less than `count`. Furthermore, `numAttrs` must be
-    lesser than or equal to `count`.
+    ``attrs`` array applies to. Both ``attrs`` and ``attrsIdxs`` must be of
+    the same length as specified by ``numAttrs``. For example, if a batch
+    has 10 copies listed in dst/src/sizes, the first 6 of which have one
+    set of attributes and the remaining 4 another, then ``numAttrs`` will
+    be 2, ``attrsIdxs`` will be {0, 6} and ``attrs`` will contain the two
+    sets of attributes. Note that the first entry in ``attrsIdxs`` must
+    always be 0. Also, each entry must be greater than the previous entry
+    and the last entry should be less than ``count``. Furthermore,
+    ``numAttrs`` must be less than or equal to ``count``.
 
     The :py:obj:`~.cudaMemcpyAttributes.srcAccessOrder` indicates the
     source access ordering to be observed for copies associated with the
@@ -29022,8 +29021,8 @@ def cudaMemcpyBatchAsync(dsts : Optional[tuple[Any] | list[Any]], srcs : Optiona
     no prior operations in the stream can be accessing the memory.
     Specifying this flag allows the driver to optimize the copy on certain
     platforms. Each memcpy operation in the batch must have a valid
-    :py:obj:`~.cudaMemcpyAttributes` corresponding to it including the
-    appropriate srcAccessOrder setting, otherwise the API will return
+    cudaMemcpyAttributes corresponding to it including the appropriate
+    srcAccessOrder setting, otherwise the API will return
     :py:obj:`~.cudaErrorInvalidValue`.
 
     The :py:obj:`~.cudaMemcpyAttributes.srcLocHint` and
@@ -29052,17 +29051,17 @@ def cudaMemcpyBatchAsync(dsts : Optional[tuple[Any] | list[Any]], srcs : Optiona
     sizes : list[int]
         Array of sizes for memcpy operations.
     count : size_t
-        Size of `dsts`, `srcs` and `sizes` arrays
+        Size of ``dsts``, ``srcs`` and ``sizes`` arrays
     attrs : list[:py:obj:`~.cudaMemcpyAttributes`]
         Array of memcpy attributes.
     attrsIdxs : list[int]
-        Array of indices to specify which copies each entry in the `attrs`
-        array applies to. The attributes specified in attrs[k] will be
-        applied to copies starting from attrsIdxs[k] through attrsIdxs[k+1]
-        - 1. Also attrs[numAttrs-1] will apply to copies starting from
-        attrsIdxs[numAttrs-1] through count - 1.
+        Array of indices to specify which copies each entry in the
+        ``attrs`` array applies to. The attributes specified in attrs[k]
+        will be applied to copies starting from attrsIdxs[k] through
+        attrsIdxs[k+1] - 1. Also attrs[numAttrs-1] will apply to copies
+        starting from attrsIdxs[numAttrs-1] through count - 1.
     numAttrs : size_t
-        Size of `attrs` and `attrsIdxs` arrays.
+        Size of ``attrs`` and ``attrsIdxs`` arrays.
     hStream : :py:obj:`~.CUstream` or :py:obj:`~.cudaStream_t`
         The stream to enqueue the operations in. Must not be legacy NULL
         stream.
@@ -29129,8 +29128,8 @@ def cudaMemcpy3DBatchAsync(size_t numOps, opList : Optional[tuple[cudaMemcpy3DBa
     any specific order. Note that this means specifying any dependent
     copies within a batch will result in undefined behavior.
 
-    Performs memory copies as specified in the `opList` array. The length
-    of this array is specified in `numOps`. Each entry in this array
+    Performs memory copies as specified in the ``opList`` array. The length
+    of this array is specified in ``numOps``. Each entry in this array
     describes a copy operation. This includes among other things, the
     source and destination operands for the copy as specified in
     :py:obj:`~.cudaMemcpy3DBatchOp.src` and
@@ -29146,31 +29145,31 @@ def cudaMemcpy3DBatchAsync(size_t numOps, opList : Optional[tuple[cudaMemcpy3DBa
 
     For a given operand, if :py:obj:`~.cudaMemcpy3DOperand.type` is
     specified as :py:obj:`~.cudaMemcpyOperandTypePointer`, then
-    :py:obj:`~.cudaMemcpy3DOperand`::op::ptr will be used. The
-    :py:obj:`~.cudaMemcpy3DOperand`::op::ptr::ptr field must contain the
+    :py:obj:`~.cudaMemcpy3DOperand.op.ptr` will be used. The
+    :py:obj:`~.cudaMemcpy3DOperand.op.ptr.ptr` field must contain the
     pointer where the copy should begin. The
-    :py:obj:`~.cudaMemcpy3DOperand`::op::ptr::rowLength field specifies the
+    :py:obj:`~.cudaMemcpy3DOperand.op.ptr.rowLength` field specifies the
     length of each row in elements and must either be zero or be greater
     than or equal to the width of the copy specified in
-    :py:obj:`~.cudaMemcpy3DBatchOp`::extent::width. The
-    :py:obj:`~.cudaMemcpy3DOperand`::op::ptr::layerHeight field specifies
-    the height of each layer and must either be zero or be greater than or
+    :py:obj:`~.cudaMemcpy3DBatchOp.extent.width`. The
+    :py:obj:`~.cudaMemcpy3DOperand.op.ptr.layerHeight` field specifies the
+    height of each layer and must either be zero or be greater than or
     equal to the height of the copy specified in
-    :py:obj:`~.cudaMemcpy3DBatchOp`::extent::height. When either of these
+    :py:obj:`~.cudaMemcpy3DBatchOp.extent.height`. When either of these
     values is zero, that aspect of the operand is considered to be tightly
     packed according to the copy extent. For managed memory pointers on
     devices where :py:obj:`~.cudaDevAttrConcurrentManagedAccess` is true or
     system-allocated pageable memory on devices where
     :py:obj:`~.cudaDevAttrPageableMemoryAccess` is true, the
-    :py:obj:`~.cudaMemcpy3DOperand`::op::ptr::locHint field can be used to
+    :py:obj:`~.cudaMemcpy3DOperand.op.ptr.locHint` field can be used to
     hint the location of the operand.
 
     If an operand's type is specified as
     :py:obj:`~.cudaMemcpyOperandTypeArray`, then
-    :py:obj:`~.cudaMemcpy3DOperand`::op::array will be used. The
-    :py:obj:`~.cudaMemcpy3DOperand`::op::array::array field specifies the
-    CUDA array and :py:obj:`~.cudaMemcpy3DOperand`::op::array::offset
-    specifies the 3D offset into that array where the copy begins.
+    :py:obj:`~.cudaMemcpy3DOperand.op.array` will be used. The
+    :py:obj:`~.cudaMemcpy3DOperand.op.array.array` field specifies the CUDA
+    array and :py:obj:`~.cudaMemcpy3DOperand.op.array.offset` specifies the
+    3D offset into that array where the copy begins.
 
     The :py:obj:`~.cudaMemcpyAttributes.srcAccessOrder` indicates the
     source access ordering to be observed for copies associated with the
@@ -29192,7 +29191,7 @@ def cudaMemcpy3DBatchAsync(size_t numOps, opList : Optional[tuple[cudaMemcpy3DBa
     pointers allocated outside CUDA (ex., via malloc) when it's known that
     no prior operations in the stream can be accessing the memory.
     Specifying this flag allows the driver to optimize the copy on certain
-    platforms. Each memcopy operation in `opList` must have a valid
+    platforms. Each memcopy operation in ``opList`` must have a valid
     srcAccessOrder setting, otherwise this API will return
     :py:obj:`~.cudaErrorInvalidValue`.
 
@@ -29208,7 +29207,7 @@ def cudaMemcpy3DBatchAsync(size_t numOps, opList : Optional[tuple[cudaMemcpy3DBa
     numOps : size_t
         Total number of memcpy operations.
     opList : list[:py:obj:`~.cudaMemcpy3DBatchOp`]
-        Array of size `numOps` containing the actual memcpy operations.
+        Array of size ``numOps`` containing the actual memcpy operations.
     flags : unsigned long long
         Flags for future use, must be zero now.
     hStream : :py:obj:`~.CUstream` or :py:obj:`~.cudaStream_t`
@@ -29252,19 +29251,19 @@ def cudaMemcpy3DBatchAsync(size_t numOps, opList : Optional[tuple[cudaMemcpy3DBa
 
 @cython.embedsignature(True)
 def cudaMemcpyWithAttributesAsync(dst, src, size_t size, attr : Optional[cudaMemcpyAttributes], stream):
-    """ 
+    """
 
     Performs asynchronous memory copy operation with the specified
     attributes.
 
-    Performs asynchronous memory copy operation where `dst` and `src` are
-    the destination and source pointers respectively. `size` specifies the
-    number of bytes to copy. `attr` specifies the attributes for the copy
-    and `hStream` specifies the stream to enqueue the operation in.
+    Performs asynchronous memory copy operation where ``dst`` and ``src``
+    are the destination and source pointers respectively. ``size``
+    specifies the number of bytes to copy. ``attr`` specifies the
+    attributes for the copy and ``hStream`` specifies the stream to enqueue
+    the operation in.
 
     For more information regarding the attributes, please refer to
-    :py:obj:`~.cudaMemcpyAttributes` and it's usage desciption
-    in::cudaMemcpyBatchAsync
+    cudaMemcpyAttributes and its usage description in cudaMemcpyBatchAsync
 
     Parameters
     ----------
@@ -29312,16 +29311,16 @@ def cudaMemcpyWithAttributesAsync(dst, src, size_t size, attr : Optional[cudaMem
 
 @cython.embedsignature(True)
 def cudaMemcpy3DWithAttributesAsync(op : Optional[cudaMemcpy3DBatchOp], unsigned long long flags, stream):
-    """ 
+    """
 
     Performs 3D asynchronous memory copy with the specified attributes.
 
-    Performs the copy operation specified in `op`. `flags` specifies the
-    flags for the copy and `hStream` specifies the stream to enqueue the
-    operation in.
+    Performs the copy operation specified in ``op``. ``flags`` specifies
+    the flags for the copy and ``hStream`` specifies the stream to enqueue
+    the operation in.
 
     For more information regarding the operation, please refer to
-    :py:obj:`~.cudaMemcpy3DBatchOp` and it's usage desciption
+    cudaMemcpy3DBatchOp and its usage description
     in::cudaMemcpy3DBatchAsync
 
     Parameters
@@ -29362,30 +29361,31 @@ def cudaMemcpy3DWithAttributesAsync(op : Optional[cudaMemcpy3DBatchOp], unsigned
 def cudaMemcpy2DAsync(dst, size_t dpitch, src, size_t spitch, size_t width, size_t height, kind not None : cudaMemcpyKind, stream):
     """ Copies data between host and device.
 
-    Copies a matrix (`height` rows of `width` bytes each) from the memory
-    area pointed to by `src` to the memory area pointed to by `dst`, where
-    `kind` specifies the direction of the copy, and must be one of
-    :py:obj:`~.cudaMemcpyHostToHost`, :py:obj:`~.cudaMemcpyHostToDevice`,
-    :py:obj:`~.cudaMemcpyDeviceToHost`,
+    Copies a matrix (``height`` rows of ``width`` bytes each) from the
+    memory area pointed to by ``src`` to the memory area pointed to by
+    ``dst``, where ``kind`` specifies the direction of the copy, and must
+    be one of :py:obj:`~.cudaMemcpyHostToHost`,
+    :py:obj:`~.cudaMemcpyHostToDevice`, :py:obj:`~.cudaMemcpyDeviceToHost`,
     :py:obj:`~.cudaMemcpyDeviceToDevice`, or :py:obj:`~.cudaMemcpyDefault`.
     Passing :py:obj:`~.cudaMemcpyDefault` is recommended, in which case the
     type of transfer is inferred from the pointer values. However,
     :py:obj:`~.cudaMemcpyDefault` is only allowed on systems that support
-    unified virtual addressing. `dpitch` and `spitch` are the widths in
-    memory in bytes of the 2D arrays pointed to by `dst` and `src`,
+    unified virtual addressing. ``dpitch`` and ``spitch`` are the widths in
+    memory in bytes of the 2D arrays pointed to by ``dst`` and ``src``,
     including any padding added to the end of each row. The memory areas
-    may not overlap. `width` must not exceed either `dpitch` or `spitch`.
+    may not overlap. ``width`` must not exceed either ``dpitch`` or
+    ``spitch``.
 
-    Calling :py:obj:`~.cudaMemcpy2DAsync()` with `dst` and `src` pointers
-    that do not match the direction of the copy results in an undefined
-    behavior. :py:obj:`~.cudaMemcpy2DAsync()` returns an error if `dpitch`
-    or `spitch` is greater than the maximum allowed.
+    Calling :py:obj:`~.cudaMemcpy2DAsync()` with ``dst`` and ``src``
+    pointers that do not match the direction of the copy results in an
+    undefined behavior. :py:obj:`~.cudaMemcpy2DAsync()` returns an error if
+    ``dpitch`` or ``spitch`` is greater than the maximum allowed.
 
     :py:obj:`~.cudaMemcpy2DAsync()` is asynchronous with respect to the
     host, so the call may return before the copy is complete. The copy can
-    optionally be associated to a stream by passing a non-zero `stream`
-    argument. If `kind` is :py:obj:`~.cudaMemcpyHostToDevice` or
-    :py:obj:`~.cudaMemcpyDeviceToHost` and `stream` is non-zero, the copy
+    optionally be associated to a stream by passing a non-zero ``stream``
+    argument. If ``kind`` is :py:obj:`~.cudaMemcpyHostToDevice` or
+    :py:obj:`~.cudaMemcpyDeviceToHost` and ``stream`` is non-zero, the copy
     may overlap with operations in other streams.
 
     The device version of this function only handles device to device
@@ -29445,29 +29445,29 @@ def cudaMemcpy2DAsync(dst, size_t dpitch, src, size_t spitch, size_t width, size
 def cudaMemcpy2DToArrayAsync(dst, size_t wOffset, size_t hOffset, src, size_t spitch, size_t width, size_t height, kind not None : cudaMemcpyKind, stream):
     """ Copies data between host and device.
 
-    Copies a matrix (`height` rows of `width` bytes each) from the memory
-    area pointed to by `src` to the CUDA array `dst` starting at `hOffset`
-    rows and `wOffset` bytes from the upper left corner, where `kind`
-    specifies the direction of the copy, and must be one of
+    Copies a matrix (``height`` rows of ``width`` bytes each) from the
+    memory area pointed to by ``src`` to the CUDA array ``dst`` starting at
+    ``hOffset`` rows and ``wOffset`` bytes from the upper left corner,
+    where ``kind`` specifies the direction of the copy, and must be one of
     :py:obj:`~.cudaMemcpyHostToHost`, :py:obj:`~.cudaMemcpyHostToDevice`,
     :py:obj:`~.cudaMemcpyDeviceToHost`,
     :py:obj:`~.cudaMemcpyDeviceToDevice`, or :py:obj:`~.cudaMemcpyDefault`.
     Passing :py:obj:`~.cudaMemcpyDefault` is recommended, in which case the
     type of transfer is inferred from the pointer values. However,
     :py:obj:`~.cudaMemcpyDefault` is only allowed on systems that support
-    unified virtual addressing. `spitch` is the width in memory in bytes of
-    the 2D array pointed to by `src`, including any padding added to the
-    end of each row. `wOffset` + `width` must not exceed the width of the
-    CUDA array `dst`. `width` must not exceed `spitch`.
-    :py:obj:`~.cudaMemcpy2DToArrayAsync()` returns an error if `spitch`
+    unified virtual addressing. ``spitch`` is the width in memory in bytes
+    of the 2D array pointed to by ``src``, including any padding added to
+    the end of each row. ``wOffset`` + ``width`` must not exceed the width
+    of the CUDA array ``dst``. ``width`` must not exceed ``spitch``.
+    :py:obj:`~.cudaMemcpy2DToArrayAsync()` returns an error if ``spitch``
     exceeds the maximum allowed.
 
     :py:obj:`~.cudaMemcpy2DToArrayAsync()` is asynchronous with respect to
     the host, so the call may return before the copy is complete. The copy
-    can optionally be associated to a stream by passing a non-zero `stream`
-    argument. If `kind` is :py:obj:`~.cudaMemcpyHostToDevice` or
-    :py:obj:`~.cudaMemcpyDeviceToHost` and `stream` is non-zero, the copy
-    may overlap with operations in other streams.
+    can optionally be associated to a stream by passing a non-zero
+    ``stream`` argument. If ``kind`` is :py:obj:`~.cudaMemcpyHostToDevice`
+    or :py:obj:`~.cudaMemcpyDeviceToHost` and ``stream`` is non-zero, the
+    copy may overlap with operations in other streams.
 
     :py:obj:`~.cudaMemcpy2DFromArrayAsync`,
     :py:obj:`~.cudaMemcpyToSymbolAsync`,
@@ -29534,29 +29534,29 @@ def cudaMemcpy2DToArrayAsync(dst, size_t wOffset, size_t hOffset, src, size_t sp
 def cudaMemcpy2DFromArrayAsync(dst, size_t dpitch, src, size_t wOffset, size_t hOffset, size_t width, size_t height, kind not None : cudaMemcpyKind, stream):
     """ Copies data between host and device.
 
-    Copies a matrix (`height` rows of `width` bytes each) from the CUDA
-    array `src` starting at `hOffset` rows and `wOffset` bytes from the
-    upper left corner to the memory area pointed to by `dst`, where `kind`
-    specifies the direction of the copy, and must be one of
+    Copies a matrix (``height`` rows of ``width`` bytes each) from the CUDA
+    array ``src`` starting at ``hOffset`` rows and ``wOffset`` bytes from
+    the upper left corner to the memory area pointed to by ``dst``, where
+    ``kind`` specifies the direction of the copy, and must be one of
     :py:obj:`~.cudaMemcpyHostToHost`, :py:obj:`~.cudaMemcpyHostToDevice`,
     :py:obj:`~.cudaMemcpyDeviceToHost`,
     :py:obj:`~.cudaMemcpyDeviceToDevice`, or :py:obj:`~.cudaMemcpyDefault`.
     Passing :py:obj:`~.cudaMemcpyDefault` is recommended, in which case the
     type of transfer is inferred from the pointer values. However,
     :py:obj:`~.cudaMemcpyDefault` is only allowed on systems that support
-    unified virtual addressing. `dpitch` is the width in memory in bytes of
-    the 2D array pointed to by `dst`, including any padding added to the
-    end of each row. `wOffset` + `width` must not exceed the width of the
-    CUDA array `src`. `width` must not exceed `dpitch`.
-    :py:obj:`~.cudaMemcpy2DFromArrayAsync()` returns an error if `dpitch`
+    unified virtual addressing. ``dpitch`` is the width in memory in bytes
+    of the 2D array pointed to by ``dst``, including any padding added to
+    the end of each row. ``wOffset`` + ``width`` must not exceed the width
+    of the CUDA array ``src``. ``width`` must not exceed ``dpitch``.
+    :py:obj:`~.cudaMemcpy2DFromArrayAsync()` returns an error if ``dpitch``
     exceeds the maximum allowed.
 
     :py:obj:`~.cudaMemcpy2DFromArrayAsync()` is asynchronous with respect
     to the host, so the call may return before the copy is complete. The
     copy can optionally be associated to a stream by passing a non-zero
-    `stream` argument. If `kind` is :py:obj:`~.cudaMemcpyHostToDevice` or
-    :py:obj:`~.cudaMemcpyDeviceToHost` and `stream` is non-zero, the copy
-    may overlap with operations in other streams.
+    ``stream`` argument. If ``kind`` is :py:obj:`~.cudaMemcpyHostToDevice`
+    or :py:obj:`~.cudaMemcpyDeviceToHost` and ``stream`` is non-zero, the
+    copy may overlap with operations in other streams.
 
     :py:obj:`~.cudaMemcpyToSymbolAsync`,
     :py:obj:`~.cudaMemcpyFromSymbolAsync`, :py:obj:`~.cuMemcpy2DAsync`
@@ -29622,11 +29622,11 @@ def cudaMemcpy2DFromArrayAsync(dst, size_t dpitch, src, size_t wOffset, size_t h
 def cudaMemset(devPtr, int value, size_t count):
     """ Initializes or sets device memory to a value.
 
-    Fills the first `count` bytes of the memory area pointed to by `devPtr`
-    with the constant byte value `value`.
+    Fills the first ``count`` bytes of the memory area pointed to by
+    ``devPtr`` with the constant byte value ``value``.
 
     Note that this function is asynchronous with respect to the host unless
-    `devPtr` refers to pinned host memory.
+    ``devPtr`` refers to pinned host memory.
 
     Parameters
     ----------
@@ -29660,21 +29660,22 @@ def cudaMemset(devPtr, int value, size_t count):
 def cudaMemset2D(devPtr, size_t pitch, int value, size_t width, size_t height):
     """ Initializes or sets device memory to a value.
 
-    Sets to the specified value `value` a matrix (`height` rows of `width`
-    bytes each) pointed to by `dstPtr`. `pitch` is the width in bytes of
-    the 2D array pointed to by `dstPtr`, including any padding added to the
-    end of each row. This function performs fastest when the pitch is one
-    that has been passed back by :py:obj:`~.cudaMallocPitch()`.
+    Sets to the specified value ``value`` a matrix (``height`` rows of
+    ``width`` bytes each) pointed to by ``devPtr``. ``pitch`` is the width
+    in bytes of the 2D array pointed to by ``devPtr``, including any
+    padding added to the end of each row. This function performs fastest
+    when the pitch is one that has been passed back by
+    :py:obj:`~.cudaMallocPitch()`.
 
     Note that this function is asynchronous with respect to the host unless
-    `devPtr` refers to pinned host memory.
+    ``devPtr`` refers to pinned host memory.
 
     Parameters
     ----------
     devPtr : Any
         Pointer to 2D device memory
     pitch : size_t
+        Pitch in bytes of 2D device memory (Unused if ``height`` is 1)
+        Pitch in bytes of 2D device memory(Unused if ``height`` is 1)
     value : int
         Value to set for each byte of specified memory
     width : size_t
@@ -29705,29 +29706,30 @@ def cudaMemset2D(devPtr, size_t pitch, int value, size_t width, size_t height):
 def cudaMemset3D(pitchedDevPtr not None : cudaPitchedPtr, int value, extent not None : cudaExtent):
     """ Initializes or sets device memory to a value.
 
-    Initializes each element of a 3D array to the specified value `value`.
-    The object to initialize is defined by `pitchedDevPtr`. The `pitch`
-    field of `pitchedDevPtr` is the width in memory in bytes of the 3D
-    array pointed to by `pitchedDevPtr`, including any padding added to the
-    end of each row. The `xsize` field specifies the logical width of each
-    row in bytes, while the `ysize` field specifies the height of each 2D
-    slice in rows. The `pitch` field of `pitchedDevPtr` is ignored when
-    `height` and `depth` are both equal to 1.
-
-    The extents of the initialized region are specified as a `width` in
-    bytes, a `height` in rows, and a `depth` in slices.
-
-    Extents with `width` greater than or equal to the `xsize` of
-    `pitchedDevPtr` may perform significantly faster than extents narrower
-    than the `xsize`. Secondarily, extents with `height` equal to the
-    `ysize` of `pitchedDevPtr` will perform faster than when the `height`
-    is shorter than the `ysize`.
-
-    This function performs fastest when the `pitchedDevPtr` has been
+    Initializes each element of a 3D array to the specified value
+    ``value``. The object to initialize is defined by ``pitchedDevPtr``.
+    The ``pitch`` field of ``pitchedDevPtr`` is the width in memory in
+    bytes of the 3D array pointed to by ``pitchedDevPtr``, including any
+    padding added to the end of each row. The ``xsize`` field specifies the
+    logical width of each row in bytes, while the ``ysize`` field specifies
+    the height of each 2D slice in rows. The ``pitch`` field of
+    ``pitchedDevPtr`` is ignored when ``height`` and ``depth`` are both
+    equal to 1.
+
+    The extents of the initialized region are specified as a ``width`` in
+    bytes, a ``height`` in rows, and a ``depth`` in slices.
+
+    Extents with ``width`` greater than or equal to the ``xsize`` of
+    ``pitchedDevPtr`` may perform significantly faster than extents
+    narrower than the ``xsize``. Secondarily, extents with ``height`` equal
+    to the ``ysize`` of ``pitchedDevPtr`` will perform faster than when the
+    ``height`` is shorter than the ``ysize``.
+
+    This function performs fastest when the ``pitchedDevPtr`` has been
     allocated by :py:obj:`~.cudaMalloc3D()`.
 
     Note that this function is asynchronous with respect to the host unless
-    `pitchedDevPtr` refers to pinned host memory.
+    ``pitchedDevPtr`` refers to pinned host memory.
 
     Parameters
     ----------
@@ -29736,7 +29738,7 @@ def cudaMemset3D(pitchedDevPtr not None : cudaPitchedPtr, int value, extent not
     value : int
         Value to set for each byte of specified memory
     extent : :py:obj:`~.cudaExtent`
-        Size parameters for where to set device memory (`width` field in
+        Size parameters for where to set device memory (``width`` field in
         bytes)
 
     Returns
@@ -29746,7 +29748,7 @@ def cudaMemset3D(pitchedDevPtr not None : cudaPitchedPtr, int value, extent not
 
     See Also
     --------
-    :py:obj:`~.cudaMemset`, :py:obj:`~.cudaMemset2D`, :py:obj:`~.cudaMemsetAsync`, :py:obj:`~.cudaMemset2DAsync`, :py:obj:`~.cudaMemset3DAsync`, :py:obj:`~.cudaMalloc3D`, make_cudaPitchedPtr, make_cudaExtent
+    :py:obj:`~.cudaMemset`, :py:obj:`~.cudaMemset2D`, :py:obj:`~.cudaMemsetAsync`, :py:obj:`~.cudaMemset2DAsync`, :py:obj:`~.cudaMemset3DAsync`, :py:obj:`~.cudaMalloc3D`, :py:obj:`~.make_cudaPitchedPtr`, :py:obj:`~.make_cudaExtent`
     """
     with nogil:
         err = cyruntime.cudaMemset3D(pitchedDevPtr._pvt_ptr[0], value, extent._pvt_ptr[0])
@@ -29759,13 +29761,13 @@ def cudaMemset3D(pitchedDevPtr not None : cudaPitchedPtr, int value, extent not
 def cudaMemsetAsync(devPtr, int value, size_t count, stream):
     """ Initializes or sets device memory to a value.
 
-    Fills the first `count` bytes of the memory area pointed to by `devPtr`
-    with the constant byte value `value`.
+    Fills the first ``count`` bytes of the memory area pointed to by
+    ``devPtr`` with the constant byte value ``value``.
 
     :py:obj:`~.cudaMemsetAsync()` is asynchronous with respect to the host,
     so the call may return before the memset is complete. The operation can
-    optionally be associated to a stream by passing a non-zero `stream`
-    argument. If `stream` is non-zero, the operation may overlap with
+    optionally be associated to a stream by passing a non-zero ``stream``
+    argument. If ``stream`` is non-zero, the operation may overlap with
     operations in other streams.
 
     The device version of this function only handles device to device
@@ -29813,16 +29815,17 @@ def cudaMemsetAsync(devPtr, int value, size_t count, stream):
 def cudaMemset2DAsync(devPtr, size_t pitch, int value, size_t width, size_t height, stream):
     """ Initializes or sets device memory to a value.
 
-    Sets to the specified value `value` a matrix (`height` rows of `width`
-    bytes each) pointed to by `dstPtr`. `pitch` is the width in bytes of
-    the 2D array pointed to by `dstPtr`, including any padding added to the
-    end of each row. This function performs fastest when the pitch is one
-    that has been passed back by :py:obj:`~.cudaMallocPitch()`.
+    Sets to the specified value ``value`` a matrix (``height`` rows of
+    ``width`` bytes each) pointed to by ``devPtr``. ``pitch`` is the width
+    in bytes of the 2D array pointed to by ``devPtr``, including any
+    padding added to the end of each row. This function performs fastest
+    when the pitch is one that has been passed back by
+    :py:obj:`~.cudaMallocPitch()`.
 
     :py:obj:`~.cudaMemset2DAsync()` is asynchronous with respect to the
     host, so the call may return before the memset is complete. The
     operation can optionally be associated to a stream by passing a non-
-    zero `stream` argument. If `stream` is non-zero, the operation may
+    zero ``stream`` argument. If ``stream`` is non-zero, the operation may
     overlap with operations in other streams.
 
     The device version of this function only handles device to device
@@ -29833,7 +29836,7 @@ def cudaMemset2DAsync(devPtr, size_t pitch, int value, size_t width, size_t heig
     devPtr : Any
         Pointer to 2D device memory
     pitch : size_t
-        Pitch in bytes of 2D device memory(Unused if `height` is 1)
+        Pitch in bytes of 2D device memory (Unused if ``height`` is 1)
     value : int
         Value to set for each byte of specified memory
     width : size_t
@@ -29874,31 +29877,32 @@ def cudaMemset2DAsync(devPtr, size_t pitch, int value, size_t width, size_t heig
 def cudaMemset3DAsync(pitchedDevPtr not None : cudaPitchedPtr, int value, extent not None : cudaExtent, stream):
     """ Initializes or sets device memory to a value.
 
-    Initializes each element of a 3D array to the specified value `value`.
-    The object to initialize is defined by `pitchedDevPtr`. The `pitch`
-    field of `pitchedDevPtr` is the width in memory in bytes of the 3D
-    array pointed to by `pitchedDevPtr`, including any padding added to the
-    end of each row. The `xsize` field specifies the logical width of each
-    row in bytes, while the `ysize` field specifies the height of each 2D
-    slice in rows. The `pitch` field of `pitchedDevPtr` is ignored when
-    `height` and `depth` are both equal to 1.
-
-    The extents of the initialized region are specified as a `width` in
-    bytes, a `height` in rows, and a `depth` in slices.
-
-    Extents with `width` greater than or equal to the `xsize` of
-    `pitchedDevPtr` may perform significantly faster than extents narrower
-    than the `xsize`. Secondarily, extents with `height` equal to the
-    `ysize` of `pitchedDevPtr` will perform faster than when the `height`
-    is shorter than the `ysize`.
-
-    This function performs fastest when the `pitchedDevPtr` has been
+    Initializes each element of a 3D array to the specified value
+    ``value``. The object to initialize is defined by ``pitchedDevPtr``.
+    The ``pitch`` field of ``pitchedDevPtr`` is the width in memory in
+    bytes of the 3D array pointed to by ``pitchedDevPtr``, including any
+    padding added to the end of each row. The ``xsize`` field specifies the
+    logical width of each row in bytes, while the ``ysize`` field specifies
+    the height of each 2D slice in rows. The ``pitch`` field of
+    ``pitchedDevPtr`` is ignored when ``height`` and ``depth`` are both
+    equal to 1.
+
+    The extents of the initialized region are specified as a ``width`` in
+    bytes, a ``height`` in rows, and a ``depth`` in slices.
+
+    Extents with ``width`` greater than or equal to the ``xsize`` of
+    ``pitchedDevPtr`` may perform significantly faster than extents
+    narrower than the ``xsize``. Secondarily, extents with ``height`` equal
+    to the ``ysize`` of ``pitchedDevPtr`` will perform faster than when the
+    ``height`` is shorter than the ``ysize``.
+
+    This function performs fastest when the ``pitchedDevPtr`` has been
     allocated by :py:obj:`~.cudaMalloc3D()`.
 
     :py:obj:`~.cudaMemset3DAsync()` is asynchronous with respect to the
     host, so the call may return before the memset is complete. The
     operation can optionally be associated to a stream by passing a non-
-    zero `stream` argument. If `stream` is non-zero, the operation may
+    zero ``stream`` argument. If ``stream`` is non-zero, the operation may
     overlap with operations in other streams.
 
     The device version of this function only handles device to device
@@ -29911,7 +29915,7 @@ def cudaMemset3DAsync(pitchedDevPtr not None : cudaPitchedPtr, int value, extent
     value : int
         Value to set for each byte of specified memory
     extent : :py:obj:`~.cudaExtent`
-        Size parameters for where to set device memory (`width` field in
+        Size parameters for where to set device memory (``width`` field in
         bytes)
     stream : :py:obj:`~.CUstream` or :py:obj:`~.cudaStream_t`
         Stream identifier
@@ -29923,7 +29927,7 @@ def cudaMemset3DAsync(pitchedDevPtr not None : cudaPitchedPtr, int value, extent
 
     See Also
     --------
-    :py:obj:`~.cudaMemset`, :py:obj:`~.cudaMemset2D`, :py:obj:`~.cudaMemset3D`, :py:obj:`~.cudaMemsetAsync`, :py:obj:`~.cudaMemset2DAsync`, :py:obj:`~.cudaMalloc3D`, make_cudaPitchedPtr, make_cudaExtent
+    :py:obj:`~.cudaMemset`, :py:obj:`~.cudaMemset2D`, :py:obj:`~.cudaMemset3D`, :py:obj:`~.cudaMemsetAsync`, :py:obj:`~.cudaMemset2DAsync`, :py:obj:`~.cudaMalloc3D`, :py:obj:`~.make_cudaPitchedPtr`, :py:obj:`~.make_cudaExtent`
     """
     cdef cyruntime.cudaStream_t cystream
     if stream is None:
@@ -29944,10 +29948,10 @@ def cudaMemset3DAsync(pitchedDevPtr not None : cudaPitchedPtr, int value, extent
 def cudaMemPrefetchAsync(devPtr, size_t count, location not None : cudaMemLocation, unsigned int flags, stream):
     """ Prefetches memory to the specified destination location.
 
-    Prefetches memory to the specified destination location. `devPtr` is
-    the base device pointer of the memory to be prefetched and `location`
-    specifies the destination location. `count` specifies the number of
-    bytes to copy. `stream` is the stream in which the operation is
+    Prefetches memory to the specified destination location. ``devPtr`` is
+    the base device pointer of the memory to be prefetched and ``location``
+    specifies the destination location. ``count`` specifies the number of
+    bytes to copy. ``stream`` is the stream in which the operation is
     enqueued. The memory range must refer to managed memory allocated via
     :py:obj:`~.cudaMallocManaged` or declared via managed variables, or it
     may also refer to memory allocated from a managed memory pool, or it
@@ -29958,7 +29962,7 @@ def cudaMemPrefetchAsync(devPtr, size_t count, location not None : cudaMemLocati
     :py:obj:`~.cudaMemLocation.type` will prefetch memory to GPU specified
     by device ordinal :py:obj:`~.cudaMemLocation.id` which must have non-
     zero value for the device attribute
-    :py:obj:`~.concurrentManagedAccess`. Additionally, `stream` must be
+    :py:obj:`~.concurrentManagedAccess`. Additionally, ``stream`` must be
     associated with a device that has a non-zero value for the device
     attribute :py:obj:`~.concurrentManagedAccess`. Specifying
     :py:obj:`~.cudaMemLocationTypeHost` as :py:obj:`~.cudaMemLocation.type`
@@ -30001,7 +30005,7 @@ def cudaMemPrefetchAsync(devPtr, size_t count, location not None : cudaMemLocati
 
     If :py:obj:`~.cudaMemAdviseSetPreferredLocation` was called on any
     subset of this memory range, then the pages will be migrated to
-    `location` even if `location` is not the preferred location of any
+    ``location`` even if ``location`` is not the preferred location of any
     pages in the memory range.
 
     If :py:obj:`~.cudaMemAdviseSetAccessedBy` was called on any subset of
@@ -30073,27 +30077,28 @@ def cudaMemPrefetchBatchAsync(dptrs : Optional[tuple[Any] | list[Any]], sizes :
     The semantics of the individual prefetch operations are as described in
     :py:obj:`~.cudaMemPrefetchAsync`.
 
-    Performs memory prefetch on address ranges specified in `dptrs` and
-    `sizes`. Both arrays must be of the same length as specified by
-    `count`. Each memory range specified must refer to managed memory
+    Performs memory prefetch on address ranges specified in ``dptrs`` and
+    ``sizes``. Both arrays must be of the same length as specified by
+    ``count``. Each memory range specified must refer to managed memory
     allocated via :py:obj:`~.cudaMallocManaged` or declared via managed
     variables or it may also refer to system-allocated memory when all
     devices have a non-zero value for
     :py:obj:`~.cudaDevAttrPageableMemoryAccess`. The prefetch location for
-    every operation in the batch is specified in the `prefetchLocs` array.
-    Each entry in this array can apply to more than one operation. This can
-    be done by specifying in the `prefetchLocIdxs` array, the index of the
-    first prefetch operation that the corresponding entry in the
-    `prefetchLocs` array applies to. Both `prefetchLocs` and
-    `prefetchLocIdxs` must be of the same length as specified by
-    `numPrefetchLocs`. For example, if a batch has 10 prefetches listed in
-    dptrs/sizes, the first 4 of which are to be prefetched to one location
-    and the remaining 6 are to be prefetched to another, then
-    `numPrefetchLocs` will be 2, `prefetchLocIdxs` will be {0, 4} and
-    `prefetchLocs` will contain the two locations. Note the first entry in
-    `prefetchLocIdxs` must always be 0. Also, each entry must be greater
-    than the previous entry and the last entry should be less than `count`.
-    Furthermore, `numPrefetchLocs` must be lesser than or equal to `count`.
+    every operation in the batch is specified in the ``prefetchLocs``
+    array. Each entry in this array can apply to more than one operation.
+    This can be done by specifying in the ``prefetchLocIdxs`` array, the
+    index of the first prefetch operation that the corresponding entry in
+    the ``prefetchLocs`` array applies to. Both ``prefetchLocs`` and
+    ``prefetchLocIdxs`` must be of the same length as specified by
+    ``numPrefetchLocs``. For example, if a batch has 10 prefetches listed
+    in dptrs/sizes, the first 4 of which are to be prefetched to one
+    location and the remaining 6 are to be prefetched to another, then
+    ``numPrefetchLocs`` will be 2, ``prefetchLocIdxs`` will be {0, 4} and
+    ``prefetchLocs`` will contain the two locations. Note the first entry
+    in ``prefetchLocIdxs`` must always be 0. Also, each entry must be
+    greater than the previous entry and the last entry should be less than
+    ``count``. Furthermore, ``numPrefetchLocs`` must be less than or
+    equal to ``count``.
 
     Parameters
     ----------
@@ -30102,18 +30107,18 @@ def cudaMemPrefetchBatchAsync(dptrs : Optional[tuple[Any] | list[Any]], sizes :
     sizes : list[int]
         Array of sizes for memory prefetch operations.
     count : size_t
-        Size of `dptrs` and `sizes` arrays.
+        Size of ``dptrs`` and ``sizes`` arrays.
     prefetchLocs : list[:py:obj:`~.cudaMemLocation`]
         Array of locations to prefetch to.
     prefetchLocIdxs : list[int]
         Array of indices to specify which operands each entry in the
-        `prefetchLocs` array applies to. The locations specified in
+        ``prefetchLocs`` array applies to. The locations specified in
         prefetchLocs[k] will be applied to copies starting from
         prefetchLocIdxs[k] through prefetchLocIdxs[k+1] - 1. Also
         prefetchLocs[numPrefetchLocs - 1] will apply to prefetches starting
         from prefetchLocIdxs[numPrefetchLocs - 1] through count - 1.
     numPrefetchLocs : size_t
-        Size of `prefetchLocs` and `prefetchLocIdxs` arrays.
+        Size of ``prefetchLocs`` and ``prefetchLocIdxs`` arrays.
     flags : unsigned long long
         Flags reserved for future use. Must be zero.
     hStream : :py:obj:`~.CUstream` or :py:obj:`~.cudaStream_t`
@@ -30191,9 +30196,9 @@ def cudaMemDiscardBatchAsync(dptrs : Optional[tuple[Any] | list[Any]], sizes : t
     prefetches to any part of the memory range that occur simultaneously
     with the discard operation result in undefined behavior.
 
-    Performs memory discard on address ranges specified in `dptrs` and
-    `sizes`. Both arrays must be of the same length as specified by
-    `count`. Each memory range specified must refer to managed memory
+    Performs memory discard on address ranges specified in ``dptrs`` and
+    ``sizes``. Both arrays must be of the same length as specified by
+    ``count``. Each memory range specified must refer to managed memory
     allocated via :py:obj:`~.cudaMallocManaged` or declared via managed
     variables or it may also refer to system-allocated memory when all
     devices have a non-zero value for
@@ -30206,7 +30211,7 @@ def cudaMemDiscardBatchAsync(dptrs : Optional[tuple[Any] | list[Any]], sizes : t
     sizes : list[int]
         Array of sizes for memory discard operations.
     count : size_t
-        Size of `dptrs` and `sizes` arrays.
+        Size of ``dptrs`` and ``sizes`` arrays.
     flags : unsigned long long
         Flags reserved for future use. Must be zero.
     hStream : :py:obj:`~.CUstream` or :py:obj:`~.cudaStream_t`
@@ -30264,27 +30269,27 @@ def cudaMemDiscardAndPrefetchBatchAsync(dptrs : Optional[tuple[Any] | list[Any]]
     undefined behavior.
 
     Performs memory discard and prefetch on address ranges specified in
-    `dptrs` and `sizes`. Both arrays must be of the same length as
-    specified by `count`. Each memory range specified must refer to managed
-    memory allocated via :py:obj:`~.cudaMallocManaged` or declared via
-    managed variables or it may also refer to system-allocated memory when
-    all devices have a non-zero value for
+    ``dptrs`` and ``sizes``. Both arrays must be of the same length as
+    specified by ``count``. Each memory range specified must refer to
+    managed memory allocated via :py:obj:`~.cudaMallocManaged` or declared
+    via managed variables or it may also refer to system-allocated memory
+    when all devices have a non-zero value for
     :py:obj:`~.cudaDevAttrPageableMemoryAccess`. Every operation in the
     batch has to be associated with a valid location to prefetch the
-    address range to and specified in the `prefetchLocs` array. Each entry
-    in this array can apply to more than one operation. This can be done by
-    specifying in the `prefetchLocIdxs` array, the index of the first
-    operation that the corresponding entry in the `prefetchLocs` array
-    applies to. Both `prefetchLocs` and `prefetchLocIdxs` must be of the
-    same length as specified by `numPrefetchLocs`. For example, if a batch
-    has 10 operations listed in dptrs/sizes, the first 6 of which are to be
-    prefetched to one location and the remaining 4 are to be prefetched to
-    another, then `numPrefetchLocs` will be 2, `prefetchLocIdxs` will be
-    {0, 6} and `prefetchLocs` will contain the two set of locations. Note
-    the first entry in `prefetchLocIdxs` must always be 0. Also, each entry
-    must be greater than the previous entry and the last entry should be
-    less than `count`. Furthermore, `numPrefetchLocs` must be lesser than
-    or equal to `count`.
+    address range to and specified in the ``prefetchLocs`` array. Each
+    entry in this array can apply to more than one operation. This can be
+    done by specifying in the ``prefetchLocIdxs`` array, the index of the
+    first operation that the corresponding entry in the ``prefetchLocs``
+    array applies to. Both ``prefetchLocs`` and ``prefetchLocIdxs`` must be
+    of the same length as specified by ``numPrefetchLocs``. For example, if
+    a batch has 10 operations listed in dptrs/sizes, the first 6 of which
+    are to be prefetched to one location and the remaining 4 are to be
+    prefetched to another, then ``numPrefetchLocs`` will be 2,
+    ``prefetchLocIdxs`` will be {0, 6} and ``prefetchLocs`` will contain
+    the two sets of locations. Note the first entry in ``prefetchLocIdxs``
+    must always be 0. Also, each entry must be greater than the previous
+    entry and the last entry should be less than ``count``. Furthermore,
+    ``numPrefetchLocs`` must be less than or equal to ``count``.
 
     Parameters
     ----------
@@ -30293,18 +30298,18 @@ def cudaMemDiscardAndPrefetchBatchAsync(dptrs : Optional[tuple[Any] | list[Any]]
     sizes : list[int]
         Array of sizes for memory discard operations.
     count : size_t
-        Size of `dptrs` and `sizes` arrays.
+        Size of ``dptrs`` and ``sizes`` arrays.
     prefetchLocs : list[:py:obj:`~.cudaMemLocation`]
         Array of locations to prefetch to.
     prefetchLocIdxs : list[int]
         Array of indices to specify which operands each entry in the
-        `prefetchLocs` array applies to. The locations specified in
+        ``prefetchLocs`` array applies to. The locations specified in
         prefetchLocs[k] will be applied to operations starting from
         prefetchLocIdxs[k] through prefetchLocIdxs[k+1] - 1. Also
         prefetchLocs[numPrefetchLocs - 1] will apply to copies starting
         from prefetchLocIdxs[numPrefetchLocs - 1] through count - 1.
     numPrefetchLocs : size_t
-        Size of `prefetchLocs` and `prefetchLocIdxs` arrays.
+        Size of ``prefetchLocs`` and ``prefetchLocIdxs`` arrays.
     flags : unsigned long long
         Flags reserved for future use. Must be zero.
     hStream : :py:obj:`~.CUstream` or :py:obj:`~.cudaStream_t`
@@ -30364,18 +30369,18 @@ def cudaMemAdvise(devPtr, size_t count, advice not None : cudaMemoryAdvise, loca
     """ Advise about the usage of a given memory range.
 
     Advise the Unified Memory subsystem about the usage pattern for the
-    memory range starting at `devPtr` with a size of `count` bytes. The
+    memory range starting at ``devPtr`` with a size of ``count`` bytes. The
     start address and end address of the memory range will be rounded down
     and rounded up respectively to be aligned to CPU page size before the
     advice is applied. The memory range must refer to managed memory
     allocated via :py:obj:`~.cudaMallocManaged` or declared via managed
     variables. The memory range could also refer to system-allocated
     pageable memory provided it represents a valid, host-accessible region
-    of memory and all additional constraints imposed by `advice` as
+    of memory and all additional constraints imposed by ``advice`` as
     outlined below are also satisfied. Specifying an invalid system-
     allocated pageable memory range results in an error being returned.
 
-    The `advice` parameter can take the following values:
+    The ``advice`` parameter can take the following values:
 
     - :py:obj:`~.cudaMemAdviseSetReadMostly`: This implies that the data is
       mostly going to be read from and only occasionally written to. Any
@@ -30391,7 +30396,7 @@ def cudaMemAdvise(devPtr, size_t count, advice not None : cudaMemoryAdvise, loca
       page will be invalidated except for the one where the write occurred.
       If the writing processor is the CPU and the preferred location of the
       page is a host NUMA node, then the page will also be migrated to that
-      host NUMA node. The `location` argument is ignored for this advice.
+      host NUMA node. The ``location`` argument is ignored for this advice.
       Note that for a page to be read-duplicated, the accessing processor
       must either be the CPU or a GPU that has a non-zero value for the
       device attribute :py:obj:`~.cudaDevAttrConcurrentManagedAccess`.
@@ -30415,12 +30420,12 @@ def cudaMemAdvise(devPtr, size_t count, advice not None : cudaMemoryAdvise, loca
       collapsed into a single copy. The location for the collapsed copy
       will be the preferred location if the page has a preferred location
       and one of the read-duplicated copies was resident at that location.
-      Otherwise, the location chosen is arbitrary. Note: The `location`
+      Otherwise, the location chosen is arbitrary. Note: The ``location``
       argument is ignored for this advice.
 
     - :py:obj:`~.cudaMemAdviseSetPreferredLocation`: This advice sets the
       preferred location for the data to be the memory belonging to
-      `location`. When :py:obj:`~.cudaMemLocation.type` is
+      ``location``. When :py:obj:`~.cudaMemLocation.type` is
       :py:obj:`~.cudaMemLocationTypeHost`, :py:obj:`~.cudaMemLocation.id`
       is ignored and the preferred location is set to be host memory. To
       set the preferred location to a specific host NUMA node, applications
@@ -30455,7 +30460,7 @@ def cudaMemAdvise(devPtr, size_t count, advice not None : cudaMemoryAdvise, loca
       :py:obj:`~.cudaMemAdviseSetReadMostly` is also set on this memory
       region or any subset of it, then the policies associated with that
       advice will override the policies of this advice, unless read
-      accesses from `location` will not result in a read-only copy being
+      accesses from ``location`` will not result in a read-only copy being
       created on that procesor as outlined in description for the advice
       :py:obj:`~.cudaMemAdviseSetReadMostly`. If the memory region refers
       to valid system-allocated pageable memory, and
@@ -30467,11 +30472,11 @@ def cudaMemAdvise(devPtr, size_t count, advice not None : cudaMemoryAdvise, loca
 
     - :py:obj:`~.cudaMemAdviseUnsetPreferredLocation`: Undoes the effect of
       :py:obj:`~.cudaMemAdviseSetPreferredLocation` and changes the
-      preferred location to none. The `location` argument is ignored for
+      preferred location to none. The ``location`` argument is ignored for
       this advice.
 
     - :py:obj:`~.cudaMemAdviseSetAccessedBy`: This advice implies that the
-      data will be accessed by processor `location`. The
+      data will be accessed by processor ``location``. The
       :py:obj:`~.cudaMemLocation.type` must be either
       :py:obj:`~.cudaMemLocationTypeDevice` with
       :py:obj:`~.cudaMemLocation.id` representing a valid device ordinal or
@@ -30501,7 +30506,7 @@ def cudaMemAdvise(devPtr, size_t count, advice not None : cudaMemoryAdvise, loca
       region or any subset of it, then the policies associated with that
       advice will override the policies of this advice. Additionally, if
       the preferred location of this memory region or any subset of it is
-      also `location`, then the policies associated with
+      also ``location``, then the policies associated with
       :py:obj:`~.CU_MEM_ADVISE_SET_PREFERRED_LOCATION` will override the
       policies of this advice. If the memory region refers to valid system-
       allocated pageable memory, and :py:obj:`~.cudaMemLocation.type` is
@@ -30515,7 +30520,7 @@ def cudaMemAdvise(devPtr, size_t count, advice not None : cudaMemoryAdvise, loca
 
     - :py:obj:`~.CU_MEM_ADVISE_UNSET_ACCESSED_BY`: Undoes the effect of
       :py:obj:`~.cudaMemAdviseSetAccessedBy`. Any mappings to the data from
-      `location` may be removed at any time causing accesses to result in
+      ``location`` may be removed at any time causing accesses to result in
       non-fatal page faults. If the memory region refers to valid system-
       allocated pageable memory, and :py:obj:`~.cudaMemLocation.type` is
       :py:obj:`~.cudaMemLocationTypeDevice` then device in
@@ -30561,22 +30566,22 @@ def cudaMemAdvise(devPtr, size_t count, advice not None : cudaMemoryAdvise, loca
 def cudaMemRangeGetAttribute(size_t dataSize, attribute not None : cudaMemRangeAttribute, devPtr, size_t count):
     """ Query an attribute of a given memory range.
 
-    Query an attribute about the memory range starting at `devPtr` with a
-    size of `count` bytes. The memory range must refer to managed memory
+    Query an attribute about the memory range starting at ``devPtr`` with a
+    size of ``count`` bytes. The memory range must refer to managed memory
     allocated via :py:obj:`~.cudaMallocManaged` or declared via managed
     variables.
 
-    The `attribute` parameter can take the following values:
+    The ``attribute`` parameter can take the following values:
 
     - :py:obj:`~.cudaMemRangeAttributeReadMostly`: If this attribute is
-      specified, `data` will be interpreted as a 32-bit integer, and
-      `dataSize` must be 4. The result returned will be 1 if all pages in
+      specified, ``data`` will be interpreted as a 32-bit integer, and
+      ``dataSize`` must be 4. The result returned will be 1 if all pages in
       the given memory range have read-duplication enabled, or 0 otherwise.
 
     - :py:obj:`~.cudaMemRangeAttributePreferredLocation`: If this attribute
-      is specified, `data` will be interpreted as a 32-bit integer, and
-      `dataSize` must be 4. The result returned will be a GPU device id if
-      all pages in the memory range have that GPU as their preferred
+      is specified, ``data`` will be interpreted as a 32-bit integer, and
+      ``dataSize`` must be 4. The result returned will be a GPU device id
+      if all pages in the memory range have that GPU as their preferred
       location, or it will be cudaCpuDeviceId if all pages in the memory
       range have the CPU as their preferred location, or it will be
       cudaInvalidDeviceId if either all the pages don't have the same
@@ -30586,24 +30591,25 @@ def cudaMemRangeGetAttribute(size_t dataSize, attribute not None : cudaMemRangeA
       preferred location.
 
     - :py:obj:`~.cudaMemRangeAttributeAccessedBy`: If this attribute is
-      specified, `data` will be interpreted as an array of 32-bit integers,
-      and `dataSize` must be a non-zero multiple of 4. The result returned
-      will be a list of device ids that had
+      specified, ``data`` will be interpreted as an array of 32-bit
+      integers, and ``dataSize`` must be a non-zero multiple of 4. The
+      result returned will be a list of device ids that had
       :py:obj:`~.cudaMemAdviceSetAccessedBy` set for that entire memory
       range. If any device does not have that advice set for the entire
-      memory range, that device will not be included. If `data` is larger
+      memory range, that device will not be included. If ``data`` is larger
       than the number of devices that have that advice set for that memory
       range, cudaInvalidDeviceId will be returned in all the extra space
-      provided. For ex., if `dataSize` is 12 (i.e. `data` has 3 elements)
-      and only device 0 has the advice set, then the result returned will
-      be { 0, cudaInvalidDeviceId, cudaInvalidDeviceId }. If `data` is
-      smaller than the number of devices that have that advice set, then
-      only as many devices will be returned as can fit in the array. There
-      is no guarantee on which specific devices will be returned, however.
+      provided. For example, if ``dataSize`` is 12 (i.e. ``data`` has 3
+      elements) and only device 0 has the advice set, then the result
+      returned will be { 0, cudaInvalidDeviceId, cudaInvalidDeviceId }. If
+      ``data`` is smaller than the number of devices that have that advice
+      set, then only as many devices will be returned as can fit in the
+      array. There is no guarantee on which specific devices will be
+      returned, however.
 
     - :py:obj:`~.cudaMemRangeAttributeLastPrefetchLocation`: If this
-      attribute is specified, `data` will be interpreted as a 32-bit
-      integer, and `dataSize` must be 4. The result returned will be the
+      attribute is specified, ``data`` will be interpreted as a 32-bit
+      integer, and ``dataSize`` must be 4. The result returned will be the
       last location to which all pages in the memory range were prefetched
       explicitly via :py:obj:`~.cudaMemPrefetchAsync`. This will either be
       a GPU id or cudaCpuDeviceId depending on whether the last location
@@ -30616,8 +30622,8 @@ def cudaMemRangeGetAttribute(size_t dataSize, attribute not None : cudaMemRangeA
       completed or even begun.
 
     - :py:obj:`~.cudaMemRangeAttributePreferredLocationType`: If this
-      attribute is specified, `data` will be interpreted as a
-      :py:obj:`~.cudaMemLocationType`, and `dataSize` must be
+      attribute is specified, ``data`` will be interpreted as a
+      :py:obj:`~.cudaMemLocationType`, and ``dataSize`` must be
       sizeof(cudaMemLocationType). The :py:obj:`~.cudaMemLocationType`
       returned will be :py:obj:`~.cudaMemLocationTypeDevice` if all pages
       in the memory range have the same GPU as their preferred location, or
@@ -30633,8 +30639,8 @@ def cudaMemRangeGetAttribute(size_t dataSize, attribute not None : cudaMemRangeA
       the query may be different from the preferred location type.
 
       - :py:obj:`~.cudaMemRangeAttributePreferredLocationId`: If this
-        attribute is specified, `data` will be interpreted as a 32-bit
-        integer, and `dataSize` must be 4. If the
+        attribute is specified, ``data`` will be interpreted as a 32-bit
+        integer, and ``dataSize`` must be 4. If the
         :py:obj:`~.cudaMemRangeAttributePreferredLocationType` query for
         the same address range returns
         :py:obj:`~.cudaMemLocationTypeDevice`, it will be a valid device
@@ -30643,8 +30649,8 @@ def cudaMemRangeGetAttribute(size_t dataSize, attribute not None : cudaMemRangeA
         location type, the id should be ignored.
 
     - :py:obj:`~.cudaMemRangeAttributeLastPrefetchLocationType`: If this
-      attribute is specified, `data` will be interpreted as a
-      :py:obj:`~.cudaMemLocationType`, and `dataSize` must be
+      attribute is specified, ``data`` will be interpreted as a
+      :py:obj:`~.cudaMemLocationType`, and ``dataSize`` must be
       sizeof(cudaMemLocationType). The result returned will be the last
       location type to which all pages in the memory range were prefetched
       explicitly via :py:obj:`~.cuMemPrefetchAsync`. The
@@ -30661,8 +30667,8 @@ def cudaMemRangeGetAttribute(size_t dataSize, attribute not None : cudaMemRangeA
       the prefetch operation to that location has completed or even begun.
 
       - :py:obj:`~.cudaMemRangeAttributeLastPrefetchLocationId`: If this
-        attribute is specified, `data` will be interpreted as a 32-bit
-        integer, and `dataSize` must be 4. If the
+        attribute is specified, ``data`` will be interpreted as a 32-bit
+        integer, and ``dataSize`` must be 4. If the
         :py:obj:`~.cudaMemRangeAttributeLastPrefetchLocationType` query for
         the same address range returns
         :py:obj:`~.cudaMemLocationTypeDevice`, it will be a valid device
@@ -30712,13 +30718,13 @@ def cudaMemRangeGetAttribute(size_t dataSize, attribute not None : cudaMemRangeA
 def cudaMemRangeGetAttributes(dataSizes : tuple[int] | list[int], attributes : Optional[tuple[cudaMemRangeAttribute] | list[cudaMemRangeAttribute]], size_t numAttributes, devPtr, size_t count):
     """ Query attributes of a given memory range.
 
-    Query attributes of the memory range starting at `devPtr` with a size
-    of `count` bytes. The memory range must refer to managed memory
+    Query attributes of the memory range starting at ``devPtr`` with a size
+    of ``count`` bytes. The memory range must refer to managed memory
     allocated via :py:obj:`~.cudaMallocManaged` or declared via managed
-    variables. The `attributes` array will be interpreted to have
-    `numAttributes` entries. The `dataSizes` array will also be interpreted
-    to have `numAttributes` entries. The results of the query will be
-    stored in `data`.
+    variables. The ``attributes`` array will be interpreted to have
+    ``numAttributes`` entries. The ``dataSizes`` array will also be
+    interpreted to have ``numAttributes`` entries. The results of the query
+    will be stored in ``data``.
 
     The list of supported attributes are given below. Please refer to
     :py:obj:`~.cudaMemRangeGetAttribute` for attribute descriptions and
@@ -30796,11 +30802,12 @@ def cudaMemcpyToArray(dst, size_t wOffset, size_t hOffset, src, size_t count, ki
 
     [Deprecated]
 
-    Copies `count` bytes from the memory area pointed to by `src` to the
-    CUDA array `dst` starting at `hOffset` rows and `wOffset` bytes from
-    the upper left corner, where `kind` specifies the direction of the
-    copy, and must be one of :py:obj:`~.cudaMemcpyHostToHost`,
-    :py:obj:`~.cudaMemcpyHostToDevice`, :py:obj:`~.cudaMemcpyDeviceToHost`,
+    Copies ``count`` bytes from the memory area pointed to by ``src`` to
+    the CUDA array ``dst`` starting at ``hOffset`` rows and ``wOffset``
+    bytes from the upper left corner, where ``kind`` specifies the
+    direction of the copy, and must be one of
+    :py:obj:`~.cudaMemcpyHostToHost`, :py:obj:`~.cudaMemcpyHostToDevice`,
+    :py:obj:`~.cudaMemcpyDeviceToHost`,
     :py:obj:`~.cudaMemcpyDeviceToDevice`, or :py:obj:`~.cudaMemcpyDefault`.
     Passing :py:obj:`~.cudaMemcpyDefault` is recommended, in which case the
     type of transfer is inferred from the pointer values. However,
@@ -30856,11 +30863,12 @@ def cudaMemcpyFromArray(dst, src, size_t wOffset, size_t hOffset, size_t count,
 
     [Deprecated]
 
-    Copies `count` bytes from the CUDA array `src` starting at `hOffset`
-    rows and `wOffset` bytes from the upper left corner to the memory area
-    pointed to by `dst`, where `kind` specifies the direction of the copy,
-    and must be one of :py:obj:`~.cudaMemcpyHostToHost`,
-    :py:obj:`~.cudaMemcpyHostToDevice`, :py:obj:`~.cudaMemcpyDeviceToHost`,
+    Copies ``count`` bytes from the CUDA array ``src`` starting at
+    ``hOffset`` rows and ``wOffset`` bytes from the upper left corner to
+    the memory area pointed to by ``dst``, where ``kind`` specifies the
+    direction of the copy, and must be one of
+    :py:obj:`~.cudaMemcpyHostToHost`, :py:obj:`~.cudaMemcpyHostToDevice`,
+    :py:obj:`~.cudaMemcpyDeviceToHost`,
     :py:obj:`~.cudaMemcpyDeviceToDevice`, or :py:obj:`~.cudaMemcpyDefault`.
     Passing :py:obj:`~.cudaMemcpyDefault` is recommended, in which case the
     type of transfer is inferred from the pointer values. However,
@@ -30916,12 +30924,13 @@ def cudaMemcpyArrayToArray(dst, size_t wOffsetDst, size_t hOffsetDst, src, size_
 
     [Deprecated]
 
-    Copies `count` bytes from the CUDA array `src` starting at `hOffsetSrc`
-    rows and `wOffsetSrc` bytes from the upper left corner to the CUDA
-    array `dst` starting at `hOffsetDst` rows and `wOffsetDst` bytes from
-    the upper left corner, where `kind` specifies the direction of the
-    copy, and must be one of :py:obj:`~.cudaMemcpyHostToHost`,
-    :py:obj:`~.cudaMemcpyHostToDevice`, :py:obj:`~.cudaMemcpyDeviceToHost`,
+    Copies ``count`` bytes from the CUDA array ``src`` starting at
+    ``hOffsetSrc`` rows and ``wOffsetSrc`` bytes from the upper left corner
+    to the CUDA array ``dst`` starting at ``hOffsetDst`` rows and
+    ``wOffsetDst`` bytes from the upper left corner, where ``kind``
+    specifies the direction of the copy, and must be one of
+    :py:obj:`~.cudaMemcpyHostToHost`, :py:obj:`~.cudaMemcpyHostToDevice`,
+    :py:obj:`~.cudaMemcpyDeviceToHost`,
     :py:obj:`~.cudaMemcpyDeviceToDevice`, or :py:obj:`~.cudaMemcpyDefault`.
     Passing :py:obj:`~.cudaMemcpyDefault` is recommended, in which case the
     type of transfer is inferred from the pointer values. However,
@@ -30986,11 +30995,12 @@ def cudaMemcpyToArrayAsync(dst, size_t wOffset, size_t hOffset, src, size_t coun
 
     [Deprecated]
 
-    Copies `count` bytes from the memory area pointed to by `src` to the
-    CUDA array `dst` starting at `hOffset` rows and `wOffset` bytes from
-    the upper left corner, where `kind` specifies the direction of the
-    copy, and must be one of :py:obj:`~.cudaMemcpyHostToHost`,
-    :py:obj:`~.cudaMemcpyHostToDevice`, :py:obj:`~.cudaMemcpyDeviceToHost`,
+    Copies ``count`` bytes from the memory area pointed to by ``src`` to
+    the CUDA array ``dst`` starting at ``hOffset`` rows and ``wOffset``
+    bytes from the upper left corner, where ``kind`` specifies the
+    direction of the copy, and must be one of
+    :py:obj:`~.cudaMemcpyHostToHost`, :py:obj:`~.cudaMemcpyHostToDevice`,
+    :py:obj:`~.cudaMemcpyDeviceToHost`,
     :py:obj:`~.cudaMemcpyDeviceToDevice`, or :py:obj:`~.cudaMemcpyDefault`.
     Passing :py:obj:`~.cudaMemcpyDefault` is recommended, in which case the
     type of transfer is inferred from the pointer values. However,
@@ -30999,10 +31009,10 @@ def cudaMemcpyToArrayAsync(dst, size_t wOffset, size_t hOffset, src, size_t coun
 
     :py:obj:`~.cudaMemcpyToArrayAsync()` is asynchronous with respect to
     the host, so the call may return before the copy is complete. The copy
-    can optionally be associated to a stream by passing a non-zero `stream`
-    argument. If `kind` is :py:obj:`~.cudaMemcpyHostToDevice` or
-    :py:obj:`~.cudaMemcpyDeviceToHost` and `stream` is non-zero, the copy
-    may overlap with operations in other streams.
+    can optionally be associated to a stream by passing a non-zero
+    ``stream`` argument. If ``kind`` is :py:obj:`~.cudaMemcpyHostToDevice`
+    or :py:obj:`~.cudaMemcpyDeviceToHost` and ``stream`` is non-zero, the
+    copy may overlap with operations in other streams.
 
     Parameters
     ----------
@@ -31063,11 +31073,12 @@ def cudaMemcpyFromArrayAsync(dst, src, size_t wOffset, size_t hOffset, size_t co
 
     [Deprecated]
 
-    Copies `count` bytes from the CUDA array `src` starting at `hOffset`
-    rows and `wOffset` bytes from the upper left corner to the memory area
-    pointed to by `dst`, where `kind` specifies the direction of the copy,
-    and must be one of :py:obj:`~.cudaMemcpyHostToHost`,
-    :py:obj:`~.cudaMemcpyHostToDevice`, :py:obj:`~.cudaMemcpyDeviceToHost`,
+    Copies ``count`` bytes from the CUDA array ``src`` starting at
+    ``hOffset`` rows and ``wOffset`` bytes from the upper left corner to
+    the memory area pointed to by ``dst``, where ``kind`` specifies the
+    direction of the copy, and must be one of
+    :py:obj:`~.cudaMemcpyHostToHost`, :py:obj:`~.cudaMemcpyHostToDevice`,
+    :py:obj:`~.cudaMemcpyDeviceToHost`,
     :py:obj:`~.cudaMemcpyDeviceToDevice`, or :py:obj:`~.cudaMemcpyDefault`.
     Passing :py:obj:`~.cudaMemcpyDefault` is recommended, in which case the
     type of transfer is inferred from the pointer values. However,
@@ -31076,10 +31087,10 @@ def cudaMemcpyFromArrayAsync(dst, src, size_t wOffset, size_t hOffset, size_t co
 
     :py:obj:`~.cudaMemcpyFromArrayAsync()` is asynchronous with respect to
     the host, so the call may return before the copy is complete. The copy
-    can optionally be associated to a stream by passing a non-zero `stream`
-    argument. If `kind` is :py:obj:`~.cudaMemcpyHostToDevice` or
-    :py:obj:`~.cudaMemcpyDeviceToHost` and `stream` is non-zero, the copy
-    may overlap with operations in other streams.
+    can optionally be associated to a stream by passing a non-zero
+    ``stream`` argument. If ``kind`` is :py:obj:`~.cudaMemcpyHostToDevice`
+    or :py:obj:`~.cudaMemcpyDeviceToHost` and ``stream`` is non-zero, the
+    copy may overlap with operations in other streams.
 
     Parameters
     ----------
@@ -31138,8 +31149,8 @@ def cudaMemcpyFromArrayAsync(dst, src, size_t wOffset, size_t hOffset, size_t co
 def cudaMallocAsync(size_t size, hStream):
     """ Allocates memory with stream ordered semantics.
 
-    Inserts an allocation operation into `hStream`. A pointer to the
-    allocated memory is returned immediately in *dptr. The allocation must
+    Inserts an allocation operation into ``hStream``. A pointer to the
+    allocated memory is returned immediately in \\*dptr. The allocation must
     not be accessed until the the allocation operation completes. The
     allocation comes from the memory pool associated with the stream's
     device.
@@ -31193,7 +31204,7 @@ def cudaMallocAsync(size_t size, hStream):
 def cudaFreeAsync(devPtr, hStream):
     """ Frees memory with stream ordered semantics.
 
-    Inserts a free operation into `hStream`. The allocation must not be
+    Inserts a free operation into ``hStream``. The allocation must not be
     accessed after stream execution reaches the free. After this API
     returns, accessing the memory from any subsequent work launched on the
     GPU or querying its pointer attributes results in undefined behavior.
@@ -31578,26 +31589,26 @@ def cudaMemPoolGetAccess(memPool, location : Optional[cudaMemLocation]):
 def cudaMemPoolCreate(poolProps : Optional[cudaMemPoolProps]):
     """ Creates a memory pool.
 
-    Creates a CUDA memory pool and returns the handle in `pool`. The
-    `poolProps` determines the properties of the pool such as the backing
+    Creates a CUDA memory pool and returns the handle in ``pool``. The
+    ``poolProps`` determines the properties of the pool such as the backing
     device and IPC capabilities.
 
     To create a memory pool for host memory not targeting a specific NUMA
     node, applications must set set
-    :py:obj:`~.cudaMemPoolProps`::cudaMemLocation::type to
+    :py:obj:`~.cudaMemPoolProps.cudaMemLocation.type` to
     :py:obj:`~.cudaMemLocationTypeHost`.
-    :py:obj:`~.cudaMemPoolProps`::cudaMemLocation::id is ignored for such
+    :py:obj:`~.cudaMemPoolProps.cudaMemLocation.id` is ignored for such
     pools. Pools created with the type :py:obj:`~.cudaMemLocationTypeHost`
     are not IPC capable and :py:obj:`~.cudaMemPoolProps.handleTypes` must
     be 0, any other values will result in
     :py:obj:`~.cudaErrorInvalidValue`. To create a memory pool targeting a
     specific host NUMA node, applications must set
-    :py:obj:`~.cudaMemPoolProps`::cudaMemLocation::type to
+    :py:obj:`~.cudaMemPoolProps.cudaMemLocation.type` to
     :py:obj:`~.cudaMemLocationTypeHostNuma` and
-    :py:obj:`~.cudaMemPoolProps`::cudaMemLocation::id must specify the NUMA
+    :py:obj:`~.cudaMemPoolProps.cudaMemLocation.id` must specify the NUMA
     ID of the host memory node. Specifying
     :py:obj:`~.cudaMemLocationTypeHostNumaCurrent` as the
-    :py:obj:`~.cudaMemPoolProps`::cudaMemLocation::type will result in
+    :py:obj:`~.cudaMemPoolProps.cudaMemLocation.type` will result in
     :py:obj:`~.cudaErrorInvalidValue`. By default, the pool's memory will
     be accessible from the device it is allocated on. In the case of pools
     created with :py:obj:`~.cudaMemLocationTypeHostNuma` or
@@ -31608,7 +31619,7 @@ def cudaMemPoolCreate(poolProps : Optional[cudaMemPoolProps]):
     the pool will default to a system dependent value.
 
     Applications that intend to use :py:obj:`~.CU_MEM_HANDLE_TYPE_FABRIC`
-    based memory sharing must ensure: (1) `nvidia-caps-imex-channels`
+    based memory sharing must ensure: (1) ``nvidia-caps-imex-channels``
     character device is created by the driver and is listed under
     /proc/devices (2) have at least one IMEX channel file accessible by the
     user launching the application.
@@ -31624,15 +31635,15 @@ def cudaMemPoolCreate(poolProps : Optional[cudaMemPoolProps]):
     These channel files exist in /dev/nvidia-caps-imex-channels/channel*
     and can be created using standard OS native calls like mknod on Linux.
     For example: To create channel0 with the major number from
-    /proc/devices users can execute the following command: `mknod
-    /dev/nvidia-caps-imex-channels/channel0 c <major number> 0`
+    /proc/devices users can execute the following command: ``mknod
+    /dev/nvidia-caps-imex-channels/channel0 c <major number> 0``
 
     To create a managed memory pool, applications must set
-    :py:obj:`~.cudaMemPoolProps`:cudaMemAllocationType to
+    :py:obj:`~.cudaMemPoolProps.cudaMemAllocationType` to
     :py:obj:`~.cudaMemAllocationTypeManaged`.
-    :py:obj:`~.cudaMemPoolProps`::cudaMemAllocationHandleType must also be
+    :py:obj:`~.cudaMemPoolProps.cudaMemAllocationHandleType` must also be
     set to :py:obj:`~.cudaMemHandleTypeNone` since IPC is not supported.
-    For managed memory pools, :py:obj:`~.cudaMemPoolProps`::cudaMemLocation
+    For managed memory pools, :py:obj:`~.cudaMemPoolProps.cudaMemLocation`
     will be treated as the preferred location for all allocations created
     from the pool. An application can also set
     :py:obj:`~.cudaMemLocationTypeNone` to indicate no preferred location.
@@ -31890,8 +31901,8 @@ def cudaMemSetMemPool(location : Optional[cudaMemLocation], typename not None :
 def cudaMallocFromPoolAsync(size_t size, memPool, stream):
     """ Allocates memory from a specified pool with stream ordered semantics.
 
-    Inserts an allocation operation into `hStream`. A pointer to the
-    allocated memory is returned immediately in *dptr. The allocation must
+    Inserts an allocation operation into ``stream``. A pointer to the
+    allocated memory is returned immediately in \\*dptr. The allocation must
     not be accessed until the the allocation operation completes. The
     allocation comes from the specified memory pool.
 
@@ -32051,7 +32062,7 @@ def cudaMemPoolImportFromShareableHandle(shareableHandle, handleType not None :
 def cudaMemPoolExportPointer(ptr):
     """ Export data to share a memory pool allocation between processes.
 
-    Constructs `shareData_out` for sharing a specific allocation from an
+    Constructs ``shareData_out`` for sharing a specific allocation from an
     already shared memory pool. The recipient process can import the
     allocation with the :py:obj:`~.cudaMemPoolImportPointer` api. The data
     is not a handle and may be shared through any IPC mechanism.
@@ -32089,7 +32100,7 @@ def cudaMemPoolExportPointer(ptr):
 def cudaMemPoolImportPointer(memPool, exportData : Optional[cudaMemPoolPtrExportData]):
     """ Import a memory pool allocation from another process.
 
-    Returns in `ptr_out` a pointer to the imported memory. The imported
+    Returns in ``ptr_out`` a pointer to the imported memory. The imported
     memory must not be accessed before the allocation operation completes
     in the exporting process. The imported memory must be freed from all
     importing processes before being freed in the exporting process. The
@@ -32142,12 +32153,12 @@ def cudaMemPoolImportPointer(memPool, exportData : Optional[cudaMemPoolPtrExport
 def cudaPointerGetAttributes(ptr):
     """ Returns attributes about a specified pointer.
 
-    Returns in `*attributes` the attributes of the pointer `ptr`. If
+    Returns in ``*attributes`` the attributes of the pointer ``ptr``. If
     pointer was not allocated in, mapped by or registered with context
     supporting unified addressing :py:obj:`~.cudaErrorInvalidValue` is
     returned.
 
-    The :py:obj:`~.cudaPointerAttributes` structure is defined as:
+    The cudaPointerAttributes structure is defined as:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
@@ -32159,23 +32170,23 @@ def cudaPointerGetAttributes(ptr):
       :py:obj:`~.cudaMemoryTypeDevice` for device memory or
       :py:obj:`~.cudaMemoryTypeManaged` for managed memory.
 
-    - :py:obj:`~.device` is the device against which `ptr` was allocated.
-      If `ptr` has memory type :py:obj:`~.cudaMemoryTypeDevice` then this
-      identifies the device on which the memory referred to by `ptr`
-      physically resides. If `ptr` has memory type
+    - :py:obj:`~.device` is the device against which ``ptr`` was allocated.
+      If ``ptr`` has memory type :py:obj:`~.cudaMemoryTypeDevice` then this
+      identifies the device on which the memory referred to by ``ptr``
+      physically resides. If ``ptr`` has memory type
       :py:obj:`~.cudaMemoryTypeHost` then this identifies the device which
       was current when the allocation was made (and if that device is
       deinitialized then this allocation will vanish with that device's
       state).
 
     - :py:obj:`~.devicePointer` is the device pointer alias through which
-      the memory referred to by `ptr` may be accessed on the current
-      device. If the memory referred to by `ptr` cannot be accessed
+      the memory referred to by ``ptr`` may be accessed on the current
+      device. If the memory referred to by ``ptr`` cannot be accessed
       directly by the current device then this is NULL.
 
     - :py:obj:`~.hostPointer` is the host pointer alias through which the
-      memory referred to by `ptr` may be accessed on the host. If the
-      memory referred to by `ptr` cannot be accessed directly by the host
+      memory referred to by ``ptr`` may be accessed on the host. If the
+      memory referred to by ``ptr`` cannot be accessed directly by the host
       then this is NULL.
 
     Parameters
@@ -32215,19 +32226,20 @@ def cudaPointerGetAttributes(ptr):
 def cudaDeviceCanAccessPeer(int device, int peerDevice):
     """ Queries if a device may directly access a peer device's memory.
 
-    Returns in `*canAccessPeer` a value of 1 if device `device` is capable
-    of directly accessing memory from `peerDevice` and 0 otherwise. If
-    direct access of `peerDevice` from `device` is possible, then access
-    may be enabled by calling :py:obj:`~.cudaDeviceEnablePeerAccess()`.
+    Returns in ``*canAccessPeer`` a value of 1 if device ``device`` is
+    capable of directly accessing memory from ``peerDevice`` and 0
+    otherwise. If direct access of ``peerDevice`` from ``device`` is
+    possible, then access may be enabled by calling
+    :py:obj:`~.cudaDeviceEnablePeerAccess()`.
 
     Parameters
     ----------
     device : int
-        Device from which allocations on `peerDevice` are to be directly
+        Device from which allocations on ``peerDevice`` are to be directly
         accessed.
     peerDevice : int
-        Device on which the allocations to be directly accessed by `device`
-        reside.
+        Device on which the allocations to be directly accessed by
+        ``device`` reside.
 
     Returns
     -------
@@ -32254,14 +32266,14 @@ def cudaDeviceCanAccessPeer(int device, int peerDevice):
 def cudaDeviceEnablePeerAccess(int peerDevice, unsigned int flags):
     """ Enables direct access to memory allocations on a peer device.
 
-    On success, all allocations from `peerDevice` will immediately be
+    On success, all allocations from ``peerDevice`` will immediately be
     accessible by the current device. They will remain accessible until
     access is explicitly disabled using
     :py:obj:`~.cudaDeviceDisablePeerAccess()` or either device is reset
     using :py:obj:`~.cudaDeviceReset()`.
 
     Note that access granted by this call is unidirectional and that in
-    order to access memory on the current device from `peerDevice`, a
+    order to access memory on the current device from ``peerDevice``, a
     separate symmetric call to :py:obj:`~.cudaDeviceEnablePeerAccess()` is
     required.
 
@@ -32271,12 +32283,12 @@ def cudaDeviceEnablePeerAccess(int peerDevice, unsigned int flags):
 
     Returns :py:obj:`~.cudaErrorInvalidDevice` if
     :py:obj:`~.cudaDeviceCanAccessPeer()` indicates that the current device
-    cannot directly access memory from `peerDevice`.
+    cannot directly access memory from ``peerDevice``.
 
     Returns :py:obj:`~.cudaErrorPeerAccessAlreadyEnabled` if direct access
-    of `peerDevice` from the current device has already been enabled.
+    of ``peerDevice`` from the current device has already been enabled.
 
-    Returns :py:obj:`~.cudaErrorInvalidValue` if `flags` is not 0.
+    Returns :py:obj:`~.cudaErrorInvalidValue` if ``flags`` is not 0.
 
     Parameters
     ----------
@@ -32306,7 +32318,7 @@ def cudaDeviceDisablePeerAccess(int peerDevice):
     """ Disables direct access to memory allocations on a peer device.
 
     Returns :py:obj:`~.cudaErrorPeerAccessNotEnabled` if direct access to
-    memory on `peerDevice` has not yet been enabled from the current
+    memory on ``peerDevice`` has not yet been enabled from the current
     device.
 
     Parameters
@@ -32334,10 +32346,10 @@ def cudaDeviceDisablePeerAccess(int peerDevice):
 def cudaGraphicsUnregisterResource(resource):
     """ Unregisters a graphics resource for access by CUDA.
 
-    Unregisters the graphics resource `resource` so it is not accessible by
-    CUDA unless registered again.
+    Unregisters the graphics resource ``resource`` so it is not accessible
+    by CUDA unless registered again.
 
-    If `resource` is invalid then
+    If ``resource`` is invalid then
     :py:obj:`~.cudaErrorInvalidResourceHandle` is returned.
 
     Parameters
@@ -32373,26 +32385,26 @@ def cudaGraphicsUnregisterResource(resource):
 def cudaGraphicsResourceSetMapFlags(resource, unsigned int flags):
     """ Set usage flags for mapping a graphics resource.
 
-    Set `flags` for mapping the graphics resource `resource`.
+    Set ``flags`` for mapping the graphics resource ``resource``.
 
-    Changes to `flags` will take effect the next time `resource` is mapped.
-    The `flags` argument may be any of the following:
+    Changes to ``flags`` will take effect the next time ``resource`` is
+    mapped. The ``flags`` argument may be any of the following:
 
     - :py:obj:`~.cudaGraphicsMapFlagsNone`: Specifies no hints about how
-      `resource` will be used. It is therefore assumed that CUDA may read
-      from or write to `resource`.
+      ``resource`` will be used. It is therefore assumed that CUDA may read
+      from or write to ``resource``.
 
     - :py:obj:`~.cudaGraphicsMapFlagsReadOnly`: Specifies that CUDA will
-      not write to `resource`.
+      not write to ``resource``.
 
     - :py:obj:`~.cudaGraphicsMapFlagsWriteDiscard`: Specifies CUDA will not
-      read from `resource` and will write over the entire contents of
-      `resource`, so none of the data previously stored in `resource` will
-      be preserved.
+      read from ``resource`` and will write over the entire contents of
+      ``resource``, so none of the data previously stored in ``resource``
+      will be preserved.
 
-    If `resource` is presently mapped for access by CUDA then
-    :py:obj:`~.cudaErrorUnknown` is returned. If `flags` is not one of the
-    above values then :py:obj:`~.cudaErrorInvalidValue` is returned.
+    If ``resource`` is presently mapped for access by CUDA then
+    :py:obj:`~.cudaErrorUnknown` is returned. If ``flags`` is not one of
+    the above values then :py:obj:`~.cudaErrorInvalidValue` is returned.
 
     Parameters
     ----------
@@ -32429,20 +32441,21 @@ def cudaGraphicsResourceSetMapFlags(resource, unsigned int flags):
 def cudaGraphicsMapResources(int count, resources, stream):
     """ Map graphics resources for access by CUDA.
 
-    Maps the `count` graphics resources in `resources` for access by CUDA.
+    Maps the ``count`` graphics resources in ``resources`` for access by
+    CUDA.
 
-    The resources in `resources` may be accessed by CUDA until they are
-    unmapped. The graphics API from which `resources` were registered
+    The resources in ``resources`` may be accessed by CUDA until they are
+    unmapped. The graphics API from which ``resources`` were registered
     should not access any resources while they are mapped by CUDA. If an
     application does so, the results are undefined.
 
     This function provides the synchronization guarantee that any graphics
     calls issued before :py:obj:`~.cudaGraphicsMapResources()` will
-    complete before any subsequent CUDA work issued in `stream` begins.
+    complete before any subsequent CUDA work issued in ``stream`` begins.
 
-    If `resources` contains any duplicate entries then
+    If ``resources`` contains any duplicate entries then
     :py:obj:`~.cudaErrorInvalidResourceHandle` is returned. If any of
-    `resources` are presently mapped for access by CUDA then
+    ``resources`` are presently mapped for access by CUDA then
     :py:obj:`~.cudaErrorUnknown` is returned.
 
     Parameters
@@ -32492,18 +32505,18 @@ def cudaGraphicsMapResources(int count, resources, stream):
 def cudaGraphicsUnmapResources(int count, resources, stream):
     """ Unmap graphics resources.
 
-    Unmaps the `count` graphics resources in `resources`.
+    Unmaps the ``count`` graphics resources in ``resources``.
 
-    Once unmapped, the resources in `resources` may not be accessed by CUDA
-    until they are mapped again.
+    Once unmapped, the resources in ``resources`` may not be accessed by
+    CUDA until they are mapped again.
 
     This function provides the synchronization guarantee that any CUDA work
-    issued in `stream` before :py:obj:`~.cudaGraphicsUnmapResources()` will
-    complete before any subsequently issued graphics work begins.
+    issued in ``stream`` before :py:obj:`~.cudaGraphicsUnmapResources()`
+    will complete before any subsequently issued graphics work begins.
 
-    If `resources` contains any duplicate entries then
+    If ``resources`` contains any duplicate entries then
     :py:obj:`~.cudaErrorInvalidResourceHandle` is returned. If any of
-    `resources` are not presently mapped for access by CUDA then
+    ``resources`` are not presently mapped for access by CUDA then
     :py:obj:`~.cudaErrorUnknown` is returned.
 
     Parameters
@@ -32553,14 +32566,14 @@ def cudaGraphicsUnmapResources(int count, resources, stream):
 def cudaGraphicsResourceGetMappedPointer(resource):
     """ Get an device pointer through which to access a mapped graphics resource.
 
-    Returns in `*devPtr` a pointer through which the mapped graphics
-    resource `resource` may be accessed. Returns in `*size` the size of the
-    memory in bytes which may be accessed from that pointer. The value set
-    in `devPtr` may change every time that `resource` is mapped.
+    Returns in ``*devPtr`` a pointer through which the mapped graphics
+    resource ``resource`` may be accessed. Returns in ``*size`` the size of
+    the memory in bytes which may be accessed from that pointer. The value
+    set in ``devPtr`` may change every time that ``resource`` is mapped.
 
-    If `resource` is not a buffer then it cannot be accessed via a pointer
-    and :py:obj:`~.cudaErrorUnknown` is returned. If `resource` is not
-    mapped then :py:obj:`~.cudaErrorUnknown` is returned.
+    If ``resource`` is not a buffer then it cannot be accessed via a
+    pointer and :py:obj:`~.cudaErrorUnknown` is returned. If ``resource``
+    is not mapped then :py:obj:`~.cudaErrorUnknown` is returned.
 
     Parameters
     ----------
@@ -32599,17 +32612,18 @@ def cudaGraphicsResourceGetMappedPointer(resource):
 def cudaGraphicsSubResourceGetMappedArray(resource, unsigned int arrayIndex, unsigned int mipLevel):
     """ Get an array through which to access a subresource of a mapped graphics resource.
 
-    Returns in `*array` an array through which the subresource of the
-    mapped graphics resource `resource` which corresponds to array index
-    `arrayIndex` and mipmap level `mipLevel` may be accessed. The value set
-    in `array` may change every time that `resource` is mapped.
-
-    If `resource` is not a texture then it cannot be accessed via an array
-    and :py:obj:`~.cudaErrorUnknown` is returned. If `arrayIndex` is not a
-    valid array index for `resource` then :py:obj:`~.cudaErrorInvalidValue`
-    is returned. If `mipLevel` is not a valid mipmap level for `resource`
-    then :py:obj:`~.cudaErrorInvalidValue` is returned. If `resource` is
-    not mapped then :py:obj:`~.cudaErrorUnknown` is returned.
+    Returns in ``*array`` an array through which the subresource of the
+    mapped graphics resource ``resource`` which corresponds to array index
+    ``arrayIndex`` and mipmap level ``mipLevel`` may be accessed. The value
+    set in ``array`` may change every time that ``resource`` is mapped.
+
+    If ``resource`` is not a texture then it cannot be accessed via an
+    array and :py:obj:`~.cudaErrorUnknown` is returned. If ``arrayIndex``
+    is not a valid array index for ``resource`` then
+    :py:obj:`~.cudaErrorInvalidValue` is returned. If ``mipLevel`` is not a
+    valid mipmap level for ``resource`` then
+    :py:obj:`~.cudaErrorInvalidValue` is returned. If ``resource`` is not
+    mapped then :py:obj:`~.cudaErrorUnknown` is returned.
 
     Parameters
     ----------
@@ -32627,7 +32641,7 @@ def cudaGraphicsSubResourceGetMappedArray(resource, unsigned int arrayIndex, uns
     cudaError_t
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidResourceHandle`, :py:obj:`~.cudaErrorUnknown`
     array : :py:obj:`~.cudaArray_t`
-        Returned array through which a subresource of `resource` may be
+        Returned array through which a subresource of ``resource`` may be
         accessed
 
     See Also
@@ -32656,13 +32670,13 @@ def cudaGraphicsSubResourceGetMappedArray(resource, unsigned int arrayIndex, uns
 def cudaGraphicsResourceGetMappedMipmappedArray(resource):
     """ Get a mipmapped array through which to access a mapped graphics resource.
 
-    Returns in `*mipmappedArray` a mipmapped array through which the mapped
-    graphics resource `resource` may be accessed. The value set in
-    `mipmappedArray` may change every time that `resource` is mapped.
+    Returns in ``*mipmappedArray`` a mipmapped array through which the
+    mapped graphics resource ``resource`` may be accessed. The value set in
+    ``mipmappedArray`` may change every time that ``resource`` is mapped.
 
-    If `resource` is not a texture then it cannot be accessed via an array
-    and :py:obj:`~.cudaErrorUnknown` is returned. If `resource` is not
-    mapped then :py:obj:`~.cudaErrorUnknown` is returned.
+    If ``resource`` is not a texture then it cannot be accessed via an
+    array and :py:obj:`~.cudaErrorUnknown` is returned. If ``resource`` is
+    not mapped then :py:obj:`~.cudaErrorUnknown` is returned.
 
     Parameters
     ----------
@@ -32674,7 +32688,7 @@ def cudaGraphicsResourceGetMappedMipmappedArray(resource):
     cudaError_t
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidResourceHandle`, :py:obj:`~.cudaErrorUnknown`
     mipmappedArray : :py:obj:`~.cudaMipmappedArray_t`
-        Returned mipmapped array through which `resource` may be accessed
+        Returned mipmapped array through which ``resource`` may be accessed
 
     See Also
     --------
@@ -32702,7 +32716,8 @@ def cudaGraphicsResourceGetMappedMipmappedArray(resource):
 def cudaGetChannelDesc(array):
     """ Get the channel descriptor of an array.
 
-    Returns in `*desc` the channel descriptor of the CUDA array `array`.
+    Returns in ``*desc`` the channel descriptor of the CUDA array
+    ``array``.
 
     Parameters
     ----------
@@ -32742,9 +32757,9 @@ def cudaGetChannelDesc(array):
 def cudaCreateChannelDesc(int x, int y, int z, int w, f not None : cudaChannelFormatKind):
     """ Returns a channel descriptor using the specified format.
 
-    Returns a channel descriptor with format `f` and number of bits of each
-    component `x`, `y`, `z`, and `w`. The :py:obj:`~.cudaChannelFormatDesc`
-    is defined as:
+    Returns a channel descriptor with format ``f`` and number of bits of
+    each component ``x``, ``y``, ``z``, and ``w``. The
+    cudaChannelFormatDesc is defined as:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
@@ -32771,7 +32786,7 @@ def cudaCreateChannelDesc(int x, int y, int z, int w, f not None : cudaChannelFo
     cudaError_t.cudaSuccess
         cudaError_t.cudaSuccess
     :py:obj:`~.cudaChannelFormatDesc`
-        Channel descriptor with format `f`
+        Channel descriptor with format ``f``
 
     See Also
     --------
@@ -32791,20 +32806,20 @@ def cudaCreateChannelDesc(int x, int y, int z, int w, f not None : cudaChannelFo
 def cudaCreateTextureObject(pResDesc : Optional[cudaResourceDesc], pTexDesc : Optional[cudaTextureDesc], pResViewDesc : Optional[cudaResourceViewDesc]):
     """ Creates a texture object.
 
-    Creates a texture object and returns it in `pTexObject`. `pResDesc`
-    describes the data to texture from. `pTexDesc` describes how the data
-    should be sampled. `pResViewDesc` is an optional argument that
-    specifies an alternate format for the data described by `pResDesc`, and
-    also describes the subresource region to restrict access to when
-    texturing. `pResViewDesc` can only be specified if the type of resource
-    is a CUDA array or a CUDA mipmapped array not in a block compressed
-    format.
+    Creates a texture object and returns it in ``pTexObject``. ``pResDesc``
+    describes the data to texture from. ``pTexDesc`` describes how the data
+    should be sampled. ``pResViewDesc`` is an optional argument that
+    specifies an alternate format for the data described by ``pResDesc``,
+    and also describes the subresource region to restrict access to when
+    texturing. ``pResViewDesc`` can only be specified if the type of
+    resource is a CUDA array or a CUDA mipmapped array not in a block
+    compressed format.
 
     Texture objects are only supported on devices of compute capability 3.0
     or higher. Additionally, a texture object is an opaque value, and, as
     such, should only be accessed through CUDA API calls.
 
-    The :py:obj:`~.cudaResourceDesc` structure is defined as:
+    The cudaResourceDesc structure is defined as:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
@@ -32817,46 +32832,46 @@ def cudaCreateTextureObject(pResDesc : Optional[cudaResourceDesc], pTexDesc : Op
 
     If :py:obj:`~.cudaResourceDesc.resType` is set to
     :py:obj:`~.cudaResourceTypeArray`,
-    :py:obj:`~.cudaResourceDesc`::res::array::array must be set to a valid
+    :py:obj:`~.cudaResourceDesc.res.array.array` must be set to a valid
     CUDA array handle.
 
     If :py:obj:`~.cudaResourceDesc.resType` is set to
     :py:obj:`~.cudaResourceTypeMipmappedArray`,
-    :py:obj:`~.cudaResourceDesc`::res::mipmap::mipmap must be set to a
-    valid CUDA mipmapped array handle and
+    :py:obj:`~.cudaResourceDesc.res.mipmap.mipmap` must be set to a valid
+    CUDA mipmapped array handle and
     :py:obj:`~.cudaTextureDesc.normalizedCoords` must be set to true.
 
     If :py:obj:`~.cudaResourceDesc.resType` is set to
     :py:obj:`~.cudaResourceTypeLinear`,
-    :py:obj:`~.cudaResourceDesc`::res::linear::devPtr must be set to a
-    valid device pointer, that is aligned to
+    :py:obj:`~.cudaResourceDesc.res.linear.devPtr` must be set to a valid
+    device pointer, that is aligned to
     :py:obj:`~.cudaDeviceProp.textureAlignment`.
-    :py:obj:`~.cudaResourceDesc`::res::linear::desc describes the format
-    and the number of components per array element.
-    :py:obj:`~.cudaResourceDesc`::res::linear::sizeInBytes specifies the
-    size of the array in bytes. The total number of elements in the linear
+    :py:obj:`~.cudaResourceDesc.res.linear.desc` describes the format and
+    the number of components per array element.
+    :py:obj:`~.cudaResourceDesc.res.linear.sizeInBytes` specifies the size
+    of the array in bytes. The total number of elements in the linear
     address range cannot exceed
     :py:obj:`~.cudaDeviceGetTexture1DLinearMaxWidth()`. The number of
     elements is computed as (sizeInBytes / sizeof(desc)).
 
     If :py:obj:`~.cudaResourceDesc.resType` is set to
     :py:obj:`~.cudaResourceTypePitch2D`,
-    :py:obj:`~.cudaResourceDesc`::res::pitch2D::devPtr must be set to a
-    valid device pointer, that is aligned to
+    :py:obj:`~.cudaResourceDesc.res.pitch2D.devPtr` must be set to a valid
+    device pointer, that is aligned to
     :py:obj:`~.cudaDeviceProp.textureAlignment`.
-    :py:obj:`~.cudaResourceDesc`::res::pitch2D::desc describes the format
-    and the number of components per array element.
-    :py:obj:`~.cudaResourceDesc`::res::pitch2D::width and
-    :py:obj:`~.cudaResourceDesc`::res::pitch2D::height specify the width
-    and height of the array in elements, and cannot exceed
-    :py:obj:`~.cudaDeviceProp.maxTexture2DLinear`[0] and
-    :py:obj:`~.cudaDeviceProp.maxTexture2DLinear`[1] respectively.
-    :py:obj:`~.cudaResourceDesc`::res::pitch2D::pitchInBytes specifies the
+    :py:obj:`~.cudaResourceDesc.res.pitch2D.desc` describes the format and
+    the number of components per array element.
+    :py:obj:`~.cudaResourceDesc.res.pitch2D.width` and
+    :py:obj:`~.cudaResourceDesc.res.pitch2D.height` specify the width and
+    height of the array in elements, and cannot exceed
+    :py:obj:`~.cudaDeviceProp.maxTexture2DLinear` ``[0]`` and
+    :py:obj:`~.cudaDeviceProp.maxTexture2DLinear` ``[1]`` respectively.
+    :py:obj:`~.cudaResourceDesc.res.pitch2D.pitchInBytes` specifies the
     pitch between two rows in bytes and has to be aligned to
     :py:obj:`~.cudaDeviceProp.texturePitchAlignment`. Pitch cannot exceed
-    :py:obj:`~.cudaDeviceProp.maxTexture2DLinear`[2].
+    :py:obj:`~.cudaDeviceProp.maxTexture2DLinear` ``[2]``.
 
-    The :py:obj:`~.cudaTextureDesc` struct is defined as
+    The cudaTextureDesc struct is defined as
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
@@ -32899,13 +32914,14 @@ def cudaCreateTextureObject(pResDesc : Optional[cudaResourceDesc], pTexDesc : Op
       conversion should be performed during texture fetch.
 
     - :py:obj:`~.cudaTextureDesc.borderColor` specifies the float values of
-      color. where: :py:obj:`~.cudaTextureDesc.borderColor`[0] contains
-      value of 'R', :py:obj:`~.cudaTextureDesc.borderColor`[1] contains
-      value of 'G', :py:obj:`~.cudaTextureDesc.borderColor`[2] contains
-      value of 'B', :py:obj:`~.cudaTextureDesc.borderColor`[3] contains
-      value of 'A' Note that application using integer border color values
-      will need to <reinterpret_cast> these values to float. The values are
-      set only when the addressing mode specified by
+      color, where: :py:obj:`~.cudaTextureDesc.borderColor` ``[0]``
+      contains value of 'R', :py:obj:`~.cudaTextureDesc.borderColor`
+      ``[1]`` contains value of 'G',
+      :py:obj:`~.cudaTextureDesc.borderColor` ``[2]`` contains value of
+      'B', :py:obj:`~.cudaTextureDesc.borderColor` ``[3]`` contains value
+      of 'A'. Note that applications using integer border color values will
+      need to <reinterpret_cast> these values to float. The values are set
+      only when the addressing mode specified by
       :py:obj:`~.cudaTextureDesc.addressMode` is cudaAddressModeBorder.
 
     - :py:obj:`~.cudaTextureDesc.normalizedCoords` specifies whether the
@@ -32945,7 +32961,7 @@ def cudaCreateTextureObject(pResDesc : Optional[cudaResourceDesc], pTexDesc : Op
       :py:obj:`~.cudaFilterModeLinear` seamless cube map filtering will be
       performed when sampling along the cube face borders.
 
-    The :py:obj:`~.cudaResourceViewDesc` struct is defined as
+    The cudaResourceViewDesc struct is defined as
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
@@ -33036,7 +33052,7 @@ def cudaCreateTextureObject(pResDesc : Optional[cudaResourceDesc], pTexDesc : Op
 def cudaDestroyTextureObject(texObject):
     """ Destroys a texture object.
 
-    Destroys the texture object specified by `texObject`.
+    Destroys the texture object specified by ``texObject``.
 
     Parameters
     ----------
@@ -33072,7 +33088,7 @@ def cudaGetTextureObjectResourceDesc(texObject):
     """ Returns a texture object's resource descriptor.
 
     Returns the resource descriptor for the texture object specified by
-    `texObject`.
+    ``texObject``.
 
     Parameters
     ----------
@@ -33113,7 +33129,7 @@ def cudaGetTextureObjectTextureDesc(texObject):
     """ Returns a texture object's texture descriptor.
 
     Returns the texture descriptor for the texture object specified by
-    `texObject`.
+    ``texObject``.
 
     Parameters
     ----------
@@ -33154,7 +33170,7 @@ def cudaGetTextureObjectResourceViewDesc(texObject):
     """ Returns a texture object's resource view descriptor.
 
     Returns the resource view descriptor for the texture object specified
-    by `texObject`. If no resource view was specified,
+    by ``texObject``. If no resource view was specified,
     :py:obj:`~.cudaErrorInvalidValue` is returned.
 
     Parameters
@@ -33195,11 +33211,11 @@ def cudaGetTextureObjectResourceViewDesc(texObject):
 def cudaCreateSurfaceObject(pResDesc : Optional[cudaResourceDesc]):
     """ Creates a surface object.
 
-    Creates a surface object and returns it in `pSurfObject`. `pResDesc`
-    describes the data to perform surface load/stores on.
+    Creates a surface object and returns it in ``pSurfObject``.
+    ``pResDesc`` describes the data to perform surface load/stores on.
     :py:obj:`~.cudaResourceDesc.resType` must be
     :py:obj:`~.cudaResourceTypeArray` and
-    :py:obj:`~.cudaResourceDesc`::res::array::array must be set to a valid
+    :py:obj:`~.cudaResourceDesc.res.array.array` must be set to a valid
     CUDA array handle.
 
     Surface objects are only supported on devices of compute capability 3.0
@@ -33237,7 +33253,7 @@ def cudaCreateSurfaceObject(pResDesc : Optional[cudaResourceDesc]):
 def cudaDestroySurfaceObject(surfObject):
     """ Destroys a surface object.
 
-    Destroys the surface object specified by `surfObject`.
+    Destroys the surface object specified by ``surfObject``.
 
     Parameters
     ----------
@@ -33270,7 +33286,7 @@ def cudaDestroySurfaceObject(surfObject):
 
 @cython.embedsignature(True)
 def cudaGetSurfaceObjectResourceDesc(surfObject):
-    """ Returns a surface object's resource descriptor Returns the resource descriptor for the surface object specified by `surfObject`.
+    """ Returns a surface object's resource descriptor. Returns the resource descriptor for the surface object specified by ``surfObject``.
 
     Parameters
     ----------
@@ -33310,13 +33326,13 @@ def cudaGetSurfaceObjectResourceDesc(surfObject):
 def cudaDriverGetVersion():
     """ Returns the latest version of CUDA supported by the driver.
 
-    Returns in `*driverVersion` the latest version of CUDA supported by the
-    driver. The version is returned as (1000 * major + 10 * minor). For
-    example, CUDA 9.2 would be represented by 9020. If no driver is
+    Returns in ``*driverVersion`` the latest version of CUDA supported by
+    the driver. The version is returned as (1000 \\* major + 10 \\* minor).
+    For example, CUDA 9.2 would be represented by 9020. If no driver is
     installed, then 0 is returned as the driver version.
 
     This function automatically returns :py:obj:`~.cudaErrorInvalidValue`
-    if `driverVersion` is NULL.
+    if ``driverVersion`` is NULL.
 
     Returns
     -------
@@ -33343,8 +33359,8 @@ def cudaDriverGetVersion():
 def cudaRuntimeGetVersion():
     """ Returns the CUDA Runtime version.
 
-    Returns in `*runtimeVersion` the version number of the current CUDA
-    Runtime instance. The version is returned as (1000 * major + 10 *
+    Returns in ``*runtimeVersion`` the version number of the current CUDA
+    Runtime instance. The version is returned as (1000 \\* major + 10 \\*
     minor). For example, CUDA 9.2 would be represented by 9020.
 
     As of CUDA 12.0, this function no longer initializes CUDA. The purpose
@@ -33352,7 +33368,7 @@ def cudaRuntimeGetVersion():
     CUDA Toolkit version in the above format.
 
     This function automatically returns :py:obj:`~.cudaErrorInvalidValue`
-    if the `runtimeVersion` argument is NULL.
+    if the ``runtimeVersion`` argument is NULL.
 
     Returns
     -------
@@ -33477,7 +33493,7 @@ def cudaLogsDumpToFile(iterator : Optional[cudaLogIterator], char* pathToFile, u
 
     Logs generated by the driver are stored in an internal buffer and can
     be copied out using this API. This API dumps all driver logs starting
-    from `iterator` into `pathToFile` provided.
+    from ``iterator`` into ``pathToFile`` provided.
 
     Parameters
     ----------
@@ -33499,7 +33515,7 @@ def cudaLogsDumpToFile(iterator : Optional[cudaLogIterator], char* pathToFile, u
 
     Notes
     -----
-    `iterator` is auto-advancing. Dumping logs will update the value of `iterator` to receive the next generated log.
+    ``iterator`` is auto-advancing. Dumping logs will update the value of ``iterator`` to receive the next generated log.
 
     The driver reserves limited memory for storing logs. The oldest logs may be overwritten and become unrecoverable. An indication will appear in the destination outupt if the logs have been truncated. Call dump after each failed API to mitigate this risk.
     """
@@ -33521,14 +33537,14 @@ def cudaLogsDumpToMemory(iterator : Optional[cudaLogIterator], char* buffer, siz
 
     Logs generated by the driver are stored in an internal buffer and can
     be copied out using this API. This API dumps driver logs from
-    `iterator` into `buffer` up to the size specified in `*size`. The
+    ``iterator`` into ``buffer`` up to the size specified in ``*size``. The
     driver will always null terminate the buffer but there will not be a
-    null character between log entries, only a newline \n. The driver will
-    then return the actual number of bytes written in `*size`, excluding
-    the null terminator. If there are no messages to dump, `*size` will be
-    set to 0 and the function will return :py:obj:`~.CUDA_SUCCESS`. If the
-    provided `buffer` is not large enough to hold any messages, `*size`
-    will be set to 0 and the function will return
+    null character between log entries, only a newline \\n. The driver will
+    then return the actual number of bytes written in ``*size``, excluding
+    the null terminator. If there are no messages to dump, ``*size`` will
+    be set to 0 and the function will return :py:obj:`~.CUDA_SUCCESS`. If
+    the provided ``buffer`` is not large enough to hold any messages,
+    ``*size`` will be set to 0 and the function will return
     :py:obj:`~.CUDA_ERROR_INVALID_VALUE`.
 
     Parameters
@@ -33555,11 +33571,11 @@ def cudaLogsDumpToMemory(iterator : Optional[cudaLogIterator], char* buffer, siz
 
     Notes
     -----
-    `iterator` is auto-advancing. Dumping logs will update the value of `iterator` to receive the next generated log.
+    ``iterator`` is auto-advancing. Dumping logs will update the value of ``iterator`` to receive the next generated log.
 
     The driver reserves limited memory for storing logs. The maximum size of the buffer is 25600 bytes. The oldest logs may be overwritten and become unrecoverable. An indication will appear in the destination outupt if the logs have been truncated. Call dump after each failed API to mitigate this risk.
 
-    If the provided value in `*size` is not large enough to hold all buffered messages, a message will be added at the head of the buffer indicating this. The driver then computes the number of messages it is able to store in `buffer` and writes it out. The final message in `buffer` will always be the most recent log message as of when the API is called.
+    If the provided value in ``*size`` is not large enough to hold all buffered messages, a message will be added at the head of the buffer indicating this. The driver then computes the number of messages it is able to store in ``buffer`` and writes it out. The final message in ``buffer`` will always be the most recent log message as of when the API is called.
     """
     cdef cyruntime.cudaLogIterator* cyiterator = NULL
     if iterator is not None:
@@ -33577,7 +33593,7 @@ def cudaLogsDumpToMemory(iterator : Optional[cudaLogIterator], char* buffer, siz
 def cudaGraphCreate(unsigned int flags):
     """ Creates a graph.
 
-    Creates an empty graph, which is returned via `pGraph`.
+    Creates an empty graph, which is returned via ``pGraph``.
 
     Parameters
     ----------
@@ -33609,62 +33625,63 @@ def cudaGraphCreate(unsigned int flags):
 def cudaGraphAddKernelNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t] | list[cudaGraphNode_t]], size_t numDependencies, pNodeParams : Optional[cudaKernelNodeParams]):
     """ Creates a kernel execution node and adds it to a graph.
 
-    Creates a new kernel execution node and adds it to `graph` with
-    `numDependencies` dependencies specified via `pDependencies` and
-    arguments specified in `pNodeParams`. It is possible for
-    `numDependencies` to be 0, in which case the node will be placed at the
-    root of the graph. `pDependencies` may not have any duplicate entries.
-    A handle to the new node will be returned in `pGraphNode`.
+    Creates a new kernel execution node and adds it to ``graph`` with
+    ``numDependencies`` dependencies specified via ``pDependencies`` and
+    arguments specified in ``pNodeParams``. It is possible for
+    ``numDependencies`` to be 0, in which case the node will be placed at
+    the root of the graph. ``pDependencies`` may not have any duplicate
+    entries. A handle to the new node will be returned in ``pGraphNode``.
 
-    The :py:obj:`~.cudaKernelNodeParams` structure is defined as:
+    The cudaKernelNodeParams structure is defined as:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
-    When the graph is launched, the node will invoke kernel `func` on a
-    (`gridDim.x` x `gridDim.y` x `gridDim.z`) grid of blocks. Each block
-    contains (`blockDim.x` x `blockDim.y` x `blockDim.z`) threads.
+    When the graph is launched, the node will invoke kernel ``func`` on a
+    (``gridDim.x`` x ``gridDim.y`` x ``gridDim.z``) grid of blocks. Each
+    block contains (``blockDim.x`` x ``blockDim.y`` x ``blockDim.z``)
+    threads.
 
-    `sharedMem` sets the amount of dynamic shared memory that will be
+    ``sharedMem`` sets the amount of dynamic shared memory that will be
     available to each thread block.
 
-    Kernel parameters to `func` can be specified in one of two ways:
+    Kernel parameters to ``func`` can be specified in one of two ways:
 
-    1) Kernel parameters can be specified via `kernelParams`. If the kernel
-    has N parameters, then `kernelParams` needs to be an array of N
-    pointers. Each pointer, from `kernelParams`[0] to `kernelParams`[N-1],
-    points to the region of memory from which the actual parameter will be
-    copied. The number of kernel parameters and their offsets and sizes do
-    not need to be specified as that information is retrieved directly from
-    the kernel's image.
+    1) Kernel parameters can be specified via ``kernelParams``. If the
+    kernel has N parameters, then ``kernelParams`` needs to be an array of
+    N pointers. Each pointer, from ``kernelParams[0]`` to
+    ``kernelParams[N-1]``, points to the region of memory from which the
+    actual parameter will be copied. The number of kernel parameters and
+    their offsets and sizes do not need to be specified as that information
+    is retrieved directly from the kernel's image.
 
     2) Kernel parameters can also be packaged by the application into a
-    single buffer that is passed in via `extra`. This places the burden on
-    the application of knowing each kernel parameter's size and
-    alignment/padding within the buffer. The `extra` parameter exists to
+    single buffer that is passed in via ``extra``. This places the burden
+    on the application of knowing each kernel parameter's size and
+    alignment/padding within the buffer. The ``extra`` parameter exists to
     allow this function to take additional less commonly used arguments.
-    `extra` specifies a list of names of extra settings and their
+    ``extra`` specifies a list of names of extra settings and their
     corresponding values. Each extra setting name is immediately followed
     by the corresponding value. The list must be terminated with either
     NULL or CU_LAUNCH_PARAM_END.
 
     - :py:obj:`~.CU_LAUNCH_PARAM_END`, which indicates the end of the
-      `extra` array;
+      ``extra`` array;
 
     - :py:obj:`~.CU_LAUNCH_PARAM_BUFFER_POINTER`, which specifies that the
-      next value in `extra` will be a pointer to a buffer containing all
-      the kernel parameters for launching kernel `func`;
+      next value in ``extra`` will be a pointer to a buffer containing all
+      the kernel parameters for launching kernel ``func``;
 
     - :py:obj:`~.CU_LAUNCH_PARAM_BUFFER_SIZE`, which specifies that the
-      next value in `extra` will be a pointer to a size_t containing the
+      next value in ``extra`` will be a pointer to a size_t containing the
       size of the buffer specified with
       :py:obj:`~.CU_LAUNCH_PARAM_BUFFER_POINTER`;
 
     The error :py:obj:`~.cudaErrorInvalidValue` will be returned if kernel
-    parameters are specified with both `kernelParams` and `extra` (i.e.
-    both `kernelParams` and `extra` are non-NULL).
+    parameters are specified with both ``kernelParams`` and ``extra`` (i.e.
+    both ``kernelParams`` and ``extra`` are non-NULL).
 
-    The `kernelParams` or `extra` array, as well as the argument values it
-    points to, are copied during this call.
+    The ``kernelParams`` or ``extra`` array, as well as the argument values
+    it points to, are copied during this call.
 
     Parameters
     ----------
@@ -33731,16 +33748,16 @@ def cudaGraphAddKernelNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t
 def cudaGraphKernelNodeGetParams(node):
     """ Returns a kernel node's parameters.
 
-    Returns the parameters of kernel node `node` in `pNodeParams`. The
-    `kernelParams` or `extra` array returned in `pNodeParams`, as well as
-    the argument values it points to, are owned by the node. This memory
-    remains valid until the node is destroyed or its parameters are
+    Returns the parameters of kernel node ``node`` in ``pNodeParams``. The
+    ``kernelParams`` or ``extra`` array returned in ``pNodeParams``, as
+    well as the argument values it points to, are owned by the node. This
+    memory remains valid until the node is destroyed or its parameters are
     modified, and should not be modified directly. Use
     :py:obj:`~.cudaGraphKernelNodeSetParams` to update the parameters of
     this node.
 
-    The params will contain either `kernelParams` or `extra`, according to
-    which of these was most recently set on the node.
+    The params will contain either ``kernelParams`` or ``extra``, according
+    to which of these was most recently set on the node.
 
     Parameters
     ----------
@@ -33780,7 +33797,7 @@ def cudaGraphKernelNodeGetParams(node):
 def cudaGraphKernelNodeSetParams(node, pNodeParams : Optional[cudaKernelNodeParams]):
     """ Sets a kernel node's parameters.
 
-    Sets the parameters of kernel node `node` to `pNodeParams`.
+    Sets the parameters of kernel node ``node`` to ``pNodeParams``.
 
     Parameters
     ----------
@@ -33818,8 +33835,8 @@ def cudaGraphKernelNodeSetParams(node, pNodeParams : Optional[cudaKernelNodePara
 def cudaGraphKernelNodeCopyAttributes(hDst, hSrc):
     """ Copies attributes from source node to destination node.
 
-    Copies attributes from source node `hSrc` to destination node `hDst`.
-    Both node must have the same context.
+    Copies attributes from source node ``hSrc`` to destination node
+    ``hDst``. Both nodes must have the same context.
 
     Parameters
     ----------
@@ -33836,7 +33853,7 @@ def cudaGraphKernelNodeCopyAttributes(hDst, hSrc):
 
     See Also
     --------
-    :py:obj:`~.cudaAccessPolicyWindow`
+    cudaAccessPolicyWindow
     """
     cdef cyruntime.cudaGraphNode_t cyhSrc
     if hSrc is None:
@@ -33865,8 +33882,8 @@ def cudaGraphKernelNodeCopyAttributes(hDst, hSrc):
 def cudaGraphKernelNodeGetAttribute(hNode, attr not None : cudaKernelNodeAttrID):
     """ Queries node attribute.
 
-    Queries attribute `attr` from node `hNode` and stores it in
-    corresponding member of `value_out`.
+    Queries attribute ``attr`` from node ``hNode`` and stores it in
+    corresponding member of ``value_out``.
 
     Parameters
     ----------
@@ -33884,7 +33901,7 @@ def cudaGraphKernelNodeGetAttribute(hNode, attr not None : cudaKernelNodeAttrID)
 
     See Also
     --------
-    :py:obj:`~.cudaAccessPolicyWindow`
+    cudaAccessPolicyWindow
     """
     cdef cyruntime.cudaGraphNode_t cyhNode
     if hNode is None:
@@ -33909,8 +33926,8 @@ def cudaGraphKernelNodeGetAttribute(hNode, attr not None : cudaKernelNodeAttrID)
 def cudaGraphKernelNodeSetAttribute(hNode, attr not None : cudaKernelNodeAttrID, value : Optional[cudaKernelNodeAttrValue]):
     """ Sets node attribute.
 
-    Sets attribute `attr` on node `hNode` from corresponding attribute of
-    `value`.
+    Sets attribute ``attr`` on node ``hNode`` from corresponding attribute
+    of ``value``.
 
     Parameters
     ----------
@@ -33928,7 +33945,7 @@ def cudaGraphKernelNodeSetAttribute(hNode, attr not None : cudaKernelNodeAttrID,
 
     See Also
     --------
-    :py:obj:`~.cudaAccessPolicyWindow`
+    cudaAccessPolicyWindow
     """
     cdef cyruntime.cudaGraphNode_t cyhNode
     if hNode is None:
@@ -33951,14 +33968,15 @@ def cudaGraphKernelNodeSetAttribute(hNode, attr not None : cudaKernelNodeAttrID,
 def cudaGraphAddMemcpyNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t] | list[cudaGraphNode_t]], size_t numDependencies, pCopyParams : Optional[cudaMemcpy3DParms]):
     """ Creates a memcpy node and adds it to a graph.
 
-    Creates a new memcpy node and adds it to `graph` with `numDependencies`
-    dependencies specified via `pDependencies`. It is possible for
-    `numDependencies` to be 0, in which case the node will be placed at the
-    root of the graph. `pDependencies` may not have any duplicate entries.
-    A handle to the new node will be returned in `pGraphNode`.
+    Creates a new memcpy node and adds it to ``graph`` with
+    ``numDependencies`` dependencies specified via ``pDependencies``. It is
+    possible for ``numDependencies`` to be 0, in which case the node will
+    be placed at the root of the graph. ``pDependencies`` may not have any
+    duplicate entries. A handle to the new node will be returned in
+    ``pGraphNode``.
 
     When the graph is launched, the node will perform the memcpy described
-    by `pCopyParams`. See :py:obj:`~.cudaMemcpy3D()` for a description of
+    by ``pCopyParams``. See :py:obj:`~.cudaMemcpy3D()` for a description of
     the structure and its restrictions.
 
     Memcpy nodes have some additional restrictions with regards to managed
@@ -34027,18 +34045,18 @@ def cudaGraphAddMemcpyNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t
 def cudaGraphAddMemcpyNode1D(graph, pDependencies : Optional[tuple[cudaGraphNode_t] | list[cudaGraphNode_t]], size_t numDependencies, dst, src, size_t count, kind not None : cudaMemcpyKind):
     """ Creates a 1D memcpy node and adds it to a graph.
 
-    Creates a new 1D memcpy node and adds it to `graph` with
-    `numDependencies` dependencies specified via `pDependencies`. It is
-    possible for `numDependencies` to be 0, in which case the node will be
-    placed at the root of the graph. `pDependencies` may not have any
+    Creates a new 1D memcpy node and adds it to ``graph`` with
+    ``numDependencies`` dependencies specified via ``pDependencies``. It is
+    possible for ``numDependencies`` to be 0, in which case the node will
+    be placed at the root of the graph. ``pDependencies`` may not have any
     duplicate entries. A handle to the new node will be returned in
-    `pGraphNode`.
+    ``pGraphNode``.
 
-    When the graph is launched, the node will copy `count` bytes from the
-    memory area pointed to by `src` to the memory area pointed to by `dst`,
-    where `kind` specifies the direction of the copy, and must be one of
-    :py:obj:`~.cudaMemcpyHostToHost`, :py:obj:`~.cudaMemcpyHostToDevice`,
-    :py:obj:`~.cudaMemcpyDeviceToHost`,
+    When the graph is launched, the node will copy ``count`` bytes from the
+    memory area pointed to by ``src`` to the memory area pointed to by
+    ``dst``, where ``kind`` specifies the direction of the copy, and must
+    be one of :py:obj:`~.cudaMemcpyHostToHost`,
+    :py:obj:`~.cudaMemcpyHostToDevice`, :py:obj:`~.cudaMemcpyDeviceToHost`,
     :py:obj:`~.cudaMemcpyDeviceToDevice`, or :py:obj:`~.cudaMemcpyDefault`.
     Passing :py:obj:`~.cudaMemcpyDefault` is recommended, in which case the
     type of transfer is inferred from the pointer values. However,
@@ -34124,7 +34142,7 @@ def cudaGraphAddMemcpyNode1D(graph, pDependencies : Optional[tuple[cudaGraphNode
 def cudaGraphMemcpyNodeGetParams(node):
     """ Returns a memcpy node's parameters.
 
-    Returns the parameters of memcpy node `node` in `pNodeParams`.
+    Returns the parameters of memcpy node ``node`` in ``pNodeParams``.
 
     Parameters
     ----------
@@ -34164,7 +34182,7 @@ def cudaGraphMemcpyNodeGetParams(node):
 def cudaGraphMemcpyNodeSetParams(node, pNodeParams : Optional[cudaMemcpy3DParms]):
     """ Sets a memcpy node's parameters.
 
-    Sets the parameters of memcpy node `node` to `pNodeParams`.
+    Sets the parameters of memcpy node ``node`` to ``pNodeParams``.
 
     Parameters
     ----------
@@ -34202,14 +34220,14 @@ def cudaGraphMemcpyNodeSetParams(node, pNodeParams : Optional[cudaMemcpy3DParms]
 def cudaGraphMemcpyNodeSetParams1D(node, dst, src, size_t count, kind not None : cudaMemcpyKind):
     """ Sets a memcpy node's parameters to perform a 1-dimensional copy.
 
-    Sets the parameters of memcpy node `node` to the copy described by the
-    provided parameters.
+    Sets the parameters of memcpy node ``node`` to the copy described by
+    the provided parameters.
 
-    When the graph is launched, the node will copy `count` bytes from the
-    memory area pointed to by `src` to the memory area pointed to by `dst`,
-    where `kind` specifies the direction of the copy, and must be one of
-    :py:obj:`~.cudaMemcpyHostToHost`, :py:obj:`~.cudaMemcpyHostToDevice`,
-    :py:obj:`~.cudaMemcpyDeviceToHost`,
+    When the graph is launched, the node will copy ``count`` bytes from the
+    memory area pointed to by ``src`` to the memory area pointed to by
+    ``dst``, where ``kind`` specifies the direction of the copy, and must
+    be one of :py:obj:`~.cudaMemcpyHostToHost`,
+    :py:obj:`~.cudaMemcpyHostToDevice`, :py:obj:`~.cudaMemcpyDeviceToHost`,
     :py:obj:`~.cudaMemcpyDeviceToDevice`, or :py:obj:`~.cudaMemcpyDefault`.
     Passing :py:obj:`~.cudaMemcpyDefault` is recommended, in which case the
     type of transfer is inferred from the pointer values. However,
@@ -34266,14 +34284,15 @@ def cudaGraphMemcpyNodeSetParams1D(node, dst, src, size_t count, kind not None :
 def cudaGraphAddMemsetNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t] | list[cudaGraphNode_t]], size_t numDependencies, pMemsetParams : Optional[cudaMemsetParams]):
     """ Creates a memset node and adds it to a graph.
 
-    Creates a new memset node and adds it to `graph` with `numDependencies`
-    dependencies specified via `pDependencies`. It is possible for
-    `numDependencies` to be 0, in which case the node will be placed at the
-    root of the graph. `pDependencies` may not have any duplicate entries.
-    A handle to the new node will be returned in `pGraphNode`.
+    Creates a new memset node and adds it to ``graph`` with
+    ``numDependencies`` dependencies specified via ``pDependencies``. It is
+    possible for ``numDependencies`` to be 0, in which case the node will
+    be placed at the root of the graph. ``pDependencies`` may not have any
+    duplicate entries. A handle to the new node will be returned in
+    ``pGraphNode``.
 
     The element size must be 1, 2, or 4 bytes. When the graph is launched,
-    the node will perform the memset described by `pMemsetParams`.
+    the node will perform the memset described by ``pMemsetParams``.
 
     Parameters
     ----------
@@ -34336,7 +34355,7 @@ def cudaGraphAddMemsetNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t
 def cudaGraphMemsetNodeGetParams(node):
     """ Returns a memset node's parameters.
 
-    Returns the parameters of memset node `node` in `pNodeParams`.
+    Returns the parameters of memset node ``node`` in ``pNodeParams``.
 
     Parameters
     ----------
@@ -34376,7 +34395,7 @@ def cudaGraphMemsetNodeGetParams(node):
 def cudaGraphMemsetNodeSetParams(node, pNodeParams : Optional[cudaMemsetParams]):
     """ Sets a memset node's parameters.
 
-    Sets the parameters of memset node `node` to `pNodeParams`.
+    Sets the parameters of memset node ``node`` to ``pNodeParams``.
 
     Parameters
     ----------
@@ -34414,12 +34433,12 @@ def cudaGraphMemsetNodeSetParams(node, pNodeParams : Optional[cudaMemsetParams])
 def cudaGraphAddHostNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t] | list[cudaGraphNode_t]], size_t numDependencies, pNodeParams : Optional[cudaHostNodeParams]):
     """ Creates a host execution node and adds it to a graph.
 
-    Creates a new CPU execution node and adds it to `graph` with
-    `numDependencies` dependencies specified via `pDependencies` and
-    arguments specified in `pNodeParams`. It is possible for
-    `numDependencies` to be 0, in which case the node will be placed at the
-    root of the graph. `pDependencies` may not have any duplicate entries.
-    A handle to the new node will be returned in `pGraphNode`.
+    Creates a new CPU execution node and adds it to ``graph`` with
+    ``numDependencies`` dependencies specified via ``pDependencies`` and
+    arguments specified in ``pNodeParams``. It is possible for
+    ``numDependencies`` to be 0, in which case the node will be placed at
+    the root of the graph. ``pDependencies`` may not have any duplicate
+    entries. A handle to the new node will be returned in ``pGraphNode``.
 
     When the graph is launched, the node will invoke the specified CPU
     function. Host nodes are not supported under MPS with pre-Volta GPUs.
@@ -34485,7 +34504,7 @@ def cudaGraphAddHostNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t]
 def cudaGraphHostNodeGetParams(node):
     """ Returns a host node's parameters.
 
-    Returns the parameters of host node `node` in `pNodeParams`.
+    Returns the parameters of host node ``node`` in ``pNodeParams``.
 
     Parameters
     ----------
@@ -34525,7 +34544,7 @@ def cudaGraphHostNodeGetParams(node):
 def cudaGraphHostNodeSetParams(node, pNodeParams : Optional[cudaHostNodeParams]):
     """ Sets a host node's parameters.
 
-    Sets the parameters of host node `node` to `nodeParams`.
+    Sets the parameters of host node ``node`` to ``pNodeParams``.
 
     Parameters
     ----------
@@ -34564,13 +34583,13 @@ def cudaGraphAddChildGraphNode(graph, pDependencies : Optional[tuple[cudaGraphNo
     """ Creates a child graph node and adds it to a graph.
 
     Creates a new node which executes an embedded graph, and adds it to
-    `graph` with `numDependencies` dependencies specified via
-    `pDependencies`. It is possible for `numDependencies` to be 0, in which
-    case the node will be placed at the root of the graph. `pDependencies`
-    may not have any duplicate entries. A handle to the new node will be
-    returned in `pGraphNode`.
+    ``graph`` with ``numDependencies`` dependencies specified via
+    ``pDependencies``. It is possible for ``numDependencies`` to be 0, in
+    which case the node will be placed at the root of the graph.
+    ``pDependencies`` may not have any duplicate entries. A handle to the
+    new node will be returned in ``pGraphNode``.
 
-    If `childGraph` contains allocation nodes, free nodes, or conditional
+    If ``childGraph`` contains allocation nodes, free nodes, or conditional
     nodes, this call will return an error.
 
     The node executes an embedded child graph. The child graph is cloned in
@@ -34689,17 +34708,17 @@ def cudaGraphChildGraphNodeGetGraph(node):
 def cudaGraphAddEmptyNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t] | list[cudaGraphNode_t]], size_t numDependencies):
     """ Creates an empty node and adds it to a graph.
 
-    Creates a new node which performs no operation, and adds it to `graph`
-    with `numDependencies` dependencies specified via `pDependencies`. It
-    is possible for `numDependencies` to be 0, in which case the node will
-    be placed at the root of the graph. `pDependencies` may not have any
-    duplicate entries. A handle to the new node will be returned in
-    `pGraphNode`.
+    Creates a new node which performs no operation, and adds it to
+    ``graph`` with ``numDependencies`` dependencies specified via
+    ``pDependencies``. It is possible for ``numDependencies`` to be 0, in
+    which case the node will be placed at the root of the graph.
+    ``pDependencies`` may not have any duplicate entries. A handle to the
+    new node will be returned in ``pGraphNode``.
 
     An empty node performs no operation during execution, but can be used
     for transitive ordering. For example, a phased execution graph with 2
     groups of n nodes with a barrier between them can be represented using
-    an empty node and 2*n dependency edges, rather than no empty node and
+    an empty node and 2\\*n dependency edges, rather than no empty node and
     n^2 dependency edges.
 
     Parameters
@@ -34760,14 +34779,14 @@ def cudaGraphAddEmptyNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t]
 def cudaGraphAddEventRecordNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t] | list[cudaGraphNode_t]], size_t numDependencies, event):
     """ Creates an event record node and adds it to a graph.
 
-    Creates a new event record node and adds it to `hGraph` with
-    `numDependencies` dependencies specified via `dependencies` and event
-    specified in `event`. It is possible for `numDependencies` to be 0, in
-    which case the node will be placed at the root of the graph.
-    `dependencies` may not have any duplicate entries. A handle to the new
-    node will be returned in `phGraphNode`.
+    Creates a new event record node and adds it to ``hGraph`` with
+    ``numDependencies`` dependencies specified via ``dependencies`` and
+    event specified in ``event``. It is possible for ``numDependencies`` to
+    be 0, in which case the node will be placed at the root of the graph.
+    ``dependencies`` may not have any duplicate entries. A handle to the
+    new node will be returned in ``phGraphNode``.
 
-    Each launch of the graph will record `event` to capture execution of
+    Each launch of the graph will record ``event`` to capture execution of
     the node's dependencies.
 
     These nodes may not be used in loops or conditionals.
@@ -34840,7 +34859,7 @@ def cudaGraphAddEventRecordNode(graph, pDependencies : Optional[tuple[cudaGraphN
 def cudaGraphEventRecordNodeGetEvent(node):
     """ Returns the event associated with an event record node.
 
-    Returns the event of event record node `hNode` in `event_out`.
+    Returns the event of event record node ``hNode`` in ``event_out``.
 
     Parameters
     ----------
@@ -34880,7 +34899,7 @@ def cudaGraphEventRecordNodeGetEvent(node):
 def cudaGraphEventRecordNodeSetEvent(node, event):
     """ Sets an event record node's event.
 
-    Sets the event of event record node `hNode` to `event`.
+    Sets the event of event record node ``hNode`` to ``event``.
 
     Parameters
     ----------
@@ -34925,18 +34944,18 @@ def cudaGraphEventRecordNodeSetEvent(node, event):
 def cudaGraphAddEventWaitNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t] | list[cudaGraphNode_t]], size_t numDependencies, event):
     """ Creates an event wait node and adds it to a graph.
 
-    Creates a new event wait node and adds it to `hGraph` with
-    `numDependencies` dependencies specified via `dependencies` and event
-    specified in `event`. It is possible for `numDependencies` to be 0, in
-    which case the node will be placed at the root of the graph.
-    `dependencies` may not have any duplicate entries. A handle to the new
-    node will be returned in `phGraphNode`.
+    Creates a new event wait node and adds it to ``hGraph`` with
+    ``numDependencies`` dependencies specified via ``dependencies`` and
+    event specified in ``event``. It is possible for ``numDependencies`` to
+    be 0, in which case the node will be placed at the root of the graph.
+    ``dependencies`` may not have any duplicate entries. A handle to the
+    new node will be returned in ``phGraphNode``.
 
-    The graph node will wait for all work captured in `event`. See
+    The graph node will wait for all work captured in ``event``. See
     :py:obj:`~.cuEventRecord()` for details on what is captured by an
     event. The synchronization will be performed efficiently on the device
-    when applicable. `event` may be from a different context or device than
-    the launch stream.
+    when applicable. ``event`` may be from a different context or device
+    than the launch stream.
 
     These nodes may not be used in loops or conditionals.
 
@@ -35008,7 +35027,7 @@ def cudaGraphAddEventWaitNode(graph, pDependencies : Optional[tuple[cudaGraphNod
 def cudaGraphEventWaitNodeGetEvent(node):
     """ Returns the event associated with an event wait node.
 
-    Returns the event of event wait node `hNode` in `event_out`.
+    Returns the event of event wait node ``hNode`` in ``event_out``.
 
     Parameters
     ----------
@@ -35048,7 +35067,7 @@ def cudaGraphEventWaitNodeGetEvent(node):
 def cudaGraphEventWaitNodeSetEvent(node, event):
     """ Sets an event wait node's event.
 
-    Sets the event of event wait node `hNode` to `event`.
+    Sets the event of event wait node ``hNode`` to ``event``.
 
     Parameters
     ----------
@@ -35093,12 +35112,12 @@ def cudaGraphEventWaitNodeSetEvent(node, event):
 def cudaGraphAddExternalSemaphoresSignalNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t] | list[cudaGraphNode_t]], size_t numDependencies, nodeParams : Optional[cudaExternalSemaphoreSignalNodeParams]):
     """ Creates an external semaphore signal node and adds it to a graph.
 
-    Creates a new external semaphore signal node and adds it to `graph`
-    with `numDependencies` dependencies specified via `dependencies` and
-    arguments specified in `nodeParams`. It is possible for
-    `numDependencies` to be 0, in which case the node will be placed at the
-    root of the graph. `dependencies` may not have any duplicate entries. A
-    handle to the new node will be returned in `pGraphNode`.
+    Creates a new external semaphore signal node and adds it to ``graph``
+    with ``numDependencies`` dependencies specified via ``dependencies``
+    and arguments specified in ``nodeParams``. It is possible for
+    ``numDependencies`` to be 0, in which case the node will be placed at
+    the root of the graph. ``dependencies`` may not have any duplicate
+    entries. A handle to the new node will be returned in ``pGraphNode``.
 
     Performs a signal operation on a set of externally allocated semaphore
     objects when the node is launched. The operation(s) will occur after
@@ -35165,9 +35184,9 @@ def cudaGraphAddExternalSemaphoresSignalNode(graph, pDependencies : Optional[tup
 def cudaGraphExternalSemaphoresSignalNodeGetParams(hNode):
     """ Returns an external semaphore signal node's parameters.
 
-    Returns the parameters of an external semaphore signal node `hNode` in
-    `params_out`. The `extSemArray` and `paramsArray` returned in
-    `params_out`, are owned by the node. This memory remains valid until
+    Returns the parameters of an external semaphore signal node ``hNode``
+    in ``params_out``. The ``extSemArray`` and ``paramsArray`` returned in
+    ``params_out``, are owned by the node. This memory remains valid until
     the node is destroyed or its parameters are modified, and should not be
     modified directly. Use
     :py:obj:`~.cudaGraphExternalSemaphoresSignalNodeSetParams` to update
@@ -35211,8 +35230,8 @@ def cudaGraphExternalSemaphoresSignalNodeGetParams(hNode):
 def cudaGraphExternalSemaphoresSignalNodeSetParams(hNode, nodeParams : Optional[cudaExternalSemaphoreSignalNodeParams]):
     """ Sets an external semaphore signal node's parameters.
 
-    Sets the parameters of an external semaphore signal node `hNode` to
-    `nodeParams`.
+    Sets the parameters of an external semaphore signal node ``hNode`` to
+    ``nodeParams``.
 
     Parameters
     ----------
@@ -35250,12 +35269,12 @@ def cudaGraphExternalSemaphoresSignalNodeSetParams(hNode, nodeParams : Optional[
 def cudaGraphAddExternalSemaphoresWaitNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t] | list[cudaGraphNode_t]], size_t numDependencies, nodeParams : Optional[cudaExternalSemaphoreWaitNodeParams]):
     """ Creates an external semaphore wait node and adds it to a graph.
 
-    Creates a new external semaphore wait node and adds it to `graph` with
-    `numDependencies` dependencies specified via `dependencies` and
-    arguments specified in `nodeParams`. It is possible for
-    `numDependencies` to be 0, in which case the node will be placed at the
-    root of the graph. `dependencies` may not have any duplicate entries. A
-    handle to the new node will be returned in `pGraphNode`.
+    Creates a new external semaphore wait node and adds it to ``graph``
+    with ``numDependencies`` dependencies specified via ``dependencies``
+    and arguments specified in ``nodeParams``. It is possible for
+    ``numDependencies`` to be 0, in which case the node will be placed at
+    the root of the graph. ``dependencies`` may not have any duplicate
+    entries. A handle to the new node will be returned in ``pGraphNode``.
 
     Performs a wait operation on a set of externally allocated semaphore
     objects when the node is launched. The node's dependencies will not be
@@ -35322,9 +35341,9 @@ def cudaGraphAddExternalSemaphoresWaitNode(graph, pDependencies : Optional[tuple
 def cudaGraphExternalSemaphoresWaitNodeGetParams(hNode):
     """ Returns an external semaphore wait node's parameters.
 
-    Returns the parameters of an external semaphore wait node `hNode` in
-    `params_out`. The `extSemArray` and `paramsArray` returned in
-    `params_out`, are owned by the node. This memory remains valid until
+    Returns the parameters of an external semaphore wait node ``hNode`` in
+    ``params_out``. The ``extSemArray`` and ``paramsArray`` returned in
+    ``params_out``, are owned by the node. This memory remains valid until
     the node is destroyed or its parameters are modified, and should not be
     modified directly. Use
     :py:obj:`~.cudaGraphExternalSemaphoresSignalNodeSetParams` to update
@@ -35368,8 +35387,8 @@ def cudaGraphExternalSemaphoresWaitNodeGetParams(hNode):
 def cudaGraphExternalSemaphoresWaitNodeSetParams(hNode, nodeParams : Optional[cudaExternalSemaphoreWaitNodeParams]):
     """ Sets an external semaphore wait node's parameters.
 
-    Sets the parameters of an external semaphore wait node `hNode` to
-    `nodeParams`.
+    Sets the parameters of an external semaphore wait node ``hNode`` to
+    ``nodeParams``.
 
     Parameters
     ----------
@@ -35407,15 +35426,15 @@ def cudaGraphExternalSemaphoresWaitNodeSetParams(hNode, nodeParams : Optional[cu
 def cudaGraphAddMemAllocNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t] | list[cudaGraphNode_t]], size_t numDependencies, nodeParams : Optional[cudaMemAllocNodeParams]):
     """ Creates an allocation node and adds it to a graph.
 
-    Creates a new allocation node and adds it to `graph` with
-    `numDependencies` dependencies specified via `pDependencies` and
-    arguments specified in `nodeParams`. It is possible for
-    `numDependencies` to be 0, in which case the node will be placed at the
-    root of the graph. `pDependencies` may not have any duplicate entries.
-    A handle to the new node will be returned in `pGraphNode`.
+    Creates a new allocation node and adds it to ``graph`` with
+    ``numDependencies`` dependencies specified via ``pDependencies`` and
+    arguments specified in ``nodeParams``. It is possible for
+    ``numDependencies`` to be 0, in which case the node will be placed at
+    the root of the graph. ``pDependencies`` may not have any duplicate
+    entries. A handle to the new node will be returned in ``pGraphNode``.
 
     When :py:obj:`~.cudaGraphAddMemAllocNode` creates an allocation node,
-    it returns the address of the allocation in `nodeParams.dptr`. The
+    it returns the address of the allocation in ``nodeParams.dptr``. The
     allocation's address remains fixed across instantiations and launches.
 
     If the allocation is freed in the same graph, by creating a free node
@@ -35518,10 +35537,10 @@ def cudaGraphAddMemAllocNode(graph, pDependencies : Optional[tuple[cudaGraphNode
 def cudaGraphMemAllocNodeGetParams(node):
     """ Returns a memory alloc node's parameters.
 
-    Returns the parameters of a memory alloc node `hNode` in `params_out`.
-    The `poolProps` and `accessDescs` returned in `params_out`, are owned
-    by the node. This memory remains valid until the node is destroyed. The
-    returned parameters must not be modified.
+    Returns the parameters of a memory alloc node ``hNode`` in
+    ``params_out``. The ``poolProps`` and ``accessDescs`` returned in
+    ``params_out``, are owned by the node. This memory remains valid until
+    the node is destroyed. The returned parameters must not be modified.
 
     Parameters
     ----------
@@ -35561,12 +35580,12 @@ def cudaGraphMemAllocNodeGetParams(node):
 def cudaGraphAddMemFreeNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t] | list[cudaGraphNode_t]], size_t numDependencies, dptr):
     """ Creates a memory free node and adds it to a graph.
 
-    Creates a new memory free node and adds it to `graph` with
-    `numDependencies` dependencies specified via `pDependencies` and
-    address specified in `dptr`. It is possible for `numDependencies` to be
-    0, in which case the node will be placed at the root of the graph.
-    `pDependencies` may not have any duplicate entries. A handle to the new
-    node will be returned in `pGraphNode`.
+    Creates a new memory free node and adds it to ``graph`` with
+    ``numDependencies`` dependencies specified via ``pDependencies`` and
+    address specified in ``dptr``. It is possible for ``numDependencies``
+    to be 0, in which case the node will be placed at the root of the
+    graph. ``pDependencies`` may not have any duplicate entries. A handle
+    to the new node will be returned in ``pGraphNode``.
 
     :py:obj:`~.cudaGraphAddMemFreeNode` will return
     :py:obj:`~.cudaErrorInvalidValue` if the user attempts to free:
@@ -35652,7 +35671,7 @@ def cudaGraphAddMemFreeNode(graph, pDependencies : Optional[tuple[cudaGraphNode_
 def cudaGraphMemFreeNodeGetParams(node):
     """ Returns a memory free node's parameters.
 
-    Returns the address of a memory free node `hNode` in `dptr_out`.
+    Returns the address of a memory free node ``hNode`` in ``dptr_out``.
 
     Parameters
     ----------
@@ -35815,8 +35834,8 @@ def cudaDeviceSetGraphMemAttribute(int device, attr not None : cudaGraphMemAttri
 def cudaGraphClone(originalGraph):
     """ Clones a graph.
 
-    This function creates a copy of `originalGraph` and returns it in
-    `pGraphClone`. All parameters are copied into the cloned graph. The
+    This function creates a copy of ``originalGraph`` and returns it in
+    ``pGraphClone``. All parameters are copied into the cloned graph. The
     original graph may be modified after this call without affecting the
     clone.
 
@@ -35865,14 +35884,15 @@ def cudaGraphClone(originalGraph):
 def cudaGraphNodeFindInClone(originalNode, clonedGraph):
     """ Finds a cloned version of a node.
 
-    This function returns the node in `clonedGraph` corresponding to
-    `originalNode` in the original graph.
+    This function returns the node in ``clonedGraph`` corresponding to
+    ``originalNode`` in the original graph.
 
-    `clonedGraph` must have been cloned from `originalGraph` via
-    :py:obj:`~.cudaGraphClone`. `originalNode` must have been in
-    `originalGraph` at the time of the call to :py:obj:`~.cudaGraphClone`,
-    and the corresponding cloned node in `clonedGraph` must not have been
-    removed. The cloned node is then returned via `pClonedNode`.
+    ``clonedGraph`` must have been cloned from ``originalGraph`` via
+    :py:obj:`~.cudaGraphClone`. ``originalNode`` must have been in
+    ``originalGraph`` at the time of the call to
+    :py:obj:`~.cudaGraphClone`, and the corresponding cloned node in
+    ``clonedGraph`` must not have been removed. The cloned node is then
+    returned via ``pClonedNode``.
 
     Parameters
     ----------
@@ -35922,7 +35942,7 @@ def cudaGraphNodeFindInClone(originalNode, clonedGraph):
 def cudaGraphNodeGetType(node):
     """ Returns a node's type.
 
-    Returns the node type of `node` in `pType`.
+    Returns the node type of ``node`` in ``pType``.
 
     Parameters
     ----------
@@ -35962,8 +35982,8 @@ def cudaGraphNodeGetType(node):
 def cudaGraphNodeGetContainingGraph(hNode):
     """ Returns the graph that contains a given graph node.
 
-    Returns the graph that contains `hNode` in `*phGraph`. If hNode is in a
-    child graph, the child graph it is in is returned.
+    Returns the graph that contains ``hNode`` in ``*phGraph``. If hNode is
+    in a child graph, the child graph it is in is returned.
 
     Parameters
     ----------
@@ -36003,9 +36023,9 @@ def cudaGraphNodeGetContainingGraph(hNode):
 def cudaGraphNodeGetLocalId(hNode):
     """ Returns the node id of a given graph node.
 
-    Returns the node id of `hNode` in `*nodeId`. The nodeId matches that
-    referenced by :py:obj:`~.cudaGraphDebugDotPrint`. The local nodeId and
-    graphId together can uniquely identify the node.
+    Returns the node id of ``hNode`` in ``*nodeId``. The nodeId matches
+    that referenced by :py:obj:`~.cudaGraphDebugDotPrint`. The local nodeId
+    and graphId together can uniquely identify the node.
 
     Parameters
     ----------
@@ -36054,7 +36074,7 @@ def cudaGraphNodeGetToolsId(hNode):
     -------
     cudaError_t
         :py:obj:`~.CUDA_SUCCESS` :py:obj:`~.cudaErrorInvalidValue`
-    *toolsNodeId : unsigned long long
+    \\*toolsNodeId : unsigned long long
         Pointer to return the id used by tools
 
     See Also
@@ -36083,7 +36103,7 @@ def cudaGraphNodeGetToolsId(hNode):
 def cudaGraphGetId(hGraph):
     """ Returns the id of a given graph.
 
-    Returns the id of `hGraph` in `*graphId`. The value in `*graphId`
+    Returns the id of ``hGraph`` in ``*graphId``. The value in ``*graphId``
     matches that referenced by :py:obj:`~.cudaGraphDebugDotPrint`.
 
     Parameters
@@ -36124,8 +36144,9 @@ def cudaGraphGetId(hGraph):
 def cudaGraphExecGetId(hGraphExec):
     """ Returns the id of a given graph exec.
 
-    Returns the id of `hGraphExec` in `*graphId`. The value in `*graphId`
-    matches that referenced by :py:obj:`~.cudaGraphDebugDotPrint`.
+    Returns the id of ``hGraphExec`` in ``*graphId``. The value in
+    ``*graphId`` matches that referenced by
+    :py:obj:`~.cudaGraphDebugDotPrint`.
 
     Parameters
     ----------
@@ -36165,12 +36186,12 @@ def cudaGraphExecGetId(hGraphExec):
 def cudaGraphGetNodes(graph, size_t numNodes = 0):
     """ Returns a graph's nodes.
 
-    Returns a list of `graph's` nodes. `nodes` may be NULL, in which case
-    this function will return the number of nodes in `numNodes`. Otherwise,
-    `numNodes` entries will be filled in. If `numNodes` is higher than the
-    actual number of nodes, the remaining entries in `nodes` will be set to
-    NULL, and the number of nodes actually obtained will be returned in
-    `numNodes`.
+    Returns a list of ``graph``'s nodes. ``nodes`` may be NULL, in which
+    case this function will return the number of nodes in ``numNodes``.
+    Otherwise, ``numNodes`` entries will be filled in. If ``numNodes`` is
+    higher than the actual number of nodes, the remaining entries in
+    ``nodes`` will be set to NULL, and the number of nodes actually
+    obtained will be returned in ``numNodes``.
 
     Parameters
     ----------
@@ -36226,12 +36247,13 @@ def cudaGraphGetNodes(graph, size_t numNodes = 0):
 def cudaGraphGetRootNodes(graph, size_t pNumRootNodes = 0):
     """ Returns a graph's root nodes.
 
-    Returns a list of `graph's` root nodes. `pRootNodes` may be NULL, in
-    which case this function will return the number of root nodes in
-    `pNumRootNodes`. Otherwise, `pNumRootNodes` entries will be filled in.
-    If `pNumRootNodes` is higher than the actual number of root nodes, the
-    remaining entries in `pRootNodes` will be set to NULL, and the number
-    of nodes actually obtained will be returned in `pNumRootNodes`.
+    Returns a list of ``graph``'s root nodes. ``pRootNodes`` may be NULL,
+    in which case this function will return the number of root nodes in
+    ``pNumRootNodes``. Otherwise, ``pNumRootNodes`` entries will be filled
+    in. If ``pNumRootNodes`` is higher than the actual number of root
+    nodes, the remaining entries in ``pRootNodes`` will be set to NULL, and
+    the number of nodes actually obtained will be returned in
+    ``pNumRootNodes``.
 
     Parameters
     ----------
@@ -36287,19 +36309,19 @@ def cudaGraphGetRootNodes(graph, size_t pNumRootNodes = 0):
 def cudaGraphGetEdges(graph, size_t numEdges = 0):
     """ Returns a graph's dependency edges.
 
-    Returns a list of `graph's` dependency edges. Edges are returned via
-    corresponding indices in `from`, `to` and `edgeData`; that is, the node
-    in `to`[i] has a dependency on the node in `from`[i] with data
-    `edgeData`[i]. `from` and `to` may both be NULL, in which case this
-    function only returns the number of edges in `numEdges`. Otherwise,
-    `numEdges` entries will be filled in. If `numEdges` is higher than the
-    actual number of edges, the remaining entries in `from` and `to` will
-    be set to NULL, and the number of edges actually returned will be
-    written to `numEdges`. `edgeData` may alone be NULL, in which case the
-    edges must all have default (zeroed) edge data. Attempting a losst
-    query via NULL `edgeData` will result in
-    :py:obj:`~.cudaErrorLossyQuery`. If `edgeData` is non-NULL then `from`
-    and `to` must be as well.
+    Returns a list of ``graph's`` dependency edges. Edges are returned via
+    corresponding indices in ``from``, ``to`` and ``edgeData``; that is,
+    the node in ``to[i]`` has a dependency on the node in ``from[i]`` with
+    data ``edgeData[i]``. ``from`` and ``to`` may both be NULL, in which
+    case this function only returns the number of edges in ``numEdges``.
+    Otherwise, ``numEdges`` entries will be filled in. If ``numEdges`` is
+    higher than the actual number of edges, the remaining entries in
+    ``from`` and ``to`` will be set to NULL, and the number of edges
+    actually returned will be written to ``numEdges``. ``edgeData`` may
+    alone be NULL, in which case the edges must all have default (zeroed)
+    edge data. Attempting a lossy query via NULL ``edgeData`` will result
+    in :py:obj:`~.cudaErrorLossyQuery`. If ``edgeData`` is non-NULL then
+    ``from`` and ``to`` must be as well.
 
     Parameters
     ----------
@@ -36383,18 +36405,18 @@ def cudaGraphGetEdges(graph, size_t numEdges = 0):
 def cudaGraphNodeGetDependencies(node, size_t pNumDependencies = 0):
     """ Returns a node's dependencies.
 
-    Returns a list of `node's` dependencies. `pDependencies` may be NULL,
-    in which case this function will return the number of dependencies in
-    `pNumDependencies`. Otherwise, `pNumDependencies` entries will be
-    filled in. If `pNumDependencies` is higher than the actual number of
-    dependencies, the remaining entries in `pDependencies` will be set to
-    NULL, and the number of nodes actually obtained will be returned in
-    `pNumDependencies`.
+    Returns a list of ``node's`` dependencies. ``pDependencies`` may be
+    NULL, in which case this function will return the number of
+    dependencies in ``pNumDependencies``. Otherwise, ``pNumDependencies``
+    entries will be filled in. If ``pNumDependencies`` is higher than the
+    actual number of dependencies, the remaining entries in
+    ``pDependencies`` will be set to NULL, and the number of nodes actually
+    obtained will be returned in ``pNumDependencies``.
 
     Note that if an edge has non-zero (non-default) edge data and
-    `edgeData` is NULL, this API will return
-    :py:obj:`~.cudaErrorLossyQuery`. If `edgeData` is non-NULL, then
-    `pDependencies` must be as well.
+    ``edgeData`` is NULL, this API will return
+    :py:obj:`~.cudaErrorLossyQuery`. If ``edgeData`` is non-NULL, then
+    ``pDependencies`` must be as well.
 
     Parameters
     ----------
@@ -36464,18 +36486,19 @@ def cudaGraphNodeGetDependencies(node, size_t pNumDependencies = 0):
 def cudaGraphNodeGetDependentNodes(node, size_t pNumDependentNodes = 0):
     """ Returns a node's dependent nodes.
 
-    Returns a list of `node's` dependent nodes. `pDependentNodes` may be
-    NULL, in which case this function will return the number of dependent
-    nodes in `pNumDependentNodes`. Otherwise, `pNumDependentNodes` entries
-    will be filled in. If `pNumDependentNodes` is higher than the actual
-    number of dependent nodes, the remaining entries in `pDependentNodes`
-    will be set to NULL, and the number of nodes actually obtained will be
-    returned in `pNumDependentNodes`.
+    Returns a list of ``node's`` dependent nodes. ``pDependentNodes`` may
+    be NULL, in which case this function will return the number of
+    dependent nodes in ``pNumDependentNodes``. Otherwise,
+    ``pNumDependentNodes`` entries will be filled in. If
+    ``pNumDependentNodes`` is higher than the actual number of dependent
+    nodes, the remaining entries in ``pDependentNodes`` will be set to
+    NULL, and the number of nodes actually obtained will be returned in
+    ``pNumDependentNodes``.
 
     Note that if an edge has non-zero (non-default) edge data and
-    `edgeData` is NULL, this API will return
-    :py:obj:`~.cudaErrorLossyQuery`. If `edgeData` is non-NULL, then
-    `pDependentNodes` must be as well.
+    ``edgeData`` is NULL, this API will return
+    :py:obj:`~.cudaErrorLossyQuery`. If ``edgeData`` is non-NULL, then
+    ``pDependentNodes`` must be as well.
 
     Parameters
     ----------
@@ -36545,11 +36568,12 @@ def cudaGraphNodeGetDependentNodes(node, size_t pNumDependentNodes = 0):
 def cudaGraphAddDependencies(graph, from_ : Optional[tuple[cudaGraphNode_t] | list[cudaGraphNode_t]], to : Optional[tuple[cudaGraphNode_t] | list[cudaGraphNode_t]], edgeData : Optional[tuple[cudaGraphEdgeData] | list[cudaGraphEdgeData]], size_t numDependencies):
     """ Adds dependency edges to a graph.
 
-    The number of dependencies to be added is defined by `numDependencies`
-    Elements in `pFrom` and `pTo` at corresponding indices define a
-    dependency. Each node in `pFrom` and `pTo` must belong to `graph`.
+    The number of dependencies to be added is defined by
+    ``numDependencies``. Elements in ``pFrom`` and ``pTo`` at corresponding
+    indices define a dependency. Each node in ``pFrom`` and ``pTo`` must
+    belong to ``graph``.
 
-    If `numDependencies` is 0, elements in `pFrom` and `pTo` will be
+    If ``numDependencies`` is 0, elements in ``pFrom`` and ``pTo`` will be
     ignored. Specifying an existing dependency will return an error.
 
     Parameters
@@ -36638,15 +36662,15 @@ def cudaGraphAddDependencies(graph, from_ : Optional[tuple[cudaGraphNode_t] | li
 def cudaGraphRemoveDependencies(graph, from_ : Optional[tuple[cudaGraphNode_t] | list[cudaGraphNode_t]], to : Optional[tuple[cudaGraphNode_t] | list[cudaGraphNode_t]], edgeData : Optional[tuple[cudaGraphEdgeData] | list[cudaGraphEdgeData]], size_t numDependencies):
     """ Removes dependency edges from a graph.
 
-    The number of `pDependencies` to be removed is defined by
-    `numDependencies`. Elements in `pFrom` and `pTo` at corresponding
-    indices define a dependency. Each node in `pFrom` and `pTo` must belong
-    to `graph`.
+    The number of ``pDependencies`` to be removed is defined by
+    ``numDependencies``. Elements in ``pFrom`` and ``pTo`` at corresponding
+    indices define a dependency. Each node in ``pFrom`` and ``pTo`` must
+    belong to ``graph``.
 
-    If `numDependencies` is 0, elements in `pFrom` and `pTo` will be
+    If ``numDependencies`` is 0, elements in ``pFrom`` and ``pTo`` will be
     ignored. Specifying an edge that does not exist in the graph, with data
-    matching `edgeData`, results in an error. `edgeData` is nullable, which
-    is equivalent to passing default (zeroed) data for each edge.
+    matching ``edgeData``, results in an error. ``edgeData`` is nullable,
+    which is equivalent to passing default (zeroed) data for each edge.
 
     Parameters
     ----------
@@ -36734,8 +36758,8 @@ def cudaGraphRemoveDependencies(graph, from_ : Optional[tuple[cudaGraphNode_t] |
 def cudaGraphDestroyNode(node):
     """ Remove a node from the graph.
 
-    Removes `node` from its graph. This operation also severs any
-    dependencies of other nodes on `node` and vice versa.
+    Removes ``node`` from its graph. This operation also severs any
+    dependencies of other nodes on ``node`` and vice versa.
 
     Dependencies cannot be removed from graphs which contain allocation or
     free nodes. Any attempt to do so will return an error.
@@ -36773,12 +36797,12 @@ def cudaGraphDestroyNode(node):
 def cudaGraphInstantiate(graph, unsigned long long flags):
     """ Creates an executable graph from a graph.
 
-    Instantiates `graph` as an executable graph. The graph is validated for
-    any structural constraints or intra-node constraints which were not
+    Instantiates ``graph`` as an executable graph. The graph is validated
+    for any structural constraints or intra-node constraints which were not
     previously validated. If instantiation is successful, a handle to the
-    instantiated graph is returned in `pGraphExec`.
+    instantiated graph is returned in ``pGraphExec``.
 
-    The `flags` parameter controls the behavior of instantiation and
+    The ``flags`` parameter controls the behavior of instantiation and
     subsequent graph launches. Valid flags are:
 
     - :py:obj:`~.cudaGraphInstantiateFlagAutoFreeOnLaunch`, which
@@ -36798,11 +36822,12 @@ def cudaGraphInstantiate(graph, unsigned long long flags):
       priorities are only available on kernel nodes, and are copied from
       stream priority during stream capture.
 
-    If `graph` contains any allocation or free nodes, there can be at most
-    one executable graph in existence for that graph at a time. An attempt
-    to instantiate a second executable graph before destroying the first
-    with :py:obj:`~.cudaGraphExecDestroy` will result in an error. The same
-    also applies if `graph` contains any device-updatable kernel nodes.
+    If ``graph`` contains any allocation or free nodes, there can be at
+    most one executable graph in existence for that graph at a time. An
+    attempt to instantiate a second executable graph before destroying the
+    first with :py:obj:`~.cudaGraphExecDestroy` will result in an error.
+    The same also applies if ``graph`` contains any device-updatable kernel
+    nodes.
 
     Graphs instantiated for launch on the device have additional
     restrictions which do not apply to host graphs:
@@ -36832,7 +36857,7 @@ def cudaGraphInstantiate(graph, unsigned long long flags):
       - Both operands must be accessible from the current device, and the
         current device must match the device of other nodes in the graph.
 
-    If `graph` is not instantiated for launch on the device but contains
+    If ``graph`` is not instantiated for launch on the device but contains
     kernels which call device-side :py:obj:`~.cudaGraphLaunch()` from
     multiple devices, this will result in an error.
 
@@ -36877,12 +36902,12 @@ def cudaGraphInstantiate(graph, unsigned long long flags):
 def cudaGraphInstantiateWithFlags(graph, unsigned long long flags):
     """ Creates an executable graph from a graph.
 
-    Instantiates `graph` as an executable graph. The graph is validated for
-    any structural constraints or intra-node constraints which were not
+    Instantiates ``graph`` as an executable graph. The graph is validated
+    for any structural constraints or intra-node constraints which were not
     previously validated. If instantiation is successful, a handle to the
-    instantiated graph is returned in `pGraphExec`.
+    instantiated graph is returned in ``pGraphExec``.
 
-    The `flags` parameter controls the behavior of instantiation and
+    The ``flags`` parameter controls the behavior of instantiation and
     subsequent graph launches. Valid flags are:
 
     - :py:obj:`~.cudaGraphInstantiateFlagAutoFreeOnLaunch`, which
@@ -36904,13 +36929,14 @@ def cudaGraphInstantiateWithFlags(graph, unsigned long long flags):
       priorities are only available on kernel nodes, and are copied from
       stream priority during stream capture.
 
-    If `graph` contains any allocation or free nodes, there can be at most
-    one executable graph in existence for that graph at a time. An attempt
-    to instantiate a second executable graph before destroying the first
-    with :py:obj:`~.cudaGraphExecDestroy` will result in an error. The same
-    also applies if `graph` contains any device-updatable kernel nodes.
+    If ``graph`` contains any allocation or free nodes, there can be at
+    most one executable graph in existence for that graph at a time. An
+    attempt to instantiate a second executable graph before destroying the
+    first with :py:obj:`~.cudaGraphExecDestroy` will result in an error.
+    The same also applies if ``graph`` contains any device-updatable kernel
+    nodes.
 
-    If `graph` contains kernels which call device-side
+    If ``graph`` contains kernels which call device-side
     :py:obj:`~.cudaGraphLaunch()` from multiple devices, this will result
     in an error.
 
@@ -36983,21 +37009,21 @@ def cudaGraphInstantiateWithFlags(graph, unsigned long long flags):
 def cudaGraphInstantiateWithParams(graph, instantiateParams : Optional[cudaGraphInstantiateParams]):
     """ Creates an executable graph from a graph.
 
-    Instantiates `graph` as an executable graph according to the
-    `instantiateParams` structure. The graph is validated for any
+    Instantiates ``graph`` as an executable graph according to the
+    ``instantiateParams`` structure. The graph is validated for any
     structural constraints or intra-node constraints which were not
     previously validated. If instantiation is successful, a handle to the
-    instantiated graph is returned in `pGraphExec`.
+    instantiated graph is returned in ``pGraphExec``.
 
-    `instantiateParams` controls the behavior of instantiation and
+    ``instantiateParams`` controls the behavior of instantiation and
     subsequent graph launches, as well as returning more detailed
     information in the event of an error.
     :py:obj:`~.cudaGraphInstantiateParams` is defined as:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
-    The `flags` field controls the behavior of instantiation and subsequent
-    graph launches. Valid flags are:
+    The ``flags`` field controls the behavior of instantiation and
+    subsequent graph launches. Valid flags are:
 
     - :py:obj:`~.cudaGraphInstantiateFlagAutoFreeOnLaunch`, which
       configures a graph containing memory allocation nodes to
@@ -37005,7 +37031,7 @@ def cudaGraphInstantiateWithParams(graph, instantiateParams : Optional[cudaGraph
       relaunched.
 
     - :py:obj:`~.cudaGraphInstantiateFlagUpload`, which will perform an
-      upload of the graph into `uploadStream` once the graph has been
+      upload of the graph into ``uploadStream`` once the graph has been
       instantiated.
 
     - :py:obj:`~.cudaGraphInstantiateFlagDeviceLaunch`, which configures
@@ -37022,13 +37048,14 @@ def cudaGraphInstantiateWithParams(graph, instantiateParams : Optional[cudaGraph
       priorities are only available on kernel nodes, and are copied from
       stream priority during stream capture.
 
-    If `graph` contains any allocation or free nodes, there can be at most
-    one executable graph in existence for that graph at a time. An attempt
-    to instantiate a second executable graph before destroying the first
-    with :py:obj:`~.cudaGraphExecDestroy` will result in an error. The same
-    also applies if `graph` contains any device-updatable kernel nodes.
+    If ``graph`` contains any allocation or free nodes, there can be at
+    most one executable graph in existence for that graph at a time. An
+    attempt to instantiate a second executable graph before destroying the
+    first with :py:obj:`~.cudaGraphExecDestroy` will result in an error.
+    The same also applies if ``graph`` contains any device-updatable kernel
+    nodes.
 
-    If `graph` contains kernels which call device-side
+    If ``graph`` contains kernels which call device-side
     :py:obj:`~.cudaGraphLaunch()` from multiple devices, this will result
     in an error.
 
@@ -37060,34 +37087,34 @@ def cudaGraphInstantiateWithParams(graph, instantiateParams : Optional[cudaGraph
       - Both operands must be accessible from the current device, and the
         current device must match the device of other nodes in the graph.
 
-    In the event of an error, the `result_out` and `errNode_out` fields
+    In the event of an error, the ``result_out`` and ``errNode_out`` fields
     will contain more information about the nature of the error. Possible
     error reporting includes:
 
     - :py:obj:`~.cudaGraphInstantiateError`, if passed an invalid value or
       if an unexpected error occurred which is described by the return
-      value of the function. `errNode_out` will be set to NULL.
+      value of the function. ``errNode_out`` will be set to NULL.
 
     - :py:obj:`~.cudaGraphInstantiateInvalidStructure`, if the graph
-      structure is invalid. `errNode_out` will be set to one of the
+      structure is invalid. ``errNode_out`` will be set to one of the
       offending nodes.
 
     - :py:obj:`~.cudaGraphInstantiateNodeOperationNotSupported`, if the
       graph is instantiated for device launch but contains a node of an
       unsupported node type, or a node which performs unsupported
       operations, such as use of CUDA dynamic parallelism within a kernel
-      node. `errNode_out` will be set to this node.
+      node. ``errNode_out`` will be set to this node.
 
     - :py:obj:`~.cudaGraphInstantiateMultipleDevicesNotSupported`, if the
       graph is instantiated for device launch but a node’s device differs
       from that of another node. This error can also be returned if a graph
       is not instantiated for device launch and it contains kernels which
       call device-side :py:obj:`~.cudaGraphLaunch()` from multiple devices.
-      `errNode_out` will be set to this node.
+      ``errNode_out`` will be set to this node.
 
-    If instantiation is successful, `result_out` will be set to
-    :py:obj:`~.cudaGraphInstantiateSuccess`, and `hErrNode_out` will be set
-    to NULL.
+    If instantiation is successful, ``result_out`` will be set to
+    :py:obj:`~.cudaGraphInstantiateSuccess`, and ``errNode_out`` will be
+    set to NULL.
 
     Parameters
     ----------
@@ -37174,13 +37201,13 @@ def cudaGraphExecKernelNodeSetParams(hGraphExec, node, pNodeParams : Optional[cu
     """ Sets the parameters for a kernel node in the given graphExec.
 
     Sets the parameters of a kernel node in an executable graph
-    `hGraphExec`. The node is identified by the corresponding node `node`
-    in the non-executable graph, from which the executable graph was
-    instantiated.
+    ``hGraphExec``. The node is identified by the corresponding node
+    ``node`` in the non-executable graph, from which the executable graph
+    was instantiated.
 
-    `node` must not have been removed from the original graph. All
-    `nodeParams` fields may change, but the following restrictions apply to
-    `func` updates:
+    ``node`` must not have been removed from the original graph. All
+    ``nodeParams`` fields may change, but the following restrictions apply
+    to ``func`` updates:
 
     - The owning device of the function cannot change.
 
@@ -37191,20 +37218,20 @@ def cudaGraphExecKernelNodeSetParams(hGraphExec, node, pNodeParams : Optional[cu
       calls cannot be updated to a function which makes device-side update
       calls.
 
-    - If `hGraphExec` was not instantiated for device launch, a node whose
-      function originally did not use device-side
+    - If ``hGraphExec`` was not instantiated for device launch, a node
+      whose function originally did not use device-side
       :py:obj:`~.cudaGraphLaunch()` cannot be updated to a function which
       uses device-side :py:obj:`~.cudaGraphLaunch()` unless the node
       resides on the same device as nodes which contained such calls at
       instantiate-time. If no such calls were present at instantiation,
       these updates cannot be performed at all.
 
-    The modifications only affect future launches of `hGraphExec`. Already
-    enqueued or running launches of `hGraphExec` are not affected by this
-    call. `node` is also not modified by this call.
+    The modifications only affect future launches of ``hGraphExec``.
+    Already enqueued or running launches of ``hGraphExec`` are not affected
+    by this call. ``node`` is also not modified by this call.
 
-    If `node` is a device-updatable kernel node, the next upload/launch of
-    `hGraphExec` will overwrite any previous device-side updates.
+    If ``node`` is a device-updatable kernel node, the next upload/launch
+    of ``hGraphExec`` will overwrite any previous device-side updates.
     Additionally, applying host updates to a device-updatable kernel node
     while it is being updated from the device will result in undefined
     behavior.
@@ -37255,20 +37282,20 @@ def cudaGraphExecKernelNodeSetParams(hGraphExec, node, pNodeParams : Optional[cu
 def cudaGraphExecMemcpyNodeSetParams(hGraphExec, node, pNodeParams : Optional[cudaMemcpy3DParms]):
     """ Sets the parameters for a memcpy node in the given graphExec.
 
-    Updates the work represented by `node` in `hGraphExec` as though `node`
-    had contained `pNodeParams` at instantiation. `node` must remain in the
-    graph which was used to instantiate `hGraphExec`. Changed edges to and
-    from `node` are ignored.
+    Updates the work represented by ``node`` in ``hGraphExec`` as though
+    ``node`` had contained ``pNodeParams`` at instantiation. ``node`` must
+    remain in the graph which was used to instantiate ``hGraphExec``.
+    Changed edges to and from ``node`` are ignored.
 
-    The source and destination memory in `pNodeParams` must be allocated
+    The source and destination memory in ``pNodeParams`` must be allocated
     from the same contexts as the original source and destination memory.
     Both the instantiation-time memory operands and the memory operands in
-    `pNodeParams` must be 1-dimensional. Zero-length operations are not
+    ``pNodeParams`` must be 1-dimensional. Zero-length operations are not
     supported.
 
-    The modifications only affect future launches of `hGraphExec`. Already
-    enqueued or running launches of `hGraphExec` are not affected by this
-    call. `node` is also not modified by this call.
+    The modifications only affect future launches of ``hGraphExec``.
+    Already enqueued or running launches of ``hGraphExec`` are not affected
+    by this call. ``node`` is also not modified by this call.
 
     Returns :py:obj:`~.cudaErrorInvalidValue` if the memory operands'
     mappings changed or either the original or new memory operands are
@@ -37320,19 +37347,19 @@ def cudaGraphExecMemcpyNodeSetParams(hGraphExec, node, pNodeParams : Optional[cu
 def cudaGraphExecMemcpyNodeSetParams1D(hGraphExec, node, dst, src, size_t count, kind not None : cudaMemcpyKind):
     """ Sets the parameters for a memcpy node in the given graphExec to perform a 1-dimensional copy.
 
-    Updates the work represented by `node` in `hGraphExec` as though `node`
-    had contained the given params at instantiation. `node` must remain in
-    the graph which was used to instantiate `hGraphExec`. Changed edges to
-    and from `node` are ignored.
+    Updates the work represented by ``node`` in ``hGraphExec`` as though
+    ``node`` had contained the given params at instantiation. ``node`` must
+    remain in the graph which was used to instantiate ``hGraphExec``.
+    Changed edges to and from ``node`` are ignored.
 
-    `src` and `dst` must be allocated from the same contexts as the
+    ``src`` and ``dst`` must be allocated from the same contexts as the
     original source and destination memory. The instantiation-time memory
     operands must be 1-dimensional. Zero-length operations are not
     supported.
 
-    The modifications only affect future launches of `hGraphExec`. Already
-    enqueued or running launches of `hGraphExec` are not affected by this
-    call. `node` is also not modified by this call.
+    The modifications only affect future launches of ``hGraphExec``.
+    Already enqueued or running launches of ``hGraphExec`` are not affected
+    by this call. ``node`` is also not modified by this call.
 
     Returns :py:obj:`~.cudaErrorInvalidValue` if the memory operands'
     mappings changed or the original memory operands are multidimensional.
@@ -37395,14 +37422,14 @@ def cudaGraphExecMemcpyNodeSetParams1D(hGraphExec, node, dst, src, size_t count,
 def cudaGraphExecMemsetNodeSetParams(hGraphExec, node, pNodeParams : Optional[cudaMemsetParams]):
     """ Sets the parameters for a memset node in the given graphExec.
 
-    Updates the work represented by `node` in `hGraphExec` as though `node`
-    had contained `pNodeParams` at instantiation. `node` must remain in the
-    graph which was used to instantiate `hGraphExec`. Changed edges to and
-    from `node` are ignored.
+    Updates the work represented by ``node`` in ``hGraphExec`` as though
+    ``node`` had contained ``pNodeParams`` at instantiation. ``node`` must
+    remain in the graph which was used to instantiate ``hGraphExec``.
+    Changed edges to and from ``node`` are ignored.
 
     Zero sized operations are not supported.
 
-    The new destination pointer in `pNodeParams` must be to the same kind
+    The new destination pointer in ``pNodeParams`` must be to the same kind
     of allocation as the original destination pointer and have the same
     context association and device mapping as the original destination
     pointer.
@@ -37415,9 +37442,9 @@ def cudaGraphExecMemsetNodeSetParams(hGraphExec, node, pNodeParams : Optional[cu
     resulting work maps onto the work resources already allocated for the
     node.
 
-    The modifications only affect future launches of `hGraphExec`. Already
-    enqueued or running launches of `hGraphExec` are not affected by this
-    call. `node` is also not modified by this call.
+    The modifications only affect future launches of ``hGraphExec``.
+    Already enqueued or running launches of ``hGraphExec`` are not affected
+    by this call. ``node`` is also not modified by this call.
 
     Parameters
     ----------
@@ -37465,14 +37492,14 @@ def cudaGraphExecMemsetNodeSetParams(hGraphExec, node, pNodeParams : Optional[cu
 def cudaGraphExecHostNodeSetParams(hGraphExec, node, pNodeParams : Optional[cudaHostNodeParams]):
     """ Sets the parameters for a host node in the given graphExec.
 
-    Updates the work represented by `node` in `hGraphExec` as though `node`
-    had contained `pNodeParams` at instantiation. `node` must remain in the
-    graph which was used to instantiate `hGraphExec`. Changed edges to and
-    from `node` are ignored.
+    Updates the work represented by ``node`` in ``hGraphExec`` as though
+    ``node`` had contained ``pNodeParams`` at instantiation. ``node`` must
+    remain in the graph which was used to instantiate ``hGraphExec``.
+    Changed edges to and from ``node`` are ignored.
 
-    The modifications only affect future launches of `hGraphExec`. Already
-    enqueued or running launches of `hGraphExec` are not affected by this
-    call. `node` is also not modified by this call.
+    The modifications only affect future launches of ``hGraphExec``.
+    Already enqueued or running launches of ``hGraphExec`` are not affected
+    by this call. ``node`` is also not modified by this call.
 
     Parameters
     ----------
@@ -37520,18 +37547,18 @@ def cudaGraphExecHostNodeSetParams(hGraphExec, node, pNodeParams : Optional[cuda
 def cudaGraphExecChildGraphNodeSetParams(hGraphExec, node, childGraph):
     """ Updates node parameters in the child graph node in the given graphExec.
 
-    Updates the work represented by `node` in `hGraphExec` as though the
-    nodes contained in `node's` graph had the parameters contained in
-    `childGraph's` nodes at instantiation. `node` must remain in the graph
-    which was used to instantiate `hGraphExec`. Changed edges to and from
-    `node` are ignored.
+    Updates the work represented by ``node`` in ``hGraphExec`` as though
+    the nodes contained in ``node's`` graph had the parameters contained in
+    ``childGraph's`` nodes at instantiation. ``node`` must remain in the
+    graph which was used to instantiate ``hGraphExec``. Changed edges to
+    and from ``node`` are ignored.
 
-    The modifications only affect future launches of `hGraphExec`. Already
-    enqueued or running launches of `hGraphExec` are not affected by this
-    call. `node` is also not modified by this call.
+    The modifications only affect future launches of ``hGraphExec``.
+    Already enqueued or running launches of ``hGraphExec`` are not affected
+    by this call. ``node`` is also not modified by this call.
 
-    The topology of `childGraph`, as well as the node insertion order, must
-    match that of the graph contained in `node`. See
+    The topology of ``childGraph``, as well as the node insertion order,
+    must match that of the graph contained in ``node``. See
     :py:obj:`~.cudaGraphExecUpdate()` for a list of restrictions on what
     can be updated in an instantiated graph. The update is recursive, so
     child graph nodes contained within the top level child graph will also
@@ -37591,13 +37618,13 @@ def cudaGraphExecEventRecordNodeSetEvent(hGraphExec, hNode, event):
     """ Sets the event for an event record node in the given graphExec.
 
     Sets the event of an event record node in an executable graph
-    `hGraphExec`. The node is identified by the corresponding node `hNode`
-    in the non-executable graph, from which the executable graph was
-    instantiated.
+    ``hGraphExec``. The node is identified by the corresponding node
+    ``hNode`` in the non-executable graph, from which the executable graph
+    was instantiated.
 
-    The modifications only affect future launches of `hGraphExec`. Already
-    enqueued or running launches of `hGraphExec` are not affected by this
-    call. `hNode` is also not modified by this call.
+    The modifications only affect future launches of ``hGraphExec``.
+    Already enqueued or running launches of ``hGraphExec`` are not affected
+    by this call. ``hNode`` is also not modified by this call.
 
     Parameters
     ----------
@@ -37654,13 +37681,13 @@ def cudaGraphExecEventWaitNodeSetEvent(hGraphExec, hNode, event):
     """ Sets the event for an event wait node in the given graphExec.
 
     Sets the event of an event wait node in an executable graph
-    `hGraphExec`. The node is identified by the corresponding node `hNode`
-    in the non-executable graph, from which the executable graph was
-    instantiated.
+    ``hGraphExec``. The node is identified by the corresponding node
+    ``hNode`` in the non-executable graph, from which the executable graph
+    was instantiated.
 
-    The modifications only affect future launches of `hGraphExec`. Already
-    enqueued or running launches of `hGraphExec` are not affected by this
-    call. `hNode` is also not modified by this call.
+    The modifications only affect future launches of ``hGraphExec``.
+    Already enqueued or running launches of ``hGraphExec`` are not affected
+    by this call. ``hNode`` is also not modified by this call.
 
     Parameters
     ----------
@@ -37717,17 +37744,17 @@ def cudaGraphExecExternalSemaphoresSignalNodeSetParams(hGraphExec, hNode, nodePa
     """ Sets the parameters for an external semaphore signal node in the given graphExec.
 
     Sets the parameters of an external semaphore signal node in an
-    executable graph `hGraphExec`. The node is identified by the
-    corresponding node `hNode` in the non-executable graph, from which the
-    executable graph was instantiated.
+    executable graph ``hGraphExec``. The node is identified by the
+    corresponding node ``hNode`` in the non-executable graph, from which
+    the executable graph was instantiated.
 
-    `hNode` must not have been removed from the original graph.
+    ``hNode`` must not have been removed from the original graph.
 
-    The modifications only affect future launches of `hGraphExec`. Already
-    enqueued or running launches of `hGraphExec` are not affected by this
-    call. `hNode` is also not modified by this call.
+    The modifications only affect future launches of ``hGraphExec``.
+    Already enqueued or running launches of ``hGraphExec`` are not affected
+    by this call. ``hNode`` is also not modified by this call.
 
-    Changing `nodeParams->numExtSems` is not supported.
+    Changing ``nodeParams->numExtSems`` is not supported.
 
     Parameters
     ----------
@@ -37777,17 +37804,17 @@ def cudaGraphExecExternalSemaphoresWaitNodeSetParams(hGraphExec, hNode, nodePara
     """ Sets the parameters for an external semaphore wait node in the given graphExec.
 
     Sets the parameters of an external semaphore wait node in an executable
-    graph `hGraphExec`. The node is identified by the corresponding node
-    `hNode` in the non-executable graph, from which the executable graph
+    graph ``hGraphExec``. The node is identified by the corresponding node
+    ``hNode`` in the non-executable graph, from which the executable graph
     was instantiated.
 
-    `hNode` must not have been removed from the original graph.
+    ``hNode`` must not have been removed from the original graph.
 
-    The modifications only affect future launches of `hGraphExec`. Already
-    enqueued or running launches of `hGraphExec` are not affected by this
-    call. `hNode` is also not modified by this call.
+    The modifications only affect future launches of ``hGraphExec``.
+    Already enqueued or running launches of ``hGraphExec`` are not affected
+    by this call. ``hNode`` is also not modified by this call.
 
-    Changing `nodeParams->numExtSems` is not supported.
+    Changing ``nodeParams->numExtSems`` is not supported.
 
     Parameters
     ----------
@@ -37836,19 +37863,19 @@ def cudaGraphExecExternalSemaphoresWaitNodeSetParams(hGraphExec, hNode, nodePara
 def cudaGraphNodeSetEnabled(hGraphExec, hNode, unsigned int isEnabled):
     """ Enables or disables the specified node in the given graphExec.
 
-    Sets `hNode` to be either enabled or disabled. Disabled nodes are
+    Sets ``hNode`` to be either enabled or disabled. Disabled nodes are
     functionally equivalent to empty nodes until they are reenabled.
     Existing node parameters are not affected by disabling/enabling the
     node.
 
-    The node is identified by the corresponding node `hNode` in the non-
+    The node is identified by the corresponding node ``hNode`` in the non-
     executable graph, from which the executable graph was instantiated.
 
-    `hNode` must not have been removed from the original graph.
+    ``hNode`` must not have been removed from the original graph.
 
-    The modifications only affect future launches of `hGraphExec`. Already
-    enqueued or running launches of `hGraphExec` are not affected by this
-    call. `hNode` is also not modified by this call.
+    The modifications only affect future launches of ``hGraphExec``.
+    Already enqueued or running launches of ``hGraphExec`` are not affected
+    by this call. ``hNode`` is also not modified by this call.
 
     Parameters
     ----------
@@ -37899,12 +37926,13 @@ def cudaGraphNodeSetEnabled(hGraphExec, hNode, unsigned int isEnabled):
 def cudaGraphNodeGetEnabled(hGraphExec, hNode):
     """ Query whether a node in the given graphExec is enabled.
 
-    Sets isEnabled to 1 if `hNode` is enabled, or 0 if `hNode` is disabled.
+    Sets isEnabled to 1 if ``hNode`` is enabled, or 0 if ``hNode`` is
+    disabled.
 
-    The node is identified by the corresponding node `hNode` in the non-
+    The node is identified by the corresponding node ``hNode`` in the non-
     executable graph, from which the executable graph was instantiated.
 
-    `hNode` must not have been removed from the original graph.
+    ``hNode`` must not have been removed from the original graph.
 
     Parameters
     ----------
@@ -37959,8 +37987,8 @@ def cudaGraphExecUpdate(hGraphExec, hGraph):
     """ Check whether an executable graph can be updated with a graph and perform the update if possible.
 
     Updates the node parameters in the instantiated graph specified by
-    `hGraphExec` with the node parameters in a topologically identical
-    graph specified by `hGraph`.
+    ``hGraphExec`` with the node parameters in a topologically identical
+    graph specified by ``hGraph``.
 
     Limitations:
 
@@ -37984,7 +38012,7 @@ def cudaGraphExecUpdate(hGraphExec, hGraph):
         priority values, before they are clamped to the device's supported
         range.
 
-      - If `hGraphExec` was not instantiated for device launch, a node
+      - If ``hGraphExec`` was not instantiated for device launch, a node
         whose function originally did not use device-side
         :py:obj:`~.cudaGraphLaunch()` cannot be updated to a function which
         uses device-side :py:obj:`~.cudaGraphLaunch()` unless the node
@@ -37992,7 +38020,7 @@ def cudaGraphExecUpdate(hGraphExec, hGraph):
         instantiate-time. If no such calls were present at instantiation,
         these updates cannot be performed at all.
 
-      - Neither `hGraph` nor `hGraphExec` may contain device-updatable
+      - Neither ``hGraph`` nor ``hGraphExec`` may contain device-updatable
         kernel nodes.
 
     - Memset and memcpy nodes:
@@ -38027,58 +38055,58 @@ def cudaGraphExecUpdate(hGraphExec, hGraph):
     Note: The API may add further restrictions in future releases. The
     return code should always be checked.
 
-    cudaGraphExecUpdate sets the result member of `resultInfo` to
+    cudaGraphExecUpdate sets the result member of ``resultInfo`` to
     cudaGraphExecUpdateErrorTopologyChanged under the following conditions:
 
-    - The count of nodes directly in `hGraphExec` and `hGraph` differ, in
-      which case resultInfo->errorNode is set to NULL.
+    - The count of nodes directly in ``hGraphExec`` and ``hGraph`` differ,
+      in which case resultInfo->errorNode is set to NULL.
 
-    - `hGraph` has more exit nodes than `hGraph`, in which case
+    - ``hGraph`` has more exit nodes than ``hGraphExec``, in which case
       resultInfo->errorNode is set to one of the exit nodes in hGraph.
 
-    - A node in `hGraph` has a different number of dependencies than the
-      node from `hGraphExec` it is paired with, in which case
-      resultInfo->errorNode is set to the node from `hGraph`.
+    - A node in ``hGraph`` has a different number of dependencies than the
+      node from ``hGraphExec`` it is paired with, in which case
+      resultInfo->errorNode is set to the node from ``hGraph``.
 
-    - A node in `hGraph` has a dependency that does not match with the
-      corresponding dependency of the paired node from `hGraphExec`.
-      resultInfo->errorNode will be set to the node from `hGraph`.
+    - A node in ``hGraph`` has a dependency that does not match with the
+      corresponding dependency of the paired node from ``hGraphExec``.
+      resultInfo->errorNode will be set to the node from ``hGraph``.
       resultInfo->errorFromNode will be set to the mismatched dependency.
       The dependencies are paired based on edge order and a dependency does
       not match when the nodes are already paired based on other edges
       examined in the graph.
 
-    cudaGraphExecUpdate sets `the` result member of `resultInfo` to:
+    cudaGraphExecUpdate sets the result member of ``resultInfo`` to:
 
     - cudaGraphExecUpdateError if passed an invalid value.
 
     - cudaGraphExecUpdateErrorTopologyChanged if the graph topology changed
 
     - cudaGraphExecUpdateErrorNodeTypeChanged if the type of a node
-      changed, in which case `hErrorNode_out` is set to the node from
-      `hGraph`.
+      changed, in which case ``hErrorNode_out`` is set to the node from
+      ``hGraph``.
 
     - cudaGraphExecUpdateErrorFunctionChanged if the function of a kernel
       node changed (CUDA driver < 11.2)
 
     - cudaGraphExecUpdateErrorUnsupportedFunctionChange if the func field
       of a kernel changed in an unsupported way(see note above), in which
-      case `hErrorNode_out` is set to the node from `hGraph`
+      case ``hErrorNode_out`` is set to the node from ``hGraph``
 
     - cudaGraphExecUpdateErrorParametersChanged if any parameters to a node
       changed in a way that is not supported, in which case
-      `hErrorNode_out` is set to the node from `hGraph`
+      ``hErrorNode_out`` is set to the node from ``hGraph``
 
     - cudaGraphExecUpdateErrorAttributesChanged if any attributes of a node
       changed in a way that is not supported, in which case
-      `hErrorNode_out` is set to the node from `hGraph`
+      ``hErrorNode_out`` is set to the node from ``hGraph``
 
     - cudaGraphExecUpdateErrorNotSupported if something about a node is
       unsupported, like the node's type or configuration, in which case
-      `hErrorNode_out` is set to the node from `hGraph`
+      ``hErrorNode_out`` is set to the node from ``hGraph``
 
     If the update fails for a reason not listed above, the result member of
-    `resultInfo` will be set to cudaGraphExecUpdateError. If the update
+    ``resultInfo`` will be set to cudaGraphExecUpdateError. If the update
     succeeds, the result member will be set to cudaGraphExecUpdateSuccess.
 
     cudaGraphExecUpdate returns cudaSuccess when the updated was performed
@@ -38134,11 +38162,11 @@ def cudaGraphExecUpdate(hGraphExec, hGraph):
 def cudaGraphUpload(graphExec, stream):
     """ Uploads an executable graph in a stream.
 
-    Uploads `hGraphExec` to the device in `hStream` without executing it.
-    Uploads of the same `hGraphExec` will be serialized. Each upload is
-    ordered behind both any previous work in `hStream` and any previous
-    launches of `hGraphExec`. Uses memory cached by `stream` to back the
-    allocations owned by `graphExec`.
+    Uploads ``graphExec`` to the device in ``stream`` without executing
+    it. Uploads of the same ``graphExec`` will be serialized. Each upload
+    is ordered behind both any previous work in ``stream`` and any
+    previous launches of ``graphExec``. Uses memory cached by ``stream``
+    to back the allocations owned by ``graphExec``.
 
     Parameters
     ----------
@@ -38183,14 +38211,14 @@ def cudaGraphUpload(graphExec, stream):
 def cudaGraphLaunch(graphExec, stream):
     """ Launches an executable graph in a stream.
 
-    Executes `graphExec` in `stream`. Only one instance of `graphExec` may
-    be executing at a time. Each launch is ordered behind both any previous
-    work in `stream` and any previous launches of `graphExec`. To execute a
-    graph concurrently, it must be instantiated multiple times into
-    multiple executable graphs.
+    Executes ``graphExec`` in ``stream``. Only one instance of
+    ``graphExec`` may be executing at a time. Each launch is ordered behind
+    both any previous work in ``stream`` and any previous launches of
+    ``graphExec``. To execute a graph concurrently, it must be instantiated
+    multiple times into multiple executable graphs.
 
-    If any allocations created by `graphExec` remain unfreed (from a
-    previous launch) and `graphExec` was not instantiated with
+    If any allocations created by ``graphExec`` remain unfreed (from a
+    previous launch) and ``graphExec`` was not instantiated with
     :py:obj:`~.cudaGraphInstantiateFlagAutoFreeOnLaunch`, the launch will
     fail with :py:obj:`~.cudaErrorInvalidValue`.
 
@@ -38237,7 +38265,7 @@ def cudaGraphLaunch(graphExec, stream):
 def cudaGraphExecDestroy(graphExec):
     """ Destroys an executable graph.
 
-    Destroys the executable graph specified by `graphExec`.
+    Destroys the executable graph specified by ``graphExec``.
 
     Parameters
     ----------
@@ -38272,7 +38300,7 @@ def cudaGraphExecDestroy(graphExec):
 def cudaGraphDestroy(graph):
     """ Destroys a graph.
 
-    Destroys the graph specified by `graph`, as well as all of its nodes.
+    Destroys the graph specified by ``graph``, as well as all of its nodes.
 
     Parameters
     ----------
@@ -38307,11 +38335,11 @@ def cudaGraphDestroy(graph):
 def cudaGraphDebugDotPrint(graph, char* path, unsigned int flags):
     """ Write a DOT file describing graph structure.
 
-    Using the provided `graph`, write to `path` a DOT formatted description
-    of the graph. By default this includes the graph topology, node types,
-    node id, kernel names and memcpy direction. `flags` can be specified to
-    write more detailed information about each node type such as parameter
-    values, kernel attributes, node and function handles.
+    Using the provided ``graph``, write to ``path`` a DOT formatted
+    description of the graph. By default this includes the graph topology,
+    node types, node id, kernel names and memcpy direction. ``flags`` can
+    be specified to write more detailed information about each node type
+    such as parameter values, kernel attributes, node and function handles.
 
     Parameters
     ----------
@@ -38604,24 +38632,24 @@ def cudaGraphReleaseUserObject(graph, object, unsigned int count):
 def cudaGraphAddNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t] | list[cudaGraphNode_t]], dependencyData : Optional[tuple[cudaGraphEdgeData] | list[cudaGraphEdgeData]], size_t numDependencies, nodeParams : Optional[cudaGraphNodeParams]):
     """ Adds a node of arbitrary type to a graph.
 
-    Creates a new node in `graph` described by `nodeParams` with
-    `numDependencies` dependencies specified via `pDependencies`.
-    `numDependencies` may be 0. `pDependencies` may be null if
-    `numDependencies` is 0. `pDependencies` may not have any duplicate
+    Creates a new node in ``graph`` described by ``nodeParams`` with
+    ``numDependencies`` dependencies specified via ``pDependencies``.
+    ``numDependencies`` may be 0. ``pDependencies`` may be null if
+    ``numDependencies`` is 0. ``pDependencies`` may not have any duplicate
     entries.
 
-    `nodeParams` is a tagged union. The node type should be specified in
-    the `typename` field, and type-specific parameters in the corresponding
-    union member. All unused bytes - that is, `reserved0` and all bytes
-    past the utilized union member - must be set to zero. It is recommended
-    to use brace initialization or memset to ensure all bytes are
-    initialized.
+    ``nodeParams`` is a tagged union. The node type should be specified in
+    the ``typename`` field, and type-specific parameters in the
+    corresponding union member. All unused bytes - that is, ``reserved0``
+    and all bytes past the utilized union member - must be set to zero. It
+    is recommended to use brace initialization or memset to ensure all
+    bytes are initialized.
 
-    Note that for some node types, `nodeParams` may contain "out
+    Note that for some node types, ``nodeParams`` may contain "out
     parameters" which are modified during the call, such as
-    `nodeParams->alloc.dptr`.
+    ``nodeParams->alloc.dptr``.
 
-    A handle to the new node will be returned in `phGraphNode`.
+    A handle to the new node will be returned in ``phGraphNode``.
 
     Parameters
     ----------
@@ -38700,10 +38728,10 @@ def cudaGraphAddNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t] | li
 def cudaGraphNodeSetParams(node, nodeParams : Optional[cudaGraphNodeParams]):
     """ Update a graph node's parameters.
 
-    Sets the parameters of graph node `node` to `nodeParams`. The node type
-    specified by `nodeParams->type` must match the type of `node`.
-    `nodeParams` must be fully initialized and all unused bytes (reserved,
-    padding) zeroed.
+    Sets the parameters of graph node ``node`` to ``nodeParams``. The node
+    type specified by ``nodeParams->type`` must match the type of ``node``.
+    ``nodeParams`` must be fully initialized and all unused bytes
+    (reserved, padding) zeroed.
 
     Modifying parameters is not supported for node types
     cudaGraphNodeTypeMemAlloc and cudaGraphNodeTypeMemFree.
@@ -38744,11 +38772,11 @@ def cudaGraphNodeSetParams(node, nodeParams : Optional[cudaGraphNodeParams]):
 def cudaGraphNodeGetParams(node):
     """ Returns a graph node's parameters.
 
-    Returns the parameters of graph node `node` in `*nodeParams`.
+    Returns the parameters of graph node ``node`` in ``*nodeParams``.
 
-    Any pointers returned in `*nodeParams` point to driver-owned memory
+    Any pointers returned in ``*nodeParams`` point to driver-owned memory
     associated with the node. This memory remains valid until the node is
-    destroyed. Any memory pointed to from `*nodeParams` must not be
+    destroyed. Any memory pointed to from ``*nodeParams`` must not be
     modified.
 
     The returned parameters are a description of the node, but may not be
@@ -38797,14 +38825,14 @@ def cudaGraphNodeGetParams(node):
 def cudaGraphExecNodeSetParams(graphExec, node, nodeParams : Optional[cudaGraphNodeParams]):
     """ Update a graph node's parameters in an instantiated graph.
 
-    Sets the parameters of a node in an executable graph `graphExec`. The
-    node is identified by the corresponding node `node` in the non-
+    Sets the parameters of a node in an executable graph ``graphExec``. The
+    node is identified by the corresponding node ``node`` in the non-
     executable graph from which the executable graph was instantiated.
-    `node` must not have been removed from the original graph.
+    ``node`` must not have been removed from the original graph.
 
-    The modifications only affect future launches of `graphExec`. Already
-    enqueued or running launches of `graphExec` are not affected by this
-    call. `node` is also not modified by this call.
+    The modifications only affect future launches of ``graphExec``. Already
+    enqueued or running launches of ``graphExec`` are not affected by this
+    call. ``node`` is also not modified by this call.
 
     Allowed changes to parameters on executable graphs are as follows:
 
@@ -38857,7 +38885,7 @@ def cudaGraphExecNodeSetParams(graphExec, node, nodeParams : Optional[cudaGraphN
 def cudaGraphConditionalHandleCreate(graph, unsigned int defaultLaunchValue, unsigned int flags):
     """ Create a conditional handle.
 
-    Creates a conditional handle associated with `hGraph`.
+    Creates a conditional handle associated with ``graph``.
 
     The conditional handle must be associated with a conditional node in
     this graph or one of its children.
@@ -38872,7 +38900,7 @@ def cudaGraphConditionalHandleCreate(graph, unsigned int defaultLaunchValue, uns
     defaultLaunchValue : unsigned int
         Optional initial value for the conditional variable. Applied at the
         beginning of each graph execution if cudaGraphCondAssignDefault is
-        set in `flags`.
+        set in ``flags``.
     flags : unsigned int
         Currently must be cudaGraphCondAssignDefault or 0.
 
@@ -38909,7 +38937,7 @@ def cudaGraphConditionalHandleCreate(graph, unsigned int defaultLaunchValue, uns
 def cudaGraphConditionalHandleCreate_v2(graph, ctx, unsigned int defaultLaunchValue, unsigned int flags):
     """ Create a conditional handle.
 
-    Creates a conditional handle associated with `hGraph`.
+    Creates a conditional handle associated with ``graph``.
 
     The conditional handle must be associated with a conditional node in
     this graph or one of its children.
@@ -38927,7 +38955,7 @@ def cudaGraphConditionalHandleCreate_v2(graph, ctx, unsigned int defaultLaunchVa
     defaultLaunchValue : unsigned int
         Optional initial value for the conditional variable. Applied at the
         beginning of each graph execution if cudaGraphCondAssignDefault is
-        set in `flags`.
+        set in ``flags``.
     flags : unsigned int
         Currently must be cudaGraphCondAssignDefault or 0.
 
@@ -38974,8 +39002,8 @@ def cudaGetDriverEntryPoint(char* symbol, unsigned long long flags):
 
     [Deprecated]
 
-    Returns in `**funcPtr` the address of the CUDA driver function for the
-    requested flags.
+    Returns in ``**funcPtr`` the address of the CUDA driver function for
+    the requested flags.
 
     For a requested driver symbol, if the CUDA version in which the driver
     symbol was introduced is less than or equal to the CUDA runtime
@@ -38990,20 +39018,20 @@ def cudaGetDriverEntryPoint(char* symbol, unsigned long long flags):
     cuda.h.
 
     The API will return :py:obj:`~.cudaSuccess` and set the returned
-    `funcPtr` if the requested driver function is valid and supported on
+    ``funcPtr`` if the requested driver function is valid and supported on
     the platform.
 
     The API will return :py:obj:`~.cudaSuccess` and set the returned
-    `funcPtr` to NULL if the requested driver function is not supported on
-    the platform, no ABI compatible driver function exists for the CUDA
+    ``funcPtr`` to NULL if the requested driver function is not supported
+    on the platform, no ABI compatible driver function exists for the CUDA
     runtime version or if the driver symbol is invalid.
 
-    It will also set the optional `driverStatus` to one of the values in
+    It will also set the optional ``driverStatus`` to one of the values in
     :py:obj:`~.cudaDriverEntryPointQueryResult` with the following
     meanings:
 
     - :py:obj:`~.cudaDriverEntryPointSuccess` - The requested symbol was
-      succesfully found based on input arguments and `pfn` is valid
+      successfully found based on input arguments and ``pfn`` is valid
 
     - :py:obj:`~.cudaDriverEntryPointSymbolNotFound` - The requested symbol
       was not found
@@ -39034,7 +39062,7 @@ def cudaGetDriverEntryPoint(char* symbol, unsigned long long flags):
     ----------
     symbol : bytes
         The base name of the driver API function to look for. As an
-        example, for the driver API :py:obj:`~.cuMemAlloc_v2`, `symbol`
+        example, for the driver API :py:obj:`~.cuMemAlloc_v2`, ``symbol``
         would be cuMemAlloc. Note that the API will use the CUDA runtime
         version to return the address to the most recent ABI compatible
         driver symbol, :py:obj:`~.cuMemAlloc` or :py:obj:`~.cuMemAlloc_v2`.
@@ -39076,8 +39104,8 @@ def cudaGetDriverEntryPoint(char* symbol, unsigned long long flags):
 def cudaGetDriverEntryPointByVersion(char* symbol, unsigned int cudaVersion, unsigned long long flags):
     """ Returns the requested driver API function pointer by CUDA version.
 
-    Returns in `**funcPtr` the address of the CUDA driver function for the
-    requested flags and CUDA driver version.
+    Returns in ``**funcPtr`` the address of the CUDA driver function for
+    the requested flags and CUDA driver version.
 
     The CUDA version is specified as (1000 * major + 10 * minor), so CUDA
     11.2 should be specified as 11020. For a requested driver symbol, if
@@ -39102,27 +39130,27 @@ def cudaGetDriverEntryPointByVersion(char* symbol, unsigned int cudaVersion, uns
     matching custom function typedefs.
 
     The API will return :py:obj:`~.cudaSuccess` and set the returned
-    `funcPtr` if the requested driver function is valid and supported on
+    ``funcPtr`` if the requested driver function is valid and supported on
     the platform.
 
     The API will return :py:obj:`~.cudaSuccess` and set the returned
-    `funcPtr` to NULL if the requested driver function is not supported on
-    the platform, no ABI compatible driver function exists for the
+    ``funcPtr`` to NULL if the requested driver function is not supported
+    on the platform, no ABI compatible driver function exists for the
     requested version or if the driver symbol is invalid.
 
-    It will also set the optional `driverStatus` to one of the values in
+    It will also set the optional ``driverStatus`` to one of the values in
     :py:obj:`~.cudaDriverEntryPointQueryResult` with the following
     meanings:
 
     - :py:obj:`~.cudaDriverEntryPointSuccess` - The requested symbol was
-      succesfully found based on input arguments and `pfn` is valid
+      successfully found based on input arguments and ``pfn`` is valid
 
     - :py:obj:`~.cudaDriverEntryPointSymbolNotFound` - The requested symbol
       was not found
 
     - :py:obj:`~.cudaDriverEntryPointVersionNotSufficent` - The requested
       symbol was found but is not supported by the specified version
-      `cudaVersion`
+      ``cudaVersion``
 
     The requested flags can be:
 
@@ -39146,7 +39174,7 @@ def cudaGetDriverEntryPointByVersion(char* symbol, unsigned int cudaVersion, uns
     ----------
     symbol : bytes
         The base name of the driver API function to look for. As an
-        example, for the driver API :py:obj:`~.cuMemAlloc_v2`, `symbol`
+        example, for the driver API :py:obj:`~.cuMemAlloc_v2`, ``symbol``
         would be cuMemAlloc.
     cudaVersion : unsigned int
         The CUDA version to look for the requested driver symbol
@@ -39184,40 +39212,40 @@ def cudaGetDriverEntryPointByVersion(char* symbol, unsigned int cudaVersion, uns
 def cudaLibraryLoadData(code, jitOptions : Optional[tuple[cudaJitOption] | list[cudaJitOption]], jitOptionsValues : Optional[tuple[Any] | list[Any]], unsigned int numJitOptions, libraryOptions : Optional[tuple[cudaLibraryOption] | list[cudaLibraryOption]], libraryOptionValues : Optional[tuple[Any] | list[Any]], unsigned int numLibraryOptions):
     """ Load a library with specified code and options.
 
-    Takes a pointer `code` and loads the corresponding library `library`
-    based on the application defined library loading mode:
+    Takes a pointer ``code`` and loads the corresponding library
+    ``library`` based on the application defined library loading mode:
 
     - If module loading is set to EAGER, via the environment variables
-      described in "Module loading", `library` is loaded eagerly into all
+      described in "Module loading", ``library`` is loaded eagerly into all
       contexts at the time of the call and future contexts at the time of
       creation until the library is unloaded with
       :py:obj:`~.cudaLibraryUnload()`.
 
-    - If the environment variables are set to LAZY, `library` is not
+    - If the environment variables are set to LAZY, ``library`` is not
       immediately loaded onto all existent contexts and will only be loaded
       when a function is needed for that context, such as a kernel launch.
 
     These environment variables are described in the CUDA programming guide
     under the "CUDA environment variables" section.
 
-    The `code` may be a `cubin` or `fatbin` as output by nvcc, or a NULL-
-    terminated `PTX`, either as output by nvcc or hand-written, or `Tile`
-    IR data. A fatbin should also contain relocatable code when doing
-    separate compilation. Please also see the documentation for nvrtc
+    The ``code`` may be a ``cubin`` or ``fatbin`` as output by nvcc, or a
+    NULL-terminated ``PTX``, either as output by nvcc or hand-written, or
+    ``Tile`` IR data. A fatbin should also contain relocatable code when
+    doing separate compilation. Please also see the documentation for nvrtc
     (https://docs.nvidia.com/cuda/nvrtc/index.html), nvjitlink
     (https://docs.nvidia.com/cuda/nvjitlink/index.html), and nvfatbin
     (https://docs.nvidia.com/cuda/nvfatbin/index.html) for more information
     on generating loadable code at runtime.
 
-    Options are passed as an array via `jitOptions` and any corresponding
-    parameters are passed in `jitOptionsValues`. The number of total JIT
-    options is supplied via `numJitOptions`. Any outputs will be returned
-    via `jitOptionsValues`.
+    Options are passed as an array via ``jitOptions`` and any corresponding
+    parameters are passed in ``jitOptionsValues``. The number of total JIT
+    options is supplied via ``numJitOptions``. Any outputs will be returned
+    via ``jitOptionsValues``.
 
-    Library load options are passed as an array via `libraryOptions` and
-    any corresponding parameters are passed in `libraryOptionValues`. The
+    Library load options are passed as an array via ``libraryOptions`` and
+    any corresponding parameters are passed in ``libraryOptionValues``. The
     number of total library load options is supplied via
-    `numLibraryOptions`.
+    ``numLibraryOptions``.
 
     Parameters
     ----------
@@ -39284,41 +39312,41 @@ def cudaLibraryLoadData(code, jitOptions : Optional[tuple[cudaJitOption] | list[
 def cudaLibraryLoadFromFile(char* fileName, jitOptions : Optional[tuple[cudaJitOption] | list[cudaJitOption]], jitOptionsValues : Optional[tuple[Any] | list[Any]], unsigned int numJitOptions, libraryOptions : Optional[tuple[cudaLibraryOption] | list[cudaLibraryOption]], libraryOptionValues : Optional[tuple[Any] | list[Any]], unsigned int numLibraryOptions):
     """ Load a library with specified file and options.
 
-    Takes a pointer `code` and loads the corresponding library `library`
-    based on the application defined library loading mode:
+    Takes a file name ``fileName`` and loads the corresponding library
+    ``library`` based on the application defined library loading mode:
 
     - If module loading is set to EAGER, via the environment variables
-      described in "Module loading", `library` is loaded eagerly into all
+      described in "Module loading", ``library`` is loaded eagerly into all
       contexts at the time of the call and future contexts at the time of
       creation until the library is unloaded with
       :py:obj:`~.cudaLibraryUnload()`.
 
-    - If the environment variables are set to LAZY, `library` is not
+    - If the environment variables are set to LAZY, ``library`` is not
       immediately loaded onto all existent contexts and will only be loaded
       when a function is needed for that context, such as a kernel launch.
 
     These environment variables are described in the CUDA programming guide
     under the "CUDA environment variables" section.
 
-    The file should be a `cubin` file as output by nvcc, or a `PTX` file
-    either as output by nvcc or handwritten, or a `fatbin` file as output
-    by nvcc or hand-written, or `Tile` IR file. A fatbin should also
-    contain relocatable code when doing separate compilation. Please also
-    see the documentation for nvrtc
+    The file should be a ``cubin`` file as output by nvcc, or a ``PTX``
+    file either as output by nvcc or handwritten, or a ``fatbin`` file as
+    output by nvcc or hand-written, or ``Tile`` IR file. A fatbin should
+    also contain relocatable code when doing separate compilation. Please
+    also see the documentation for nvrtc
     (https://docs.nvidia.com/cuda/nvrtc/index.html), nvjitlink
     (https://docs.nvidia.com/cuda/nvjitlink/index.html), and nvfatbin
     (https://docs.nvidia.com/cuda/nvfatbin/index.html) for more information
     on generating loadable code at runtime.
 
-    Options are passed as an array via `jitOptions` and any corresponding
-    parameters are passed in `jitOptionsValues`. The number of total
-    options is supplied via `numJitOptions`. Any outputs will be returned
-    via `jitOptionsValues`.
+    Options are passed as an array via ``jitOptions`` and any corresponding
+    parameters are passed in ``jitOptionsValues``. The number of total
+    options is supplied via ``numJitOptions``. Any outputs will be returned
+    via ``jitOptionsValues``.
 
-    Library load options are passed as an array via `libraryOptions` and
-    any corresponding parameters are passed in `libraryOptionValues`. The
+    Library load options are passed as an array via ``libraryOptions`` and
+    any corresponding parameters are passed in ``libraryOptionValues``. The
     number of total library load options is supplied via
-    `numLibraryOptions`.
+    ``numLibraryOptions``.
 
     Parameters
     ----------
@@ -39382,7 +39410,7 @@ def cudaLibraryLoadFromFile(char* fileName, jitOptions : Optional[tuple[cudaJitO
 def cudaLibraryUnload(library):
     """ Unloads a library.
 
-    Unloads the library specified with `library`
+    Unloads the library specified with ``library``
 
     Parameters
     ----------
@@ -39417,9 +39445,9 @@ def cudaLibraryUnload(library):
 def cudaLibraryGetKernel(library, char* name):
     """ Returns a kernel handle.
 
-    Returns in `pKernel` the handle of the kernel with name `name` located
-    in library `library`. If kernel handle is not found, the call returns
-    :py:obj:`~.cudaErrorSymbolNotFound`.
+    Returns in ``pKernel`` the handle of the kernel with name ``name``
+    located in library ``library``. If kernel handle is not found, the call
+    returns :py:obj:`~.cudaErrorSymbolNotFound`.
 
     Parameters
     ----------
@@ -39461,14 +39489,15 @@ def cudaLibraryGetKernel(library, char* name):
 def cudaLibraryGetGlobal(library, char* name):
     """ Returns a global device pointer.
 
-    Returns in `*dptr` and `*bytes` the base pointer and size of the global
-    with name `name` for the requested library `library` and the current
-    device. If no global for the requested name `name` exists, the call
-    returns :py:obj:`~.cudaErrorSymbolNotFound`. One of the parameters
-    `dptr` or `numbytes` (not both) can be NULL in which case it is
-    ignored. The returned `dptr` cannot be passed to the Symbol APIs such
-    as :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`,
-    :py:obj:`~.cudaGetSymbolAddress`, or :py:obj:`~.cudaGetSymbolSize`.
+    Returns in ``*dptr`` and ``*bytes`` the base pointer and size of the
+    global with name ``name`` for the requested library ``library`` and the
+    current device. If no global for the requested name ``name`` exists,
+    the call returns :py:obj:`~.cudaErrorSymbolNotFound`. One of the
+    parameters ``dptr`` or ``numbytes`` (not both) can be NULL in which
+    case it is ignored. The returned ``dptr`` cannot be passed to the
+    Symbol APIs such as :py:obj:`~.cudaMemcpyToSymbol`,
+    :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaGetSymbolAddress`, or
+    :py:obj:`~.cudaGetSymbolSize`.
 
     Parameters
     ----------
@@ -39513,14 +39542,14 @@ def cudaLibraryGetGlobal(library, char* name):
 def cudaLibraryGetManaged(library, char* name):
     """ Returns a pointer to managed memory.
 
-    Returns in `*dptr` and `*bytes` the base pointer and size of the
-    managed memory with name `name` for the requested library `library`. If
-    no managed memory with the requested name `name` exists, the call
-    returns :py:obj:`~.cudaErrorSymbolNotFound`. One of the parameters
-    `dptr` or `numbytes` (not both) can be NULL in which case it is
-    ignored. Note that managed memory for library `library` is shared
-    across devices and is registered when the library is loaded. The
-    returned `dptr` cannot be passed to the Symbol APIs such as
+    Returns in ``*dptr`` and ``*bytes`` the base pointer and size of the
+    managed memory with name ``name`` for the requested library
+    ``library``. If no managed memory with the requested name ``name``
+    exists, the call returns :py:obj:`~.cudaErrorSymbolNotFound`. One of
+    the parameters ``dptr`` or ``numbytes`` (not both) can be NULL in which
+    case it is ignored. Note that managed memory for library ``library`` is
+    shared across devices and is registered when the library is loaded. The
+    returned ``dptr`` cannot be passed to the Symbol APIs such as
     :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`,
     :py:obj:`~.cudaGetSymbolAddress`, or :py:obj:`~.cudaGetSymbolSize`.
 
@@ -39567,11 +39596,12 @@ def cudaLibraryGetManaged(library, char* name):
 def cudaLibraryGetUnifiedFunction(library, char* symbol):
     """ Returns a pointer to a unified function.
 
-    Returns in `*fptr` the function pointer to a unified function denoted
-    by `symbol`. If no unified function with name `symbol` exists, the call
-    returns :py:obj:`~.cudaErrorSymbolNotFound`. If there is no device with
-    attribute :py:obj:`~.cudaDeviceProp.unifiedFunctionPointers` present in
-    the system, the call may return :py:obj:`~.cudaErrorSymbolNotFound`.
+    Returns in ``*fptr`` the function pointer to a unified function denoted
+    by ``symbol``. If no unified function with name ``symbol`` exists, the
+    call returns :py:obj:`~.cudaErrorSymbolNotFound`. If there is no device
+    with attribute :py:obj:`~.cudaDeviceProp.unifiedFunctionPointers`
+    present in the system, the call may return
+    :py:obj:`~.cudaErrorSymbolNotFound`.
 
     Parameters
     ----------
@@ -39613,7 +39643,7 @@ def cudaLibraryGetUnifiedFunction(library, char* symbol):
 def cudaLibraryGetKernelCount(lib):
     """ Returns the number of kernels within a library.
 
-    Returns in `count` the number of kernels in `lib`.
+    Returns in ``count`` the number of kernels in ``lib``.
 
     Parameters
     ----------
@@ -39653,9 +39683,9 @@ def cudaLibraryGetKernelCount(lib):
 def cudaLibraryEnumerateKernels(unsigned int numKernels, lib):
     """ Retrieve the kernel handles within a library.
 
-    Returns in `kernels` a maximum number of `numKernels` kernel handles
-    within `lib`. The returned kernel handle becomes invalid when the
-    library is unloaded.
+    Returns in ``kernels`` a maximum number of ``numKernels`` kernel
+    handles within ``lib``. The returned kernel handle becomes invalid when
+    the library is unloaded.
 
     Parameters
     ----------
@@ -39706,23 +39736,24 @@ def cudaLibraryEnumerateKernels(unsigned int numKernels, lib):
 def cudaKernelSetAttributeForDevice(kernel, attr not None : cudaFuncAttribute, int value, int device):
     """ Sets information about a kernel.
 
-    This call sets the value of a specified attribute `attr` on the kernel
-    `kernel` for the requested device `device` to an integer value
-    specified by `value`. This function returns :py:obj:`~.cudaSuccess` if
-    the new value of the attribute could be successfully set. If the set
-    fails, this call will return an error. Not all attributes can have
-    values set. Attempting to set a value on a read-only attribute will
-    result in an error (:py:obj:`~.cudaErrorInvalidValue`)
+    This call sets the value of a specified attribute ``attr`` on the
+    kernel ``kernel`` for the requested device ``device`` to an integer
+    value specified by ``value``. This function returns
+    :py:obj:`~.cudaSuccess` if the new value of the attribute could be
+    successfully set. If the set fails, this call will return an error. Not
+    all attributes can have values set. Attempting to set a value on a
+    read-only attribute will result in an error
+    (:py:obj:`~.cudaErrorInvalidValue`).
 
     Note that attributes set using :py:obj:`~.cudaFuncSetAttribute()` will
     override the attribute set by this API irrespective of whether the call
     to :py:obj:`~.cudaFuncSetAttribute()` is made before or after this API
     call. Because of this and the stricter locking requirements mentioned
     below it is suggested that this call be used during the initialization
-    path and not on each thread accessing `kernel` such as on kernel
+    path and not on each thread accessing ``kernel`` such as on kernel
     launches or on the critical path.
 
-    Valid values for `attr` are:
+    Valid values for ``attr`` are:
 
     - :py:obj:`~.cudaFuncAttributeMaxDynamicSharedMemorySize` - The
       requested maximum size in bytes of dynamically-allocated shared
@@ -39812,8 +39843,8 @@ def cudaKernelSetAttributeForDevice(kernel, attr not None : cudaFuncAttribute, i
 def cudaDeviceGetDevResource(int device, typename not None : cudaDevResourceType):
     """ Get device resources.
 
-    Get the `typename` resources available to the `device`. This may often
-    be the starting point for further partitioning or configuring of
+    Get the ``typename`` resources available to the ``device``. This may
+    often be the starting point for further partitioning or configuring of
     resources.
 
     Note: The API is not supported on 32-bit platforms.
@@ -39849,17 +39880,17 @@ def cudaDeviceGetDevResource(int device, typename not None : cudaDevResourceType
 
 @cython.embedsignature(True)
 def cudaDevSmResourceSplitByCount(unsigned int nbGroups, input_ : Optional[cudaDevResource], unsigned int flags, unsigned int minCount):
-    """ Splits `cudaDevResourceTypeSm` resources.
-
-    Splits `cudaDevResourceTypeSm` resources into `nbGroups`, adhering to
-    the minimum SM count specified in `minCount` and the usage flags in
-    `flags`. If `result` is NULL, the API simulates a split and provides
-    the amount of groups that would be created in `nbGroups`. Otherwise,
-    `nbGroups` must point to the amount of elements in `result` and on
-    return, the API will overwrite `nbGroups` with the amount actually
-    created. The groups are written to the array in `result`. `nbGroups`
-    can be less than the total amount if a smaller number of groups is
-    needed.
+    """ Splits ``cudaDevResourceTypeSm`` resources.
+
+    Splits ``cudaDevResourceTypeSm`` resources into ``nbGroups``, adhering
+    to the minimum SM count specified in ``minCount`` and the usage flags
+    in ``flags``. If ``result`` is NULL, the API simulates a split and
+    provides the amount of groups that would be created in ``nbGroups``.
+    Otherwise, ``nbGroups`` must point to the amount of elements in
+    ``result`` and on return, the API will overwrite ``nbGroups`` with the
+    amount actually created. The groups are written to the array in
+    ``result``. ``nbGroups`` can be less than the total amount if a smaller
+    number of groups is needed.
 
     This API is used to spatially partition the input resource. The input
     resource needs to come from one of
@@ -39872,42 +39903,43 @@ def cudaDevSmResourceSplitByCount(unsigned int nbGroups, input_ : Optional[cudaD
     performance and functional characteristics of the input resource, and
     guarantee a split that will create a disjoint set of symmetrical
     partitions. This may lead to fewer groups created than purely dividing
-    the total SM count by the `minCount` due to cluster requirements or
+    the total SM count by the ``minCount`` due to cluster requirements or
     alignment and granularity requirements for the minCount. These
     requirements can be queried with :py:obj:`~.cudaDeviceGetDevResource`,
     or :py:obj:`~.cudaExecutionCtxGetDevResource` for
-    :py:obj:`~.cudaDevResourceTypeSm`, using the `minSmPartitionSize` and
-    `smCoscheduledAlignment` fields to determine minimum partition size and
-    alignment granularity, respectively.
+    :py:obj:`~.cudaDevResourceTypeSm`, using the ``minSmPartitionSize`` and
+    ``smCoscheduledAlignment`` fields to determine minimum partition size
+    and alignment granularity, respectively.
 
-    The `remainder` set does not have the same functional or performance
-    guarantees as the groups in `result`. Its use should be carefully
-    planned and future partitions of the `remainder` set are discouraged.
+    The ``remainder`` set does not have the same functional or performance
+    guarantees as the groups in ``result``. Its use should be carefully
+    planned and future partitions of the ``remainder`` set are discouraged.
 
     The following flags are supported:
 
-    - `cudaDevSmResourceSplitIgnoreSmCoscheduling` : Lower the minimum SM
+    - ``cudaDevSmResourceSplitIgnoreSmCoscheduling`` : Lower the minimum SM
       count and alignment, and treat each SM independent of its hierarchy.
       This allows more fine grained partitions but at the cost of advanced
       features (such as large clusters on compute capability 9.0+).
 
-    - `cudaDevSmResourceSplitMaxPotentialClusterSize` : Compute Capability
-      9.0+ only. Attempt to create groups that may allow for maximally
-      sized thread clusters. This can be queried post green context
-      creation using :py:obj:`~.cudaOccupancyMaxPotentialClusterSize`.
+    - ``cudaDevSmResourceSplitMaxPotentialClusterSize`` : Compute
+      Capability 9.0+ only. Attempt to create groups that may allow for
+      maximally sized thread clusters. This can be queried post green
+      context creation using
+      :py:obj:`~.cudaOccupancyMaxPotentialClusterSize`.
 
     A successful API call must either have:
 
-    - A valid array of `result` pointers of size passed in `nbGroups`, with
-      `input` of type `cudaDevResourceTypeSm`. Value of `minCount` must be
-      between 0 and the SM count specified in `input`. `remaining` may be
-      NULL.
+    - A valid array of ``result`` pointers of size passed in ``nbGroups``,
+      with ``input`` of type ``cudaDevResourceTypeSm``. Value of
+      ``minCount`` must be between 0 and the SM count specified in
+      ``input``. ``remaining`` may be NULL.
 
-    - NULL passed in for `result`, with a valid integer pointer in
-      `nbGroups` and `input` of type `cudaDevResourceTypeSm`. Value of
-      `minCount` must be between 0 and the SM count specified in `input`.
-      `remaining` may be NULL. This queries the number of groups that would
-      be created by the API.
+    - NULL passed in for ``result``, with a valid integer pointer in
+      ``nbGroups`` and ``input`` of type ``cudaDevResourceTypeSm``. Value
+      of ``minCount`` must be between 0 and the SM count specified in
+      ``input``. ``remaining`` may be NULL. This queries the number of
+      groups that would be created by the API.
 
     Note: The API is not supported on 32-bit platforms.
 
@@ -39917,8 +39949,8 @@ def cudaDevSmResourceSplitByCount(unsigned int nbGroups, input_ : Optional[cudaD
         This is a pointer, specifying the number of groups that would be or
         should be created as described below.
     input : :py:obj:`~.cudaDevResource`
-        Input SM resource to be split. Must be a valid `cudaDevSmResource`
-        resource.
+        Input SM resource to be split. Must be a valid
+        ``cudaDevSmResource`` resource.
     flags : unsigned int
         Flags specifying how these partitions are used or which constraints
         to abide by when splitting the input. Zero is valid for default
@@ -39931,14 +39963,14 @@ def cudaDevSmResourceSplitByCount(unsigned int nbGroups, input_ : Optional[cudaD
     cudaError_t
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorNotPermitted`, :py:obj:`~.cudaErrorInvalidResourceType`, :py:obj:`~.cudaErrorInvalidResourceConfiguration`, :py:obj:`~.cudaErrorNotSupported`, :py:obj:`~.cudaErrorCudartUnloading`, :py:obj:`~.cudaErrorInitializationError`
     result : list[:py:obj:`~.cudaDevResource`]
-        Output array of `cudaDevResource` resources. Can be NULL to query
+        Output array of ``cudaDevResource`` resources. Can be NULL to query
         the number of groups.
     nbGroups : unsigned int
         This is a pointer, specifying the number of groups that would be or
         should be created as described below.
     remaining : :py:obj:`~.cudaDevResource`
-        If the input resource cannot be cleanly split among `nbGroups`, the
-        remaining is placed in here. Can be ommitted (NULL) if the user
+        If the input resource cannot be cleanly split among ``nbGroups``,
+        the remaining is placed in here. Can be omitted (NULL) if the user
         does not need the remaining set.
 
     See Also
@@ -39970,82 +40002,84 @@ def cudaDevSmResourceSplitByCount(unsigned int nbGroups, input_ : Optional[cudaD
 
 @cython.embedsignature(True)
 def cudaDevSmResourceSplit(unsigned int nbGroups, input_ : Optional[cudaDevResource], unsigned int flags, groupParams : Optional[tuple[cudaDevSmResourceGroupParams] | list[cudaDevSmResourceGroupParams]]):
-    """ Splits a `cudaDevResourceTypeSm` resource into structured groups.
+    """ Splits a ``cudaDevResourceTypeSm`` resource into structured groups.
 
     This API will split a resource of :py:obj:`~.cudaDevResourceTypeSm`
-    into `nbGroups` structured device resource groups (the `result` array),
-    as well as an optional `remainder`, according to a set of requirements
-    specified in the `groupParams` array. The term “structured” is a trait
-    that specifies the `result` has SMs that are co-scheduled together.
-    This co-scheduling can be specified via the `coscheduledSmCount` field
-    of the `groupParams` structure, while the `smCount` will specify how
-    many SMs are required in total for that result. The remainder is always
-    “unstructured”, it does not have any set guarantees with respect to co-
-    scheduling and those properties will need to either be queried via the
-    occupancy set of APIs or further split into structured groups by this
-    API.
+    into ``nbGroups`` structured device resource groups (the ``result``
+    array), as well as an optional ``remainder``, according to a set of
+    requirements specified in the ``groupParams`` array. The term
+    “structured” is a trait that specifies the ``result`` has SMs that are
+    co-scheduled together. This co-scheduling can be specified via the
+    ``coscheduledSmCount`` field of the ``groupParams`` structure, while
+    the ``smCount`` will specify how many SMs are required in total for
+    that result. The remainder is always “unstructured”; it does not have
+    any set guarantees with respect to co-scheduling and those properties
+    will need to either be queried via the occupancy set of APIs or further
+    split into structured groups by this API.
 
     The API has a discovery mode for use cases where it is difficult to
     know ahead of time what the SM count should be. Discovery happens when
-    the `smCount` field of a given `groupParams` array entry is set to 0 -
-    the smCount will be filled in by the API with the derived SM count
-    according to the provided `groupParams` fields and constraints.
+    the ``smCount`` field of a given ``groupParams`` array entry is set to
+    0 - the smCount will be filled in by the API with the derived SM count
+    according to the provided ``groupParams`` fields and constraints.
     Discovery can be used with both a valid result array and with a NULL
-    `result` pointer value. The latter is useful in situations where the
+    ``result`` pointer value. The latter is useful in situations where the
     smCount will end up being zero, which is an invalid value to create a
-    result entry with, but allowed for discovery purposes when the `result`
-    is NULL.
+    result entry with, but allowed for discovery purposes when the
+    ``result`` is NULL.
 
-    The `groupParams` array is evaluated from index 0 to `nbGroups` - 1.
-    For each index in the `groupParams` array, the API will evaluate which
-    SMs may be a good fit based on constraints and assign those SMs to
-    `result`. This evaluation order is important to consider when using
-    discovery mode, as it helps discover the remaining SMs.
+    The ``groupParams`` array is evaluated from index 0 to ``nbGroups`` -
+    1. For each index in the ``groupParams`` array, the API will evaluate
+    which SMs may be a good fit based on constraints and assign those SMs
+    to ``result``. This evaluation order is important to consider when
+    using discovery mode, as it helps discover the remaining SMs.
 
     For a valid call:
 
-    - `result` should point to a `cudaDevResource` array of size
-      `nbGroups`, or alternatively, may be NULL, if the developer wishes
+    - ``result`` should point to a ``cudaDevResource`` array of size
+      ``nbGroups``, or alternatively, may be NULL, if the developer wishes
       for only the groupParams entries to be updated
 
-    - `input` should be a valid :py:obj:`~.cudaDevResourceTypeSm` resource
-      that originates from querying the execution context, or device.
+    - ``input`` should be a valid :py:obj:`~.cudaDevResourceTypeSm`
+      resource that originates from querying the execution context, or
+      device.
 
-    - The `remainder` group may be NULL.
+    - The ``remainder`` group may be NULL.
 
-    - There are no API `flags` at this time, so the value passed in should
-      be 0.
+    - There are no API ``flags`` at this time, so the value passed in
+      should be 0.
 
-    - A :py:obj:`~.cudaDevSmResourceGroupParams` array of size `nbGroups`.
-      Each entry must be zero-initialized.
+    - A :py:obj:`~.cudaDevSmResourceGroupParams` array of size
+      ``nbGroups``. Each entry must be zero-initialized.
 
-      - `smCount:` must be either 0 or in the range of [2,inputSmCount]
-        where inputSmCount is the amount of SMs the `input` resource has.
-        `smCount` must be a multiple of 2, as well as a multiple of
-        `coscheduledSmCount`. When assigning SMs to a group (and if results
-        are expected by having the `result` parameter set), `smCount`
-        cannot end up with 0 or a value less than `coscheduledSmCount`
-        otherwise :py:obj:`~.cudaErrorInvalidResourceConfiguration` will be
-        returned.
+      - ``smCount:`` must be either 0 or in the range of [2,inputSmCount]
+        where inputSmCount is the amount of SMs the ``input`` resource has.
+        ``smCount`` must be a multiple of 2, as well as a multiple of
+        ``coscheduledSmCount``. When assigning SMs to a group (and if
+        results are expected by having the ``result`` parameter set),
+        ``smCount`` cannot end up with 0 or a value less than
+        ``coscheduledSmCount`` otherwise
+        :py:obj:`~.cudaErrorInvalidResourceConfiguration` will be returned.
 
-      - `coscheduledSmCount:` allows grouping SMs together in order to be
+      - ``coscheduledSmCount:`` allows grouping SMs together in order to be
         able to launch clusters on Compute Architecture 9.0+. The default
         value may be queried from the device’s
         :py:obj:`~.cudaDevResourceTypeSm` resource (8 on Compute
         Architecture 9.0+ and 2 otherwise). The maximum is 32 on Compute
         Architecture 9.0+ and 2 otherwise.
 
-      - `preferredCoscheduledSmCount:` Attempts to merge
-        `coscheduledSmCount` groups into larger groups, in order to make
-        use of `preferredClusterDimensions` on Compute Architecture 10.0+.
-        The default value is set to `coscheduledSmCount`.
+      - ``preferredCoscheduledSmCount:`` Attempts to merge
+        ``coscheduledSmCount`` groups into larger groups, in order to make
+        use of ``preferredClusterDimensions`` on Compute Architecture
+        10.0+. The default value is set to ``coscheduledSmCount``.
 
-      - `flags:`
+      - ``flags:``
 
-    - `cudaDevSmResourceGroupBackfill:` lets `smCount` be a non-multiple of
-    `coscheduledSmCount`, filling the difference between SM count and
-    already assigned co-scheduled groupings with other SMs. This lets any
-    resulting group behave similar to the `remainder` group for example.
+        - ``cudaDevSmResourceGroupBackfill:`` lets ``smCount`` be a
+          non-multiple of ``coscheduledSmCount``, filling the difference
+          between SM count and already assigned co-scheduled groupings with
+          other SMs. This lets any resulting group behave similar to the
+          ``remainder`` group for example.
 
     Example params and their effect:
 
@@ -40069,7 +40103,7 @@ def cudaDevSmResourceSplit(unsigned int nbGroups, input_ : Optional[cudaDevResou
       always need to adhere to a structure of coscheduledSmCount (even if
       its just 2), and therefore must always have enough coscheduled SMs to
       cover that requirement (even with the
-      `cudaDevSmResourceGroupBackfill` flag enabled).
+      ``cudaDevSmResourceGroupBackfill`` flag enabled).
 
     Splitting an input into N groups, can be accomplished by repeatedly
     splitting off 1 group and re-splitting the remainder (a bisect
@@ -40079,10 +40113,10 @@ def cudaDevSmResourceSplit(unsigned int nbGroups, input_ : Optional[cudaDevResou
     Parameters
     ----------
     nbGroups : unsigned int
-        Specifies the number of groups in `result` and `groupParams`
+        Specifies the number of groups in ``result`` and ``groupParams``
     input : :py:obj:`~.cudaDevResource`
         Input SM resource to be split. Must be a valid
-        `cudaDevResourceTypeSm` resource.
+        ``cudaDevResourceTypeSm`` resource.
     flags : unsigned int
         Flags specifying how the API should behave. The value should be 0
         for now.
@@ -40095,8 +40129,8 @@ def cudaDevSmResourceSplit(unsigned int nbGroups, input_ : Optional[cudaDevResou
     cudaError_t
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorNotPermitted`, :py:obj:`~.cudaErrorInvalidResourceType`, :py:obj:`~.cudaErrorInvalidResourceConfiguration`, :py:obj:`~.cudaErrorNotSupported`, :py:obj:`~.cudaErrorCudartUnloading`, :py:obj:`~.cudaErrorInitializationError`
     result : list[:py:obj:`~.cudaDevResource`]
-        Output array of `cudaDevResource` resources. Can be NULL, alongside
-        an smCount of 0, for discovery purpose.
+        Output array of ``cudaDevResource`` resources. Can be NULL,
+        alongside an smCount of 0, for discovery purpose.
     remainder : :py:obj:`~.cudaDevResource`
         If splitting the input resource leaves any SMs, the remainder is
         placed in here.
@@ -40146,22 +40180,22 @@ def cudaDevResourceGenerateDesc(resources : Optional[tuple[cudaDevResource] | li
     """ Generate a resource descriptor.
 
     Generates a single resource descriptor with the set of resources
-    specified in `resources`. The generated resource descriptor is
+    specified in ``resources``. The generated resource descriptor is
     necessary for the creation of green contexts via the
     :py:obj:`~.cudaGreenCtxCreate` API. Resources of the same type can be
     passed in, provided they meet the requirements as noted below.
 
     A successful API call must have:
 
-    - A valid output pointer for the `phDesc` descriptor as well as a valid
-      array of `resources` pointers, with the array size passed in
-      `nbResources`. If multiple resources are provided in `resources`, the
-      device they came from must be the same, otherwise
+    - A valid output pointer for the ``phDesc`` descriptor as well as a
+      valid array of ``resources`` pointers, with the array size passed in
+      ``nbResources``. If multiple resources are provided in ``resources``,
+      the device they came from must be the same, otherwise
       :py:obj:`~.cudaErrorInvalidResourceConfiguration` is returned. If
-      multiple resources are provided in `resources` and they are of type
+      multiple resources are provided in ``resources`` and they are of type
       :py:obj:`~.cudaDevResourceTypeSm`, they must be outputs (whether
-      `result` or `remaining`) from the same split API instance and have
-      the same smCoscheduledAlignment values, otherwise
+      ``result`` or ``remaining``) from the same split API instance and
+      have the same smCoscheduledAlignment values, otherwise
       :py:obj:`~.cudaErrorInvalidResourceConfiguration` is returned.
 
     Note: The API is not supported on 32-bit platforms.
@@ -40171,7 +40205,7 @@ def cudaDevResourceGenerateDesc(resources : Optional[tuple[cudaDevResource] | li
     resources : list[:py:obj:`~.cudaDevResource`]
         Array of resources to be included in the descriptor
     nbResources : unsigned int
-        Number of resources passed in `resources`
+        Number of resources passed in ``resources``
 
     Returns
     -------
@@ -40213,7 +40247,8 @@ def cudaGreenCtxCreate(desc, int device, unsigned int flags):
     """ Creates a green context with a specified set of resources.
 
     This API creates a green context with the resources specified in the
-    descriptor `desc` and returns it in the handle represented by `phCtx`.
+    descriptor ``desc`` and returns it in the handle represented by
+    ``phCtx``.
 
     This API retains the device’s primary context for the lifetime of the
     green context. The primary context will be released when the green
@@ -40275,12 +40310,12 @@ def cudaGreenCtxCreate(desc, int device, unsigned int flags):
 def cudaExecutionCtxDestroy(ctx):
     """ Destroy a execution context.
 
-    Destroys the specified execution context `ctx`. It is the
+    Destroys the specified execution context ``ctx``. It is the
     responsibility of the caller to ensure that no API call issues using
-    `ctx` while :py:obj:`~.cudaExecutionCtxDestroy()` is executing or
+    ``ctx`` while :py:obj:`~.cudaExecutionCtxDestroy()` is executing or
     subsequently.
 
-    If `ctx` is a green context, any resources provisioned for it (that
+    If ``ctx`` is a green context, any resources provisioned for it (that
     were initially available via the resource descriptor) are released as
     well.
 
@@ -40298,7 +40333,7 @@ def cudaExecutionCtxDestroy(ctx):
     Additionally, the API will invalidate all active captures on these
     streams.
 
-    Passing in a `ctx` that was not explicitly created via CUDA Runtime
+    Passing in a ``ctx`` that was not explicitly created via CUDA Runtime
     APIs is not allowed and will result in undefined behavior.
 
     Parameters
@@ -40334,7 +40369,8 @@ def cudaExecutionCtxDestroy(ctx):
 def cudaExecutionCtxGetDevResource(ctx, typename not None : cudaDevResourceType):
     """ Get context resources.
 
-    Get the `typename` resources available to context represented by `ctx`.
+    Get the ``typename`` resources available to context represented by
+    ``ctx``.
 
     Note: The API is not supported on 32-bit platforms.
 
@@ -40380,7 +40416,7 @@ def cudaExecutionCtxGetDevResource(ctx, typename not None : cudaDevResourceType)
 def cudaExecutionCtxGetDevice(ctx):
     """ Returns the device handle for the execution context.
 
-    Returns in `*device` the handle of the specified execution context's
+    Returns in ``*device`` the handle of the specified execution context's
     device. The execution context should not be NULL.
 
     Parameters
@@ -40422,7 +40458,7 @@ def cudaExecutionCtxGetDevice(ctx):
 def cudaExecutionCtxGetId(ctx):
     """ Returns the unique Id associated with the execution context supplied.
 
-    Returns in `ctxId` the unique Id which is associated with a given
+    Returns in ``ctxId`` the unique Id which is associated with a given
     context. The Id is unique for the life of the program for this instance
     of CUDA. The execution context should not be NULL.
 
@@ -40465,12 +40501,12 @@ def cudaExecutionCtxGetId(ctx):
 def cudaExecutionCtxStreamCreate(ctx, unsigned int flags, int priority):
     """ Creates a stream and initializes it for the given execution context.
 
-    The API creates a CUDA stream with the specified `flags` and
-    `priority`, initializing it with resources as defined at the time of
-    creating the specified `ctx`. Additionally, the API also enables work
-    submitted to to the stream to be tracked under `ctx`.
+    The API creates a CUDA stream with the specified ``flags`` and
+    ``priority``, initializing it with resources as defined at the time of
+    creating the specified ``ctx``. Additionally, the API also enables work
+    submitted to the stream to be tracked under ``ctx``.
 
-    The supported values for `flags` are:
+    The supported values for ``flags`` are:
 
     - :py:obj:`~.cudaStreamDefault`: Default stream creation flag. This
       would be :py:obj:`~.cudaStreamNonBlocking` for streams created on a
@@ -40481,11 +40517,11 @@ def cudaExecutionCtxStreamCreate(ctx, unsigned int flags, int priority):
       stream), and that the created stream should perform no implicit
       synchronization with stream 0
 
-    Specifying `priority` affects the scheduling priority of work in the
+    Specifying ``priority`` affects the scheduling priority of work in the
     stream. Priorities provide a hint to preferentially run work with
     higher priority when possible, but do not preempt already-running work
     or provide any other functional guarantee on execution order.
-    `priority` follows a convention where lower numbers represent higher
+    ``priority`` follows a convention where lower numbers represent higher
     priorities. '0' represents default priority. The range of meaningful
     numerical priorities can be queried using
     :py:obj:`~.cudaDeviceGetStreamPriorityRange`. If the specified priority
@@ -40582,12 +40618,12 @@ def cudaExecutionCtxSynchronize(ctx):
 def cudaStreamGetDevResource(hStream, typename not None : cudaDevResourceType):
     """ Get stream resources.
 
-    Get the `typename` resources available to the `hStream` and store them
-    in `resource`.
+    Get the ``typename`` resources available to the ``hStream`` and store
+    them in ``resource``.
 
     Note: The API will return :py:obj:`~.cudaErrorInvalidResourceType` is
-    `typename` is `cudaDevResourceTypeWorkqueueConfig` or
-    `cudaDevResourceTypeWorkqueue`.
+    ``typename`` is ``cudaDevResourceTypeWorkqueueConfig`` or
+    ``cudaDevResourceTypeWorkqueue``.
 
     Parameters
     ----------
@@ -40630,15 +40666,15 @@ def cudaStreamGetDevResource(hStream, typename not None : cudaDevResourceType):
 def cudaExecutionCtxRecordEvent(ctx, event):
     """ Records an event for the specified execution context.
 
-    Captures in `event` all the activities of the execution context `ctx`
-    at the time of this call. `event` and `ctx` must be from the same CUDA
-    device, otherwise :py:obj:`~.cudaErrorInvalidHandle` will be returned.
-    Calls such as :py:obj:`~.cudaEventQuery()` or
+    Captures in ``event`` all the activities of the execution context
+    ``ctx`` at the time of this call. ``event`` and ``ctx`` must be from
+    the same CUDA device, otherwise :py:obj:`~.cudaErrorInvalidHandle` will
+    be returned. Calls such as :py:obj:`~.cudaEventQuery()` or
     :py:obj:`~.cudaExecutionCtxWaitEvent()` will then examine or wait for
-    completion of the work that was captured. Uses of `ctx` after this call
-    do not modify `event`. If the execution context passed to `ctx` is the
-    device (primary) context obtained via
-    :py:obj:`~.cudaDeviceGetExecutionCtx()`, `event` will capture all the
+    completion of the work that was captured. Uses of ``ctx`` after this
+    call do not modify ``event``. If the execution context passed to
+    ``ctx`` is the device (primary) context obtained via
+    :py:obj:`~.cudaDeviceGetExecutionCtx()`, ``event`` will capture all the
     activities of the green contexts created on the device as well.
 
     Parameters
@@ -40660,7 +40696,7 @@ def cudaExecutionCtxRecordEvent(ctx, event):
 
     Notes
     -----
-    The API will return :py:obj:`~.cudaErrorStreamCaptureUnsupported` if the specified execution context `ctx` has a stream in the capture mode. In such a case, the call will invalidate all the conflicting captures.
+    The API will return :py:obj:`~.cudaErrorStreamCaptureUnsupported` if the specified execution context ``ctx`` has a stream in the capture mode. In such a case, the call will invalidate all the conflicting captures.
     """
     cdef cyruntime.cudaEvent_t cyevent
     if event is None:
@@ -40689,14 +40725,14 @@ def cudaExecutionCtxRecordEvent(ctx, event):
 def cudaExecutionCtxWaitEvent(ctx, event):
     """ Make an execution context wait on an event.
 
-    Makes all future work submitted to execution context `ctx` wait for all
-    work captured in `event`. The synchronization will be performed on the
-    device and will not block the calling CPU thread. See
+    Makes all future work submitted to execution context ``ctx`` wait for
+    all work captured in ``event``. The synchronization will be performed
+    on the device and will not block the calling CPU thread. See
     :py:obj:`~.cudaExecutionCtxRecordEvent()` for details on what is
-    captured by an event. If the execution context passed to `ctx` is the
+    captured by an event. If the execution context passed to ``ctx`` is the
     device (primary) context obtained via
     :py:obj:`~.cudaDeviceGetExecutionCtx()`, all green contexts created on
-    the device will wait for `event` as well.
+    the device will wait for ``event`` as well.
 
     Parameters
     ----------
@@ -40716,9 +40752,9 @@ def cudaExecutionCtxWaitEvent(ctx, event):
 
     Notes
     -----
-    `event` may be from a different execution context or device than `ctx`.
+    ``event`` may be from a different execution context or device than ``ctx``.
 
-    The API will return :py:obj:`~.cudaErrorStreamCaptureUnsupported` and invalidate the capture if the specified event `event` is part of an ongoing capture sequence or if the specified execution context `ctx` has a stream in the capture mode.
+    The API will return :py:obj:`~.cudaErrorStreamCaptureUnsupported` and invalidate the capture if the specified event ``event`` is part of an ongoing capture sequence or if the specified execution context ``ctx`` has a stream in the capture mode.
     """
     cdef cyruntime.cudaEvent_t cyevent
     if event is None:
@@ -40747,7 +40783,7 @@ def cudaExecutionCtxWaitEvent(ctx, event):
 def cudaDeviceGetExecutionCtx(int device):
     """ Returns the execution context for a device.
 
-    Returns in `ctx` the execution context for the specified device. This
+    Returns in ``ctx`` the execution context for the specified device. This
     is the device's primary context. The returned context can then be
     passed to APIs that take in a cudaExecutionContext_t enabling explicit
     context-based programming without relying on thread-local state.
@@ -40798,16 +40834,16 @@ def cudaGetExportTable(pExportTableId : Optional[cudaUUID_t]):
 
 @cython.embedsignature(True)
 def cudaGetKernel(entryFuncAddr):
-    """ Get pointer to device kernel that matches entry function `entryFuncAddr`.
+    """ Get pointer to device kernel that matches entry function ``entryFuncAddr``.
 
-    Returns in `kernelPtr` the device kernel corresponding to the entry
-    function `entryFuncAddr`.
+    Returns in ``kernelPtr`` the device kernel corresponding to the entry
+    function ``entryFuncAddr``.
 
     Note that it is possible that there are multiple symbols belonging to
-    different translation units with the same `entryFuncAddr` registered
+    different translation units with the same ``entryFuncAddr`` registered
     with this CUDA Runtime and so the order which the translation units are
     loaded and registered with the CUDA Runtime can lead to differing
-    return pointers in `kernelPtr` . Suggested methods of ensuring
+    return pointers in ``kernelPtr``. Suggested methods of ensuring
     uniqueness are to limit visibility of global device functions by using
     static or hidden visibility attribute in the respective translation
     units.
@@ -40843,10 +40879,10 @@ def cudaGetKernel(entryFuncAddr):
 
 @cython.embedsignature(True)
 def make_cudaPitchedPtr(d, size_t p, size_t xsz, size_t ysz):
-    """ Returns a :py:obj:`~.cudaPitchedPtr` based on input parameters.
+    """ Returns a cudaPitchedPtr based on input parameters.
 
-    Returns a :py:obj:`~.cudaPitchedPtr` based on the specified input
-    parameters `d`, `p`, `xsz`, and `ysz`.
+    Returns a cudaPitchedPtr based on the specified input parameters ``d``,
+    ``p``, ``xsz``, and ``ysz``.
 
     Parameters
     ----------
@@ -40864,7 +40900,7 @@ def make_cudaPitchedPtr(d, size_t p, size_t xsz, size_t ysz):
     cudaError_t.cudaSuccess
         cudaError_t.cudaSuccess
     :py:obj:`~.cudaPitchedPtr`
-        :py:obj:`~.cudaPitchedPtr` specified by `d`, `p`, `xsz`, and `ysz`
+        cudaPitchedPtr specified by ``d``, ``p``, ``xsz``, and ``ysz``
 
     See Also
     --------
@@ -40884,10 +40920,10 @@ def make_cudaPitchedPtr(d, size_t p, size_t xsz, size_t ysz):
 
 @cython.embedsignature(True)
 def make_cudaPos(size_t x, size_t y, size_t z):
-    """ Returns a :py:obj:`~.cudaPos` based on input parameters.
+    """ Returns a cudaPos based on input parameters.
 
-    Returns a :py:obj:`~.cudaPos` based on the specified input parameters
-    `x`, `y`, and `z`.
+    Returns a cudaPos based on the specified input parameters ``x``, ``y``,
+    and ``z``.
 
     Parameters
     ----------
@@ -40903,7 +40939,7 @@ def make_cudaPos(size_t x, size_t y, size_t z):
     cudaError_t.cudaSuccess
         cudaError_t.cudaSuccess
     :py:obj:`~.cudaPos`
-        :py:obj:`~.cudaPos` specified by `x`, `y`, and `z`
+        cudaPos specified by ``x``, ``y``, and ``z``
 
     See Also
     --------
@@ -40920,10 +40956,10 @@ def make_cudaPos(size_t x, size_t y, size_t z):
 
 @cython.embedsignature(True)
 def make_cudaExtent(size_t w, size_t h, size_t d):
-    """ Returns a :py:obj:`~.cudaExtent` based on input parameters.
+    """ Returns a cudaExtent based on input parameters.
 
-    Returns a :py:obj:`~.cudaExtent` based on the specified input
-    parameters `w`, `h`, and `d`.
+    Returns a cudaExtent based on the specified input parameters ``w``,
+    ``h``, and ``d``.
 
     Parameters
     ----------
@@ -40940,7 +40976,7 @@ def make_cudaExtent(size_t w, size_t h, size_t d):
     cudaError_t.cudaSuccess
         cudaError_t.cudaSuccess
     :py:obj:`~.cudaExtent`
-        :py:obj:`~.cudaExtent` specified by `w`, `h`, and `d`
+        cudaExtent specified by ``w``, ``h``, and ``d``
 
     See Also
     --------
@@ -40959,11 +40995,11 @@ def make_cudaExtent(size_t w, size_t h, size_t d):
 def cudaGraphicsEGLRegisterImage(image, unsigned int flags):
     """ Registers an EGL image.
 
-    Registers the EGLImageKHR specified by `image` for access by CUDA. A
-    handle to the registered object is returned as `pCudaResource`.
+    Registers the EGLImageKHR specified by ``image`` for access by CUDA. A
+    handle to the registered object is returned as ``pCudaResource``.
     Additional Mapping/Unmapping is not required for the registered
     resource and :py:obj:`~.cudaGraphicsResourceGetMappedEglFrame` can be
-    directly called on the `pCudaResource`.
+    directly called on the ``pCudaResource``.
 
     The application will be responsible for synchronizing access to shared
     objects. The application must ensure that any pending operation which
@@ -40976,7 +41012,7 @@ def cudaGraphicsEGLRegisterImage(image, unsigned int flags):
     accomplished by calling cuCtxSynchronize or cuEventSynchronize
     (preferably).
 
-    The surface's intended usage is specified using `flags`, as follows:
+    The surface's intended usage is specified using ``flags``, as follows:
 
     - :py:obj:`~.cudaGraphicsRegisterFlagsNone`: Specifies no hints about
       how this resource will be used. It is therefore assumed that this
@@ -41035,7 +41071,7 @@ def cudaGraphicsEGLRegisterImage(image, unsigned int flags):
 def cudaEGLStreamConsumerConnect(eglStream):
     """ Connect CUDA to EGLStream as a consumer.
 
-    Connect CUDA as a consumer to EGLStreamKHR specified by `eglStream`.
+    Connect CUDA as a consumer to EGLStreamKHR specified by ``eglStream``.
 
     The EGLStreamKHR is an EGL object that transfers a sequence of image
     frames from one API to another.
@@ -41078,8 +41114,9 @@ def cudaEGLStreamConsumerConnect(eglStream):
 def cudaEGLStreamConsumerConnectWithFlags(eglStream, unsigned int flags):
     """ Connect CUDA to EGLStream as a consumer with given flags.
 
-    Connect CUDA as a consumer to EGLStreamKHR specified by `stream` with
-    specified `flags` defined by :py:obj:`~.cudaEglResourceLocationFlags`.
+    Connect CUDA as a consumer to EGLStreamKHR specified by ``eglStream`` with
+    specified ``flags`` defined by
+    :py:obj:`~.cudaEglResourceLocationFlags`.
 
     The flags specify whether the consumer wants to access frames from
     system memory or video memory. Default is
@@ -41164,7 +41201,7 @@ def cudaEGLStreamConsumerAcquireFrame(conn, pCudaResource, pStream, unsigned int
 
     Acquire an image frame from EGLStreamKHR.
     :py:obj:`~.cudaGraphicsResourceGetMappedEglFrame` can be called on
-    `pCudaResource` to get :py:obj:`~.cudaEglFrame`.
+    ``pCudaResource`` to get :py:obj:`~.cudaEglFrame`.
 
     Parameters
     ----------
@@ -41228,7 +41265,7 @@ def cudaEGLStreamConsumerAcquireFrame(conn, pCudaResource, pStream, unsigned int
 def cudaEGLStreamConsumerReleaseFrame(conn, pCudaResource, pStream):
     """ Releases the last frame acquired from the EGLStream.
 
-    Release the acquired image frame specified by `pCudaResource` to
+    Release the acquired image frame specified by ``pCudaResource`` to
     EGLStreamKHR.
 
     Parameters
@@ -41288,7 +41325,7 @@ def cudaEGLStreamConsumerReleaseFrame(conn, pCudaResource, pStream):
 def cudaEGLStreamProducerConnect(eglStream, width, height):
     """ Connect CUDA to EGLStream as a producer.
 
-    Connect CUDA as a producer to EGLStreamKHR specified by `stream`.
+    Connect CUDA as a producer to EGLStreamKHR specified by ``eglStream``.
 
     The EGLStreamKHR is an EGL object that transfers a sequence of image
     frames from one API to another.
@@ -41503,9 +41540,9 @@ def cudaEGLStreamProducerReturnFrame(conn, eglframe : Optional[cudaEglFrame], pS
 def cudaGraphicsResourceGetMappedEglFrame(resource, unsigned int index, unsigned int mipLevel):
     """ Get an eglFrame through which to access a registered EGL graphics resource.
 
-    Returns in `*eglFrame` an eglFrame pointer through which the registered
-    graphics resource `resource` may be accessed. This API can only be
-    called for EGL graphics resources.
+    Returns in ``*eglFrame`` an eglFrame pointer through which the
+    registered graphics resource ``resource`` may be accessed. This API can
+    only be called for EGL graphics resources.
 
     The :py:obj:`~.cudaEglFrame` is defined as
 
@@ -41533,7 +41570,7 @@ def cudaGraphicsResourceGetMappedEglFrame(resource, unsigned int index, unsigned
 
     Notes
     -----
-    Note that in case of multiplanar `*eglFrame`, pitch of only first plane (unsigned int :py:obj:`~.cudaEglPlaneDesc.pitch`) is to be considered by the application.
+    Note that in case of multiplanar ``*eglFrame``, pitch of only first plane (unsigned int :py:obj:`~.cudaEglPlaneDesc.pitch`) is to be considered by the application.
     """
     cdef cyruntime.cudaGraphicsResource_t cyresource
     if resource is None:
@@ -41557,8 +41594,8 @@ def cudaGraphicsResourceGetMappedEglFrame(resource, unsigned int index, unsigned
 def cudaEventCreateFromEGLSync(eglSync, unsigned int flags):
     """ Creates an event from EGLSync object.
 
-    Creates an event *phEvent from an EGLSyncKHR eglSync with the flages
-    specified via `flags`. Valid flags include:
+    Creates an event \\*phEvent from an EGLSyncKHR eglSync with the flags
+    specified via ``flags``. Valid flags include:
 
     - :py:obj:`~.cudaEventDefault`: Default event creation flag.
 
@@ -41669,9 +41706,9 @@ def cudaProfilerStop():
 def cudaGLGetDevices(unsigned int cudaDeviceCount, deviceList not None : cudaGLDeviceList):
     """ Gets the CUDA devices associated with the current OpenGL context.
 
-    Returns in `*pCudaDeviceCount` the number of CUDA-compatible devices
+    Returns in ``*pCudaDeviceCount`` the number of CUDA-compatible devices
     corresponding to the current OpenGL context. Also returns in
-    `*pCudaDevices` at most `cudaDeviceCount` of the CUDA-compatible
+    ``*pCudaDevices`` at most ``cudaDeviceCount`` of the CUDA-compatible
     devices corresponding to the current OpenGL context. If any of the GPUs
     being used by the current OpenGL context are not CUDA capable then the
     call will return cudaErrorNoDevice.
@@ -41679,7 +41716,7 @@ def cudaGLGetDevices(unsigned int cudaDeviceCount, deviceList not None : cudaGLD
     Parameters
     ----------
     cudaDeviceCount : unsigned int
-        The size of the output device array `pCudaDevices`
+        The size of the output device array ``pCudaDevices``
     deviceList : cudaGLDeviceList
         The set of devices to return. This set may be cudaGLDeviceListAll
         for all devices, cudaGLDeviceListCurrentFrame for the devices used
@@ -41737,16 +41774,16 @@ def cudaGLGetDevices(unsigned int cudaDeviceCount, deviceList not None : cudaGLD
 def cudaGraphicsGLRegisterImage(image, target, unsigned int flags):
     """ Register an OpenGL texture or renderbuffer object.
 
-    Registers the texture or renderbuffer object specified by `image` for
+    Registers the texture or renderbuffer object specified by ``image`` for
     access by CUDA. A handle to the registered object is returned as
-    `resource`.
+    ``resource``.
 
-    `target` must match the type of the object, and must be one of
+    ``target`` must match the type of the object, and must be one of
     :py:obj:`~.GL_TEXTURE_2D`, :py:obj:`~.GL_TEXTURE_RECTANGLE`,
     :py:obj:`~.GL_TEXTURE_CUBE_MAP`, :py:obj:`~.GL_TEXTURE_3D`,
     :py:obj:`~.GL_TEXTURE_2D_ARRAY`, or :py:obj:`~.GL_RENDERBUFFER`.
 
-    The register flags `flags` specify the intended usage, as follows:
+    The register flags ``flags`` specify the intended usage, as follows:
 
     - :py:obj:`~.cudaGraphicsRegisterFlagsNone`: Specifies no hints about
       how this resource will be used. It is therefore assumed that this
@@ -41792,7 +41829,7 @@ def cudaGraphicsGLRegisterImage(image, target, unsigned int flags):
     image : :py:obj:`~.GLuint`
         name of texture or renderbuffer object to be registered
     target : :py:obj:`~.GLenum`
-        Identifies the type of object specified by `image`
+        Identifies the type of object specified by ``image``
     flags : unsigned int
         Register flags
 
@@ -41837,9 +41874,9 @@ def cudaGraphicsGLRegisterImage(image, target, unsigned int flags):
 def cudaGraphicsGLRegisterBuffer(buffer, unsigned int flags):
     """ Registers an OpenGL buffer object.
 
-    Registers the buffer object specified by `buffer` for access by CUDA. A
-    handle to the registered object is returned as `resource`. The register
-    flags `flags` specify the intended usage, as follows:
+    Registers the buffer object specified by ``buffer`` for access by CUDA.
+    A handle to the registered object is returned as ``resource``. The
+    register flags ``flags`` specify the intended usage, as follows:
 
     - :py:obj:`~.cudaGraphicsRegisterFlagsNone`: Specifies no hints about
       how this resource will be used. It is therefore assumed that this
@@ -41947,17 +41984,17 @@ def cudaVDPAUGetDevice(vdpDevice, vdpGetProcAddress):
 def cudaVDPAUSetVDPAUDevice(int device, vdpDevice, vdpGetProcAddress):
     """ Sets a CUDA device to use VDPAU interoperability.
 
-    Records `vdpDevice` as the VdpDevice for VDPAU interoperability with
-    the CUDA device `device` and sets `device` as the current device for
-    the calling host thread.
+    Records ``vdpDevice`` as the VdpDevice for VDPAU interoperability with
+    the CUDA device ``device`` and sets ``device`` as the current device
+    for the calling host thread.
 
     This function will immediately initialize the primary context on
-    `device` if needed.
+    ``device`` if needed.
 
-    If `device` has already been initialized then this call will fail with
-    the error :py:obj:`~.cudaErrorSetOnActiveProcess`. In this case it is
-    necessary to reset `device` using :py:obj:`~.cudaDeviceReset()` before
-    VDPAU interoperability on `device` may be enabled.
+    If ``device`` has already been initialized then this call will fail
+    with the error :py:obj:`~.cudaErrorSetOnActiveProcess`. In this case it
+    is necessary to reset ``device`` using :py:obj:`~.cudaDeviceReset()`
+    before VDPAU interoperability on ``device`` may be enabled.
 
     Parameters
     ----------
@@ -42006,9 +42043,9 @@ def cudaVDPAUSetVDPAUDevice(int device, vdpDevice, vdpGetProcAddress):
 def cudaGraphicsVDPAURegisterVideoSurface(vdpSurface, unsigned int flags):
     """ Register a VdpVideoSurface object.
 
-    Registers the VdpVideoSurface specified by `vdpSurface` for access by
-    CUDA. A handle to the registered object is returned as `resource`. The
-    surface's intended usage is specified using `flags`, as follows:
+    Registers the VdpVideoSurface specified by ``vdpSurface`` for access by
+    CUDA. A handle to the registered object is returned as ``resource``.
+    The surface's intended usage is specified using ``flags``, as follows:
 
     - :py:obj:`~.cudaGraphicsMapFlagsNone`: Specifies no hints about how
       this resource will be used. It is therefore assumed that this
@@ -42063,9 +42100,9 @@ def cudaGraphicsVDPAURegisterVideoSurface(vdpSurface, unsigned int flags):
 def cudaGraphicsVDPAURegisterOutputSurface(vdpSurface, unsigned int flags):
     """ Register a VdpOutputSurface object.
 
-    Registers the VdpOutputSurface specified by `vdpSurface` for access by
-    CUDA. A handle to the registered object is returned as `resource`. The
-    surface's intended usage is specified using `flags`, as follows:
+    Registers the VdpOutputSurface specified by ``vdpSurface`` for access
+    by CUDA. A handle to the registered object is returned as ``resource``.
+    The surface's intended usage is specified using ``flags``, as follows:
 
     - :py:obj:`~.cudaGraphicsMapFlagsNone`: Specifies no hints about how
       this resource will be used. It is therefore assumed that this
@@ -42668,4 +42705,3 @@ cdef int _add_native_handle_getters() except?-1:
     {{endif}}
     return 0
 _add_native_handle_getters()
-
diff --git a/cuda_bindings/docs/source/module/driver.rst b/cuda_bindings/docs/source/module/driver.rst
index 49c633aa07..6a5d29ee79 100644
--- a/cuda_bindings/docs/source/module/driver.rst
+++ b/cuda_bindings/docs/source/module/driver.rst
@@ -1609,7 +1609,7 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES
 
 
-        The :py:obj:`~.CU_STREAM_WAIT_VALUE_FLUSH` flag and the :py:obj:`~.CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES` MemOp are supported on the device. See :py:obj:`~.Stream Memory Operations` for additional details.
+        The :py:obj:`~.CU_STREAM_WAIT_VALUE_FLUSH` flag and the :py:obj:`~.CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES` MemOp are supported on the device. See Stream Memory Operations for additional details.
 
 
     .. autoattribute:: cuda.bindings.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED
@@ -2081,7 +2081,7 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUpointer_attribute.CU_POINTER_ATTRIBUTE_IS_HW_DECOMPRESS_CAPABLE
 
 
-        Returns in `*data` a boolean that indicates whether the pointer points to memory that is capable to be used for hardware accelerated decompression.
+        Returns in ``*data`` a boolean that indicates whether the pointer points to memory that is capable to be used for hardware accelerated decompression.
 
 .. autoclass:: cuda.bindings.driver.CUfunction_attribute
 
@@ -2563,7 +2563,7 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUjit_option.CU_JIT_CACHE_MODE
 
 
-        Specifies whether to enable caching explicitly (-dlcm) 
+        Specifies whether to enable caching explicitly (-dlcm)
 
         Choice is based on supplied :py:obj:`~.CUjit_cacheMode_enum`.
 
@@ -3240,7 +3240,7 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUgraphConditionalNodeType.CU_GRAPH_COND_TYPE_IF
 
 
-        Conditional 'if/else' Node. Body[0] executed if condition is non-zero. If `size` == 2, an optional ELSE graph is created and this is executed if the condition is zero.
+        Conditional 'if/else' Node. Body[0] executed if condition is non-zero. If ``size`` == 2, an optional ELSE graph is created and this is executed if the condition is zero.
 
 
     .. autoattribute:: cuda.bindings.driver.CUgraphConditionalNodeType.CU_GRAPH_COND_TYPE_WHILE
@@ -3384,7 +3384,7 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUgraphDependencyType.CU_GRAPH_DEPENDENCY_TYPE_PROGRAMMATIC
 
 
-        This dependency type allows the downstream node to use `cudaGridDependencySynchronize()`. It may only be used between kernel nodes, and must be used with either the :py:obj:`~.CU_GRAPH_KERNEL_NODE_PORT_PROGRAMMATIC` or :py:obj:`~.CU_GRAPH_KERNEL_NODE_PORT_LAUNCH_ORDER` outgoing port.
+        This dependency type allows the downstream node to use ``cudaGridDependencySynchronize()``. It may only be used between kernel nodes, and must be used with either the :py:obj:`~.CU_GRAPH_KERNEL_NODE_PORT_PROGRAMMATIC` or :py:obj:`~.CU_GRAPH_KERNEL_NODE_PORT_LAUNCH_ORDER` outgoing port.
 
 .. autoclass:: cuda.bindings.driver.CUgraphInstantiateResult
 
@@ -3553,7 +3553,7 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT
 
 
-        Valid for launches. Set :py:obj:`~.CUlaunchAttributeValue.programmaticEvent` to record the event. Event recorded through this launch attribute is guaranteed to only trigger after all block in the associated kernel trigger the event. A block can trigger the event through PTX launchdep.release or CUDA builtin function cudaTriggerProgrammaticLaunchCompletion(). A trigger can also be inserted at the beginning of each block's execution if triggerAtBlockStart is set to non-0. The dependent launches can choose to wait on the dependency using the programmatic sync (cudaGridDependencySynchronize() or equivalent PTX instructions). Note that dependents (including the CPU thread calling :py:obj:`~.cuEventSynchronize()`) are not guaranteed to observe the release precisely when it is released. For example, :py:obj:`~.cuEventSynchronize()` may only observe the event trigger long after the associated kernel has completed. This recording type is primarily meant for establishing programmatic dependency between device tasks. Note also this type of dependency allows, but does not guarantee, concurrent execution of tasks. 
+        Valid for launches. Set :py:obj:`~.CUlaunchAttributeValue.programmaticEvent` to record the event. Event recorded through this launch attribute is guaranteed to only trigger after all block in the associated kernel trigger the event. A block can trigger the event through PTX launchdep.release or CUDA builtin function cudaTriggerProgrammaticLaunchCompletion(). A trigger can also be inserted at the beginning of each block's execution if triggerAtBlockStart is set to non-0. The dependent launches can choose to wait on the dependency using the programmatic sync (cudaGridDependencySynchronize() or equivalent PTX instructions). Note that dependents (including the CPU thread calling :py:obj:`~.cuEventSynchronize()`) are not guaranteed to observe the release precisely when it is released. For example, :py:obj:`~.cuEventSynchronize()` may only observe the event trigger long after the associated kernel has completed. This recording type is primarily meant for establishing programmatic dependency between device tasks. Note also this type of dependency allows, but does not guarantee, concurrent execution of tasks.
 
          The event supplied must not be an interprocess or interop event. The event must disable timing (i.e. must be created with the :py:obj:`~.CU_EVENT_DISABLE_TIMING` flag set).
 
@@ -3579,21 +3579,21 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION
 
 
-        Valid for graph nodes, launches. Set :py:obj:`~.CUlaunchAttributeValue.preferredClusterDim` to allow the kernel launch to specify a preferred substitute cluster dimension. Blocks may be grouped according to either the dimensions specified with this attribute (grouped into a "preferred substitute cluster"), or the one specified with :py:obj:`~.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION` attribute (grouped into a "regular cluster"). The cluster dimensions of a "preferred substitute cluster" shall be an integer multiple greater than zero of the regular cluster dimensions. The device will attempt - on a best-effort basis - to group thread blocks into preferred clusters over grouping them into regular clusters. When it deems necessary (primarily when the device temporarily runs out of physical resources to launch the larger preferred clusters), the device may switch to launch the regular clusters instead to attempt to utilize as much of the physical device resources as possible. 
+        Valid for graph nodes, launches. Set :py:obj:`~.CUlaunchAttributeValue.preferredClusterDim` to allow the kernel launch to specify a preferred substitute cluster dimension. Blocks may be grouped according to either the dimensions specified with this attribute (grouped into a "preferred substitute cluster"), or the one specified with :py:obj:`~.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION` attribute (grouped into a "regular cluster"). The cluster dimensions of a "preferred substitute cluster" shall be an integer multiple greater than zero of the regular cluster dimensions. The device will attempt - on a best-effort basis - to group thread blocks into preferred clusters over grouping them into regular clusters. When it deems necessary (primarily when the device temporarily runs out of physical resources to launch the larger preferred clusters), the device may switch to launch the regular clusters instead to attempt to utilize as much of the physical device resources as possible.
 
-         Each type of cluster will have its enumeration / coordinate setup as if the grid consists solely of its type of cluster. For example, if the preferred substitute cluster dimensions double the regular cluster dimensions, there might be simultaneously a regular cluster indexed at (1,0,0), and a preferred cluster indexed at (1,0,0). In this example, the preferred substitute cluster (1,0,0) replaces regular clusters (2,0,0) and (3,0,0) and groups their blocks. 
+         Each type of cluster will have its enumeration / coordinate setup as if the grid consists solely of its type of cluster. For example, if the preferred substitute cluster dimensions double the regular cluster dimensions, there might be simultaneously a regular cluster indexed at (1,0,0), and a preferred cluster indexed at (1,0,0). In this example, the preferred substitute cluster (1,0,0) replaces regular clusters (2,0,0) and (3,0,0) and groups their blocks.
 
-         This attribute will only take effect when a regular cluster dimension has been specified. The preferred substitute cluster dimension must be an integer multiple greater than zero of the regular cluster dimension and must divide the grid. It must also be no more than `maxBlocksPerCluster`, if it is set in the kernel's `__launch_bounds__`. Otherwise it must be less than the maximum value the driver can support. Otherwise, setting this attribute to a value physically unable to fit on any particular device is permitted.
+         This attribute will only take effect when a regular cluster dimension has been specified. The preferred substitute cluster dimension must be an integer multiple greater than zero of the regular cluster dimension and must divide the grid. It must also be no more than ``maxBlocksPerCluster``, if it is set in the kernel's ``__launch_bounds__``. Otherwise it must be less than the maximum value the driver can support. Otherwise, setting this attribute to a value physically unable to fit on any particular device is permitted.
 
 
     .. autoattribute:: cuda.bindings.driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT
 
 
-        Valid for launches. Set :py:obj:`~.CUlaunchAttributeValue.launchCompletionEvent` to record the event. 
+        Valid for launches. Set :py:obj:`~.CUlaunchAttributeValue.launchCompletionEvent` to record the event.
 
-         Nominally, the event is triggered once all blocks of the kernel have begun execution. Currently this is a best effort. If a kernel B has a launch completion dependency on a kernel A, B may wait until A is complete. Alternatively, blocks of B may begin before all blocks of A have begun, for example if B can claim execution resources unavailable to A (e.g. they run on different GPUs) or if B is a higher priority than A. Exercise caution if such an ordering inversion could lead to deadlock. 
+         Nominally, the event is triggered once all blocks of the kernel have begun execution. Currently this is a best effort. If a kernel B has a launch completion dependency on a kernel A, B may wait until A is complete. Alternatively, blocks of B may begin before all blocks of A have begun, for example if B can claim execution resources unavailable to A (e.g. they run on different GPUs) or if B is a higher priority than A. Exercise caution if such an ordering inversion could lead to deadlock.
 
-         A launch completion event is nominally similar to a programmatic event with `triggerAtBlockStart` set except that it is not visible to `cudaGridDependencySynchronize()` and can be used with compute capability less than 9.0. 
+         A launch completion event is nominally similar to a programmatic event with ``triggerAtBlockStart`` set except that it is not visible to ``cudaGridDependencySynchronize()`` and can be used with compute capability less than 9.0.
 
          The event supplied must not be an interprocess or interop event. The event must disable timing (i.e. must be created with the :py:obj:`~.CU_EVENT_DISABLE_TIMING` flag set).
 
@@ -3601,11 +3601,11 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE
 
 
-        Valid for graph nodes, launches. This attribute is graphs-only, and passing it to a launch in a non-capturing stream will result in an error. 
+        Valid for graph nodes, launches. This attribute is graphs-only, and passing it to a launch in a non-capturing stream will result in an error.
 
-         :py:obj:`~.CUlaunchAttributeValue.deviceUpdatableKernelNode.deviceUpdatable` can only be set to 0 or 1. Setting the field to 1 indicates that the corresponding kernel node should be device-updatable. On success, a handle will be returned via :py:obj:`~.CUlaunchAttributeValue.deviceUpdatableKernelNode.devNode` which can be passed to the various device-side update functions to update the node's kernel parameters from within another kernel. For more information on the types of device updates that can be made, as well as the relevant limitations thereof, see :py:obj:`~.cudaGraphKernelNodeUpdatesApply`. 
+         :py:obj:`~.CUlaunchAttributeValue.deviceUpdatableKernelNode.deviceUpdatable` can only be set to 0 or 1. Setting the field to 1 indicates that the corresponding kernel node should be device-updatable. On success, a handle will be returned via :py:obj:`~.CUlaunchAttributeValue.deviceUpdatableKernelNode.devNode` which can be passed to the various device-side update functions to update the node's kernel parameters from within another kernel. For more information on the types of device updates that can be made, as well as the relevant limitations thereof, see :py:obj:`~.cudaGraphKernelNodeUpdatesApply`.
 
-         Nodes which are device-updatable have additional restrictions compared to regular kernel nodes. Firstly, device-updatable nodes cannot be removed from their graph via :py:obj:`~.cuGraphDestroyNode`. Additionally, once opted-in to this functionality, a node cannot opt out, and any attempt to set the deviceUpdatable attribute to 0 will result in an error. Device-updatable kernel nodes also cannot have their attributes copied to/from another kernel node via :py:obj:`~.cuGraphKernelNodeCopyAttributes`. Graphs containing one or more device-updatable nodes also do not allow multiple instantiation, and neither the graph nor its instantiated version can be passed to :py:obj:`~.cuGraphExecUpdate`. 
+         Nodes which are device-updatable have additional restrictions compared to regular kernel nodes. Firstly, device-updatable nodes cannot be removed from their graph via :py:obj:`~.cuGraphDestroyNode`. Additionally, once opted-in to this functionality, a node cannot opt out, and any attempt to set the deviceUpdatable attribute to 0 will result in an error. Device-updatable kernel nodes also cannot have their attributes copied to/from another kernel node via :py:obj:`~.cuGraphKernelNodeCopyAttributes`. Graphs containing one or more device-updatable nodes also do not allow multiple instantiation, and neither the graph nor its instantiated version can be passed to :py:obj:`~.cuGraphExecUpdate`.
 
          If a graph contains device-updatable nodes and updates those nodes from the device from within the graph, the graph must be uploaded with :py:obj:`~.cuGraphUpload` before it is launched. For such a graph, if host-side executable graph updates are made to the device-updatable nodes, the graph must be uploaded before it is launched again.
 
@@ -3619,13 +3619,13 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_NVLINK_UTIL_CENTRIC_SCHEDULING
 
 
-        Valid for streams, graph nodes, launches. This attribute is a hint to the CUDA runtime that the launch should attempt to make the kernel maximize its NVLINK utilization. 
+        Valid for streams, graph nodes, launches. This attribute is a hint to the CUDA runtime that the launch should attempt to make the kernel maximize its NVLINK utilization.
 
 
 
-         When possible to honor this hint, CUDA will assume each block in the grid launch will carry out an even amount of NVLINK traffic, and make a best-effort attempt to adjust the kernel launch based on that assumption. 
+         When possible to honor this hint, CUDA will assume each block in the grid launch will carry out an even amount of NVLINK traffic, and make a best-effort attempt to adjust the kernel launch based on that assumption.
 
-         This attribute is a hint only. CUDA makes no functional or performance guarantee. Its applicability can be affected by many different factors, including driver version (i.e. CUDA doesn't guarantee the performance characteristics will be maintained between driver versions or a driver update could alter or regress previously observed perf characteristics.) It also doesn't guarantee a successful result, i.e. applying the attribute may not improve the performance of either the targeted kernel or the encapsulating application. 
+         This attribute is a hint only. CUDA makes no functional or performance guarantee. Its applicability can be affected by many different factors, including driver version (i.e. CUDA doesn't guarantee the performance characteristics will be maintained between driver versions or a driver update could alter or regress previously observed perf characteristics.) It also doesn't guarantee a successful result, i.e. applying the attribute may not improve the performance of either the targeted kernel or the encapsulating application.
 
          Valid values for :py:obj:`~.CUlaunchAttributeValue.nvlinkUtilCentricScheduling` are 0 (disabled) and 1 (enabled).
 
@@ -3746,7 +3746,7 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUlibraryOption.CU_LIBRARY_BINARY_IS_PRESERVED
 
 
-        Specifes that the argument `code` passed to :py:obj:`~.cuLibraryLoadData()` will be preserved. Specifying this option will let the driver know that `code` can be accessed at any point until :py:obj:`~.cuLibraryUnload()`. The default behavior is for the driver to allocate and maintain its own copy of `code`. Note that this is only a memory usage optimization hint and the driver can choose to ignore it if required. Specifying this option with :py:obj:`~.cuLibraryLoadFromFile()` is invalid and will return :py:obj:`~.CUDA_ERROR_INVALID_VALUE`.
+        Specifies that the argument ``code`` passed to :py:obj:`~.cuLibraryLoadData()` will be preserved. Specifying this option will let the driver know that ``code`` can be accessed at any point until :py:obj:`~.cuLibraryUnload()`. The default behavior is for the driver to allocate and maintain its own copy of ``code``. Note that this is only a memory usage optimization hint and the driver can choose to ignore it if required. Specifying this option with :py:obj:`~.cuLibraryLoadFromFile()` is invalid and will return :py:obj:`~.CUDA_ERROR_INVALID_VALUE`.
 
 
     .. autoattribute:: cuda.bindings.driver.CUlibraryOption.CU_LIBRARY_NUM_OPTIONS
@@ -5466,7 +5466,7 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUgraphInstantiate_flags.CUDA_GRAPH_INSTANTIATE_FLAG_UPLOAD
 
 
-        Automatically upload the graph after instantiation. Only supported by :py:obj:`~.cuGraphInstantiateWithParams`. The upload will be performed using the stream provided in `instantiateParams`.
+        Automatically upload the graph after instantiation. Only supported by :py:obj:`~.cuGraphInstantiateWithParams`. The upload will be performed using the stream provided in ``instantiateParams``.
 
 
     .. autoattribute:: cuda.bindings.driver.CUgraphInstantiate_flags.CUDA_GRAPH_INSTANTIATE_FLAG_DEVICE_LAUNCH
@@ -6535,19 +6535,19 @@ Data types used by CUDA driver
 
 .. autoattribute:: cuda.bindings.driver.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC
 
-    When the `flags` parameter of :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS` contains this flag, it indicates that signaling an external semaphore object should skip performing appropriate memory synchronization operations over all the external memory objects that are imported as :py:obj:`~.CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF`, which otherwise are performed by default to ensure data coherency with other importers of the same NvSciBuf memory objects.
+    When the ``flags`` parameter of :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS` contains this flag, it indicates that signaling an external semaphore object should skip performing appropriate memory synchronization operations over all the external memory objects that are imported as :py:obj:`~.CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF`, which otherwise are performed by default to ensure data coherency with other importers of the same NvSciBuf memory objects.
 
 .. autoattribute:: cuda.bindings.driver.CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC
 
-    When the `flags` parameter of :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS` contains this flag, it indicates that waiting on an external semaphore object should skip performing appropriate memory synchronization operations over all the external memory objects that are imported as :py:obj:`~.CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF`, which otherwise are performed by default to ensure data coherency with other importers of the same NvSciBuf memory objects.
+    When the ``flags`` parameter of :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS` contains this flag, it indicates that waiting on an external semaphore object should skip performing appropriate memory synchronization operations over all the external memory objects that are imported as :py:obj:`~.CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF`, which otherwise are performed by default to ensure data coherency with other importers of the same NvSciBuf memory objects.
 
 .. autoattribute:: cuda.bindings.driver.CUDA_NVSCISYNC_ATTR_SIGNAL
 
-    When `flags` of :py:obj:`~.cuDeviceGetNvSciSyncAttributes` is set to this, it indicates that application needs signaler specific NvSciSyncAttr to be filled by :py:obj:`~.cuDeviceGetNvSciSyncAttributes`.
+    When ``flags`` of :py:obj:`~.cuDeviceGetNvSciSyncAttributes` is set to this, it indicates that application needs signaler specific NvSciSyncAttr to be filled by :py:obj:`~.cuDeviceGetNvSciSyncAttributes`.
 
 .. autoattribute:: cuda.bindings.driver.CUDA_NVSCISYNC_ATTR_WAIT
 
-    When `flags` of :py:obj:`~.cuDeviceGetNvSciSyncAttributes` is set to this, it indicates that application needs waiter specific NvSciSyncAttr to be filled by :py:obj:`~.cuDeviceGetNvSciSyncAttributes`.
+    When ``flags`` of :py:obj:`~.cuDeviceGetNvSciSyncAttributes` is set to this, it indicates that application needs waiter specific NvSciSyncAttr to be filled by :py:obj:`~.cuDeviceGetNvSciSyncAttributes`.
 
 .. autoattribute:: cuda.bindings.driver.CU_MEM_CREATE_USAGE_TILE_POOL
 
@@ -6643,7 +6643,7 @@ Data types used by CUDA driver
 
 .. autoattribute:: cuda.bindings.driver.CU_LAUNCH_PARAM_END
 
-    End of array terminator for the `extra` parameter to :py:obj:`~.cuLaunchKernel`
+    End of array terminator for the ``extra`` parameter to :py:obj:`~.cuLaunchKernel`
 
 .. autoattribute:: cuda.bindings.driver.CU_LAUNCH_PARAM_BUFFER_POINTER_AS_INT
 
@@ -6651,7 +6651,7 @@ Data types used by CUDA driver
 
 .. autoattribute:: cuda.bindings.driver.CU_LAUNCH_PARAM_BUFFER_POINTER
 
-    Indicator that the next value in the `extra` parameter to :py:obj:`~.cuLaunchKernel` will be a pointer to a buffer containing all kernel parameters used for launching kernel `f`. This buffer needs to honor all alignment/padding requirements of the individual parameters. If :py:obj:`~.CU_LAUNCH_PARAM_BUFFER_SIZE` is not also specified in the `extra` array, then :py:obj:`~.CU_LAUNCH_PARAM_BUFFER_POINTER` will have no effect.
+    Indicator that the next value in the ``extra`` parameter to :py:obj:`~.cuLaunchKernel` will be a pointer to a buffer containing all kernel parameters used for launching kernel ``f``. This buffer needs to honor all alignment/padding requirements of the individual parameters. If :py:obj:`~.CU_LAUNCH_PARAM_BUFFER_SIZE` is not also specified in the ``extra`` array, then :py:obj:`~.CU_LAUNCH_PARAM_BUFFER_POINTER` will have no effect.
 
 .. autoattribute:: cuda.bindings.driver.CU_LAUNCH_PARAM_BUFFER_SIZE_AS_INT
 
@@ -6659,7 +6659,7 @@ Data types used by CUDA driver
 
 .. autoattribute:: cuda.bindings.driver.CU_LAUNCH_PARAM_BUFFER_SIZE
 
-    Indicator that the next value in the `extra` parameter to :py:obj:`~.cuLaunchKernel` will be a pointer to a size_t which contains the size of the buffer specified with :py:obj:`~.CU_LAUNCH_PARAM_BUFFER_POINTER`. It is required that :py:obj:`~.CU_LAUNCH_PARAM_BUFFER_POINTER` also be specified in the `extra` array if the value associated with :py:obj:`~.CU_LAUNCH_PARAM_BUFFER_SIZE` is not zero.
+    Indicator that the next value in the ``extra`` parameter to :py:obj:`~.cuLaunchKernel` will be a pointer to a size_t which contains the size of the buffer specified with :py:obj:`~.CU_LAUNCH_PARAM_BUFFER_POINTER`. It is required that :py:obj:`~.CU_LAUNCH_PARAM_BUFFER_POINTER` also be specified in the ``extra`` array if the value associated with :py:obj:`~.CU_LAUNCH_PARAM_BUFFER_SIZE` is not zero.
 
 .. autoattribute:: cuda.bindings.driver.CU_PARAM_TR_DEFAULT
 
@@ -7266,7 +7266,7 @@ This section describes the stream memory operations of the low-level CUDA driver
 
 
 
-Support for the CU_STREAM_WAIT_VALUE_NOR flag can be queried with ::CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR_V2.
+Support for the CU_STREAM_WAIT_VALUE_NOR flag can be queried with ``CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR_V2``.
 
 
 
@@ -7715,7 +7715,7 @@ For ``CU_DEV_RESOURCE_TYPE_WORKQUEUE_CONFIG``\ , the resource specifies the expe
 
 
 
-The maximum concurrency limit depends on ::CUDA_DEVICE_MAX_CONNECTIONS and can be queried from the primary context via cuCtxGetDevResource. Configurations may exceed this concurrency limit, but the driver will not guarantee that work submission remains non-overlapping.
+The maximum concurrency limit depends on ``CUDA_DEVICE_MAX_CONNECTIONS`` and can be queried from the primary context via cuCtxGetDevResource. Configurations may exceed this concurrency limit, but the driver will not guarantee that work submission remains non-overlapping.
 
 
 
diff --git a/cuda_bindings/docs/source/module/nvrtc.rst b/cuda_bindings/docs/source/module/nvrtc.rst
index 7c0da68141..74f041d3b1 100644
--- a/cuda_bindings/docs/source/module/nvrtc.rst
+++ b/cuda_bindings/docs/source/module/nvrtc.rst
@@ -1,4 +1,4 @@
-.. SPDX-FileCopyrightText: Copyright (c) 2021-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+.. SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 .. SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
 -----
@@ -319,7 +319,7 @@ Generate line-number information.
 
 
 
-  - ``--dopt=on``\  
+  - ``--dopt=on``\
 
 Enable device code optimization. When specified along with ``-G``\ , enables limited debug information generation for optimized device code (currently, only line number information). When ``-G``\  is not specified, ``-dopt=on``\  is implicit.
 
@@ -347,7 +347,7 @@ Specify the fast-compile level for device code, which controls the tradeoff betw
 
 
 
-  - ``--ptxas-options=<options>``\  
+  - ``--ptxas-options=<options>``\
 
 Specify options directly to ptxas, the PTX optimizing assembler.
 
@@ -395,7 +395,7 @@ For single-precision floating-point square root, use IEEE round-to-nearest mode
 
 
 
-    - Default: ``true``\  
+    - Default: ``true``\
 
 
 
@@ -555,7 +555,7 @@ Add the directory ``<dir>``\  to the list of directories to be searched for head
 
 
 
-  - ``--use-bundled-headers=<dir>``\  
+  - ``--use-bundled-headers=<dir>``\
 
 Install bundled CUDA headers to ``<dir>``\  and add include paths. This is a convenience flag that combines calling nvrtcInstallBundledHeaders and adding ``-I<dir>``\  and ``-I<dir>/cccl``\  to the include search path. Headers are installed only if they don't already exist at the specified location.
 
@@ -900,4 +900,3 @@ Enable stack canaries in device code. Stack canaries make it more difficult to e
 
 
   - ``--fdevice-time-trace=<file-name>``\  (``-fdevice-time-trace=<file-name>``\ ) Enables the time profiler, outputting a JSON file based on given <file-name>. Results can be analyzed on chrome://tracing for a flamegraph visualization.
-
diff --git a/cuda_bindings/docs/source/module/runtime.rst b/cuda_bindings/docs/source/module/runtime.rst
index 5f7b517756..315583c948 100644
--- a/cuda_bindings/docs/source/module/runtime.rst
+++ b/cuda_bindings/docs/source/module/runtime.rst
@@ -137,7 +137,7 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidConfiguration
 
 
-        This indicates that a kernel launch is requesting resources that can never be satisfied by the current device. Requesting more shared memory per block than the device supports will trigger this error, as will requesting too many threads or blocks. See :py:obj:`~.cudaDeviceProp` for more device limitations.
+        This indicates that a kernel launch is requesting resources that can never be satisfied by the current device. Requesting more shared memory per block than the device supports will trigger this error, as will requesting too many threads or blocks. See cudaDeviceProp for more device limitations.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorVersionTranslation
@@ -299,7 +299,7 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorIncompatibleDriverContext
 
 
-        This indicates that the current context is not compatible with this the CUDA Runtime. This can only occur if you are using CUDA Runtime/Driver interoperability and have created an existing Driver context using the driver API. The Driver context may be incompatible either because the Driver context was created using an older version of the API, because the Runtime API call expects a primary driver context and the Driver context is not primary, or because the Driver context has been destroyed. Please see :py:obj:`~.Interactions`with the CUDA Driver API" for more information.
+        This indicates that the current context is not compatible with the CUDA Runtime. This can only occur if you are using CUDA Runtime/Driver interoperability and have created an existing Driver context using the driver API. The Driver context may be incompatible either because the Driver context was created using an older version of the API, because the Runtime API call expects a primary driver context and the Driver context is not primary, or because the Driver context has been destroyed. Please see Interactions with the CUDA Driver API for more information.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorMissingConfiguration
@@ -3474,7 +3474,7 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaJitOption.cudaJitCacheMode
 
 
-        Specifies whether to enable caching explicitly (-dlcm) 
+        Specifies whether to enable caching explicitly (-dlcm)
 
         Choice is based on supplied :py:obj:`~.cudaJit_CacheMode`.
 
@@ -3524,7 +3524,7 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaLibraryOption.cudaLibraryBinaryIsPreserved
 
 
-        Specifes that the argument `code` passed to :py:obj:`~.cudaLibraryLoadData()` will be preserved. Specifying this option will let the driver know that `code` can be accessed at any point until :py:obj:`~.cudaLibraryUnload()`. The default behavior is for the driver to allocate and maintain its own copy of `code`. Note that this is only a memory usage optimization hint and the driver can choose to ignore it if required. Specifying this option with :py:obj:`~.cudaLibraryLoadFromFile()` is invalid and will return :py:obj:`~.cudaErrorInvalidValue`.
+        Specifies that the argument ``code`` passed to :py:obj:`~.cudaLibraryLoadData()` will be preserved. Specifying this option will let the driver know that ``code`` can be accessed at any point until :py:obj:`~.cudaLibraryUnload()`. The default behavior is for the driver to allocate and maintain its own copy of ``code``. Note that this is only a memory usage optimization hint and the driver can choose to ignore it if required. Specifying this option with :py:obj:`~.cudaLibraryLoadFromFile()` is invalid and will return :py:obj:`~.cudaErrorInvalidValue`.
 
 .. autoclass:: cuda.bindings.runtime.cudaJit_CacheMode
 
@@ -3614,7 +3614,7 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaGraphConditionalNodeType.cudaGraphCondTypeIf
 
 
-        Conditional 'if/else' Node. Body[0] executed if condition is non-zero. If `size` == 2, an optional ELSE graph is created and this is executed if the condition is zero.
+        Conditional 'if/else' Node. Body[0] executed if condition is non-zero. If ``size`` == 2, an optional ELSE graph is created and this is executed if the condition is zero.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaGraphConditionalNodeType.cudaGraphCondTypeWhile
@@ -3778,7 +3778,7 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaGraphDependencyType.cudaGraphDependencyTypeProgrammatic
 
 
-        This dependency type allows the downstream node to use `cudaGridDependencySynchronize()`. It may only be used between kernel nodes, and must be used with either the :py:obj:`~.cudaGraphKernelNodePortProgrammatic` or :py:obj:`~.cudaGraphKernelNodePortLaunchCompletion` outgoing port.
+        This dependency type allows the downstream node to use ``cudaGridDependencySynchronize()``. It may only be used between kernel nodes, and must be used with either the :py:obj:`~.cudaGraphKernelNodePortProgrammatic` or :py:obj:`~.cudaGraphKernelNodePortLaunchCompletion` outgoing port.
 
 .. autoclass:: cuda.bindings.runtime.cudaGraphExecUpdateResult
 
@@ -3946,25 +3946,25 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsKernelNodeParams
 
 
-        Adds :py:obj:`~.cudaKernelNodeParams` to output
+        Adds cudaKernelNodeParams to output
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsMemcpyNodeParams
 
 
-        Adds :py:obj:`~.cudaMemcpy3DParms` to output
+        Adds cudaMemcpy3DParms to output
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsMemsetNodeParams
 
 
-        Adds :py:obj:`~.cudaMemsetParams` to output
+        Adds cudaMemsetParams to output
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsHostNodeParams
 
 
-        Adds :py:obj:`~.cudaHostNodeParams` to output
+        Adds cudaHostNodeParams to output
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsEventNodeParams
@@ -3976,13 +3976,13 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsExtSemasSignalNodeParams
 
 
-        Adds :py:obj:`~.cudaExternalSemaphoreSignalNodeParams` values to output
+        Adds cudaExternalSemaphoreSignalNodeParams values to output
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsExtSemasWaitNodeParams
 
 
-        Adds :py:obj:`~.cudaExternalSemaphoreWaitNodeParams` to output
+        Adds cudaExternalSemaphoreWaitNodeParams to output
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsKernelNodeAttributes
@@ -4000,7 +4000,7 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsConditionalNodeParams
 
 
-        Adds :py:obj:`~.cudaConditionalNodeParams` to output
+        Adds cudaConditionalNodeParams to output
 
 .. autoclass:: cuda.bindings.runtime.cudaGraphInstantiateFlags
 
@@ -4013,19 +4013,19 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaGraphInstantiateFlags.cudaGraphInstantiateFlagUpload
 
 
-        Automatically upload the graph after instantiation. Only supported by 
+        Automatically upload the graph after instantiation. Only supported by
 
-         :py:obj:`~.cudaGraphInstantiateWithParams`. The upload will be performed using the 
+         :py:obj:`~.cudaGraphInstantiateWithParams`. The upload will be performed using the
 
-         stream provided in `instantiateParams`.
+         stream provided in ``instantiateParams``.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaGraphInstantiateFlags.cudaGraphInstantiateFlagDeviceLaunch
 
 
-        Instantiate the graph to be launchable from the device. This flag can only 
+        Instantiate the graph to be launchable from the device. This flag can only
 
-         be used on platforms which support unified addressing. This flag cannot be 
+         be used on platforms which support unified addressing. This flag cannot be
 
          used in conjunction with cudaGraphInstantiateFlagAutoFreeOnLaunch.
 
@@ -4078,43 +4078,43 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeAccessPolicyWindow
 
 
-        Valid for streams, graph nodes, launches. See :py:obj:`~.cudaLaunchAttributeValue`::accessPolicyWindow.
+        Valid for streams, graph nodes, launches. See :py:obj:`~.cudaLaunchAttributeValue.accessPolicyWindow`.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeCooperative
 
 
-        Valid for graph nodes, launches. See :py:obj:`~.cudaLaunchAttributeValue`::cooperative.
+        Valid for graph nodes, launches. See :py:obj:`~.cudaLaunchAttributeValue.cooperative`.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeSynchronizationPolicy
 
 
-        Valid for streams. See :py:obj:`~.cudaLaunchAttributeValue`::syncPolicy.
+        Valid for streams. See :py:obj:`~.cudaLaunchAttributeValue.syncPolicy`.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeClusterDimension
 
 
-        Valid for graph nodes, launches. See :py:obj:`~.cudaLaunchAttributeValue`::clusterDim.
+        Valid for graph nodes, launches. See :py:obj:`~.cudaLaunchAttributeValue.clusterDim`.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeClusterSchedulingPolicyPreference
 
 
-        Valid for graph nodes, launches. See :py:obj:`~.cudaLaunchAttributeValue`::clusterSchedulingPolicyPreference.
+        Valid for graph nodes, launches. See :py:obj:`~.cudaLaunchAttributeValue.clusterSchedulingPolicyPreference`.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeProgrammaticStreamSerialization
 
 
-        Valid for launches. Setting :py:obj:`~.cudaLaunchAttributeValue`::programmaticStreamSerializationAllowed to non-0 signals that the kernel will use programmatic means to resolve its stream dependency, so that the CUDA runtime should opportunistically allow the grid's execution to overlap with the previous kernel in the stream, if that kernel requests the overlap. The dependent launches can choose to wait on the dependency using the programmatic sync (cudaGridDependencySynchronize() or equivalent PTX instructions).
+        Valid for launches. Setting :py:obj:`~.cudaLaunchAttributeValue.programmaticStreamSerializationAllowed` to non-0 signals that the kernel will use programmatic means to resolve its stream dependency, so that the CUDA runtime should opportunistically allow the grid's execution to overlap with the previous kernel in the stream, if that kernel requests the overlap. The dependent launches can choose to wait on the dependency using the programmatic sync (cudaGridDependencySynchronize() or equivalent PTX instructions).
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeProgrammaticEvent
 
 
-        Valid for launches. Set :py:obj:`~.cudaLaunchAttributeValue`::programmaticEvent to record the event. Event recorded through this launch attribute is guaranteed to only trigger after all block in the associated kernel trigger the event. A block can trigger the event programmatically in a future CUDA release. A trigger can also be inserted at the beginning of each block's execution if triggerAtBlockStart is set to non-0. The dependent launches can choose to wait on the dependency using the programmatic sync (cudaGridDependencySynchronize() or equivalent PTX instructions). Note that dependents (including the CPU thread calling :py:obj:`~.cudaEventSynchronize()`) are not guaranteed to observe the release precisely when it is released. For example, :py:obj:`~.cudaEventSynchronize()` may only observe the event trigger long after the associated kernel has completed. This recording type is primarily meant for establishing programmatic dependency between device tasks. Note also this type of dependency allows, but does not guarantee, concurrent execution of tasks. 
+        Valid for launches. Set :py:obj:`~.cudaLaunchAttributeValue.programmaticEvent` to record the event. Event recorded through this launch attribute is guaranteed to only trigger after all blocks in the associated kernel trigger the event. A block can trigger the event programmatically in a future CUDA release. A trigger can also be inserted at the beginning of each block's execution if triggerAtBlockStart is set to non-0. The dependent launches can choose to wait on the dependency using the programmatic sync (cudaGridDependencySynchronize() or equivalent PTX instructions). Note that dependents (including the CPU thread calling :py:obj:`~.cudaEventSynchronize()`) are not guaranteed to observe the release precisely when it is released. For example, :py:obj:`~.cudaEventSynchronize()` may only observe the event trigger long after the associated kernel has completed. This recording type is primarily meant for establishing programmatic dependency between device tasks. Note also this type of dependency allows, but does not guarantee, concurrent execution of tasks.
 
          The event supplied must not be an interprocess or interop event. The event must disable timing (i.e. must be created with the :py:obj:`~.cudaEventDisableTiming` flag set).
 
@@ -4122,39 +4122,39 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributePriority
 
 
-        Valid for streams, graph nodes, launches. See :py:obj:`~.cudaLaunchAttributeValue`::priority.
+        Valid for streams, graph nodes, launches. See :py:obj:`~.cudaLaunchAttributeValue.priority`.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeMemSyncDomainMap
 
 
-        Valid for streams, graph nodes, launches. See :py:obj:`~.cudaLaunchAttributeValue`::memSyncDomainMap.
+        Valid for streams, graph nodes, launches. See :py:obj:`~.cudaLaunchAttributeValue.memSyncDomainMap`.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeMemSyncDomain
 
 
-        Valid for streams, graph nodes, launches. See :py:obj:`~.cudaLaunchAttributeValue`::memSyncDomain.
+        Valid for streams, graph nodes, launches. See :py:obj:`~.cudaLaunchAttributeValue.memSyncDomain`.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributePreferredClusterDimension
 
 
-        Valid for graph nodes and launches. Set :py:obj:`~.cudaLaunchAttributeValue`::preferredClusterDim to allow the kernel launch to specify a preferred substitute cluster dimension. Blocks may be grouped according to either the dimensions specified with this attribute (grouped into a "preferred substitute cluster"), or the one specified with :py:obj:`~.cudaLaunchAttributeClusterDimension` attribute (grouped into a "regular cluster"). The cluster dimensions of a "preferred substitute cluster" shall be an integer multiple greater than zero of the regular cluster dimensions. The device will attempt - on a best-effort basis - to group thread blocks into preferred clusters over grouping them into regular clusters. When it deems necessary (primarily when the device temporarily runs out of physical resources to launch the larger preferred clusters), the device may switch to launch the regular clusters instead to attempt to utilize as much of the physical device resources as possible. 
+        Valid for graph nodes and launches. Set :py:obj:`~.cudaLaunchAttributeValue.preferredClusterDim` to allow the kernel launch to specify a preferred substitute cluster dimension. Blocks may be grouped according to either the dimensions specified with this attribute (grouped into a "preferred substitute cluster"), or the one specified with :py:obj:`~.cudaLaunchAttributeClusterDimension` attribute (grouped into a "regular cluster"). The cluster dimensions of a "preferred substitute cluster" shall be an integer multiple greater than zero of the regular cluster dimensions. The device will attempt - on a best-effort basis - to group thread blocks into preferred clusters over grouping them into regular clusters. When it deems necessary (primarily when the device temporarily runs out of physical resources to launch the larger preferred clusters), the device may switch to launch the regular clusters instead to attempt to utilize as much of the physical device resources as possible.
 
-         Each type of cluster will have its enumeration / coordinate setup as if the grid consists solely of its type of cluster. For example, if the preferred substitute cluster dimensions double the regular cluster dimensions, there might be simultaneously a regular cluster indexed at (1,0,0), and a preferred cluster indexed at (1,0,0). In this example, the preferred substitute cluster (1,0,0) replaces regular clusters (2,0,0) and (3,0,0) and groups their blocks. 
+         Each type of cluster will have its enumeration / coordinate setup as if the grid consists solely of its type of cluster. For example, if the preferred substitute cluster dimensions double the regular cluster dimensions, there might be simultaneously a regular cluster indexed at (1,0,0), and a preferred cluster indexed at (1,0,0). In this example, the preferred substitute cluster (1,0,0) replaces regular clusters (2,0,0) and (3,0,0) and groups their blocks.
 
-         This attribute will only take effect when a regular cluster dimension has been specified. The preferred substitute cluster dimension must be an integer multiple greater than zero of the regular cluster dimension and must divide the grid. It must also be no more than `maxBlocksPerCluster`, if it is set in the kernel's `__launch_bounds__`. Otherwise it must be less than the maximum value the driver can support. Otherwise, setting this attribute to a value physically unable to fit on any particular device is permitted.
+         This attribute will only take effect when a regular cluster dimension has been specified. The preferred substitute cluster dimension must be an integer multiple greater than zero of the regular cluster dimension and must divide the grid. It must also be no more than ``maxBlocksPerCluster``, if it is set in the kernel's ``__launch_bounds__``. Otherwise it must be less than the maximum value the driver can support. Otherwise, setting this attribute to a value physically unable to fit on any particular device is permitted.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeLaunchCompletionEvent
 
 
-        Valid for launches. Set :py:obj:`~.cudaLaunchAttributeValue`::launchCompletionEvent to record the event. 
+        Valid for launches. Set :py:obj:`~.cudaLaunchAttributeValue.launchCompletionEvent` to record the event.
 
-         Nominally, the event is triggered once all blocks of the kernel have begun execution. Currently this is a best effort. If a kernel B has a launch completion dependency on a kernel A, B may wait until A is complete. Alternatively, blocks of B may begin before all blocks of A have begun, for example if B can claim execution resources unavailable to A (e.g. they run on different GPUs) or if B is a higher priority than A. Exercise caution if such an ordering inversion could lead to deadlock. 
+         Nominally, the event is triggered once all blocks of the kernel have begun execution. Currently this is a best effort. If a kernel B has a launch completion dependency on a kernel A, B may wait until A is complete. Alternatively, blocks of B may begin before all blocks of A have begun, for example if B can claim execution resources unavailable to A (e.g. they run on different GPUs) or if B is a higher priority than A. Exercise caution if such an ordering inversion could lead to deadlock.
 
-         A launch completion event is nominally similar to a programmatic event with `triggerAtBlockStart` set except that it is not visible to `cudaGridDependencySynchronize()` and can be used with compute capability less than 9.0. 
+         A launch completion event is nominally similar to a programmatic event with ``triggerAtBlockStart`` set except that it is not visible to ``cudaGridDependencySynchronize()`` and can be used with compute capability less than 9.0.
 
          The event supplied must not be an interprocess or interop event. The event must disable timing (i.e. must be created with the :py:obj:`~.cudaEventDisableTiming` flag set).
 
@@ -4162,11 +4162,11 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeDeviceUpdatableKernelNode
 
 
-        Valid for graph nodes, launches. This attribute is graphs-only, and passing it to a launch in a non-capturing stream will result in an error. 
+        Valid for graph nodes, launches. This attribute is graphs-only, and passing it to a launch in a non-capturing stream will result in an error.
 
-         :cudaLaunchAttributeValue::deviceUpdatableKernelNode::deviceUpdatable can only be set to 0 or 1. Setting the field to 1 indicates that the corresponding kernel node should be device-updatable. On success, a handle will be returned via :py:obj:`~.cudaLaunchAttributeValue`::deviceUpdatableKernelNode::devNode which can be passed to the various device-side update functions to update the node's kernel parameters from within another kernel. For more information on the types of device updates that can be made, as well as the relevant limitations thereof, see :py:obj:`~.cudaGraphKernelNodeUpdatesApply`. 
+         :py:obj:`~.cudaLaunchAttributeValue.deviceUpdatableKernelNode.deviceUpdatable` can only be set to 0 or 1. Setting the field to 1 indicates that the corresponding kernel node should be device-updatable. On success, a handle will be returned via :py:obj:`~.cudaLaunchAttributeValue.deviceUpdatableKernelNode.devNode` which can be passed to the various device-side update functions to update the node's kernel parameters from within another kernel. For more information on the types of device updates that can be made, as well as the relevant limitations thereof, see :py:obj:`~.cudaGraphKernelNodeUpdatesApply`.
 
-         Nodes which are device-updatable have additional restrictions compared to regular kernel nodes. Firstly, device-updatable nodes cannot be removed from their graph via :py:obj:`~.cudaGraphDestroyNode`. Additionally, once opted-in to this functionality, a node cannot opt out, and any attempt to set the deviceUpdatable attribute to 0 will result in an error. Device-updatable kernel nodes also cannot have their attributes copied to/from another kernel node via :py:obj:`~.cudaGraphKernelNodeCopyAttributes`. Graphs containing one or more device-updatable nodes also do not allow multiple instantiation, and neither the graph nor its instantiated version can be passed to :py:obj:`~.cudaGraphExecUpdate`. 
+         Nodes which are device-updatable have additional restrictions compared to regular kernel nodes. Firstly, device-updatable nodes cannot be removed from their graph via :py:obj:`~.cudaGraphDestroyNode`. Additionally, once opted-in to this functionality, a node cannot opt out, and any attempt to set the deviceUpdatable attribute to 0 will result in an error. Device-updatable kernel nodes also cannot have their attributes copied to/from another kernel node via :py:obj:`~.cudaGraphKernelNodeCopyAttributes`. Graphs containing one or more device-updatable nodes also do not allow multiple instantiation, and neither the graph nor its instantiated version can be passed to :py:obj:`~.cudaGraphExecUpdate`.
 
          If a graph contains device-updatable nodes and updates those nodes from the device from within the graph, the graph must be uploaded with :py:obj:`~.cuGraphUpload` before it is launched. For such a graph, if host-side executable graph updates are made to the device-updatable nodes, the graph must be uploaded before it is launched again.
 
@@ -4174,27 +4174,27 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributePreferredSharedMemoryCarveout
 
 
-        Valid for launches. On devices where the L1 cache and shared memory use the same hardware resources, setting :py:obj:`~.cudaLaunchAttributeValue`::sharedMemCarveout to a percentage between 0-100 signals sets the shared memory carveout preference in percent of the total shared memory for that kernel launch. This attribute takes precedence over :py:obj:`~.cudaFuncAttributePreferredSharedMemoryCarveout`. This is only a hint, and the driver can choose a different configuration if required for the launch.
+        Valid for launches. On devices where the L1 cache and shared memory use the same hardware resources, setting :py:obj:`~.cudaLaunchAttributeValue.sharedMemCarveout` to a percentage between 0-100 sets the shared memory carveout preference in percent of the total shared memory for that kernel launch. This attribute takes precedence over :py:obj:`~.cudaFuncAttributePreferredSharedMemoryCarveout`. This is only a hint, and the driver can choose a different configuration if required for the launch.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeNvlinkUtilCentricScheduling
 
 
-        Valid for streams, graph nodes, launches. This attribute is a hint to the CUDA runtime that the launch should attempt to make the kernel maximize its NVLINK utilization. 
+        Valid for streams, graph nodes, launches. This attribute is a hint to the CUDA runtime that the launch should attempt to make the kernel maximize its NVLINK utilization.
 
 
 
-         When possible to honor this hint, CUDA will assume each block in the grid launch will carry out an even amount of NVLINK traffic, and make a best-effort attempt to adjust the kernel launch based on that assumption. 
+         When possible to honor this hint, CUDA will assume each block in the grid launch will carry out an even amount of NVLINK traffic, and make a best-effort attempt to adjust the kernel launch based on that assumption.
 
-         This attribute is a hint only. CUDA makes no functional or performance guarantee. Its applicability can be affected by many different factors, including driver version (i.e. CUDA doesn't guarantee the performance characteristics will be maintained between driver versions or a driver update could alter or regress previously observed perf characteristics.) It also doesn't guarantee a successful result, i.e. applying the attribute may not improve the performance of either the targeted kernel or the encapsulating application. 
+         This attribute is a hint only. CUDA makes no functional or performance guarantee. Its applicability can be affected by many different factors, including driver version (i.e. CUDA doesn't guarantee the performance characteristics will be maintained between driver versions or a driver update could alter or regress previously observed perf characteristics.) It also doesn't guarantee a successful result, i.e. applying the attribute may not improve the performance of either the targeted kernel or the encapsulating application.
 
-         Valid values for :py:obj:`~.cudaLaunchAttributeValue`::nvlinkUtilCentricScheduling are 0 (disabled) and 1 (enabled).
+         Valid values for :py:obj:`~.cudaLaunchAttributeValue.nvlinkUtilCentricScheduling` are 0 (disabled) and 1 (enabled).
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributePortableClusterSizeMode
 
 
-        Valid for graph nodes, launches. This indicates whether the kernel launch is allowed to use a non-portable cluster size. Valid values for :py:obj:`~.cudaLaunchAttributeValue`::portableClusterSizeMode are values for :py:obj:`~.cudaLaunchAttributePortableClusterMode` Any other value will return :py:obj:`~.cudaErrorInvalidValue`
+        Valid for graph nodes, launches. This indicates whether the kernel launch is allowed to use a non-portable cluster size. Valid values for :py:obj:`~.cudaLaunchAttributeValue.portableClusterSizeMode` are values for :py:obj:`~.cudaLaunchAttributePortableClusterMode`. Any other value will return :py:obj:`~.cudaErrorInvalidValue`.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributeSharedMemoryMode
@@ -5317,11 +5317,11 @@ Data types used by CUDA Runtime
 
 .. autoattribute:: cuda.bindings.runtime.cudaExternalSemaphoreSignalSkipNvSciBufMemSync
 
-    When the /p flags parameter of :py:obj:`~.cudaExternalSemaphoreSignalParams` contains this flag, it indicates that signaling an external semaphore object should skip performing appropriate memory synchronization operations over all the external memory objects that are imported as :py:obj:`~.cudaExternalMemoryHandleTypeNvSciBuf`, which otherwise are performed by default to ensure data coherency with other importers of the same NvSciBuf memory objects.
+    When the ``flags`` parameter of cudaExternalSemaphoreSignalParams contains this flag, it indicates that signaling an external semaphore object should skip performing appropriate memory synchronization operations over all the external memory objects that are imported as :py:obj:`~.cudaExternalMemoryHandleTypeNvSciBuf`, which otherwise are performed by default to ensure data coherency with other importers of the same NvSciBuf memory objects.
 
 .. autoattribute:: cuda.bindings.runtime.cudaExternalSemaphoreWaitSkipNvSciBufMemSync
 
-    When the /p flags parameter of :py:obj:`~.cudaExternalSemaphoreWaitParams` contains this flag, it indicates that waiting an external semaphore object should skip performing appropriate memory synchronization operations over all the external memory objects that are imported as :py:obj:`~.cudaExternalMemoryHandleTypeNvSciBuf`, which otherwise are performed by default to ensure data coherency with other importers of the same NvSciBuf memory objects.
+    When the ``flags`` parameter of cudaExternalSemaphoreWaitParams contains this flag, it indicates that waiting on an external semaphore object should skip performing appropriate memory synchronization operations over all the external memory objects that are imported as :py:obj:`~.cudaExternalMemoryHandleTypeNvSciBuf`, which otherwise are performed by default to ensure data coherency with other importers of the same NvSciBuf memory objects.
 
 .. autoattribute:: cuda.bindings.runtime.cudaNvSciSyncAttrSignal
 
@@ -5657,7 +5657,7 @@ This section describes the unified addressing functions of the CUDA runtime appl
 
 
 
-CUDA devices can share a unified address space with the host. 
+CUDA devices can share a unified address space with the host.
 
  For these devices there is no distinction between a device pointer and a host pointer -- the same pointer value may be used to access memory from the host program and from a kernel running on the device (with exceptions enumerated below).
 
@@ -5683,7 +5683,7 @@ Unified addressing is automatically enabled in 64-bit processes .
 
 It is possible to look up information about the memory which backs a pointer value. For instance, one may want to know if a pointer points to host or device memory. As another example, in the case of device memory, one may want to know on which CUDA device the memory resides. These properties may be queried using the function cudaPointerGetAttributes()
 
-Since pointers are unique, it is not necessary to specify information about the pointers specified to cudaMemcpy() and other copy functions. 
+Since pointers are unique, it is not necessary to specify information about the pointers specified to cudaMemcpy() and other copy functions.
 
  The copy direction cudaMemcpyDefault may be used to specify that the CUDA runtime should infer the location of the pointer from its value.
 
@@ -5697,7 +5697,7 @@ Since pointers are unique, it is not necessary to specify information about the
 
 All host memory allocated through all devices using cudaMallocHost() and cudaHostAlloc() is always directly accessible from all devices that support unified addressing. This is the case regardless of whether or not the flags cudaHostAllocPortable and cudaHostAllocMapped are specified.
 
-The pointer value through which allocated host memory may be accessed in kernels on all devices that support unified addressing is the same as the pointer value through which that memory is accessed on the host. It is not necessary to call cudaHostGetDevicePointer() to get the device pointer for these allocations. 
+The pointer value through which allocated host memory may be accessed in kernels on all devices that support unified addressing is the same as the pointer value through which that memory is accessed on the host. It is not necessary to call cudaHostGetDevicePointer() to get the device pointer for these allocations.
 
 
 
@@ -5721,7 +5721,7 @@ Upon enabling direct access from a device that supports unified addressing to an
 
 
 
-Not all memory may be accessed on devices through the same pointer value through which they are accessed on the host. These exceptions are host memory registered using cudaHostRegister() and host memory allocated using the flag cudaHostAllocWriteCombined. For these exceptions, there exists a distinct host and device address for the memory. The device address is guaranteed to not overlap any valid host pointer range and is guaranteed to have the same value across all devices that support unified addressing. 
+Not all memory may be accessed on devices through the same pointer value through which they are accessed on the host. These exceptions are host memory registered using cudaHostRegister() and host memory allocated using the flag cudaHostAllocWriteCombined. For these exceptions, there exists a distinct host and device address for the memory. The device address is guaranteed to not overlap any valid host pointer range and is guaranteed to have the same value across all devices that support unified addressing.
 
 
 
@@ -6064,7 +6064,7 @@ Once you have an execution context at hand, you can perform context-level operat
 
 Note: The above APIs take in an explicit cudaExecutionContext_t handle and ignores the context that is current to the calling thread. This enables explicit context-based programming without relying on thread-local state. If no context is specified, the APIs return cudaErrorInvalidValue.
 
-Note: Developers should treat cudaExecutionContext_t as an opaque handle and avoid assumptions about its underlying representation. The CUDA Runtime does not provide a way to convert this handle into driver-level contexts, such as ::CUcontext or ::CUgreenCtx.
+Note: Developers should treat cudaExecutionContext_t as an opaque handle and avoid assumptions about its underlying representation. The CUDA Runtime does not provide a way to convert this handle into driver-level contexts, such as ``CUcontext`` or ``CUgreenCtx``.
 
 
 
@@ -6236,7 +6236,7 @@ For ``cudaDevResourceTypeWorkqueueConfig``\ , the resource specifies the expecte
 
 
 
-The maximum concurrency limit depends on ::CUDA_DEVICE_MAX_CONNECTIONS and can be queried from the device via cudaDeviceGetDevResource. Configurations may exceed this concurrency limit, but the driver will not guarantee that work submission remains non-overlapping.
+The maximum concurrency limit depends on ``CUDA_DEVICE_MAX_CONNECTIONS`` and can be queried from the device via cudaDeviceGetDevResource. Configurations may exceed this concurrency limit, but the driver will not guarantee that work submission remains non-overlapping.
 
 For ``cudaDevResourceTypeWorkqueue``\ , the resource represents a pre-existing workqueue that can be retrieved from existing execution contexts. This allows reusing workqueue resources across different execution contexts.
 
@@ -6323,7 +6323,7 @@ There are two primary ways to obtain an execution context:
 
 
 
-Note: Developers should treat cudaExecutionContext_t as an opaque handle and avoid assumptions about its underlying representation. The CUDA Runtime does not provide a way to convert this handle into a ::CUcontext or ::CUgreenCtx.
+Note: Developers should treat cudaExecutionContext_t as an opaque handle and avoid assumptions about its underlying representation. The CUDA Runtime does not provide a way to convert this handle into a ``CUcontext`` or ``CUgreenCtx``.
 
 
 
@@ -6337,7 +6337,7 @@ The primary context is the default execution context associated with a device in
 
 From the CUDA Runtime’s perspective, a device and its primary context are functionally synonymous.
 
-Unless explicitly overridden, either by making a different context current via the Driver API (e.g., ::cuCtxSetCurrent()) or by using an explicit execution context handle, the Runtime will implicitly initialize and use the primary context for API calls as needed.
+Unless explicitly overridden, either by making a different context current via the Driver API (e.g., ``cuCtxSetCurrent()``) or by using an explicit execution context handle, the Runtime will implicitly initialize and use the primary context for API calls as needed.
 
 
 
@@ -6347,11 +6347,11 @@ Unless explicitly overridden, either by making a different context current via t
 
 
 
-Unless an explicit execution context is specified (see “Execution Context Management” for APIs), CUDA Runtime API calls operate on the CUDA Driver ::CUcontext which is current to the calling host thread. If no ::CUcontext is current to the calling thread when a CUDA Runtime API call which requires an active context is made, then the primary context (device execution context) for a device will be selected, made current to the calling thread, and initialized. The context will be initialized using the parameters specified by the CUDA Runtime API functions cudaSetDeviceFlags(), ::cudaD3D9SetDirect3DDevice(), ::cudaD3D10SetDirect3DDevice(), ::cudaD3D11SetDirect3DDevice(), cudaGLSetGLDevice(), and cudaVDPAUSetVDPAUDevice(). Note that these functions will fail with cudaErrorSetOnActiveProcess if they are called when the primary context for the specified device has already been initialized, except for cudaSetDeviceFlags() which will simply overwrite the previous settings.
+Unless an explicit execution context is specified (see “Execution Context Management” for APIs), CUDA Runtime API calls operate on the CUDA Driver ``CUcontext`` which is current to the calling host thread. If no ``CUcontext`` is current to the calling thread when a CUDA Runtime API call which requires an active context is made, then the primary context (device execution context) for a device will be selected, made current to the calling thread, and initialized. The context will be initialized using the parameters specified by the CUDA Runtime API functions cudaSetDeviceFlags(), ``cudaD3D9SetDirect3DDevice()``, ``cudaD3D10SetDirect3DDevice()``, ``cudaD3D11SetDirect3DDevice()``, cudaGLSetGLDevice(), and cudaVDPAUSetVDPAUDevice(). Note that these functions will fail with cudaErrorSetOnActiveProcess if they are called when the primary context for the specified device has already been initialized, except for cudaSetDeviceFlags() which will simply overwrite the previous settings.
 
 The function cudaInitDevice() ensures that the primary context is initialized for the requested device but does not make it current to the calling thread.
 
-The function cudaSetDevice() initializes the primary context for the specified device and makes it current to the calling thread by calling ::cuCtxSetCurrent().
+The function cudaSetDevice() initializes the primary context for the specified device and makes it current to the calling thread by calling ``cuCtxSetCurrent()``.
 
 Primary contexts will remain active until they are explicitly deinitialized using cudaDeviceReset(). The function cudaDeviceReset() will deinitialize the primary context for the calling thread's current device immediately. The context will remain current to all of the threads that it was current to. The next CUDA Runtime API call on any thread which requires an active context will trigger the reinitialization of that device's primary context.
 
@@ -6365,15 +6365,15 @@ Note that primary contexts are shared resources. It is recommended that the prim
 
 
 
-Note that the use of multiple ::CUcontext s per device within a single process will substantially degrade performance and is strongly discouraged. Instead, it is highly recommended to either use execution contexts cudaExecutionContext_t or the implicit one-to-one device-to-primary context mapping for the process provided by the CUDA Runtime API.
+Note that the use of multiple ``CUcontext``\ s per device within a single process will substantially degrade performance and is strongly discouraged. Instead, it is highly recommended to either use execution contexts cudaExecutionContext_t or the implicit one-to-one device-to-primary context mapping for the process provided by the CUDA Runtime API.
 
-If a non-primary ::CUcontext created by the CUDA Driver API is current to a thread then the CUDA Runtime API calls to that thread will operate on that ::CUcontext, with some exceptions listed below. Interoperability between data types is discussed in the following sections.
+If a non-primary ``CUcontext`` created by the CUDA Driver API is current to a thread then the CUDA Runtime API calls to that thread will operate on that ``CUcontext``, with some exceptions listed below. Interoperability between data types is discussed in the following sections.
 
 The function cudaDeviceEnablePeerAccess() and the rest of the peer access API may not be called when a non-primary CUcontext is current. To use the peer access APIs with a context created using the CUDA Driver API, it is necessary that the CUDA Driver API be used to access these features.
 
-All CUDA Runtime API state (e.g, global variables' addresses and values) travels with its underlying ::CUcontext. In particular, if a ::CUcontext is moved from one thread to another then all CUDA Runtime API state will move to that thread as well.
+All CUDA Runtime API state (e.g., global variables' addresses and values) travels with its underlying ``CUcontext``. In particular, if a ``CUcontext`` is moved from one thread to another then all CUDA Runtime API state will move to that thread as well.
 
-Please note that attaching to legacy CUcontext (those with a version of 3010 as returned by ::cuCtxGetApiVersion()) is not possible. The CUDA Runtime will return cudaErrorIncompatibleDriverContext in such cases.
+Please note that attaching to legacy CUcontext (those with a version of 3010 as returned by ``cuCtxGetApiVersion()``) is not possible. The CUDA Runtime will return cudaErrorIncompatibleDriverContext in such cases.
 
 
 
@@ -6383,7 +6383,7 @@ Please note that attaching to legacy CUcontext (those with a version of 3010 as
 
 
 
-The types ::CUstream and cudaStream_t are identical and may be used interchangeably.
+The types ``CUstream`` and cudaStream_t are identical and may be used interchangeably.
 
 
 
@@ -6393,7 +6393,7 @@ The types ::CUstream and cudaStream_t are identical and may be used interchangea
 
 
 
-The types ::CUevent and cudaEvent_t are identical and may be used interchangeably.
+The types ``CUevent`` and cudaEvent_t are identical and may be used interchangeably.
 
 
 
@@ -6407,7 +6407,7 @@ The types ::CUarray and struct ::cudaArray * represent the same data type and ma
 
 In order to use a ::CUarray in a CUDA Runtime API function which takes a struct ::cudaArray *, it is necessary to explicitly cast the ::CUarray to a struct ::cudaArray *.
 
-In order to use a struct ::cudaArray * in a CUDA Driver API function which takes a ::CUarray, it is necessary to explicitly cast the struct ::cudaArray * to a ::CUarray .
+In order to use a ``struct cudaArray *`` in a CUDA Driver API function which takes a ``CUarray``, it is necessary to explicitly cast the ``struct cudaArray *`` to a ``CUarray``.
 
 
 
@@ -6417,11 +6417,11 @@ In order to use a struct ::cudaArray * in a CUDA Driver API function which takes
 
 
 
-The types ::CUgraphicsResource and cudaGraphicsResource_t represent the same data type and may be used interchangeably by casting the two types between each other.
+The types ``CUgraphicsResource`` and cudaGraphicsResource_t represent the same data type and may be used interchangeably by casting the two types between each other.
 
-In order to use a ::CUgraphicsResource in a CUDA Runtime API function which takes a cudaGraphicsResource_t, it is necessary to explicitly cast the ::CUgraphicsResource to a cudaGraphicsResource_t.
+In order to use a ``CUgraphicsResource`` in a CUDA Runtime API function which takes a cudaGraphicsResource_t, it is necessary to explicitly cast the ``CUgraphicsResource`` to a cudaGraphicsResource_t.
 
-In order to use a cudaGraphicsResource_t in a CUDA Driver API function which takes a ::CUgraphicsResource, it is necessary to explicitly cast the cudaGraphicsResource_t to a ::CUgraphicsResource.
+In order to use a cudaGraphicsResource_t in a CUDA Driver API function which takes a ``CUgraphicsResource``, it is necessary to explicitly cast the cudaGraphicsResource_t to a ``CUgraphicsResource``.
 
 
 
@@ -6431,11 +6431,11 @@ In order to use a cudaGraphicsResource_t in a CUDA Driver API function which tak
 
 
 
-The types ::CUtexObject and cudaTextureObject_t represent the same data type and may be used interchangeably by casting the two types between each other.
+The types ``CUtexObject`` and cudaTextureObject_t represent the same data type and may be used interchangeably by casting the two types between each other.
 
-In order to use a ::CUtexObject in a CUDA Runtime API function which takes a cudaTextureObject_t, it is necessary to explicitly cast the ::CUtexObject to a cudaTextureObject_t.
+In order to use a ``CUtexObject`` in a CUDA Runtime API function which takes a cudaTextureObject_t, it is necessary to explicitly cast the ``CUtexObject`` to a cudaTextureObject_t.
 
-In order to use a cudaTextureObject_t in a CUDA Driver API function which takes a ::CUtexObject, it is necessary to explicitly cast the cudaTextureObject_t to a ::CUtexObject.
+In order to use a cudaTextureObject_t in a CUDA Driver API function which takes a ``CUtexObject``, it is necessary to explicitly cast the cudaTextureObject_t to a ``CUtexObject``.
 
 
 
@@ -6445,11 +6445,11 @@ In order to use a cudaTextureObject_t in a CUDA Driver API function which takes
 
 
 
-The types ::CUsurfObject and cudaSurfaceObject_t represent the same data type and may be used interchangeably by casting the two types between each other.
+The types ``CUsurfObject`` and cudaSurfaceObject_t represent the same data type and may be used interchangeably by casting the two types between each other.
 
-In order to use a ::CUsurfObject in a CUDA Runtime API function which takes a cudaSurfaceObject_t, it is necessary to explicitly cast the ::CUsurfObject to a cudaSurfaceObject_t.
+In order to use a ``CUsurfObject`` in a CUDA Runtime API function which takes a cudaSurfaceObject_t, it is necessary to explicitly cast the ``CUsurfObject`` to a cudaSurfaceObject_t.
 
-In order to use a cudaSurfaceObject_t in a CUDA Driver API function which takes a ::CUsurfObject, it is necessary to explicitly cast the cudaSurfaceObject_t to a ::CUsurfObject.
+In order to use a cudaSurfaceObject_t in a CUDA Driver API function which takes a ``CUsurfObject``, it is necessary to explicitly cast the cudaSurfaceObject_t to a ``CUsurfObject``.
 
 
 
@@ -6459,9 +6459,9 @@ In order to use a cudaSurfaceObject_t in a CUDA Driver API function which takes
 
 
 
-The types ::CUfunction and cudaFunction_t represent the same data type and may be used interchangeably by casting the two types between each other.
+The types ``CUfunction`` and cudaFunction_t represent the same data type and may be used interchangeably by casting the two types between each other.
 
-In order to use a cudaFunction_t in a CUDA Driver API function which takes a ::CUfunction, it is necessary to explicitly cast the cudaFunction_t to a ::CUfunction.
+In order to use a cudaFunction_t in a CUDA Driver API function which takes a ``CUfunction``, it is necessary to explicitly cast the cudaFunction_t to a ``CUfunction``.
 
 
 
@@ -6471,9 +6471,9 @@ In order to use a cudaFunction_t in a CUDA Driver API function which takes a ::C
 
 
 
-The types ::CUkernel and cudaKernel_t represent the same data type and may be used interchangeably by casting the two types between each other.
+The types ``CUkernel`` and cudaKernel_t represent the same data type and may be used interchangeably by casting the two types between each other.
 
-In order to use a cudaKernel_t in a CUDA Driver API function which takes a ::CUkernel, it is necessary to explicitly cast the cudaKernel_t to a ::CUkernel.
+In order to use a cudaKernel_t in a CUDA Driver API function which takes a ``CUkernel``, it is necessary to explicitly cast the cudaKernel_t to a ``CUkernel``.
 
 .. autofunction:: cuda.bindings.runtime.cudaGetKernel
 

From 5e4d9ac747e04ec13974ac69879a2092c5267852 Mon Sep 17 00:00:00 2001
From: Keith Kraus <keith.j.kraus@gmail.com>
Date: Wed, 3 Jun 2026 16:12:12 -0400
Subject: [PATCH 2/5] Regenerate pointer type doc markup

---
 cuda_bindings/cuda/bindings/driver.pyx.in    | 20 ++++++++++----------
 cuda_bindings/cuda/bindings/runtime.pyx.in   |  4 ++--
 cuda_bindings/docs/source/module/driver.rst  |  8 ++++----
 cuda_bindings/docs/source/module/runtime.rst |  4 ++--
 4 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/cuda_bindings/cuda/bindings/driver.pyx.in b/cuda_bindings/cuda/bindings/driver.pyx.in
index 07c4b5d4f7..723ae76f9d 100644
--- a/cuda_bindings/cuda/bindings/driver.pyx.in
+++ b/cuda_bindings/cuda/bindings/driver.pyx.in
@@ -3053,7 +3053,7 @@ class CUjit_option(_FastEnum):
         'Pointer to a buffer in which to print any log messages that are\n'
         'informational in nature (the buffer size is specified via option\n'
         ':py:obj:`~.CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES`)\n'
-        'Option type: char *\n'
+        'Option type: ``char *``\n'
         'Applies to: compiler and linker\n'
     ){{endif}}
     {{if 'CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES' in found_values}}
@@ -3073,7 +3073,7 @@ class CUjit_option(_FastEnum):
         'Pointer to a buffer in which to print any log messages that reflect errors\n'
         '(the buffer size is specified via option\n'
         ':py:obj:`~.CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES`)\n'
-        'Option type: char *\n'
+        'Option type: ``char *``\n'
         'Applies to: compiler and linker\n'
     ){{endif}}
     {{if 'CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES' in found_values}}
@@ -3182,7 +3182,7 @@ class CUjit_option(_FastEnum):
         'It is only allowed to register symbols that correspond to unresolved global\n'
         'variables.\n'
         'It is illegal to register the same device symbol at multiple addresses.\n'
-        'Option type: const char **\n'
+        'Option type: ``const char **``\n'
         'Applies to: dynamic linker only\n'
     ){{endif}}
     {{if 'CU_JIT_GLOBAL_SYMBOL_ADDRESSES' in found_values}}
@@ -3192,7 +3192,7 @@ class CUjit_option(_FastEnum):
         'Array of host addresses that will be used to relocate corresponding device\n'
         'symbols stored in :py:obj:`~.CU_JIT_GLOBAL_SYMBOL_NAMES`.\n'
         'Must contain :py:obj:`~.CU_JIT_GLOBAL_SYMBOL_COUNT` entries.\n'
-        'Option type: void **\n'
+        'Option type: ``void **``\n'
         'Applies to: dynamic linker only\n'
     ){{endif}}
     {{if 'CU_JIT_GLOBAL_SYMBOL_COUNT' in found_values}}
@@ -29805,7 +29805,7 @@ def cuLinkCreate(unsigned int numOptions, options : Optional[tuple[CUjit_option]
     options : list[:py:obj:`~.CUjit_option`]
         Array of linker and compiler options
     optionValues : list[Any]
-        Array of option values, each cast to void *
+        Array of option values, each cast to ``void *``
 
     Returns
     -------
@@ -29877,7 +29877,7 @@ def cuLinkAddData(state, typename not None : CUjitInputType, data, size_t size,
         Options to be applied only for this input (overrides options from
         :py:obj:`~.cuLinkCreate`).
     optionValues : list[Any]
-        Array of option values, each cast to void *.
+        Array of option values, each cast to ``void *``.
 
     Returns
     -------
@@ -29950,7 +29950,7 @@ def cuLinkAddFile(state, typename not None : CUjitInputType, char* path, unsigne
         Options to be applied only for this input (overrides options from
         :py:obj:`~.cuLinkCreate`)
     optionValues : list[Any]
-        Array of option values, each cast to void *
+        Array of option values, each cast to ``void *``
 
     Returns
     -------
@@ -39597,9 +39597,9 @@ def cuPointerGetAttribute(attribute not None : CUpointer_attribute, ptr):
     - :py:obj:`~.CU_POINTER_ATTRIBUTE_HOST_POINTER`:
 
     - Returns in ``*data`` the host pointer value through which ``ptr`` may
-      be accessed by by the host program. The type of ``data`` must be void
-      **. If there exists no host pointer value through which the host
-      program may directly access ``ptr`` then
+      be accessed by the host program. The type of ``data`` must be
+      ``void **``. If there exists no host pointer value through which the
+      host program may directly access ``ptr`` then
       :py:obj:`~.CUDA_ERROR_INVALID_VALUE` is returned.
 
     - Except in the exceptional disjoint addressing cases discussed below,
diff --git a/cuda_bindings/cuda/bindings/runtime.pyx.in b/cuda_bindings/cuda/bindings/runtime.pyx.in
index 26a4e6ad67..b6968db2ea 100644
--- a/cuda_bindings/cuda/bindings/runtime.pyx.in
+++ b/cuda_bindings/cuda/bindings/runtime.pyx.in
@@ -5720,7 +5720,7 @@ class cudaJitOption(_FastEnum):
         'Pointer to a buffer in which to print any log messages that are\n'
         'informational in nature (the buffer size is specified via option\n'
         ':py:obj:`~.cudaJitInfoLogBufferSizeBytes`)\n'
-        'Option type: char *\n'
+        'Option type: ``char *``\n'
         'Applies to: compiler and linker\n'
     ){{endif}}
     {{if 'cudaJitInfoLogBufferSizeBytes' in found_values}}
@@ -5740,7 +5740,7 @@ class cudaJitOption(_FastEnum):
         'Pointer to a buffer in which to print any log messages that reflect errors\n'
         '(the buffer size is specified via option\n'
         ':py:obj:`~.cudaJitErrorLogBufferSizeBytes`)\n'
-        'Option type: char *\n'
+        'Option type: ``char *``\n'
         'Applies to: compiler and linker\n'
     ){{endif}}
     {{if 'cudaJitErrorLogBufferSizeBytes' in found_values}}
diff --git a/cuda_bindings/docs/source/module/driver.rst b/cuda_bindings/docs/source/module/driver.rst
index 6a5d29ee79..df896172a3 100644
--- a/cuda_bindings/docs/source/module/driver.rst
+++ b/cuda_bindings/docs/source/module/driver.rst
@@ -2451,7 +2451,7 @@ Data types used by CUDA driver
 
         Pointer to a buffer in which to print any log messages that are informational in nature (the buffer size is specified via option :py:obj:`~.CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES`)
 
-        Option type: char *
+        Option type: ``char *``
 
         Applies to: compiler and linker
 
@@ -2473,7 +2473,7 @@ Data types used by CUDA driver
 
         Pointer to a buffer in which to print any log messages that reflect errors (the buffer size is specified via option :py:obj:`~.CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES`)
 
-        Option type: char *
+        Option type: ``char *``
 
         Applies to: compiler and linker
 
@@ -2597,7 +2597,7 @@ Data types used by CUDA driver
 
         It is illegal to register the same device symbol at multiple addresses.
 
-        Option type: const char **
+        Option type: ``const char **``
 
         Applies to: dynamic linker only
 
@@ -2609,7 +2609,7 @@ Data types used by CUDA driver
 
         Must contain :py:obj:`~.CU_JIT_GLOBAL_SYMBOL_COUNT` entries.
 
-        Option type: void **
+        Option type: ``void **``
 
         Applies to: dynamic linker only
 
diff --git a/cuda_bindings/docs/source/module/runtime.rst b/cuda_bindings/docs/source/module/runtime.rst
index 315583c948..5374a37d16 100644
--- a/cuda_bindings/docs/source/module/runtime.rst
+++ b/cuda_bindings/docs/source/module/runtime.rst
@@ -3384,7 +3384,7 @@ Data types used by CUDA Runtime
 
         Pointer to a buffer in which to print any log messages that are informational in nature (the buffer size is specified via option :py:obj:`~.cudaJitInfoLogBufferSizeBytes`)
 
-        Option type: char *
+        Option type: ``char *``
 
         Applies to: compiler and linker
 
@@ -3406,7 +3406,7 @@ Data types used by CUDA Runtime
 
         Pointer to a buffer in which to print any log messages that reflect errors (the buffer size is specified via option :py:obj:`~.cudaJitErrorLogBufferSizeBytes`)
 
-        Option type: char *
+        Option type: ``char *``
 
         Applies to: compiler and linker
 

From 9b167441ba10cde4216db04a93a3959628abdd7b Mon Sep 17 00:00:00 2001
From: Keith Kraus <keith.j.kraus@gmail.com>
Date: Wed, 3 Jun 2026 17:11:04 -0400
Subject: [PATCH 3/5] Regenerate pointer docstring markup

---
 cuda_bindings/cuda/bindings/driver.pyx.in | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/cuda_bindings/cuda/bindings/driver.pyx.in b/cuda_bindings/cuda/bindings/driver.pyx.in
index 723ae76f9d..c51f2b6eee 100644
--- a/cuda_bindings/cuda/bindings/driver.pyx.in
+++ b/cuda_bindings/cuda/bindings/driver.pyx.in
@@ -39550,7 +39550,7 @@ def cuPointerGetAttribute(attribute not None : CUpointer_attribute, ptr):
 
     - Returns in ``*data`` the :py:obj:`~.CUcontext` in which ``ptr`` was
       allocated or registered. The type of ``data`` must be
-      :py:obj:`~.CUcontext` *.
+      :py:obj:`~.CUcontext` ``*``.
 
     - If ``ptr`` was not allocated by, mapped by, or registered with a
       :py:obj:`~.CUcontext` which uses unified virtual addressing then
@@ -39582,7 +39582,8 @@ def cuPointerGetAttribute(attribute not None : CUpointer_attribute, ptr):
 
     - Returns in ``*data`` the device pointer value through which ``ptr``
       may be accessed by kernels running in the current
-      :py:obj:`~.CUcontext`. The type of ``data`` must be CUdeviceptr *.
+      :py:obj:`~.CUcontext`. The type of ``data`` must be ``CUdeviceptr
+      *``.
 
     - If there exists no device pointer value through which kernels running
       in the current :py:obj:`~.CUcontext` may access ``ptr`` then

From a78cdb2d1a49e5d546363f913d0972640ed27f97 Mon Sep 17 00:00:00 2001
From: Keith Kraus <keith.j.kraus@gmail.com>
Date: Wed, 3 Jun 2026 23:24:11 -0400
Subject: [PATCH 4/5] Regenerate generated docs object refs

---
 .../cuda/bindings/_bindings/cydriver.pxd.in   |   2 +-
 .../cuda/bindings/_bindings/cydriver.pyx.in   |   2 +-
 .../cuda/bindings/_bindings/cyruntime.pxd.in  |   2 +-
 .../cuda/bindings/_bindings/cyruntime.pyx.in  |   2 +-
 .../bindings/_bindings/cyruntime_ptds.pxd.in  |   2 +-
 .../bindings/_bindings/cyruntime_ptds.pyx.in  |   2 +-
 .../cuda/bindings/_internal/nvrtc.pxd         |   2 +-
 .../cuda/bindings/_internal/nvrtc_linux.pyx   |   2 +-
 .../cuda/bindings/_internal/nvrtc_windows.pyx |   2 +-
 cuda_bindings/cuda/bindings/cydriver.pxd.in   |   2 +-
 cuda_bindings/cuda/bindings/cydriver.pyx.in   |   2 +-
 cuda_bindings/cuda/bindings/cynvrtc.pxd       |   2 +-
 cuda_bindings/cuda/bindings/cynvrtc.pyx       |   2 +-
 cuda_bindings/cuda/bindings/cyruntime.pxd.in  |   2 +-
 cuda_bindings/cuda/bindings/cyruntime.pyx.in  |   2 +-
 .../cuda/bindings/cyruntime_functions.pxi.in  |   2 +-
 .../cuda/bindings/cyruntime_types.pxi.in      |   2 +-
 cuda_bindings/cuda/bindings/driver.pxd.in     |   2 +-
 cuda_bindings/cuda/bindings/driver.pyx.in     |  19 +-
 cuda_bindings/cuda/bindings/nvrtc.pxd         |   2 +-
 cuda_bindings/cuda/bindings/nvrtc.pyx         |   2 +-
 cuda_bindings/cuda/bindings/runtime.pxd.in    |   2 +-
 cuda_bindings/cuda/bindings/runtime.pyx.in    | 198 +++++++++---------
 cuda_bindings/docs/source/module/runtime.rst  |  20 +-
 24 files changed, 140 insertions(+), 139 deletions(-)

diff --git a/cuda_bindings/cuda/bindings/_bindings/cydriver.pxd.in b/cuda_bindings/cuda/bindings/_bindings/cydriver.pxd.in
index 1b643ba64b..0d213878e0 100644
--- a/cuda_bindings/cuda/bindings/_bindings/cydriver.pxd.in
+++ b/cuda_bindings/cuda/bindings/_bindings/cydriver.pxd.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1725+g07a86fe3d.d20260603. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1731+g535901cce.d20260604. Do not modify it directly.
 from cuda.bindings.cydriver cimport *
 
 {{if 'cuGetErrorString' in found_functions}}
diff --git a/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in b/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in
index f0c724a547..786e81b3d5 100644
--- a/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in
+++ b/cuda_bindings/cuda/bindings/_bindings/cydriver.pyx.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1725+g07a86fe3d.d20260603. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1731+g535901cce.d20260604. Do not modify it directly.
 {{if 'Windows' == platform.system()}}
 import os
 cimport cuda.bindings._lib.windll as windll
diff --git a/cuda_bindings/cuda/bindings/_bindings/cyruntime.pxd.in b/cuda_bindings/cuda/bindings/_bindings/cyruntime.pxd.in
index aa0676a5bd..b3eb3d2e6b 100644
--- a/cuda_bindings/cuda/bindings/_bindings/cyruntime.pxd.in
+++ b/cuda_bindings/cuda/bindings/_bindings/cyruntime.pxd.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1725+g07a86fe3d.d20260603. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1731+g535901cce.d20260604. Do not modify it directly.
 include "../cyruntime_types.pxi"
 
 include "../_lib/cyruntime/cyruntime.pxd"
diff --git a/cuda_bindings/cuda/bindings/_bindings/cyruntime.pyx.in b/cuda_bindings/cuda/bindings/_bindings/cyruntime.pyx.in
index 98bff296bf..77c29eea32 100644
--- a/cuda_bindings/cuda/bindings/_bindings/cyruntime.pyx.in
+++ b/cuda_bindings/cuda/bindings/_bindings/cyruntime.pyx.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1725+g07a86fe3d.d20260603. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1731+g535901cce.d20260604. Do not modify it directly.
 include "../cyruntime_functions.pxi"
 
 import os
diff --git a/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pxd.in b/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pxd.in
index 2912d4b0d9..2172e481d6 100644
--- a/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pxd.in
+++ b/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pxd.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1725+g07a86fe3d.d20260603. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1731+g535901cce.d20260604. Do not modify it directly.
 cdef extern from "":
     """
     #define CUDA_API_PER_THREAD_DEFAULT_STREAM
diff --git a/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pyx.in b/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pyx.in
index b268ac8470..c247fbd10a 100644
--- a/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pyx.in
+++ b/cuda_bindings/cuda/bindings/_bindings/cyruntime_ptds.pyx.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1725+g07a86fe3d.d20260603. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1731+g535901cce.d20260604. Do not modify it directly.
 cdef extern from "":
     """
     #define CUDA_API_PER_THREAD_DEFAULT_STREAM
diff --git a/cuda_bindings/cuda/bindings/_internal/nvrtc.pxd b/cuda_bindings/cuda/bindings/_internal/nvrtc.pxd
index 45768a3979..ca16b32f57 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvrtc.pxd
+++ b/cuda_bindings/cuda/bindings/_internal/nvrtc.pxd
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1725+g07a86fe3d.d20260603. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1731+g535901cce.d20260604. Do not modify it directly.
 
 from ..cynvrtc cimport *
 
diff --git a/cuda_bindings/cuda/bindings/_internal/nvrtc_linux.pyx b/cuda_bindings/cuda/bindings/_internal/nvrtc_linux.pyx
index 5ddb84098f..ff25c6d2ec 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvrtc_linux.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvrtc_linux.pyx
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1725+g07a86fe3d.d20260603. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1731+g535901cce.d20260604. Do not modify it directly.
 
 from libc.stdint cimport intptr_t, uintptr_t
 
diff --git a/cuda_bindings/cuda/bindings/_internal/nvrtc_windows.pyx b/cuda_bindings/cuda/bindings/_internal/nvrtc_windows.pyx
index 767e35d8aa..bd4e5a0e6a 100644
--- a/cuda_bindings/cuda/bindings/_internal/nvrtc_windows.pyx
+++ b/cuda_bindings/cuda/bindings/_internal/nvrtc_windows.pyx
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1725+g07a86fe3d.d20260603. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1731+g535901cce.d20260604. Do not modify it directly.
 
 from libc.stdint cimport intptr_t
 
diff --git a/cuda_bindings/cuda/bindings/cydriver.pxd.in b/cuda_bindings/cuda/bindings/cydriver.pxd.in
index 9068460e22..6d11208548 100644
--- a/cuda_bindings/cuda/bindings/cydriver.pxd.in
+++ b/cuda_bindings/cuda/bindings/cydriver.pxd.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1725+g07a86fe3d.d20260603. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1731+g535901cce.d20260604. Do not modify it directly.
 
 from libc.stdint cimport uint32_t, uint64_t
 
diff --git a/cuda_bindings/cuda/bindings/cydriver.pyx.in b/cuda_bindings/cuda/bindings/cydriver.pyx.in
index c4439868c7..1cbe83eadd 100644
--- a/cuda_bindings/cuda/bindings/cydriver.pyx.in
+++ b/cuda_bindings/cuda/bindings/cydriver.pyx.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1725+g07a86fe3d.d20260603. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1731+g535901cce.d20260604. Do not modify it directly.
 cimport cuda.bindings._bindings.cydriver as cydriver
 
 {{if 'cuGetErrorString' in found_functions}}
diff --git a/cuda_bindings/cuda/bindings/cynvrtc.pxd b/cuda_bindings/cuda/bindings/cynvrtc.pxd
index 90fcd6517a..cc7d92ff0c 100644
--- a/cuda_bindings/cuda/bindings/cynvrtc.pxd
+++ b/cuda_bindings/cuda/bindings/cynvrtc.pxd
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1725+g07a86fe3d.d20260603. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1731+g535901cce.d20260604. Do not modify it directly.
 
 from libc.stdint cimport uint32_t, uint64_t
 
diff --git a/cuda_bindings/cuda/bindings/cynvrtc.pyx b/cuda_bindings/cuda/bindings/cynvrtc.pyx
index e69d1ffd51..283321850e 100644
--- a/cuda_bindings/cuda/bindings/cynvrtc.pyx
+++ b/cuda_bindings/cuda/bindings/cynvrtc.pyx
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 #
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1725+g07a86fe3d.d20260603. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1731+g535901cce.d20260604. Do not modify it directly.
 
 from ._internal cimport nvrtc as _nvrtc
 
diff --git a/cuda_bindings/cuda/bindings/cyruntime.pxd.in b/cuda_bindings/cuda/bindings/cyruntime.pxd.in
index 2bc4c4f833..c28e4fe763 100644
--- a/cuda_bindings/cuda/bindings/cyruntime.pxd.in
+++ b/cuda_bindings/cuda/bindings/cyruntime.pxd.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1725+g07a86fe3d.d20260603. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1731+g535901cce.d20260604. Do not modify it directly.
 
 from libc.stdint cimport uint32_t, uint64_t
 
diff --git a/cuda_bindings/cuda/bindings/cyruntime.pyx.in b/cuda_bindings/cuda/bindings/cyruntime.pyx.in
index e3c471edf1..af7ca89deb 100644
--- a/cuda_bindings/cuda/bindings/cyruntime.pyx.in
+++ b/cuda_bindings/cuda/bindings/cyruntime.pyx.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1725+g07a86fe3d.d20260603. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1731+g535901cce.d20260604. Do not modify it directly.
 cimport cuda.bindings._bindings.cyruntime as cyruntime
 cimport cython
 
diff --git a/cuda_bindings/cuda/bindings/cyruntime_functions.pxi.in b/cuda_bindings/cuda/bindings/cyruntime_functions.pxi.in
index f9e78ad2ad..4ba9abef55 100644
--- a/cuda_bindings/cuda/bindings/cyruntime_functions.pxi.in
+++ b/cuda_bindings/cuda/bindings/cyruntime_functions.pxi.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1725+g07a86fe3d.d20260603. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1731+g535901cce.d20260604. Do not modify it directly.
 cdef extern from "cuda_runtime_api.h":
 
     {{if 'cudaDeviceReset' in found_functions}}
diff --git a/cuda_bindings/cuda/bindings/cyruntime_types.pxi.in b/cuda_bindings/cuda/bindings/cyruntime_types.pxi.in
index f151ce8321..d279dfa9e8 100644
--- a/cuda_bindings/cuda/bindings/cyruntime_types.pxi.in
+++ b/cuda_bindings/cuda/bindings/cyruntime_types.pxi.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1725+g07a86fe3d.d20260603. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1731+g535901cce.d20260604. Do not modify it directly.
 
 cdef extern from "vector_types.h":
 
diff --git a/cuda_bindings/cuda/bindings/driver.pxd.in b/cuda_bindings/cuda/bindings/driver.pxd.in
index 93c895702e..a607747264 100644
--- a/cuda_bindings/cuda/bindings/driver.pxd.in
+++ b/cuda_bindings/cuda/bindings/driver.pxd.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1725+g07a86fe3d.d20260603. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1731+g535901cce.d20260604. Do not modify it directly.
 cimport cuda.bindings.cydriver as cydriver
 
 include "_lib/utils.pxd"
diff --git a/cuda_bindings/cuda/bindings/driver.pyx.in b/cuda_bindings/cuda/bindings/driver.pyx.in
index c51f2b6eee..78a1a88c06 100644
--- a/cuda_bindings/cuda/bindings/driver.pyx.in
+++ b/cuda_bindings/cuda/bindings/driver.pyx.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1725+g07a86fe3d.d20260603. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1731+g535901cce.d20260604. Do not modify it directly.
 from typing import Any, Optional
 import cython
 import ctypes
@@ -7136,7 +7136,7 @@ class CUgreenCtxCreate_flags(_FastEnum):
 
 class CUdevSmResourceGroup_flags(_FastEnum):
     """
-    Flags for a CUdevSmResource group
+    Flags for a :py:obj:`~.CUdevSmResource` group
     """
     {{if 'CU_DEV_SM_RESOURCE_GROUP_DEFAULT' in found_values}}
     CU_DEV_SM_RESOURCE_GROUP_DEFAULT = cydriver.CUdevSmResourceGroup_flags.CU_DEV_SM_RESOURCE_GROUP_DEFAULT{{endif}}
@@ -40148,7 +40148,6 @@ def cuMemPrefetchBatchAsync(dptrs : Optional[tuple[CUdeviceptr] | list[CUdevicep
     Returns
     -------
     CUresult
-
     """
     cdef cydriver.CUstream cyhStream
     if hStream is None:
@@ -40252,7 +40251,6 @@ def cuMemDiscardBatchAsync(dptrs : Optional[tuple[CUdeviceptr] | list[CUdevicept
     Returns
     -------
     CUresult
-
     """
     cdef cydriver.CUstream cyhStream
     if hStream is None:
@@ -40361,7 +40359,6 @@ def cuMemDiscardAndPrefetchBatchAsync(dptrs : Optional[tuple[CUdeviceptr] | list
     Returns
     -------
     CUresult
-
     """
     cdef cydriver.CUstream cyhStream
     if hStream is None:
@@ -56622,7 +56619,7 @@ def cuDeviceGetDevResource(device, typename not None : CUdevResourceType):
     CUresult
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_RESOURCE_TYPE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`
     resource : :py:obj:`~.CUdevResource`
-        Output pointer to a CUdevResource structure
+        Output pointer to a :py:obj:`~.CUdevResource` structure
 
     See Also
     --------
@@ -56666,7 +56663,7 @@ def cuCtxGetDevResource(hCtx, typename not None : CUdevResourceType):
     CUresult
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_RESOURCE_TYPE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`
     resource : :py:obj:`~.CUdevResource`
-        Output pointer to a CUdevResource structure
+        Output pointer to a :py:obj:`~.CUdevResource` structure
 
     See Also
     --------
@@ -56710,7 +56707,7 @@ def cuGreenCtxGetDevResource(hCtx, typename not None : CUdevResourceType):
     CUresult
         :py:obj:`~.CUDA_SUCCESS` :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_RESOURCE_TYPE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     resource : :py:obj:`~.CUdevResource`
-        Output pointer to a CUdevResource structure
+        Output pointer to a :py:obj:`~.CUdevResource` structure
 
     See Also
     --------
@@ -56908,8 +56905,8 @@ def cuDevSmResourceSplit(unsigned int nbGroups, input_ : Optional[CUdevResource]
     - There are no API ``flags`` at this time, so the value passed in
       should be 0.
 
-    - A CU_DEV_SM_RESOURCE_GROUP_PARAMS array of size ``nbGroups``. Each
-      entry must be zero-initialized.
+    - A :py:obj:`~.CU_DEV_SM_RESOURCE_GROUP_PARAMS` array of size
+      ``nbGroups``. Each entry must be zero-initialized.
 
       - ``smCount:`` must be either 0 or in the range of [2,inputSmCount]
         where inputSmCount is the amount of SMs the ``input`` resource has.
@@ -57414,7 +57411,7 @@ def cuStreamGetDevResource(hStream, typename not None : CUdevResourceType):
     CUresult
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_RESOURCE_TYPE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`
     resource : :py:obj:`~.CUdevResource`
-        Output pointer to a CUdevResource structure
+        Output pointer to a :py:obj:`~.CUdevResource` structure
 
     See Also
     --------
diff --git a/cuda_bindings/cuda/bindings/nvrtc.pxd b/cuda_bindings/cuda/bindings/nvrtc.pxd
index c4f57162eb..760154ccbe 100644
--- a/cuda_bindings/cuda/bindings/nvrtc.pxd
+++ b/cuda_bindings/cuda/bindings/nvrtc.pxd
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1725+g07a86fe3d.d20260603. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1731+g535901cce.d20260604. Do not modify it directly.
 cimport cuda.bindings.cynvrtc as cynvrtc
 
 include "_lib/utils.pxd"
diff --git a/cuda_bindings/cuda/bindings/nvrtc.pyx b/cuda_bindings/cuda/bindings/nvrtc.pyx
index 0e325058f8..4576dfd3ad 100644
--- a/cuda_bindings/cuda/bindings/nvrtc.pyx
+++ b/cuda_bindings/cuda/bindings/nvrtc.pyx
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1725+g07a86fe3d.d20260603. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1731+g535901cce.d20260604. Do not modify it directly.
 from typing import Any, Optional
 import cython
 import ctypes
diff --git a/cuda_bindings/cuda/bindings/runtime.pxd.in b/cuda_bindings/cuda/bindings/runtime.pxd.in
index 6c36eb6434..beff0505f8 100644
--- a/cuda_bindings/cuda/bindings/runtime.pxd.in
+++ b/cuda_bindings/cuda/bindings/runtime.pxd.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1725+g07a86fe3d.d20260603. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1731+g535901cce.d20260604. Do not modify it directly.
 cimport cuda.bindings.cyruntime as cyruntime
 
 include "_lib/utils.pxd"
diff --git a/cuda_bindings/cuda/bindings/runtime.pyx.in b/cuda_bindings/cuda/bindings/runtime.pyx.in
index b6968db2ea..6644048c22 100644
--- a/cuda_bindings/cuda/bindings/runtime.pyx.in
+++ b/cuda_bindings/cuda/bindings/runtime.pyx.in
@@ -1,7 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2021-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
 
-# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1725+g07a86fe3d.d20260603. Do not modify it directly.
+# This code was automatically generated with version 13.3.0, generator version 0.3.1.dev1731+g535901cce.d20260604. Do not modify it directly.
 from typing import Any, Optional
 import cython
 import ctypes
@@ -228,19 +228,21 @@ CUDA_IPC_HANDLE_SIZE = cyruntime.CUDA_IPC_HANDLE_SIZE
 #: Indicates that the external memory object is a dedicated resource
 cudaExternalMemoryDedicated = cyruntime.cudaExternalMemoryDedicated
 
-#: When the /p flags parameter of cudaExternalSemaphoreSignalParams
-#: contains this flag, it indicates that signaling an external semaphore
-#: object should skip performing appropriate memory synchronization
-#: operations over all the external memory objects that are imported as
+#: When the /p flags parameter of
+#: :py:obj:`~.cudaExternalSemaphoreSignalParams` contains this flag, it
+#: indicates that signaling an external semaphore object should skip
+#: performing appropriate memory synchronization operations over all the
+#: external memory objects that are imported as
 #: :py:obj:`~.cudaExternalMemoryHandleTypeNvSciBuf`, which otherwise are
 #: performed by default to ensure data coherency with other importers of
 #: the same NvSciBuf memory objects.
 cudaExternalSemaphoreSignalSkipNvSciBufMemSync = cyruntime.cudaExternalSemaphoreSignalSkipNvSciBufMemSync
 
-#: When the /p flags parameter of cudaExternalSemaphoreWaitParams contains
-#: this flag, it indicates that waiting an external semaphore object should
-#: skip performing appropriate memory synchronization operations over all
-#: the external memory objects that are imported as
+#: When the /p flags parameter of
+#: :py:obj:`~.cudaExternalSemaphoreWaitParams` contains this flag, it
+#: indicates that waiting an external semaphore object should skip
+#: performing appropriate memory synchronization operations over all the
+#: external memory objects that are imported as
 #: :py:obj:`~.cudaExternalMemoryHandleTypeNvSciBuf`, which otherwise are
 #: performed by default to ensure data coherency with other importers of
 #: the same NvSciBuf memory objects.
@@ -411,7 +413,8 @@ class cudaError_t(_FastEnum):
         'This indicates that a kernel launch is requesting resources that can never\n'
         'be satisfied by the current device. Requesting more shared memory per block\n'
         'than the device supports will trigger this error, as will requesting too\n'
-        'many threads or blocks. See cudaDeviceProp for more device limitations.\n'
+        'many threads or blocks. See :py:obj:`~.cudaDeviceProp` for more device\n'
+        'limitations.\n'
     ){{endif}}
     {{if 'cudaErrorVersionTranslation' in found_values}}
 
@@ -3217,8 +3220,8 @@ class cudaMemcpyKind(_FastEnum):
 
 class cudaAccessProperty(_FastEnum):
     """
-    Specifies performance hint with cudaAccessPolicyWindow for hitProp
-    and missProp members.
+    Specifies performance hint with :py:obj:`~.cudaAccessPolicyWindow`
+    for hitProp and missProp members.
     """
     {{if 'cudaAccessPropertyNormal' in found_values}}
 
@@ -6349,25 +6352,25 @@ class cudaGraphDebugDotFlags(_FastEnum):
 
     cudaGraphDebugDotFlagsKernelNodeParams = (
         cyruntime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsKernelNodeParams,
-        'Adds cudaKernelNodeParams to output\n'
+        'Adds :py:obj:`~.cudaKernelNodeParams` to output\n'
     ){{endif}}
     {{if 'cudaGraphDebugDotFlagsMemcpyNodeParams' in found_values}}
 
     cudaGraphDebugDotFlagsMemcpyNodeParams = (
         cyruntime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsMemcpyNodeParams,
-        'Adds cudaMemcpy3DParms to output\n'
+        'Adds :py:obj:`~.cudaMemcpy3DParms` to output\n'
     ){{endif}}
     {{if 'cudaGraphDebugDotFlagsMemsetNodeParams' in found_values}}
 
     cudaGraphDebugDotFlagsMemsetNodeParams = (
         cyruntime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsMemsetNodeParams,
-        'Adds cudaMemsetParams to output\n'
+        'Adds :py:obj:`~.cudaMemsetParams` to output\n'
     ){{endif}}
     {{if 'cudaGraphDebugDotFlagsHostNodeParams' in found_values}}
 
     cudaGraphDebugDotFlagsHostNodeParams = (
         cyruntime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsHostNodeParams,
-        'Adds cudaHostNodeParams to output\n'
+        'Adds :py:obj:`~.cudaHostNodeParams` to output\n'
     ){{endif}}
     {{if 'cudaGraphDebugDotFlagsEventNodeParams' in found_values}}
 
@@ -6379,13 +6382,13 @@ class cudaGraphDebugDotFlags(_FastEnum):
 
     cudaGraphDebugDotFlagsExtSemasSignalNodeParams = (
         cyruntime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsExtSemasSignalNodeParams,
-        'Adds cudaExternalSemaphoreSignalNodeParams values to output\n'
+        'Adds :py:obj:`~.cudaExternalSemaphoreSignalNodeParams` values to output\n'
     ){{endif}}
     {{if 'cudaGraphDebugDotFlagsExtSemasWaitNodeParams' in found_values}}
 
     cudaGraphDebugDotFlagsExtSemasWaitNodeParams = (
         cyruntime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsExtSemasWaitNodeParams,
-        'Adds cudaExternalSemaphoreWaitNodeParams to output\n'
+        'Adds :py:obj:`~.cudaExternalSemaphoreWaitNodeParams` to output\n'
     ){{endif}}
     {{if 'cudaGraphDebugDotFlagsKernelNodeAttributes' in found_values}}
 
@@ -6403,7 +6406,7 @@ class cudaGraphDebugDotFlags(_FastEnum):
 
     cudaGraphDebugDotFlagsConditionalNodeParams = (
         cyruntime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsConditionalNodeParams,
-        'Adds cudaConditionalNodeParams to output\n'
+        'Adds :py:obj:`~.cudaConditionalNodeParams` to output\n'
     ){{endif}}
 
 {{endif}}
@@ -21337,7 +21340,7 @@ def cudaDeviceReset():
     in subsequent API calls and doing so will result in undefined behavior.
     These resources include CUDA types :py:obj:`~.cudaStream_t`,
     :py:obj:`~.cudaEvent_t`, :py:obj:`~.cudaArray_t`,
-    :py:obj:`~.cudaMipmappedArray_t`, cudaPitchedPtr,
+    :py:obj:`~.cudaMipmappedArray_t`, :py:obj:`~.cudaPitchedPtr`,
     :py:obj:`~.cudaTextureObject_t`, :py:obj:`~.cudaSurfaceObject_t`,
     :py:obj:`~.textureReference`, :py:obj:`~.surfaceReference`,
     :py:obj:`~.cudaExternalMemory_t`, :py:obj:`~.cudaExternalSemaphore_t`
@@ -23631,7 +23634,7 @@ def cudaCtxResetPersistingL2Cache():
 
     See Also
     --------
-    cudaAccessPolicyWindow
+    :py:obj:`~.cudaAccessPolicyWindow`
     """
     with nogil:
         err = cyruntime.cudaCtxResetPersistingL2Cache()
@@ -23661,7 +23664,7 @@ def cudaStreamCopyAttributes(dst, src):
 
     See Also
     --------
-    cudaAccessPolicyWindow
+    :py:obj:`~.cudaAccessPolicyWindow`
     """
     cdef cyruntime.cudaStream_t cysrc
     if src is None:
@@ -23709,7 +23712,7 @@ def cudaStreamGetAttribute(hStream, attr not None : cudaStreamAttrID):
 
     See Also
     --------
-    cudaAccessPolicyWindow
+    :py:obj:`~.cudaAccessPolicyWindow`
     """
     cdef cyruntime.cudaStream_t cyhStream
     if hStream is None:
@@ -23754,7 +23757,7 @@ def cudaStreamSetAttribute(hStream, attr not None : cudaStreamAttrID, value : Op
 
     See Also
     --------
-    cudaAccessPolicyWindow
+    :py:obj:`~.cudaAccessPolicyWindow`
     """
     cdef cyruntime.cudaStream_t cyhStream
     if hStream is None:
@@ -25214,8 +25217,8 @@ def cudaImportExternalMemory(memHandleDesc : Optional[cudaExternalMemoryHandleDe
     that in ``extMem_out``.
 
     The properties of the handle being imported must be described in
-    ``memHandleDesc``. The cudaExternalMemoryHandleDesc structure is
-    defined as follows:
+    ``memHandleDesc``. The :py:obj:`~.cudaExternalMemoryHandleDesc`
+    structure is defined as follows:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
@@ -25363,8 +25366,8 @@ def cudaExternalMemoryGetMappedBuffer(extMem, bufferDesc : Optional[cudaExternal
     pointer in ``devPtr``.
 
     The properties of the buffer being mapped must be described in
-    ``bufferDesc``. The cudaExternalMemoryBufferDesc structure is defined
-    as follows:
+    ``bufferDesc``. The :py:obj:`~.cudaExternalMemoryBufferDesc` structure
+    is defined as follows:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
@@ -25433,7 +25436,7 @@ def cudaExternalMemoryGetMappedMipmappedArray(extMem, mipmapDesc : Optional[cuda
 
     The properties of the CUDA mipmapped array being mapped must be
     described in ``mipmapDesc``. The structure
-    cudaExternalMemoryMipmappedArrayDesc is defined as follows:
+    :py:obj:`~.cudaExternalMemoryMipmappedArrayDesc` is defined as follows:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
@@ -25542,8 +25545,8 @@ def cudaImportExternalSemaphore(semHandleDesc : Optional[cudaExternalSemaphoreHa
     handle to that in ``extSem_out``.
 
     The properties of the handle being imported must be described in
-    ``semHandleDesc``. The cudaExternalSemaphoreHandleDesc is defined as
-    follows:
+    ``semHandleDesc``. The :py:obj:`~.cudaExternalSemaphoreHandleDesc` is
+    defined as follows:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
@@ -26923,10 +26926,11 @@ def cudaMallocPitch(size_t width, size_t height):
 def cudaMallocArray(desc : Optional[cudaChannelFormatDesc], size_t width, size_t height, unsigned int flags):
     """ Allocate an array on the device.
 
-    Allocates a CUDA array according to the cudaChannelFormatDesc structure
-    ``desc`` and returns a handle to the new CUDA array in ``*array``.
+    Allocates a CUDA array according to the
+    :py:obj:`~.cudaChannelFormatDesc` structure ``desc`` and returns a
+    handle to the new CUDA array in ``*array``.
 
-    The cudaChannelFormatDesc is defined as:
+    The :py:obj:`~.cudaChannelFormatDesc` is defined as:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
@@ -27494,16 +27498,16 @@ def cudaMalloc3D(extent not None : cudaExtent):
     """ Allocates logical 1D, 2D, or 3D memory objects on the device.
 
     Allocates at least ``width`` * ``height`` * ``depth`` bytes of linear
-    memory on the device and returns a cudaPitchedPtr in which ``ptr`` is a
-    pointer to the allocated memory. The function may pad the allocation to
-    ensure hardware alignment requirements are met. The pitch returned in
-    the ``pitch`` field of ``pitchedDevPtr`` is the width in bytes of the
-    allocation.
+    memory on the device and returns a :py:obj:`~.cudaPitchedPtr` in which
+    ``ptr`` is a pointer to the allocated memory. The function may pad the
+    allocation to ensure hardware alignment requirements are met. The pitch
+    returned in the ``pitch`` field of ``pitchedDevPtr`` is the width in
+    bytes of the allocation.
 
-    The returned cudaPitchedPtr contains additional fields ``xsize`` and
-    ``ysize``, the logical width and height of the allocation, which are
-    equivalent to the ``width`` and ``height`` ``extent`` parameters
-    provided by the programmer during allocation.
+    The returned :py:obj:`~.cudaPitchedPtr` contains additional fields
+    ``xsize`` and ``ysize``, the logical width and height of the
+    allocation, which are equivalent to the ``width`` and ``height``
+    ``extent`` parameters provided by the programmer during allocation.
 
     For allocations of 2D and 3D objects, it is highly recommended that
     programmers perform allocations using :py:obj:`~.cudaMalloc3D()` or
@@ -27542,10 +27546,11 @@ def cudaMalloc3D(extent not None : cudaExtent):
 def cudaMalloc3DArray(desc : Optional[cudaChannelFormatDesc], extent not None : cudaExtent, unsigned int flags):
     """ Allocate an array on the device.
 
-    Allocates a CUDA array according to the cudaChannelFormatDesc structure
-    ``desc`` and returns a handle to the new CUDA array in ``*array``.
+    Allocates a CUDA array according to the
+    :py:obj:`~.cudaChannelFormatDesc` structure ``desc`` and returns a
+    handle to the new CUDA array in ``*array``.
 
-    The cudaChannelFormatDesc is defined as:
+    The :py:obj:`~.cudaChannelFormatDesc` is defined as:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
@@ -27664,13 +27669,14 @@ def cudaMalloc3DArray(desc : Optional[cudaChannelFormatDesc], extent not None :
 def cudaMallocMipmappedArray(desc : Optional[cudaChannelFormatDesc], extent not None : cudaExtent, unsigned int numLevels, unsigned int flags):
     """ Allocate a mipmapped array on the device.
 
-    Allocates a CUDA mipmapped array according to the cudaChannelFormatDesc
-    structure ``desc`` and returns a handle to the new CUDA mipmapped array
-    in ``*mipmappedArray``. ``numLevels`` specifies the number of mipmap
-    levels to be allocated. This value is clamped to the range [1, 1 +
-    floor(log2(max(width, height, depth)))].
+    Allocates a CUDA mipmapped array according to the
+    :py:obj:`~.cudaChannelFormatDesc` structure ``desc`` and returns a
+    handle to the new CUDA mipmapped array in ``*mipmappedArray``.
+    ``numLevels`` specifies the number of mipmap levels to be allocated.
+    This value is clamped to the range [1, 1 + floor(log2(max(width,
+    height, depth)))].
 
-    The cudaChannelFormatDesc is defined as:
+    The :py:obj:`~.cudaChannelFormatDesc` is defined as:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
@@ -27844,8 +27850,8 @@ def cudaMemcpy3D(p : Optional[cudaMemcpy3DParms]):
     :py:obj:`~.cudaMemcpy3D()` copies data betwen two 3D objects. The
     source and destination objects may be in either host memory, device
     memory, or a CUDA array. The source, destination, extent, and kind of
-    copy performed is specified by the cudaMemcpy3DParms struct which
-    should be initialized to zero before use:
+    copy performed is specified by the :py:obj:`~.cudaMemcpy3DParms` struct
+    which should be initialized to zero before use:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
@@ -27894,8 +27900,8 @@ def cudaMemcpy3D(p : Optional[cudaMemcpy3DParms]):
 
     :py:obj:`~.cudaMemcpy3D()` returns an error if the pitch of ``srcPtr``
     or ``dstPtr`` exceeds the maximum allowed. The pitch of a
-    cudaPitchedPtr allocated with :py:obj:`~.cudaMalloc3D()` will always be
-    valid.
+    :py:obj:`~.cudaPitchedPtr` allocated with :py:obj:`~.cudaMalloc3D()`
+    will always be valid.
 
     Parameters
     ----------
@@ -27924,8 +27930,8 @@ def cudaMemcpy3DPeer(p : Optional[cudaMemcpy3DPeerParms]):
     """ Copies memory between devices.
 
     Perform a 3D memory copy according to the parameters specified in
-    ``p``. See the definition of the cudaMemcpy3DPeerParms structure for
-    documentation of its parameters.
+    ``p``. See the definition of the :py:obj:`~.cudaMemcpy3DPeerParms`
+    structure for documentation of its parameters.
 
     Note that this function is synchronous with respect to the host only if
     the source or destination of the transfer is host memory. Note also
@@ -27965,8 +27971,8 @@ def cudaMemcpy3DAsync(p : Optional[cudaMemcpy3DParms], stream):
     :py:obj:`~.cudaMemcpy3DAsync()` copies data betwen two 3D objects. The
     source and destination objects may be in either host memory, device
     memory, or a CUDA array. The source, destination, extent, and kind of
-    copy performed is specified by the cudaMemcpy3DParms struct which
-    should be initialized to zero before use:
+    copy performed is specified by the :py:obj:`~.cudaMemcpy3DParms` struct
+    which should be initialized to zero before use:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
@@ -28016,8 +28022,8 @@ def cudaMemcpy3DAsync(p : Optional[cudaMemcpy3DParms], stream):
 
     :py:obj:`~.cudaMemcpy3DAsync()` returns an error if the pitch of
     ``srcPtr`` or ``dstPtr`` exceeds the maximum allowed. The pitch of a
-    cudaPitchedPtr allocated with :py:obj:`~.cudaMalloc3D()` will always be
-    valid.
+    :py:obj:`~.cudaPitchedPtr` allocated with :py:obj:`~.cudaMalloc3D()`
+    will always be valid.
 
     :py:obj:`~.cudaMemcpy3DAsync()` is asynchronous with respect to the
     host, so the call may return before the copy is complete. The copy can
@@ -28066,8 +28072,8 @@ def cudaMemcpy3DPeerAsync(p : Optional[cudaMemcpy3DPeerParms], stream):
     """ Copies memory between devices asynchronously.
 
     Perform a 3D memory copy according to the parameters specified in
-    ``p``. See the definition of the cudaMemcpy3DPeerParms structure for
-    documentation of its parameters.
+    ``p``. See the definition of the :py:obj:`~.cudaMemcpy3DPeerParms`
+    structure for documentation of its parameters.
 
     Parameters
     ----------
@@ -28278,7 +28284,7 @@ def cudaArrayGetMemoryRequirements(array, int device):
     cudaError_t
         :py:obj:`~.cudaSuccess` :py:obj:`~.cudaErrorInvalidValue`
     memoryRequirements : :py:obj:`~.cudaArrayMemoryRequirements`
-        Pointer to cudaArrayMemoryRequirements
+        Pointer to :py:obj:`~.cudaArrayMemoryRequirements`
 
     See Also
     --------
@@ -28328,7 +28334,7 @@ def cudaMipmappedArrayGetMemoryRequirements(mipmap, int device):
     cudaError_t
         :py:obj:`~.cudaSuccess` :py:obj:`~.cudaErrorInvalidValue`
     memoryRequirements : :py:obj:`~.cudaArrayMemoryRequirements`
-        Pointer to cudaArrayMemoryRequirements
+        Pointer to :py:obj:`~.cudaArrayMemoryRequirements`
 
     See Also
     --------
@@ -28384,7 +28390,7 @@ def cudaArrayGetSparseProperties(array):
     cudaError_t
         :py:obj:`~.cudaSuccess` :py:obj:`~.cudaErrorInvalidValue`
     sparseProperties : :py:obj:`~.cudaArraySparseProperties`
-        Pointer to return the cudaArraySparseProperties
+        Pointer to return the :py:obj:`~.cudaArraySparseProperties`
 
     See Also
     --------
@@ -28440,7 +28446,7 @@ def cudaMipmappedArrayGetSparseProperties(mipmap):
     cudaError_t
         :py:obj:`~.cudaSuccess` :py:obj:`~.cudaErrorInvalidValue`
     sparseProperties : :py:obj:`~.cudaArraySparseProperties`
-        Pointer to return cudaArraySparseProperties
+        Pointer to return :py:obj:`~.cudaArraySparseProperties`
 
     See Also
     --------
@@ -29021,8 +29027,8 @@ def cudaMemcpyBatchAsync(dsts : Optional[tuple[Any] | list[Any]], srcs : Optiona
     no prior operations in the stream can be accessing the memory.
     Specifying this flag allows the driver to optimize the copy on certain
     platforms. Each memcpy operation in the batch must have a valid
-    cudaMemcpyAttributes corresponding to it including the appropriate
-    srcAccessOrder setting, otherwise the API will return
+    :py:obj:`~.cudaMemcpyAttributes` corresponding to it including the
+    appropriate srcAccessOrder setting, otherwise the API will return
     :py:obj:`~.cudaErrorInvalidValue`.
 
     The :py:obj:`~.cudaMemcpyAttributes.srcLocHint` and
@@ -29263,7 +29269,8 @@ def cudaMemcpyWithAttributesAsync(dst, src, size_t size, attr : Optional[cudaMem
     the operation in.
 
     For more information regarding the attributes, please refer to
-    cudaMemcpyAttributes and it's usage desciption in::cudaMemcpyBatchAsync
+    :py:obj:`~.cudaMemcpyAttributes` and its usage description in
+    :py:obj:`~.cudaMemcpyBatchAsync()`
 
     Parameters
     ----------
@@ -29320,7 +29327,7 @@ def cudaMemcpy3DWithAttributesAsync(op : Optional[cudaMemcpy3DBatchOp], unsigned
     the operation in.
 
     For more information regarding the operation, please refer to
-    cudaMemcpy3DBatchOp and it's usage desciption
+    :py:obj:`~.cudaMemcpy3DBatchOp` and its usage description
     in::cudaMemcpy3DBatchAsync
 
     Parameters
@@ -30128,7 +30135,6 @@ def cudaMemPrefetchBatchAsync(dptrs : Optional[tuple[Any] | list[Any]], sizes :
     Returns
     -------
     cudaError_t
-
     """
     cdef cyruntime.cudaStream_t cystream
     if stream is None:
@@ -30221,7 +30227,6 @@ def cudaMemDiscardBatchAsync(dptrs : Optional[tuple[Any] | list[Any]], sizes : t
     Returns
     -------
     cudaError_t
-
     """
     cdef cyruntime.cudaStream_t cystream
     if stream is None:
@@ -30319,7 +30324,6 @@ def cudaMemDiscardAndPrefetchBatchAsync(dptrs : Optional[tuple[Any] | list[Any]]
     Returns
     -------
     cudaError_t
-
     """
     cdef cyruntime.cudaStream_t cystream
     if stream is None:
@@ -31639,7 +31643,7 @@ def cudaMemPoolCreate(poolProps : Optional[cudaMemPoolProps]):
     /dev/nvidia-caps-imex-channels/channel0 c <major number> 0``
 
     To create a managed memory pool, applications must set
-    cudaMemPoolProps:cudaMemAllocationType to
+    :py:obj:`~.cudaMemPoolProps.cudaMemAllocationType` to
     :py:obj:`~.cudaMemAllocationTypeManaged`.
     :py:obj:`~.cudaMemPoolProps.cudaMemAllocationHandleType` must also be
     set to :py:obj:`~.cudaMemHandleTypeNone` since IPC is not supported.
@@ -32158,7 +32162,7 @@ def cudaPointerGetAttributes(ptr):
     supporting unified addressing :py:obj:`~.cudaErrorInvalidValue` is
     returned.
 
-    The cudaPointerAttributes structure is defined as:
+    The :py:obj:`~.cudaPointerAttributes` structure is defined as:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
@@ -32759,7 +32763,7 @@ def cudaCreateChannelDesc(int x, int y, int z, int w, f not None : cudaChannelFo
 
     Returns a channel descriptor with format ``f`` and number of bits of
     each component ``x``, ``y``, ``z``, and ``w``. The
-    cudaChannelFormatDesc is defined as:
+    :py:obj:`~.cudaChannelFormatDesc` is defined as:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
@@ -32819,7 +32823,7 @@ def cudaCreateTextureObject(pResDesc : Optional[cudaResourceDesc], pTexDesc : Op
     or higher. Additionally, a texture object is an opaque value, and, as
     such, should only be accessed through CUDA API calls.
 
-    The cudaResourceDesc structure is defined as:
+    The :py:obj:`~.cudaResourceDesc` structure is defined as:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
@@ -32871,7 +32875,7 @@ def cudaCreateTextureObject(pResDesc : Optional[cudaResourceDesc], pTexDesc : Op
     :py:obj:`~.cudaDeviceProp.texturePitchAlignment`. Pitch cannot exceed
     :py:obj:`~.cudaDeviceProp.maxTexture2DLinear` ``[2]``.
 
-    The cudaTextureDesc struct is defined as
+    The :py:obj:`~.cudaTextureDesc` struct is defined as
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
@@ -32961,7 +32965,7 @@ def cudaCreateTextureObject(pResDesc : Optional[cudaResourceDesc], pTexDesc : Op
       :py:obj:`~.cudaFilterModeLinear` seamless cube map filtering will be
       performed when sampling along the cube face borders.
 
-    The cudaResourceViewDesc struct is defined as
+    The :py:obj:`~.cudaResourceViewDesc` struct is defined as
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
@@ -33632,7 +33636,7 @@ def cudaGraphAddKernelNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t
     the root of the graph. ``pDependencies`` may not have any duplicate
     entries. A handle to the new node will be returned in ``pGraphNode``.
 
-    The cudaKernelNodeParams structure is defined as:
+    The :py:obj:`~.cudaKernelNodeParams` structure is defined as:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
@@ -33853,7 +33857,7 @@ def cudaGraphKernelNodeCopyAttributes(hDst, hSrc):
 
     See Also
     --------
-    cudaAccessPolicyWindow
+    :py:obj:`~.cudaAccessPolicyWindow`
     """
     cdef cyruntime.cudaGraphNode_t cyhSrc
     if hSrc is None:
@@ -33901,7 +33905,7 @@ def cudaGraphKernelNodeGetAttribute(hNode, attr not None : cudaKernelNodeAttrID)
 
     See Also
     --------
-    cudaAccessPolicyWindow
+    :py:obj:`~.cudaAccessPolicyWindow`
     """
     cdef cyruntime.cudaGraphNode_t cyhNode
     if hNode is None:
@@ -33945,7 +33949,7 @@ def cudaGraphKernelNodeSetAttribute(hNode, attr not None : cudaKernelNodeAttrID,
 
     See Also
     --------
-    cudaAccessPolicyWindow
+    :py:obj:`~.cudaAccessPolicyWindow`
     """
     cdef cyruntime.cudaGraphNode_t cyhNode
     if hNode is None:
@@ -40879,10 +40883,10 @@ def cudaGetKernel(entryFuncAddr):
 
 @cython.embedsignature(True)
 def make_cudaPitchedPtr(d, size_t p, size_t xsz, size_t ysz):
-    """ Returns a cudaPitchedPtr based on input parameters.
+    """ Returns a :py:obj:`~.cudaPitchedPtr` based on input parameters.
 
-    Returns a cudaPitchedPtr based on the specified input parameters ``d``,
-    ``p``, ``xsz``, and ``ysz``.
+    Returns a :py:obj:`~.cudaPitchedPtr` based on the specified input
+    parameters ``d``, ``p``, ``xsz``, and ``ysz``.
 
     Parameters
     ----------
@@ -40900,7 +40904,7 @@ def make_cudaPitchedPtr(d, size_t p, size_t xsz, size_t ysz):
     cudaError_t.cudaSuccess
         cudaError_t.cudaSuccess
     :py:obj:`~.cudaPitchedPtr`
-        cudaPitchedPtr specified by ``d``, ``p``, ``xsz``, and ``ysz``
+        :py:obj:`~.cudaPitchedPtr` specified by ``d``, ``p``, ``xsz``, and ``ysz``
 
     See Also
     --------
@@ -40920,10 +40924,10 @@ def make_cudaPitchedPtr(d, size_t p, size_t xsz, size_t ysz):
 
 @cython.embedsignature(True)
 def make_cudaPos(size_t x, size_t y, size_t z):
-    """ Returns a cudaPos based on input parameters.
+    """ Returns a :py:obj:`~.cudaPos` based on input parameters.
 
-    Returns a cudaPos based on the specified input parameters ``x``, ``y``,
-    and ``z``.
+    Returns a :py:obj:`~.cudaPos` based on the specified input parameters
+    ``x``, ``y``, and ``z``.
 
     Parameters
     ----------
@@ -40939,7 +40943,7 @@ def make_cudaPos(size_t x, size_t y, size_t z):
     cudaError_t.cudaSuccess
         cudaError_t.cudaSuccess
     :py:obj:`~.cudaPos`
-        cudaPos specified by ``x``, ``y``, and ``z``
+        :py:obj:`~.cudaPos` specified by ``x``, ``y``, and ``z``
 
     See Also
     --------
@@ -40956,10 +40960,10 @@ def make_cudaPos(size_t x, size_t y, size_t z):
 
 @cython.embedsignature(True)
 def make_cudaExtent(size_t w, size_t h, size_t d):
-    """ Returns a cudaExtent based on input parameters.
+    """ Returns a :py:obj:`~.cudaExtent` based on input parameters.
 
-    Returns a cudaExtent based on the specified input parameters ``w``,
-    ``h``, and ``d``.
+    Returns a :py:obj:`~.cudaExtent` based on the specified input
+    parameters ``w``, ``h``, and ``d``.
 
     Parameters
     ----------
@@ -40976,7 +40980,7 @@ def make_cudaExtent(size_t w, size_t h, size_t d):
     cudaError_t.cudaSuccess
         cudaError_t.cudaSuccess
     :py:obj:`~.cudaExtent`
-        cudaExtent specified by ``w``, ``h``, and ``d``
+        :py:obj:`~.cudaExtent` specified by ``w``, ``h``, and ``d``
 
     See Also
     --------
diff --git a/cuda_bindings/docs/source/module/runtime.rst b/cuda_bindings/docs/source/module/runtime.rst
index 5374a37d16..617c318914 100644
--- a/cuda_bindings/docs/source/module/runtime.rst
+++ b/cuda_bindings/docs/source/module/runtime.rst
@@ -137,7 +137,7 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidConfiguration
 
 
-        This indicates that a kernel launch is requesting resources that can never be satisfied by the current device. Requesting more shared memory per block than the device supports will trigger this error, as will requesting too many threads or blocks. See cudaDeviceProp for more device limitations.
+        This indicates that a kernel launch is requesting resources that can never be satisfied by the current device. Requesting more shared memory per block than the device supports will trigger this error, as will requesting too many threads or blocks. See :py:obj:`~.cudaDeviceProp` for more device limitations.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorVersionTranslation
@@ -3946,25 +3946,25 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsKernelNodeParams
 
 
-        Adds cudaKernelNodeParams to output
+        Adds :py:obj:`~.cudaKernelNodeParams` to output
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsMemcpyNodeParams
 
 
-        Adds cudaMemcpy3DParms to output
+        Adds :py:obj:`~.cudaMemcpy3DParms` to output
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsMemsetNodeParams
 
 
-        Adds cudaMemsetParams to output
+        Adds :py:obj:`~.cudaMemsetParams` to output
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsHostNodeParams
 
 
-        Adds cudaHostNodeParams to output
+        Adds :py:obj:`~.cudaHostNodeParams` to output
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsEventNodeParams
@@ -3976,13 +3976,13 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsExtSemasSignalNodeParams
 
 
-        Adds cudaExternalSemaphoreSignalNodeParams values to output
+        Adds :py:obj:`~.cudaExternalSemaphoreSignalNodeParams` values to output
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsExtSemasWaitNodeParams
 
 
-        Adds cudaExternalSemaphoreWaitNodeParams to output
+        Adds :py:obj:`~.cudaExternalSemaphoreWaitNodeParams` to output
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsKernelNodeAttributes
@@ -4000,7 +4000,7 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsConditionalNodeParams
 
 
-        Adds cudaConditionalNodeParams to output
+        Adds :py:obj:`~.cudaConditionalNodeParams` to output
 
 .. autoclass:: cuda.bindings.runtime.cudaGraphInstantiateFlags
 
@@ -5317,11 +5317,11 @@ Data types used by CUDA Runtime
 
 .. autoattribute:: cuda.bindings.runtime.cudaExternalSemaphoreSignalSkipNvSciBufMemSync
 
-    When the /p flags parameter of cudaExternalSemaphoreSignalParams contains this flag, it indicates that signaling an external semaphore object should skip performing appropriate memory synchronization operations over all the external memory objects that are imported as :py:obj:`~.cudaExternalMemoryHandleTypeNvSciBuf`, which otherwise are performed by default to ensure data coherency with other importers of the same NvSciBuf memory objects.
+    When the ``flags`` parameter of :py:obj:`~.cudaExternalSemaphoreSignalParams` contains this flag, it indicates that signaling an external semaphore object should skip performing appropriate memory synchronization operations over all the external memory objects that are imported as :py:obj:`~.cudaExternalMemoryHandleTypeNvSciBuf`, which otherwise are performed by default to ensure data coherency with other importers of the same NvSciBuf memory objects.
 
 .. autoattribute:: cuda.bindings.runtime.cudaExternalSemaphoreWaitSkipNvSciBufMemSync
 
-    When the /p flags parameter of cudaExternalSemaphoreWaitParams contains this flag, it indicates that waiting an external semaphore object should skip performing appropriate memory synchronization operations over all the external memory objects that are imported as :py:obj:`~.cudaExternalMemoryHandleTypeNvSciBuf`, which otherwise are performed by default to ensure data coherency with other importers of the same NvSciBuf memory objects.
+    When the ``flags`` parameter of :py:obj:`~.cudaExternalSemaphoreWaitParams` contains this flag, it indicates that waiting on an external semaphore object should skip performing appropriate memory synchronization operations over all the external memory objects that are imported as :py:obj:`~.cudaExternalMemoryHandleTypeNvSciBuf`, which otherwise are performed by default to ensure data coherency with other importers of the same NvSciBuf memory objects.
 
 .. autoattribute:: cuda.bindings.runtime.cudaNvSciSyncAttrSignal
 

From 4ccb9c16a82a56a546c1dfa5d4ab84215f9fc765 Mon Sep 17 00:00:00 2001
From: Keith Kraus <keith.j.kraus@gmail.com>
Date: Thu, 4 Jun 2026 11:50:59 -0400
Subject: [PATCH 5/5] Regenerate CUDA bindings docs refs

---
 cuda_bindings/cuda/bindings/driver.pxd.in    | 2250 ++++----
 cuda_bindings/cuda/bindings/driver.pyx.in    | 4940 +++++++++---------
 cuda_bindings/cuda/bindings/nvrtc.pxd        |    4 +-
 cuda_bindings/cuda/bindings/nvrtc.pyx        |  169 +-
 cuda_bindings/cuda/bindings/runtime.pxd.in   | 1038 ++--
 cuda_bindings/cuda/bindings/runtime.pyx.in   | 2991 +++++------
 cuda_bindings/docs/source/module/driver.rst  |  298 +-
 cuda_bindings/docs/source/module/nvrtc.rst   |   30 +-
 cuda_bindings/docs/source/module/runtime.rst |  364 +-
 9 files changed, 6317 insertions(+), 5767 deletions(-)

diff --git a/cuda_bindings/cuda/bindings/driver.pxd.in b/cuda_bindings/cuda/bindings/driver.pxd.in
index a607747264..a53fc22383 100644
--- a/cuda_bindings/cuda/bindings/driver.pxd.in
+++ b/cuda_bindings/cuda/bindings/driver.pxd.in
@@ -368,7 +368,7 @@ cdef class CUasyncCallbackHandle:
 cdef class CUgreenCtx:
     """
 
-    A green context handle. This handle can be used safely from only one CPU thread at a time. Created via cuGreenCtxCreate
+    A green context handle. This handle can be used safely from only one CPU thread at a time. Created via :func:`~.cuGreenCtxCreate`
 
     Methods
     -------
@@ -418,7 +418,7 @@ cdef class CUcoredumpCallbackHandle:
 cdef class CUdevResourceDesc:
     """
 
-    An opaque descriptor handle. The descriptor encapsulates multiple created and configured resources. Created via cuDevResourceGenerateDesc
+    An opaque descriptor handle. The descriptor encapsulates multiple created and configured resources. Created via :func:`~.cuDevResourceGenerateDesc`
 
     Methods
     -------
@@ -704,27 +704,27 @@ cdef class CUstreamMemOpWaitValueParams_st:
     Attributes
     ----------
     {{if 'CUstreamBatchMemOpParams_union.waitValue.operation' in found_struct}}
-    operation : CUstreamBatchMemOpType
+    operation : :py:obj:`~.CUstreamBatchMemOpType`
 
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.waitValue.address' in found_struct}}
-    address : CUdeviceptr
+    address : :py:obj:`~.CUdeviceptr`
 
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.waitValue.value' in found_struct}}
-    value : cuuint32_t
+    value : :py:obj:`~.cuuint32_t`
 
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.waitValue.value64' in found_struct}}
-    value64 : cuuint64_t
+    value64 : :py:obj:`~.cuuint64_t`
 
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.waitValue.flags' in found_struct}}
     flags : unsigned int
-        See CUstreamWaitValue_flags.
+        See :py:obj:`~.CUstreamWaitValue_flags`.
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.waitValue.alias' in found_struct}}
-    alias : CUdeviceptr
+    alias : :py:obj:`~.CUdeviceptr`
         For driver internal use. Initial value is unimportant.
     {{endif}}
 
@@ -754,27 +754,27 @@ cdef class CUstreamMemOpWriteValueParams_st:
     Attributes
     ----------
     {{if 'CUstreamBatchMemOpParams_union.writeValue.operation' in found_struct}}
-    operation : CUstreamBatchMemOpType
+    operation : :py:obj:`~.CUstreamBatchMemOpType`
 
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.writeValue.address' in found_struct}}
-    address : CUdeviceptr
+    address : :py:obj:`~.CUdeviceptr`
 
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.writeValue.value' in found_struct}}
-    value : cuuint32_t
+    value : :py:obj:`~.cuuint32_t`
 
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.writeValue.value64' in found_struct}}
-    value64 : cuuint64_t
+    value64 : :py:obj:`~.cuuint64_t`
 
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.writeValue.flags' in found_struct}}
     flags : unsigned int
-        See CUstreamWriteValue_flags.
+        See :py:obj:`~.CUstreamWriteValue_flags`.
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.writeValue.alias' in found_struct}}
-    alias : CUdeviceptr
+    alias : :py:obj:`~.CUdeviceptr`
         For driver internal use. Initial value is unimportant.
     {{endif}}
 
@@ -804,7 +804,7 @@ cdef class CUstreamMemOpFlushRemoteWritesParams_st:
     Attributes
     ----------
     {{if 'CUstreamBatchMemOpParams_union.flushRemoteWrites.operation' in found_struct}}
-    operation : CUstreamBatchMemOpType
+    operation : :py:obj:`~.CUstreamBatchMemOpType`
 
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.flushRemoteWrites.flags' in found_struct}}
@@ -826,12 +826,12 @@ cdef class CUstreamMemOpMemoryBarrierParams_st:
     Attributes
     ----------
     {{if 'CUstreamBatchMemOpParams_union.memoryBarrier.operation' in found_struct}}
-    operation : CUstreamBatchMemOpType
+    operation : :py:obj:`~.CUstreamBatchMemOpType`
         < Only supported in the _v2 API
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.memoryBarrier.flags' in found_struct}}
     flags : unsigned int
-        See CUstreamMemoryBarrier_flags
+        See :py:obj:`~.CUstreamMemoryBarrier_flags`
     {{endif}}
 
     Methods
@@ -848,7 +848,7 @@ cdef class CUstreamMemOpAtomicReductionParams_st:
     Attributes
     ----------
     {{if 'CUstreamBatchMemOpParams_union.atomicReduction.operation' in found_struct}}
-    operation : CUstreamBatchMemOpType
+    operation : :py:obj:`~.CUstreamBatchMemOpType`
 
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.atomicReduction.flags' in found_struct}}
@@ -856,23 +856,23 @@ cdef class CUstreamMemOpAtomicReductionParams_st:
         Must be 0
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.atomicReduction.reductionOp' in found_struct}}
-    reductionOp : CUstreamAtomicReductionOpType
-        See CUstreamAtomicReductionOpType
+    reductionOp : :py:obj:`~.CUstreamAtomicReductionOpType`
+        See :py:obj:`~.CUstreamAtomicReductionOpType`
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.atomicReduction.dataType' in found_struct}}
-    dataType : CUstreamAtomicReductionDataType
-        See CUstreamAtomicReductionDataType
+    dataType : :py:obj:`~.CUstreamAtomicReductionDataType`
+        See :py:obj:`~.CUstreamAtomicReductionDataType`
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.atomicReduction.address' in found_struct}}
-    address : CUdeviceptr
+    address : :py:obj:`~.CUdeviceptr`
         The address the atomic operation will be operated on
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.atomicReduction.value' in found_struct}}
-    value : cuuint64_t
+    value : :py:obj:`~.cuuint64_t`
         The operand value the atomic operation will operate with
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.atomicReduction.alias' in found_struct}}
-    alias : CUdeviceptr
+    alias : :py:obj:`~.CUdeviceptr`
         For driver internal use. Initial value is unimportant.
     {{endif}}
 
@@ -896,39 +896,40 @@ cdef class CUstreamMemOpAtomicReductionParams_st:
 
 cdef class CUstreamBatchMemOpParams_union:
     """
-    Per-operation parameters for cuStreamBatchMemOp
+    Per-operation parameters for :py:obj:`~.cuStreamBatchMemOp`
 
     Attributes
     ----------
     {{if 'CUstreamBatchMemOpParams_union.operation' in found_struct}}
-    operation : CUstreamBatchMemOpType
+    operation : :py:obj:`~.CUstreamBatchMemOpType`
         Operation. This is the first field of all the union elemets and
         acts as a TAG to determine which union member is valid.
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.waitValue' in found_struct}}
-    waitValue : CUstreamMemOpWaitValueParams_st
-        Params for CU_STREAM_MEM_OP_WAIT_VALUE_32 and
-        CU_STREAM_MEM_OP_WAIT_VALUE_64 operations.
+    waitValue : :py:obj:`~.CUstreamMemOpWaitValueParams_st`
+        Params for :py:obj:`~.CU_STREAM_MEM_OP_WAIT_VALUE_32` and
+        :py:obj:`~.CU_STREAM_MEM_OP_WAIT_VALUE_64` operations.
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.writeValue' in found_struct}}
-    writeValue : CUstreamMemOpWriteValueParams_st
-        Params for CU_STREAM_MEM_OP_WRITE_VALUE_32 and
-        CU_STREAM_MEM_OP_WRITE_VALUE_64 operations.
+    writeValue : :py:obj:`~.CUstreamMemOpWriteValueParams_st`
+        Params for :py:obj:`~.CU_STREAM_MEM_OP_WRITE_VALUE_32` and
+        :py:obj:`~.CU_STREAM_MEM_OP_WRITE_VALUE_64` operations.
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.flushRemoteWrites' in found_struct}}
-    flushRemoteWrites : CUstreamMemOpFlushRemoteWritesParams_st
-        Params for CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES operations.
+    flushRemoteWrites : :py:obj:`~.CUstreamMemOpFlushRemoteWritesParams_st`
+        Params for :py:obj:`~.CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES`
+        operations.
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.memoryBarrier' in found_struct}}
-    memoryBarrier : CUstreamMemOpMemoryBarrierParams_st
-        Params for CU_STREAM_MEM_OP_BARRIER operations.
+    memoryBarrier : :py:obj:`~.CUstreamMemOpMemoryBarrierParams_st`
+        Params for :py:obj:`~.CU_STREAM_MEM_OP_BARRIER` operations.
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.atomicReduction' in found_struct}}
-    atomicReduction : CUstreamMemOpAtomicReductionParams_st
+    atomicReduction : :py:obj:`~.CUstreamMemOpAtomicReductionParams_st`
 
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.pad' in found_struct}}
-    pad : list[cuuint64_t]
+    pad : list[:py:obj:`~.cuuint64_t`]
 
     {{endif}}
 
@@ -960,12 +961,13 @@ cdef class CUstreamBatchMemOpParams_union:
 cdef class CUDA_BATCH_MEM_OP_NODE_PARAMS_v1_st:
     """
     Batch memory operation node parameters  Used in the legacy
-    cuGraphAddBatchMemOpNode api. New code should use cuGraphAddNode()
+    :func:`~.cuGraphAddBatchMemOpNode` api. New code should use
+    :func:`~.cuGraphAddNode`
 
     Attributes
     ----------
     {{if 'CUDA_BATCH_MEM_OP_NODE_PARAMS_v1_st.ctx' in found_struct}}
-    ctx : CUcontext
+    ctx : :py:obj:`~.CUcontext`
 
     {{endif}}
     {{if 'CUDA_BATCH_MEM_OP_NODE_PARAMS_v1_st.count' in found_struct}}
@@ -973,7 +975,7 @@ cdef class CUDA_BATCH_MEM_OP_NODE_PARAMS_v1_st:
 
     {{endif}}
     {{if 'CUDA_BATCH_MEM_OP_NODE_PARAMS_v1_st.paramArray' in found_struct}}
-    paramArray : CUstreamBatchMemOpParams
+    paramArray : :py:obj:`~.CUstreamBatchMemOpParams`
 
     {{endif}}
     {{if 'CUDA_BATCH_MEM_OP_NODE_PARAMS_v1_st.flags' in found_struct}}
@@ -1005,7 +1007,7 @@ cdef class CUDA_BATCH_MEM_OP_NODE_PARAMS_v2_st:
     Attributes
     ----------
     {{if 'CUDA_BATCH_MEM_OP_NODE_PARAMS_v2_st.ctx' in found_struct}}
-    ctx : CUcontext
+    ctx : :py:obj:`~.CUcontext`
         Context to use for the operations.
     {{endif}}
     {{if 'CUDA_BATCH_MEM_OP_NODE_PARAMS_v2_st.count' in found_struct}}
@@ -1013,7 +1015,7 @@ cdef class CUDA_BATCH_MEM_OP_NODE_PARAMS_v2_st:
         Number of operations in paramArray.
     {{endif}}
     {{if 'CUDA_BATCH_MEM_OP_NODE_PARAMS_v2_st.paramArray' in found_struct}}
-    paramArray : CUstreamBatchMemOpParams
+    paramArray : :py:obj:`~.CUstreamBatchMemOpParams`
         Array of batch memory operations.
     {{endif}}
     {{if 'CUDA_BATCH_MEM_OP_NODE_PARAMS_v2_st.flags' in found_struct}}
@@ -1084,7 +1086,7 @@ cdef class CUasyncNotificationInfo_st:
     Attributes
     ----------
     {{if 'CUasyncNotificationInfo_st.type' in found_struct}}
-    type : CUasyncNotificationType
+    type : :py:obj:`~.CUasyncNotificationType`
         The type of notification being sent
     {{endif}}
     {{if 'CUasyncNotificationInfo_st.info' in found_struct}}
@@ -1168,13 +1170,13 @@ cdef class CUaccessPolicyWindow_st:
     Specifies an access policy for a window, a contiguous extent of
     memory beginning at base_ptr and ending at base_ptr + num_bytes.
     num_bytes is limited by
-    CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE. Partition into
-    many segments and assign segments such that: sum of "hit segments"
-    / window == approx. ratio. sum of "miss segments" / window ==
-    approx 1-ratio. Segments and ratio specifications are fitted to the
-    capabilities of the architecture. Accesses in a hit segment apply
-    the hitProp access policy. Accesses in a miss segment apply the
-    missProp access policy.
+    :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE`.
+    Partition into many segments and assign segments such that: sum of
+    "hit segments" / window == approx. ratio. sum of "miss segments" /
+    window == approx 1-ratio. Segments and ratio specifications are
+    fitted to the capabilities of the architecture. Accesses in a hit
+    segment apply the hitProp access policy. Accesses in a miss segment
+    apply the missProp access policy.
 
     Attributes
     ----------
@@ -1194,12 +1196,13 @@ cdef class CUaccessPolicyWindow_st:
         assigned missProp.
     {{endif}}
     {{if 'CUaccessPolicyWindow_st.hitProp' in found_struct}}
-    hitProp : CUaccessProperty
-        CUaccessProperty set for hit.
+    hitProp : :py:obj:`~.CUaccessProperty`
+        :py:obj:`~.CUaccessProperty` set for hit.
     {{endif}}
     {{if 'CUaccessPolicyWindow_st.missProp' in found_struct}}
-    missProp : CUaccessProperty
-        CUaccessProperty set for miss. Must be either NORMAL or STREAMING
+    missProp : :py:obj:`~.CUaccessProperty`
+        :py:obj:`~.CUaccessProperty` set for miss. Must be either NORMAL or
+        STREAMING
     {{endif}}
 
     Methods
@@ -1222,7 +1225,7 @@ cdef class CUDA_KERNEL_NODE_PARAMS_st:
     Attributes
     ----------
     {{if 'CUDA_KERNEL_NODE_PARAMS_st.func' in found_struct}}
-    func : CUfunction
+    func : :py:obj:`~.CUfunction`
         Kernel to launch
     {{endif}}
     {{if 'CUDA_KERNEL_NODE_PARAMS_st.gridDimX' in found_struct}}
@@ -1285,7 +1288,7 @@ cdef class CUDA_KERNEL_NODE_PARAMS_v2_st:
     Attributes
     ----------
     {{if 'CUDA_KERNEL_NODE_PARAMS_v2_st.func' in found_struct}}
-    func : CUfunction
+    func : :py:obj:`~.CUfunction`
         Kernel to launch
     {{endif}}
     {{if 'CUDA_KERNEL_NODE_PARAMS_v2_st.gridDimX' in found_struct}}
@@ -1325,11 +1328,11 @@ cdef class CUDA_KERNEL_NODE_PARAMS_v2_st:
         Extra options
     {{endif}}
     {{if 'CUDA_KERNEL_NODE_PARAMS_v2_st.kern' in found_struct}}
-    kern : CUkernel
+    kern : :py:obj:`~.CUkernel`
         Kernel to launch, will only be referenced if func is NULL
     {{endif}}
     {{if 'CUDA_KERNEL_NODE_PARAMS_v2_st.ctx' in found_struct}}
-    ctx : CUcontext
+    ctx : :py:obj:`~.CUcontext`
         Context for the kernel task to run in. The value NULL will indicate
         the current context should be used by the api. This field is
         ignored if func is set.
@@ -1364,7 +1367,7 @@ cdef class CUDA_KERNEL_NODE_PARAMS_v3_st:
     Attributes
     ----------
     {{if 'CUDA_KERNEL_NODE_PARAMS_v3_st.func' in found_struct}}
-    func : CUfunction
+    func : :py:obj:`~.CUfunction`
         Kernel to launch
     {{endif}}
     {{if 'CUDA_KERNEL_NODE_PARAMS_v3_st.gridDimX' in found_struct}}
@@ -1404,11 +1407,11 @@ cdef class CUDA_KERNEL_NODE_PARAMS_v3_st:
         Extra options
     {{endif}}
     {{if 'CUDA_KERNEL_NODE_PARAMS_v3_st.kern' in found_struct}}
-    kern : CUkernel
+    kern : :py:obj:`~.CUkernel`
         Kernel to launch, will only be referenced if func is NULL
     {{endif}}
     {{if 'CUDA_KERNEL_NODE_PARAMS_v3_st.ctx' in found_struct}}
-    ctx : CUcontext
+    ctx : :py:obj:`~.CUcontext`
         Context for the kernel task to run in. The value NULL will indicate
         the current context should be used by the api. This field is
         ignored if func is set.
@@ -1443,7 +1446,7 @@ cdef class CUDA_MEMSET_NODE_PARAMS_st:
     Attributes
     ----------
     {{if 'CUDA_MEMSET_NODE_PARAMS_st.dst' in found_struct}}
-    dst : CUdeviceptr
+    dst : :py:obj:`~.CUdeviceptr`
         Destination device pointer
     {{endif}}
     {{if 'CUDA_MEMSET_NODE_PARAMS_st.pitch' in found_struct}}
@@ -1487,7 +1490,7 @@ cdef class CUDA_MEMSET_NODE_PARAMS_v2_st:
     Attributes
     ----------
     {{if 'CUDA_MEMSET_NODE_PARAMS_v2_st.dst' in found_struct}}
-    dst : CUdeviceptr
+    dst : :py:obj:`~.CUdeviceptr`
         Destination device pointer
     {{endif}}
     {{if 'CUDA_MEMSET_NODE_PARAMS_v2_st.pitch' in found_struct}}
@@ -1511,7 +1514,7 @@ cdef class CUDA_MEMSET_NODE_PARAMS_v2_st:
         Number of rows
     {{endif}}
     {{if 'CUDA_MEMSET_NODE_PARAMS_v2_st.ctx' in found_struct}}
-    ctx : CUcontext
+    ctx : :py:obj:`~.CUcontext`
         Context on which to run the node
     {{endif}}
 
@@ -1538,7 +1541,7 @@ cdef class CUDA_HOST_NODE_PARAMS_st:
     Attributes
     ----------
     {{if 'CUDA_HOST_NODE_PARAMS_st.fn' in found_struct}}
-    fn : CUhostFn
+    fn : :py:obj:`~.CUhostFn`
         The function to call when the node executes
     {{endif}}
     {{if 'CUDA_HOST_NODE_PARAMS_st.userData' in found_struct}}
@@ -1569,7 +1572,7 @@ cdef class CUDA_HOST_NODE_PARAMS_v2_st:
     Attributes
     ----------
     {{if 'CUDA_HOST_NODE_PARAMS_v2_st.fn' in found_struct}}
-    fn : CUhostFn
+    fn : :py:obj:`~.CUhostFn`
         The function to call when the node executes
     {{endif}}
     {{if 'CUDA_HOST_NODE_PARAMS_v2_st.userData' in found_struct}}
@@ -1604,22 +1607,23 @@ cdef class CUDA_CONDITIONAL_NODE_PARAMS:
     Attributes
     ----------
     {{if 'CUDA_CONDITIONAL_NODE_PARAMS.handle' in found_struct}}
-    handle : CUgraphConditionalHandle
+    handle : :py:obj:`~.CUgraphConditionalHandle`
         Conditional node handle. Handles must be created in advance of
-        creating the node using cuGraphConditionalHandleCreate.
+        creating the node using :func:`~.cuGraphConditionalHandleCreate`.
     {{endif}}
     {{if 'CUDA_CONDITIONAL_NODE_PARAMS.type' in found_struct}}
-    type : CUgraphConditionalNodeType
+    type : :py:obj:`~.CUgraphConditionalNodeType`
         Type of conditional node.
     {{endif}}
     {{if 'CUDA_CONDITIONAL_NODE_PARAMS.size' in found_struct}}
     size : unsigned int
         Size of graph output array. Allowed values are 1 for
-        CU_GRAPH_COND_TYPE_WHILE, 1 or 2 for CU_GRAPH_COND_TYPE_IF, or any
-        value greater than zero for CU_GRAPH_COND_TYPE_SWITCH.
+        :py:obj:`~.CU_GRAPH_COND_TYPE_WHILE`, 1 or 2 for
+        :py:obj:`~.CU_GRAPH_COND_TYPE_IF`, or any value greater than zero
+        for :py:obj:`~.CU_GRAPH_COND_TYPE_SWITCH`.
     {{endif}}
     {{if 'CUDA_CONDITIONAL_NODE_PARAMS.phGraph_out' in found_struct}}
-    phGraph_out : CUgraph
+    phGraph_out : :py:obj:`~.CUgraph`
         CUDA-owned array populated with conditional node child graphs
         during creation of the node. Valid for the lifetime of the
         conditional node. The contents of the graph(s) are subject to the
@@ -1629,16 +1633,18 @@ cdef class CUDA_CONDITIONAL_NODE_PARAMS:
         - All kernels, including kernels in nested conditionals or child
         graphs at any level, must belong to the same CUDA context.
         These graphs may be populated using graph node creation APIs or
-        cuStreamBeginCaptureToGraph.  CU_GRAPH_COND_TYPE_IF: phGraph_out[0]
-        is executed when the condition is non-zero. If ``size`` == 2,
-        phGraph_out[1] will be executed when the condition is zero.
-        CU_GRAPH_COND_TYPE_WHILE: phGraph_out[0] is executed as long as the
-        condition is non-zero. CU_GRAPH_COND_TYPE_SWITCH: phGraph_out[n] is
-        executed when the condition is equal to n. If the condition >=
-        ``size``, no body graph is executed.
+        :func:`~.cuStreamBeginCaptureToGraph`.
+        :py:obj:`~.CU_GRAPH_COND_TYPE_IF`: phGraph_out[0] is executed when
+        the condition is non-zero. If ``size`` == 2, phGraph_out[1] will be
+        executed when the condition is zero.
+        :py:obj:`~.CU_GRAPH_COND_TYPE_WHILE`: phGraph_out[0] is executed as
+        long as the condition is non-zero.
+        :py:obj:`~.CU_GRAPH_COND_TYPE_SWITCH`: phGraph_out[n] is executed
+        when the condition is equal to n. If the condition >= ``size``, no
+        body graph is executed.
     {{endif}}
     {{if 'CUDA_CONDITIONAL_NODE_PARAMS.ctx' in found_struct}}
-    ctx : CUcontext
+    ctx : :py:obj:`~.CUcontext`
         Context on which to run the node. Must match context used to create
         the handle and all body nodes.
     {{endif}}
@@ -1667,8 +1673,8 @@ cdef class CUgraphEdgeData_st:
     """
     Optional annotation for edges in a CUDA graph. Note, all edges
     implicitly have annotations and default to a zero-initialized value
-    if not specified. A zero-initialized struct indicates a standard
-    full serialization of two nodes with memory visibility.
+    if not specified. A zero-initialized struct indicates a
+    standard full serialization of two nodes with memory visibility.
 
     Attributes
     ----------
@@ -1680,9 +1686,9 @@ cdef class CUgraphEdgeData_st:
         memory visibility to the downstream node or portion thereof
         (indicated by ``to_port``).   Only kernel nodes define non-zero
         ports. A kernel node can use the following output port types:
-        CU_GRAPH_KERNEL_NODE_PORT_DEFAULT,
-        CU_GRAPH_KERNEL_NODE_PORT_PROGRAMMATIC, or
-        CU_GRAPH_KERNEL_NODE_PORT_LAUNCH_ORDER.
+        :py:obj:`~.CU_GRAPH_KERNEL_NODE_PORT_DEFAULT`,
+        :py:obj:`~.CU_GRAPH_KERNEL_NODE_PORT_PROGRAMMATIC`, or
+        :py:obj:`~.CU_GRAPH_KERNEL_NODE_PORT_LAUNCH_ORDER`.
     {{endif}}
     {{if 'CUgraphEdgeData_st.to_port' in found_struct}}
     to_port : bytes
@@ -1695,9 +1701,10 @@ cdef class CUgraphEdgeData_st:
     {{endif}}
     {{if 'CUgraphEdgeData_st.type' in found_struct}}
     type : bytes
-        This should be populated with a value from CUgraphDependencyType.
-        (It is typed as char due to compiler-specific layout of bitfields.)
-        See CUgraphDependencyType.
+        This should be populated with a value from
+        :py:obj:`~.CUgraphDependencyType`. (It is typed as char due to
+        compiler-specific layout of bitfields.) See
+        :py:obj:`~.CUgraphDependencyType`.
     {{endif}}
     {{if 'CUgraphEdgeData_st.reserved' in found_struct}}
     reserved : bytes
@@ -1722,19 +1729,19 @@ cdef class CUDA_GRAPH_INSTANTIATE_PARAMS_st:
     Attributes
     ----------
     {{if 'CUDA_GRAPH_INSTANTIATE_PARAMS_st.flags' in found_struct}}
-    flags : cuuint64_t
+    flags : :py:obj:`~.cuuint64_t`
         Instantiation flags
     {{endif}}
     {{if 'CUDA_GRAPH_INSTANTIATE_PARAMS_st.hUploadStream' in found_struct}}
-    hUploadStream : CUstream
+    hUploadStream : :py:obj:`~.CUstream`
         Upload stream
     {{endif}}
     {{if 'CUDA_GRAPH_INSTANTIATE_PARAMS_st.hErrNode_out' in found_struct}}
-    hErrNode_out : CUgraphNode
+    hErrNode_out : :py:obj:`~.CUgraphNode`
         The node which caused instantiation to fail, if any
     {{endif}}
     {{if 'CUDA_GRAPH_INSTANTIATE_PARAMS_st.result_out' in found_struct}}
-    result_out : CUgraphInstantiateResult
+    result_out : :py:obj:`~.CUgraphInstantiateResult`
         Whether instantiation was successful. If it failed, the reason why
     {{endif}}
 
@@ -1759,13 +1766,16 @@ cdef class CUDA_GRAPH_INSTANTIATE_PARAMS_st:
 
 cdef class CUlaunchMemSyncDomainMap_st:
     """
-    Memory Synchronization Domain map  See ``cudaLaunchMemSyncDomain``.
-    By default, kernels are launched in domain 0. Kernel launched with
-    CU_LAUNCH_MEM_SYNC_DOMAIN_REMOTE will have a different domain ID.
-    User may also alter the domain ID with CUlaunchMemSyncDomainMap for
-    a specific stream / graph node / kernel launch. See
-    CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP.  Domain ID range is
-    available through CU_DEVICE_ATTRIBUTE_MEM_SYNC_DOMAIN_COUNT.
+    Memory Synchronization Domain map  See
+    :py:obj:`~.cudaLaunchMemSyncDomain`.  By default, kernels are
+    launched in domain 0. Kernel launched with
+    :py:obj:`~.CU_LAUNCH_MEM_SYNC_DOMAIN_REMOTE` will have a different
+    domain ID. User may also alter the domain ID with
+    :py:obj:`~.CUlaunchMemSyncDomainMap` for a specific stream / graph
+    node / kernel launch. See
+    :py:obj:`~.CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP`.  Domain ID
+    range is available through
+    :py:obj:`~.CU_DEVICE_ATTRIBUTE_MEM_SYNC_DOMAIN_COUNT`.
 
     Attributes
     ----------
@@ -1819,7 +1829,7 @@ cdef class anon_struct2:
     Attributes
     ----------
     {{if 'CUlaunchAttributeValue_union.programmaticEvent.event' in found_struct}}
-    event : CUevent
+    event : :py:obj:`~.CUevent`
 
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.programmaticEvent.flags' in found_struct}}
@@ -1848,7 +1858,7 @@ cdef class anon_struct3:
     Attributes
     ----------
     {{if 'CUlaunchAttributeValue_union.launchCompletionEvent.event' in found_struct}}
-    event : CUevent
+    event : :py:obj:`~.CUevent`
 
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.launchCompletionEvent.flags' in found_struct}}
@@ -1903,7 +1913,7 @@ cdef class anon_struct5:
 
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.deviceUpdatableKernelNode.devNode' in found_struct}}
-    devNode : CUgraphDeviceNode
+    devNode : :py:obj:`~.CUgraphDeviceNode`
 
     {{endif}}
 
@@ -1921,7 +1931,8 @@ cdef class anon_struct5:
 
 cdef class CUlaunchAttributeValue_union:
     """
-    Launch attributes union; used as value field of CUlaunchAttribute
+    Launch attributes union; used as value field of
+    :py:obj:`~.CUlaunchAttribute`
 
     Attributes
     ----------
@@ -1930,115 +1941,125 @@ cdef class CUlaunchAttributeValue_union:
 
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.accessPolicyWindow' in found_struct}}
-    accessPolicyWindow : CUaccessPolicyWindow
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW.
+    accessPolicyWindow : :py:obj:`~.CUaccessPolicyWindow`
+        Value of launch attribute
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW`.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.cooperative' in found_struct}}
     cooperative : int
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_COOPERATIVE. Nonzero
-        indicates a cooperative kernel (see cuLaunchCooperativeKernel).
+        Value of launch attribute
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_COOPERATIVE`. Nonzero indicates a
+        cooperative kernel (see :func:`~.cuLaunchCooperativeKernel`).
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.syncPolicy' in found_struct}}
-    syncPolicy : CUsynchronizationPolicy
+    syncPolicy : :py:obj:`~.CUsynchronizationPolicy`
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY. CUsynchronizationPolicy
-        for work queued up in this stream
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY`.
+        :py:obj:`~.CUsynchronizationPolicy` for work queued up in this
+        stream
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.clusterDim' in found_struct}}
     clusterDim : anon_struct1
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
-        that represents the desired cluster dimensions for the kernel.
-        Opaque type with the following fields: - ``x`` - The X dimension of
-        the cluster, in blocks. Must be a divisor of the grid X dimension.
-        - ``y`` - The Y dimension of the cluster, in blocks. Must be a
-        divisor of the grid Y dimension.    - ``z`` - The Z dimension of
-        the cluster, in blocks. Must be a divisor of the grid Z dimension.
+        Value of launch attribute
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION` that represents
+        the desired cluster dimensions for the kernel. Opaque type with the
+        following fields: - ``x`` - The X dimension of the cluster, in
+        blocks. Must be a divisor of the grid X dimension.    - ``y`` - The
+        Y dimension of the cluster, in blocks. Must be a divisor of the
+        grid Y dimension.    - ``z`` - The Z dimension of the cluster, in
+        blocks. Must be a divisor of the grid Z dimension.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.clusterSchedulingPolicyPreference' in found_struct}}
-    clusterSchedulingPolicyPreference : CUclusterSchedulingPolicy
+    clusterSchedulingPolicyPreference : :py:obj:`~.CUclusterSchedulingPolicy`
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE. Cluster
-        scheduling policy preference for the kernel.
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE`.
+        Cluster scheduling policy preference for the kernel.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.programmaticStreamSerializationAllowed' in found_struct}}
     programmaticStreamSerializationAllowed : int
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION.
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION`.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.programmaticEvent' in found_struct}}
     programmaticEvent : anon_struct2
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT
-        with the following fields: - ``CUevent`` event - Event to fire when
+        Value of launch attribute
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT` with the
+        following fields: - :py:obj:`~.CUevent` event - Event to fire when
         all blocks trigger it.    - ``Event`` record flags, see
-        cuEventRecordWithFlags. Does not accept :CU_EVENT_RECORD_EXTERNAL.
-        - ``triggerAtBlockStart`` - If this is set to non-0, each block
+        :func:`~.cuEventRecordWithFlags`. Does not accept
+        :py:obj:`~.CU_EVENT_RECORD_EXTERNAL`.    -
+        ``triggerAtBlockStart`` - If this is set to non-0, each block
         launch will automatically trigger the event.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.launchCompletionEvent' in found_struct}}
     launchCompletionEvent : anon_struct3
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT with the following
-        fields: - ``CUevent`` event - Event to fire when the last block
-        launches    - ``int`` flags; - Event record flags, see
-        cuEventRecordWithFlags. Does not accept CU_EVENT_RECORD_EXTERNAL.
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT` with the
+        following fields: - :py:obj:`~.CUevent` event - Event to fire when
+        the last block launches    - ``int`` flags; - Event record flags,
+        see :func:`~.cuEventRecordWithFlags`. Does not accept
+        :py:obj:`~.CU_EVENT_RECORD_EXTERNAL`.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.priority' in found_struct}}
     priority : int
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_PRIORITY. Execution
-        priority of the kernel.
+        Value of launch attribute :py:obj:`~.CU_LAUNCH_ATTRIBUTE_PRIORITY`.
+        Execution priority of the kernel.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.memSyncDomainMap' in found_struct}}
-    memSyncDomainMap : CUlaunchMemSyncDomainMap
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP.
-        See CUlaunchMemSyncDomainMap.
+    memSyncDomainMap : :py:obj:`~.CUlaunchMemSyncDomainMap`
+        Value of launch attribute
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP`. See
+        :py:obj:`~.CUlaunchMemSyncDomainMap`.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.memSyncDomain' in found_struct}}
-    memSyncDomain : CUlaunchMemSyncDomain
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN.
-        See::CUlaunchMemSyncDomain
+    memSyncDomain : :py:obj:`~.CUlaunchMemSyncDomain`
+        Value of launch attribute
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN`. See
+        :py:obj:`~.CUlaunchMemSyncDomain`
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.preferredClusterDim' in found_struct}}
     preferredClusterDim : anon_struct4
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION that represents the
-        desired preferred cluster dimensions for the kernel. Opaque type
-        with the following fields: - ``x`` - The X dimension of the
-        preferred cluster, in blocks. Must be a divisor of the grid X
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION` that
+        represents the desired preferred cluster dimensions for the kernel.
+        Opaque type with the following fields: - ``x`` - The X dimension of
+        the preferred cluster, in blocks. Must be a divisor of the grid X
         dimension, and must be a multiple of the ``x`` field of
-        CUlaunchAttributeValue::clusterDim.    - ``y`` - The Y dimension of
-        the preferred cluster, in blocks. Must be a divisor of the grid Y
-        dimension, and must be a multiple of the ``y`` field of
-        CUlaunchAttributeValue::clusterDim.    - ``z`` - The Z dimension of
-        the preferred cluster, in blocks. Must be equal to the ``z`` field
-        of CUlaunchAttributeValue::clusterDim.
+        :py:obj:`~.CUlaunchAttributeValue.clusterDim`.    - ``y`` - The Y
+        dimension of the preferred cluster, in blocks. Must be a divisor of
+        the grid Y dimension, and must be a multiple of the ``y`` field of
+        :py:obj:`~.CUlaunchAttributeValue.clusterDim`.    - ``z`` - The Z
+        dimension of the preferred cluster, in blocks. Must be equal to the
+        ``z`` field of :py:obj:`~.CUlaunchAttributeValue.clusterDim`.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.deviceUpdatableKernelNode' in found_struct}}
     deviceUpdatableKernelNode : anon_struct5
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE. with the
-        following fields: - ``int`` deviceUpdatable - Whether or not the
-        resulting kernel node should be device-updatable.    -
-        ``CUgraphDeviceNode`` devNode - Returns a handle to pass to the
-        various device-side update functions.
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE` with
+        the following fields: - ``int`` deviceUpdatable - Whether or not
+        the resulting kernel node should be device-updatable.    -
+        :py:obj:`~.CUgraphDeviceNode` devNode - Returns a handle to pass to
+        the various device-side update functions.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.sharedMemCarveout' in found_struct}}
     sharedMemCarveout : unsigned int
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT.
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT`.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.nvlinkUtilCentricScheduling' in found_struct}}
     nvlinkUtilCentricScheduling : unsigned int
 
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.portableClusterSizeMode' in found_struct}}
-    portableClusterSizeMode : CUlaunchAttributePortableClusterMode
+    portableClusterSizeMode : :py:obj:`~.CUlaunchAttributePortableClusterMode`
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_PORTABLE_CLUSTER_SIZE_MODE.
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_PORTABLE_CLUSTER_SIZE_MODE`.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.sharedMemoryMode' in found_struct}}
-    sharedMemoryMode : CUsharedMemoryMode
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_SHARED_MEMORY_MODE.
-        See CUsharedMemoryMode for acceptable values.
+    sharedMemoryMode : :py:obj:`~.CUsharedMemoryMode`
+        Value of launch attribute
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_SHARED_MEMORY_MODE`. See
+        :py:obj:`~.CUsharedMemoryMode` for acceptable values.
     {{endif}}
 
     Methods
@@ -2079,11 +2100,11 @@ cdef class CUlaunchAttribute_st:
     Attributes
     ----------
     {{if 'CUlaunchAttribute_st.id' in found_struct}}
-    id : CUlaunchAttributeID
+    id : :py:obj:`~.CUlaunchAttributeID`
         Attribute to set
     {{endif}}
     {{if 'CUlaunchAttribute_st.value' in found_struct}}
-    value : CUlaunchAttributeValue
+    value : :py:obj:`~.CUlaunchAttributeValue`
         Value of the attribute
     {{endif}}
 
@@ -2135,16 +2156,17 @@ cdef class CUlaunchConfig_st:
         Dynamic shared-memory size per thread block in bytes
     {{endif}}
     {{if 'CUlaunchConfig_st.hStream' in found_struct}}
-    hStream : CUstream
+    hStream : :py:obj:`~.CUstream`
         Stream identifier
     {{endif}}
     {{if 'CUlaunchConfig_st.attrs' in found_struct}}
-    attrs : CUlaunchAttribute
-        List of attributes; nullable if CUlaunchConfig::numAttrs == 0
+    attrs : :py:obj:`~.CUlaunchAttribute`
+        List of attributes; nullable if :py:obj:`~.CUlaunchConfig.numAttrs`
+        == 0
     {{endif}}
     {{if 'CUlaunchConfig_st.numAttrs' in found_struct}}
     numAttrs : unsigned int
-        Number of attributes populated in CUlaunchConfig::attrs
+        Number of attributes populated in :py:obj:`~.CUlaunchConfig.attrs`
     {{endif}}
 
     Methods
@@ -2166,7 +2188,7 @@ cdef class CUlaunchConfig_st:
 
 cdef class CUexecAffinitySmCount_st:
     """
-    Value for CU_EXEC_AFFINITY_TYPE_SM_COUNT
+    Value for :py:obj:`~.CU_EXEC_AFFINITY_TYPE_SM_COUNT`
 
     Attributes
     ----------
@@ -2190,7 +2212,7 @@ cdef class anon_union3:
     Attributes
     ----------
     {{if 'CUexecAffinityParam_st.param.smCount' in found_struct}}
-    smCount : CUexecAffinitySmCount
+    smCount : :py:obj:`~.CUexecAffinitySmCount`
 
     {{endif}}
 
@@ -2213,7 +2235,7 @@ cdef class CUexecAffinityParam_st:
     Attributes
     ----------
     {{if 'CUexecAffinityParam_st.type' in found_struct}}
-    type : CUexecAffinityType
+    type : :py:obj:`~.CUexecAffinityType`
         Type of execution affinity.
     {{endif}}
     {{if 'CUexecAffinityParam_st.param' in found_struct}}
@@ -2241,7 +2263,7 @@ cdef class CUctxCigParam_st:
     Attributes
     ----------
     {{if 'CUctxCigParam_st.sharedDataType' in found_struct}}
-    sharedDataType : CUcigDataType
+    sharedDataType : :py:obj:`~.CUcigDataType`
         Type of shared data from graphics client (D3D12 or Vulkan).
     {{endif}}
     {{if 'CUctxCigParam_st.sharedData' in found_struct}}
@@ -2272,7 +2294,7 @@ cdef class CUctxCreateParams_st:
     Attributes
     ----------
     {{if 'CUctxCreateParams_st.execAffinityParams' in found_struct}}
-    execAffinityParams : CUexecAffinityParam
+    execAffinityParams : :py:obj:`~.CUexecAffinityParam`
         Array of execution affinity parameters to limit context resources
         (e.g., SM count). Only supported Volta+ MPS. Mutually exclusive
         with cigParams.
@@ -2283,7 +2305,7 @@ cdef class CUctxCreateParams_st:
         execAffinityParams is NULL.
     {{endif}}
     {{if 'CUctxCreateParams_st.cigParams' in found_struct}}
-    cigParams : CUctxCigParam
+    cigParams : :py:obj:`~.CUctxCigParam`
         CIG (CUDA in Graphics) parameters for sharing data from
         D3D12/Vulkan graphics clients. Mutually exclusive with
         execAffinityParams.
@@ -2314,7 +2336,7 @@ cdef class CUstreamCigParam_st:
     Attributes
     ----------
     {{if 'CUstreamCigParam_st.streamSharedDataType' in found_struct}}
-    streamSharedDataType : CUstreamCigDataType
+    streamSharedDataType : :py:obj:`~.CUstreamCigDataType`
         Type of shared data from graphics client (D3D12).
     {{endif}}
     {{if 'CUstreamCigParam_st.streamSharedData' in found_struct}}
@@ -2344,7 +2366,7 @@ cdef class CUstreamCigCaptureParams_st:
     Attributes
     ----------
     {{if 'CUstreamCigCaptureParams_st.streamCigParams' in found_struct}}
-    streamCigParams : CUstreamCigParam
+    streamCigParams : :py:obj:`~.CUstreamCigParam`
         CIG (CUDA in Graphics) parameters for sharing command list data
         from D3D12 graphics clients.
     {{endif}}
@@ -2415,7 +2437,7 @@ cdef class CUDA_MEMCPY2D_st:
         Source Y
     {{endif}}
     {{if 'CUDA_MEMCPY2D_st.srcMemoryType' in found_struct}}
-    srcMemoryType : CUmemorytype
+    srcMemoryType : :py:obj:`~.CUmemorytype`
         Source memory type (host, device, array)
     {{endif}}
     {{if 'CUDA_MEMCPY2D_st.srcHost' in found_struct}}
@@ -2423,11 +2445,11 @@ cdef class CUDA_MEMCPY2D_st:
         Source host pointer
     {{endif}}
     {{if 'CUDA_MEMCPY2D_st.srcDevice' in found_struct}}
-    srcDevice : CUdeviceptr
+    srcDevice : :py:obj:`~.CUdeviceptr`
         Source device pointer
     {{endif}}
     {{if 'CUDA_MEMCPY2D_st.srcArray' in found_struct}}
-    srcArray : CUarray
+    srcArray : :py:obj:`~.CUarray`
         Source array reference
     {{endif}}
     {{if 'CUDA_MEMCPY2D_st.srcPitch' in found_struct}}
@@ -2443,7 +2465,7 @@ cdef class CUDA_MEMCPY2D_st:
         Destination Y
     {{endif}}
     {{if 'CUDA_MEMCPY2D_st.dstMemoryType' in found_struct}}
-    dstMemoryType : CUmemorytype
+    dstMemoryType : :py:obj:`~.CUmemorytype`
         Destination memory type (host, device, array)
     {{endif}}
     {{if 'CUDA_MEMCPY2D_st.dstHost' in found_struct}}
@@ -2451,11 +2473,11 @@ cdef class CUDA_MEMCPY2D_st:
         Destination host pointer
     {{endif}}
     {{if 'CUDA_MEMCPY2D_st.dstDevice' in found_struct}}
-    dstDevice : CUdeviceptr
+    dstDevice : :py:obj:`~.CUdeviceptr`
         Destination device pointer
     {{endif}}
     {{if 'CUDA_MEMCPY2D_st.dstArray' in found_struct}}
-    dstArray : CUarray
+    dstArray : :py:obj:`~.CUarray`
         Destination array reference
     {{endif}}
     {{if 'CUDA_MEMCPY2D_st.dstPitch' in found_struct}}
@@ -2522,7 +2544,7 @@ cdef class CUDA_MEMCPY3D_st:
         Source LOD
     {{endif}}
     {{if 'CUDA_MEMCPY3D_st.srcMemoryType' in found_struct}}
-    srcMemoryType : CUmemorytype
+    srcMemoryType : :py:obj:`~.CUmemorytype`
         Source memory type (host, device, array)
     {{endif}}
     {{if 'CUDA_MEMCPY3D_st.srcHost' in found_struct}}
@@ -2530,11 +2552,11 @@ cdef class CUDA_MEMCPY3D_st:
         Source host pointer
     {{endif}}
     {{if 'CUDA_MEMCPY3D_st.srcDevice' in found_struct}}
-    srcDevice : CUdeviceptr
+    srcDevice : :py:obj:`~.CUdeviceptr`
         Source device pointer
     {{endif}}
     {{if 'CUDA_MEMCPY3D_st.srcArray' in found_struct}}
-    srcArray : CUarray
+    srcArray : :py:obj:`~.CUarray`
         Source array reference
     {{endif}}
     {{if 'CUDA_MEMCPY3D_st.reserved0' in found_struct}}
@@ -2566,7 +2588,7 @@ cdef class CUDA_MEMCPY3D_st:
         Destination LOD
     {{endif}}
     {{if 'CUDA_MEMCPY3D_st.dstMemoryType' in found_struct}}
-    dstMemoryType : CUmemorytype
+    dstMemoryType : :py:obj:`~.CUmemorytype`
         Destination memory type (host, device, array)
     {{endif}}
     {{if 'CUDA_MEMCPY3D_st.dstHost' in found_struct}}
@@ -2574,11 +2596,11 @@ cdef class CUDA_MEMCPY3D_st:
         Destination host pointer
     {{endif}}
     {{if 'CUDA_MEMCPY3D_st.dstDevice' in found_struct}}
-    dstDevice : CUdeviceptr
+    dstDevice : :py:obj:`~.CUdeviceptr`
         Destination device pointer
     {{endif}}
     {{if 'CUDA_MEMCPY3D_st.dstArray' in found_struct}}
-    dstArray : CUarray
+    dstArray : :py:obj:`~.CUarray`
         Destination array reference
     {{endif}}
     {{if 'CUDA_MEMCPY3D_st.reserved1' in found_struct}}
@@ -2664,7 +2686,7 @@ cdef class CUDA_MEMCPY3D_PEER_st:
         Source LOD
     {{endif}}
     {{if 'CUDA_MEMCPY3D_PEER_st.srcMemoryType' in found_struct}}
-    srcMemoryType : CUmemorytype
+    srcMemoryType : :py:obj:`~.CUmemorytype`
         Source memory type (host, device, array)
     {{endif}}
     {{if 'CUDA_MEMCPY3D_PEER_st.srcHost' in found_struct}}
@@ -2672,16 +2694,17 @@ cdef class CUDA_MEMCPY3D_PEER_st:
         Source host pointer
     {{endif}}
     {{if 'CUDA_MEMCPY3D_PEER_st.srcDevice' in found_struct}}
-    srcDevice : CUdeviceptr
+    srcDevice : :py:obj:`~.CUdeviceptr`
         Source device pointer
     {{endif}}
     {{if 'CUDA_MEMCPY3D_PEER_st.srcArray' in found_struct}}
-    srcArray : CUarray
+    srcArray : :py:obj:`~.CUarray`
         Source array reference
     {{endif}}
     {{if 'CUDA_MEMCPY3D_PEER_st.srcContext' in found_struct}}
-    srcContext : CUcontext
-        Source context (ignored with srcMemoryType is CU_MEMORYTYPE_ARRAY)
+    srcContext : :py:obj:`~.CUcontext`
+        Source context (ignored with srcMemoryType is
+        :py:obj:`~.CU_MEMORYTYPE_ARRAY`)
     {{endif}}
     {{if 'CUDA_MEMCPY3D_PEER_st.srcPitch' in found_struct}}
     srcPitch : size_t
@@ -2708,7 +2731,7 @@ cdef class CUDA_MEMCPY3D_PEER_st:
         Destination LOD
     {{endif}}
     {{if 'CUDA_MEMCPY3D_PEER_st.dstMemoryType' in found_struct}}
-    dstMemoryType : CUmemorytype
+    dstMemoryType : :py:obj:`~.CUmemorytype`
         Destination memory type (host, device, array)
     {{endif}}
     {{if 'CUDA_MEMCPY3D_PEER_st.dstHost' in found_struct}}
@@ -2716,17 +2739,17 @@ cdef class CUDA_MEMCPY3D_PEER_st:
         Destination host pointer
     {{endif}}
     {{if 'CUDA_MEMCPY3D_PEER_st.dstDevice' in found_struct}}
-    dstDevice : CUdeviceptr
+    dstDevice : :py:obj:`~.CUdeviceptr`
         Destination device pointer
     {{endif}}
     {{if 'CUDA_MEMCPY3D_PEER_st.dstArray' in found_struct}}
-    dstArray : CUarray
+    dstArray : :py:obj:`~.CUarray`
         Destination array reference
     {{endif}}
     {{if 'CUDA_MEMCPY3D_PEER_st.dstContext' in found_struct}}
-    dstContext : CUcontext
+    dstContext : :py:obj:`~.CUcontext`
         Destination context (ignored with dstMemoryType is
-        CU_MEMORYTYPE_ARRAY)
+        :py:obj:`~.CU_MEMORYTYPE_ARRAY`)
     {{endif}}
     {{if 'CUDA_MEMCPY3D_PEER_st.dstPitch' in found_struct}}
     dstPitch : size_t
@@ -2799,11 +2822,11 @@ cdef class CUDA_MEMCPY_NODE_PARAMS_st:
         Must be zero
     {{endif}}
     {{if 'CUDA_MEMCPY_NODE_PARAMS_st.copyCtx' in found_struct}}
-    copyCtx : CUcontext
+    copyCtx : :py:obj:`~.CUcontext`
         Context on which to run the node
     {{endif}}
     {{if 'CUDA_MEMCPY_NODE_PARAMS_st.copyParams' in found_struct}}
-    copyParams : CUDA_MEMCPY3D
+    copyParams : :py:obj:`~.CUDA_MEMCPY3D`
         Parameters for the memory copy
     {{endif}}
 
@@ -2838,7 +2861,7 @@ cdef class CUDA_ARRAY_DESCRIPTOR_st:
         Height of array
     {{endif}}
     {{if 'CUDA_ARRAY_DESCRIPTOR_st.Format' in found_struct}}
-    Format : CUarray_format
+    Format : :py:obj:`~.CUarray_format`
         Array format
     {{endif}}
     {{if 'CUDA_ARRAY_DESCRIPTOR_st.NumChannels' in found_struct}}
@@ -2875,7 +2898,7 @@ cdef class CUDA_ARRAY3D_DESCRIPTOR_st:
         Depth of 3D array
     {{endif}}
     {{if 'CUDA_ARRAY3D_DESCRIPTOR_st.Format' in found_struct}}
-    Format : CUarray_format
+    Format : :py:obj:`~.CUarray_format`
         Array format
     {{endif}}
     {{if 'CUDA_ARRAY3D_DESCRIPTOR_st.NumChannels' in found_struct}}
@@ -2944,7 +2967,7 @@ cdef class CUDA_ARRAY_SPARSE_PROPERTIES_st:
     {{if 'CUDA_ARRAY_SPARSE_PROPERTIES_st.flags' in found_struct}}
     flags : unsigned int
         Flags will either be zero or
-        CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL
+        :py:obj:`~.CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL`
     {{endif}}
     {{if 'CUDA_ARRAY_SPARSE_PROPERTIES_st.reserved' in found_struct}}
     reserved : list[unsigned int]
@@ -2998,7 +3021,7 @@ cdef class anon_struct7:
     Attributes
     ----------
     {{if 'CUDA_RESOURCE_DESC_st.res.array.hArray' in found_struct}}
-    hArray : CUarray
+    hArray : :py:obj:`~.CUarray`
 
     {{endif}}
 
@@ -3019,7 +3042,7 @@ cdef class anon_struct8:
     Attributes
     ----------
     {{if 'CUDA_RESOURCE_DESC_st.res.mipmap.hMipmappedArray' in found_struct}}
-    hMipmappedArray : CUmipmappedArray
+    hMipmappedArray : :py:obj:`~.CUmipmappedArray`
 
     {{endif}}
 
@@ -3040,11 +3063,11 @@ cdef class anon_struct9:
     Attributes
     ----------
     {{if 'CUDA_RESOURCE_DESC_st.res.linear.devPtr' in found_struct}}
-    devPtr : CUdeviceptr
+    devPtr : :py:obj:`~.CUdeviceptr`
 
     {{endif}}
     {{if 'CUDA_RESOURCE_DESC_st.res.linear.format' in found_struct}}
-    format : CUarray_format
+    format : :py:obj:`~.CUarray_format`
 
     {{endif}}
     {{if 'CUDA_RESOURCE_DESC_st.res.linear.numChannels' in found_struct}}
@@ -3073,11 +3096,11 @@ cdef class anon_struct10:
     Attributes
     ----------
     {{if 'CUDA_RESOURCE_DESC_st.res.pitch2D.devPtr' in found_struct}}
-    devPtr : CUdeviceptr
+    devPtr : :py:obj:`~.CUdeviceptr`
 
     {{endif}}
     {{if 'CUDA_RESOURCE_DESC_st.res.pitch2D.format' in found_struct}}
-    format : CUarray_format
+    format : :py:obj:`~.CUarray_format`
 
     {{endif}}
     {{if 'CUDA_RESOURCE_DESC_st.res.pitch2D.numChannels' in found_struct}}
@@ -3183,7 +3206,7 @@ cdef class CUDA_RESOURCE_DESC_st:
     Attributes
     ----------
     {{if 'CUDA_RESOURCE_DESC_st.resType' in found_struct}}
-    resType : CUresourcetype
+    resType : :py:obj:`~.CUresourcetype`
         Resource type
     {{endif}}
     {{if 'CUDA_RESOURCE_DESC_st.res' in found_struct}}
@@ -3215,11 +3238,11 @@ cdef class CUDA_TEXTURE_DESC_st:
     Attributes
     ----------
     {{if 'CUDA_TEXTURE_DESC_st.addressMode' in found_struct}}
-    addressMode : list[CUaddress_mode]
+    addressMode : list[:py:obj:`~.CUaddress_mode`]
         Address modes
     {{endif}}
     {{if 'CUDA_TEXTURE_DESC_st.filterMode' in found_struct}}
-    filterMode : CUfilter_mode
+    filterMode : :py:obj:`~.CUfilter_mode`
         Filter mode
     {{endif}}
     {{if 'CUDA_TEXTURE_DESC_st.flags' in found_struct}}
@@ -3231,7 +3254,7 @@ cdef class CUDA_TEXTURE_DESC_st:
         Maximum anisotropy ratio
     {{endif}}
     {{if 'CUDA_TEXTURE_DESC_st.mipmapFilterMode' in found_struct}}
-    mipmapFilterMode : CUfilter_mode
+    mipmapFilterMode : :py:obj:`~.CUfilter_mode`
         Mipmap filter mode
     {{endif}}
     {{if 'CUDA_TEXTURE_DESC_st.mipmapLevelBias' in found_struct}}
@@ -3272,7 +3295,7 @@ cdef class CUDA_RESOURCE_VIEW_DESC_st:
     Attributes
     ----------
     {{if 'CUDA_RESOURCE_VIEW_DESC_st.format' in found_struct}}
-    format : CUresourceViewFormat
+    format : :py:obj:`~.CUresourceViewFormat`
         Resource view format
     {{endif}}
     {{if 'CUDA_RESOURCE_VIEW_DESC_st.width' in found_struct}}
@@ -3326,7 +3349,7 @@ cdef class CUtensorMap_st:
     Attributes
     ----------
     {{if 'CUtensorMap_st.opaque' in found_struct}}
-    opaque : list[cuuint64_t]
+    opaque : list[:py:obj:`~.cuuint64_t`]
 
     {{endif}}
 
@@ -3372,7 +3395,7 @@ cdef class CUDA_LAUNCH_PARAMS_st:
     Attributes
     ----------
     {{if 'CUDA_LAUNCH_PARAMS_st.function' in found_struct}}
-    function : CUfunction
+    function : :py:obj:`~.CUfunction`
         Kernel to launch
     {{endif}}
     {{if 'CUDA_LAUNCH_PARAMS_st.gridDimX' in found_struct}}
@@ -3404,7 +3427,7 @@ cdef class CUDA_LAUNCH_PARAMS_st:
         Dynamic shared-memory size per thread block in bytes
     {{endif}}
     {{if 'CUDA_LAUNCH_PARAMS_st.hStream' in found_struct}}
-    hStream : CUstream
+    hStream : :py:obj:`~.CUstream`
         Stream identifier
     {{endif}}
     {{if 'CUDA_LAUNCH_PARAMS_st.kernelParams' in found_struct}}
@@ -3498,7 +3521,7 @@ cdef class CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st:
     Attributes
     ----------
     {{if 'CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st.type' in found_struct}}
-    type : CUexternalMemoryHandleType
+    type : :py:obj:`~.CUexternalMemoryHandleType`
         Type of the handle
     {{endif}}
     {{if 'CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st.handle' in found_struct}}
@@ -3511,7 +3534,8 @@ cdef class CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st:
     {{endif}}
     {{if 'CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st.flags' in found_struct}}
     flags : unsigned int
-        Flags must either be zero or CUDA_EXTERNAL_MEMORY_DEDICATED
+        Flags must either be zero or
+        :py:obj:`~.CUDA_EXTERNAL_MEMORY_DEDICATED`
     {{endif}}
     {{if 'CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st.reserved' in found_struct}}
     reserved : list[unsigned int]
@@ -3576,7 +3600,7 @@ cdef class CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st:
         chain is.
     {{endif}}
     {{if 'CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st.arrayDesc' in found_struct}}
-    arrayDesc : CUDA_ARRAY3D_DESCRIPTOR
+    arrayDesc : :py:obj:`~.CUDA_ARRAY3D_DESCRIPTOR`
         Format, dimension and type of base level of the mipmap chain
     {{endif}}
     {{if 'CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st.numLevels' in found_struct}}
@@ -3668,7 +3692,7 @@ cdef class CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st:
     Attributes
     ----------
     {{if 'CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st.type' in found_struct}}
-    type : CUexternalSemaphoreHandleType
+    type : :py:obj:`~.CUexternalSemaphoreHandleType`
         Type of the handle
     {{endif}}
     {{if 'CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st.handle' in found_struct}}
@@ -3809,14 +3833,17 @@ cdef class CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st:
     {{endif}}
     {{if 'CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st.flags' in found_struct}}
     flags : unsigned int
-        Only when CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS is used to signal a
-        CUexternalSemaphore of type
-        CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, the valid flag is
-        CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC which
-        indicates that while signaling the CUexternalSemaphore, no memory
-        synchronization operations should be performed for any external
-        memory object imported as CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF.
-        For all other types of CUexternalSemaphore, flags must be zero.
+        Only when :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS` is used
+        to signal a :py:obj:`~.CUexternalSemaphore` of type
+        :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC`, the valid
+        flag is
+        :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC`
+        which indicates that while signaling the
+        :py:obj:`~.CUexternalSemaphore`, no memory synchronization
+        operations should be performed for any external memory object
+        imported as :py:obj:`~.CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF`.
+        For all other types of :py:obj:`~.CUexternalSemaphore`, flags must
+        be zero.
     {{endif}}
     {{if 'CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st.reserved' in found_struct}}
     reserved : list[unsigned int]
@@ -3952,14 +3979,17 @@ cdef class CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st:
     {{endif}}
     {{if 'CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st.flags' in found_struct}}
     flags : unsigned int
-        Only when CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS is used to wait on a
-        CUexternalSemaphore of type
-        CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, the valid flag is
-        CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC which indicates
-        that while waiting for the CUexternalSemaphore, no memory
-        synchronization operations should be performed for any external
-        memory object imported as CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF.
-        For all other types of CUexternalSemaphore, flags must be zero.
+        Only when :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS` is used
+        to wait on a :py:obj:`~.CUexternalSemaphore` of type
+        :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC`, the valid
+        flag is
+        :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC`
+        which indicates that while waiting for the
+        :py:obj:`~.CUexternalSemaphore`, no memory synchronization
+        operations should be performed for any external memory object
+        imported as :py:obj:`~.CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF`.
+        For all other types of :py:obj:`~.CUexternalSemaphore`, flags must
+        be zero.
     {{endif}}
     {{if 'CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st.reserved' in found_struct}}
     reserved : list[unsigned int]
@@ -3986,11 +4016,11 @@ cdef class CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_st:
     Attributes
     ----------
     {{if 'CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_st.extSemArray' in found_struct}}
-    extSemArray : CUexternalSemaphore
+    extSemArray : :py:obj:`~.CUexternalSemaphore`
         Array of external semaphore handles.
     {{endif}}
     {{if 'CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_st.paramsArray' in found_struct}}
-    paramsArray : CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS
+    paramsArray : :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS`
         Array of external semaphore signal parameters.
     {{endif}}
     {{if 'CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_st.numExtSems' in found_struct}}
@@ -4024,11 +4054,11 @@ cdef class CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v2_st:
     Attributes
     ----------
     {{if 'CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v2_st.extSemArray' in found_struct}}
-    extSemArray : CUexternalSemaphore
+    extSemArray : :py:obj:`~.CUexternalSemaphore`
         Array of external semaphore handles.
     {{endif}}
     {{if 'CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v2_st.paramsArray' in found_struct}}
-    paramsArray : CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS
+    paramsArray : :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS`
         Array of external semaphore signal parameters.
     {{endif}}
     {{if 'CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v2_st.numExtSems' in found_struct}}
@@ -4062,11 +4092,11 @@ cdef class CUDA_EXT_SEM_WAIT_NODE_PARAMS_st:
     Attributes
     ----------
     {{if 'CUDA_EXT_SEM_WAIT_NODE_PARAMS_st.extSemArray' in found_struct}}
-    extSemArray : CUexternalSemaphore
+    extSemArray : :py:obj:`~.CUexternalSemaphore`
         Array of external semaphore handles.
     {{endif}}
     {{if 'CUDA_EXT_SEM_WAIT_NODE_PARAMS_st.paramsArray' in found_struct}}
-    paramsArray : CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS
+    paramsArray : :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS`
         Array of external semaphore wait parameters.
     {{endif}}
     {{if 'CUDA_EXT_SEM_WAIT_NODE_PARAMS_st.numExtSems' in found_struct}}
@@ -4100,11 +4130,11 @@ cdef class CUDA_EXT_SEM_WAIT_NODE_PARAMS_v2_st:
     Attributes
     ----------
     {{if 'CUDA_EXT_SEM_WAIT_NODE_PARAMS_v2_st.extSemArray' in found_struct}}
-    extSemArray : CUexternalSemaphore
+    extSemArray : :py:obj:`~.CUexternalSemaphore`
         Array of external semaphore handles.
     {{endif}}
     {{if 'CUDA_EXT_SEM_WAIT_NODE_PARAMS_v2_st.paramsArray' in found_struct}}
-    paramsArray : CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS
+    paramsArray : :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS`
         Array of external semaphore wait parameters.
     {{endif}}
     {{if 'CUDA_EXT_SEM_WAIT_NODE_PARAMS_v2_st.numExtSems' in found_struct}}
@@ -4136,11 +4166,11 @@ cdef class anon_union9:
     Attributes
     ----------
     {{if 'CUarrayMapInfo_st.resource.mipmap' in found_struct}}
-    mipmap : CUmipmappedArray
+    mipmap : :py:obj:`~.CUmipmappedArray`
 
     {{endif}}
     {{if 'CUarrayMapInfo_st.resource.array' in found_struct}}
-    array : CUarray
+    array : :py:obj:`~.CUarray`
 
     {{endif}}
 
@@ -4264,7 +4294,7 @@ cdef class anon_union11:
     Attributes
     ----------
     {{if 'CUarrayMapInfo_st.memHandle.memHandle' in found_struct}}
-    memHandle : CUmemGenericAllocationHandle
+    memHandle : :py:obj:`~.CUmemGenericAllocationHandle`
 
     {{endif}}
 
@@ -4288,7 +4318,7 @@ cdef class CUarrayMapInfo_st:
     Attributes
     ----------
     {{if 'CUarrayMapInfo_st.resourceType' in found_struct}}
-    resourceType : CUresourcetype
+    resourceType : :py:obj:`~.CUresourcetype`
         Resource type
     {{endif}}
     {{if 'CUarrayMapInfo_st.resource' in found_struct}}
@@ -4296,7 +4326,7 @@ cdef class CUarrayMapInfo_st:
 
     {{endif}}
     {{if 'CUarrayMapInfo_st.subresourceType' in found_struct}}
-    subresourceType : CUarraySparseSubresourceType
+    subresourceType : :py:obj:`~.CUarraySparseSubresourceType`
         Sparse subresource type
     {{endif}}
     {{if 'CUarrayMapInfo_st.subresource' in found_struct}}
@@ -4304,11 +4334,11 @@ cdef class CUarrayMapInfo_st:
 
     {{endif}}
     {{if 'CUarrayMapInfo_st.memOperationType' in found_struct}}
-    memOperationType : CUmemOperationType
+    memOperationType : :py:obj:`~.CUmemOperationType`
         Memory operation type
     {{endif}}
     {{if 'CUarrayMapInfo_st.memHandleType' in found_struct}}
-    memHandleType : CUmemHandleType
+    memHandleType : :py:obj:`~.CUmemHandleType`
         Memory handle type
     {{endif}}
     {{if 'CUarrayMapInfo_st.memHandle' in found_struct}}
@@ -4358,14 +4388,17 @@ cdef class CUmemLocation_st:
     Attributes
     ----------
     {{if 'CUmemLocation_st.type' in found_struct}}
-    type : CUmemLocationType
+    type : :py:obj:`~.CUmemLocationType`
         Specifies the location type, which modifies the meaning of id.
     {{endif}}
     {{if 'CUmemLocation_st.id' in found_struct}}
     id : int
-        Identifier for CUmemLocationType::CU_MEM_LOCATION_TYPE_DEVICE,
-        CUmemLocationType::CU_MEM_LOCATION_TYPE_HOST,
-        CUmemLocationType::CU_MEM_LOCATION_TYPE_HOST_NUMA.
+        Identifier for a location whose
+        :py:obj:`~.CUmemLocationType` is one of
+        :py:obj:`~.CU_MEM_LOCATION_TYPE_DEVICE`,
+        :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST`
+        or
+        :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST_NUMA`.
     {{endif}}
 
     Methods
@@ -4415,24 +4448,24 @@ cdef class CUmemAllocationProp_st:
     Attributes
     ----------
     {{if 'CUmemAllocationProp_st.type' in found_struct}}
-    type : CUmemAllocationType
+    type : :py:obj:`~.CUmemAllocationType`
         Allocation type
     {{endif}}
     {{if 'CUmemAllocationProp_st.requestedHandleTypes' in found_struct}}
-    requestedHandleTypes : CUmemAllocationHandleType
-        requested CUmemAllocationHandleType
+    requestedHandleTypes : :py:obj:`~.CUmemAllocationHandleType`
+        requested :py:obj:`~.CUmemAllocationHandleType`
     {{endif}}
     {{if 'CUmemAllocationProp_st.location' in found_struct}}
-    location : CUmemLocation
+    location : :py:obj:`~.CUmemLocation`
         Location of allocation
     {{endif}}
     {{if 'CUmemAllocationProp_st.win32HandleMetaData' in found_struct}}
     win32HandleMetaData : Any
         Windows-specific POBJECT_ATTRIBUTES required when
-        CU_MEM_HANDLE_TYPE_WIN32 is specified. This object attributes
-        structure includes security attributes that define the scope of
-        which exported allocations may be transferred to other processes.
-        In all other cases, this field is required to be zero.
+        :py:obj:`~.CU_MEM_HANDLE_TYPE_WIN32` is specified. This object
+        attributes structure includes security attributes that define the
+        scope of which exported allocations may be transferred to other
+        processes. In all other cases, this field is required to be zero.
     {{endif}}
     {{if 'CUmemAllocationProp_st.allocFlags' in found_struct}}
     allocFlags : anon_struct22
@@ -4476,8 +4509,8 @@ cdef class CUmulticastObjectProp_st:
     {{endif}}
     {{if 'CUmulticastObjectProp_st.handleTypes' in found_struct}}
     handleTypes : unsigned long long
-        Bitmask of exportable handle types (see CUmemAllocationHandleType)
-        for this object
+        Bitmask of exportable handle types (see
+        :py:obj:`~.CUmemAllocationHandleType`) for this object
     {{endif}}
     {{if 'CUmulticastObjectProp_st.flags' in found_struct}}
     flags : unsigned long long
@@ -4501,11 +4534,11 @@ cdef class CUmemAccessDesc_st:
     Attributes
     ----------
     {{if 'CUmemAccessDesc_st.location' in found_struct}}
-    location : CUmemLocation
+    location : :py:obj:`~.CUmemLocation`
         Location on which the request is to change it's accessibility
     {{endif}}
     {{if 'CUmemAccessDesc_st.flags' in found_struct}}
-    flags : CUmemAccess_flags
+    flags : :py:obj:`~.CUmemAccess_flags`
         ``CUmemProt`` accessibility flags to set on the request
     {{endif}}
 
@@ -4524,22 +4557,22 @@ cdef class CUmemAccessDesc_st:
 
 cdef class CUgraphExecUpdateResultInfo_st:
     """
-    Result information returned by cuGraphExecUpdate
+    Result information returned by :py:obj:`~.cuGraphExecUpdate`
 
     Attributes
     ----------
     {{if 'CUgraphExecUpdateResultInfo_st.result' in found_struct}}
-    result : CUgraphExecUpdateResult
+    result : :py:obj:`~.CUgraphExecUpdateResult`
         Gives more specific detail when a cuda graph update fails.
     {{endif}}
     {{if 'CUgraphExecUpdateResultInfo_st.errorNode' in found_struct}}
-    errorNode : CUgraphNode
+    errorNode : :py:obj:`~.CUgraphNode`
         The "to node" of the error edge when the topologies do not match.
         The error node when the error is associated with a specific node.
         NULL when the error is generic.
     {{endif}}
     {{if 'CUgraphExecUpdateResultInfo_st.errorFromNode' in found_struct}}
-    errorFromNode : CUgraphNode
+    errorFromNode : :py:obj:`~.CUgraphNode`
         The from node of error edge when the topologies do not match.
         Otherwise NULL.
     {{endif}}
@@ -4567,25 +4600,25 @@ cdef class CUmemPoolProps_st:
     Attributes
     ----------
     {{if 'CUmemPoolProps_st.allocType' in found_struct}}
-    allocType : CUmemAllocationType
+    allocType : :py:obj:`~.CUmemAllocationType`
         Allocation type. Currently must be specified as
-        CU_MEM_ALLOCATION_TYPE_PINNED
+        :py:obj:`~.CU_MEM_ALLOCATION_TYPE_PINNED`
     {{endif}}
     {{if 'CUmemPoolProps_st.handleTypes' in found_struct}}
-    handleTypes : CUmemAllocationHandleType
+    handleTypes : :py:obj:`~.CUmemAllocationHandleType`
         Handle types that will be supported by allocations from the pool.
     {{endif}}
     {{if 'CUmemPoolProps_st.location' in found_struct}}
-    location : CUmemLocation
+    location : :py:obj:`~.CUmemLocation`
         Location where allocations should reside.
     {{endif}}
     {{if 'CUmemPoolProps_st.win32SecurityAttributes' in found_struct}}
     win32SecurityAttributes : Any
         Windows-specific LPSECURITYATTRIBUTES required when
-        CU_MEM_HANDLE_TYPE_WIN32 is specified. This security attribute
-        defines the scope of which exported allocations may be transferred
-        to other processes. In all other cases, this field is required to
-        be zero.
+        :py:obj:`~.CU_MEM_HANDLE_TYPE_WIN32` is specified. This security
+        attribute defines the scope of which exported allocations may be
+        transferred to other processes. In all other cases, this field is
+        required to be zero.
     {{endif}}
     {{if 'CUmemPoolProps_st.maxSize' in found_struct}}
     maxSize : size_t
@@ -4641,28 +4674,29 @@ cdef class CUmemPoolPtrExportData_st:
 cdef class CUmemcpyAttributes_st:
     """
     Attributes specific to copies within a batch. For more details on
-    usage see cuMemcpyBatchAsync.
+    usage see :py:obj:`~.cuMemcpyBatchAsync`.
 
     Attributes
     ----------
     {{if 'CUmemcpyAttributes_st.srcAccessOrder' in found_struct}}
-    srcAccessOrder : CUmemcpySrcAccessOrder
+    srcAccessOrder : :py:obj:`~.CUmemcpySrcAccessOrder`
         Source access ordering to be observed for copies with this
         attribute.
     {{endif}}
     {{if 'CUmemcpyAttributes_st.srcLocHint' in found_struct}}
-    srcLocHint : CUmemLocation
+    srcLocHint : :py:obj:`~.CUmemLocation`
         Hint location for the source operand. Ignored when the pointers are
         not managed memory or memory allocated outside CUDA.
     {{endif}}
     {{if 'CUmemcpyAttributes_st.dstLocHint' in found_struct}}
-    dstLocHint : CUmemLocation
+    dstLocHint : :py:obj:`~.CUmemLocation`
         Hint location for the destination operand. Ignored when the
         pointers are not managed memory or memory allocated outside CUDA.
     {{endif}}
     {{if 'CUmemcpyAttributes_st.flags' in found_struct}}
     flags : unsigned int
-        Additional flags for copies with this attribute. See CUmemcpyFlags
+        Additional flags for copies with this attribute. See
+        :py:obj:`~.CUmemcpyFlags`
     {{endif}}
 
     Methods
@@ -4712,7 +4746,8 @@ cdef class CUoffset3D_st:
 
 cdef class CUextent3D_st:
     """
-    Struct representing width/height/depth of a CUarray in elements
+    Struct representing width/height/depth of a :py:obj:`~.CUarray` in
+    elements
 
     Attributes
     ----------
@@ -4744,7 +4779,7 @@ cdef class anon_struct23:
     Attributes
     ----------
     {{if 'CUmemcpy3DOperand_st.op.ptr.ptr' in found_struct}}
-    ptr : CUdeviceptr
+    ptr : :py:obj:`~.CUdeviceptr`
 
     {{endif}}
     {{if 'CUmemcpy3DOperand_st.op.ptr.rowLength' in found_struct}}
@@ -4756,7 +4791,7 @@ cdef class anon_struct23:
 
     {{endif}}
     {{if 'CUmemcpy3DOperand_st.op.ptr.locHint' in found_struct}}
-    locHint : CUmemLocation
+    locHint : :py:obj:`~.CUmemLocation`
 
     {{endif}}
 
@@ -4780,11 +4815,11 @@ cdef class anon_struct24:
     Attributes
     ----------
     {{if 'CUmemcpy3DOperand_st.op.array.array' in found_struct}}
-    array : CUarray
+    array : :py:obj:`~.CUarray`
 
     {{endif}}
     {{if 'CUmemcpy3DOperand_st.op.array.offset' in found_struct}}
-    offset : CUoffset3D
+    offset : :py:obj:`~.CUoffset3D`
 
     {{endif}}
 
@@ -4833,12 +4868,13 @@ cdef class anon_union13:
 
 cdef class CUmemcpy3DOperand_st:
     """
-    Struct representing an operand for copy with cuMemcpy3DBatchAsync
+    Struct representing an operand for copy with
+    :py:obj:`~.cuMemcpy3DBatchAsync`
 
     Attributes
     ----------
     {{if 'CUmemcpy3DOperand_st.type' in found_struct}}
-    type : CUmemcpy3DOperandType
+    type : :py:obj:`~.CUmemcpy3DOperandType`
 
     {{endif}}
     {{if 'CUmemcpy3DOperand_st.op' in found_struct}}
@@ -4864,25 +4900,26 @@ cdef class CUDA_MEMCPY3D_BATCH_OP_st:
     Attributes
     ----------
     {{if 'CUDA_MEMCPY3D_BATCH_OP_st.src' in found_struct}}
-    src : CUmemcpy3DOperand
+    src : :py:obj:`~.CUmemcpy3DOperand`
         Source memcpy operand.
     {{endif}}
     {{if 'CUDA_MEMCPY3D_BATCH_OP_st.dst' in found_struct}}
-    dst : CUmemcpy3DOperand
+    dst : :py:obj:`~.CUmemcpy3DOperand`
         Destination memcpy operand.
     {{endif}}
     {{if 'CUDA_MEMCPY3D_BATCH_OP_st.extent' in found_struct}}
-    extent : CUextent3D
+    extent : :py:obj:`~.CUextent3D`
         Extents of the memcpy between src and dst. The width, height and
         depth components must not be 0.
     {{endif}}
     {{if 'CUDA_MEMCPY3D_BATCH_OP_st.srcAccessOrder' in found_struct}}
-    srcAccessOrder : CUmemcpySrcAccessOrder
+    srcAccessOrder : :py:obj:`~.CUmemcpySrcAccessOrder`
         Source access ordering to be observed for copy from src to dst.
     {{endif}}
     {{if 'CUDA_MEMCPY3D_BATCH_OP_st.flags' in found_struct}}
     flags : unsigned int
-        Additional flags for copies with this attribute. See CUmemcpyFlags
+        Additional flags for copies with this attribute. See
+        :py:obj:`~.CUmemcpyFlags`
     {{endif}}
 
     Methods
@@ -4911,13 +4948,13 @@ cdef class CUDA_MEM_ALLOC_NODE_PARAMS_v1_st:
     Attributes
     ----------
     {{if 'CUDA_MEM_ALLOC_NODE_PARAMS_v1_st.poolProps' in found_struct}}
-    poolProps : CUmemPoolProps
+    poolProps : :py:obj:`~.CUmemPoolProps`
         in: location where the allocation should reside (specified in
-        ``location``). ``handleTypes`` must be CU_MEM_HANDLE_TYPE_NONE. IPC
-        is not supported.
+        ``location``). ``handleTypes`` must be
+        :py:obj:`~.CU_MEM_HANDLE_TYPE_NONE`. IPC is not supported.
     {{endif}}
     {{if 'CUDA_MEM_ALLOC_NODE_PARAMS_v1_st.accessDescs' in found_struct}}
-    accessDescs : CUmemAccessDesc
+    accessDescs : :py:obj:`~.CUmemAccessDesc`
         in: array of memory access descriptors. Used to describe peer GPU
         access
     {{endif}}
@@ -4931,7 +4968,7 @@ cdef class CUDA_MEM_ALLOC_NODE_PARAMS_v1_st:
         in: size in bytes of the requested allocation
     {{endif}}
     {{if 'CUDA_MEM_ALLOC_NODE_PARAMS_v1_st.dptr' in found_struct}}
-    dptr : CUdeviceptr
+    dptr : :py:obj:`~.CUdeviceptr`
         out: address of the allocation returned by CUDA
     {{endif}}
 
@@ -4962,13 +4999,13 @@ cdef class CUDA_MEM_ALLOC_NODE_PARAMS_v2_st:
     Attributes
     ----------
     {{if 'CUDA_MEM_ALLOC_NODE_PARAMS_v2_st.poolProps' in found_struct}}
-    poolProps : CUmemPoolProps
+    poolProps : :py:obj:`~.CUmemPoolProps`
         in: location where the allocation should reside (specified in
-        ``location``). ``handleTypes`` must be CU_MEM_HANDLE_TYPE_NONE. IPC
-        is not supported.
+        ``location``). ``handleTypes`` must be
+        :py:obj:`~.CU_MEM_HANDLE_TYPE_NONE`. IPC is not supported.
     {{endif}}
     {{if 'CUDA_MEM_ALLOC_NODE_PARAMS_v2_st.accessDescs' in found_struct}}
-    accessDescs : CUmemAccessDesc
+    accessDescs : :py:obj:`~.CUmemAccessDesc`
         in: array of memory access descriptors. Used to describe peer GPU
         access
     {{endif}}
@@ -4982,7 +5019,7 @@ cdef class CUDA_MEM_ALLOC_NODE_PARAMS_v2_st:
         in: size in bytes of the requested allocation
     {{endif}}
     {{if 'CUDA_MEM_ALLOC_NODE_PARAMS_v2_st.dptr' in found_struct}}
-    dptr : CUdeviceptr
+    dptr : :py:obj:`~.CUdeviceptr`
         out: address of the allocation returned by CUDA
     {{endif}}
 
@@ -5013,7 +5050,7 @@ cdef class CUDA_MEM_FREE_NODE_PARAMS_st:
     Attributes
     ----------
     {{if 'CUDA_MEM_FREE_NODE_PARAMS_st.dptr' in found_struct}}
-    dptr : CUdeviceptr
+    dptr : :py:obj:`~.CUdeviceptr`
         in: the pointer to free
     {{endif}}
 
@@ -5037,7 +5074,7 @@ cdef class CUDA_CHILD_GRAPH_NODE_PARAMS_st:
     Attributes
     ----------
     {{if 'CUDA_CHILD_GRAPH_NODE_PARAMS_st.graph' in found_struct}}
-    graph : CUgraph
+    graph : :py:obj:`~.CUgraph`
         The child graph to clone into the node for node creation, or a
         handle to the graph owned by the node for node query. The graph
         must not contain conditional nodes. Graphs containing memory
@@ -5045,7 +5082,7 @@ cdef class CUDA_CHILD_GRAPH_NODE_PARAMS_st:
         to the parent.
     {{endif}}
     {{if 'CUDA_CHILD_GRAPH_NODE_PARAMS_st.ownership' in found_struct}}
-    ownership : CUgraphChildGraphNodeOwnership
+    ownership : :py:obj:`~.CUgraphChildGraphNodeOwnership`
         The ownership relationship of the child graph node.
     {{endif}}
 
@@ -5069,7 +5106,7 @@ cdef class CUDA_EVENT_RECORD_NODE_PARAMS_st:
     Attributes
     ----------
     {{if 'CUDA_EVENT_RECORD_NODE_PARAMS_st.event' in found_struct}}
-    event : CUevent
+    event : :py:obj:`~.CUevent`
         The event to record when the node executes
     {{endif}}
 
@@ -5093,7 +5130,7 @@ cdef class CUDA_EVENT_WAIT_NODE_PARAMS_st:
     Attributes
     ----------
     {{if 'CUDA_EVENT_WAIT_NODE_PARAMS_st.event' in found_struct}}
-    event : CUevent
+    event : :py:obj:`~.CUevent`
         The event to wait on from the node
     {{endif}}
 
@@ -5112,12 +5149,12 @@ cdef class CUDA_EVENT_WAIT_NODE_PARAMS_st:
 
 cdef class CUgraphNodeParams_st:
     """
-    Graph node parameters. See cuGraphAddNode.
+    Graph node parameters. See :py:obj:`~.cuGraphAddNode`.
 
     Attributes
     ----------
     {{if 'CUgraphNodeParams_st.type' in found_struct}}
-    type : CUgraphNodeType
+    type : :py:obj:`~.CUgraphNodeType`
         Type of the node
     {{endif}}
     {{if 'CUgraphNodeParams_st.reserved0' in found_struct}}
@@ -5129,55 +5166,55 @@ cdef class CUgraphNodeParams_st:
         Padding. Unused bytes must be zero.
     {{endif}}
     {{if 'CUgraphNodeParams_st.kernel' in found_struct}}
-    kernel : CUDA_KERNEL_NODE_PARAMS_v3
+    kernel : :py:obj:`~.CUDA_KERNEL_NODE_PARAMS_v3`
         Kernel node parameters.
     {{endif}}
     {{if 'CUgraphNodeParams_st.memcpy' in found_struct}}
-    memcpy : CUDA_MEMCPY_NODE_PARAMS
+    memcpy : :py:obj:`~.CUDA_MEMCPY_NODE_PARAMS`
         Memcpy node parameters.
     {{endif}}
     {{if 'CUgraphNodeParams_st.memset' in found_struct}}
-    memset : CUDA_MEMSET_NODE_PARAMS_v2
+    memset : :py:obj:`~.CUDA_MEMSET_NODE_PARAMS_v2`
         Memset node parameters.
     {{endif}}
     {{if 'CUgraphNodeParams_st.host' in found_struct}}
-    host : CUDA_HOST_NODE_PARAMS_v2
+    host : :py:obj:`~.CUDA_HOST_NODE_PARAMS_v2`
         Host node parameters.
     {{endif}}
     {{if 'CUgraphNodeParams_st.graph' in found_struct}}
-    graph : CUDA_CHILD_GRAPH_NODE_PARAMS
+    graph : :py:obj:`~.CUDA_CHILD_GRAPH_NODE_PARAMS`
         Child graph node parameters.
     {{endif}}
     {{if 'CUgraphNodeParams_st.eventWait' in found_struct}}
-    eventWait : CUDA_EVENT_WAIT_NODE_PARAMS
+    eventWait : :py:obj:`~.CUDA_EVENT_WAIT_NODE_PARAMS`
         Event wait node parameters.
     {{endif}}
     {{if 'CUgraphNodeParams_st.eventRecord' in found_struct}}
-    eventRecord : CUDA_EVENT_RECORD_NODE_PARAMS
+    eventRecord : :py:obj:`~.CUDA_EVENT_RECORD_NODE_PARAMS`
         Event record node parameters.
     {{endif}}
     {{if 'CUgraphNodeParams_st.extSemSignal' in found_struct}}
-    extSemSignal : CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v2
+    extSemSignal : :py:obj:`~.CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v2`
         External semaphore signal node parameters.
     {{endif}}
     {{if 'CUgraphNodeParams_st.extSemWait' in found_struct}}
-    extSemWait : CUDA_EXT_SEM_WAIT_NODE_PARAMS_v2
+    extSemWait : :py:obj:`~.CUDA_EXT_SEM_WAIT_NODE_PARAMS_v2`
         External semaphore wait node parameters.
     {{endif}}
     {{if 'CUgraphNodeParams_st.alloc' in found_struct}}
-    alloc : CUDA_MEM_ALLOC_NODE_PARAMS_v2
+    alloc : :py:obj:`~.CUDA_MEM_ALLOC_NODE_PARAMS_v2`
         Memory allocation node parameters.
     {{endif}}
     {{if 'CUgraphNodeParams_st.free' in found_struct}}
-    free : CUDA_MEM_FREE_NODE_PARAMS
+    free : :py:obj:`~.CUDA_MEM_FREE_NODE_PARAMS`
         Memory free node parameters.
     {{endif}}
     {{if 'CUgraphNodeParams_st.memOp' in found_struct}}
-    memOp : CUDA_BATCH_MEM_OP_NODE_PARAMS_v2
+    memOp : :py:obj:`~.CUDA_BATCH_MEM_OP_NODE_PARAMS_v2`
         MemOp node parameters.
     {{endif}}
     {{if 'CUgraphNodeParams_st.conditional' in found_struct}}
-    conditional : CUDA_CONDITIONAL_NODE_PARAMS
+    conditional : :py:obj:`~.CUDA_CONDITIONAL_NODE_PARAMS`
         Conditional node parameters.
     {{endif}}
     {{if 'CUgraphNodeParams_st.asBytes' in found_struct}}
@@ -5254,7 +5291,7 @@ cdef class CUcheckpointLockArgs_st:
         Reserved for future use, must be zero
     {{endif}}
     {{if 'CUcheckpointLockArgs_st.reserved1' in found_struct}}
-    reserved1 : list[cuuint64_t]
+    reserved1 : list[:py:obj:`~.cuuint64_t`]
         Reserved for future use, must be zeroed
     {{endif}}
 
@@ -5275,7 +5312,7 @@ cdef class CUcheckpointCheckpointArgs_st:
     Attributes
     ----------
     {{if 'CUcheckpointCheckpointArgs_st.reserved' in found_struct}}
-    reserved : list[cuuint64_t]
+    reserved : list[:py:obj:`~.cuuint64_t`]
         Reserved for future use, must be zeroed
     {{endif}}
 
@@ -5296,11 +5333,11 @@ cdef class CUcheckpointGpuPair_st:
     Attributes
     ----------
     {{if 'CUcheckpointGpuPair_st.oldUuid' in found_struct}}
-    oldUuid : CUuuid
+    oldUuid : :py:obj:`~.CUuuid`
         UUID of the GPU that was checkpointed
     {{endif}}
     {{if 'CUcheckpointGpuPair_st.newUuid' in found_struct}}
-    newUuid : CUuuid
+    newUuid : :py:obj:`~.CUuuid`
         UUID of the GPU to restore onto
     {{endif}}
 
@@ -5327,7 +5364,7 @@ cdef class CUcheckpointRestoreArgs_st:
     Attributes
     ----------
     {{if 'CUcheckpointRestoreArgs_st.gpuPairs' in found_struct}}
-    gpuPairs : CUcheckpointGpuPair
+    gpuPairs : :py:obj:`~.CUcheckpointGpuPair`
         Pointer to array of gpu pairs that indicate how to remap GPUs
         during restore
     {{endif}}
@@ -5340,11 +5377,11 @@ cdef class CUcheckpointRestoreArgs_st:
         Reserved for future use, must be zeroed
     {{endif}}
     {{if struct_field_types.get('CUcheckpointRestoreArgs_st.reserved') == 'cuuint64_t'}}
-    reserved : list[cuuint64_t]
+    reserved : list[:py:obj:`~.cuuint64_t`]
         Reserved for future use, must be zeroed
     {{endif}}
     {{if 'CUcheckpointRestoreArgs_st.reserved1' in found_struct}}
-    reserved1 : cuuint64_t
+    reserved1 : :py:obj:`~.cuuint64_t`
         Reserved for future use, must be zeroed
     {{endif}}
 
@@ -5372,7 +5409,7 @@ cdef class CUcheckpointUnlockArgs_st:
     Attributes
     ----------
     {{if 'CUcheckpointUnlockArgs_st.reserved' in found_struct}}
-    reserved : list[cuuint64_t]
+    reserved : list[:py:obj:`~.cuuint64_t`]
         Reserved for future use, must be zeroed
     {{endif}}
 
@@ -5396,34 +5433,36 @@ cdef class CUmemDecompressParams_st:
     {{if 'CUmemDecompressParams_st.srcNumBytes' in found_struct}}
     srcNumBytes : size_t
         The number of bytes to be read and decompressed from
-        CUmemDecompressParams_st.src.
+        :py:obj:`~.CUmemDecompressParams_st`.src.
     {{endif}}
     {{if 'CUmemDecompressParams_st.dstNumBytes' in found_struct}}
     dstNumBytes : size_t
         The number of bytes that the decompression operation will be
-        expected to write to CUmemDecompressParams_st.dst. This value is
-        optional; if present, it may be used by the CUDA driver as a
-        heuristic for scheduling the individual decompression operations.
+        expected to write to :py:obj:`~.CUmemDecompressParams_st`.dst. This
+        value is optional; if present, it may be used by the CUDA driver as
+        a heuristic for scheduling the individual decompression operations.
     {{endif}}
     {{if 'CUmemDecompressParams_st.dstActBytes' in found_struct}}
-    dstActBytes : cuuint32_t
+    dstActBytes : :py:obj:`~.cuuint32_t`
         After the decompression operation has completed, the actual number
-        of bytes written to CUmemDecompressParams.dst will be recorded as a
-        32-bit unsigned integer in the memory at this address.
+        of bytes written to :py:obj:`~.CUmemDecompressParams`.dst will be
+        recorded as a 32-bit unsigned integer in the memory at this
+        address.
     {{endif}}
     {{if 'CUmemDecompressParams_st.src' in found_struct}}
     src : Any
         Pointer to a buffer of at least
-        CUmemDecompressParams_st.srcNumBytes compressed bytes.
+        :py:obj:`~.CUmemDecompressParams_st`.srcNumBytes compressed bytes.
     {{endif}}
     {{if 'CUmemDecompressParams_st.dst' in found_struct}}
     dst : Any
         Pointer to a buffer where the decompressed data will be written.
         The number of bytes written to this location will be recorded in
-        the memory pointed to by CUmemDecompressParams_st.dstActBytes
+        the memory pointed to by
+        :py:obj:`~.CUmemDecompressParams_st`.dstActBytes
     {{endif}}
     {{if 'CUmemDecompressParams_st.algo' in found_struct}}
-    algo : CUmemDecompressAlgorithm
+    algo : :py:obj:`~.CUmemDecompressAlgorithm`
         The decompression algorithm to use.
     {{endif}}
     {{if 'CUmemDecompressParams_st.padding' in found_struct}}
@@ -5473,7 +5512,7 @@ cdef class anon_struct25:
     Attributes
     ----------
     {{if 'CUlogicalEndpointProp_struct.unicast.device' in found_struct}}
-    device : CUdevice
+    device : :py:obj:`~.CUdevice`
 
     {{endif}}
 
@@ -5514,8 +5553,9 @@ cdef class CUlogicalEndpointProp_struct:
     Attributes
     ----------
     {{if 'CUlogicalEndpointProp_struct.type' in found_struct}}
-    type : CUlogicalEndpointType
-        Type of the logical endpoint defined in CUlogicalEndpointType
+    type : :py:obj:`~.CUlogicalEndpointType`
+        Type of the logical endpoint defined in
+        :py:obj:`~.CUlogicalEndpointType`
     {{endif}}
     {{if 'CUlogicalEndpointProp_struct.unicast' in found_struct}}
     unicast : anon_struct25
@@ -5532,11 +5572,11 @@ cdef class CUlogicalEndpointProp_struct:
     {{if 'CUlogicalEndpointProp_struct.ipcHandleTypes' in found_struct}}
     ipcHandleTypes : unsigned int
         A bitmask of IPC handle types defined in
-        CUlogicalEndpointIpcHandleType
+        :py:obj:`~.CUlogicalEndpointIpcHandleType`
     {{endif}}
     {{if 'CUlogicalEndpointProp_struct.flags' in found_struct}}
     flags : unsigned int
-        A bitmask of flags defined in CUlogicalEndpointFlag
+        A bitmask of flags defined in :py:obj:`~.CUlogicalEndpointFlag`
     {{endif}}
 
     Methods
@@ -5578,7 +5618,7 @@ cdef class CUdevSmResource_st:
     {{if 'CUdevSmResource_st.flags' in found_struct}}
     flags : unsigned int
         The flags set on this SM resource. For possible values see
-        CUdevSmResourceGroup_flags.
+        :py:obj:`~.CUdevSmResourceGroup_flags`.
     {{endif}}
 
     Methods
@@ -5596,7 +5636,7 @@ cdef class CUdevWorkqueueConfigResource_st:
     Attributes
     ----------
     {{if 'CUdevWorkqueueConfigResource_st.device' in found_struct}}
-    device : CUdevice
+    device : :py:obj:`~.CUdevice`
         The device on which the workqueue resources are available
     {{endif}}
     {{if 'CUdevWorkqueueConfigResource_st.wqConcurrencyLimit' in found_struct}}
@@ -5604,7 +5644,7 @@ cdef class CUdevWorkqueueConfigResource_st:
         The expected maximum number of concurrent stream-ordered workloads
     {{endif}}
     {{if 'CUdevWorkqueueConfigResource_st.sharingScope' in found_struct}}
-    sharingScope : CUdevWorkqueueConfigScope
+    sharingScope : :py:obj:`~.CUdevWorkqueueConfigScope`
         The sharing scope for the workqueue resources
     {{endif}}
 
@@ -5661,7 +5701,7 @@ cdef class CU_DEV_SM_RESOURCE_GROUP_PARAMS_st:
     {{if 'CU_DEV_SM_RESOURCE_GROUP_PARAMS_st.flags' in found_struct}}
     flags : unsigned int
         The flags set on this SM resource group. For possible values see
-        CUdevSmResourceGroup_flags.
+        :py:obj:`~.CUdevSmResourceGroup_flags`.
     {{endif}}
     {{if 'CU_DEV_SM_RESOURCE_GROUP_PARAMS_st.reserved' in found_struct}}
     reserved : list[unsigned int]
@@ -5683,7 +5723,7 @@ cdef class CUdevResource_st:
     Attributes
     ----------
     {{if 'CUdevResource_st.type' in found_struct}}
-    type : CUdevResourceType
+    type : :py:obj:`~.CUdevResourceType`
         Type of resource, dictates which union field was last set
     {{endif}}
     {{if 'CUdevResource_st._internal_padding' in found_struct}}
@@ -5691,25 +5731,26 @@ cdef class CUdevResource_st:
 
     {{endif}}
     {{if 'CUdevResource_st.sm' in found_struct}}
-    sm : CUdevSmResource
-        Resource corresponding to CU_DEV_RESOURCE_TYPE_SM ``typename``.
+    sm : :py:obj:`~.CUdevSmResource`
+        Resource corresponding to :py:obj:`~.CU_DEV_RESOURCE_TYPE_SM`
+        ``typename``.
     {{endif}}
     {{if 'CUdevResource_st.wqConfig' in found_struct}}
-    wqConfig : CUdevWorkqueueConfigResource
-        Resource corresponding to CU_DEV_RESOURCE_TYPE_WORKQUEUE_CONFIG
-        ``typename``.
+    wqConfig : :py:obj:`~.CUdevWorkqueueConfigResource`
+        Resource corresponding to
+        :py:obj:`~.CU_DEV_RESOURCE_TYPE_WORKQUEUE_CONFIG` ``typename``.
     {{endif}}
     {{if 'CUdevResource_st.wq' in found_struct}}
-    wq : CUdevWorkqueueResource
-        Resource corresponding to CU_DEV_RESOURCE_TYPE_WORKQUEUE
-        ``typename``.
+    wq : :py:obj:`~.CUdevWorkqueueResource`
+        Resource corresponding to
+        :py:obj:`~.CU_DEV_RESOURCE_TYPE_WORKQUEUE` ``typename``.
     {{endif}}
     {{if 'CUdevResource_st._oversize' in found_struct}}
     _oversize : bytes
 
     {{endif}}
     {{if 'CUdevResource_st.nextResource' in found_struct}}
-    nextResource : CUdevResource_st
+    nextResource : :py:obj:`~.CUdevResource_st`
 
     {{endif}}
 
@@ -5741,7 +5782,7 @@ cdef class anon_union17:
     Attributes
     ----------
     {{if True}}
-    pArray : list[CUarray]
+    pArray : list[:py:obj:`~.CUarray`]
 
     {{endif}}
     {{if True}}
@@ -5795,15 +5836,15 @@ cdef class CUeglFrame_st:
         Number of channels for the plane
     {{endif}}
     {{if True}}
-    frameType : CUeglFrameType
+    frameType : :py:obj:`~.CUeglFrameType`
         Array or Pitch
     {{endif}}
     {{if True}}
-    eglColorFormat : CUeglColorFormat
+    eglColorFormat : :py:obj:`~.CUeglColorFormat`
         CUDA EGL Color Format
     {{endif}}
     {{if True}}
-    cuFormat : CUarray_format
+    cuFormat : :py:obj:`~.CUarray_format`
         CUDA Array Format
     {{endif}}
 
@@ -5823,7 +5864,7 @@ cdef class CUeglFrame_st:
 cdef class CUdeviceptr:
     """
 
-    CUDA device pointer CUdeviceptr is defined as an unsigned integer type whose size matches the size of a pointer on the target platform.
+    CUDA device pointer :py:obj:`~.CUdeviceptr` is defined as an unsigned integer type whose size matches the size of a pointer on the target platform.
 
     Methods
     -------
@@ -6044,39 +6085,40 @@ cdef class CUipcMemHandle(CUipcMemHandle_v1):
 
 cdef class CUstreamBatchMemOpParams_v1(CUstreamBatchMemOpParams_union):
     """
-    Per-operation parameters for cuStreamBatchMemOp
+    Per-operation parameters for :py:obj:`~.cuStreamBatchMemOp`
 
     Attributes
     ----------
     {{if 'CUstreamBatchMemOpParams_union.operation' in found_struct}}
-    operation : CUstreamBatchMemOpType
+    operation : :py:obj:`~.CUstreamBatchMemOpType`
         Operation. This is the first field of all the union elemets and
         acts as a TAG to determine which union member is valid.
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.waitValue' in found_struct}}
-    waitValue : CUstreamMemOpWaitValueParams_st
-        Params for CU_STREAM_MEM_OP_WAIT_VALUE_32 and
-        CU_STREAM_MEM_OP_WAIT_VALUE_64 operations.
+    waitValue : :py:obj:`~.CUstreamMemOpWaitValueParams_st`
+        Params for :py:obj:`~.CU_STREAM_MEM_OP_WAIT_VALUE_32` and
+        :py:obj:`~.CU_STREAM_MEM_OP_WAIT_VALUE_64` operations.
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.writeValue' in found_struct}}
-    writeValue : CUstreamMemOpWriteValueParams_st
-        Params for CU_STREAM_MEM_OP_WRITE_VALUE_32 and
-        CU_STREAM_MEM_OP_WRITE_VALUE_64 operations.
+    writeValue : :py:obj:`~.CUstreamMemOpWriteValueParams_st`
+        Params for :py:obj:`~.CU_STREAM_MEM_OP_WRITE_VALUE_32` and
+        :py:obj:`~.CU_STREAM_MEM_OP_WRITE_VALUE_64` operations.
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.flushRemoteWrites' in found_struct}}
-    flushRemoteWrites : CUstreamMemOpFlushRemoteWritesParams_st
-        Params for CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES operations.
+    flushRemoteWrites : :py:obj:`~.CUstreamMemOpFlushRemoteWritesParams_st`
+        Params for :py:obj:`~.CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES`
+        operations.
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.memoryBarrier' in found_struct}}
-    memoryBarrier : CUstreamMemOpMemoryBarrierParams_st
-        Params for CU_STREAM_MEM_OP_BARRIER operations.
+    memoryBarrier : :py:obj:`~.CUstreamMemOpMemoryBarrierParams_st`
+        Params for :py:obj:`~.CU_STREAM_MEM_OP_BARRIER` operations.
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.atomicReduction' in found_struct}}
-    atomicReduction : CUstreamMemOpAtomicReductionParams_st
+    atomicReduction : :py:obj:`~.CUstreamMemOpAtomicReductionParams_st`
 
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.pad' in found_struct}}
-    pad : list[cuuint64_t]
+    pad : list[:py:obj:`~.cuuint64_t`]
 
     {{endif}}
 
@@ -6091,39 +6133,40 @@ cdef class CUstreamBatchMemOpParams_v1(CUstreamBatchMemOpParams_union):
 
 cdef class CUstreamBatchMemOpParams(CUstreamBatchMemOpParams_v1):
     """
-    Per-operation parameters for cuStreamBatchMemOp
+    Per-operation parameters for :py:obj:`~.cuStreamBatchMemOp`
 
     Attributes
     ----------
     {{if 'CUstreamBatchMemOpParams_union.operation' in found_struct}}
-    operation : CUstreamBatchMemOpType
+    operation : :py:obj:`~.CUstreamBatchMemOpType`
         Operation. This is the first field of all the union elemets and
         acts as a TAG to determine which union member is valid.
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.waitValue' in found_struct}}
-    waitValue : CUstreamMemOpWaitValueParams_st
-        Params for CU_STREAM_MEM_OP_WAIT_VALUE_32 and
-        CU_STREAM_MEM_OP_WAIT_VALUE_64 operations.
+    waitValue : :py:obj:`~.CUstreamMemOpWaitValueParams_st`
+        Params for :py:obj:`~.CU_STREAM_MEM_OP_WAIT_VALUE_32` and
+        :py:obj:`~.CU_STREAM_MEM_OP_WAIT_VALUE_64` operations.
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.writeValue' in found_struct}}
-    writeValue : CUstreamMemOpWriteValueParams_st
-        Params for CU_STREAM_MEM_OP_WRITE_VALUE_32 and
-        CU_STREAM_MEM_OP_WRITE_VALUE_64 operations.
+    writeValue : :py:obj:`~.CUstreamMemOpWriteValueParams_st`
+        Params for :py:obj:`~.CU_STREAM_MEM_OP_WRITE_VALUE_32` and
+        :py:obj:`~.CU_STREAM_MEM_OP_WRITE_VALUE_64` operations.
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.flushRemoteWrites' in found_struct}}
-    flushRemoteWrites : CUstreamMemOpFlushRemoteWritesParams_st
-        Params for CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES operations.
+    flushRemoteWrites : :py:obj:`~.CUstreamMemOpFlushRemoteWritesParams_st`
+        Params for :py:obj:`~.CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES`
+        operations.
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.memoryBarrier' in found_struct}}
-    memoryBarrier : CUstreamMemOpMemoryBarrierParams_st
-        Params for CU_STREAM_MEM_OP_BARRIER operations.
+    memoryBarrier : :py:obj:`~.CUstreamMemOpMemoryBarrierParams_st`
+        Params for :py:obj:`~.CU_STREAM_MEM_OP_BARRIER` operations.
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.atomicReduction' in found_struct}}
-    atomicReduction : CUstreamMemOpAtomicReductionParams_st
+    atomicReduction : :py:obj:`~.CUstreamMemOpAtomicReductionParams_st`
 
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.pad' in found_struct}}
-    pad : list[cuuint64_t]
+    pad : list[:py:obj:`~.cuuint64_t`]
 
     {{endif}}
 
@@ -6139,12 +6182,13 @@ cdef class CUstreamBatchMemOpParams(CUstreamBatchMemOpParams_v1):
 cdef class CUDA_BATCH_MEM_OP_NODE_PARAMS_v1(CUDA_BATCH_MEM_OP_NODE_PARAMS_v1_st):
     """
     Batch memory operation node parameters  Used in the legacy
-    cuGraphAddBatchMemOpNode api. New code should use cuGraphAddNode()
+    :func:`~.cuGraphAddBatchMemOpNode` api. New code should use
+    :func:`~.cuGraphAddNode`
 
     Attributes
     ----------
     {{if 'CUDA_BATCH_MEM_OP_NODE_PARAMS_v1_st.ctx' in found_struct}}
-    ctx : CUcontext
+    ctx : :py:obj:`~.CUcontext`
 
     {{endif}}
     {{if 'CUDA_BATCH_MEM_OP_NODE_PARAMS_v1_st.count' in found_struct}}
@@ -6152,7 +6196,7 @@ cdef class CUDA_BATCH_MEM_OP_NODE_PARAMS_v1(CUDA_BATCH_MEM_OP_NODE_PARAMS_v1_st)
 
     {{endif}}
     {{if 'CUDA_BATCH_MEM_OP_NODE_PARAMS_v1_st.paramArray' in found_struct}}
-    paramArray : CUstreamBatchMemOpParams
+    paramArray : :py:obj:`~.CUstreamBatchMemOpParams`
 
     {{endif}}
     {{if 'CUDA_BATCH_MEM_OP_NODE_PARAMS_v1_st.flags' in found_struct}}
@@ -6172,12 +6216,13 @@ cdef class CUDA_BATCH_MEM_OP_NODE_PARAMS_v1(CUDA_BATCH_MEM_OP_NODE_PARAMS_v1_st)
 cdef class CUDA_BATCH_MEM_OP_NODE_PARAMS(CUDA_BATCH_MEM_OP_NODE_PARAMS_v1):
     """
     Batch memory operation node parameters  Used in the legacy
-    cuGraphAddBatchMemOpNode api. New code should use cuGraphAddNode()
+    :func:`~.cuGraphAddBatchMemOpNode` api. New code should use
+    :func:`~.cuGraphAddNode`
 
     Attributes
     ----------
     {{if 'CUDA_BATCH_MEM_OP_NODE_PARAMS_v1_st.ctx' in found_struct}}
-    ctx : CUcontext
+    ctx : :py:obj:`~.CUcontext`
 
     {{endif}}
     {{if 'CUDA_BATCH_MEM_OP_NODE_PARAMS_v1_st.count' in found_struct}}
@@ -6185,7 +6230,7 @@ cdef class CUDA_BATCH_MEM_OP_NODE_PARAMS(CUDA_BATCH_MEM_OP_NODE_PARAMS_v1):
 
     {{endif}}
     {{if 'CUDA_BATCH_MEM_OP_NODE_PARAMS_v1_st.paramArray' in found_struct}}
-    paramArray : CUstreamBatchMemOpParams
+    paramArray : :py:obj:`~.CUstreamBatchMemOpParams`
 
     {{endif}}
     {{if 'CUDA_BATCH_MEM_OP_NODE_PARAMS_v1_st.flags' in found_struct}}
@@ -6209,7 +6254,7 @@ cdef class CUDA_BATCH_MEM_OP_NODE_PARAMS_v2(CUDA_BATCH_MEM_OP_NODE_PARAMS_v2_st)
     Attributes
     ----------
     {{if 'CUDA_BATCH_MEM_OP_NODE_PARAMS_v2_st.ctx' in found_struct}}
-    ctx : CUcontext
+    ctx : :py:obj:`~.CUcontext`
         Context to use for the operations.
     {{endif}}
     {{if 'CUDA_BATCH_MEM_OP_NODE_PARAMS_v2_st.count' in found_struct}}
@@ -6217,7 +6262,7 @@ cdef class CUDA_BATCH_MEM_OP_NODE_PARAMS_v2(CUDA_BATCH_MEM_OP_NODE_PARAMS_v2_st)
         Number of operations in paramArray.
     {{endif}}
     {{if 'CUDA_BATCH_MEM_OP_NODE_PARAMS_v2_st.paramArray' in found_struct}}
-    paramArray : CUstreamBatchMemOpParams
+    paramArray : :py:obj:`~.CUstreamBatchMemOpParams`
         Array of batch memory operations.
     {{endif}}
     {{if 'CUDA_BATCH_MEM_OP_NODE_PARAMS_v2_st.flags' in found_struct}}
@@ -6241,7 +6286,7 @@ cdef class CUasyncNotificationInfo(CUasyncNotificationInfo_st):
     Attributes
     ----------
     {{if 'CUasyncNotificationInfo_st.type' in found_struct}}
-    type : CUasyncNotificationType
+    type : :py:obj:`~.CUasyncNotificationType`
         The type of notification being sent
     {{endif}}
     {{if 'CUasyncNotificationInfo_st.info' in found_struct}}
@@ -6376,13 +6421,13 @@ cdef class CUaccessPolicyWindow_v1(CUaccessPolicyWindow_st):
     Specifies an access policy for a window, a contiguous extent of
     memory beginning at base_ptr and ending at base_ptr + num_bytes.
     num_bytes is limited by
-    CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE. Partition into
-    many segments and assign segments such that: sum of "hit segments"
-    / window == approx. ratio. sum of "miss segments" / window ==
-    approx 1-ratio. Segments and ratio specifications are fitted to the
-    capabilities of the architecture. Accesses in a hit segment apply
-    the hitProp access policy. Accesses in a miss segment apply the
-    missProp access policy.
+    :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE`.
+    Partition into many segments and assign segments such that: sum of
+    "hit segments" / window == approx. ratio. sum of "miss segments" /
+    window == approx 1-ratio. Segments and ratio specifications are
+    fitted to the capabilities of the architecture. Accesses in a hit
+    segment apply the hitProp access policy. Accesses in a miss segment
+    apply the missProp access policy.
 
     Attributes
     ----------
@@ -6402,12 +6447,13 @@ cdef class CUaccessPolicyWindow_v1(CUaccessPolicyWindow_st):
         assigned missProp.
     {{endif}}
     {{if 'CUaccessPolicyWindow_st.hitProp' in found_struct}}
-    hitProp : CUaccessProperty
-        CUaccessProperty set for hit.
+    hitProp : :py:obj:`~.CUaccessProperty`
+        :py:obj:`~.CUaccessProperty` set for hit.
     {{endif}}
     {{if 'CUaccessPolicyWindow_st.missProp' in found_struct}}
-    missProp : CUaccessProperty
-        CUaccessProperty set for miss. Must be either NORMAL or STREAMING
+    missProp : :py:obj:`~.CUaccessProperty`
+        :py:obj:`~.CUaccessProperty` set for miss. Must be either NORMAL or
+        STREAMING
     {{endif}}
 
     Methods
@@ -6424,13 +6470,13 @@ cdef class CUaccessPolicyWindow(CUaccessPolicyWindow_v1):
     Specifies an access policy for a window, a contiguous extent of
     memory beginning at base_ptr and ending at base_ptr + num_bytes.
     num_bytes is limited by
-    CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE. Partition into
-    many segments and assign segments such that: sum of "hit segments"
-    / window == approx. ratio. sum of "miss segments" / window ==
-    approx 1-ratio. Segments and ratio specifications are fitted to the
-    capabilities of the architecture. Accesses in a hit segment apply
-    the hitProp access policy. Accesses in a miss segment apply the
-    missProp access policy.
+    :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE`.
+    Partition into many segments and assign segments such that: sum of
+    "hit segments" / window == approx. ratio. sum of "miss segments" /
+    window == approx 1-ratio. Segments and ratio specifications are
+    fitted to the capabilities of the architecture. Accesses in a hit
+    segment apply the hitProp access policy. Accesses in a miss segment
+    apply the missProp access policy.
 
     Attributes
     ----------
@@ -6450,12 +6496,13 @@ cdef class CUaccessPolicyWindow(CUaccessPolicyWindow_v1):
         assigned missProp.
     {{endif}}
     {{if 'CUaccessPolicyWindow_st.hitProp' in found_struct}}
-    hitProp : CUaccessProperty
-        CUaccessProperty set for hit.
+    hitProp : :py:obj:`~.CUaccessProperty`
+        :py:obj:`~.CUaccessProperty` set for hit.
     {{endif}}
     {{if 'CUaccessPolicyWindow_st.missProp' in found_struct}}
-    missProp : CUaccessProperty
-        CUaccessProperty set for miss. Must be either NORMAL or STREAMING
+    missProp : :py:obj:`~.CUaccessProperty`
+        :py:obj:`~.CUaccessProperty` set for miss. Must be either NORMAL or
+        STREAMING
     {{endif}}
 
     Methods
@@ -6474,7 +6521,7 @@ cdef class CUDA_KERNEL_NODE_PARAMS_v1(CUDA_KERNEL_NODE_PARAMS_st):
     Attributes
     ----------
     {{if 'CUDA_KERNEL_NODE_PARAMS_st.func' in found_struct}}
-    func : CUfunction
+    func : :py:obj:`~.CUfunction`
         Kernel to launch
     {{endif}}
     {{if 'CUDA_KERNEL_NODE_PARAMS_st.gridDimX' in found_struct}}
@@ -6530,7 +6577,7 @@ cdef class CUDA_KERNEL_NODE_PARAMS_v2(CUDA_KERNEL_NODE_PARAMS_v2_st):
     Attributes
     ----------
     {{if 'CUDA_KERNEL_NODE_PARAMS_v2_st.func' in found_struct}}
-    func : CUfunction
+    func : :py:obj:`~.CUfunction`
         Kernel to launch
     {{endif}}
     {{if 'CUDA_KERNEL_NODE_PARAMS_v2_st.gridDimX' in found_struct}}
@@ -6570,11 +6617,11 @@ cdef class CUDA_KERNEL_NODE_PARAMS_v2(CUDA_KERNEL_NODE_PARAMS_v2_st):
         Extra options
     {{endif}}
     {{if 'CUDA_KERNEL_NODE_PARAMS_v2_st.kern' in found_struct}}
-    kern : CUkernel
+    kern : :py:obj:`~.CUkernel`
         Kernel to launch, will only be referenced if func is NULL
     {{endif}}
     {{if 'CUDA_KERNEL_NODE_PARAMS_v2_st.ctx' in found_struct}}
-    ctx : CUcontext
+    ctx : :py:obj:`~.CUcontext`
         Context for the kernel task to run in. The value NULL will indicate
         the current context should be used by the api. This field is
         ignored if func is set.
@@ -6596,7 +6643,7 @@ cdef class CUDA_KERNEL_NODE_PARAMS(CUDA_KERNEL_NODE_PARAMS_v2):
     Attributes
     ----------
     {{if 'CUDA_KERNEL_NODE_PARAMS_v2_st.func' in found_struct}}
-    func : CUfunction
+    func : :py:obj:`~.CUfunction`
         Kernel to launch
     {{endif}}
     {{if 'CUDA_KERNEL_NODE_PARAMS_v2_st.gridDimX' in found_struct}}
@@ -6636,11 +6683,11 @@ cdef class CUDA_KERNEL_NODE_PARAMS(CUDA_KERNEL_NODE_PARAMS_v2):
         Extra options
     {{endif}}
     {{if 'CUDA_KERNEL_NODE_PARAMS_v2_st.kern' in found_struct}}
-    kern : CUkernel
+    kern : :py:obj:`~.CUkernel`
         Kernel to launch, will only be referenced if func is NULL
     {{endif}}
     {{if 'CUDA_KERNEL_NODE_PARAMS_v2_st.ctx' in found_struct}}
-    ctx : CUcontext
+    ctx : :py:obj:`~.CUcontext`
         Context for the kernel task to run in. The value NULL will indicate
         the current context should be used by the api. This field is
         ignored if func is set.
@@ -6662,7 +6709,7 @@ cdef class CUDA_KERNEL_NODE_PARAMS_v3(CUDA_KERNEL_NODE_PARAMS_v3_st):
     Attributes
     ----------
     {{if 'CUDA_KERNEL_NODE_PARAMS_v3_st.func' in found_struct}}
-    func : CUfunction
+    func : :py:obj:`~.CUfunction`
         Kernel to launch
     {{endif}}
     {{if 'CUDA_KERNEL_NODE_PARAMS_v3_st.gridDimX' in found_struct}}
@@ -6702,11 +6749,11 @@ cdef class CUDA_KERNEL_NODE_PARAMS_v3(CUDA_KERNEL_NODE_PARAMS_v3_st):
         Extra options
     {{endif}}
     {{if 'CUDA_KERNEL_NODE_PARAMS_v3_st.kern' in found_struct}}
-    kern : CUkernel
+    kern : :py:obj:`~.CUkernel`
         Kernel to launch, will only be referenced if func is NULL
     {{endif}}
     {{if 'CUDA_KERNEL_NODE_PARAMS_v3_st.ctx' in found_struct}}
-    ctx : CUcontext
+    ctx : :py:obj:`~.CUcontext`
         Context for the kernel task to run in. The value NULL will indicate
         the current context should be used by the api. This field is
         ignored if func is set.
@@ -6728,7 +6775,7 @@ cdef class CUDA_MEMSET_NODE_PARAMS_v1(CUDA_MEMSET_NODE_PARAMS_st):
     Attributes
     ----------
     {{if 'CUDA_MEMSET_NODE_PARAMS_st.dst' in found_struct}}
-    dst : CUdeviceptr
+    dst : :py:obj:`~.CUdeviceptr`
         Destination device pointer
     {{endif}}
     {{if 'CUDA_MEMSET_NODE_PARAMS_st.pitch' in found_struct}}
@@ -6768,7 +6815,7 @@ cdef class CUDA_MEMSET_NODE_PARAMS(CUDA_MEMSET_NODE_PARAMS_v1):
     Attributes
     ----------
     {{if 'CUDA_MEMSET_NODE_PARAMS_st.dst' in found_struct}}
-    dst : CUdeviceptr
+    dst : :py:obj:`~.CUdeviceptr`
         Destination device pointer
     {{endif}}
     {{if 'CUDA_MEMSET_NODE_PARAMS_st.pitch' in found_struct}}
@@ -6808,7 +6855,7 @@ cdef class CUDA_MEMSET_NODE_PARAMS_v2(CUDA_MEMSET_NODE_PARAMS_v2_st):
     Attributes
     ----------
     {{if 'CUDA_MEMSET_NODE_PARAMS_v2_st.dst' in found_struct}}
-    dst : CUdeviceptr
+    dst : :py:obj:`~.CUdeviceptr`
         Destination device pointer
     {{endif}}
     {{if 'CUDA_MEMSET_NODE_PARAMS_v2_st.pitch' in found_struct}}
@@ -6832,7 +6879,7 @@ cdef class CUDA_MEMSET_NODE_PARAMS_v2(CUDA_MEMSET_NODE_PARAMS_v2_st):
         Number of rows
     {{endif}}
     {{if 'CUDA_MEMSET_NODE_PARAMS_v2_st.ctx' in found_struct}}
-    ctx : CUcontext
+    ctx : :py:obj:`~.CUcontext`
         Context on which to run the node
     {{endif}}
 
@@ -6852,7 +6899,7 @@ cdef class CUDA_HOST_NODE_PARAMS_v1(CUDA_HOST_NODE_PARAMS_st):
     Attributes
     ----------
     {{if 'CUDA_HOST_NODE_PARAMS_st.fn' in found_struct}}
-    fn : CUhostFn
+    fn : :py:obj:`~.CUhostFn`
         The function to call when the node executes
     {{endif}}
     {{if 'CUDA_HOST_NODE_PARAMS_st.userData' in found_struct}}
@@ -6876,7 +6923,7 @@ cdef class CUDA_HOST_NODE_PARAMS(CUDA_HOST_NODE_PARAMS_v1):
     Attributes
     ----------
     {{if 'CUDA_HOST_NODE_PARAMS_st.fn' in found_struct}}
-    fn : CUhostFn
+    fn : :py:obj:`~.CUhostFn`
         The function to call when the node executes
     {{endif}}
     {{if 'CUDA_HOST_NODE_PARAMS_st.userData' in found_struct}}
@@ -6900,7 +6947,7 @@ cdef class CUDA_HOST_NODE_PARAMS_v2(CUDA_HOST_NODE_PARAMS_v2_st):
     Attributes
     ----------
     {{if 'CUDA_HOST_NODE_PARAMS_v2_st.fn' in found_struct}}
-    fn : CUhostFn
+    fn : :py:obj:`~.CUhostFn`
         The function to call when the node executes
     {{endif}}
     {{if 'CUDA_HOST_NODE_PARAMS_v2_st.userData' in found_struct}}
@@ -6925,8 +6972,8 @@ cdef class CUgraphEdgeData(CUgraphEdgeData_st):
     """
     Optional annotation for edges in a CUDA graph. Note, all edges
     implicitly have annotations and default to a zero-initialized value
-    if not specified. A zero-initialized struct indicates a standard
-    full serialization of two nodes with memory visibility.
+    if not specified. A zero-initialized struct indicates a standard
+    full serialization of two nodes with memory visibility.
 
     Attributes
     ----------
@@ -6938,9 +6985,9 @@ cdef class CUgraphEdgeData(CUgraphEdgeData_st):
         memory visibility to the downstream node or portion thereof
         (indicated by ``to_port``).   Only kernel nodes define non-zero
         ports. A kernel node can use the following output port types:
-        CU_GRAPH_KERNEL_NODE_PORT_DEFAULT,
-        CU_GRAPH_KERNEL_NODE_PORT_PROGRAMMATIC, or
-        CU_GRAPH_KERNEL_NODE_PORT_LAUNCH_ORDER.
+        :py:obj:`~.CU_GRAPH_KERNEL_NODE_PORT_DEFAULT`,
+        :py:obj:`~.CU_GRAPH_KERNEL_NODE_PORT_PROGRAMMATIC`, or
+        :py:obj:`~.CU_GRAPH_KERNEL_NODE_PORT_LAUNCH_ORDER`.
     {{endif}}
     {{if 'CUgraphEdgeData_st.to_port' in found_struct}}
     to_port : bytes
@@ -6953,9 +7000,10 @@ cdef class CUgraphEdgeData(CUgraphEdgeData_st):
     {{endif}}
     {{if 'CUgraphEdgeData_st.type' in found_struct}}
     type : bytes
-        This should be populated with a value from CUgraphDependencyType.
-        (It is typed as char due to compiler-specific layout of bitfields.)
-        See CUgraphDependencyType.
+        This should be populated with a value from
+        :py:obj:`~.CUgraphDependencyType`. (It is typed as char due to
+        compiler-specific layout of bitfields.) See
+        :py:obj:`~.CUgraphDependencyType`.
     {{endif}}
     {{if 'CUgraphEdgeData_st.reserved' in found_struct}}
     reserved : bytes
@@ -6979,19 +7027,19 @@ cdef class CUDA_GRAPH_INSTANTIATE_PARAMS(CUDA_GRAPH_INSTANTIATE_PARAMS_st):
     Attributes
     ----------
     {{if 'CUDA_GRAPH_INSTANTIATE_PARAMS_st.flags' in found_struct}}
-    flags : cuuint64_t
+    flags : :py:obj:`~.cuuint64_t`
         Instantiation flags
     {{endif}}
     {{if 'CUDA_GRAPH_INSTANTIATE_PARAMS_st.hUploadStream' in found_struct}}
-    hUploadStream : CUstream
+    hUploadStream : :py:obj:`~.CUstream`
         Upload stream
     {{endif}}
     {{if 'CUDA_GRAPH_INSTANTIATE_PARAMS_st.hErrNode_out' in found_struct}}
-    hErrNode_out : CUgraphNode
+    hErrNode_out : :py:obj:`~.CUgraphNode`
         The node which caused instantiation to fail, if any
     {{endif}}
     {{if 'CUDA_GRAPH_INSTANTIATE_PARAMS_st.result_out' in found_struct}}
-    result_out : CUgraphInstantiateResult
+    result_out : :py:obj:`~.CUgraphInstantiateResult`
         Whether instantiation was successful. If it failed, the reason why
     {{endif}}
 
@@ -7006,13 +7054,16 @@ cdef class CUDA_GRAPH_INSTANTIATE_PARAMS(CUDA_GRAPH_INSTANTIATE_PARAMS_st):
 
 cdef class CUlaunchMemSyncDomainMap(CUlaunchMemSyncDomainMap_st):
     """
-    Memory Synchronization Domain map  See ``cudaLaunchMemSyncDomain``.
-    By default, kernels are launched in domain 0. Kernel launched with
-    CU_LAUNCH_MEM_SYNC_DOMAIN_REMOTE will have a different domain ID.
-    User may also alter the domain ID with CUlaunchMemSyncDomainMap for
-    a specific stream / graph node / kernel launch. See
-    CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP.  Domain ID range is
-    available through CU_DEVICE_ATTRIBUTE_MEM_SYNC_DOMAIN_COUNT.
+    Memory Synchronization Domain map  See
+    :py:obj:`~.cudaLaunchMemSyncDomain`.  By default, kernels are
+    launched in domain 0. Kernel launched with
+    :py:obj:`~.CU_LAUNCH_MEM_SYNC_DOMAIN_REMOTE` will have a different
+    domain ID. User may also alter the domain ID with
+    :py:obj:`~.CUlaunchMemSyncDomainMap` for a specific stream / graph
+    node / kernel launch. See
+    :py:obj:`~.CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP`.  Domain ID
+    range is available through
+    :py:obj:`~.CU_DEVICE_ATTRIBUTE_MEM_SYNC_DOMAIN_COUNT`.
 
     Attributes
     ----------
@@ -7036,7 +7087,8 @@ cdef class CUlaunchMemSyncDomainMap(CUlaunchMemSyncDomainMap_st):
 
 cdef class CUlaunchAttributeValue(CUlaunchAttributeValue_union):
     """
-    Launch attributes union; used as value field of CUlaunchAttribute
+    Launch attributes union; used as value field of
+    :py:obj:`~.CUlaunchAttribute`
 
     Attributes
     ----------
@@ -7045,115 +7097,125 @@ cdef class CUlaunchAttributeValue(CUlaunchAttributeValue_union):
 
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.accessPolicyWindow' in found_struct}}
-    accessPolicyWindow : CUaccessPolicyWindow
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW.
+    accessPolicyWindow : :py:obj:`~.CUaccessPolicyWindow`
+        Value of launch attribute
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW`.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.cooperative' in found_struct}}
     cooperative : int
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_COOPERATIVE. Nonzero
-        indicates a cooperative kernel (see cuLaunchCooperativeKernel).
+        Value of launch attribute
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_COOPERATIVE`. Nonzero indicates a
+        cooperative kernel (see :func:`~.cuLaunchCooperativeKernel`).
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.syncPolicy' in found_struct}}
-    syncPolicy : CUsynchronizationPolicy
+    syncPolicy : :py:obj:`~.CUsynchronizationPolicy`
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY. CUsynchronizationPolicy
-        for work queued up in this stream
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY`.
+        :py:obj:`~.CUsynchronizationPolicy` for work queued up in this
+        stream
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.clusterDim' in found_struct}}
     clusterDim : anon_struct1
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
-        that represents the desired cluster dimensions for the kernel.
-        Opaque type with the following fields: - ``x`` - The X dimension of
-        the cluster, in blocks. Must be a divisor of the grid X dimension.
-        - ``y`` - The Y dimension of the cluster, in blocks. Must be a
-        divisor of the grid Y dimension.    - ``z`` - The Z dimension of
-        the cluster, in blocks. Must be a divisor of the grid Z dimension.
+        Value of launch attribute
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION` that represents
+        the desired cluster dimensions for the kernel. Opaque type with the
+        following fields: - ``x`` - The X dimension of the cluster, in
+        blocks. Must be a divisor of the grid X dimension.    - ``y`` - The
+        Y dimension of the cluster, in blocks. Must be a divisor of the
+        grid Y dimension.    - ``z`` - The Z dimension of the cluster, in
+        blocks. Must be a divisor of the grid Z dimension.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.clusterSchedulingPolicyPreference' in found_struct}}
-    clusterSchedulingPolicyPreference : CUclusterSchedulingPolicy
+    clusterSchedulingPolicyPreference : :py:obj:`~.CUclusterSchedulingPolicy`
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE. Cluster
-        scheduling policy preference for the kernel.
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE`.
+        Cluster scheduling policy preference for the kernel.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.programmaticStreamSerializationAllowed' in found_struct}}
     programmaticStreamSerializationAllowed : int
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION.
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION`.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.programmaticEvent' in found_struct}}
     programmaticEvent : anon_struct2
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT
-        with the following fields: - ``CUevent`` event - Event to fire when
+        Value of launch attribute
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT` with the
+        following fields: - :py:obj:`~.CUevent` event - Event to fire when
         all blocks trigger it.    - ``Event`` record flags, see
-        cuEventRecordWithFlags. Does not accept :CU_EVENT_RECORD_EXTERNAL.
-        - ``triggerAtBlockStart`` - If this is set to non-0, each block
+        :func:`~.cuEventRecordWithFlags`. Does not accept
+        :py:obj:`~.CU_EVENT_RECORD_EXTERNAL`.    -
+        ``triggerAtBlockStart`` - If this is set to non-0, each block
         launch will automatically trigger the event.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.launchCompletionEvent' in found_struct}}
     launchCompletionEvent : anon_struct3
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT with the following
-        fields: - ``CUevent`` event - Event to fire when the last block
-        launches    - ``int`` flags; - Event record flags, see
-        cuEventRecordWithFlags. Does not accept CU_EVENT_RECORD_EXTERNAL.
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT` with the
+        following fields: - :py:obj:`~.CUevent` event - Event to fire when
+        the last block launches    - ``int`` flags; - Event record flags,
+        see :func:`~.cuEventRecordWithFlags`. Does not accept
+        :py:obj:`~.CU_EVENT_RECORD_EXTERNAL`.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.priority' in found_struct}}
     priority : int
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_PRIORITY. Execution
-        priority of the kernel.
+        Value of launch attribute :py:obj:`~.CU_LAUNCH_ATTRIBUTE_PRIORITY`.
+        Execution priority of the kernel.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.memSyncDomainMap' in found_struct}}
-    memSyncDomainMap : CUlaunchMemSyncDomainMap
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP.
-        See CUlaunchMemSyncDomainMap.
+    memSyncDomainMap : :py:obj:`~.CUlaunchMemSyncDomainMap`
+        Value of launch attribute
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP`. See
+        :py:obj:`~.CUlaunchMemSyncDomainMap`.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.memSyncDomain' in found_struct}}
-    memSyncDomain : CUlaunchMemSyncDomain
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN.
-        See::CUlaunchMemSyncDomain
+    memSyncDomain : :py:obj:`~.CUlaunchMemSyncDomain`
+        Value of launch attribute
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN`. See
+        :py:obj:`~.CUlaunchMemSyncDomain`
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.preferredClusterDim' in found_struct}}
     preferredClusterDim : anon_struct4
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION that represents the
-        desired preferred cluster dimensions for the kernel. Opaque type
-        with the following fields: - ``x`` - The X dimension of the
-        preferred cluster, in blocks. Must be a divisor of the grid X
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION` that
+        represents the desired preferred cluster dimensions for the kernel.
+        Opaque type with the following fields: - ``x`` - The X dimension of
+        the preferred cluster, in blocks. Must be a divisor of the grid X
         dimension, and must be a multiple of the ``x`` field of
-        CUlaunchAttributeValue::clusterDim.    - ``y`` - The Y dimension of
-        the preferred cluster, in blocks. Must be a divisor of the grid Y
-        dimension, and must be a multiple of the ``y`` field of
-        CUlaunchAttributeValue::clusterDim.    - ``z`` - The Z dimension of
-        the preferred cluster, in blocks. Must be equal to the ``z`` field
-        of CUlaunchAttributeValue::clusterDim.
+        :py:obj:`~.CUlaunchAttributeValue.clusterDim`.    - ``y`` - The Y
+        dimension of the preferred cluster, in blocks. Must be a divisor of
+        the grid Y dimension, and must be a multiple of the ``y`` field of
+        :py:obj:`~.CUlaunchAttributeValue.clusterDim`.    - ``z`` - The Z
+        dimension of the preferred cluster, in blocks. Must be equal to the
+        ``z`` field of :py:obj:`~.CUlaunchAttributeValue.clusterDim`.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.deviceUpdatableKernelNode' in found_struct}}
     deviceUpdatableKernelNode : anon_struct5
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE. with the
-        following fields: - ``int`` deviceUpdatable - Whether or not the
-        resulting kernel node should be device-updatable.    -
-        ``CUgraphDeviceNode`` devNode - Returns a handle to pass to the
-        various device-side update functions.
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE`. with
+        the following fields: - ``int`` deviceUpdatable - Whether or not
+        the resulting kernel node should be device-updatable.    -
+        :py:obj:`~.CUgraphDeviceNode` devNode - Returns a handle to pass to
+        the various device-side update functions.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.sharedMemCarveout' in found_struct}}
     sharedMemCarveout : unsigned int
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT.
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT`.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.nvlinkUtilCentricScheduling' in found_struct}}
     nvlinkUtilCentricScheduling : unsigned int
 
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.portableClusterSizeMode' in found_struct}}
-    portableClusterSizeMode : CUlaunchAttributePortableClusterMode
+    portableClusterSizeMode : :py:obj:`~.CUlaunchAttributePortableClusterMode`
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_PORTABLE_CLUSTER_SIZE_MODE.
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_PORTABLE_CLUSTER_SIZE_MODE`.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.sharedMemoryMode' in found_struct}}
-    sharedMemoryMode : CUsharedMemoryMode
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_SHARED_MEMORY_MODE.
-        See CUsharedMemoryMode for acceptable values.
+    sharedMemoryMode : :py:obj:`~.CUsharedMemoryMode`
+        Value of launch attribute
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_SHARED_MEMORY_MODE`. See
+        :py:obj:`~.CUsharedMemoryMode` for acceptable values.
     {{endif}}
 
     Methods
@@ -7172,11 +7234,11 @@ cdef class CUlaunchAttribute(CUlaunchAttribute_st):
     Attributes
     ----------
     {{if 'CUlaunchAttribute_st.id' in found_struct}}
-    id : CUlaunchAttributeID
+    id : :py:obj:`~.CUlaunchAttributeID`
         Attribute to set
     {{endif}}
     {{if 'CUlaunchAttribute_st.value' in found_struct}}
-    value : CUlaunchAttributeValue
+    value : :py:obj:`~.CUlaunchAttributeValue`
         Value of the attribute
     {{endif}}
 
@@ -7224,16 +7286,17 @@ cdef class CUlaunchConfig(CUlaunchConfig_st):
         Dynamic shared-memory size per thread block in bytes
     {{endif}}
     {{if 'CUlaunchConfig_st.hStream' in found_struct}}
-    hStream : CUstream
+    hStream : :py:obj:`~.CUstream`
         Stream identifier
     {{endif}}
     {{if 'CUlaunchConfig_st.attrs' in found_struct}}
-    attrs : CUlaunchAttribute
-        List of attributes; nullable if CUlaunchConfig::numAttrs == 0
+    attrs : :py:obj:`~.CUlaunchAttribute`
+        List of attributes; nullable if :py:obj:`~.CUlaunchConfig.numAttrs`
+        == 0
     {{endif}}
     {{if 'CUlaunchConfig_st.numAttrs' in found_struct}}
     numAttrs : unsigned int
-        Number of attributes populated in CUlaunchConfig::attrs
+        Number of attributes populated in :py:obj:`~.CUlaunchConfig.attrs`
     {{endif}}
 
     Methods
@@ -7247,7 +7310,8 @@ cdef class CUlaunchConfig(CUlaunchConfig_st):
 
 cdef class CUkernelNodeAttrValue_v1(CUlaunchAttributeValue):
     """
-    Launch attributes union; used as value field of CUlaunchAttribute
+    Launch attributes union; used as value field of
+    :py:obj:`~.CUlaunchAttribute`
 
     Attributes
     ----------
@@ -7256,115 +7320,125 @@ cdef class CUkernelNodeAttrValue_v1(CUlaunchAttributeValue):
 
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.accessPolicyWindow' in found_struct}}
-    accessPolicyWindow : CUaccessPolicyWindow
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW.
+    accessPolicyWindow : :py:obj:`~.CUaccessPolicyWindow`
+        Value of launch attribute
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW`.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.cooperative' in found_struct}}
     cooperative : int
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_COOPERATIVE. Nonzero
-        indicates a cooperative kernel (see cuLaunchCooperativeKernel).
+        Value of launch attribute
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_COOPERATIVE`. Nonzero indicates a
+        cooperative kernel (see :func:`~.cuLaunchCooperativeKernel`).
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.syncPolicy' in found_struct}}
-    syncPolicy : CUsynchronizationPolicy
+    syncPolicy : :py:obj:`~.CUsynchronizationPolicy`
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY. CUsynchronizationPolicy
-        for work queued up in this stream
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY`.
+        :py:obj:`~.CUsynchronizationPolicy` for work queued up in this
+        stream
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.clusterDim' in found_struct}}
     clusterDim : anon_struct1
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
-        that represents the desired cluster dimensions for the kernel.
-        Opaque type with the following fields: - ``x`` - The X dimension of
-        the cluster, in blocks. Must be a divisor of the grid X dimension.
-        - ``y`` - The Y dimension of the cluster, in blocks. Must be a
-        divisor of the grid Y dimension.    - ``z`` - The Z dimension of
-        the cluster, in blocks. Must be a divisor of the grid Z dimension.
+        Value of launch attribute
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION` that represents
+        the desired cluster dimensions for the kernel. Opaque type with the
+        following fields: - ``x`` - The X dimension of the cluster, in
+        blocks. Must be a divisor of the grid X dimension.    - ``y`` - The
+        Y dimension of the cluster, in blocks. Must be a divisor of the
+        grid Y dimension.    - ``z`` - The Z dimension of the cluster, in
+        blocks. Must be a divisor of the grid Z dimension.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.clusterSchedulingPolicyPreference' in found_struct}}
-    clusterSchedulingPolicyPreference : CUclusterSchedulingPolicy
+    clusterSchedulingPolicyPreference : :py:obj:`~.CUclusterSchedulingPolicy`
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE. Cluster
-        scheduling policy preference for the kernel.
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE`.
+        Cluster scheduling policy preference for the kernel.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.programmaticStreamSerializationAllowed' in found_struct}}
     programmaticStreamSerializationAllowed : int
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION.
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION`.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.programmaticEvent' in found_struct}}
     programmaticEvent : anon_struct2
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT
-        with the following fields: - ``CUevent`` event - Event to fire when
+        Value of launch attribute
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT` with the
+        following fields: - :py:obj:`~.CUevent` event - Event to fire when
         all blocks trigger it.    - ``Event`` record flags, see
-        cuEventRecordWithFlags. Does not accept :CU_EVENT_RECORD_EXTERNAL.
-        - ``triggerAtBlockStart`` - If this is set to non-0, each block
+        :func:`~.cuEventRecordWithFlags`. Does not accept
+        :py:obj:`~.CU_EVENT_RECORD_EXTERNAL`.    -
+        ``triggerAtBlockStart`` - If this is set to non-0, each block
         launch will automatically trigger the event.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.launchCompletionEvent' in found_struct}}
     launchCompletionEvent : anon_struct3
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT with the following
-        fields: - ``CUevent`` event - Event to fire when the last block
-        launches    - ``int`` flags; - Event record flags, see
-        cuEventRecordWithFlags. Does not accept CU_EVENT_RECORD_EXTERNAL.
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT` with the
+        following fields: - :py:obj:`~.CUevent` event - Event to fire when
+        the last block launches    - ``int`` flags; - Event record flags,
+        see :func:`~.cuEventRecordWithFlags`. Does not accept
+        :py:obj:`~.CU_EVENT_RECORD_EXTERNAL`.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.priority' in found_struct}}
     priority : int
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_PRIORITY. Execution
-        priority of the kernel.
+        Value of launch attribute :py:obj:`~.CU_LAUNCH_ATTRIBUTE_PRIORITY`.
+        Execution priority of the kernel.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.memSyncDomainMap' in found_struct}}
-    memSyncDomainMap : CUlaunchMemSyncDomainMap
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP.
-        See CUlaunchMemSyncDomainMap.
+    memSyncDomainMap : :py:obj:`~.CUlaunchMemSyncDomainMap`
+        Value of launch attribute
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP`. See
+        :py:obj:`~.CUlaunchMemSyncDomainMap`.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.memSyncDomain' in found_struct}}
-    memSyncDomain : CUlaunchMemSyncDomain
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN.
-        See::CUlaunchMemSyncDomain
+    memSyncDomain : :py:obj:`~.CUlaunchMemSyncDomain`
+        Value of launch attribute
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN`. See
+        :py:obj:`~.CUlaunchMemSyncDomain`
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.preferredClusterDim' in found_struct}}
     preferredClusterDim : anon_struct4
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION that represents the
-        desired preferred cluster dimensions for the kernel. Opaque type
-        with the following fields: - ``x`` - The X dimension of the
-        preferred cluster, in blocks. Must be a divisor of the grid X
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION` that
+        represents the desired preferred cluster dimensions for the kernel.
+        Opaque type with the following fields: - ``x`` - The X dimension of
+        the preferred cluster, in blocks. Must be a divisor of the grid X
         dimension, and must be a multiple of the ``x`` field of
-        CUlaunchAttributeValue::clusterDim.    - ``y`` - The Y dimension of
-        the preferred cluster, in blocks. Must be a divisor of the grid Y
-        dimension, and must be a multiple of the ``y`` field of
-        CUlaunchAttributeValue::clusterDim.    - ``z`` - The Z dimension of
-        the preferred cluster, in blocks. Must be equal to the ``z`` field
-        of CUlaunchAttributeValue::clusterDim.
+        :py:obj:`~.CUlaunchAttributeValue.clusterDim`.    - ``y`` - The Y
+        dimension of the preferred cluster, in blocks. Must be a divisor of
+        the grid Y dimension, and must be a multiple of the ``y`` field of
+        :py:obj:`~.CUlaunchAttributeValue.clusterDim`.    - ``z`` - The Z
+        dimension of the preferred cluster, in blocks. Must be equal to the
+        ``z`` field of :py:obj:`~.CUlaunchAttributeValue.clusterDim`.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.deviceUpdatableKernelNode' in found_struct}}
     deviceUpdatableKernelNode : anon_struct5
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE. with the
-        following fields: - ``int`` deviceUpdatable - Whether or not the
-        resulting kernel node should be device-updatable.    -
-        ``CUgraphDeviceNode`` devNode - Returns a handle to pass to the
-        various device-side update functions.
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE` with
+        the following fields: - ``int`` deviceUpdatable - Whether or not
+        the resulting kernel node should be device-updatable.    -
+        :py:obj:`~.CUgraphDeviceNode` devNode - Returns a handle to pass to
+        the various device-side update functions.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.sharedMemCarveout' in found_struct}}
     sharedMemCarveout : unsigned int
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT.
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT`.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.nvlinkUtilCentricScheduling' in found_struct}}
     nvlinkUtilCentricScheduling : unsigned int
 
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.portableClusterSizeMode' in found_struct}}
-    portableClusterSizeMode : CUlaunchAttributePortableClusterMode
+    portableClusterSizeMode : :py:obj:`~.CUlaunchAttributePortableClusterMode`
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_PORTABLE_CLUSTER_SIZE_MODE.
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_PORTABLE_CLUSTER_SIZE_MODE`.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.sharedMemoryMode' in found_struct}}
-    sharedMemoryMode : CUsharedMemoryMode
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_SHARED_MEMORY_MODE.
-        See CUsharedMemoryMode for acceptable values.
+    sharedMemoryMode : :py:obj:`~.CUsharedMemoryMode`
+        Value of launch attribute
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_SHARED_MEMORY_MODE`. See
+        :py:obj:`~.CUsharedMemoryMode` for acceptable values.
     {{endif}}
 
     Methods
@@ -7378,7 +7452,8 @@ cdef class CUkernelNodeAttrValue_v1(CUlaunchAttributeValue):
 
 cdef class CUkernelNodeAttrValue(CUkernelNodeAttrValue_v1):
     """
-    Launch attributes union; used as value field of CUlaunchAttribute
+    Launch attributes union; used as value field of
+    :py:obj:`~.CUlaunchAttribute`
 
     Attributes
     ----------
@@ -7387,115 +7462,125 @@ cdef class CUkernelNodeAttrValue(CUkernelNodeAttrValue_v1):
 
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.accessPolicyWindow' in found_struct}}
-    accessPolicyWindow : CUaccessPolicyWindow
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW.
+    accessPolicyWindow : :py:obj:`~.CUaccessPolicyWindow`
+        Value of launch attribute
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW`.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.cooperative' in found_struct}}
     cooperative : int
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_COOPERATIVE. Nonzero
-        indicates a cooperative kernel (see cuLaunchCooperativeKernel).
+        Value of launch attribute
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_COOPERATIVE`. Nonzero indicates a
+        cooperative kernel (see :func:`~.cuLaunchCooperativeKernel`).
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.syncPolicy' in found_struct}}
-    syncPolicy : CUsynchronizationPolicy
+    syncPolicy : :py:obj:`~.CUsynchronizationPolicy`
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY. CUsynchronizationPolicy
-        for work queued up in this stream
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY`.
+        :py:obj:`~.CUsynchronizationPolicy` for work queued up in this
+        stream
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.clusterDim' in found_struct}}
     clusterDim : anon_struct1
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
-        that represents the desired cluster dimensions for the kernel.
-        Opaque type with the following fields: - ``x`` - The X dimension of
-        the cluster, in blocks. Must be a divisor of the grid X dimension.
-        - ``y`` - The Y dimension of the cluster, in blocks. Must be a
-        divisor of the grid Y dimension.    - ``z`` - The Z dimension of
-        the cluster, in blocks. Must be a divisor of the grid Z dimension.
+        Value of launch attribute
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION` that represents
+        the desired cluster dimensions for the kernel. Opaque type with the
+        following fields: - ``x`` - The X dimension of the cluster, in
+        blocks. Must be a divisor of the grid X dimension.    - ``y`` - The
+        Y dimension of the cluster, in blocks. Must be a divisor of the
+        grid Y dimension.    - ``z`` - The Z dimension of the cluster, in
+        blocks. Must be a divisor of the grid Z dimension.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.clusterSchedulingPolicyPreference' in found_struct}}
-    clusterSchedulingPolicyPreference : CUclusterSchedulingPolicy
+    clusterSchedulingPolicyPreference : :py:obj:`~.CUclusterSchedulingPolicy`
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE. Cluster
-        scheduling policy preference for the kernel.
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE`.
+        Cluster scheduling policy preference for the kernel.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.programmaticStreamSerializationAllowed' in found_struct}}
     programmaticStreamSerializationAllowed : int
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION.
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION`.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.programmaticEvent' in found_struct}}
     programmaticEvent : anon_struct2
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT
-        with the following fields: - ``CUevent`` event - Event to fire when
+        Value of launch attribute
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT` with the
+        following fields: - :py:obj:`~.CUevent` event - Event to fire when
         all blocks trigger it.    - ``Event`` record flags, see
-        cuEventRecordWithFlags. Does not accept :CU_EVENT_RECORD_EXTERNAL.
-        - ``triggerAtBlockStart`` - If this is set to non-0, each block
+        :func:`~.cuEventRecordWithFlags`. Does not accept
+        :py:obj:`~.CU_EVENT_RECORD_EXTERNAL`.    -
+        ``triggerAtBlockStart`` - If this is set to non-0, each block
         launch will automatically trigger the event.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.launchCompletionEvent' in found_struct}}
     launchCompletionEvent : anon_struct3
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT with the following
-        fields: - ``CUevent`` event - Event to fire when the last block
-        launches    - ``int`` flags; - Event record flags, see
-        cuEventRecordWithFlags. Does not accept CU_EVENT_RECORD_EXTERNAL.
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT` with the
+        following fields: - :py:obj:`~.CUevent` event - Event to fire when
+        the last block launches    - ``int`` flags; - Event record flags,
+        see :func:`~.cuEventRecordWithFlags`. Does not accept
+        :py:obj:`~.CU_EVENT_RECORD_EXTERNAL`.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.priority' in found_struct}}
     priority : int
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_PRIORITY. Execution
-        priority of the kernel.
+        Value of launch attribute :py:obj:`~.CU_LAUNCH_ATTRIBUTE_PRIORITY`.
+        Execution priority of the kernel.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.memSyncDomainMap' in found_struct}}
-    memSyncDomainMap : CUlaunchMemSyncDomainMap
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP.
-        See CUlaunchMemSyncDomainMap.
+    memSyncDomainMap : :py:obj:`~.CUlaunchMemSyncDomainMap`
+        Value of launch attribute
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP`. See
+        :py:obj:`~.CUlaunchMemSyncDomainMap`.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.memSyncDomain' in found_struct}}
-    memSyncDomain : CUlaunchMemSyncDomain
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN.
-        See::CUlaunchMemSyncDomain
+    memSyncDomain : :py:obj:`~.CUlaunchMemSyncDomain`
+        Value of launch attribute
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN`. See
+        :py:obj:`~.CUlaunchMemSyncDomain`
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.preferredClusterDim' in found_struct}}
     preferredClusterDim : anon_struct4
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION that represents the
-        desired preferred cluster dimensions for the kernel. Opaque type
-        with the following fields: - ``x`` - The X dimension of the
-        preferred cluster, in blocks. Must be a divisor of the grid X
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION` that
+        represents the desired preferred cluster dimensions for the kernel.
+        Opaque type with the following fields: - ``x`` - The X dimension of
+        the preferred cluster, in blocks. Must be a divisor of the grid X
         dimension, and must be a multiple of the ``x`` field of
-        CUlaunchAttributeValue::clusterDim.    - ``y`` - The Y dimension of
-        the preferred cluster, in blocks. Must be a divisor of the grid Y
-        dimension, and must be a multiple of the ``y`` field of
-        CUlaunchAttributeValue::clusterDim.    - ``z`` - The Z dimension of
-        the preferred cluster, in blocks. Must be equal to the ``z`` field
-        of CUlaunchAttributeValue::clusterDim.
+        :py:obj:`~.CUlaunchAttributeValue.clusterDim`.    - ``y`` - The Y
+        dimension of the preferred cluster, in blocks. Must be a divisor of
+        the grid Y dimension, and must be a multiple of the ``y`` field of
+        :py:obj:`~.CUlaunchAttributeValue.clusterDim`.    - ``z`` - The Z
+        dimension of the preferred cluster, in blocks. Must be equal to the
+        ``z`` field of :py:obj:`~.CUlaunchAttributeValue.clusterDim`.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.deviceUpdatableKernelNode' in found_struct}}
     deviceUpdatableKernelNode : anon_struct5
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE. with the
-        following fields: - ``int`` deviceUpdatable - Whether or not the
-        resulting kernel node should be device-updatable.    -
-        ``CUgraphDeviceNode`` devNode - Returns a handle to pass to the
-        various device-side update functions.
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE` with
+        the following fields: - ``int`` deviceUpdatable - Whether or not
+        the resulting kernel node should be device-updatable.    -
+        :py:obj:`~.CUgraphDeviceNode` devNode - Returns a handle to pass to
+        the various device-side update functions.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.sharedMemCarveout' in found_struct}}
     sharedMemCarveout : unsigned int
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT.
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT`.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.nvlinkUtilCentricScheduling' in found_struct}}
     nvlinkUtilCentricScheduling : unsigned int
 
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.portableClusterSizeMode' in found_struct}}
-    portableClusterSizeMode : CUlaunchAttributePortableClusterMode
+    portableClusterSizeMode : :py:obj:`~.CUlaunchAttributePortableClusterMode`
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_PORTABLE_CLUSTER_SIZE_MODE.
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_PORTABLE_CLUSTER_SIZE_MODE`.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.sharedMemoryMode' in found_struct}}
-    sharedMemoryMode : CUsharedMemoryMode
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_SHARED_MEMORY_MODE.
-        See CUsharedMemoryMode for acceptable values.
+    sharedMemoryMode : :py:obj:`~.CUsharedMemoryMode`
+        Value of launch attribute
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_SHARED_MEMORY_MODE`. See
+        :py:obj:`~.CUsharedMemoryMode` for acceptable values.
     {{endif}}
 
     Methods
@@ -7509,7 +7594,8 @@ cdef class CUkernelNodeAttrValue(CUkernelNodeAttrValue_v1):
 
 cdef class CUstreamAttrValue_v1(CUlaunchAttributeValue):
     """
-    Launch attributes union; used as value field of CUlaunchAttribute
+    Launch attributes union; used as value field of
+    :py:obj:`~.CUlaunchAttribute`
 
     Attributes
     ----------
@@ -7518,115 +7604,125 @@ cdef class CUstreamAttrValue_v1(CUlaunchAttributeValue):
 
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.accessPolicyWindow' in found_struct}}
-    accessPolicyWindow : CUaccessPolicyWindow
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW.
+    accessPolicyWindow : :py:obj:`~.CUaccessPolicyWindow`
+        Value of launch attribute
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW`.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.cooperative' in found_struct}}
     cooperative : int
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_COOPERATIVE. Nonzero
-        indicates a cooperative kernel (see cuLaunchCooperativeKernel).
+        Value of launch attribute
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_COOPERATIVE`. Nonzero indicates a
+        cooperative kernel (see :func:`~.cuLaunchCooperativeKernel`).
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.syncPolicy' in found_struct}}
-    syncPolicy : CUsynchronizationPolicy
+    syncPolicy : :py:obj:`~.CUsynchronizationPolicy`
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY. CUsynchronizationPolicy
-        for work queued up in this stream
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY`.
+        :py:obj:`~.CUsynchronizationPolicy` for work queued up in this
+        stream
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.clusterDim' in found_struct}}
     clusterDim : anon_struct1
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
-        that represents the desired cluster dimensions for the kernel.
-        Opaque type with the following fields: - ``x`` - The X dimension of
-        the cluster, in blocks. Must be a divisor of the grid X dimension.
-        - ``y`` - The Y dimension of the cluster, in blocks. Must be a
-        divisor of the grid Y dimension.    - ``z`` - The Z dimension of
-        the cluster, in blocks. Must be a divisor of the grid Z dimension.
+        Value of launch attribute
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION` that represents
+        the desired cluster dimensions for the kernel. Opaque type with the
+        following fields: - ``x`` - The X dimension of the cluster, in
+        blocks. Must be a divisor of the grid X dimension.    - ``y`` - The
+        Y dimension of the cluster, in blocks. Must be a divisor of the
+        grid Y dimension.    - ``z`` - The Z dimension of the cluster, in
+        blocks. Must be a divisor of the grid Z dimension.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.clusterSchedulingPolicyPreference' in found_struct}}
-    clusterSchedulingPolicyPreference : CUclusterSchedulingPolicy
+    clusterSchedulingPolicyPreference : :py:obj:`~.CUclusterSchedulingPolicy`
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE. Cluster
-        scheduling policy preference for the kernel.
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE`.
+        Cluster scheduling policy preference for the kernel.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.programmaticStreamSerializationAllowed' in found_struct}}
     programmaticStreamSerializationAllowed : int
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION.
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION`.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.programmaticEvent' in found_struct}}
     programmaticEvent : anon_struct2
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT
-        with the following fields: - ``CUevent`` event - Event to fire when
+        Value of launch attribute
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT` with the
+        following fields: - :py:obj:`~.CUevent` event - Event to fire when
         all blocks trigger it.    - ``Event`` record flags, see
-        cuEventRecordWithFlags. Does not accept :CU_EVENT_RECORD_EXTERNAL.
-        - ``triggerAtBlockStart`` - If this is set to non-0, each block
+        :func:`~.cuEventRecordWithFlags`. Does not accept
+        :py:obj:`~.CU_EVENT_RECORD_EXTERNAL`.    -
+        ``triggerAtBlockStart`` - If this is set to non-0, each block
         launch will automatically trigger the event.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.launchCompletionEvent' in found_struct}}
     launchCompletionEvent : anon_struct3
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT with the following
-        fields: - ``CUevent`` event - Event to fire when the last block
-        launches    - ``int`` flags; - Event record flags, see
-        cuEventRecordWithFlags. Does not accept CU_EVENT_RECORD_EXTERNAL.
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT` with the
+        following fields: - :py:obj:`~.CUevent` event - Event to fire when
+        the last block launches    - ``int`` flags; - Event record flags,
+        see :func:`~.cuEventRecordWithFlags`. Does not accept
+        :py:obj:`~.CU_EVENT_RECORD_EXTERNAL`.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.priority' in found_struct}}
     priority : int
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_PRIORITY. Execution
-        priority of the kernel.
+        Value of launch attribute :py:obj:`~.CU_LAUNCH_ATTRIBUTE_PRIORITY`.
+        Execution priority of the kernel.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.memSyncDomainMap' in found_struct}}
-    memSyncDomainMap : CUlaunchMemSyncDomainMap
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP.
-        See CUlaunchMemSyncDomainMap.
+    memSyncDomainMap : :py:obj:`~.CUlaunchMemSyncDomainMap`
+        Value of launch attribute
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP`. See
+        :py:obj:`~.CUlaunchMemSyncDomainMap`.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.memSyncDomain' in found_struct}}
-    memSyncDomain : CUlaunchMemSyncDomain
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN.
-        See::CUlaunchMemSyncDomain
+    memSyncDomain : :py:obj:`~.CUlaunchMemSyncDomain`
+        Value of launch attribute
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN`. See
+        :py:obj:`~.CUlaunchMemSyncDomain`
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.preferredClusterDim' in found_struct}}
     preferredClusterDim : anon_struct4
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION that represents the
-        desired preferred cluster dimensions for the kernel. Opaque type
-        with the following fields: - ``x`` - The X dimension of the
-        preferred cluster, in blocks. Must be a divisor of the grid X
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION` that
+        represents the desired preferred cluster dimensions for the kernel.
+        Opaque type with the following fields: - ``x`` - The X dimension of
+        the preferred cluster, in blocks. Must be a divisor of the grid X
         dimension, and must be a multiple of the ``x`` field of
-        CUlaunchAttributeValue::clusterDim.    - ``y`` - The Y dimension of
-        the preferred cluster, in blocks. Must be a divisor of the grid Y
-        dimension, and must be a multiple of the ``y`` field of
-        CUlaunchAttributeValue::clusterDim.    - ``z`` - The Z dimension of
-        the preferred cluster, in blocks. Must be equal to the ``z`` field
-        of CUlaunchAttributeValue::clusterDim.
+        :py:obj:`~.CUlaunchAttributeValue.clusterDim`.    - ``y`` - The Y
+        dimension of the preferred cluster, in blocks. Must be a divisor of
+        the grid Y dimension, and must be a multiple of the ``y`` field of
+        :py:obj:`~.CUlaunchAttributeValue.clusterDim`.    - ``z`` - The Z
+        dimension of the preferred cluster, in blocks. Must be equal to the
+        ``z`` field of :py:obj:`~.CUlaunchAttributeValue.clusterDim`.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.deviceUpdatableKernelNode' in found_struct}}
     deviceUpdatableKernelNode : anon_struct5
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE. with the
-        following fields: - ``int`` deviceUpdatable - Whether or not the
-        resulting kernel node should be device-updatable.    -
-        ``CUgraphDeviceNode`` devNode - Returns a handle to pass to the
-        various device-side update functions.
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE` with
+        the following fields: - ``int`` deviceUpdatable - Whether or not
+        the resulting kernel node should be device-updatable.    -
+        :py:obj:`~.CUgraphDeviceNode` devNode - Returns a handle to pass to
+        the various device-side update functions.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.sharedMemCarveout' in found_struct}}
     sharedMemCarveout : unsigned int
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT.
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT`.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.nvlinkUtilCentricScheduling' in found_struct}}
     nvlinkUtilCentricScheduling : unsigned int
 
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.portableClusterSizeMode' in found_struct}}
-    portableClusterSizeMode : CUlaunchAttributePortableClusterMode
+    portableClusterSizeMode : :py:obj:`~.CUlaunchAttributePortableClusterMode`
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_PORTABLE_CLUSTER_SIZE_MODE.
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_PORTABLE_CLUSTER_SIZE_MODE`.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.sharedMemoryMode' in found_struct}}
-    sharedMemoryMode : CUsharedMemoryMode
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_SHARED_MEMORY_MODE.
-        See CUsharedMemoryMode for acceptable values.
+    sharedMemoryMode : :py:obj:`~.CUsharedMemoryMode`
+        Value of launch attribute
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_SHARED_MEMORY_MODE`. See
+        :py:obj:`~.CUsharedMemoryMode` for acceptable values.
     {{endif}}
 
     Methods
@@ -7640,7 +7736,8 @@ cdef class CUstreamAttrValue_v1(CUlaunchAttributeValue):
 
 cdef class CUstreamAttrValue(CUstreamAttrValue_v1):
     """
-    Launch attributes union; used as value field of CUlaunchAttribute
+    Launch attributes union; used as value field of
+    :py:obj:`~.CUlaunchAttribute`
 
     Attributes
     ----------
@@ -7649,115 +7746,125 @@ cdef class CUstreamAttrValue(CUstreamAttrValue_v1):
 
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.accessPolicyWindow' in found_struct}}
-    accessPolicyWindow : CUaccessPolicyWindow
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW.
+    accessPolicyWindow : :py:obj:`~.CUaccessPolicyWindow`
+        Value of launch attribute
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW`.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.cooperative' in found_struct}}
     cooperative : int
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_COOPERATIVE. Nonzero
-        indicates a cooperative kernel (see cuLaunchCooperativeKernel).
+        Value of launch attribute
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_COOPERATIVE`. Nonzero indicates a
+        cooperative kernel (see :func:`~.cuLaunchCooperativeKernel`).
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.syncPolicy' in found_struct}}
-    syncPolicy : CUsynchronizationPolicy
+    syncPolicy : :py:obj:`~.CUsynchronizationPolicy`
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY. CUsynchronizationPolicy
-        for work queued up in this stream
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY`.
+        :py:obj:`~.CUsynchronizationPolicy` for work queued up in this
+        stream
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.clusterDim' in found_struct}}
     clusterDim : anon_struct1
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
-        that represents the desired cluster dimensions for the kernel.
-        Opaque type with the following fields: - ``x`` - The X dimension of
-        the cluster, in blocks. Must be a divisor of the grid X dimension.
-        - ``y`` - The Y dimension of the cluster, in blocks. Must be a
-        divisor of the grid Y dimension.    - ``z`` - The Z dimension of
-        the cluster, in blocks. Must be a divisor of the grid Z dimension.
+        Value of launch attribute
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION` that represents
+        the desired cluster dimensions for the kernel. Opaque type with the
+        following fields: - ``x`` - The X dimension of the cluster, in
+        blocks. Must be a divisor of the grid X dimension.    - ``y`` - The
+        Y dimension of the cluster, in blocks. Must be a divisor of the
+        grid Y dimension.    - ``z`` - The Z dimension of the cluster, in
+        blocks. Must be a divisor of the grid Z dimension.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.clusterSchedulingPolicyPreference' in found_struct}}
-    clusterSchedulingPolicyPreference : CUclusterSchedulingPolicy
+    clusterSchedulingPolicyPreference : :py:obj:`~.CUclusterSchedulingPolicy`
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE. Cluster
-        scheduling policy preference for the kernel.
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE`.
+        Cluster scheduling policy preference for the kernel.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.programmaticStreamSerializationAllowed' in found_struct}}
     programmaticStreamSerializationAllowed : int
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION.
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION`.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.programmaticEvent' in found_struct}}
     programmaticEvent : anon_struct2
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT
-        with the following fields: - ``CUevent`` event - Event to fire when
+        Value of launch attribute
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT` with the
+        following fields: - :py:obj:`~.CUevent` event - Event to fire when
         all blocks trigger it.    - ``Event`` record flags, see
-        cuEventRecordWithFlags. Does not accept :CU_EVENT_RECORD_EXTERNAL.
-        - ``triggerAtBlockStart`` - If this is set to non-0, each block
+        :func:`~.cuEventRecordWithFlags`. Does not accept
+        :py:obj:`~.CU_EVENT_RECORD_EXTERNAL`.    -
+        ``triggerAtBlockStart`` - If this is set to non-0, each block
         launch will automatically trigger the event.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.launchCompletionEvent' in found_struct}}
     launchCompletionEvent : anon_struct3
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT with the following
-        fields: - ``CUevent`` event - Event to fire when the last block
-        launches    - ``int`` flags; - Event record flags, see
-        cuEventRecordWithFlags. Does not accept CU_EVENT_RECORD_EXTERNAL.
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT` with the
+        following fields: - :py:obj:`~.CUevent` event - Event to fire when
+        the last block launches    - ``int`` flags; - Event record flags,
+        see :func:`~.cuEventRecordWithFlags`. Does not accept
+        :py:obj:`~.CU_EVENT_RECORD_EXTERNAL`.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.priority' in found_struct}}
     priority : int
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_PRIORITY. Execution
-        priority of the kernel.
+        Value of launch attribute :py:obj:`~.CU_LAUNCH_ATTRIBUTE_PRIORITY`.
+        Execution priority of the kernel.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.memSyncDomainMap' in found_struct}}
-    memSyncDomainMap : CUlaunchMemSyncDomainMap
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP.
-        See CUlaunchMemSyncDomainMap.
+    memSyncDomainMap : :py:obj:`~.CUlaunchMemSyncDomainMap`
+        Value of launch attribute
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP`. See
+        :py:obj:`~.CUlaunchMemSyncDomainMap`.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.memSyncDomain' in found_struct}}
-    memSyncDomain : CUlaunchMemSyncDomain
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN.
-        See::CUlaunchMemSyncDomain
+    memSyncDomain : :py:obj:`~.CUlaunchMemSyncDomain`
+        Value of launch attribute
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN`. See
+        :py:obj:`~.CUlaunchMemSyncDomain`
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.preferredClusterDim' in found_struct}}
     preferredClusterDim : anon_struct4
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION that represents the
-        desired preferred cluster dimensions for the kernel. Opaque type
-        with the following fields: - ``x`` - The X dimension of the
-        preferred cluster, in blocks. Must be a divisor of the grid X
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION` that
+        represents the desired preferred cluster dimensions for the kernel.
+        Opaque type with the following fields: - ``x`` - The X dimension of
+        the preferred cluster, in blocks. Must be a divisor of the grid X
         dimension, and must be a multiple of the ``x`` field of
-        CUlaunchAttributeValue::clusterDim.    - ``y`` - The Y dimension of
-        the preferred cluster, in blocks. Must be a divisor of the grid Y
-        dimension, and must be a multiple of the ``y`` field of
-        CUlaunchAttributeValue::clusterDim.    - ``z`` - The Z dimension of
-        the preferred cluster, in blocks. Must be equal to the ``z`` field
-        of CUlaunchAttributeValue::clusterDim.
+        :py:obj:`~.CUlaunchAttributeValue.clusterDim`.    - ``y`` - The Y
+        dimension of the preferred cluster, in blocks. Must be a divisor of
+        the grid Y dimension, and must be a multiple of the ``y`` field of
+        :py:obj:`~.CUlaunchAttributeValue.clusterDim`.    - ``z`` - The Z
+        dimension of the preferred cluster, in blocks. Must be equal to the
+        ``z`` field of :py:obj:`~.CUlaunchAttributeValue.clusterDim`.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.deviceUpdatableKernelNode' in found_struct}}
     deviceUpdatableKernelNode : anon_struct5
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE. with the
-        following fields: - ``int`` deviceUpdatable - Whether or not the
-        resulting kernel node should be device-updatable.    -
-        ``CUgraphDeviceNode`` devNode - Returns a handle to pass to the
-        various device-side update functions.
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE` with
+        the following fields: - ``int`` deviceUpdatable - Whether or not
+        the resulting kernel node should be device-updatable.    -
+        :py:obj:`~.CUgraphDeviceNode` devNode - Returns a handle to pass to
+        the various device-side update functions.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.sharedMemCarveout' in found_struct}}
     sharedMemCarveout : unsigned int
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT.
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT`.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.nvlinkUtilCentricScheduling' in found_struct}}
     nvlinkUtilCentricScheduling : unsigned int
 
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.portableClusterSizeMode' in found_struct}}
-    portableClusterSizeMode : CUlaunchAttributePortableClusterMode
+    portableClusterSizeMode : :py:obj:`~.CUlaunchAttributePortableClusterMode`
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_PORTABLE_CLUSTER_SIZE_MODE.
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_PORTABLE_CLUSTER_SIZE_MODE`.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.sharedMemoryMode' in found_struct}}
-    sharedMemoryMode : CUsharedMemoryMode
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_SHARED_MEMORY_MODE.
-        See CUsharedMemoryMode for acceptable values.
+    sharedMemoryMode : :py:obj:`~.CUsharedMemoryMode`
+        Value of launch attribute
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_SHARED_MEMORY_MODE`. See
+        :py:obj:`~.CUsharedMemoryMode` for acceptable values.
     {{endif}}
 
     Methods
@@ -7771,7 +7878,7 @@ cdef class CUstreamAttrValue(CUstreamAttrValue_v1):
 
 cdef class CUexecAffinitySmCount_v1(CUexecAffinitySmCount_st):
     """
-    Value for CU_EXEC_AFFINITY_TYPE_SM_COUNT
+    Value for :py:obj:`~.CU_EXEC_AFFINITY_TYPE_SM_COUNT`
 
     Attributes
     ----------
@@ -7791,7 +7898,7 @@ cdef class CUexecAffinitySmCount_v1(CUexecAffinitySmCount_st):
 
 cdef class CUexecAffinitySmCount(CUexecAffinitySmCount_v1):
     """
-    Value for CU_EXEC_AFFINITY_TYPE_SM_COUNT
+    Value for :py:obj:`~.CU_EXEC_AFFINITY_TYPE_SM_COUNT`
 
     Attributes
     ----------
@@ -7816,7 +7923,7 @@ cdef class CUexecAffinityParam_v1(CUexecAffinityParam_st):
     Attributes
     ----------
     {{if 'CUexecAffinityParam_st.type' in found_struct}}
-    type : CUexecAffinityType
+    type : :py:obj:`~.CUexecAffinityType`
         Type of execution affinity.
     {{endif}}
     {{if 'CUexecAffinityParam_st.param' in found_struct}}
@@ -7840,7 +7947,7 @@ cdef class CUexecAffinityParam(CUexecAffinityParam_v1):
     Attributes
     ----------
     {{if 'CUexecAffinityParam_st.type' in found_struct}}
-    type : CUexecAffinityType
+    type : :py:obj:`~.CUexecAffinityType`
         Type of execution affinity.
     {{endif}}
     {{if 'CUexecAffinityParam_st.param' in found_struct}}
@@ -7864,7 +7971,7 @@ cdef class CUctxCigParam(CUctxCigParam_st):
     Attributes
     ----------
     {{if 'CUctxCigParam_st.sharedDataType' in found_struct}}
-    sharedDataType : CUcigDataType
+    sharedDataType : :py:obj:`~.CUcigDataType`
         Type of shared data from graphics client (D3D12 or Vulkan).
     {{endif}}
     {{if 'CUctxCigParam_st.sharedData' in found_struct}}
@@ -7891,7 +7998,7 @@ cdef class CUctxCreateParams(CUctxCreateParams_st):
     Attributes
     ----------
     {{if 'CUctxCreateParams_st.execAffinityParams' in found_struct}}
-    execAffinityParams : CUexecAffinityParam
+    execAffinityParams : :py:obj:`~.CUexecAffinityParam`
         Array of execution affinity parameters to limit context resources
         (e.g., SM count). Only supported Volta+ MPS. Mutually exclusive
         with cigParams.
@@ -7902,7 +8009,7 @@ cdef class CUctxCreateParams(CUctxCreateParams_st):
         execAffinityParams is NULL.
     {{endif}}
     {{if 'CUctxCreateParams_st.cigParams' in found_struct}}
-    cigParams : CUctxCigParam
+    cigParams : :py:obj:`~.CUctxCigParam`
         CIG (CUDA in Graphics) parameters for sharing data from
         D3D12/Vulkan graphics clients. Mutually exclusive with
         execAffinityParams.
@@ -7924,7 +8031,7 @@ cdef class CUstreamCigParam(CUstreamCigParam_st):
     Attributes
     ----------
     {{if 'CUstreamCigParam_st.streamSharedDataType' in found_struct}}
-    streamSharedDataType : CUstreamCigDataType
+    streamSharedDataType : :py:obj:`~.CUstreamCigDataType`
         Type of shared data from graphics client (D3D12).
     {{endif}}
     {{if 'CUstreamCigParam_st.streamSharedData' in found_struct}}
@@ -7950,7 +8057,7 @@ cdef class CUstreamCigCaptureParams(CUstreamCigCaptureParams_st):
     Attributes
     ----------
     {{if 'CUstreamCigCaptureParams_st.streamCigParams' in found_struct}}
-    streamCigParams : CUstreamCigParam
+    streamCigParams : :py:obj:`~.CUstreamCigParam`
         CIG (CUDA in Graphics) parameters for sharing command list data
         from D3D12 graphics clients.
     {{endif}}
@@ -8009,7 +8116,7 @@ cdef class CUDA_MEMCPY2D_v2(CUDA_MEMCPY2D_st):
         Source Y
     {{endif}}
     {{if 'CUDA_MEMCPY2D_st.srcMemoryType' in found_struct}}
-    srcMemoryType : CUmemorytype
+    srcMemoryType : :py:obj:`~.CUmemorytype`
         Source memory type (host, device, array)
     {{endif}}
     {{if 'CUDA_MEMCPY2D_st.srcHost' in found_struct}}
@@ -8017,11 +8124,11 @@ cdef class CUDA_MEMCPY2D_v2(CUDA_MEMCPY2D_st):
         Source host pointer
     {{endif}}
     {{if 'CUDA_MEMCPY2D_st.srcDevice' in found_struct}}
-    srcDevice : CUdeviceptr
+    srcDevice : :py:obj:`~.CUdeviceptr`
         Source device pointer
     {{endif}}
     {{if 'CUDA_MEMCPY2D_st.srcArray' in found_struct}}
-    srcArray : CUarray
+    srcArray : :py:obj:`~.CUarray`
         Source array reference
     {{endif}}
     {{if 'CUDA_MEMCPY2D_st.srcPitch' in found_struct}}
@@ -8037,7 +8144,7 @@ cdef class CUDA_MEMCPY2D_v2(CUDA_MEMCPY2D_st):
         Destination Y
     {{endif}}
     {{if 'CUDA_MEMCPY2D_st.dstMemoryType' in found_struct}}
-    dstMemoryType : CUmemorytype
+    dstMemoryType : :py:obj:`~.CUmemorytype`
         Destination memory type (host, device, array)
     {{endif}}
     {{if 'CUDA_MEMCPY2D_st.dstHost' in found_struct}}
@@ -8045,11 +8152,11 @@ cdef class CUDA_MEMCPY2D_v2(CUDA_MEMCPY2D_st):
         Destination host pointer
     {{endif}}
     {{if 'CUDA_MEMCPY2D_st.dstDevice' in found_struct}}
-    dstDevice : CUdeviceptr
+    dstDevice : :py:obj:`~.CUdeviceptr`
         Destination device pointer
     {{endif}}
     {{if 'CUDA_MEMCPY2D_st.dstArray' in found_struct}}
-    dstArray : CUarray
+    dstArray : :py:obj:`~.CUarray`
         Destination array reference
     {{endif}}
     {{if 'CUDA_MEMCPY2D_st.dstPitch' in found_struct}}
@@ -8089,7 +8196,7 @@ cdef class CUDA_MEMCPY2D(CUDA_MEMCPY2D_v2):
         Source Y
     {{endif}}
     {{if 'CUDA_MEMCPY2D_st.srcMemoryType' in found_struct}}
-    srcMemoryType : CUmemorytype
+    srcMemoryType : :py:obj:`~.CUmemorytype`
         Source memory type (host, device, array)
     {{endif}}
     {{if 'CUDA_MEMCPY2D_st.srcHost' in found_struct}}
@@ -8097,11 +8204,11 @@ cdef class CUDA_MEMCPY2D(CUDA_MEMCPY2D_v2):
         Source host pointer
     {{endif}}
     {{if 'CUDA_MEMCPY2D_st.srcDevice' in found_struct}}
-    srcDevice : CUdeviceptr
+    srcDevice : :py:obj:`~.CUdeviceptr`
         Source device pointer
     {{endif}}
     {{if 'CUDA_MEMCPY2D_st.srcArray' in found_struct}}
-    srcArray : CUarray
+    srcArray : :py:obj:`~.CUarray`
         Source array reference
     {{endif}}
     {{if 'CUDA_MEMCPY2D_st.srcPitch' in found_struct}}
@@ -8117,7 +8224,7 @@ cdef class CUDA_MEMCPY2D(CUDA_MEMCPY2D_v2):
         Destination Y
     {{endif}}
     {{if 'CUDA_MEMCPY2D_st.dstMemoryType' in found_struct}}
-    dstMemoryType : CUmemorytype
+    dstMemoryType : :py:obj:`~.CUmemorytype`
         Destination memory type (host, device, array)
     {{endif}}
     {{if 'CUDA_MEMCPY2D_st.dstHost' in found_struct}}
@@ -8125,11 +8232,11 @@ cdef class CUDA_MEMCPY2D(CUDA_MEMCPY2D_v2):
         Destination host pointer
     {{endif}}
     {{if 'CUDA_MEMCPY2D_st.dstDevice' in found_struct}}
-    dstDevice : CUdeviceptr
+    dstDevice : :py:obj:`~.CUdeviceptr`
         Destination device pointer
     {{endif}}
     {{if 'CUDA_MEMCPY2D_st.dstArray' in found_struct}}
-    dstArray : CUarray
+    dstArray : :py:obj:`~.CUarray`
         Destination array reference
     {{endif}}
     {{if 'CUDA_MEMCPY2D_st.dstPitch' in found_struct}}
@@ -8177,7 +8284,7 @@ cdef class CUDA_MEMCPY3D_v2(CUDA_MEMCPY3D_st):
         Source LOD
     {{endif}}
     {{if 'CUDA_MEMCPY3D_st.srcMemoryType' in found_struct}}
-    srcMemoryType : CUmemorytype
+    srcMemoryType : :py:obj:`~.CUmemorytype`
         Source memory type (host, device, array)
     {{endif}}
     {{if 'CUDA_MEMCPY3D_st.srcHost' in found_struct}}
@@ -8185,11 +8292,11 @@ cdef class CUDA_MEMCPY3D_v2(CUDA_MEMCPY3D_st):
         Source host pointer
     {{endif}}
     {{if 'CUDA_MEMCPY3D_st.srcDevice' in found_struct}}
-    srcDevice : CUdeviceptr
+    srcDevice : :py:obj:`~.CUdeviceptr`
         Source device pointer
     {{endif}}
     {{if 'CUDA_MEMCPY3D_st.srcArray' in found_struct}}
-    srcArray : CUarray
+    srcArray : :py:obj:`~.CUarray`
         Source array reference
     {{endif}}
     {{if 'CUDA_MEMCPY3D_st.reserved0' in found_struct}}
@@ -8221,7 +8328,7 @@ cdef class CUDA_MEMCPY3D_v2(CUDA_MEMCPY3D_st):
         Destination LOD
     {{endif}}
     {{if 'CUDA_MEMCPY3D_st.dstMemoryType' in found_struct}}
-    dstMemoryType : CUmemorytype
+    dstMemoryType : :py:obj:`~.CUmemorytype`
         Destination memory type (host, device, array)
     {{endif}}
     {{if 'CUDA_MEMCPY3D_st.dstHost' in found_struct}}
@@ -8229,11 +8336,11 @@ cdef class CUDA_MEMCPY3D_v2(CUDA_MEMCPY3D_st):
         Destination host pointer
     {{endif}}
     {{if 'CUDA_MEMCPY3D_st.dstDevice' in found_struct}}
-    dstDevice : CUdeviceptr
+    dstDevice : :py:obj:`~.CUdeviceptr`
         Destination device pointer
     {{endif}}
     {{if 'CUDA_MEMCPY3D_st.dstArray' in found_struct}}
-    dstArray : CUarray
+    dstArray : :py:obj:`~.CUarray`
         Destination array reference
     {{endif}}
     {{if 'CUDA_MEMCPY3D_st.reserved1' in found_struct}}
@@ -8294,7 +8401,7 @@ cdef class CUDA_MEMCPY3D(CUDA_MEMCPY3D_v2):
         Source LOD
     {{endif}}
     {{if 'CUDA_MEMCPY3D_st.srcMemoryType' in found_struct}}
-    srcMemoryType : CUmemorytype
+    srcMemoryType : :py:obj:`~.CUmemorytype`
         Source memory type (host, device, array)
     {{endif}}
     {{if 'CUDA_MEMCPY3D_st.srcHost' in found_struct}}
@@ -8302,11 +8409,11 @@ cdef class CUDA_MEMCPY3D(CUDA_MEMCPY3D_v2):
         Source host pointer
     {{endif}}
     {{if 'CUDA_MEMCPY3D_st.srcDevice' in found_struct}}
-    srcDevice : CUdeviceptr
+    srcDevice : :py:obj:`~.CUdeviceptr`
         Source device pointer
     {{endif}}
     {{if 'CUDA_MEMCPY3D_st.srcArray' in found_struct}}
-    srcArray : CUarray
+    srcArray : :py:obj:`~.CUarray`
         Source array reference
     {{endif}}
     {{if 'CUDA_MEMCPY3D_st.reserved0' in found_struct}}
@@ -8338,7 +8445,7 @@ cdef class CUDA_MEMCPY3D(CUDA_MEMCPY3D_v2):
         Destination LOD
     {{endif}}
     {{if 'CUDA_MEMCPY3D_st.dstMemoryType' in found_struct}}
-    dstMemoryType : CUmemorytype
+    dstMemoryType : :py:obj:`~.CUmemorytype`
         Destination memory type (host, device, array)
     {{endif}}
     {{if 'CUDA_MEMCPY3D_st.dstHost' in found_struct}}
@@ -8346,11 +8453,11 @@ cdef class CUDA_MEMCPY3D(CUDA_MEMCPY3D_v2):
         Destination host pointer
     {{endif}}
     {{if 'CUDA_MEMCPY3D_st.dstDevice' in found_struct}}
-    dstDevice : CUdeviceptr
+    dstDevice : :py:obj:`~.CUdeviceptr`
         Destination device pointer
     {{endif}}
     {{if 'CUDA_MEMCPY3D_st.dstArray' in found_struct}}
-    dstArray : CUarray
+    dstArray : :py:obj:`~.CUarray`
         Destination array reference
     {{endif}}
     {{if 'CUDA_MEMCPY3D_st.reserved1' in found_struct}}
@@ -8411,7 +8518,7 @@ cdef class CUDA_MEMCPY3D_PEER_v1(CUDA_MEMCPY3D_PEER_st):
         Source LOD
     {{endif}}
     {{if 'CUDA_MEMCPY3D_PEER_st.srcMemoryType' in found_struct}}
-    srcMemoryType : CUmemorytype
+    srcMemoryType : :py:obj:`~.CUmemorytype`
         Source memory type (host, device, array)
     {{endif}}
     {{if 'CUDA_MEMCPY3D_PEER_st.srcHost' in found_struct}}
@@ -8419,16 +8526,17 @@ cdef class CUDA_MEMCPY3D_PEER_v1(CUDA_MEMCPY3D_PEER_st):
         Source host pointer
     {{endif}}
     {{if 'CUDA_MEMCPY3D_PEER_st.srcDevice' in found_struct}}
-    srcDevice : CUdeviceptr
+    srcDevice : :py:obj:`~.CUdeviceptr`
         Source device pointer
     {{endif}}
     {{if 'CUDA_MEMCPY3D_PEER_st.srcArray' in found_struct}}
-    srcArray : CUarray
+    srcArray : :py:obj:`~.CUarray`
         Source array reference
     {{endif}}
     {{if 'CUDA_MEMCPY3D_PEER_st.srcContext' in found_struct}}
-    srcContext : CUcontext
-        Source context (ignored with srcMemoryType is CU_MEMORYTYPE_ARRAY)
+    srcContext : :py:obj:`~.CUcontext`
+        Source context (ignored with srcMemoryType is
+        :py:obj:`~.CU_MEMORYTYPE_ARRAY`)
     {{endif}}
     {{if 'CUDA_MEMCPY3D_PEER_st.srcPitch' in found_struct}}
     srcPitch : size_t
@@ -8455,7 +8563,7 @@ cdef class CUDA_MEMCPY3D_PEER_v1(CUDA_MEMCPY3D_PEER_st):
         Destination LOD
     {{endif}}
     {{if 'CUDA_MEMCPY3D_PEER_st.dstMemoryType' in found_struct}}
-    dstMemoryType : CUmemorytype
+    dstMemoryType : :py:obj:`~.CUmemorytype`
         Destination memory type (host, device, array)
     {{endif}}
     {{if 'CUDA_MEMCPY3D_PEER_st.dstHost' in found_struct}}
@@ -8463,17 +8571,17 @@ cdef class CUDA_MEMCPY3D_PEER_v1(CUDA_MEMCPY3D_PEER_st):
         Destination host pointer
     {{endif}}
     {{if 'CUDA_MEMCPY3D_PEER_st.dstDevice' in found_struct}}
-    dstDevice : CUdeviceptr
+    dstDevice : :py:obj:`~.CUdeviceptr`
         Destination device pointer
     {{endif}}
     {{if 'CUDA_MEMCPY3D_PEER_st.dstArray' in found_struct}}
-    dstArray : CUarray
+    dstArray : :py:obj:`~.CUarray`
         Destination array reference
     {{endif}}
     {{if 'CUDA_MEMCPY3D_PEER_st.dstContext' in found_struct}}
-    dstContext : CUcontext
+    dstContext : :py:obj:`~.CUcontext`
         Destination context (ignored with dstMemoryType is
-        CU_MEMORYTYPE_ARRAY)
+        :py:obj:`~.CU_MEMORYTYPE_ARRAY`)
     {{endif}}
     {{if 'CUDA_MEMCPY3D_PEER_st.dstPitch' in found_struct}}
     dstPitch : size_t
@@ -8529,7 +8637,7 @@ cdef class CUDA_MEMCPY3D_PEER(CUDA_MEMCPY3D_PEER_v1):
         Source LOD
     {{endif}}
     {{if 'CUDA_MEMCPY3D_PEER_st.srcMemoryType' in found_struct}}
-    srcMemoryType : CUmemorytype
+    srcMemoryType : :py:obj:`~.CUmemorytype`
         Source memory type (host, device, array)
     {{endif}}
     {{if 'CUDA_MEMCPY3D_PEER_st.srcHost' in found_struct}}
@@ -8537,16 +8645,17 @@ cdef class CUDA_MEMCPY3D_PEER(CUDA_MEMCPY3D_PEER_v1):
         Source host pointer
     {{endif}}
     {{if 'CUDA_MEMCPY3D_PEER_st.srcDevice' in found_struct}}
-    srcDevice : CUdeviceptr
+    srcDevice : :py:obj:`~.CUdeviceptr`
         Source device pointer
     {{endif}}
     {{if 'CUDA_MEMCPY3D_PEER_st.srcArray' in found_struct}}
-    srcArray : CUarray
+    srcArray : :py:obj:`~.CUarray`
         Source array reference
     {{endif}}
     {{if 'CUDA_MEMCPY3D_PEER_st.srcContext' in found_struct}}
-    srcContext : CUcontext
-        Source context (ignored with srcMemoryType is CU_MEMORYTYPE_ARRAY)
+    srcContext : :py:obj:`~.CUcontext`
+        Source context (ignored with srcMemoryType is
+        :py:obj:`~.CU_MEMORYTYPE_ARRAY`)
     {{endif}}
     {{if 'CUDA_MEMCPY3D_PEER_st.srcPitch' in found_struct}}
     srcPitch : size_t
@@ -8573,7 +8682,7 @@ cdef class CUDA_MEMCPY3D_PEER(CUDA_MEMCPY3D_PEER_v1):
         Destination LOD
     {{endif}}
     {{if 'CUDA_MEMCPY3D_PEER_st.dstMemoryType' in found_struct}}
-    dstMemoryType : CUmemorytype
+    dstMemoryType : :py:obj:`~.CUmemorytype`
         Destination memory type (host, device, array)
     {{endif}}
     {{if 'CUDA_MEMCPY3D_PEER_st.dstHost' in found_struct}}
@@ -8581,17 +8690,17 @@ cdef class CUDA_MEMCPY3D_PEER(CUDA_MEMCPY3D_PEER_v1):
         Destination host pointer
     {{endif}}
     {{if 'CUDA_MEMCPY3D_PEER_st.dstDevice' in found_struct}}
-    dstDevice : CUdeviceptr
+    dstDevice : :py:obj:`~.CUdeviceptr`
         Destination device pointer
     {{endif}}
     {{if 'CUDA_MEMCPY3D_PEER_st.dstArray' in found_struct}}
-    dstArray : CUarray
+    dstArray : :py:obj:`~.CUarray`
         Destination array reference
     {{endif}}
     {{if 'CUDA_MEMCPY3D_PEER_st.dstContext' in found_struct}}
-    dstContext : CUcontext
+    dstContext : :py:obj:`~.CUcontext`
         Destination context (ignored with dstMemoryType is
-        CU_MEMORYTYPE_ARRAY)
+        :py:obj:`~.CU_MEMORYTYPE_ARRAY`)
     {{endif}}
     {{if 'CUDA_MEMCPY3D_PEER_st.dstPitch' in found_struct}}
     dstPitch : size_t
@@ -8639,11 +8748,11 @@ cdef class CUDA_MEMCPY_NODE_PARAMS(CUDA_MEMCPY_NODE_PARAMS_st):
         Must be zero
     {{endif}}
     {{if 'CUDA_MEMCPY_NODE_PARAMS_st.copyCtx' in found_struct}}
-    copyCtx : CUcontext
+    copyCtx : :py:obj:`~.CUcontext`
         Context on which to run the node
     {{endif}}
     {{if 'CUDA_MEMCPY_NODE_PARAMS_st.copyParams' in found_struct}}
-    copyParams : CUDA_MEMCPY3D
+    copyParams : :py:obj:`~.CUDA_MEMCPY3D`
         Parameters for the memory copy
     {{endif}}
 
@@ -8671,7 +8780,7 @@ cdef class CUDA_ARRAY_DESCRIPTOR_v2(CUDA_ARRAY_DESCRIPTOR_st):
         Height of array
     {{endif}}
     {{if 'CUDA_ARRAY_DESCRIPTOR_st.Format' in found_struct}}
-    Format : CUarray_format
+    Format : :py:obj:`~.CUarray_format`
         Array format
     {{endif}}
     {{if 'CUDA_ARRAY_DESCRIPTOR_st.NumChannels' in found_struct}}
@@ -8703,7 +8812,7 @@ cdef class CUDA_ARRAY_DESCRIPTOR(CUDA_ARRAY_DESCRIPTOR_v2):
         Height of array
     {{endif}}
     {{if 'CUDA_ARRAY_DESCRIPTOR_st.Format' in found_struct}}
-    Format : CUarray_format
+    Format : :py:obj:`~.CUarray_format`
         Array format
     {{endif}}
     {{if 'CUDA_ARRAY_DESCRIPTOR_st.NumChannels' in found_struct}}
@@ -8739,7 +8848,7 @@ cdef class CUDA_ARRAY3D_DESCRIPTOR_v2(CUDA_ARRAY3D_DESCRIPTOR_st):
         Depth of 3D array
     {{endif}}
     {{if 'CUDA_ARRAY3D_DESCRIPTOR_st.Format' in found_struct}}
-    Format : CUarray_format
+    Format : :py:obj:`~.CUarray_format`
         Array format
     {{endif}}
     {{if 'CUDA_ARRAY3D_DESCRIPTOR_st.NumChannels' in found_struct}}
@@ -8779,7 +8888,7 @@ cdef class CUDA_ARRAY3D_DESCRIPTOR(CUDA_ARRAY3D_DESCRIPTOR_v2):
         Depth of 3D array
     {{endif}}
     {{if 'CUDA_ARRAY3D_DESCRIPTOR_st.Format' in found_struct}}
-    Format : CUarray_format
+    Format : :py:obj:`~.CUarray_format`
         Array format
     {{endif}}
     {{if 'CUDA_ARRAY3D_DESCRIPTOR_st.NumChannels' in found_struct}}
@@ -8821,7 +8930,7 @@ cdef class CUDA_ARRAY_SPARSE_PROPERTIES_v1(CUDA_ARRAY_SPARSE_PROPERTIES_st):
     {{if 'CUDA_ARRAY_SPARSE_PROPERTIES_st.flags' in found_struct}}
     flags : unsigned int
         Flags will either be zero or
-        CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL
+        :py:obj:`~.CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL`
     {{endif}}
     {{if 'CUDA_ARRAY_SPARSE_PROPERTIES_st.reserved' in found_struct}}
     reserved : list[unsigned int]
@@ -8858,7 +8967,7 @@ cdef class CUDA_ARRAY_SPARSE_PROPERTIES(CUDA_ARRAY_SPARSE_PROPERTIES_v1):
     {{if 'CUDA_ARRAY_SPARSE_PROPERTIES_st.flags' in found_struct}}
     flags : unsigned int
         Flags will either be zero or
-        CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL
+        :py:obj:`~.CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL`
     {{endif}}
     {{if 'CUDA_ARRAY_SPARSE_PROPERTIES_st.reserved' in found_struct}}
     reserved : list[unsigned int]
@@ -8937,7 +9046,7 @@ cdef class CUDA_RESOURCE_DESC_v1(CUDA_RESOURCE_DESC_st):
     Attributes
     ----------
     {{if 'CUDA_RESOURCE_DESC_st.resType' in found_struct}}
-    resType : CUresourcetype
+    resType : :py:obj:`~.CUresourcetype`
         Resource type
     {{endif}}
     {{if 'CUDA_RESOURCE_DESC_st.res' in found_struct}}
@@ -8965,7 +9074,7 @@ cdef class CUDA_RESOURCE_DESC(CUDA_RESOURCE_DESC_v1):
     Attributes
     ----------
     {{if 'CUDA_RESOURCE_DESC_st.resType' in found_struct}}
-    resType : CUresourcetype
+    resType : :py:obj:`~.CUresourcetype`
         Resource type
     {{endif}}
     {{if 'CUDA_RESOURCE_DESC_st.res' in found_struct}}
@@ -8993,11 +9102,11 @@ cdef class CUDA_TEXTURE_DESC_v1(CUDA_TEXTURE_DESC_st):
     Attributes
     ----------
     {{if 'CUDA_TEXTURE_DESC_st.addressMode' in found_struct}}
-    addressMode : list[CUaddress_mode]
+    addressMode : list[:py:obj:`~.CUaddress_mode`]
         Address modes
     {{endif}}
     {{if 'CUDA_TEXTURE_DESC_st.filterMode' in found_struct}}
-    filterMode : CUfilter_mode
+    filterMode : :py:obj:`~.CUfilter_mode`
         Filter mode
     {{endif}}
     {{if 'CUDA_TEXTURE_DESC_st.flags' in found_struct}}
@@ -9009,7 +9118,7 @@ cdef class CUDA_TEXTURE_DESC_v1(CUDA_TEXTURE_DESC_st):
         Maximum anisotropy ratio
     {{endif}}
     {{if 'CUDA_TEXTURE_DESC_st.mipmapFilterMode' in found_struct}}
-    mipmapFilterMode : CUfilter_mode
+    mipmapFilterMode : :py:obj:`~.CUfilter_mode`
         Mipmap filter mode
     {{endif}}
     {{if 'CUDA_TEXTURE_DESC_st.mipmapLevelBias' in found_struct}}
@@ -9049,11 +9158,11 @@ cdef class CUDA_TEXTURE_DESC(CUDA_TEXTURE_DESC_v1):
     Attributes
     ----------
     {{if 'CUDA_TEXTURE_DESC_st.addressMode' in found_struct}}
-    addressMode : list[CUaddress_mode]
+    addressMode : list[:py:obj:`~.CUaddress_mode`]
         Address modes
     {{endif}}
     {{if 'CUDA_TEXTURE_DESC_st.filterMode' in found_struct}}
-    filterMode : CUfilter_mode
+    filterMode : :py:obj:`~.CUfilter_mode`
         Filter mode
     {{endif}}
     {{if 'CUDA_TEXTURE_DESC_st.flags' in found_struct}}
@@ -9065,7 +9174,7 @@ cdef class CUDA_TEXTURE_DESC(CUDA_TEXTURE_DESC_v1):
         Maximum anisotropy ratio
     {{endif}}
     {{if 'CUDA_TEXTURE_DESC_st.mipmapFilterMode' in found_struct}}
-    mipmapFilterMode : CUfilter_mode
+    mipmapFilterMode : :py:obj:`~.CUfilter_mode`
         Mipmap filter mode
     {{endif}}
     {{if 'CUDA_TEXTURE_DESC_st.mipmapLevelBias' in found_struct}}
@@ -9105,7 +9214,7 @@ cdef class CUDA_RESOURCE_VIEW_DESC_v1(CUDA_RESOURCE_VIEW_DESC_st):
     Attributes
     ----------
     {{if 'CUDA_RESOURCE_VIEW_DESC_st.format' in found_struct}}
-    format : CUresourceViewFormat
+    format : :py:obj:`~.CUresourceViewFormat`
         Resource view format
     {{endif}}
     {{if 'CUDA_RESOURCE_VIEW_DESC_st.width' in found_struct}}
@@ -9157,7 +9266,7 @@ cdef class CUDA_RESOURCE_VIEW_DESC(CUDA_RESOURCE_VIEW_DESC_v1):
     Attributes
     ----------
     {{if 'CUDA_RESOURCE_VIEW_DESC_st.format' in found_struct}}
-    format : CUresourceViewFormat
+    format : :py:obj:`~.CUresourceViewFormat`
         Resource view format
     {{endif}}
     {{if 'CUDA_RESOURCE_VIEW_DESC_st.width' in found_struct}}
@@ -9210,7 +9319,7 @@ cdef class CUtensorMap(CUtensorMap_st):
     Attributes
     ----------
     {{if 'CUtensorMap_st.opaque' in found_struct}}
-    opaque : list[cuuint64_t]
+    opaque : list[:py:obj:`~.cuuint64_t`]
 
     {{endif}}
 
@@ -9278,7 +9387,7 @@ cdef class CUDA_LAUNCH_PARAMS_v1(CUDA_LAUNCH_PARAMS_st):
     Attributes
     ----------
     {{if 'CUDA_LAUNCH_PARAMS_st.function' in found_struct}}
-    function : CUfunction
+    function : :py:obj:`~.CUfunction`
         Kernel to launch
     {{endif}}
     {{if 'CUDA_LAUNCH_PARAMS_st.gridDimX' in found_struct}}
@@ -9310,7 +9419,7 @@ cdef class CUDA_LAUNCH_PARAMS_v1(CUDA_LAUNCH_PARAMS_st):
         Dynamic shared-memory size per thread block in bytes
     {{endif}}
     {{if 'CUDA_LAUNCH_PARAMS_st.hStream' in found_struct}}
-    hStream : CUstream
+    hStream : :py:obj:`~.CUstream`
         Stream identifier
     {{endif}}
     {{if 'CUDA_LAUNCH_PARAMS_st.kernelParams' in found_struct}}
@@ -9334,7 +9443,7 @@ cdef class CUDA_LAUNCH_PARAMS(CUDA_LAUNCH_PARAMS_v1):
     Attributes
     ----------
     {{if 'CUDA_LAUNCH_PARAMS_st.function' in found_struct}}
-    function : CUfunction
+    function : :py:obj:`~.CUfunction`
         Kernel to launch
     {{endif}}
     {{if 'CUDA_LAUNCH_PARAMS_st.gridDimX' in found_struct}}
@@ -9366,7 +9475,7 @@ cdef class CUDA_LAUNCH_PARAMS(CUDA_LAUNCH_PARAMS_v1):
         Dynamic shared-memory size per thread block in bytes
     {{endif}}
     {{if 'CUDA_LAUNCH_PARAMS_st.hStream' in found_struct}}
-    hStream : CUstream
+    hStream : :py:obj:`~.CUstream`
         Stream identifier
     {{endif}}
     {{if 'CUDA_LAUNCH_PARAMS_st.kernelParams' in found_struct}}
@@ -9390,7 +9499,7 @@ cdef class CUDA_EXTERNAL_MEMORY_HANDLE_DESC_v1(CUDA_EXTERNAL_MEMORY_HANDLE_DESC_
     Attributes
     ----------
     {{if 'CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st.type' in found_struct}}
-    type : CUexternalMemoryHandleType
+    type : :py:obj:`~.CUexternalMemoryHandleType`
         Type of the handle
     {{endif}}
     {{if 'CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st.handle' in found_struct}}
@@ -9403,7 +9512,8 @@ cdef class CUDA_EXTERNAL_MEMORY_HANDLE_DESC_v1(CUDA_EXTERNAL_MEMORY_HANDLE_DESC_
     {{endif}}
     {{if 'CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st.flags' in found_struct}}
     flags : unsigned int
-        Flags must either be zero or CUDA_EXTERNAL_MEMORY_DEDICATED
+        Flags must either be zero or
+        :py:obj:`~.CUDA_EXTERNAL_MEMORY_DEDICATED`
     {{endif}}
     {{if 'CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st.reserved' in found_struct}}
     reserved : list[unsigned int]
@@ -9426,7 +9536,7 @@ cdef class CUDA_EXTERNAL_MEMORY_HANDLE_DESC(CUDA_EXTERNAL_MEMORY_HANDLE_DESC_v1)
     Attributes
     ----------
     {{if 'CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st.type' in found_struct}}
-    type : CUexternalMemoryHandleType
+    type : :py:obj:`~.CUexternalMemoryHandleType`
         Type of the handle
     {{endif}}
     {{if 'CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st.handle' in found_struct}}
@@ -9439,7 +9549,8 @@ cdef class CUDA_EXTERNAL_MEMORY_HANDLE_DESC(CUDA_EXTERNAL_MEMORY_HANDLE_DESC_v1)
     {{endif}}
     {{if 'CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st.flags' in found_struct}}
     flags : unsigned int
-        Flags must either be zero or CUDA_EXTERNAL_MEMORY_DEDICATED
+        Flags must either be zero or
+        :py:obj:`~.CUDA_EXTERNAL_MEMORY_DEDICATED`
     {{endif}}
     {{if 'CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st.reserved' in found_struct}}
     reserved : list[unsigned int]
@@ -9531,7 +9642,7 @@ cdef class CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_v1(CUDA_EXTERNAL_MEMORY_MIP
         chain is.
     {{endif}}
     {{if 'CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st.arrayDesc' in found_struct}}
-    arrayDesc : CUDA_ARRAY3D_DESCRIPTOR
+    arrayDesc : :py:obj:`~.CUDA_ARRAY3D_DESCRIPTOR`
         Format, dimension and type of base level of the mipmap chain
     {{endif}}
     {{if 'CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st.numLevels' in found_struct}}
@@ -9564,7 +9675,7 @@ cdef class CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC(CUDA_EXTERNAL_MEMORY_MIPMAP
         chain is.
     {{endif}}
     {{if 'CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st.arrayDesc' in found_struct}}
-    arrayDesc : CUDA_ARRAY3D_DESCRIPTOR
+    arrayDesc : :py:obj:`~.CUDA_ARRAY3D_DESCRIPTOR`
         Format, dimension and type of base level of the mipmap chain
     {{endif}}
     {{if 'CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st.numLevels' in found_struct}}
@@ -9592,7 +9703,7 @@ cdef class CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_v1(CUDA_EXTERNAL_SEMAPHORE_HANDLE
     Attributes
     ----------
     {{if 'CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st.type' in found_struct}}
-    type : CUexternalSemaphoreHandleType
+    type : :py:obj:`~.CUexternalSemaphoreHandleType`
         Type of the handle
     {{endif}}
     {{if 'CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st.handle' in found_struct}}
@@ -9624,7 +9735,7 @@ cdef class CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC(CUDA_EXTERNAL_SEMAPHORE_HANDLE_DE
     Attributes
     ----------
     {{if 'CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st.type' in found_struct}}
-    type : CUexternalSemaphoreHandleType
+    type : :py:obj:`~.CUexternalSemaphoreHandleType`
         Type of the handle
     {{endif}}
     {{if 'CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st.handle' in found_struct}}
@@ -9661,14 +9772,17 @@ cdef class CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_v1(CUDA_EXTERNAL_SEMAPHORE_SIGN
     {{endif}}
     {{if 'CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st.flags' in found_struct}}
     flags : unsigned int
-        Only when CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS is used to signal a
-        CUexternalSemaphore of type
-        CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, the valid flag is
-        CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC which
-        indicates that while signaling the CUexternalSemaphore, no memory
-        synchronization operations should be performed for any external
-        memory object imported as CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF.
-        For all other types of CUexternalSemaphore, flags must be zero.
+        Only when :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS` is used
+        to signal a :py:obj:`~.CUexternalSemaphore` of type
+        :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC`, the valid
+        flag is
+        :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC`
+        which indicates that while signaling the
+        :py:obj:`~.CUexternalSemaphore`, no memory synchronization
+        operations should be performed for any external memory object
+        imported as :py:obj:`~.CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF`.
+        For all other types of :py:obj:`~.CUexternalSemaphore`, flags must
+        be zero.
     {{endif}}
     {{if 'CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st.reserved' in found_struct}}
     reserved : list[unsigned int]
@@ -9696,14 +9810,17 @@ cdef class CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS(CUDA_EXTERNAL_SEMAPHORE_SIGNAL_
     {{endif}}
     {{if 'CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st.flags' in found_struct}}
     flags : unsigned int
-        Only when CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS is used to signal a
-        CUexternalSemaphore of type
-        CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, the valid flag is
-        CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC which
-        indicates that while signaling the CUexternalSemaphore, no memory
-        synchronization operations should be performed for any external
-        memory object imported as CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF.
-        For all other types of CUexternalSemaphore, flags must be zero.
+        Only when :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS` is used
+        to signal a :py:obj:`~.CUexternalSemaphore` of type
+        :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC`, the valid
+        flag is
+        :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC`
+        which indicates that while signaling the
+        :py:obj:`~.CUexternalSemaphore`, no memory synchronization
+        operations should be performed for any external memory object
+        imported as :py:obj:`~.CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF`.
+        For all other types of :py:obj:`~.CUexternalSemaphore`, flags must
+        be zero.
     {{endif}}
     {{if 'CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st.reserved' in found_struct}}
     reserved : list[unsigned int]
@@ -9731,14 +9848,17 @@ cdef class CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_v1(CUDA_EXTERNAL_SEMAPHORE_WAIT_P
     {{endif}}
     {{if 'CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st.flags' in found_struct}}
     flags : unsigned int
-        Only when CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS is used to wait on a
-        CUexternalSemaphore of type
-        CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, the valid flag is
-        CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC which indicates
-        that while waiting for the CUexternalSemaphore, no memory
-        synchronization operations should be performed for any external
-        memory object imported as CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF.
-        For all other types of CUexternalSemaphore, flags must be zero.
+        Only when :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS` is used
+        to wait on a :py:obj:`~.CUexternalSemaphore` of type
+        :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC`, the valid
+        flag is
+        :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC`
+        which indicates that while waiting for the
+        :py:obj:`~.CUexternalSemaphore`, no memory synchronization
+        operations should be performed for any external memory object
+        imported as :py:obj:`~.CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF`.
+        For all other types of :py:obj:`~.CUexternalSemaphore`, flags must
+        be zero.
     {{endif}}
     {{if 'CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st.reserved' in found_struct}}
     reserved : list[unsigned int]
@@ -9766,14 +9886,17 @@ cdef class CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS(CUDA_EXTERNAL_SEMAPHORE_WAIT_PARA
     {{endif}}
     {{if 'CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st.flags' in found_struct}}
     flags : unsigned int
-        Only when CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS is used to wait on a
-        CUexternalSemaphore of type
-        CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, the valid flag is
-        CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC which indicates
-        that while waiting for the CUexternalSemaphore, no memory
-        synchronization operations should be performed for any external
-        memory object imported as CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF.
-        For all other types of CUexternalSemaphore, flags must be zero.
+        Only when :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS` is used
+        to wait on a :py:obj:`~.CUexternalSemaphore` of type
+        :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC`, the valid
+        flag is
+        :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC`
+        which indicates that while waiting for the
+        :py:obj:`~.CUexternalSemaphore`, no memory synchronization
+        operations should be performed for any external memory object
+        imported as :py:obj:`~.CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF`.
+        For all other types of :py:obj:`~.CUexternalSemaphore`, flags must
+        be zero.
     {{endif}}
     {{if 'CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st.reserved' in found_struct}}
     reserved : list[unsigned int]
@@ -9796,11 +9919,11 @@ cdef class CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1(CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_st
     Attributes
     ----------
     {{if 'CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_st.extSemArray' in found_struct}}
-    extSemArray : CUexternalSemaphore
+    extSemArray : :py:obj:`~.CUexternalSemaphore`
         Array of external semaphore handles.
     {{endif}}
     {{if 'CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_st.paramsArray' in found_struct}}
-    paramsArray : CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS
+    paramsArray : :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS`
         Array of external semaphore signal parameters.
     {{endif}}
     {{if 'CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_st.numExtSems' in found_struct}}
@@ -9825,11 +9948,11 @@ cdef class CUDA_EXT_SEM_SIGNAL_NODE_PARAMS(CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v1):
     Attributes
     ----------
     {{if 'CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_st.extSemArray' in found_struct}}
-    extSemArray : CUexternalSemaphore
+    extSemArray : :py:obj:`~.CUexternalSemaphore`
         Array of external semaphore handles.
     {{endif}}
     {{if 'CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_st.paramsArray' in found_struct}}
-    paramsArray : CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS
+    paramsArray : :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS`
         Array of external semaphore signal parameters.
     {{endif}}
     {{if 'CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_st.numExtSems' in found_struct}}
@@ -9854,11 +9977,11 @@ cdef class CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v2(CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v2
     Attributes
     ----------
     {{if 'CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v2_st.extSemArray' in found_struct}}
-    extSemArray : CUexternalSemaphore
+    extSemArray : :py:obj:`~.CUexternalSemaphore`
         Array of external semaphore handles.
     {{endif}}
     {{if 'CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v2_st.paramsArray' in found_struct}}
-    paramsArray : CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS
+    paramsArray : :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS`
         Array of external semaphore signal parameters.
     {{endif}}
     {{if 'CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v2_st.numExtSems' in found_struct}}
@@ -9883,11 +10006,11 @@ cdef class CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1(CUDA_EXT_SEM_WAIT_NODE_PARAMS_st):
     Attributes
     ----------
     {{if 'CUDA_EXT_SEM_WAIT_NODE_PARAMS_st.extSemArray' in found_struct}}
-    extSemArray : CUexternalSemaphore
+    extSemArray : :py:obj:`~.CUexternalSemaphore`
         Array of external semaphore handles.
     {{endif}}
     {{if 'CUDA_EXT_SEM_WAIT_NODE_PARAMS_st.paramsArray' in found_struct}}
-    paramsArray : CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS
+    paramsArray : :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS`
         Array of external semaphore wait parameters.
     {{endif}}
     {{if 'CUDA_EXT_SEM_WAIT_NODE_PARAMS_st.numExtSems' in found_struct}}
@@ -9912,11 +10035,11 @@ cdef class CUDA_EXT_SEM_WAIT_NODE_PARAMS(CUDA_EXT_SEM_WAIT_NODE_PARAMS_v1):
     Attributes
     ----------
     {{if 'CUDA_EXT_SEM_WAIT_NODE_PARAMS_st.extSemArray' in found_struct}}
-    extSemArray : CUexternalSemaphore
+    extSemArray : :py:obj:`~.CUexternalSemaphore`
         Array of external semaphore handles.
     {{endif}}
     {{if 'CUDA_EXT_SEM_WAIT_NODE_PARAMS_st.paramsArray' in found_struct}}
-    paramsArray : CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS
+    paramsArray : :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS`
         Array of external semaphore wait parameters.
     {{endif}}
     {{if 'CUDA_EXT_SEM_WAIT_NODE_PARAMS_st.numExtSems' in found_struct}}
@@ -9941,11 +10064,11 @@ cdef class CUDA_EXT_SEM_WAIT_NODE_PARAMS_v2(CUDA_EXT_SEM_WAIT_NODE_PARAMS_v2_st)
     Attributes
     ----------
     {{if 'CUDA_EXT_SEM_WAIT_NODE_PARAMS_v2_st.extSemArray' in found_struct}}
-    extSemArray : CUexternalSemaphore
+    extSemArray : :py:obj:`~.CUexternalSemaphore`
         Array of external semaphore handles.
     {{endif}}
     {{if 'CUDA_EXT_SEM_WAIT_NODE_PARAMS_v2_st.paramsArray' in found_struct}}
-    paramsArray : CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS
+    paramsArray : :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS`
         Array of external semaphore wait parameters.
     {{endif}}
     {{if 'CUDA_EXT_SEM_WAIT_NODE_PARAMS_v2_st.numExtSems' in found_struct}}
@@ -9985,7 +10108,7 @@ cdef class CUarrayMapInfo_v1(CUarrayMapInfo_st):
     Attributes
     ----------
     {{if 'CUarrayMapInfo_st.resourceType' in found_struct}}
-    resourceType : CUresourcetype
+    resourceType : :py:obj:`~.CUresourcetype`
         Resource type
     {{endif}}
     {{if 'CUarrayMapInfo_st.resource' in found_struct}}
@@ -9993,7 +10116,7 @@ cdef class CUarrayMapInfo_v1(CUarrayMapInfo_st):
 
     {{endif}}
     {{if 'CUarrayMapInfo_st.subresourceType' in found_struct}}
-    subresourceType : CUarraySparseSubresourceType
+    subresourceType : :py:obj:`~.CUarraySparseSubresourceType`
         Sparse subresource type
     {{endif}}
     {{if 'CUarrayMapInfo_st.subresource' in found_struct}}
@@ -10001,11 +10124,11 @@ cdef class CUarrayMapInfo_v1(CUarrayMapInfo_st):
 
     {{endif}}
     {{if 'CUarrayMapInfo_st.memOperationType' in found_struct}}
-    memOperationType : CUmemOperationType
+    memOperationType : :py:obj:`~.CUmemOperationType`
         Memory operation type
     {{endif}}
     {{if 'CUarrayMapInfo_st.memHandleType' in found_struct}}
-    memHandleType : CUmemHandleType
+    memHandleType : :py:obj:`~.CUmemHandleType`
         Memory handle type
     {{endif}}
     {{if 'CUarrayMapInfo_st.memHandle' in found_struct}}
@@ -10046,7 +10169,7 @@ cdef class CUarrayMapInfo(CUarrayMapInfo_v1):
     Attributes
     ----------
     {{if 'CUarrayMapInfo_st.resourceType' in found_struct}}
-    resourceType : CUresourcetype
+    resourceType : :py:obj:`~.CUresourcetype`
         Resource type
     {{endif}}
     {{if 'CUarrayMapInfo_st.resource' in found_struct}}
@@ -10054,7 +10177,7 @@ cdef class CUarrayMapInfo(CUarrayMapInfo_v1):
 
     {{endif}}
     {{if 'CUarrayMapInfo_st.subresourceType' in found_struct}}
-    subresourceType : CUarraySparseSubresourceType
+    subresourceType : :py:obj:`~.CUarraySparseSubresourceType`
         Sparse subresource type
     {{endif}}
     {{if 'CUarrayMapInfo_st.subresource' in found_struct}}
@@ -10062,11 +10185,11 @@ cdef class CUarrayMapInfo(CUarrayMapInfo_v1):
 
     {{endif}}
     {{if 'CUarrayMapInfo_st.memOperationType' in found_struct}}
-    memOperationType : CUmemOperationType
+    memOperationType : :py:obj:`~.CUmemOperationType`
         Memory operation type
     {{endif}}
     {{if 'CUarrayMapInfo_st.memHandleType' in found_struct}}
-    memHandleType : CUmemHandleType
+    memHandleType : :py:obj:`~.CUmemHandleType`
         Memory handle type
     {{endif}}
     {{if 'CUarrayMapInfo_st.memHandle' in found_struct}}
@@ -10106,14 +10229,17 @@ cdef class CUmemLocation_v1(CUmemLocation_st):
     Attributes
     ----------
     {{if 'CUmemLocation_st.type' in found_struct}}
-    type : CUmemLocationType
+    type : :py:obj:`~.CUmemLocationType`
         Specifies the location type, which modifies the meaning of id.
     {{endif}}
     {{if 'CUmemLocation_st.id' in found_struct}}
     id : int
-        Identifier for CUmemLocationType::CU_MEM_LOCATION_TYPE_DEVICE,
-        CUmemLocationType::CU_MEM_LOCATION_TYPE_HOST,
-        CUmemLocationType::CU_MEM_LOCATION_TYPE_HOST_NUMA.
+        Identifier for :py:obj:`~.CUmemLocationType`
+        :py:obj:`~.CU_MEM_LOCATION_TYPE_DEVICE`,
+        :py:obj:`~.CUmemLocationType`
+        :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST`,
+        :py:obj:`~.CUmemLocationType`
+        :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST_NUMA`.
     {{endif}}
 
     Methods
@@ -10132,14 +10258,17 @@ cdef class CUmemLocation(CUmemLocation_v1):
     Attributes
     ----------
     {{if 'CUmemLocation_st.type' in found_struct}}
-    type : CUmemLocationType
+    type : :py:obj:`~.CUmemLocationType`
         Specifies the location type, which modifies the meaning of id.
     {{endif}}
     {{if 'CUmemLocation_st.id' in found_struct}}
     id : int
-        Identifier for CUmemLocationType::CU_MEM_LOCATION_TYPE_DEVICE,
-        CUmemLocationType::CU_MEM_LOCATION_TYPE_HOST,
-        CUmemLocationType::CU_MEM_LOCATION_TYPE_HOST_NUMA.
+        Identifier for :py:obj:`~.CUmemLocationType`
+        :py:obj:`~.CU_MEM_LOCATION_TYPE_DEVICE`,
+        :py:obj:`~.CUmemLocationType`
+        :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST`,
+        :py:obj:`~.CUmemLocationType`
+        :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST_NUMA`.
     {{endif}}
 
     Methods
@@ -10158,24 +10287,24 @@ cdef class CUmemAllocationProp_v1(CUmemAllocationProp_st):
     Attributes
     ----------
     {{if 'CUmemAllocationProp_st.type' in found_struct}}
-    type : CUmemAllocationType
+    type : :py:obj:`~.CUmemAllocationType`
         Allocation type
     {{endif}}
     {{if 'CUmemAllocationProp_st.requestedHandleTypes' in found_struct}}
-    requestedHandleTypes : CUmemAllocationHandleType
-        requested CUmemAllocationHandleType
+    requestedHandleTypes : :py:obj:`~.CUmemAllocationHandleType`
+        requested :py:obj:`~.CUmemAllocationHandleType`
     {{endif}}
     {{if 'CUmemAllocationProp_st.location' in found_struct}}
-    location : CUmemLocation
+    location : :py:obj:`~.CUmemLocation`
         Location of allocation
     {{endif}}
     {{if 'CUmemAllocationProp_st.win32HandleMetaData' in found_struct}}
     win32HandleMetaData : Any
         Windows-specific POBJECT_ATTRIBUTES required when
-        CU_MEM_HANDLE_TYPE_WIN32 is specified. This object attributes
-        structure includes security attributes that define the scope of
-        which exported allocations may be transferred to other processes.
-        In all other cases, this field is required to be zero.
+        :py:obj:`~.CU_MEM_HANDLE_TYPE_WIN32` is specified. This object
+        attributes structure includes security attributes that define the
+        scope of which exported allocations may be transferred to other
+        processes. In all other cases, this field is required to be zero.
     {{endif}}
     {{if 'CUmemAllocationProp_st.allocFlags' in found_struct}}
     allocFlags : anon_struct22
@@ -10198,24 +10327,24 @@ cdef class CUmemAllocationProp(CUmemAllocationProp_v1):
     Attributes
     ----------
     {{if 'CUmemAllocationProp_st.type' in found_struct}}
-    type : CUmemAllocationType
+    type : :py:obj:`~.CUmemAllocationType`
         Allocation type
     {{endif}}
     {{if 'CUmemAllocationProp_st.requestedHandleTypes' in found_struct}}
-    requestedHandleTypes : CUmemAllocationHandleType
-        requested CUmemAllocationHandleType
+    requestedHandleTypes : :py:obj:`~.CUmemAllocationHandleType`
+        requested :py:obj:`~.CUmemAllocationHandleType`
     {{endif}}
     {{if 'CUmemAllocationProp_st.location' in found_struct}}
-    location : CUmemLocation
+    location : :py:obj:`~.CUmemLocation`
         Location of allocation
     {{endif}}
     {{if 'CUmemAllocationProp_st.win32HandleMetaData' in found_struct}}
     win32HandleMetaData : Any
         Windows-specific POBJECT_ATTRIBUTES required when
-        CU_MEM_HANDLE_TYPE_WIN32 is specified. This object attributes
-        structure includes security attributes that define the scope of
-        which exported allocations may be transferred to other processes.
-        In all other cases, this field is required to be zero.
+        :py:obj:`~.CU_MEM_HANDLE_TYPE_WIN32` is specified. This object
+        attributes structure includes security attributes that define the
+        scope of which exported allocations may be transferred to other
+        processes. In all other cases, this field is required to be zero.
     {{endif}}
     {{if 'CUmemAllocationProp_st.allocFlags' in found_struct}}
     allocFlags : anon_struct22
@@ -10249,8 +10378,8 @@ cdef class CUmulticastObjectProp_v1(CUmulticastObjectProp_st):
     {{endif}}
     {{if 'CUmulticastObjectProp_st.handleTypes' in found_struct}}
     handleTypes : unsigned long long
-        Bitmask of exportable handle types (see CUmemAllocationHandleType)
-        for this object
+        Bitmask of exportable handle types (see
+        :py:obj:`~.CUmemAllocationHandleType`) for this object
     {{endif}}
     {{if 'CUmulticastObjectProp_st.flags' in found_struct}}
     flags : unsigned long long
@@ -10284,8 +10413,8 @@ cdef class CUmulticastObjectProp(CUmulticastObjectProp_v1):
     {{endif}}
     {{if 'CUmulticastObjectProp_st.handleTypes' in found_struct}}
     handleTypes : unsigned long long
-        Bitmask of exportable handle types (see CUmemAllocationHandleType)
-        for this object
+        Bitmask of exportable handle types (see
+        :py:obj:`~.CUmemAllocationHandleType`) for this object
     {{endif}}
     {{if 'CUmulticastObjectProp_st.flags' in found_struct}}
     flags : unsigned long long
@@ -10308,11 +10437,11 @@ cdef class CUmemAccessDesc_v1(CUmemAccessDesc_st):
     Attributes
     ----------
     {{if 'CUmemAccessDesc_st.location' in found_struct}}
-    location : CUmemLocation
+    location : :py:obj:`~.CUmemLocation`
         Location on which the request is to change it's accessibility
     {{endif}}
     {{if 'CUmemAccessDesc_st.flags' in found_struct}}
-    flags : CUmemAccess_flags
+    flags : :py:obj:`~.CUmemAccess_flags`
         ``CUmemProt`` accessibility flags to set on the request
     {{endif}}
 
@@ -10332,11 +10461,11 @@ cdef class CUmemAccessDesc(CUmemAccessDesc_v1):
     Attributes
     ----------
     {{if 'CUmemAccessDesc_st.location' in found_struct}}
-    location : CUmemLocation
+    location : :py:obj:`~.CUmemLocation`
         Location on which the request is to change it's accessibility
     {{endif}}
     {{if 'CUmemAccessDesc_st.flags' in found_struct}}
-    flags : CUmemAccess_flags
+    flags : :py:obj:`~.CUmemAccess_flags`
         ``CUmemProt`` accessibility flags to set on the request
     {{endif}}
 
@@ -10351,22 +10480,22 @@ cdef class CUmemAccessDesc(CUmemAccessDesc_v1):
 
 cdef class CUgraphExecUpdateResultInfo_v1(CUgraphExecUpdateResultInfo_st):
     """
-    Result information returned by cuGraphExecUpdate
+    Result information returned by :py:obj:`~.cuGraphExecUpdate`
 
     Attributes
     ----------
     {{if 'CUgraphExecUpdateResultInfo_st.result' in found_struct}}
-    result : CUgraphExecUpdateResult
+    result : :py:obj:`~.CUgraphExecUpdateResult`
         Gives more specific detail when a cuda graph update fails.
     {{endif}}
     {{if 'CUgraphExecUpdateResultInfo_st.errorNode' in found_struct}}
-    errorNode : CUgraphNode
+    errorNode : :py:obj:`~.CUgraphNode`
         The "to node" of the error edge when the topologies do not match.
         The error node when the error is associated with a specific node.
         NULL when the error is generic.
     {{endif}}
     {{if 'CUgraphExecUpdateResultInfo_st.errorFromNode' in found_struct}}
-    errorFromNode : CUgraphNode
+    errorFromNode : :py:obj:`~.CUgraphNode`
         The from node of error edge when the topologies do not match.
         Otherwise NULL.
     {{endif}}
@@ -10382,22 +10511,22 @@ cdef class CUgraphExecUpdateResultInfo_v1(CUgraphExecUpdateResultInfo_st):
 
 cdef class CUgraphExecUpdateResultInfo(CUgraphExecUpdateResultInfo_v1):
     """
-    Result information returned by cuGraphExecUpdate
+    Result information returned by :py:obj:`~.cuGraphExecUpdate`
 
     Attributes
     ----------
     {{if 'CUgraphExecUpdateResultInfo_st.result' in found_struct}}
-    result : CUgraphExecUpdateResult
+    result : :py:obj:`~.CUgraphExecUpdateResult`
         Gives more specific detail when a cuda graph update fails.
     {{endif}}
     {{if 'CUgraphExecUpdateResultInfo_st.errorNode' in found_struct}}
-    errorNode : CUgraphNode
+    errorNode : :py:obj:`~.CUgraphNode`
         The "to node" of the error edge when the topologies do not match.
         The error node when the error is associated with a specific node.
         NULL when the error is generic.
     {{endif}}
     {{if 'CUgraphExecUpdateResultInfo_st.errorFromNode' in found_struct}}
-    errorFromNode : CUgraphNode
+    errorFromNode : :py:obj:`~.CUgraphNode`
         The from node of error edge when the topologies do not match.
         Otherwise NULL.
     {{endif}}
@@ -10418,25 +10547,25 @@ cdef class CUmemPoolProps_v1(CUmemPoolProps_st):
     Attributes
     ----------
     {{if 'CUmemPoolProps_st.allocType' in found_struct}}
-    allocType : CUmemAllocationType
+    allocType : :py:obj:`~.CUmemAllocationType`
         Allocation type. Currently must be specified as
-        CU_MEM_ALLOCATION_TYPE_PINNED
+        :py:obj:`~.CU_MEM_ALLOCATION_TYPE_PINNED`
     {{endif}}
     {{if 'CUmemPoolProps_st.handleTypes' in found_struct}}
-    handleTypes : CUmemAllocationHandleType
+    handleTypes : :py:obj:`~.CUmemAllocationHandleType`
         Handle types that will be supported by allocations from the pool.
     {{endif}}
     {{if 'CUmemPoolProps_st.location' in found_struct}}
-    location : CUmemLocation
+    location : :py:obj:`~.CUmemLocation`
         Location where allocations should reside.
     {{endif}}
     {{if 'CUmemPoolProps_st.win32SecurityAttributes' in found_struct}}
     win32SecurityAttributes : Any
         Windows-specific LPSECURITYATTRIBUTES required when
-        CU_MEM_HANDLE_TYPE_WIN32 is specified. This security attribute
-        defines the scope of which exported allocations may be transferred
-        to other processes. In all other cases, this field is required to
-        be zero.
+        :py:obj:`~.CU_MEM_HANDLE_TYPE_WIN32` is specified. This security
+        attribute defines the scope of which exported allocations may be
+        transferred to other processes. In all other cases, this field is
+        required to be zero.
     {{endif}}
     {{if 'CUmemPoolProps_st.maxSize' in found_struct}}
     maxSize : size_t
@@ -10468,25 +10597,25 @@ cdef class CUmemPoolProps(CUmemPoolProps_v1):
     Attributes
     ----------
     {{if 'CUmemPoolProps_st.allocType' in found_struct}}
-    allocType : CUmemAllocationType
+    allocType : :py:obj:`~.CUmemAllocationType`
         Allocation type. Currently must be specified as
-        CU_MEM_ALLOCATION_TYPE_PINNED
+        :py:obj:`~.CU_MEM_ALLOCATION_TYPE_PINNED`
     {{endif}}
     {{if 'CUmemPoolProps_st.handleTypes' in found_struct}}
-    handleTypes : CUmemAllocationHandleType
+    handleTypes : :py:obj:`~.CUmemAllocationHandleType`
         Handle types that will be supported by allocations from the pool.
     {{endif}}
     {{if 'CUmemPoolProps_st.location' in found_struct}}
-    location : CUmemLocation
+    location : :py:obj:`~.CUmemLocation`
         Location where allocations should reside.
     {{endif}}
     {{if 'CUmemPoolProps_st.win32SecurityAttributes' in found_struct}}
     win32SecurityAttributes : Any
         Windows-specific LPSECURITYATTRIBUTES required when
-        CU_MEM_HANDLE_TYPE_WIN32 is specified. This security attribute
-        defines the scope of which exported allocations may be transferred
-        to other processes. In all other cases, this field is required to
-        be zero.
+        :py:obj:`~.CU_MEM_HANDLE_TYPE_WIN32` is specified. This security
+        attribute defines the scope of which exported allocations may be
+        transferred to other processes. In all other cases, this field is
+        required to be zero.
     {{endif}}
     {{if 'CUmemPoolProps_st.maxSize' in found_struct}}
     maxSize : size_t
@@ -10554,28 +10683,29 @@ cdef class CUmemPoolPtrExportData(CUmemPoolPtrExportData_v1):
 cdef class CUmemcpyAttributes_v1(CUmemcpyAttributes_st):
     """
     Attributes specific to copies within a batch. For more details on
-    usage see cuMemcpyBatchAsync.
+    usage see :py:obj:`~.cuMemcpyBatchAsync`.
 
     Attributes
     ----------
     {{if 'CUmemcpyAttributes_st.srcAccessOrder' in found_struct}}
-    srcAccessOrder : CUmemcpySrcAccessOrder
+    srcAccessOrder : :py:obj:`~.CUmemcpySrcAccessOrder`
         Source access ordering to be observed for copies with this
         attribute.
     {{endif}}
     {{if 'CUmemcpyAttributes_st.srcLocHint' in found_struct}}
-    srcLocHint : CUmemLocation
+    srcLocHint : :py:obj:`~.CUmemLocation`
         Hint location for the source operand. Ignored when the pointers are
         not managed memory or memory allocated outside CUDA.
     {{endif}}
     {{if 'CUmemcpyAttributes_st.dstLocHint' in found_struct}}
-    dstLocHint : CUmemLocation
+    dstLocHint : :py:obj:`~.CUmemLocation`
         Hint location for the destination operand. Ignored when the
         pointers are not managed memory or memory allocated outside CUDA.
     {{endif}}
     {{if 'CUmemcpyAttributes_st.flags' in found_struct}}
     flags : unsigned int
-        Additional flags for copies with this attribute. See CUmemcpyFlags
+        Additional flags for copies with this attribute. See
+        :py:obj:`~.CUmemcpyFlags`
     {{endif}}
 
     Methods
@@ -10590,28 +10720,29 @@ cdef class CUmemcpyAttributes_v1(CUmemcpyAttributes_st):
 cdef class CUmemcpyAttributes(CUmemcpyAttributes_v1):
     """
     Attributes specific to copies within a batch. For more details on
-    usage see cuMemcpyBatchAsync.
+    usage see :py:obj:`~.cuMemcpyBatchAsync`.
 
     Attributes
     ----------
     {{if 'CUmemcpyAttributes_st.srcAccessOrder' in found_struct}}
-    srcAccessOrder : CUmemcpySrcAccessOrder
+    srcAccessOrder : :py:obj:`~.CUmemcpySrcAccessOrder`
         Source access ordering to be observed for copies with this
         attribute.
     {{endif}}
     {{if 'CUmemcpyAttributes_st.srcLocHint' in found_struct}}
-    srcLocHint : CUmemLocation
+    srcLocHint : :py:obj:`~.CUmemLocation`
         Hint location for the source operand. Ignored when the pointers are
         not managed memory or memory allocated outside CUDA.
     {{endif}}
     {{if 'CUmemcpyAttributes_st.dstLocHint' in found_struct}}
-    dstLocHint : CUmemLocation
+    dstLocHint : :py:obj:`~.CUmemLocation`
         Hint location for the destination operand. Ignored when the
         pointers are not managed memory or memory allocated outside CUDA.
     {{endif}}
     {{if 'CUmemcpyAttributes_st.flags' in found_struct}}
     flags : unsigned int
-        Additional flags for copies with this attribute. See CUmemcpyFlags
+        Additional flags for copies with this attribute. See
+        :py:obj:`~.CUmemcpyFlags`
     {{endif}}
 
     Methods
@@ -10681,7 +10812,8 @@ cdef class CUoffset3D(CUoffset3D_v1):
 
 cdef class CUextent3D_v1(CUextent3D_st):
     """
-    Struct representing width/height/depth of a CUarray in elements
+    Struct representing width/height/depth of a :py:obj:`~.CUarray` in
+    elements
 
     Attributes
     ----------
@@ -10709,7 +10841,8 @@ cdef class CUextent3D_v1(CUextent3D_st):
 
 cdef class CUextent3D(CUextent3D_v1):
     """
-    Struct representing width/height/depth of a CUarray in elements
+    Struct representing width/height/depth of a :py:obj:`~.CUarray` in
+    elements
 
     Attributes
     ----------
@@ -10737,12 +10870,13 @@ cdef class CUextent3D(CUextent3D_v1):
 
 cdef class CUmemcpy3DOperand_v1(CUmemcpy3DOperand_st):
     """
-    Struct representing an operand for copy with cuMemcpy3DBatchAsync
+    Struct representing an operand for copy with
+    :py:obj:`~.cuMemcpy3DBatchAsync`
 
     Attributes
     ----------
     {{if 'CUmemcpy3DOperand_st.type' in found_struct}}
-    type : CUmemcpy3DOperandType
+    type : :py:obj:`~.CUmemcpy3DOperandType`
 
     {{endif}}
     {{if 'CUmemcpy3DOperand_st.op' in found_struct}}
@@ -10761,12 +10895,13 @@ cdef class CUmemcpy3DOperand_v1(CUmemcpy3DOperand_st):
 
 cdef class CUmemcpy3DOperand(CUmemcpy3DOperand_v1):
     """
-    Struct representing an operand for copy with cuMemcpy3DBatchAsync
+    Struct representing an operand for copy with
+    :py:obj:`~.cuMemcpy3DBatchAsync`
 
     Attributes
     ----------
     {{if 'CUmemcpy3DOperand_st.type' in found_struct}}
-    type : CUmemcpy3DOperandType
+    type : :py:obj:`~.CUmemcpy3DOperandType`
 
     {{endif}}
     {{if 'CUmemcpy3DOperand_st.op' in found_struct}}
@@ -10788,25 +10923,26 @@ cdef class CUDA_MEMCPY3D_BATCH_OP_v1(CUDA_MEMCPY3D_BATCH_OP_st):
     Attributes
     ----------
     {{if 'CUDA_MEMCPY3D_BATCH_OP_st.src' in found_struct}}
-    src : CUmemcpy3DOperand
+    src : :py:obj:`~.CUmemcpy3DOperand`
         Source memcpy operand.
     {{endif}}
     {{if 'CUDA_MEMCPY3D_BATCH_OP_st.dst' in found_struct}}
-    dst : CUmemcpy3DOperand
+    dst : :py:obj:`~.CUmemcpy3DOperand`
         Destination memcpy operand.
     {{endif}}
     {{if 'CUDA_MEMCPY3D_BATCH_OP_st.extent' in found_struct}}
-    extent : CUextent3D
+    extent : :py:obj:`~.CUextent3D`
         Extents of the memcpy between src and dst. The width, height and
         depth components must not be 0.
     {{endif}}
     {{if 'CUDA_MEMCPY3D_BATCH_OP_st.srcAccessOrder' in found_struct}}
-    srcAccessOrder : CUmemcpySrcAccessOrder
+    srcAccessOrder : :py:obj:`~.CUmemcpySrcAccessOrder`
         Source access ordering to be observed for copy from src to dst.
     {{endif}}
     {{if 'CUDA_MEMCPY3D_BATCH_OP_st.flags' in found_struct}}
     flags : unsigned int
-        Additional flags for copies with this attribute. See CUmemcpyFlags
+        Additional flags for copies with this attribute. See
+        :py:obj:`~.CUmemcpyFlags`
     {{endif}}
 
     Methods
@@ -10823,25 +10959,26 @@ cdef class CUDA_MEMCPY3D_BATCH_OP(CUDA_MEMCPY3D_BATCH_OP_v1):
     Attributes
     ----------
     {{if 'CUDA_MEMCPY3D_BATCH_OP_st.src' in found_struct}}
-    src : CUmemcpy3DOperand
+    src : :py:obj:`~.CUmemcpy3DOperand`
         Source memcpy operand.
     {{endif}}
     {{if 'CUDA_MEMCPY3D_BATCH_OP_st.dst' in found_struct}}
-    dst : CUmemcpy3DOperand
+    dst : :py:obj:`~.CUmemcpy3DOperand`
         Destination memcpy operand.
     {{endif}}
     {{if 'CUDA_MEMCPY3D_BATCH_OP_st.extent' in found_struct}}
-    extent : CUextent3D
+    extent : :py:obj:`~.CUextent3D`
         Extents of the memcpy between src and dst. The width, height and
         depth components must not be 0.
     {{endif}}
     {{if 'CUDA_MEMCPY3D_BATCH_OP_st.srcAccessOrder' in found_struct}}
-    srcAccessOrder : CUmemcpySrcAccessOrder
+    srcAccessOrder : :py:obj:`~.CUmemcpySrcAccessOrder`
         Source access ordering to be observed for copy from src to dst.
     {{endif}}
     {{if 'CUDA_MEMCPY3D_BATCH_OP_st.flags' in found_struct}}
     flags : unsigned int
-        Additional flags for copies with this attribute. See CUmemcpyFlags
+        Additional flags for copies with this attribute. See
+        :py:obj:`~.CUmemcpyFlags`
     {{endif}}
 
     Methods
@@ -10860,13 +10997,13 @@ cdef class CUDA_MEM_ALLOC_NODE_PARAMS_v1(CUDA_MEM_ALLOC_NODE_PARAMS_v1_st):
     Attributes
     ----------
     {{if 'CUDA_MEM_ALLOC_NODE_PARAMS_v1_st.poolProps' in found_struct}}
-    poolProps : CUmemPoolProps
+    poolProps : :py:obj:`~.CUmemPoolProps`
         in: location where the allocation should reside (specified in
-        ``location``). ``handleTypes`` must be CU_MEM_HANDLE_TYPE_NONE. IPC
-        is not supported.
+        ``location``). ``handleTypes`` must be
+        :py:obj:`~.CU_MEM_HANDLE_TYPE_NONE`. IPC is not supported.
     {{endif}}
     {{if 'CUDA_MEM_ALLOC_NODE_PARAMS_v1_st.accessDescs' in found_struct}}
-    accessDescs : CUmemAccessDesc
+    accessDescs : :py:obj:`~.CUmemAccessDesc`
         in: array of memory access descriptors. Used to describe peer GPU
         access
     {{endif}}
@@ -10880,7 +11017,7 @@ cdef class CUDA_MEM_ALLOC_NODE_PARAMS_v1(CUDA_MEM_ALLOC_NODE_PARAMS_v1_st):
         in: size in bytes of the requested allocation
     {{endif}}
     {{if 'CUDA_MEM_ALLOC_NODE_PARAMS_v1_st.dptr' in found_struct}}
-    dptr : CUdeviceptr
+    dptr : :py:obj:`~.CUdeviceptr`
         out: address of the allocation returned by CUDA
     {{endif}}
 
@@ -10900,13 +11037,13 @@ cdef class CUDA_MEM_ALLOC_NODE_PARAMS(CUDA_MEM_ALLOC_NODE_PARAMS_v1):
     Attributes
     ----------
     {{if 'CUDA_MEM_ALLOC_NODE_PARAMS_v1_st.poolProps' in found_struct}}
-    poolProps : CUmemPoolProps
+    poolProps : :py:obj:`~.CUmemPoolProps`
         in: location where the allocation should reside (specified in
-        ``location``). ``handleTypes`` must be CU_MEM_HANDLE_TYPE_NONE. IPC
-        is not supported.
+        ``location``). ``handleTypes`` must be
+        :py:obj:`~.CU_MEM_HANDLE_TYPE_NONE`. IPC is not supported.
     {{endif}}
     {{if 'CUDA_MEM_ALLOC_NODE_PARAMS_v1_st.accessDescs' in found_struct}}
-    accessDescs : CUmemAccessDesc
+    accessDescs : :py:obj:`~.CUmemAccessDesc`
         in: array of memory access descriptors. Used to describe peer GPU
         access
     {{endif}}
@@ -10920,7 +11057,7 @@ cdef class CUDA_MEM_ALLOC_NODE_PARAMS(CUDA_MEM_ALLOC_NODE_PARAMS_v1):
         in: size in bytes of the requested allocation
     {{endif}}
     {{if 'CUDA_MEM_ALLOC_NODE_PARAMS_v1_st.dptr' in found_struct}}
-    dptr : CUdeviceptr
+    dptr : :py:obj:`~.CUdeviceptr`
         out: address of the allocation returned by CUDA
     {{endif}}
 
@@ -10940,13 +11077,13 @@ cdef class CUDA_MEM_ALLOC_NODE_PARAMS_v2(CUDA_MEM_ALLOC_NODE_PARAMS_v2_st):
     Attributes
     ----------
     {{if 'CUDA_MEM_ALLOC_NODE_PARAMS_v2_st.poolProps' in found_struct}}
-    poolProps : CUmemPoolProps
+    poolProps : :py:obj:`~.CUmemPoolProps`
         in: location where the allocation should reside (specified in
-        ``location``). ``handleTypes`` must be CU_MEM_HANDLE_TYPE_NONE. IPC
-        is not supported.
+        ``location``). ``handleTypes`` must be
+        :py:obj:`~.CU_MEM_HANDLE_TYPE_NONE`. IPC is not supported.
     {{endif}}
     {{if 'CUDA_MEM_ALLOC_NODE_PARAMS_v2_st.accessDescs' in found_struct}}
-    accessDescs : CUmemAccessDesc
+    accessDescs : :py:obj:`~.CUmemAccessDesc`
         in: array of memory access descriptors. Used to describe peer GPU
         access
     {{endif}}
@@ -10960,7 +11097,7 @@ cdef class CUDA_MEM_ALLOC_NODE_PARAMS_v2(CUDA_MEM_ALLOC_NODE_PARAMS_v2_st):
         in: size in bytes of the requested allocation
     {{endif}}
     {{if 'CUDA_MEM_ALLOC_NODE_PARAMS_v2_st.dptr' in found_struct}}
-    dptr : CUdeviceptr
+    dptr : :py:obj:`~.CUdeviceptr`
         out: address of the allocation returned by CUDA
     {{endif}}
 
@@ -10980,7 +11117,7 @@ cdef class CUDA_MEM_FREE_NODE_PARAMS(CUDA_MEM_FREE_NODE_PARAMS_st):
     Attributes
     ----------
     {{if 'CUDA_MEM_FREE_NODE_PARAMS_st.dptr' in found_struct}}
-    dptr : CUdeviceptr
+    dptr : :py:obj:`~.CUdeviceptr`
         in: the pointer to free
     {{endif}}
 
@@ -11000,7 +11137,7 @@ cdef class CUDA_CHILD_GRAPH_NODE_PARAMS(CUDA_CHILD_GRAPH_NODE_PARAMS_st):
     Attributes
     ----------
     {{if 'CUDA_CHILD_GRAPH_NODE_PARAMS_st.graph' in found_struct}}
-    graph : CUgraph
+    graph : :py:obj:`~.CUgraph`
         The child graph to clone into the node for node creation, or a
         handle to the graph owned by the node for node query. The graph
         must not contain conditional nodes. Graphs containing memory
@@ -11008,7 +11145,7 @@ cdef class CUDA_CHILD_GRAPH_NODE_PARAMS(CUDA_CHILD_GRAPH_NODE_PARAMS_st):
         to the parent.
     {{endif}}
     {{if 'CUDA_CHILD_GRAPH_NODE_PARAMS_st.ownership' in found_struct}}
-    ownership : CUgraphChildGraphNodeOwnership
+    ownership : :py:obj:`~.CUgraphChildGraphNodeOwnership`
         The ownership relationship of the child graph node.
     {{endif}}
 
@@ -11028,7 +11165,7 @@ cdef class CUDA_EVENT_RECORD_NODE_PARAMS(CUDA_EVENT_RECORD_NODE_PARAMS_st):
     Attributes
     ----------
     {{if 'CUDA_EVENT_RECORD_NODE_PARAMS_st.event' in found_struct}}
-    event : CUevent
+    event : :py:obj:`~.CUevent`
         The event to record when the node executes
     {{endif}}
 
@@ -11048,7 +11185,7 @@ cdef class CUDA_EVENT_WAIT_NODE_PARAMS(CUDA_EVENT_WAIT_NODE_PARAMS_st):
     Attributes
     ----------
     {{if 'CUDA_EVENT_WAIT_NODE_PARAMS_st.event' in found_struct}}
-    event : CUevent
+    event : :py:obj:`~.CUevent`
         The event to wait on from the node
     {{endif}}
 
@@ -11063,12 +11200,12 @@ cdef class CUDA_EVENT_WAIT_NODE_PARAMS(CUDA_EVENT_WAIT_NODE_PARAMS_st):
 
 cdef class CUgraphNodeParams(CUgraphNodeParams_st):
     """
-    Graph node parameters. See cuGraphAddNode.
+    Graph node parameters. See :py:obj:`~.cuGraphAddNode`.
 
     Attributes
     ----------
     {{if 'CUgraphNodeParams_st.type' in found_struct}}
-    type : CUgraphNodeType
+    type : :py:obj:`~.CUgraphNodeType`
         Type of the node
     {{endif}}
     {{if 'CUgraphNodeParams_st.reserved0' in found_struct}}
@@ -11080,55 +11217,55 @@ cdef class CUgraphNodeParams(CUgraphNodeParams_st):
         Padding. Unused bytes must be zero.
     {{endif}}
     {{if 'CUgraphNodeParams_st.kernel' in found_struct}}
-    kernel : CUDA_KERNEL_NODE_PARAMS_v3
+    kernel : :py:obj:`~.CUDA_KERNEL_NODE_PARAMS_v3`
         Kernel node parameters.
     {{endif}}
     {{if 'CUgraphNodeParams_st.memcpy' in found_struct}}
-    memcpy : CUDA_MEMCPY_NODE_PARAMS
+    memcpy : :py:obj:`~.CUDA_MEMCPY_NODE_PARAMS`
         Memcpy node parameters.
     {{endif}}
     {{if 'CUgraphNodeParams_st.memset' in found_struct}}
-    memset : CUDA_MEMSET_NODE_PARAMS_v2
+    memset : :py:obj:`~.CUDA_MEMSET_NODE_PARAMS_v2`
         Memset node parameters.
     {{endif}}
     {{if 'CUgraphNodeParams_st.host' in found_struct}}
-    host : CUDA_HOST_NODE_PARAMS_v2
+    host : :py:obj:`~.CUDA_HOST_NODE_PARAMS_v2`
         Host node parameters.
     {{endif}}
     {{if 'CUgraphNodeParams_st.graph' in found_struct}}
-    graph : CUDA_CHILD_GRAPH_NODE_PARAMS
+    graph : :py:obj:`~.CUDA_CHILD_GRAPH_NODE_PARAMS`
         Child graph node parameters.
     {{endif}}
     {{if 'CUgraphNodeParams_st.eventWait' in found_struct}}
-    eventWait : CUDA_EVENT_WAIT_NODE_PARAMS
+    eventWait : :py:obj:`~.CUDA_EVENT_WAIT_NODE_PARAMS`
         Event wait node parameters.
     {{endif}}
     {{if 'CUgraphNodeParams_st.eventRecord' in found_struct}}
-    eventRecord : CUDA_EVENT_RECORD_NODE_PARAMS
+    eventRecord : :py:obj:`~.CUDA_EVENT_RECORD_NODE_PARAMS`
         Event record node parameters.
     {{endif}}
     {{if 'CUgraphNodeParams_st.extSemSignal' in found_struct}}
-    extSemSignal : CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v2
+    extSemSignal : :py:obj:`~.CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v2`
         External semaphore signal node parameters.
     {{endif}}
     {{if 'CUgraphNodeParams_st.extSemWait' in found_struct}}
-    extSemWait : CUDA_EXT_SEM_WAIT_NODE_PARAMS_v2
+    extSemWait : :py:obj:`~.CUDA_EXT_SEM_WAIT_NODE_PARAMS_v2`
         External semaphore wait node parameters.
     {{endif}}
     {{if 'CUgraphNodeParams_st.alloc' in found_struct}}
-    alloc : CUDA_MEM_ALLOC_NODE_PARAMS_v2
+    alloc : :py:obj:`~.CUDA_MEM_ALLOC_NODE_PARAMS_v2`
         Memory allocation node parameters.
     {{endif}}
     {{if 'CUgraphNodeParams_st.free' in found_struct}}
-    free : CUDA_MEM_FREE_NODE_PARAMS
+    free : :py:obj:`~.CUDA_MEM_FREE_NODE_PARAMS`
         Memory free node parameters.
     {{endif}}
     {{if 'CUgraphNodeParams_st.memOp' in found_struct}}
-    memOp : CUDA_BATCH_MEM_OP_NODE_PARAMS_v2
+    memOp : :py:obj:`~.CUDA_BATCH_MEM_OP_NODE_PARAMS_v2`
         MemOp node parameters.
     {{endif}}
     {{if 'CUgraphNodeParams_st.conditional' in found_struct}}
-    conditional : CUDA_CONDITIONAL_NODE_PARAMS
+    conditional : :py:obj:`~.CUDA_CONDITIONAL_NODE_PARAMS`
         Conditional node parameters.
     {{endif}}
     {{if 'CUgraphNodeParams_st.asBytes' in found_struct}}
@@ -11165,7 +11302,7 @@ cdef class CUcheckpointLockArgs(CUcheckpointLockArgs_st):
         Reserved for future use, must be zero
     {{endif}}
     {{if 'CUcheckpointLockArgs_st.reserved1' in found_struct}}
-    reserved1 : list[cuuint64_t]
+    reserved1 : list[:py:obj:`~.cuuint64_t`]
         Reserved for future use, must be zeroed
     {{endif}}
 
@@ -11185,7 +11322,7 @@ cdef class CUcheckpointCheckpointArgs(CUcheckpointCheckpointArgs_st):
     Attributes
     ----------
     {{if 'CUcheckpointCheckpointArgs_st.reserved' in found_struct}}
-    reserved : list[cuuint64_t]
+    reserved : list[:py:obj:`~.cuuint64_t`]
         Reserved for future use, must be zeroed
     {{endif}}
 
@@ -11205,11 +11342,11 @@ cdef class CUcheckpointGpuPair(CUcheckpointGpuPair_st):
     Attributes
     ----------
     {{if 'CUcheckpointGpuPair_st.oldUuid' in found_struct}}
-    oldUuid : CUuuid
+    oldUuid : :py:obj:`~.CUuuid`
         UUID of the GPU that was checkpointed
     {{endif}}
     {{if 'CUcheckpointGpuPair_st.newUuid' in found_struct}}
-    newUuid : CUuuid
+    newUuid : :py:obj:`~.CUuuid`
         UUID of the GPU to restore onto
     {{endif}}
 
@@ -11229,7 +11366,7 @@ cdef class CUcheckpointRestoreArgs(CUcheckpointRestoreArgs_st):
     Attributes
     ----------
     {{if 'CUcheckpointRestoreArgs_st.gpuPairs' in found_struct}}
-    gpuPairs : CUcheckpointGpuPair
+    gpuPairs : :py:obj:`~.CUcheckpointGpuPair`
         Pointer to array of gpu pairs that indicate how to remap GPUs
         during restore
     {{endif}}
@@ -11242,11 +11379,11 @@ cdef class CUcheckpointRestoreArgs(CUcheckpointRestoreArgs_st):
         Reserved for future use, must be zeroed
     {{endif}}
     {{if struct_field_types.get('CUcheckpointRestoreArgs_st.reserved') == 'cuuint64_t'}}
-    reserved : list[cuuint64_t]
+    reserved : list[:py:obj:`~.cuuint64_t`]
         Reserved for future use, must be zeroed
     {{endif}}
     {{if 'CUcheckpointRestoreArgs_st.reserved1' in found_struct}}
-    reserved1 : cuuint64_t
+    reserved1 : :py:obj:`~.cuuint64_t`
         Reserved for future use, must be zeroed
     {{endif}}
 
@@ -11266,7 +11403,7 @@ cdef class CUcheckpointUnlockArgs(CUcheckpointUnlockArgs_st):
     Attributes
     ----------
     {{if 'CUcheckpointUnlockArgs_st.reserved' in found_struct}}
-    reserved : list[cuuint64_t]
+    reserved : list[:py:obj:`~.cuuint64_t`]
         Reserved for future use, must be zeroed
     {{endif}}
 
@@ -11289,34 +11426,36 @@ cdef class CUmemDecompressParams(CUmemDecompressParams_st):
     {{if 'CUmemDecompressParams_st.srcNumBytes' in found_struct}}
     srcNumBytes : size_t
         The number of bytes to be read and decompressed from
-        CUmemDecompressParams_st.src.
+        :py:obj:`~.CUmemDecompressParams_st`.src.
     {{endif}}
     {{if 'CUmemDecompressParams_st.dstNumBytes' in found_struct}}
     dstNumBytes : size_t
         The number of bytes that the decompression operation will be
-        expected to write to CUmemDecompressParams_st.dst. This value is
-        optional; if present, it may be used by the CUDA driver as a
-        heuristic for scheduling the individual decompression operations.
+        expected to write to :py:obj:`~.CUmemDecompressParams_st`.dst. This
+        value is optional; if present, it may be used by the CUDA driver as
+        a heuristic for scheduling the individual decompression operations.
     {{endif}}
     {{if 'CUmemDecompressParams_st.dstActBytes' in found_struct}}
-    dstActBytes : cuuint32_t
+    dstActBytes : :py:obj:`~.cuuint32_t`
         After the decompression operation has completed, the actual number
-        of bytes written to CUmemDecompressParams.dst will be recorded as a
-        32-bit unsigned integer in the memory at this address.
+        of bytes written to :py:obj:`~.CUmemDecompressParams`.dst will be
+        recorded as a 32-bit unsigned integer in the memory at this
+        address.
     {{endif}}
     {{if 'CUmemDecompressParams_st.src' in found_struct}}
     src : Any
         Pointer to a buffer of at least
-        CUmemDecompressParams_st.srcNumBytes compressed bytes.
+        :py:obj:`~.CUmemDecompressParams_st`.srcNumBytes compressed bytes.
     {{endif}}
     {{if 'CUmemDecompressParams_st.dst' in found_struct}}
     dst : Any
         Pointer to a buffer where the decompressed data will be written.
         The number of bytes written to this location will be recorded in
-        the memory pointed to by CUmemDecompressParams_st.dstActBytes
+        the memory pointed to by
+        :py:obj:`~.CUmemDecompressParams_st`.dstActBytes
     {{endif}}
     {{if 'CUmemDecompressParams_st.algo' in found_struct}}
-    algo : CUmemDecompressAlgorithm
+    algo : :py:obj:`~.CUmemDecompressAlgorithm`
         The decompression algorithm to use.
     {{endif}}
     {{if 'CUmemDecompressParams_st.padding' in found_struct}}
@@ -11374,8 +11513,9 @@ cdef class CUlogicalEndpointProp(CUlogicalEndpointProp_struct):
     Attributes
     ----------
     {{if 'CUlogicalEndpointProp_struct.type' in found_struct}}
-    type : CUlogicalEndpointType
-        Type of the logical endpoint defined in CUlogicalEndpointType
+    type : :py:obj:`~.CUlogicalEndpointType`
+        Type of the logical endpoint defined in
+        :py:obj:`~.CUlogicalEndpointType`
     {{endif}}
     {{if 'CUlogicalEndpointProp_struct.unicast' in found_struct}}
     unicast : anon_struct25
@@ -11392,11 +11532,11 @@ cdef class CUlogicalEndpointProp(CUlogicalEndpointProp_struct):
     {{if 'CUlogicalEndpointProp_struct.ipcHandleTypes' in found_struct}}
     ipcHandleTypes : unsigned int
         A bitmask of IPC handle types defined in
-        CUlogicalEndpointIpcHandleType
+        :py:obj:`~.CUlogicalEndpointIpcHandleType`
     {{endif}}
     {{if 'CUlogicalEndpointProp_struct.flags' in found_struct}}
     flags : unsigned int
-        A bitmask of flags defined in CUlogicalEndpointFlag
+        A bitmask of flags defined in :py:obj:`~.CUlogicalEndpointFlag`
     {{endif}}
 
     Methods
@@ -11431,7 +11571,7 @@ cdef class CUdevSmResource(CUdevSmResource_st):
     {{if 'CUdevSmResource_st.flags' in found_struct}}
     flags : unsigned int
         The flags set on this SM resource. For possible values see
-        CUdevSmResourceGroup_flags.
+        :py:obj:`~.CUdevSmResourceGroup_flags`.
     {{endif}}
 
     Methods
@@ -11448,7 +11588,7 @@ cdef class CUdevWorkqueueConfigResource(CUdevWorkqueueConfigResource_st):
     Attributes
     ----------
     {{if 'CUdevWorkqueueConfigResource_st.device' in found_struct}}
-    device : CUdevice
+    device : :py:obj:`~.CUdevice`
         The device on which the workqueue resources are available
     {{endif}}
     {{if 'CUdevWorkqueueConfigResource_st.wqConcurrencyLimit' in found_struct}}
@@ -11456,7 +11596,7 @@ cdef class CUdevWorkqueueConfigResource(CUdevWorkqueueConfigResource_st):
         The expected maximum number of concurrent stream-ordered workloads
     {{endif}}
     {{if 'CUdevWorkqueueConfigResource_st.sharingScope' in found_struct}}
-    sharingScope : CUdevWorkqueueConfigScope
+    sharingScope : :py:obj:`~.CUdevWorkqueueConfigScope`
         The sharing scope for the workqueue resources
     {{endif}}
 
@@ -11508,7 +11648,7 @@ cdef class CU_DEV_SM_RESOURCE_GROUP_PARAMS(CU_DEV_SM_RESOURCE_GROUP_PARAMS_st):
     {{if 'CU_DEV_SM_RESOURCE_GROUP_PARAMS_st.flags' in found_struct}}
     flags : unsigned int
         The flags set on this SM resource group. For possible values see
-        CUdevSmResourceGroup_flags.
+        :py:obj:`~.CUdevSmResourceGroup_flags`.
     {{endif}}
     {{if 'CU_DEV_SM_RESOURCE_GROUP_PARAMS_st.reserved' in found_struct}}
     reserved : list[unsigned int]
@@ -11529,7 +11669,7 @@ cdef class CUdevResource_v1(CUdevResource_st):
     Attributes
     ----------
     {{if 'CUdevResource_st.type' in found_struct}}
-    type : CUdevResourceType
+    type : :py:obj:`~.CUdevResourceType`
         Type of resource, dictates which union field was last set
     {{endif}}
     {{if 'CUdevResource_st._internal_padding' in found_struct}}
@@ -11537,25 +11677,26 @@ cdef class CUdevResource_v1(CUdevResource_st):
 
     {{endif}}
     {{if 'CUdevResource_st.sm' in found_struct}}
-    sm : CUdevSmResource
-        Resource corresponding to CU_DEV_RESOURCE_TYPE_SM ``typename``.
+    sm : :py:obj:`~.CUdevSmResource`
+        Resource corresponding to :py:obj:`~.CU_DEV_RESOURCE_TYPE_SM`
+        ``typename``.
     {{endif}}
     {{if 'CUdevResource_st.wqConfig' in found_struct}}
-    wqConfig : CUdevWorkqueueConfigResource
-        Resource corresponding to CU_DEV_RESOURCE_TYPE_WORKQUEUE_CONFIG
-        ``typename``.
+    wqConfig : :py:obj:`~.CUdevWorkqueueConfigResource`
+        Resource corresponding to
+        :py:obj:`~.CU_DEV_RESOURCE_TYPE_WORKQUEUE_CONFIG` ``typename``.
     {{endif}}
     {{if 'CUdevResource_st.wq' in found_struct}}
-    wq : CUdevWorkqueueResource
-        Resource corresponding to CU_DEV_RESOURCE_TYPE_WORKQUEUE
-        ``typename``.
+    wq : :py:obj:`~.CUdevWorkqueueResource`
+        Resource corresponding to
+        :py:obj:`~.CU_DEV_RESOURCE_TYPE_WORKQUEUE` ``typename``.
     {{endif}}
     {{if 'CUdevResource_st._oversize' in found_struct}}
     _oversize : bytes
 
     {{endif}}
     {{if 'CUdevResource_st.nextResource' in found_struct}}
-    nextResource : CUdevResource_st
+    nextResource : :py:obj:`~.CUdevResource_st`
 
     {{endif}}
 
@@ -11573,7 +11714,7 @@ cdef class CUdevResource(CUdevResource_v1):
     Attributes
     ----------
     {{if 'CUdevResource_st.type' in found_struct}}
-    type : CUdevResourceType
+    type : :py:obj:`~.CUdevResourceType`
         Type of resource, dictates which union field was last set
     {{endif}}
     {{if 'CUdevResource_st._internal_padding' in found_struct}}
@@ -11581,25 +11722,26 @@ cdef class CUdevResource(CUdevResource_v1):
 
     {{endif}}
     {{if 'CUdevResource_st.sm' in found_struct}}
-    sm : CUdevSmResource
-        Resource corresponding to CU_DEV_RESOURCE_TYPE_SM ``typename``.
+    sm : :py:obj:`~.CUdevSmResource`
+        Resource corresponding to :py:obj:`~.CU_DEV_RESOURCE_TYPE_SM`
+        ``typename``.
     {{endif}}
     {{if 'CUdevResource_st.wqConfig' in found_struct}}
-    wqConfig : CUdevWorkqueueConfigResource
-        Resource corresponding to CU_DEV_RESOURCE_TYPE_WORKQUEUE_CONFIG
-        ``typename``.
+    wqConfig : :py:obj:`~.CUdevWorkqueueConfigResource`
+        Resource corresponding to
+        :py:obj:`~.CU_DEV_RESOURCE_TYPE_WORKQUEUE_CONFIG` ``typename``.
     {{endif}}
     {{if 'CUdevResource_st.wq' in found_struct}}
-    wq : CUdevWorkqueueResource
-        Resource corresponding to CU_DEV_RESOURCE_TYPE_WORKQUEUE
-        ``typename``.
+    wq : :py:obj:`~.CUdevWorkqueueResource`
+        Resource corresponding to
+        :py:obj:`~.CU_DEV_RESOURCE_TYPE_WORKQUEUE` ``typename``.
     {{endif}}
     {{if 'CUdevResource_st._oversize' in found_struct}}
     _oversize : bytes
 
     {{endif}}
     {{if 'CUdevResource_st.nextResource' in found_struct}}
-    nextResource : CUdevResource_st
+    nextResource : :py:obj:`~.CUdevResource_st`
 
     {{endif}}
 
@@ -11649,15 +11791,15 @@ cdef class CUeglFrame_v1(CUeglFrame_st):
         Number of channels for the plane
     {{endif}}
     {{if True}}
-    frameType : CUeglFrameType
+    frameType : :py:obj:`~.CUeglFrameType`
         Array or Pitch
     {{endif}}
     {{if True}}
-    eglColorFormat : CUeglColorFormat
+    eglColorFormat : :py:obj:`~.CUeglColorFormat`
         CUDA EGL Color Format
     {{endif}}
     {{if True}}
-    cuFormat : CUarray_format
+    cuFormat : :py:obj:`~.CUarray_format`
         CUDA Array Format
     {{endif}}
 
@@ -11707,15 +11849,15 @@ cdef class CUeglFrame(CUeglFrame_v1):
         Number of channels for the plane
     {{endif}}
     {{if True}}
-    frameType : CUeglFrameType
+    frameType : :py:obj:`~.CUeglFrameType`
         Array or Pitch
     {{endif}}
     {{if True}}
-    eglColorFormat : CUeglColorFormat
+    eglColorFormat : :py:obj:`~.CUeglColorFormat`
         CUDA EGL Color Format
     {{endif}}
     {{if True}}
-    cuFormat : CUarray_format
+    cuFormat : :py:obj:`~.CUarray_format`
         CUDA Array Format
     {{endif}}
 
@@ -11761,7 +11903,7 @@ cdef class cuuint64_t:
 cdef class CUdeviceptr_v2:
     """
 
-    CUDA device pointer CUdeviceptr is defined as an unsigned integer type whose size matches the size of a pointer on the target platform.
+    CUDA device pointer :py:obj:`~.CUdeviceptr` is defined as an unsigned integer type whose size matches the size of a pointer on the target platform.
 
     Methods
     -------
diff --git a/cuda_bindings/cuda/bindings/driver.pyx.in b/cuda_bindings/cuda/bindings/driver.pyx.in
index 78a1a88c06..c5be5e1af4 100644
--- a/cuda_bindings/cuda/bindings/driver.pyx.in
+++ b/cuda_bindings/cuda/bindings/driver.pyx.in
@@ -52,16 +52,16 @@ CU_IPC_HANDLE_SIZE = cydriver.CU_IPC_HANDLE_SIZE
 
 #: Legacy stream handle
 #:
-#: Stream handle that can be passed as a CUstream to use an implicit stream
-#: with legacy synchronization behavior.
+#: Stream handle that can be passed as a :py:obj:`~.CUstream` to use an
+#: implicit stream with legacy synchronization behavior.
 #:
 #: See details of the \link_sync_behavior
 CU_STREAM_LEGACY = cydriver.CU_STREAM_LEGACY
 
 #: Per-thread stream handle
 #:
-#: Stream handle that can be passed as a CUstream to use an implicit stream
-#: with per-thread synchronization behavior.
+#: Stream handle that can be passed as a :py:obj:`~.CUstream` to use an
+#: implicit stream with per-thread synchronization behavior.
 #:
 #: See details of the \link_sync_behavior
 CU_STREAM_PER_THREAD = cydriver.CU_STREAM_PER_THREAD
@@ -229,11 +229,11 @@ CUDA_COOPERATIVE_LAUNCH_MULTI_DEVICE_NO_POST_LAUNCH_SYNC = cydriver.CUDA_COOPERA
 
 #: If set, the CUDA array is a collection of layers, where each layer is
 #: either a 1D or a 2D array and the Depth member of
-#: CUDA_ARRAY3D_DESCRIPTOR specifies the number of layers, not the depth of
-#: a 3D array.
+#: :py:obj:`~.CUDA_ARRAY3D_DESCRIPTOR` specifies the number of layers, not
+#: the depth of a 3D array.
 CUDA_ARRAY3D_LAYERED = cydriver.CUDA_ARRAY3D_LAYERED
 
-#: Deprecated, use CUDA_ARRAY3D_LAYERED
+#: Deprecated, use :py:obj:`~.CUDA_ARRAY3D_LAYERED`
 CUDA_ARRAY3D_2DARRAY = cydriver.CUDA_ARRAY3D_2DARRAY
 
 #: This flag must be set in order to bind a surface reference to the CUDA
@@ -299,14 +299,14 @@ CU_TRSF_SEAMLESS_CUBEMAP = cydriver.CU_TRSF_SEAMLESS_CUBEMAP
 #: Launch with the required block dimension.
 CU_LAUNCH_KERNEL_REQUIRED_BLOCK_DIM = cydriver.CU_LAUNCH_KERNEL_REQUIRED_BLOCK_DIM
 
-#: C++ compile time constant for CU_LAUNCH_PARAM_END
+#: C++ compile time constant for :py:obj:`~.CU_LAUNCH_PARAM_END`
 CU_LAUNCH_PARAM_END_AS_INT = cydriver.CU_LAUNCH_PARAM_END_AS_INT
 
 #: End of array terminator for the ``extra`` parameter to
 #: :py:obj:`~.cuLaunchKernel`
 CU_LAUNCH_PARAM_END = cydriver.CU_LAUNCH_PARAM_END
 
-#: C++ compile time constant for CU_LAUNCH_PARAM_BUFFER_POINTER
+#: C++ compile time constant for :py:obj:`~.CU_LAUNCH_PARAM_BUFFER_POINTER`
 CU_LAUNCH_PARAM_BUFFER_POINTER_AS_INT = cydriver.CU_LAUNCH_PARAM_BUFFER_POINTER_AS_INT
 
 #: Indicator that the next value in the ``extra`` parameter to
@@ -318,7 +318,7 @@ CU_LAUNCH_PARAM_BUFFER_POINTER_AS_INT = cydriver.CU_LAUNCH_PARAM_BUFFER_POINTER_
 #: have no effect.
 CU_LAUNCH_PARAM_BUFFER_POINTER = cydriver.CU_LAUNCH_PARAM_BUFFER_POINTER
 
-#: C++ compile time constant for CU_LAUNCH_PARAM_BUFFER_SIZE
+#: C++ compile time constant for :py:obj:`~.CU_LAUNCH_PARAM_BUFFER_SIZE`
 CU_LAUNCH_PARAM_BUFFER_SIZE_AS_INT = cydriver.CU_LAUNCH_PARAM_BUFFER_SIZE_AS_INT
 
 #: Indicator that the next value in the ``extra`` parameter to
@@ -623,7 +623,8 @@ class CUevent_flags(_FastEnum):
 
     CU_EVENT_INTERPROCESS = (
         cydriver.CUevent_flags_enum.CU_EVENT_INTERPROCESS,
-        'Event is suitable for interprocess use. CU_EVENT_DISABLE_TIMING must be set\n'
+        'Event is suitable for interprocess use. :py:obj:`~.CU_EVENT_DISABLE_TIMING`\n'
+        'must be set\n'
     ){{endif}}
 
 {{endif}}
@@ -879,7 +880,7 @@ class CUstreamMemoryBarrier_flags(_FastEnum):
 class CUstreamAtomicReductionOpType(_FastEnum):
     """
     Atomic reduction operation types for
-    :py:obj:`~.CUstreamBatchMemOpParams.atomicReduction.reductionOp`
+    :py:obj:`~.CUstreamBatchMemOpParams`.``atomicReduction.reductionOp``
     """
     {{if 'CU_STREAM_ATOMIC_REDUCTION_OP_ADD' in found_values}}
 
@@ -906,7 +907,7 @@ class CUstreamAtomicReductionOpType(_FastEnum):
 class CUstreamAtomicReductionDataType(_FastEnum):
     """
     Atomic reduction data types for
-    :py:obj:`~.CUstreamBatchMemOpParams.atomicReduction.dataType`
+    :py:obj:`~.CUstreamBatchMemOpParams`.``atomicReduction.dataType``
     """
     {{if 'CU_STREAM_ATOMIC_REDUCTION_UNSIGNED_32' in found_values}}
     CU_STREAM_ATOMIC_REDUCTION_UNSIGNED_32 = cydriver.CUstreamAtomicReductionDataType_enum.CU_STREAM_ATOMIC_REDUCTION_UNSIGNED_32{{endif}}
@@ -1482,7 +1483,7 @@ class CUdevice_attribute(_FastEnum):
 
     CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK = (
         cydriver.CUdevice_attribute_enum.CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK,
-        'Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK\n'
+        'Deprecated, use :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK`\n'
     ){{endif}}
     {{if 'CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY' in found_values}}
 
@@ -1513,7 +1514,7 @@ class CUdevice_attribute(_FastEnum):
 
     CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK = (
         cydriver.CUdevice_attribute_enum.CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK,
-        'Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK\n'
+        'Deprecated, use :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK`\n'
     ){{endif}}
     {{if 'CU_DEVICE_ATTRIBUTE_CLOCK_RATE' in found_values}}
 
@@ -1532,7 +1533,7 @@ class CUdevice_attribute(_FastEnum):
     CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = (
         cydriver.CUdevice_attribute_enum.CU_DEVICE_ATTRIBUTE_GPU_OVERLAP,
         'Device can possibly copy memory and execute a kernel concurrently.\n'
-        'Deprecated. Use instead CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT.\n'
+        'Deprecated. Use instead :py:obj:`~.CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT`.\n'
     ){{endif}}
     {{if 'CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT' in found_values}}
 
@@ -1610,7 +1611,8 @@ class CUdevice_attribute(_FastEnum):
 
     CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH = (
         cydriver.CUdevice_attribute_enum.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH,
-        'Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH\n'
+        'Deprecated, use\n'
+        ':py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH`\n'
     ){{endif}}
     {{if 'CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT' in found_values}}
 
@@ -1622,7 +1624,8 @@ class CUdevice_attribute(_FastEnum):
 
     CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT = (
         cydriver.CUdevice_attribute_enum.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT,
-        'Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT\n'
+        'Deprecated, use\n'
+        ':py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT`\n'
     ){{endif}}
     {{if 'CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS' in found_values}}
 
@@ -1634,7 +1637,8 @@ class CUdevice_attribute(_FastEnum):
 
     CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES = (
         cydriver.CUdevice_attribute_enum.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES,
-        'Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS\n'
+        'Deprecated, use\n'
+        ':py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS`\n'
     ){{endif}}
     {{if 'CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT' in found_values}}
 
@@ -1730,13 +1734,13 @@ class CUdevice_attribute(_FastEnum):
 
     CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH = (
         cydriver.CUdevice_attribute_enum.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH,
-        'Maximum 2D texture width if CUDA_ARRAY3D_TEXTURE_GATHER is set\n'
+        'Maximum 2D texture width if :py:obj:`~.CUDA_ARRAY3D_TEXTURE_GATHER` is set\n'
     ){{endif}}
     {{if 'CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT' in found_values}}
 
     CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT = (
         cydriver.CUdevice_attribute_enum.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT,
-        'Maximum 2D texture height if CUDA_ARRAY3D_TEXTURE_GATHER is set\n'
+        'Maximum 2D texture height if :py:obj:`~.CUDA_ARRAY3D_TEXTURE_GATHER` is set\n'
     ){{endif}}
     {{if 'CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE' in found_values}}
 
@@ -1874,8 +1878,8 @@ class CUdevice_attribute(_FastEnum):
 
     CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH = (
         cydriver.CUdevice_attribute_enum.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH,
-        'Deprecated, do not use. Use cudaDeviceGetTexture1DLinearMaxWidth() or\n'
-        ':py:obj:`~.cuDeviceGetTexture1DLinearMaxWidth()` instead.\n'
+        'Deprecated, do not use. Use :func:`~.cudaDeviceGetTexture1DLinearMaxWidth`\n'
+        'or :py:obj:`~.cuDeviceGetTexture1DLinearMaxWidth()` instead.\n'
     ){{endif}}
     {{if 'CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH' in found_values}}
 
@@ -1991,7 +1995,7 @@ class CUdevice_attribute(_FastEnum):
     CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = (
         cydriver.CUdevice_attribute_enum.CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS,
         'Device supports coherently accessing pageable memory without calling\n'
-        'cudaHostRegister on it\n'
+        ':func:`~.cudaHostRegister` on it\n'
     ){{endif}}
     {{if 'CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS' in found_values}}
 
@@ -2061,13 +2065,14 @@ class CUdevice_attribute(_FastEnum):
         cydriver.CUdevice_attribute_enum.CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES,
         'The :py:obj:`~.CU_STREAM_WAIT_VALUE_FLUSH` flag and the\n'
         ':py:obj:`~.CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES` MemOp are supported on the\n'
-        'device. See Stream Memory Operations for additional details.\n'
+        'device. See :ref:`Stream Memory Operations\n'
+        '<cuda-bindings-driver-group__cuda__memop>` for additional details.\n'
     ){{endif}}
     {{if 'CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED' in found_values}}
 
     CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED = (
         cydriver.CUdevice_attribute_enum.CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED,
-        'Device supports host memory registration via :py:obj:`~.cudaHostRegister`.\n'
+        'Device supports host memory registration via :func:`~.cudaHostRegister`.\n'
     ){{endif}}
     {{if 'CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES' in found_values}}
 
@@ -2086,7 +2091,8 @@ class CUdevice_attribute(_FastEnum):
 
     CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED = (
         cydriver.CUdevice_attribute_enum.CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED,
-        'Deprecated, Use CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED\n'
+        'Deprecated, Use\n'
+        ':py:obj:`~.CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED`\n'
     ){{endif}}
     {{if 'CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED' in found_values}}
 
@@ -2168,8 +2174,8 @@ class CUdevice_attribute(_FastEnum):
     CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED = (
         cydriver.CUdevice_attribute_enum.CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED,
         'Device supports using the :py:obj:`~.cuMemHostRegister` flag\n'
-        ':py:obj:`~.CU_MEMHOSTERGISTER_READ_ONLY` to register memory that must be\n'
-        'mapped as read-only to the GPU\n'
+        '``CU_MEMHOSTERGISTER_READ_ONLY`` to register memory that must be mapped as\n'
+        'read-only to the GPU\n'
     ){{endif}}
     {{if 'CU_DEVICE_ATTRIBUTE_TIMELINE_SEMAPHORE_INTEROP_SUPPORTED' in found_values}}
 
@@ -2181,8 +2187,8 @@ class CUdevice_attribute(_FastEnum):
 
     CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED = (
         cydriver.CUdevice_attribute_enum.CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED,
-        'Device supports using the :py:obj:`~.cuMemAllocAsync` and\n'
-        ':py:obj:`~.cuMemPool` family of APIs\n'
+        'Device supports using the :py:obj:`~.cuMemAllocAsync` and ``cuMemPool``\n'
+        'family of APIs\n'
     ){{endif}}
     {{if 'CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED' in found_values}}
 
@@ -2359,7 +2365,7 @@ class CUdevice_attribute(_FastEnum):
     CU_DEVICE_ATTRIBUTE_HOST_NUMA_MEMORY_POOLS_SUPPORTED = (
         cydriver.CUdevice_attribute_enum.CU_DEVICE_ATTRIBUTE_HOST_NUMA_MEMORY_POOLS_SUPPORTED,
         'Device supports HOST_NUMA location with the :py:obj:`~.cuMemAllocAsync` and\n'
-        ':py:obj:`~.cuMemPool` family of APIs\n'
+        '``cuMemPool`` family of APIs\n'
     ){{endif}}
     {{if 'CU_DEVICE_ATTRIBUTE_HOST_NUMA_MULTINODE_IPC_SUPPORTED' in found_values}}
 
@@ -2373,7 +2379,7 @@ class CUdevice_attribute(_FastEnum):
     CU_DEVICE_ATTRIBUTE_HOST_MEMORY_POOLS_SUPPORTED = (
         cydriver.CUdevice_attribute_enum.CU_DEVICE_ATTRIBUTE_HOST_MEMORY_POOLS_SUPPORTED,
         'Device suports HOST location with the :py:obj:`~.cuMemAllocAsync` and\n'
-        ':py:obj:`~.cuMemPool` family of APIs\n'
+        '``cuMemPool`` family of APIs\n'
     ){{endif}}
     {{if 'CU_DEVICE_ATTRIBUTE_HOST_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED' in found_values}}
 
@@ -2509,7 +2515,7 @@ class CUpointer_attribute(_FastEnum):
     CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE = (
         cydriver.CUpointer_attribute_enum.CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE,
         '1 if this pointer maps to an allocation that is suitable for\n'
-        ':py:obj:`~.cudaIpcGetMemHandle`, 0 otherwise\n'
+        ':func:`~.cudaIpcGetMemHandle`, 0 otherwise\n'
     ){{endif}}
     {{if 'CU_POINTER_ATTRIBUTE_RANGE_START_ADDR' in found_values}}
 
@@ -2697,7 +2703,7 @@ class CUfunction_attribute(_FastEnum):
         'all be positive. The validity of the cluster dimensions is otherwise\n'
         'checked at launch time.\n'
         'If the value is set during compile time, it cannot be set at runtime.\n'
-        'Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED. See\n'
+        'Setting it at runtime will return :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`. See\n'
         ':py:obj:`~.cuFuncSetAttribute`, :py:obj:`~.cuKernelSetAttribute`\n'
     ){{endif}}
     {{if 'CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT' in found_values}}
@@ -2708,8 +2714,8 @@ class CUfunction_attribute(_FastEnum):
         'all be positive. The validity of the cluster dimensions is otherwise\n'
         'checked at launch time.\n'
         'If the value is set during compile time, it cannot be set at runtime.\n'
-        'Setting it at runtime should return CUDA_ERROR_NOT_PERMITTED. See\n'
-        ':py:obj:`~.cuFuncSetAttribute`, :py:obj:`~.cuKernelSetAttribute`\n'
+        'Setting it at runtime should return :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`.\n'
+        'See :py:obj:`~.cuFuncSetAttribute`, :py:obj:`~.cuKernelSetAttribute`\n'
     ){{endif}}
     {{if 'CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH' in found_values}}
 
@@ -2719,8 +2725,8 @@ class CUfunction_attribute(_FastEnum):
         'all be positive. The validity of the cluster dimensions is otherwise\n'
         'checked at launch time.\n'
         'If the value is set during compile time, it cannot be set at runtime.\n'
-        'Setting it at runtime should return CUDA_ERROR_NOT_PERMITTED. See\n'
-        ':py:obj:`~.cuFuncSetAttribute`, :py:obj:`~.cuKernelSetAttribute`\n'
+        'Setting it at runtime should return :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`.\n'
+        'See :py:obj:`~.cuFuncSetAttribute`, :py:obj:`~.cuKernelSetAttribute`\n'
     ){{endif}}
     {{if 'CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED' in found_values}}
 
@@ -2730,8 +2736,8 @@ class CUfunction_attribute(_FastEnum):
         'allowed, 0 is disallowed. A non-portable cluster size may only function on\n'
         'the specific SKUs the program is tested on. The launch might fail if the\n'
         'program is run on a different hardware platform.\n'
-        'CUDA API provides cudaOccupancyMaxActiveClusters to assist with checking\n'
-        'whether the desired size can be launched on the current device.\n'
+        'CUDA API provides :func:`~.cudaOccupancyMaxActiveClusters` to assist with\n'
+        'checking whether the desired size can be launched on the current device.\n'
         'Portable Cluster Size\n'
         'A portable cluster size is guaranteed to be functional on all compute\n'
         'capabilities higher than the target compute capability. The portable\n'
@@ -2746,7 +2752,8 @@ class CUfunction_attribute(_FastEnum):
     CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE = (
         cydriver.CUfunction_attribute_enum.CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE,
         'The block scheduling policy of a function. The value type is\n'
-        'CUclusterSchedulingPolicy / cudaClusterSchedulingPolicy. See\n'
+        ':py:obj:`~.CUclusterSchedulingPolicy` /\n'
+        ':py:obj:`~.cudaClusterSchedulingPolicy`. See\n'
         ':py:obj:`~.cuFuncSetAttribute`, :py:obj:`~.cuKernelSetAttribute`\n'
     ){{endif}}
     {{if 'CU_FUNC_ATTRIBUTE_DEVICE_NODE_UPDATE_SUPPORTED' in found_values}}
@@ -3998,7 +4005,7 @@ class CUgraphNodeType(_FastEnum):
         '                                        Handles must be created in advance\n'
         'of creating the node\n'
         '                                        using\n'
-        ':py:obj:`~.cuGraphConditionalHandleCreate`.\n'
+        ':func:`~.cuGraphConditionalHandleCreate`.\n'
         '                                        The following restrictions apply to\n'
         'graphs which contain conditional nodes:\n'
         '                                         The graph cannot be used in a\n'
@@ -4008,8 +4015,8 @@ class CUgraphNodeType(_FastEnum):
         '                                         The graph cannot be cloned.\n'
         '                                        To set the control value, supply a\n'
         'default value when creating the handle and/or\n'
-        '                                        call\n'
-        ':py:obj:`~.cudaGraphSetConditional` from device code.\n'
+        '                                        call ``cudaGraphSetConditional``\n'
+        'from device code.\n'
     ){{endif}}
     {{if 'CU_GRAPH_NODE_TYPE_RESERVED_16' in found_values}}
 
@@ -4408,7 +4415,7 @@ class CUlaunchAttributeID(_FastEnum):
         'can be passed to the various device-side update functions to update the\n'
         "node's kernel parameters from within another kernel. For more information\n"
         'on the types of device updates that can be made, as well as the relevant\n'
-        'limitations thereof, see :py:obj:`~.cudaGraphKernelNodeUpdatesApply`.\n'
+        'limitations thereof, see ``cudaGraphKernelNodeUpdatesApply``.\n'
         ' Nodes which are device-updatable have additional restrictions compared to\n'
         'regular kernel nodes. Firstly, device-updatable nodes cannot be removed\n'
         'from their graph via :py:obj:`~.cuGraphDestroyNode`. Additionally, once\n'
@@ -4933,7 +4940,7 @@ class CUresult(_FastEnum):
     CUDA_ERROR_UNSUPPORTED_DEVSIDE_SYNC = (
         cydriver.cudaError_enum.CUDA_ERROR_UNSUPPORTED_DEVSIDE_SYNC,
         'This indicates that the code to be compiled by the PTX JIT contains\n'
-        'unsupported call to cudaDeviceSynchronize.\n'
+        'unsupported call to :func:`~.cudaDeviceSynchronize`.\n'
     ){{endif}}
     {{if 'CUDA_ERROR_CONTAINED' in found_values}}
 
@@ -5350,7 +5357,7 @@ class CUresult(_FastEnum):
     CUDA_ERROR_STREAM_CAPTURE_IMPLICIT = (
         cydriver.cudaError_enum.CUDA_ERROR_STREAM_CAPTURE_IMPLICIT,
         'This error indicates a disallowed implicit dependency on a current capture\n'
-        'sequence from cudaStreamLegacy.\n'
+        'sequence from :py:obj:`~.cudaStreamLegacy`.\n'
     ){{endif}}
     {{if 'CUDA_ERROR_CAPTURED_EVENT' in found_values}}
 
@@ -6028,7 +6035,8 @@ class CUmemAllocationHandleType(_FastEnum):
 
     CU_MEM_HANDLE_TYPE_FABRIC = (
         cydriver.CUmemAllocationHandleType_enum.CU_MEM_HANDLE_TYPE_FABRIC,
-        'Allows a fabric handle to be used for exporting. (CUmemFabricHandle)\n'
+        'Allows a fabric handle to be used for exporting.\n'
+        '(:py:obj:`~.CUmemFabricHandle`)\n'
     ){{endif}}
     {{if 'CU_MEM_HANDLE_TYPE_MAX' in found_values}}
     CU_MEM_HANDLE_TYPE_MAX = cydriver.CUmemAllocationHandleType_enum.CU_MEM_HANDLE_TYPE_MAX{{endif}}
@@ -6106,7 +6114,7 @@ class CUmemLocationType(_FastEnum):
     CU_MEM_LOCATION_TYPE_INVISIBLE = (
         cydriver.CUmemLocationType_enum.CU_MEM_LOCATION_TYPE_INVISIBLE,
         'Location is not visible but device is accessible, id is always\n'
-        'CU_DEVICE_INVALID\n'
+        ':py:obj:`~.CU_DEVICE_INVALID`\n'
     ){{endif}}
     {{if 'CU_MEM_LOCATION_TYPE_MAX' in found_values}}
     CU_MEM_LOCATION_TYPE_MAX = cydriver.CUmemLocationType_enum.CU_MEM_LOCATION_TYPE_MAX{{endif}}
@@ -6335,11 +6343,11 @@ class CUmemPool_attribute(_FastEnum):
 
     CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES = (
         cydriver.CUmemPool_attribute_enum.CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES,
-        '(value type = int) Allow cuMemAllocAsync to use memory asynchronously freed\n'
-        'in another streams as long as a stream ordering dependency of the\n'
-        'allocating stream on the free action exists. Cuda events and null stream\n'
-        'interactions can create the required stream ordered dependencies. (default\n'
-        'enabled)\n'
+        '(value type = int) Allow :func:`~.cuMemAllocAsync` to use memory\n'
+        'asynchronously freed in other streams as long as a stream ordering\n'
+        'dependency of the allocating stream on the free action exists. Cuda events\n'
+        'and null stream interactions can create the required stream ordered\n'
+        'dependencies. (default enabled)\n'
     ){{endif}}
     {{if 'CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC' in found_values}}
 
@@ -6352,90 +6360,92 @@ class CUmemPool_attribute(_FastEnum):
 
     CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES = (
         cydriver.CUmemPool_attribute_enum.CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES,
-        '(value type = int) Allow cuMemAllocAsync to insert new stream dependencies\n'
-        'in order to establish the stream ordering required to reuse a piece of\n'
-        'memory released by cuMemFreeAsync (default enabled).\n'
+        '(value type = int) Allow :func:`~.cuMemAllocAsync` to insert new stream\n'
+        'dependencies in order to establish the stream ordering required to reuse a\n'
+        'piece of memory released by :func:`~.cuMemFreeAsync` (default enabled).\n'
     ){{endif}}
     {{if 'CU_MEMPOOL_ATTR_RELEASE_THRESHOLD' in found_values}}
 
     CU_MEMPOOL_ATTR_RELEASE_THRESHOLD = (
         cydriver.CUmemPool_attribute_enum.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
-        '(value type = cuuint64_t) Amount of reserved memory in bytes to hold onto\n'
-        'before trying to release memory back to the OS. When more than the release\n'
-        'threshold bytes of memory are held by the memory pool, the allocator will\n'
-        'try to release memory back to the OS on the next call to stream, event or\n'
-        'context synchronize. (default 0)\n'
+        '(value type = :py:obj:`~.cuuint64_t`) Amount of reserved memory in bytes to\n'
+        'hold onto before trying to release memory back to the OS. When more than\n'
+        'the release threshold bytes of memory are held by the memory pool, the\n'
+        'allocator will try to release memory back to the OS on the next call to\n'
+        'stream, event or context synchronize. (default 0)\n'
     ){{endif}}
     {{if 'CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT' in found_values}}
 
     CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT = (
         cydriver.CUmemPool_attribute_enum.CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT,
-        '(value type = cuuint64_t) Amount of backing memory currently allocated for\n'
-        'the mempool.\n'
+        '(value type = :py:obj:`~.cuuint64_t`) Amount of backing memory currently\n'
+        'allocated for the mempool.\n'
     ){{endif}}
     {{if 'CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH' in found_values}}
 
     CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH = (
         cydriver.CUmemPool_attribute_enum.CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH,
-        '(value type = cuuint64_t) High watermark of backing memory allocated for\n'
-        'the mempool since the last time it was reset. High watermark can only be\n'
-        'reset to zero.\n'
+        '(value type = :py:obj:`~.cuuint64_t`) High watermark of backing memory\n'
+        'allocated for the mempool since the last time it was reset. High watermark\n'
+        'can only be reset to zero.\n'
     ){{endif}}
     {{if 'CU_MEMPOOL_ATTR_USED_MEM_CURRENT' in found_values}}
 
     CU_MEMPOOL_ATTR_USED_MEM_CURRENT = (
         cydriver.CUmemPool_attribute_enum.CU_MEMPOOL_ATTR_USED_MEM_CURRENT,
-        '(value type = cuuint64_t) Amount of memory from the pool that is currently\n'
-        'in use by the application.\n'
+        '(value type = :py:obj:`~.cuuint64_t`) Amount of memory from the pool that\n'
+        'is currently in use by the application.\n'
     ){{endif}}
     {{if 'CU_MEMPOOL_ATTR_USED_MEM_HIGH' in found_values}}
 
     CU_MEMPOOL_ATTR_USED_MEM_HIGH = (
         cydriver.CUmemPool_attribute_enum.CU_MEMPOOL_ATTR_USED_MEM_HIGH,
-        '(value type = cuuint64_t) High watermark of the amount of memory from the\n'
-        'pool that was in use by the application since the last time it was reset.\n'
-        'High watermark can only be reset to zero.\n'
+        '(value type = :py:obj:`~.cuuint64_t`) High watermark of the amount of\n'
+        'memory from the pool that was in use by the application since the last time\n'
+        'it was reset. High watermark can only be reset to zero.\n'
     ){{endif}}
     {{if 'CU_MEMPOOL_ATTR_ALLOCATION_TYPE' in found_values}}
 
     CU_MEMPOOL_ATTR_ALLOCATION_TYPE = (
         cydriver.CUmemPool_attribute_enum.CU_MEMPOOL_ATTR_ALLOCATION_TYPE,
-        '(value type = CUmemAllocationType) The allocation type of the mempool\n'
+        '(value type = :py:obj:`~.CUmemAllocationType`) The allocation type of the\n'
+        'mempool\n'
     ){{endif}}
     {{if 'CU_MEMPOOL_ATTR_EXPORT_HANDLE_TYPES' in found_values}}
 
     CU_MEMPOOL_ATTR_EXPORT_HANDLE_TYPES = (
         cydriver.CUmemPool_attribute_enum.CU_MEMPOOL_ATTR_EXPORT_HANDLE_TYPES,
-        '(value type = CUmemAllocationHandleType) Available export handle types for\n'
-        'the mempool. For imported pools this value is always\n'
-        'CU_MEM_HANDLE_TYPE_NONE as an imported pool cannot be re-exported\n'
+        '(value type = :py:obj:`~.CUmemAllocationHandleType`) Available export\n'
+        'handle types for the mempool. For imported pools this value is always\n'
+        ':py:obj:`~.CU_MEM_HANDLE_TYPE_NONE` as an imported pool cannot be\n'
+        're-exported\n'
     ){{endif}}
     {{if 'CU_MEMPOOL_ATTR_LOCATION_ID' in found_values}}
 
     CU_MEMPOOL_ATTR_LOCATION_ID = (
         cydriver.CUmemPool_attribute_enum.CU_MEMPOOL_ATTR_LOCATION_ID,
         '(value type = int) The location id for the mempool. If the location type\n'
-        'for this pool is CU_MEM_LOCATION_TYPE_INVISIBLE then ID will be\n'
-        'CU_DEVICE_INVALID.\n'
+        'for this pool is :py:obj:`~.CU_MEM_LOCATION_TYPE_INVISIBLE` then ID will be\n'
+        ':py:obj:`~.CU_DEVICE_INVALID`.\n'
     ){{endif}}
     {{if 'CU_MEMPOOL_ATTR_LOCATION_TYPE' in found_values}}
 
     CU_MEMPOOL_ATTR_LOCATION_TYPE = (
         cydriver.CUmemPool_attribute_enum.CU_MEMPOOL_ATTR_LOCATION_TYPE,
-        '(value type = CUmemLocationType) The location type for the mempool. For\n'
-        'imported memory pools where the device is not directly visible to the\n'
-        'importing process or pools imported via fabric handles across nodes this\n'
-        'will be CU_MEM_LOCATION_TYPE_INVISIBLE.\n'
+        '(value type = :py:obj:`~.CUmemLocationType`) The location type for the\n'
+        'mempool. For imported memory pools where the device is not directly visible\n'
+        'to the importing process or pools imported via fabric handles across nodes\n'
+        'this will be :py:obj:`~.CU_MEM_LOCATION_TYPE_INVISIBLE`.\n'
     ){{endif}}
     {{if 'CU_MEMPOOL_ATTR_MAX_POOL_SIZE' in found_values}}
 
     CU_MEMPOOL_ATTR_MAX_POOL_SIZE = (
         cydriver.CUmemPool_attribute_enum.CU_MEMPOOL_ATTR_MAX_POOL_SIZE,
-        '(value type = cuuint64_t) Maximum size of the pool in bytes, this value may\n'
-        'be higher than what was initially passed to cuMemPoolCreate due to\n'
-        'alignment requirements. A value of 0 indicates no maximum size. For\n'
-        'CU_MEM_ALLOCATION_TYPE_MANAGED and IPC imported pools this value will be\n'
-        'system dependent.\n'
+        '(value type = :py:obj:`~.cuuint64_t`) Maximum size of the pool in bytes,\n'
+        'this value may be higher than what was initially passed to\n'
+        ':func:`~.cuMemPoolCreate` due to alignment requirements. A value of 0\n'
+        'indicates no maximum size. For :py:obj:`~.CU_MEM_ALLOCATION_TYPE_MANAGED`\n'
+        'and IPC imported pools this value will be system dependent.\n'
     ){{endif}}
     {{if 'CU_MEMPOOL_ATTR_HW_DECOMPRESS_ENABLED' in found_values}}
 
@@ -6529,7 +6539,7 @@ class CUmemcpy3DOperandType(_FastEnum):
 
     CU_MEMCPY_OPERAND_TYPE_ARRAY = (
         cydriver.CUmemcpy3DOperandType_enum.CU_MEMCPY_OPERAND_TYPE_ARRAY,
-        'Memcpy operand is a CUarray.\n'
+        'Memcpy operand is a :py:obj:`~.CUarray`.\n'
     ){{endif}}
     {{if 'CU_MEMCPY_OPERAND_TYPE_MAX' in found_values}}
     CU_MEMCPY_OPERAND_TYPE_MAX = cydriver.CUmemcpy3DOperandType_enum.CU_MEMCPY_OPERAND_TYPE_MAX{{endif}}
@@ -6545,30 +6555,30 @@ class CUgraphMem_attribute(_FastEnum):
 
     CU_GRAPH_MEM_ATTR_USED_MEM_CURRENT = (
         cydriver.CUgraphMem_attribute_enum.CU_GRAPH_MEM_ATTR_USED_MEM_CURRENT,
-        '(value type = cuuint64_t) Amount of memory, in bytes, currently associated\n'
-        'with graphs\n'
+        '(value type = :py:obj:`~.cuuint64_t`) Amount of memory, in bytes, currently\n'
+        'associated with graphs\n'
     ){{endif}}
     {{if 'CU_GRAPH_MEM_ATTR_USED_MEM_HIGH' in found_values}}
 
     CU_GRAPH_MEM_ATTR_USED_MEM_HIGH = (
         cydriver.CUgraphMem_attribute_enum.CU_GRAPH_MEM_ATTR_USED_MEM_HIGH,
-        '(value type = cuuint64_t) High watermark of memory, in bytes, associated\n'
-        'with graphs since the last time it was reset. High watermark can only be\n'
-        'reset to zero.\n'
+        '(value type = :py:obj:`~.cuuint64_t`) High watermark of memory, in bytes,\n'
+        'associated with graphs since the last time it was reset. High watermark can\n'
+        'only be reset to zero.\n'
     ){{endif}}
     {{if 'CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT' in found_values}}
 
     CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT = (
         cydriver.CUgraphMem_attribute_enum.CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT,
-        '(value type = cuuint64_t) Amount of memory, in bytes, currently allocated\n'
-        'for use by the CUDA graphs asynchronous allocator.\n'
+        '(value type = :py:obj:`~.cuuint64_t`) Amount of memory, in bytes, currently\n'
+        'allocated for use by the CUDA graphs asynchronous allocator.\n'
     ){{endif}}
     {{if 'CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH' in found_values}}
 
     CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH = (
         cydriver.CUgraphMem_attribute_enum.CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH,
-        '(value type = cuuint64_t) High watermark of memory, in bytes, currently\n'
-        'allocated for use by the CUDA graphs asynchronous allocator.\n'
+        '(value type = :py:obj:`~.cuuint64_t`) High watermark of memory, in bytes,\n'
+        'currently allocated for use by the CUDA graphs asynchronous allocator.\n'
     ){{endif}}
 
 {{endif}}
@@ -6602,8 +6612,8 @@ class CUgraphChildGraphNodeOwnership(_FastEnum):
         'The following restrictions apply to child graphs after they have been\n'
         'moved: Cannot be independently instantiated or destroyed; Cannot be added\n'
         'as a child graph of a separate parent graph; Cannot be used as an argument\n'
-        'to cuGraphExecUpdate; Cannot have additional memory allocation or free\n'
-        'nodes added.\n'
+        'to :py:obj:`~.cuGraphExecUpdate`; Cannot have additional memory allocation\n'
+        'or free nodes added.\n'
     ){{endif}}
 
 {{endif}}
@@ -6718,49 +6728,49 @@ class CUgraphDebugDot_flags(_FastEnum):
 
     CU_GRAPH_DEBUG_DOT_FLAGS_KERNEL_NODE_PARAMS = (
         cydriver.CUgraphDebugDot_flags_enum.CU_GRAPH_DEBUG_DOT_FLAGS_KERNEL_NODE_PARAMS,
-        'Adds CUDA_KERNEL_NODE_PARAMS values to output\n'
+        'Adds :py:obj:`~.CUDA_KERNEL_NODE_PARAMS` values to output\n'
     ){{endif}}
     {{if 'CU_GRAPH_DEBUG_DOT_FLAGS_MEMCPY_NODE_PARAMS' in found_values}}
 
     CU_GRAPH_DEBUG_DOT_FLAGS_MEMCPY_NODE_PARAMS = (
         cydriver.CUgraphDebugDot_flags_enum.CU_GRAPH_DEBUG_DOT_FLAGS_MEMCPY_NODE_PARAMS,
-        'Adds CUDA_MEMCPY3D values to output\n'
+        'Adds :py:obj:`~.CUDA_MEMCPY3D` values to output\n'
     ){{endif}}
     {{if 'CU_GRAPH_DEBUG_DOT_FLAGS_MEMSET_NODE_PARAMS' in found_values}}
 
     CU_GRAPH_DEBUG_DOT_FLAGS_MEMSET_NODE_PARAMS = (
         cydriver.CUgraphDebugDot_flags_enum.CU_GRAPH_DEBUG_DOT_FLAGS_MEMSET_NODE_PARAMS,
-        'Adds CUDA_MEMSET_NODE_PARAMS values to output\n'
+        'Adds :py:obj:`~.CUDA_MEMSET_NODE_PARAMS` values to output\n'
     ){{endif}}
     {{if 'CU_GRAPH_DEBUG_DOT_FLAGS_HOST_NODE_PARAMS' in found_values}}
 
     CU_GRAPH_DEBUG_DOT_FLAGS_HOST_NODE_PARAMS = (
         cydriver.CUgraphDebugDot_flags_enum.CU_GRAPH_DEBUG_DOT_FLAGS_HOST_NODE_PARAMS,
-        'Adds CUDA_HOST_NODE_PARAMS values to output\n'
+        'Adds :py:obj:`~.CUDA_HOST_NODE_PARAMS` values to output\n'
     ){{endif}}
     {{if 'CU_GRAPH_DEBUG_DOT_FLAGS_EVENT_NODE_PARAMS' in found_values}}
 
     CU_GRAPH_DEBUG_DOT_FLAGS_EVENT_NODE_PARAMS = (
         cydriver.CUgraphDebugDot_flags_enum.CU_GRAPH_DEBUG_DOT_FLAGS_EVENT_NODE_PARAMS,
-        'Adds CUevent handle from record and wait nodes to output\n'
+        'Adds :py:obj:`~.CUevent` handle from record and wait nodes to output\n'
     ){{endif}}
     {{if 'CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_SIGNAL_NODE_PARAMS' in found_values}}
 
     CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_SIGNAL_NODE_PARAMS = (
         cydriver.CUgraphDebugDot_flags_enum.CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_SIGNAL_NODE_PARAMS,
-        'Adds CUDA_EXT_SEM_SIGNAL_NODE_PARAMS values to output\n'
+        'Adds :py:obj:`~.CUDA_EXT_SEM_SIGNAL_NODE_PARAMS` values to output\n'
     ){{endif}}
     {{if 'CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_WAIT_NODE_PARAMS' in found_values}}
 
     CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_WAIT_NODE_PARAMS = (
         cydriver.CUgraphDebugDot_flags_enum.CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_WAIT_NODE_PARAMS,
-        'Adds CUDA_EXT_SEM_WAIT_NODE_PARAMS values to output\n'
+        'Adds :py:obj:`~.CUDA_EXT_SEM_WAIT_NODE_PARAMS` values to output\n'
     ){{endif}}
     {{if 'CU_GRAPH_DEBUG_DOT_FLAGS_KERNEL_NODE_ATTRIBUTES' in found_values}}
 
     CU_GRAPH_DEBUG_DOT_FLAGS_KERNEL_NODE_ATTRIBUTES = (
         cydriver.CUgraphDebugDot_flags_enum.CU_GRAPH_DEBUG_DOT_FLAGS_KERNEL_NODE_ATTRIBUTES,
-        'Adds CUkernelNodeAttrValue values to output\n'
+        'Adds :py:obj:`~.CUkernelNodeAttrValue` values to output\n'
     ){{endif}}
     {{if 'CU_GRAPH_DEBUG_DOT_FLAGS_HANDLES' in found_values}}
 
@@ -6854,7 +6864,8 @@ class CUgraphInstantiate_flags(_FastEnum):
         cydriver.CUgraphInstantiate_flags_enum.CUDA_GRAPH_INSTANTIATE_FLAG_DEVICE_LAUNCH,
         'Instantiate the graph to be launchable from the device. This flag can only\n'
         'be used on platforms which support unified addressing. This flag cannot be\n'
-        'used in conjunction with CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH.\n'
+        'used in conjunction with\n'
+        ':py:obj:`~.CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH`.\n'
     ){{endif}}
     {{if 'CUDA_GRAPH_INSTANTIATE_FLAG_USE_NODE_PRIORITY' in found_values}}
 
@@ -6881,7 +6892,8 @@ class CUdeviceNumaConfig(_FastEnum):
 
     CU_DEVICE_NUMA_CONFIG_NUMA_NODE = (
         cydriver.CUdeviceNumaConfig_enum.CU_DEVICE_NUMA_CONFIG_NUMA_NODE,
-        'The GPU is a NUMA node, CU_DEVICE_ATTRIBUTE_NUMA_ID contains its NUMA ID\n'
+        'The GPU is a NUMA node, :py:obj:`~.CU_DEVICE_ATTRIBUTE_NUMA_ID` contains\n'
+        'its NUMA ID\n'
     ){{endif}}
 
 {{endif}}
@@ -6943,7 +6955,8 @@ class CUmoduleLoadingMode(_FastEnum):
 
 class CUmemDecompressAlgorithm(_FastEnum):
     """
-    Bitmasks for CU_DEVICE_ATTRIBUTE_MEM_DECOMPRESS_ALGORITHM_MASK.
+    Bitmasks for
+    :py:obj:`~.CU_DEVICE_ATTRIBUTE_MEM_DECOMPRESS_ALGORITHM_MASK`.
     """
     {{if 'CU_MEM_DECOMPRESS_UNSUPPORTED' in found_values}}
 
@@ -8137,7 +8150,7 @@ cdef object _CUresult_SUCCESS = CUresult.CUDA_SUCCESS
 cdef class CUdeviceptr:
     """
 
-    CUDA device pointer CUdeviceptr is defined as an unsigned integer type whose size matches the size of a pointer on the target platform.
+    CUDA device pointer :py:obj:`~.CUdeviceptr` is defined as an unsigned integer type whose size matches the size of a pointer on the target platform.
 
     Methods
     -------
@@ -8456,7 +8469,7 @@ class CUkernelNodeAttrID(_FastEnum):
         'can be passed to the various device-side update functions to update the\n'
         "node's kernel parameters from within another kernel. For more information\n"
         'on the types of device updates that can be made, as well as the relevant\n'
-        'limitations thereof, see :py:obj:`~.cudaGraphKernelNodeUpdatesApply`.\n'
+        'limitations thereof, see ``cudaGraphKernelNodeUpdatesApply``.\n'
         ' Nodes which are device-updatable have additional restrictions compared to\n'
         'regular kernel nodes. Firstly, device-updatable nodes cannot be removed\n'
         'from their graph via :py:obj:`~.cuGraphDestroyNode`. Additionally, once\n'
@@ -8704,7 +8717,7 @@ class CUstreamAttrID(_FastEnum):
         'can be passed to the various device-side update functions to update the\n'
         "node's kernel parameters from within another kernel. For more information\n"
         'on the types of device updates that can be made, as well as the relevant\n'
-        'limitations thereof, see :py:obj:`~.cudaGraphKernelNodeUpdatesApply`.\n'
+        'limitations thereof, see ``cudaGraphKernelNodeUpdatesApply``.\n'
         ' Nodes which are device-updatable have additional restrictions compared to\n'
         'regular kernel nodes. Firstly, device-updatable nodes cannot be removed\n'
         'from their graph via :py:obj:`~.cuGraphDestroyNode`. Additionally, once\n'
@@ -9617,7 +9630,7 @@ cdef class CUasyncCallbackHandle:
 cdef class CUgreenCtx:
     """
 
-    A green context handle. This handle can be used safely from only one CPU thread at a time. Created via cuGreenCtxCreate
+    A green context handle. This handle can be used safely from only one CPU thread at a time. Created via :func:`~.cuGreenCtxCreate`
 
     Methods
     -------
@@ -9726,7 +9739,7 @@ cdef class CUcoredumpCallbackHandle:
 cdef class CUdevResourceDesc:
     """
 
-    An opaque descriptor handle. The descriptor encapsulates multiple created and configured resources. Created via cuDevResourceGenerateDesc
+    An opaque descriptor handle. The descriptor encapsulates multiple created and configured resources. Created via :func:`~.cuDevResourceGenerateDesc`
 
     Methods
     -------
@@ -10368,27 +10381,27 @@ cdef class CUstreamMemOpWaitValueParams_st:
     Attributes
     ----------
     {{if 'CUstreamBatchMemOpParams_union.waitValue.operation' in found_struct}}
-    operation : CUstreamBatchMemOpType
+    operation : :py:obj:`~.CUstreamBatchMemOpType`
 
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.waitValue.address' in found_struct}}
-    address : CUdeviceptr
+    address : :py:obj:`~.CUdeviceptr`
 
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.waitValue.value' in found_struct}}
-    value : cuuint32_t
+    value : :py:obj:`~.cuuint32_t`
 
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.waitValue.value64' in found_struct}}
-    value64 : cuuint64_t
+    value64 : :py:obj:`~.cuuint64_t`
 
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.waitValue.flags' in found_struct}}
     flags : unsigned int
-        See CUstreamWaitValue_flags.
+        See :py:obj:`~.CUstreamWaitValue_flags`.
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.waitValue.alias' in found_struct}}
-    alias : CUdeviceptr
+    alias : :py:obj:`~.CUdeviceptr`
         For driver internal use. Initial value is unimportant.
     {{endif}}
 
@@ -10556,27 +10569,27 @@ cdef class CUstreamMemOpWriteValueParams_st:
     Attributes
     ----------
     {{if 'CUstreamBatchMemOpParams_union.writeValue.operation' in found_struct}}
-    operation : CUstreamBatchMemOpType
+    operation : :py:obj:`~.CUstreamBatchMemOpType`
 
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.writeValue.address' in found_struct}}
-    address : CUdeviceptr
+    address : :py:obj:`~.CUdeviceptr`
 
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.writeValue.value' in found_struct}}
-    value : cuuint32_t
+    value : :py:obj:`~.cuuint32_t`
 
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.writeValue.value64' in found_struct}}
-    value64 : cuuint64_t
+    value64 : :py:obj:`~.cuuint64_t`
 
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.writeValue.flags' in found_struct}}
     flags : unsigned int
-        See CUstreamWriteValue_flags.
+        See :py:obj:`~.CUstreamWriteValue_flags`.
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.writeValue.alias' in found_struct}}
-    alias : CUdeviceptr
+    alias : :py:obj:`~.CUdeviceptr`
         For driver internal use. Initial value is unimportant.
     {{endif}}
 
@@ -10744,7 +10757,7 @@ cdef class CUstreamMemOpFlushRemoteWritesParams_st:
     Attributes
     ----------
     {{if 'CUstreamBatchMemOpParams_union.flushRemoteWrites.operation' in found_struct}}
-    operation : CUstreamBatchMemOpType
+    operation : :py:obj:`~.CUstreamBatchMemOpType`
 
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.flushRemoteWrites.flags' in found_struct}}
@@ -10808,12 +10821,12 @@ cdef class CUstreamMemOpMemoryBarrierParams_st:
     Attributes
     ----------
     {{if 'CUstreamBatchMemOpParams_union.memoryBarrier.operation' in found_struct}}
-    operation : CUstreamBatchMemOpType
+    operation : :py:obj:`~.CUstreamBatchMemOpType`
         < Only supported in the _v2 API
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.memoryBarrier.flags' in found_struct}}
     flags : unsigned int
-        See CUstreamMemoryBarrier_flags
+        See :py:obj:`~.CUstreamMemoryBarrier_flags`
     {{endif}}
 
     Methods
@@ -10872,7 +10885,7 @@ cdef class CUstreamMemOpAtomicReductionParams_st:
     Attributes
     ----------
     {{if 'CUstreamBatchMemOpParams_union.atomicReduction.operation' in found_struct}}
-    operation : CUstreamBatchMemOpType
+    operation : :py:obj:`~.CUstreamBatchMemOpType`
 
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.atomicReduction.flags' in found_struct}}
@@ -10880,23 +10893,23 @@ cdef class CUstreamMemOpAtomicReductionParams_st:
         Must be 0
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.atomicReduction.reductionOp' in found_struct}}
-    reductionOp : CUstreamAtomicReductionOpType
-        See CUstreamAtomicReductionOpType
+    reductionOp : :py:obj:`~.CUstreamAtomicReductionOpType`
+        See :py:obj:`~.CUstreamAtomicReductionOpType`
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.atomicReduction.dataType' in found_struct}}
-    dataType : CUstreamAtomicReductionDataType
-        See CUstreamAtomicReductionDataType
+    dataType : :py:obj:`~.CUstreamAtomicReductionDataType`
+        See :py:obj:`~.CUstreamAtomicReductionDataType`
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.atomicReduction.address' in found_struct}}
-    address : CUdeviceptr
+    address : :py:obj:`~.CUdeviceptr`
         The address the atomic operation will be operated on
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.atomicReduction.value' in found_struct}}
-    value : cuuint64_t
+    value : :py:obj:`~.cuuint64_t`
         The operand value the atomic operation will operate with
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.atomicReduction.alias' in found_struct}}
-    alias : CUdeviceptr
+    alias : :py:obj:`~.CUdeviceptr`
         For driver internal use. Initial value is unimportant.
     {{endif}}
 
@@ -11062,39 +11075,40 @@ cdef class CUstreamMemOpAtomicReductionParams_st:
 
 cdef class CUstreamBatchMemOpParams_union:
     """
-    Per-operation parameters for cuStreamBatchMemOp
+    Per-operation parameters for :py:obj:`~.cuStreamBatchMemOp`
 
     Attributes
     ----------
     {{if 'CUstreamBatchMemOpParams_union.operation' in found_struct}}
-    operation : CUstreamBatchMemOpType
+    operation : :py:obj:`~.CUstreamBatchMemOpType`
         Operation. This is the first field of all the union elemets and
         acts as a TAG to determine which union member is valid.
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.waitValue' in found_struct}}
-    waitValue : CUstreamMemOpWaitValueParams_st
-        Params for CU_STREAM_MEM_OP_WAIT_VALUE_32 and
-        CU_STREAM_MEM_OP_WAIT_VALUE_64 operations.
+    waitValue : :py:obj:`~.CUstreamMemOpWaitValueParams_st`
+        Params for :py:obj:`~.CU_STREAM_MEM_OP_WAIT_VALUE_32` and
+        :py:obj:`~.CU_STREAM_MEM_OP_WAIT_VALUE_64` operations.
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.writeValue' in found_struct}}
-    writeValue : CUstreamMemOpWriteValueParams_st
-        Params for CU_STREAM_MEM_OP_WRITE_VALUE_32 and
-        CU_STREAM_MEM_OP_WRITE_VALUE_64 operations.
+    writeValue : :py:obj:`~.CUstreamMemOpWriteValueParams_st`
+        Params for :py:obj:`~.CU_STREAM_MEM_OP_WRITE_VALUE_32` and
+        :py:obj:`~.CU_STREAM_MEM_OP_WRITE_VALUE_64` operations.
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.flushRemoteWrites' in found_struct}}
-    flushRemoteWrites : CUstreamMemOpFlushRemoteWritesParams_st
-        Params for CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES operations.
+    flushRemoteWrites : :py:obj:`~.CUstreamMemOpFlushRemoteWritesParams_st`
+        Params for :py:obj:`~.CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES`
+        operations.
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.memoryBarrier' in found_struct}}
-    memoryBarrier : CUstreamMemOpMemoryBarrierParams_st
-        Params for CU_STREAM_MEM_OP_BARRIER operations.
+    memoryBarrier : :py:obj:`~.CUstreamMemOpMemoryBarrierParams_st`
+        Params for :py:obj:`~.CU_STREAM_MEM_OP_BARRIER` operations.
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.atomicReduction' in found_struct}}
-    atomicReduction : CUstreamMemOpAtomicReductionParams_st
+    atomicReduction : :py:obj:`~.CUstreamMemOpAtomicReductionParams_st`
 
     {{endif}}
     {{if 'CUstreamBatchMemOpParams_union.pad' in found_struct}}
-    pad : list[cuuint64_t]
+    pad : list[:py:obj:`~.cuuint64_t`]
 
     {{endif}}
 
@@ -11240,12 +11254,13 @@ cdef class CUstreamBatchMemOpParams_union:
 cdef class CUDA_BATCH_MEM_OP_NODE_PARAMS_v1_st:
     """
     Batch memory operation node parameters  Used in the legacy
-    cuGraphAddBatchMemOpNode api. New code should use cuGraphAddNode()
+    :func:`~.cuGraphAddBatchMemOpNode` api. New code should use
+    :func:`~.cuGraphAddNode`
 
     Attributes
     ----------
     {{if 'CUDA_BATCH_MEM_OP_NODE_PARAMS_v1_st.ctx' in found_struct}}
-    ctx : CUcontext
+    ctx : :py:obj:`~.CUcontext`
 
     {{endif}}
     {{if 'CUDA_BATCH_MEM_OP_NODE_PARAMS_v1_st.count' in found_struct}}
@@ -11253,7 +11268,7 @@ cdef class CUDA_BATCH_MEM_OP_NODE_PARAMS_v1_st:
 
     {{endif}}
     {{if 'CUDA_BATCH_MEM_OP_NODE_PARAMS_v1_st.paramArray' in found_struct}}
-    paramArray : CUstreamBatchMemOpParams
+    paramArray : :py:obj:`~.CUstreamBatchMemOpParams`
 
     {{endif}}
     {{if 'CUDA_BATCH_MEM_OP_NODE_PARAMS_v1_st.flags' in found_struct}}
@@ -11382,7 +11397,7 @@ cdef class CUDA_BATCH_MEM_OP_NODE_PARAMS_v2_st:
     Attributes
     ----------
     {{if 'CUDA_BATCH_MEM_OP_NODE_PARAMS_v2_st.ctx' in found_struct}}
-    ctx : CUcontext
+    ctx : :py:obj:`~.CUcontext`
         Context to use for the operations.
     {{endif}}
     {{if 'CUDA_BATCH_MEM_OP_NODE_PARAMS_v2_st.count' in found_struct}}
@@ -11390,7 +11405,7 @@ cdef class CUDA_BATCH_MEM_OP_NODE_PARAMS_v2_st:
         Number of operations in paramArray.
     {{endif}}
     {{if 'CUDA_BATCH_MEM_OP_NODE_PARAMS_v2_st.paramArray' in found_struct}}
-    paramArray : CUstreamBatchMemOpParams
+    paramArray : :py:obj:`~.CUstreamBatchMemOpParams`
         Array of batch memory operations.
     {{endif}}
     {{if 'CUDA_BATCH_MEM_OP_NODE_PARAMS_v2_st.flags' in found_struct}}
@@ -11614,7 +11629,7 @@ cdef class CUasyncNotificationInfo_st:
     Attributes
     ----------
     {{if 'CUasyncNotificationInfo_st.type' in found_struct}}
-    type : CUasyncNotificationType
+    type : :py:obj:`~.CUasyncNotificationType`
         The type of notification being sent
     {{endif}}
     {{if 'CUasyncNotificationInfo_st.info' in found_struct}}
@@ -11898,13 +11913,13 @@ cdef class CUaccessPolicyWindow_st:
     Specifies an access policy for a window, a contiguous extent of
     memory beginning at base_ptr and ending at base_ptr + num_bytes.
     num_bytes is limited by
-    CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE. Partition into
-    many segments and assign segments such that: sum of "hit segments"
-    / window == approx. ratio. sum of "miss segments" / window ==
-    approx 1-ratio. Segments and ratio specifications are fitted to the
-    capabilities of the architecture. Accesses in a hit segment apply
-    the hitProp access policy. Accesses in a miss segment apply the
-    missProp access policy.
+    :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE`.
+    Partition into many segments and assign segments such that: sum of
+    "hit segments" / window == approx. ratio. sum of "miss segments" /
+    window == approx 1-ratio. Segments and ratio specifications are
+    fitted to the capabilities of the architecture. Accesses in a hit
+    segment apply the hitProp access policy. Accesses in a miss segment
+    apply the missProp access policy.
 
     Attributes
     ----------
@@ -11924,12 +11939,13 @@ cdef class CUaccessPolicyWindow_st:
         assigned missProp.
     {{endif}}
     {{if 'CUaccessPolicyWindow_st.hitProp' in found_struct}}
-    hitProp : CUaccessProperty
-        CUaccessProperty set for hit.
+    hitProp : :py:obj:`~.CUaccessProperty`
+        :py:obj:`~.CUaccessProperty` set for hit.
     {{endif}}
     {{if 'CUaccessPolicyWindow_st.missProp' in found_struct}}
-    missProp : CUaccessProperty
-        CUaccessProperty set for miss. Must be either NORMAL or STREAMING
+    missProp : :py:obj:`~.CUaccessProperty`
+        :py:obj:`~.CUaccessProperty` set for miss. Must be either NORMAL or
+        STREAMING
     {{endif}}
 
     Methods
@@ -12035,7 +12051,7 @@ cdef class CUDA_KERNEL_NODE_PARAMS_st:
     Attributes
     ----------
     {{if 'CUDA_KERNEL_NODE_PARAMS_st.func' in found_struct}}
-    func : CUfunction
+    func : :py:obj:`~.CUfunction`
         Kernel to launch
     {{endif}}
     {{if 'CUDA_KERNEL_NODE_PARAMS_st.gridDimX' in found_struct}}
@@ -12260,7 +12276,7 @@ cdef class CUDA_KERNEL_NODE_PARAMS_v2_st:
     Attributes
     ----------
     {{if 'CUDA_KERNEL_NODE_PARAMS_v2_st.func' in found_struct}}
-    func : CUfunction
+    func : :py:obj:`~.CUfunction`
         Kernel to launch
     {{endif}}
     {{if 'CUDA_KERNEL_NODE_PARAMS_v2_st.gridDimX' in found_struct}}
@@ -12300,11 +12316,11 @@ cdef class CUDA_KERNEL_NODE_PARAMS_v2_st:
         Extra options
     {{endif}}
     {{if 'CUDA_KERNEL_NODE_PARAMS_v2_st.kern' in found_struct}}
-    kern : CUkernel
+    kern : :py:obj:`~.CUkernel`
         Kernel to launch, will only be referenced if func is NULL
     {{endif}}
     {{if 'CUDA_KERNEL_NODE_PARAMS_v2_st.ctx' in found_struct}}
-    ctx : CUcontext
+    ctx : :py:obj:`~.CUcontext`
         Context for the kernel task to run in. The value NULL will indicate
         the current context should be used by the api. This field is
         ignored if func is set.
@@ -12547,7 +12563,7 @@ cdef class CUDA_KERNEL_NODE_PARAMS_v3_st:
     Attributes
     ----------
     {{if 'CUDA_KERNEL_NODE_PARAMS_v3_st.func' in found_struct}}
-    func : CUfunction
+    func : :py:obj:`~.CUfunction`
         Kernel to launch
     {{endif}}
     {{if 'CUDA_KERNEL_NODE_PARAMS_v3_st.gridDimX' in found_struct}}
@@ -12587,11 +12603,11 @@ cdef class CUDA_KERNEL_NODE_PARAMS_v3_st:
         Extra options
     {{endif}}
     {{if 'CUDA_KERNEL_NODE_PARAMS_v3_st.kern' in found_struct}}
-    kern : CUkernel
+    kern : :py:obj:`~.CUkernel`
         Kernel to launch, will only be referenced if func is NULL
     {{endif}}
     {{if 'CUDA_KERNEL_NODE_PARAMS_v3_st.ctx' in found_struct}}
-    ctx : CUcontext
+    ctx : :py:obj:`~.CUcontext`
         Context for the kernel task to run in. The value NULL will indicate
         the current context should be used by the api. This field is
         ignored if func is set.
@@ -12834,7 +12850,7 @@ cdef class CUDA_MEMSET_NODE_PARAMS_st:
     Attributes
     ----------
     {{if 'CUDA_MEMSET_NODE_PARAMS_st.dst' in found_struct}}
-    dst : CUdeviceptr
+    dst : :py:obj:`~.CUdeviceptr`
         Destination device pointer
     {{endif}}
     {{if 'CUDA_MEMSET_NODE_PARAMS_st.pitch' in found_struct}}
@@ -12987,7 +13003,7 @@ cdef class CUDA_MEMSET_NODE_PARAMS_v2_st:
     Attributes
     ----------
     {{if 'CUDA_MEMSET_NODE_PARAMS_v2_st.dst' in found_struct}}
-    dst : CUdeviceptr
+    dst : :py:obj:`~.CUdeviceptr`
         Destination device pointer
     {{endif}}
     {{if 'CUDA_MEMSET_NODE_PARAMS_v2_st.pitch' in found_struct}}
@@ -13011,7 +13027,7 @@ cdef class CUDA_MEMSET_NODE_PARAMS_v2_st:
         Number of rows
     {{endif}}
     {{if 'CUDA_MEMSET_NODE_PARAMS_v2_st.ctx' in found_struct}}
-    ctx : CUcontext
+    ctx : :py:obj:`~.CUcontext`
         Context on which to run the node
     {{endif}}
 
@@ -13170,7 +13186,7 @@ cdef class CUDA_HOST_NODE_PARAMS_st:
     Attributes
     ----------
     {{if 'CUDA_HOST_NODE_PARAMS_st.fn' in found_struct}}
-    fn : CUhostFn
+    fn : :py:obj:`~.CUhostFn`
         The function to call when the node executes
     {{endif}}
     {{if 'CUDA_HOST_NODE_PARAMS_st.userData' in found_struct}}
@@ -13251,7 +13267,7 @@ cdef class CUDA_HOST_NODE_PARAMS_v2_st:
     Attributes
     ----------
     {{if 'CUDA_HOST_NODE_PARAMS_v2_st.fn' in found_struct}}
-    fn : CUhostFn
+    fn : :py:obj:`~.CUhostFn`
         The function to call when the node executes
     {{endif}}
     {{if 'CUDA_HOST_NODE_PARAMS_v2_st.userData' in found_struct}}
@@ -13350,22 +13366,23 @@ cdef class CUDA_CONDITIONAL_NODE_PARAMS:
     Attributes
     ----------
     {{if 'CUDA_CONDITIONAL_NODE_PARAMS.handle' in found_struct}}
-    handle : CUgraphConditionalHandle
+    handle : :py:obj:`~.CUgraphConditionalHandle`
         Conditional node handle. Handles must be created in advance of
-        creating the node using cuGraphConditionalHandleCreate.
+        creating the node using :func:`~.cuGraphConditionalHandleCreate`.
     {{endif}}
     {{if 'CUDA_CONDITIONAL_NODE_PARAMS.type' in found_struct}}
-    type : CUgraphConditionalNodeType
+    type : :py:obj:`~.CUgraphConditionalNodeType`
         Type of conditional node.
     {{endif}}
     {{if 'CUDA_CONDITIONAL_NODE_PARAMS.size' in found_struct}}
     size : unsigned int
         Size of graph output array. Allowed values are 1 for
-        CU_GRAPH_COND_TYPE_WHILE, 1 or 2 for CU_GRAPH_COND_TYPE_IF, or any
-        value greater than zero for CU_GRAPH_COND_TYPE_SWITCH.
+        :py:obj:`~.CU_GRAPH_COND_TYPE_WHILE`, 1 or 2 for
+        :py:obj:`~.CU_GRAPH_COND_TYPE_IF`, or any value greater than zero
+        for :py:obj:`~.CU_GRAPH_COND_TYPE_SWITCH`.
     {{endif}}
     {{if 'CUDA_CONDITIONAL_NODE_PARAMS.phGraph_out' in found_struct}}
-    phGraph_out : CUgraph
+    phGraph_out : :py:obj:`~.CUgraph`
         CUDA-owned array populated with conditional node child graphs
         during creation of the node. Valid for the lifetime of the
         conditional node. The contents of the graph(s) are subject to the
@@ -13375,16 +13392,18 @@ cdef class CUDA_CONDITIONAL_NODE_PARAMS:
         - All kernels, including kernels in nested conditionals or child
         graphs at any level, must belong to the same CUDA context.
         These graphs may be populated using graph node creation APIs or
-        cuStreamBeginCaptureToGraph.  CU_GRAPH_COND_TYPE_IF: phGraph_out[0]
-        is executed when the condition is non-zero. If ``size`` == 2,
-        phGraph_out[1] will be executed when the condition is zero.
-        CU_GRAPH_COND_TYPE_WHILE: phGraph_out[0] is executed as long as the
-        condition is non-zero. CU_GRAPH_COND_TYPE_SWITCH: phGraph_out[n] is
-        executed when the condition is equal to n. If the condition >=
-        ``size``, no body graph is executed.
+        :func:`~.cuStreamBeginCaptureToGraph`.
+        :py:obj:`~.CU_GRAPH_COND_TYPE_IF`: phGraph_out[0] is executed when
+        the condition is non-zero. If ``size`` == 2, phGraph_out[1] will be
+        executed when the condition is zero.
+        :py:obj:`~.CU_GRAPH_COND_TYPE_WHILE`: phGraph_out[0] is executed as
+        long as the condition is non-zero.
+        :py:obj:`~.CU_GRAPH_COND_TYPE_SWITCH`: phGraph_out[n] is executed
+        when the condition is equal to n. If the condition >= ``size``, no
+        body graph is executed.
     {{endif}}
     {{if 'CUDA_CONDITIONAL_NODE_PARAMS.ctx' in found_struct}}
-    ctx : CUcontext
+    ctx : :py:obj:`~.CUcontext`
         Context on which to run the node. Must match context used to create
         the handle and all body nodes.
     {{endif}}
@@ -13511,8 +13530,8 @@ cdef class CUgraphEdgeData_st:
     """
     Optional annotation for edges in a CUDA graph. Note, all edges
     implicitly have annotations and default to a zero-initialized value
-    if not specified. A zero-initialized struct indicates a standard
-    full serialization of two nodes with memory visibility.
+    if not specified. A zero-initialized struct indicates a standard
+    full serialization of two nodes with memory visibility.
 
     Attributes
     ----------
@@ -13524,9 +13543,9 @@ cdef class CUgraphEdgeData_st:
         memory visibility to the downstream node or portion thereof
         (indicated by ``to_port``).   Only kernel nodes define non-zero
         ports. A kernel node can use the following output port types:
-        CU_GRAPH_KERNEL_NODE_PORT_DEFAULT,
-        CU_GRAPH_KERNEL_NODE_PORT_PROGRAMMATIC, or
-        CU_GRAPH_KERNEL_NODE_PORT_LAUNCH_ORDER.
+        :py:obj:`~.CU_GRAPH_KERNEL_NODE_PORT_DEFAULT`,
+        :py:obj:`~.CU_GRAPH_KERNEL_NODE_PORT_PROGRAMMATIC`, or
+        :py:obj:`~.CU_GRAPH_KERNEL_NODE_PORT_LAUNCH_ORDER`.
     {{endif}}
     {{if 'CUgraphEdgeData_st.to_port' in found_struct}}
     to_port : bytes
@@ -13539,9 +13558,10 @@ cdef class CUgraphEdgeData_st:
     {{endif}}
     {{if 'CUgraphEdgeData_st.type' in found_struct}}
     type : bytes
-        This should be populated with a value from CUgraphDependencyType.
-        (It is typed as char due to compiler-specific layout of bitfields.)
-        See CUgraphDependencyType.
+        This should be populated with a value from
+        :py:obj:`~.CUgraphDependencyType`. (It is typed as char due to
+        compiler-specific layout of bitfields.) See
+        :py:obj:`~.CUgraphDependencyType`.
     {{endif}}
     {{if 'CUgraphEdgeData_st.reserved' in found_struct}}
     reserved : bytes
@@ -13640,19 +13660,19 @@ cdef class CUDA_GRAPH_INSTANTIATE_PARAMS_st:
     Attributes
     ----------
     {{if 'CUDA_GRAPH_INSTANTIATE_PARAMS_st.flags' in found_struct}}
-    flags : cuuint64_t
+    flags : :py:obj:`~.cuuint64_t`
         Instantiation flags
     {{endif}}
     {{if 'CUDA_GRAPH_INSTANTIATE_PARAMS_st.hUploadStream' in found_struct}}
-    hUploadStream : CUstream
+    hUploadStream : :py:obj:`~.CUstream`
         Upload stream
     {{endif}}
     {{if 'CUDA_GRAPH_INSTANTIATE_PARAMS_st.hErrNode_out' in found_struct}}
-    hErrNode_out : CUgraphNode
+    hErrNode_out : :py:obj:`~.CUgraphNode`
         The node which caused instantiation to fail, if any
     {{endif}}
     {{if 'CUDA_GRAPH_INSTANTIATE_PARAMS_st.result_out' in found_struct}}
-    result_out : CUgraphInstantiateResult
+    result_out : :py:obj:`~.CUgraphInstantiateResult`
         Whether instantiation was successful. If it failed, the reason why
     {{endif}}
 
@@ -13776,13 +13796,16 @@ cdef class CUDA_GRAPH_INSTANTIATE_PARAMS_st:
 
 cdef class CUlaunchMemSyncDomainMap_st:
     """
-    Memory Synchronization Domain map  See ``cudaLaunchMemSyncDomain``.
-    By default, kernels are launched in domain 0. Kernel launched with
-    CU_LAUNCH_MEM_SYNC_DOMAIN_REMOTE will have a different domain ID.
-    User may also alter the domain ID with CUlaunchMemSyncDomainMap for
-    a specific stream / graph node / kernel launch. See
-    CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP.  Domain ID range is
-    available through CU_DEVICE_ATTRIBUTE_MEM_SYNC_DOMAIN_COUNT.
+    Memory Synchronization Domain map  See
+    :py:obj:`~.cudaLaunchMemSyncDomain`.  By default, kernels are
+    launched in domain 0. Kernel launched with
+    :py:obj:`~.CU_LAUNCH_MEM_SYNC_DOMAIN_REMOTE` will have a different
+    domain ID. User may also alter the domain ID with
+    :py:obj:`~.CUlaunchMemSyncDomainMap` for a specific stream / graph
+    node / kernel launch. See
+    :py:obj:`~.CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP`.  Domain ID
+    range is available through
+    :py:obj:`~.CU_DEVICE_ATTRIBUTE_MEM_SYNC_DOMAIN_COUNT`.
 
     Attributes
     ----------
@@ -13935,7 +13958,7 @@ cdef class anon_struct2:
     Attributes
     ----------
     {{if 'CUlaunchAttributeValue_union.programmaticEvent.event' in found_struct}}
-    event : CUevent
+    event : :py:obj:`~.CUevent`
 
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.programmaticEvent.flags' in found_struct}}
@@ -14029,7 +14052,7 @@ cdef class anon_struct3:
     Attributes
     ----------
     {{if 'CUlaunchAttributeValue_union.launchCompletionEvent.event' in found_struct}}
-    event : CUevent
+    event : :py:obj:`~.CUevent`
 
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.launchCompletionEvent.flags' in found_struct}}
@@ -14191,7 +14214,7 @@ cdef class anon_struct5:
 
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.deviceUpdatableKernelNode.devNode' in found_struct}}
-    devNode : CUgraphDeviceNode
+    devNode : :py:obj:`~.CUgraphDeviceNode`
 
     {{endif}}
 
@@ -14260,7 +14283,8 @@ cdef class anon_struct5:
 
 cdef class CUlaunchAttributeValue_union:
     """
-    Launch attributes union; used as value field of CUlaunchAttribute
+    Launch attributes union; used as value field of
+    :py:obj:`~.CUlaunchAttribute`
 
     Attributes
     ----------
@@ -14269,115 +14293,125 @@ cdef class CUlaunchAttributeValue_union:
 
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.accessPolicyWindow' in found_struct}}
-    accessPolicyWindow : CUaccessPolicyWindow
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW.
+    accessPolicyWindow : :py:obj:`~.CUaccessPolicyWindow`
+        Value of launch attribute
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_ACCESS_POLICY_WINDOW`.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.cooperative' in found_struct}}
     cooperative : int
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_COOPERATIVE. Nonzero
-        indicates a cooperative kernel (see cuLaunchCooperativeKernel).
+        Value of launch attribute
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_COOPERATIVE`. Nonzero indicates a
+        cooperative kernel (see :func:`~.cuLaunchCooperativeKernel`).
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.syncPolicy' in found_struct}}
-    syncPolicy : CUsynchronizationPolicy
+    syncPolicy : :py:obj:`~.CUsynchronizationPolicy`
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY. CUsynchronizationPolicy
-        for work queued up in this stream
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_SYNCHRONIZATION_POLICY`.
+        :py:obj:`~.CUsynchronizationPolicy` for work queued up in this
+        stream
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.clusterDim' in found_struct}}
     clusterDim : anon_struct1
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
-        that represents the desired cluster dimensions for the kernel.
-        Opaque type with the following fields: - ``x`` - The X dimension of
-        the cluster, in blocks. Must be a divisor of the grid X dimension.
-        - ``y`` - The Y dimension of the cluster, in blocks. Must be a
-        divisor of the grid Y dimension.    - ``z`` - The Z dimension of
-        the cluster, in blocks. Must be a divisor of the grid Z dimension.
+        Value of launch attribute
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION` that represents
+        the desired cluster dimensions for the kernel. Opaque type with the
+        following fields: - ``x`` - The X dimension of the cluster, in
+        blocks. Must be a divisor of the grid X dimension.    - ``y`` - The
+        Y dimension of the cluster, in blocks. Must be a divisor of the
+        grid Y dimension.    - ``z`` - The Z dimension of the cluster, in
+        blocks. Must be a divisor of the grid Z dimension.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.clusterSchedulingPolicyPreference' in found_struct}}
-    clusterSchedulingPolicyPreference : CUclusterSchedulingPolicy
+    clusterSchedulingPolicyPreference : :py:obj:`~.CUclusterSchedulingPolicy`
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE. Cluster
-        scheduling policy preference for the kernel.
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE`.
+        Cluster scheduling policy preference for the kernel.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.programmaticStreamSerializationAllowed' in found_struct}}
     programmaticStreamSerializationAllowed : int
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION.
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_STREAM_SERIALIZATION`.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.programmaticEvent' in found_struct}}
     programmaticEvent : anon_struct2
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT
-        with the following fields: - ``CUevent`` event - Event to fire when
+        Value of launch attribute
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_PROGRAMMATIC_EVENT` with the
+        following fields: - :py:obj:`~.CUevent` event - Event to fire when
         all blocks trigger it.    - ``Event`` record flags, see
-        cuEventRecordWithFlags. Does not accept :CU_EVENT_RECORD_EXTERNAL.
-        - ``triggerAtBlockStart`` - If this is set to non-0, each block
+        :func:`~.cuEventRecordWithFlags`. Does not accept
+        :py:obj:`~.CU_EVENT_RECORD_EXTERNAL`.    -
+        ``triggerAtBlockStart`` - If this is set to non-0, each block
         launch will automatically trigger the event.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.launchCompletionEvent' in found_struct}}
     launchCompletionEvent : anon_struct3
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT with the following
-        fields: - ``CUevent`` event - Event to fire when the last block
-        launches    - ``int`` flags; - Event record flags, see
-        cuEventRecordWithFlags. Does not accept CU_EVENT_RECORD_EXTERNAL.
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_LAUNCH_COMPLETION_EVENT` with the
+        following fields: - :py:obj:`~.CUevent` event - Event to fire when
+        the last block launches    - ``int`` flags; - Event record flags,
+        see :func:`~.cuEventRecordWithFlags`. Does not accept
+        :py:obj:`~.CU_EVENT_RECORD_EXTERNAL`.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.priority' in found_struct}}
     priority : int
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_PRIORITY. Execution
-        priority of the kernel.
+        Value of launch attribute :py:obj:`~.CU_LAUNCH_ATTRIBUTE_PRIORITY`.
+        Execution priority of the kernel.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.memSyncDomainMap' in found_struct}}
-    memSyncDomainMap : CUlaunchMemSyncDomainMap
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP.
-        See CUlaunchMemSyncDomainMap.
+    memSyncDomainMap : :py:obj:`~.CUlaunchMemSyncDomainMap`
+        Value of launch attribute
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN_MAP`. See
+        :py:obj:`~.CUlaunchMemSyncDomainMap`.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.memSyncDomain' in found_struct}}
-    memSyncDomain : CUlaunchMemSyncDomain
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN.
-        See::CUlaunchMemSyncDomain
+    memSyncDomain : :py:obj:`~.CUlaunchMemSyncDomain`
+        Value of launch attribute
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_MEM_SYNC_DOMAIN`. See
+        :py:obj:`~.CUlaunchMemSyncDomain`.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.preferredClusterDim' in found_struct}}
     preferredClusterDim : anon_struct4
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION that represents the
-        desired preferred cluster dimensions for the kernel. Opaque type
-        with the following fields: - ``x`` - The X dimension of the
-        preferred cluster, in blocks. Must be a divisor of the grid X
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_PREFERRED_CLUSTER_DIMENSION` that
+        represents the desired preferred cluster dimensions for the kernel.
+        Opaque type with the following fields: - ``x`` - The X dimension of
+        the preferred cluster, in blocks. Must be a divisor of the grid X
         dimension, and must be a multiple of the ``x`` field of
-        CUlaunchAttributeValue::clusterDim.    - ``y`` - The Y dimension of
-        the preferred cluster, in blocks. Must be a divisor of the grid Y
-        dimension, and must be a multiple of the ``y`` field of
-        CUlaunchAttributeValue::clusterDim.    - ``z`` - The Z dimension of
-        the preferred cluster, in blocks. Must be equal to the ``z`` field
-        of CUlaunchAttributeValue::clusterDim.
+        :py:obj:`~.CUlaunchAttributeValue.clusterDim`.    - ``y`` - The Y
+        dimension of the preferred cluster, in blocks. Must be a divisor of
+        the grid Y dimension, and must be a multiple of the ``y`` field of
+        :py:obj:`~.CUlaunchAttributeValue.clusterDim`.    - ``z`` - The Z
+        dimension of the preferred cluster, in blocks. Must be equal to the
+        ``z`` field of :py:obj:`~.CUlaunchAttributeValue.clusterDim`.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.deviceUpdatableKernelNode' in found_struct}}
     deviceUpdatableKernelNode : anon_struct5
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE. with the
-        following fields: - ``int`` deviceUpdatable - Whether or not the
-        resulting kernel node should be device-updatable.    -
-        ``CUgraphDeviceNode`` devNode - Returns a handle to pass to the
-        various device-side update functions.
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_DEVICE_UPDATABLE_KERNEL_NODE` with
+        the following fields: - ``int`` deviceUpdatable - Whether or not
+        the resulting kernel node should be device-updatable.    -
+        :py:obj:`~.CUgraphDeviceNode` devNode - Returns a handle to pass to
+        the various device-side update functions.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.sharedMemCarveout' in found_struct}}
     sharedMemCarveout : unsigned int
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT.
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT`.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.nvlinkUtilCentricScheduling' in found_struct}}
     nvlinkUtilCentricScheduling : unsigned int
 
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.portableClusterSizeMode' in found_struct}}
-    portableClusterSizeMode : CUlaunchAttributePortableClusterMode
+    portableClusterSizeMode : :py:obj:`~.CUlaunchAttributePortableClusterMode`
         Value of launch attribute
-        CU_LAUNCH_ATTRIBUTE_PORTABLE_CLUSTER_SIZE_MODE.
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_PORTABLE_CLUSTER_SIZE_MODE`.
     {{endif}}
     {{if 'CUlaunchAttributeValue_union.sharedMemoryMode' in found_struct}}
-    sharedMemoryMode : CUsharedMemoryMode
-        Value of launch attribute CU_LAUNCH_ATTRIBUTE_SHARED_MEMORY_MODE.
-        See CUsharedMemoryMode for acceptable values.
+    sharedMemoryMode : :py:obj:`~.CUsharedMemoryMode`
+        Value of launch attribute
+        :py:obj:`~.CU_LAUNCH_ATTRIBUTE_SHARED_MEMORY_MODE`. See
+        :py:obj:`~.CUsharedMemoryMode` for acceptable values.
     {{endif}}
 
     Methods
@@ -14696,11 +14730,11 @@ cdef class CUlaunchAttribute_st:
     Attributes
     ----------
     {{if 'CUlaunchAttribute_st.id' in found_struct}}
-    id : CUlaunchAttributeID
+    id : :py:obj:`~.CUlaunchAttributeID`
         Attribute to set
     {{endif}}
     {{if 'CUlaunchAttribute_st.value' in found_struct}}
-    value : CUlaunchAttributeValue
+    value : :py:obj:`~.CUlaunchAttributeValue`
         Value of the attribute
     {{endif}}
 
@@ -14795,16 +14829,17 @@ cdef class CUlaunchConfig_st:
         Dynamic shared-memory size per thread block in bytes
     {{endif}}
     {{if 'CUlaunchConfig_st.hStream' in found_struct}}
-    hStream : CUstream
+    hStream : :py:obj:`~.CUstream`
         Stream identifier
     {{endif}}
     {{if 'CUlaunchConfig_st.attrs' in found_struct}}
-    attrs : CUlaunchAttribute
-        List of attributes; nullable if CUlaunchConfig::numAttrs == 0
+    attrs : :py:obj:`~.CUlaunchAttribute`
+        List of attributes; nullable if :py:obj:`~.CUlaunchConfig.numAttrs`
+        == 0
     {{endif}}
     {{if 'CUlaunchConfig_st.numAttrs' in found_struct}}
     numAttrs : unsigned int
-        Number of attributes populated in CUlaunchConfig::attrs
+        Number of attributes populated in :py:obj:`~.CUlaunchConfig.attrs`
     {{endif}}
 
     Methods
@@ -15007,7 +15042,7 @@ cdef class CUlaunchConfig_st:
 
 cdef class CUexecAffinitySmCount_st:
     """
-    Value for CU_EXEC_AFFINITY_TYPE_SM_COUNT
+    Value for :py:obj:`~.CU_EXEC_AFFINITY_TYPE_SM_COUNT`
 
     Attributes
     ----------
@@ -15060,7 +15095,7 @@ cdef class anon_union3:
     Attributes
     ----------
     {{if 'CUexecAffinityParam_st.param.smCount' in found_struct}}
-    smCount : CUexecAffinitySmCount
+    smCount : :py:obj:`~.CUexecAffinitySmCount`
 
     {{endif}}
 
@@ -15111,7 +15146,7 @@ cdef class CUexecAffinityParam_st:
     Attributes
     ----------
     {{if 'CUexecAffinityParam_st.type' in found_struct}}
-    type : CUexecAffinityType
+    type : :py:obj:`~.CUexecAffinityType`
         Type of execution affinity.
     {{endif}}
     {{if 'CUexecAffinityParam_st.param' in found_struct}}
@@ -15184,7 +15219,7 @@ cdef class CUctxCigParam_st:
     Attributes
     ----------
     {{if 'CUctxCigParam_st.sharedDataType' in found_struct}}
-    sharedDataType : CUcigDataType
+    sharedDataType : :py:obj:`~.CUcigDataType`
         Type of shared data from graphics client (D3D12 or Vulkan).
     {{endif}}
     {{if 'CUctxCigParam_st.sharedData' in found_struct}}
@@ -15256,7 +15291,7 @@ cdef class CUctxCreateParams_st:
     Attributes
     ----------
     {{if 'CUctxCreateParams_st.execAffinityParams' in found_struct}}
-    execAffinityParams : CUexecAffinityParam
+    execAffinityParams : :py:obj:`~.CUexecAffinityParam`
         Array of execution affinity parameters to limit context resources
         (e.g., SM count). Only supported Volta+ MPS. Mutually exclusive
         with cigParams.
@@ -15267,7 +15302,7 @@ cdef class CUctxCreateParams_st:
         execAffinityParams is NULL.
     {{endif}}
     {{if 'CUctxCreateParams_st.cigParams' in found_struct}}
-    cigParams : CUctxCigParam
+    cigParams : :py:obj:`~.CUctxCigParam`
         CIG (CUDA in Graphics) parameters for sharing data from
         D3D12/Vulkan graphics clients. Mutually exclusive with
         execAffinityParams.
@@ -15389,7 +15424,7 @@ cdef class CUstreamCigParam_st:
     Attributes
     ----------
     {{if 'CUstreamCigParam_st.streamSharedDataType' in found_struct}}
-    streamSharedDataType : CUstreamCigDataType
+    streamSharedDataType : :py:obj:`~.CUstreamCigDataType`
         Type of shared data from graphics client (D3D12).
     {{endif}}
     {{if 'CUstreamCigParam_st.streamSharedData' in found_struct}}
@@ -15460,7 +15495,7 @@ cdef class CUstreamCigCaptureParams_st:
     Attributes
     ----------
     {{if 'CUstreamCigCaptureParams_st.streamCigParams' in found_struct}}
-    streamCigParams : CUstreamCigParam
+    streamCigParams : :py:obj:`~.CUstreamCigParam`
         CIG (CUDA in Graphics) parameters for sharing command list data
         from D3D12 graphics clients.
     {{endif}}
@@ -15644,7 +15679,7 @@ cdef class CUDA_MEMCPY2D_st:
         Source Y
     {{endif}}
     {{if 'CUDA_MEMCPY2D_st.srcMemoryType' in found_struct}}
-    srcMemoryType : CUmemorytype
+    srcMemoryType : :py:obj:`~.CUmemorytype`
         Source memory type (host, device, array)
     {{endif}}
     {{if 'CUDA_MEMCPY2D_st.srcHost' in found_struct}}
@@ -15652,11 +15687,11 @@ cdef class CUDA_MEMCPY2D_st:
         Source host pointer
     {{endif}}
     {{if 'CUDA_MEMCPY2D_st.srcDevice' in found_struct}}
-    srcDevice : CUdeviceptr
+    srcDevice : :py:obj:`~.CUdeviceptr`
         Source device pointer
     {{endif}}
     {{if 'CUDA_MEMCPY2D_st.srcArray' in found_struct}}
-    srcArray : CUarray
+    srcArray : :py:obj:`~.CUarray`
         Source array reference
     {{endif}}
     {{if 'CUDA_MEMCPY2D_st.srcPitch' in found_struct}}
@@ -15672,7 +15707,7 @@ cdef class CUDA_MEMCPY2D_st:
         Destination Y
     {{endif}}
     {{if 'CUDA_MEMCPY2D_st.dstMemoryType' in found_struct}}
-    dstMemoryType : CUmemorytype
+    dstMemoryType : :py:obj:`~.CUmemorytype`
         Destination memory type (host, device, array)
     {{endif}}
     {{if 'CUDA_MEMCPY2D_st.dstHost' in found_struct}}
@@ -15680,11 +15715,11 @@ cdef class CUDA_MEMCPY2D_st:
         Destination host pointer
     {{endif}}
     {{if 'CUDA_MEMCPY2D_st.dstDevice' in found_struct}}
-    dstDevice : CUdeviceptr
+    dstDevice : :py:obj:`~.CUdeviceptr`
         Destination device pointer
     {{endif}}
     {{if 'CUDA_MEMCPY2D_st.dstArray' in found_struct}}
-    dstArray : CUarray
+    dstArray : :py:obj:`~.CUarray`
         Destination array reference
     {{endif}}
     {{if 'CUDA_MEMCPY2D_st.dstPitch' in found_struct}}
@@ -16024,7 +16059,7 @@ cdef class CUDA_MEMCPY3D_st:
         Source LOD
     {{endif}}
     {{if 'CUDA_MEMCPY3D_st.srcMemoryType' in found_struct}}
-    srcMemoryType : CUmemorytype
+    srcMemoryType : :py:obj:`~.CUmemorytype`
         Source memory type (host, device, array)
     {{endif}}
     {{if 'CUDA_MEMCPY3D_st.srcHost' in found_struct}}
@@ -16032,11 +16067,11 @@ cdef class CUDA_MEMCPY3D_st:
         Source host pointer
     {{endif}}
     {{if 'CUDA_MEMCPY3D_st.srcDevice' in found_struct}}
-    srcDevice : CUdeviceptr
+    srcDevice : :py:obj:`~.CUdeviceptr`
         Source device pointer
     {{endif}}
     {{if 'CUDA_MEMCPY3D_st.srcArray' in found_struct}}
-    srcArray : CUarray
+    srcArray : :py:obj:`~.CUarray`
         Source array reference
     {{endif}}
     {{if 'CUDA_MEMCPY3D_st.reserved0' in found_struct}}
@@ -16068,7 +16103,7 @@ cdef class CUDA_MEMCPY3D_st:
         Destination LOD
     {{endif}}
     {{if 'CUDA_MEMCPY3D_st.dstMemoryType' in found_struct}}
-    dstMemoryType : CUmemorytype
+    dstMemoryType : :py:obj:`~.CUmemorytype`
         Destination memory type (host, device, array)
     {{endif}}
     {{if 'CUDA_MEMCPY3D_st.dstHost' in found_struct}}
@@ -16076,11 +16111,11 @@ cdef class CUDA_MEMCPY3D_st:
         Destination host pointer
     {{endif}}
     {{if 'CUDA_MEMCPY3D_st.dstDevice' in found_struct}}
-    dstDevice : CUdeviceptr
+    dstDevice : :py:obj:`~.CUdeviceptr`
         Destination device pointer
     {{endif}}
     {{if 'CUDA_MEMCPY3D_st.dstArray' in found_struct}}
-    dstArray : CUarray
+    dstArray : :py:obj:`~.CUarray`
         Destination array reference
     {{endif}}
     {{if 'CUDA_MEMCPY3D_st.reserved1' in found_struct}}
@@ -16561,7 +16596,7 @@ cdef class CUDA_MEMCPY3D_PEER_st:
         Source LOD
     {{endif}}
     {{if 'CUDA_MEMCPY3D_PEER_st.srcMemoryType' in found_struct}}
-    srcMemoryType : CUmemorytype
+    srcMemoryType : :py:obj:`~.CUmemorytype`
         Source memory type (host, device, array)
     {{endif}}
     {{if 'CUDA_MEMCPY3D_PEER_st.srcHost' in found_struct}}
@@ -16569,16 +16604,17 @@ cdef class CUDA_MEMCPY3D_PEER_st:
         Source host pointer
     {{endif}}
     {{if 'CUDA_MEMCPY3D_PEER_st.srcDevice' in found_struct}}
-    srcDevice : CUdeviceptr
+    srcDevice : :py:obj:`~.CUdeviceptr`
         Source device pointer
     {{endif}}
     {{if 'CUDA_MEMCPY3D_PEER_st.srcArray' in found_struct}}
-    srcArray : CUarray
+    srcArray : :py:obj:`~.CUarray`
         Source array reference
     {{endif}}
     {{if 'CUDA_MEMCPY3D_PEER_st.srcContext' in found_struct}}
-    srcContext : CUcontext
-        Source context (ignored with srcMemoryType is CU_MEMORYTYPE_ARRAY)
+    srcContext : :py:obj:`~.CUcontext`
+        Source context (ignored with srcMemoryType is
+        :py:obj:`~.CU_MEMORYTYPE_ARRAY`)
     {{endif}}
     {{if 'CUDA_MEMCPY3D_PEER_st.srcPitch' in found_struct}}
     srcPitch : size_t
@@ -16605,7 +16641,7 @@ cdef class CUDA_MEMCPY3D_PEER_st:
         Destination LOD
     {{endif}}
     {{if 'CUDA_MEMCPY3D_PEER_st.dstMemoryType' in found_struct}}
-    dstMemoryType : CUmemorytype
+    dstMemoryType : :py:obj:`~.CUmemorytype`
         Destination memory type (host, device, array)
     {{endif}}
     {{if 'CUDA_MEMCPY3D_PEER_st.dstHost' in found_struct}}
@@ -16613,17 +16649,17 @@ cdef class CUDA_MEMCPY3D_PEER_st:
         Destination host pointer
     {{endif}}
     {{if 'CUDA_MEMCPY3D_PEER_st.dstDevice' in found_struct}}
-    dstDevice : CUdeviceptr
+    dstDevice : :py:obj:`~.CUdeviceptr`
         Destination device pointer
     {{endif}}
     {{if 'CUDA_MEMCPY3D_PEER_st.dstArray' in found_struct}}
-    dstArray : CUarray
+    dstArray : :py:obj:`~.CUarray`
         Destination array reference
     {{endif}}
     {{if 'CUDA_MEMCPY3D_PEER_st.dstContext' in found_struct}}
-    dstContext : CUcontext
+    dstContext : :py:obj:`~.CUcontext`
         Destination context (ignored with dstMemoryType is
-        CU_MEMORYTYPE_ARRAY)
+        :py:obj:`~.CU_MEMORYTYPE_ARRAY`)
     {{endif}}
     {{if 'CUDA_MEMCPY3D_PEER_st.dstPitch' in found_struct}}
     dstPitch : size_t
@@ -17113,11 +17149,11 @@ cdef class CUDA_MEMCPY_NODE_PARAMS_st:
         Must be zero
     {{endif}}
     {{if 'CUDA_MEMCPY_NODE_PARAMS_st.copyCtx' in found_struct}}
-    copyCtx : CUcontext
+    copyCtx : :py:obj:`~.CUcontext`
         Context on which to run the node
     {{endif}}
     {{if 'CUDA_MEMCPY_NODE_PARAMS_st.copyParams' in found_struct}}
-    copyParams : CUDA_MEMCPY3D
+    copyParams : :py:obj:`~.CUDA_MEMCPY3D`
         Parameters for the memory copy
     {{endif}}
 
@@ -17232,7 +17268,7 @@ cdef class CUDA_ARRAY_DESCRIPTOR_st:
         Height of array
     {{endif}}
     {{if 'CUDA_ARRAY_DESCRIPTOR_st.Format' in found_struct}}
-    Format : CUarray_format
+    Format : :py:obj:`~.CUarray_format`
         Array format
     {{endif}}
     {{if 'CUDA_ARRAY_DESCRIPTOR_st.NumChannels' in found_struct}}
@@ -17340,7 +17376,7 @@ cdef class CUDA_ARRAY3D_DESCRIPTOR_st:
         Depth of 3D array
     {{endif}}
     {{if 'CUDA_ARRAY3D_DESCRIPTOR_st.Format' in found_struct}}
-    Format : CUarray_format
+    Format : :py:obj:`~.CUarray_format`
         Array format
     {{endif}}
     {{if 'CUDA_ARRAY3D_DESCRIPTOR_st.NumChannels' in found_struct}}
@@ -17564,7 +17600,7 @@ cdef class CUDA_ARRAY_SPARSE_PROPERTIES_st:
     {{if 'CUDA_ARRAY_SPARSE_PROPERTIES_st.flags' in found_struct}}
     flags : unsigned int
         Flags will either be zero or
-        CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL
+        :py:obj:`~.CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL`
     {{endif}}
     {{if 'CUDA_ARRAY_SPARSE_PROPERTIES_st.reserved' in found_struct}}
     reserved : list[unsigned int]
@@ -17760,7 +17796,7 @@ cdef class anon_struct7:
     Attributes
     ----------
     {{if 'CUDA_RESOURCE_DESC_st.res.array.hArray' in found_struct}}
-    hArray : CUarray
+    hArray : :py:obj:`~.CUarray`
 
     {{endif}}
 
@@ -17818,7 +17854,7 @@ cdef class anon_struct8:
     Attributes
     ----------
     {{if 'CUDA_RESOURCE_DESC_st.res.mipmap.hMipmappedArray' in found_struct}}
-    hMipmappedArray : CUmipmappedArray
+    hMipmappedArray : :py:obj:`~.CUmipmappedArray`
 
     {{endif}}
 
@@ -17876,11 +17912,11 @@ cdef class anon_struct9:
     Attributes
     ----------
     {{if 'CUDA_RESOURCE_DESC_st.res.linear.devPtr' in found_struct}}
-    devPtr : CUdeviceptr
+    devPtr : :py:obj:`~.CUdeviceptr`
 
     {{endif}}
     {{if 'CUDA_RESOURCE_DESC_st.res.linear.format' in found_struct}}
-    format : CUarray_format
+    format : :py:obj:`~.CUarray_format`
 
     {{endif}}
     {{if 'CUDA_RESOURCE_DESC_st.res.linear.numChannels' in found_struct}}
@@ -17989,11 +18025,11 @@ cdef class anon_struct10:
     Attributes
     ----------
     {{if 'CUDA_RESOURCE_DESC_st.res.pitch2D.devPtr' in found_struct}}
-    devPtr : CUdeviceptr
+    devPtr : :py:obj:`~.CUdeviceptr`
 
     {{endif}}
     {{if 'CUDA_RESOURCE_DESC_st.res.pitch2D.format' in found_struct}}
-    format : CUarray_format
+    format : :py:obj:`~.CUarray_format`
 
     {{endif}}
     {{if 'CUDA_RESOURCE_DESC_st.res.pitch2D.numChannels' in found_struct}}
@@ -18319,7 +18355,7 @@ cdef class CUDA_RESOURCE_DESC_st:
     Attributes
     ----------
     {{if 'CUDA_RESOURCE_DESC_st.resType' in found_struct}}
-    resType : CUresourcetype
+    resType : :py:obj:`~.CUresourcetype`
         Resource type
     {{endif}}
     {{if 'CUDA_RESOURCE_DESC_st.res' in found_struct}}
@@ -18410,11 +18446,11 @@ cdef class CUDA_TEXTURE_DESC_st:
     Attributes
     ----------
     {{if 'CUDA_TEXTURE_DESC_st.addressMode' in found_struct}}
-    addressMode : list[CUaddress_mode]
+    addressMode : list[:py:obj:`~.CUaddress_mode`]
         Address modes
     {{endif}}
     {{if 'CUDA_TEXTURE_DESC_st.filterMode' in found_struct}}
-    filterMode : CUfilter_mode
+    filterMode : :py:obj:`~.CUfilter_mode`
         Filter mode
     {{endif}}
     {{if 'CUDA_TEXTURE_DESC_st.flags' in found_struct}}
@@ -18426,7 +18462,7 @@ cdef class CUDA_TEXTURE_DESC_st:
         Maximum anisotropy ratio
     {{endif}}
     {{if 'CUDA_TEXTURE_DESC_st.mipmapFilterMode' in found_struct}}
-    mipmapFilterMode : CUfilter_mode
+    mipmapFilterMode : :py:obj:`~.CUfilter_mode`
         Mipmap filter mode
     {{endif}}
     {{if 'CUDA_TEXTURE_DESC_st.mipmapLevelBias' in found_struct}}
@@ -18622,7 +18658,7 @@ cdef class CUDA_RESOURCE_VIEW_DESC_st:
     Attributes
     ----------
     {{if 'CUDA_RESOURCE_VIEW_DESC_st.format' in found_struct}}
-    format : CUresourceViewFormat
+    format : :py:obj:`~.CUresourceViewFormat`
         Resource view format
     {{endif}}
     {{if 'CUDA_RESOURCE_VIEW_DESC_st.width' in found_struct}}
@@ -18817,7 +18853,7 @@ cdef class CUtensorMap_st:
     Attributes
     ----------
     {{if 'CUtensorMap_st.opaque' in found_struct}}
-    opaque : list[cuuint64_t]
+    opaque : list[:py:obj:`~.cuuint64_t`]
 
     {{endif}}
 
@@ -18936,7 +18972,7 @@ cdef class CUDA_LAUNCH_PARAMS_st:
     Attributes
     ----------
     {{if 'CUDA_LAUNCH_PARAMS_st.function' in found_struct}}
-    function : CUfunction
+    function : :py:obj:`~.CUfunction`
         Kernel to launch
     {{endif}}
     {{if 'CUDA_LAUNCH_PARAMS_st.gridDimX' in found_struct}}
@@ -18968,7 +19004,7 @@ cdef class CUDA_LAUNCH_PARAMS_st:
         Dynamic shared-memory size per thread block in bytes
     {{endif}}
     {{if 'CUDA_LAUNCH_PARAMS_st.hStream' in found_struct}}
-    hStream : CUstream
+    hStream : :py:obj:`~.CUstream`
         Stream identifier
     {{endif}}
     {{if 'CUDA_LAUNCH_PARAMS_st.kernelParams' in found_struct}}
@@ -19325,7 +19361,7 @@ cdef class CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st:
     Attributes
     ----------
     {{if 'CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st.type' in found_struct}}
-    type : CUexternalMemoryHandleType
+    type : :py:obj:`~.CUexternalMemoryHandleType`
         Type of the handle
     {{endif}}
     {{if 'CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st.handle' in found_struct}}
@@ -19338,7 +19374,8 @@ cdef class CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st:
     {{endif}}
     {{if 'CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st.flags' in found_struct}}
     flags : unsigned int
-        Flags must either be zero or CUDA_EXTERNAL_MEMORY_DEDICATED
+        Flags must either be zero or
+        :py:obj:`~.CUDA_EXTERNAL_MEMORY_DEDICATED`
     {{endif}}
     {{if 'CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st.reserved' in found_struct}}
     reserved : list[unsigned int]
@@ -19561,7 +19598,7 @@ cdef class CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st:
         chain is.
     {{endif}}
     {{if 'CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st.arrayDesc' in found_struct}}
-    arrayDesc : CUDA_ARRAY3D_DESCRIPTOR
+    arrayDesc : :py:obj:`~.CUDA_ARRAY3D_DESCRIPTOR`
         Format, dimension and type of base level of the mipmap chain
     {{endif}}
     {{if 'CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC_st.numLevels' in found_struct}}
@@ -19816,7 +19853,7 @@ cdef class CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st:
     Attributes
     ----------
     {{if 'CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st.type' in found_struct}}
-    type : CUexternalSemaphoreHandleType
+    type : :py:obj:`~.CUexternalSemaphoreHandleType`
         Type of the handle
     {{endif}}
     {{if 'CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st.handle' in found_struct}}
@@ -20196,14 +20233,17 @@ cdef class CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st:
     {{endif}}
     {{if 'CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st.flags' in found_struct}}
     flags : unsigned int
-        Only when CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS is used to signal a
-        CUexternalSemaphore of type
-        CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, the valid flag is
-        CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC which
-        indicates that while signaling the CUexternalSemaphore, no memory
-        synchronization operations should be performed for any external
-        memory object imported as CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF.
-        For all other types of CUexternalSemaphore, flags must be zero.
+        Only when :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS` is used
+        to signal a :py:obj:`~.CUexternalSemaphore` of type
+        :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC`, the valid
+        flag is
+        :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_SKIP_NVSCIBUF_MEMSYNC`
+        which indicates that while signaling the
+        :py:obj:`~.CUexternalSemaphore`, no memory synchronization
+        operations should be performed for any external memory object
+        imported as :py:obj:`~.CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF`.
+        For all other types of :py:obj:`~.CUexternalSemaphore`, flags must
+        be zero.
     {{endif}}
     {{if 'CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st.reserved' in found_struct}}
     reserved : list[unsigned int]
@@ -20576,14 +20616,17 @@ cdef class CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st:
     {{endif}}
     {{if 'CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st.flags' in found_struct}}
     flags : unsigned int
-        Only when CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS is used to wait on a
-        CUexternalSemaphore of type
-        CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC, the valid flag is
-        CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC which indicates
-        that while waiting for the CUexternalSemaphore, no memory
-        synchronization operations should be performed for any external
-        memory object imported as CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF.
-        For all other types of CUexternalSemaphore, flags must be zero.
+        Only when :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS` is used
+        to wait on a :py:obj:`~.CUexternalSemaphore` of type
+        :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC`, the valid
+        flag is
+        :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_WAIT_SKIP_NVSCIBUF_MEMSYNC`
+        which indicates that while waiting for the
+        :py:obj:`~.CUexternalSemaphore`, no memory synchronization
+        operations should be performed for any external memory object
+        imported as :py:obj:`~.CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF`.
+        For all other types of :py:obj:`~.CUexternalSemaphore`, flags must
+        be zero.
     {{endif}}
     {{if 'CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS_st.reserved' in found_struct}}
     reserved : list[unsigned int]
@@ -20667,11 +20710,11 @@ cdef class CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_st:
     Attributes
     ----------
     {{if 'CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_st.extSemArray' in found_struct}}
-    extSemArray : CUexternalSemaphore
+    extSemArray : :py:obj:`~.CUexternalSemaphore`
         Array of external semaphore handles.
     {{endif}}
     {{if 'CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_st.paramsArray' in found_struct}}
-    paramsArray : CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS
+    paramsArray : :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS`
         Array of external semaphore signal parameters.
     {{endif}}
     {{if 'CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_st.numExtSems' in found_struct}}
@@ -20796,11 +20839,11 @@ cdef class CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v2_st:
     Attributes
     ----------
     {{if 'CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v2_st.extSemArray' in found_struct}}
-    extSemArray : CUexternalSemaphore
+    extSemArray : :py:obj:`~.CUexternalSemaphore`
         Array of external semaphore handles.
     {{endif}}
     {{if 'CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v2_st.paramsArray' in found_struct}}
-    paramsArray : CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS
+    paramsArray : :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS`
         Array of external semaphore signal parameters.
     {{endif}}
     {{if 'CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v2_st.numExtSems' in found_struct}}
@@ -20925,11 +20968,11 @@ cdef class CUDA_EXT_SEM_WAIT_NODE_PARAMS_st:
     Attributes
     ----------
     {{if 'CUDA_EXT_SEM_WAIT_NODE_PARAMS_st.extSemArray' in found_struct}}
-    extSemArray : CUexternalSemaphore
+    extSemArray : :py:obj:`~.CUexternalSemaphore`
         Array of external semaphore handles.
     {{endif}}
     {{if 'CUDA_EXT_SEM_WAIT_NODE_PARAMS_st.paramsArray' in found_struct}}
-    paramsArray : CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS
+    paramsArray : :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS`
         Array of external semaphore wait parameters.
     {{endif}}
     {{if 'CUDA_EXT_SEM_WAIT_NODE_PARAMS_st.numExtSems' in found_struct}}
@@ -21054,11 +21097,11 @@ cdef class CUDA_EXT_SEM_WAIT_NODE_PARAMS_v2_st:
     Attributes
     ----------
     {{if 'CUDA_EXT_SEM_WAIT_NODE_PARAMS_v2_st.extSemArray' in found_struct}}
-    extSemArray : CUexternalSemaphore
+    extSemArray : :py:obj:`~.CUexternalSemaphore`
         Array of external semaphore handles.
     {{endif}}
     {{if 'CUDA_EXT_SEM_WAIT_NODE_PARAMS_v2_st.paramsArray' in found_struct}}
-    paramsArray : CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS
+    paramsArray : :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS`
         Array of external semaphore wait parameters.
     {{endif}}
     {{if 'CUDA_EXT_SEM_WAIT_NODE_PARAMS_v2_st.numExtSems' in found_struct}}
@@ -21181,11 +21224,11 @@ cdef class anon_union9:
     Attributes
     ----------
     {{if 'CUarrayMapInfo_st.resource.mipmap' in found_struct}}
-    mipmap : CUmipmappedArray
+    mipmap : :py:obj:`~.CUmipmappedArray`
 
     {{endif}}
     {{if 'CUarrayMapInfo_st.resource.array' in found_struct}}
-    array : CUarray
+    array : :py:obj:`~.CUarray`
 
     {{endif}}
 
@@ -21593,7 +21636,7 @@ cdef class anon_union11:
     Attributes
     ----------
     {{if 'CUarrayMapInfo_st.memHandle.memHandle' in found_struct}}
-    memHandle : CUmemGenericAllocationHandle
+    memHandle : :py:obj:`~.CUmemGenericAllocationHandle`
 
     {{endif}}
 
@@ -21655,7 +21698,7 @@ cdef class CUarrayMapInfo_st:
     Attributes
     ----------
     {{if 'CUarrayMapInfo_st.resourceType' in found_struct}}
-    resourceType : CUresourcetype
+    resourceType : :py:obj:`~.CUresourcetype`
         Resource type
     {{endif}}
     {{if 'CUarrayMapInfo_st.resource' in found_struct}}
@@ -21663,7 +21706,7 @@ cdef class CUarrayMapInfo_st:
 
     {{endif}}
     {{if 'CUarrayMapInfo_st.subresourceType' in found_struct}}
-    subresourceType : CUarraySparseSubresourceType
+    subresourceType : :py:obj:`~.CUarraySparseSubresourceType`
         Sparse subresource type
     {{endif}}
     {{if 'CUarrayMapInfo_st.subresource' in found_struct}}
@@ -21671,11 +21714,11 @@ cdef class CUarrayMapInfo_st:
 
     {{endif}}
     {{if 'CUarrayMapInfo_st.memOperationType' in found_struct}}
-    memOperationType : CUmemOperationType
+    memOperationType : :py:obj:`~.CUmemOperationType`
         Memory operation type
     {{endif}}
     {{if 'CUarrayMapInfo_st.memHandleType' in found_struct}}
-    memHandleType : CUmemHandleType
+    memHandleType : :py:obj:`~.CUmemHandleType`
         Memory handle type
     {{endif}}
     {{if 'CUarrayMapInfo_st.memHandle' in found_struct}}
@@ -21896,14 +21939,17 @@ cdef class CUmemLocation_st:
     Attributes
     ----------
     {{if 'CUmemLocation_st.type' in found_struct}}
-    type : CUmemLocationType
+    type : :py:obj:`~.CUmemLocationType`
         Specifies the location type, which modifies the meaning of id.
     {{endif}}
     {{if 'CUmemLocation_st.id' in found_struct}}
     id : int
-        Identifier for CUmemLocationType::CU_MEM_LOCATION_TYPE_DEVICE,
-        CUmemLocationType::CU_MEM_LOCATION_TYPE_HOST,
-        CUmemLocationType::CU_MEM_LOCATION_TYPE_HOST_NUMA.
+        Identifier for the
+        :py:obj:`~.CUmemLocationType` values
+        :py:obj:`~.CU_MEM_LOCATION_TYPE_DEVICE`,
+        :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST`,
+        and
+        :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST_NUMA`.
     {{endif}}
 
     Methods
@@ -22071,24 +22117,24 @@ cdef class CUmemAllocationProp_st:
     Attributes
     ----------
     {{if 'CUmemAllocationProp_st.type' in found_struct}}
-    type : CUmemAllocationType
+    type : :py:obj:`~.CUmemAllocationType`
         Allocation type
     {{endif}}
     {{if 'CUmemAllocationProp_st.requestedHandleTypes' in found_struct}}
-    requestedHandleTypes : CUmemAllocationHandleType
-        requested CUmemAllocationHandleType
+    requestedHandleTypes : :py:obj:`~.CUmemAllocationHandleType`
+        requested :py:obj:`~.CUmemAllocationHandleType`
     {{endif}}
     {{if 'CUmemAllocationProp_st.location' in found_struct}}
-    location : CUmemLocation
+    location : :py:obj:`~.CUmemLocation`
         Location of allocation
     {{endif}}
     {{if 'CUmemAllocationProp_st.win32HandleMetaData' in found_struct}}
     win32HandleMetaData : Any
         Windows-specific POBJECT_ATTRIBUTES required when
-        CU_MEM_HANDLE_TYPE_WIN32 is specified. This object attributes
-        structure includes security attributes that define the scope of
-        which exported allocations may be transferred to other processes.
-        In all other cases, this field is required to be zero.
+        :py:obj:`~.CU_MEM_HANDLE_TYPE_WIN32` is specified. This object
+        attributes structure includes security attributes that define the
+        scope of which exported allocations may be transferred to other
+        processes. In all other cases, this field is required to be zero.
     {{endif}}
     {{if 'CUmemAllocationProp_st.allocFlags' in found_struct}}
     allocFlags : anon_struct22
@@ -22215,8 +22261,8 @@ cdef class CUmulticastObjectProp_st:
     {{endif}}
     {{if 'CUmulticastObjectProp_st.handleTypes' in found_struct}}
     handleTypes : unsigned long long
-        Bitmask of exportable handle types (see CUmemAllocationHandleType)
-        for this object
+        Bitmask of exportable handle types (see
+        :py:obj:`~.CUmemAllocationHandleType`) for this object
     {{endif}}
     {{if 'CUmulticastObjectProp_st.flags' in found_struct}}
     flags : unsigned long long
@@ -22311,11 +22357,11 @@ cdef class CUmemAccessDesc_st:
     Attributes
     ----------
     {{if 'CUmemAccessDesc_st.location' in found_struct}}
-    location : CUmemLocation
+    location : :py:obj:`~.CUmemLocation`
         Location on which the request is to change it's accessibility
     {{endif}}
     {{if 'CUmemAccessDesc_st.flags' in found_struct}}
-    flags : CUmemAccess_flags
+    flags : :py:obj:`~.CUmemAccess_flags`
         ``CUmemProt`` accessibility flags to set on the request
     {{endif}}
 
@@ -22377,22 +22423,22 @@ cdef class CUmemAccessDesc_st:
 
 cdef class CUgraphExecUpdateResultInfo_st:
     """
-    Result information returned by cuGraphExecUpdate
+    Result information returned by :py:obj:`~.cuGraphExecUpdate`
 
     Attributes
     ----------
     {{if 'CUgraphExecUpdateResultInfo_st.result' in found_struct}}
-    result : CUgraphExecUpdateResult
+    result : :py:obj:`~.CUgraphExecUpdateResult`
         Gives more specific detail when a cuda graph update fails.
     {{endif}}
     {{if 'CUgraphExecUpdateResultInfo_st.errorNode' in found_struct}}
-    errorNode : CUgraphNode
+    errorNode : :py:obj:`~.CUgraphNode`
         The "to node" of the error edge when the topologies do not match.
         The error node when the error is associated with a specific node.
         NULL when the error is generic.
     {{endif}}
     {{if 'CUgraphExecUpdateResultInfo_st.errorFromNode' in found_struct}}
-    errorFromNode : CUgraphNode
+    errorFromNode : :py:obj:`~.CUgraphNode`
         The from node of error edge when the topologies do not match.
         Otherwise NULL.
     {{endif}}
@@ -22495,25 +22541,25 @@ cdef class CUmemPoolProps_st:
     Attributes
     ----------
     {{if 'CUmemPoolProps_st.allocType' in found_struct}}
-    allocType : CUmemAllocationType
+    allocType : :py:obj:`~.CUmemAllocationType`
         Allocation type. Currently must be specified as
-        CU_MEM_ALLOCATION_TYPE_PINNED
+        :py:obj:`~.CU_MEM_ALLOCATION_TYPE_PINNED`
     {{endif}}
     {{if 'CUmemPoolProps_st.handleTypes' in found_struct}}
-    handleTypes : CUmemAllocationHandleType
+    handleTypes : :py:obj:`~.CUmemAllocationHandleType`
         Handle types that will be supported by allocations from the pool.
     {{endif}}
     {{if 'CUmemPoolProps_st.location' in found_struct}}
-    location : CUmemLocation
+    location : :py:obj:`~.CUmemLocation`
         Location where allocations should reside.
     {{endif}}
     {{if 'CUmemPoolProps_st.win32SecurityAttributes' in found_struct}}
     win32SecurityAttributes : Any
         Windows-specific LPSECURITYATTRIBUTES required when
-        CU_MEM_HANDLE_TYPE_WIN32 is specified. This security attribute
-        defines the scope of which exported allocations may be transferred
-        to other processes. In all other cases, this field is required to
-        be zero.
+        :py:obj:`~.CU_MEM_HANDLE_TYPE_WIN32` is specified. This security
+        attribute defines the scope of which exported allocations may be
+        transferred to other processes. In all other cases, this field is
+        required to be zero.
     {{endif}}
     {{if 'CUmemPoolProps_st.maxSize' in found_struct}}
     maxSize : size_t
@@ -22715,28 +22761,29 @@ cdef class CUmemPoolPtrExportData_st:
 cdef class CUmemcpyAttributes_st:
     """
     Attributes specific to copies within a batch. For more details on
-    usage see cuMemcpyBatchAsync.
+    usage see :py:obj:`~.cuMemcpyBatchAsync`.
 
     Attributes
     ----------
     {{if 'CUmemcpyAttributes_st.srcAccessOrder' in found_struct}}
-    srcAccessOrder : CUmemcpySrcAccessOrder
+    srcAccessOrder : :py:obj:`~.CUmemcpySrcAccessOrder`
         Source access ordering to be observed for copies with this
         attribute.
     {{endif}}
     {{if 'CUmemcpyAttributes_st.srcLocHint' in found_struct}}
-    srcLocHint : CUmemLocation
+    srcLocHint : :py:obj:`~.CUmemLocation`
         Hint location for the source operand. Ignored when the pointers are
         not managed memory or memory allocated outside CUDA.
     {{endif}}
     {{if 'CUmemcpyAttributes_st.dstLocHint' in found_struct}}
-    dstLocHint : CUmemLocation
+    dstLocHint : :py:obj:`~.CUmemLocation`
         Hint location for the destination operand. Ignored when the
         pointers are not managed memory or memory allocated outside CUDA.
     {{endif}}
     {{if 'CUmemcpyAttributes_st.flags' in found_struct}}
     flags : unsigned int
-        Additional flags for copies with this attribute. See CUmemcpyFlags
+        Additional flags for copies with this attribute. See
+        :py:obj:`~.CUmemcpyFlags`
     {{endif}}
 
     Methods
@@ -22914,7 +22961,8 @@ cdef class CUoffset3D_st:
 
 cdef class CUextent3D_st:
     """
-    Struct representing width/height/depth of a CUarray in elements
+    Struct representing width/height/depth of a :py:obj:`~.CUarray` in
+    elements
 
     Attributes
     ----------
@@ -23003,7 +23051,7 @@ cdef class anon_struct23:
     Attributes
     ----------
     {{if 'CUmemcpy3DOperand_st.op.ptr.ptr' in found_struct}}
-    ptr : CUdeviceptr
+    ptr : :py:obj:`~.CUdeviceptr`
 
     {{endif}}
     {{if 'CUmemcpy3DOperand_st.op.ptr.rowLength' in found_struct}}
@@ -23015,7 +23063,7 @@ cdef class anon_struct23:
 
     {{endif}}
     {{if 'CUmemcpy3DOperand_st.op.ptr.locHint' in found_struct}}
-    locHint : CUmemLocation
+    locHint : :py:obj:`~.CUmemLocation`
 
     {{endif}}
 
@@ -23119,11 +23167,11 @@ cdef class anon_struct24:
     Attributes
     ----------
     {{if 'CUmemcpy3DOperand_st.op.array.array' in found_struct}}
-    array : CUarray
+    array : :py:obj:`~.CUarray`
 
     {{endif}}
     {{if 'CUmemcpy3DOperand_st.op.array.offset' in found_struct}}
-    offset : CUoffset3D
+    offset : :py:obj:`~.CUoffset3D`
 
     {{endif}}
 
@@ -23265,12 +23313,13 @@ cdef class anon_union13:
 
 cdef class CUmemcpy3DOperand_st:
     """
-    Struct representing an operand for copy with cuMemcpy3DBatchAsync
+    Struct representing an operand for copy with
+    :py:obj:`~.cuMemcpy3DBatchAsync`
 
     Attributes
     ----------
     {{if 'CUmemcpy3DOperand_st.type' in found_struct}}
-    type : CUmemcpy3DOperandType
+    type : :py:obj:`~.CUmemcpy3DOperandType`
 
     {{endif}}
     {{if 'CUmemcpy3DOperand_st.op' in found_struct}}
@@ -23341,25 +23390,26 @@ cdef class CUDA_MEMCPY3D_BATCH_OP_st:
     Attributes
     ----------
     {{if 'CUDA_MEMCPY3D_BATCH_OP_st.src' in found_struct}}
-    src : CUmemcpy3DOperand
+    src : :py:obj:`~.CUmemcpy3DOperand`
         Source memcpy operand.
     {{endif}}
     {{if 'CUDA_MEMCPY3D_BATCH_OP_st.dst' in found_struct}}
-    dst : CUmemcpy3DOperand
+    dst : :py:obj:`~.CUmemcpy3DOperand`
         Destination memcpy operand.
     {{endif}}
     {{if 'CUDA_MEMCPY3D_BATCH_OP_st.extent' in found_struct}}
-    extent : CUextent3D
+    extent : :py:obj:`~.CUextent3D`
         Extents of the memcpy between src and dst. The width, height and
         depth components must not be 0.
     {{endif}}
     {{if 'CUDA_MEMCPY3D_BATCH_OP_st.srcAccessOrder' in found_struct}}
-    srcAccessOrder : CUmemcpySrcAccessOrder
+    srcAccessOrder : :py:obj:`~.CUmemcpySrcAccessOrder`
         Source access ordering to be observed for copy from src to dst.
     {{endif}}
     {{if 'CUDA_MEMCPY3D_BATCH_OP_st.flags' in found_struct}}
     flags : unsigned int
-        Additional flags for copies with this attribute. See CUmemcpyFlags
+        Additional flags for copies with this attribute. See
+        :py:obj:`~.CUmemcpyFlags`
     {{endif}}
 
     Methods
@@ -23473,13 +23523,13 @@ cdef class CUDA_MEM_ALLOC_NODE_PARAMS_v1_st:
     Attributes
     ----------
     {{if 'CUDA_MEM_ALLOC_NODE_PARAMS_v1_st.poolProps' in found_struct}}
-    poolProps : CUmemPoolProps
+    poolProps : :py:obj:`~.CUmemPoolProps`
         in: location where the allocation should reside (specified in
-        ``location``). ``handleTypes`` must be CU_MEM_HANDLE_TYPE_NONE. IPC
-        is not supported.
+        ``location``). ``handleTypes`` must be
+        :py:obj:`~.CU_MEM_HANDLE_TYPE_NONE`. IPC is not supported.
     {{endif}}
     {{if 'CUDA_MEM_ALLOC_NODE_PARAMS_v1_st.accessDescs' in found_struct}}
-    accessDescs : CUmemAccessDesc
+    accessDescs : :py:obj:`~.CUmemAccessDesc`
         in: array of memory access descriptors. Used to describe peer GPU
         access
     {{endif}}
@@ -23493,7 +23543,7 @@ cdef class CUDA_MEM_ALLOC_NODE_PARAMS_v1_st:
         in: size in bytes of the requested allocation
     {{endif}}
     {{if 'CUDA_MEM_ALLOC_NODE_PARAMS_v1_st.dptr' in found_struct}}
-    dptr : CUdeviceptr
+    dptr : :py:obj:`~.CUdeviceptr`
         out: address of the allocation returned by CUDA
     {{endif}}
 
@@ -23636,13 +23686,13 @@ cdef class CUDA_MEM_ALLOC_NODE_PARAMS_v2_st:
     Attributes
     ----------
     {{if 'CUDA_MEM_ALLOC_NODE_PARAMS_v2_st.poolProps' in found_struct}}
-    poolProps : CUmemPoolProps
+    poolProps : :py:obj:`~.CUmemPoolProps`
         in: location where the allocation should reside (specified in
-        ``location``). ``handleTypes`` must be CU_MEM_HANDLE_TYPE_NONE. IPC
-        is not supported.
+        ``location``). ``handleTypes`` must be
+        :py:obj:`~.CU_MEM_HANDLE_TYPE_NONE`. IPC is not supported.
     {{endif}}
     {{if 'CUDA_MEM_ALLOC_NODE_PARAMS_v2_st.accessDescs' in found_struct}}
-    accessDescs : CUmemAccessDesc
+    accessDescs : :py:obj:`~.CUmemAccessDesc`
         in: array of memory access descriptors. Used to describe peer GPU
         access
     {{endif}}
@@ -23656,7 +23706,7 @@ cdef class CUDA_MEM_ALLOC_NODE_PARAMS_v2_st:
         in: size in bytes of the requested allocation
     {{endif}}
     {{if 'CUDA_MEM_ALLOC_NODE_PARAMS_v2_st.dptr' in found_struct}}
-    dptr : CUdeviceptr
+    dptr : :py:obj:`~.CUdeviceptr`
         out: address of the allocation returned by CUDA
     {{endif}}
 
@@ -23799,7 +23849,7 @@ cdef class CUDA_MEM_FREE_NODE_PARAMS_st:
     Attributes
     ----------
     {{if 'CUDA_MEM_FREE_NODE_PARAMS_st.dptr' in found_struct}}
-    dptr : CUdeviceptr
+    dptr : :py:obj:`~.CUdeviceptr`
         in: the pointer to free
     {{endif}}
 
@@ -23862,7 +23912,7 @@ cdef class CUDA_CHILD_GRAPH_NODE_PARAMS_st:
     Attributes
     ----------
     {{if 'CUDA_CHILD_GRAPH_NODE_PARAMS_st.graph' in found_struct}}
-    graph : CUgraph
+    graph : :py:obj:`~.CUgraph`
         The child graph to clone into the node for node creation, or a
         handle to the graph owned by the node for node query. The graph
         must not contain conditional nodes. Graphs containing memory
@@ -23870,7 +23920,7 @@ cdef class CUDA_CHILD_GRAPH_NODE_PARAMS_st:
         to the parent.
     {{endif}}
     {{if 'CUDA_CHILD_GRAPH_NODE_PARAMS_st.ownership' in found_struct}}
-    ownership : CUgraphChildGraphNodeOwnership
+    ownership : :py:obj:`~.CUgraphChildGraphNodeOwnership`
         The ownership relationship of the child graph node.
     {{endif}}
 
@@ -23946,7 +23996,7 @@ cdef class CUDA_EVENT_RECORD_NODE_PARAMS_st:
     Attributes
     ----------
     {{if 'CUDA_EVENT_RECORD_NODE_PARAMS_st.event' in found_struct}}
-    event : CUevent
+    event : :py:obj:`~.CUevent`
         The event to record when the node executes
     {{endif}}
 
@@ -24008,7 +24058,7 @@ cdef class CUDA_EVENT_WAIT_NODE_PARAMS_st:
     Attributes
     ----------
     {{if 'CUDA_EVENT_WAIT_NODE_PARAMS_st.event' in found_struct}}
-    event : CUevent
+    event : :py:obj:`~.CUevent`
         The event to wait on from the node
     {{endif}}
 
@@ -24065,12 +24115,12 @@ cdef class CUDA_EVENT_WAIT_NODE_PARAMS_st:
 
 cdef class CUgraphNodeParams_st:
     """
-    Graph node parameters. See cuGraphAddNode.
+    Graph node parameters. See :py:obj:`~.cuGraphAddNode`.
 
     Attributes
     ----------
     {{if 'CUgraphNodeParams_st.type' in found_struct}}
-    type : CUgraphNodeType
+    type : :py:obj:`~.CUgraphNodeType`
         Type of the node
     {{endif}}
     {{if 'CUgraphNodeParams_st.reserved0' in found_struct}}
@@ -24082,55 +24132,55 @@ cdef class CUgraphNodeParams_st:
         Padding. Unused bytes must be zero.
     {{endif}}
     {{if 'CUgraphNodeParams_st.kernel' in found_struct}}
-    kernel : CUDA_KERNEL_NODE_PARAMS_v3
+    kernel : :py:obj:`~.CUDA_KERNEL_NODE_PARAMS_v3`
         Kernel node parameters.
     {{endif}}
     {{if 'CUgraphNodeParams_st.memcpy' in found_struct}}
-    memcpy : CUDA_MEMCPY_NODE_PARAMS
+    memcpy : :py:obj:`~.CUDA_MEMCPY_NODE_PARAMS`
         Memcpy node parameters.
     {{endif}}
     {{if 'CUgraphNodeParams_st.memset' in found_struct}}
-    memset : CUDA_MEMSET_NODE_PARAMS_v2
+    memset : :py:obj:`~.CUDA_MEMSET_NODE_PARAMS_v2`
         Memset node parameters.
     {{endif}}
     {{if 'CUgraphNodeParams_st.host' in found_struct}}
-    host : CUDA_HOST_NODE_PARAMS_v2
+    host : :py:obj:`~.CUDA_HOST_NODE_PARAMS_v2`
         Host node parameters.
     {{endif}}
     {{if 'CUgraphNodeParams_st.graph' in found_struct}}
-    graph : CUDA_CHILD_GRAPH_NODE_PARAMS
+    graph : :py:obj:`~.CUDA_CHILD_GRAPH_NODE_PARAMS`
         Child graph node parameters.
     {{endif}}
     {{if 'CUgraphNodeParams_st.eventWait' in found_struct}}
-    eventWait : CUDA_EVENT_WAIT_NODE_PARAMS
+    eventWait : :py:obj:`~.CUDA_EVENT_WAIT_NODE_PARAMS`
         Event wait node parameters.
     {{endif}}
     {{if 'CUgraphNodeParams_st.eventRecord' in found_struct}}
-    eventRecord : CUDA_EVENT_RECORD_NODE_PARAMS
+    eventRecord : :py:obj:`~.CUDA_EVENT_RECORD_NODE_PARAMS`
         Event record node parameters.
     {{endif}}
     {{if 'CUgraphNodeParams_st.extSemSignal' in found_struct}}
-    extSemSignal : CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v2
+    extSemSignal : :py:obj:`~.CUDA_EXT_SEM_SIGNAL_NODE_PARAMS_v2`
         External semaphore signal node parameters.
     {{endif}}
     {{if 'CUgraphNodeParams_st.extSemWait' in found_struct}}
-    extSemWait : CUDA_EXT_SEM_WAIT_NODE_PARAMS_v2
+    extSemWait : :py:obj:`~.CUDA_EXT_SEM_WAIT_NODE_PARAMS_v2`
         External semaphore wait node parameters.
     {{endif}}
     {{if 'CUgraphNodeParams_st.alloc' in found_struct}}
-    alloc : CUDA_MEM_ALLOC_NODE_PARAMS_v2
+    alloc : :py:obj:`~.CUDA_MEM_ALLOC_NODE_PARAMS_v2`
         Memory allocation node parameters.
     {{endif}}
     {{if 'CUgraphNodeParams_st.free' in found_struct}}
-    free : CUDA_MEM_FREE_NODE_PARAMS
+    free : :py:obj:`~.CUDA_MEM_FREE_NODE_PARAMS`
         Memory free node parameters.
     {{endif}}
     {{if 'CUgraphNodeParams_st.memOp' in found_struct}}
-    memOp : CUDA_BATCH_MEM_OP_NODE_PARAMS_v2
+    memOp : :py:obj:`~.CUDA_BATCH_MEM_OP_NODE_PARAMS_v2`
         MemOp node parameters.
     {{endif}}
     {{if 'CUgraphNodeParams_st.conditional' in found_struct}}
-    conditional : CUDA_CONDITIONAL_NODE_PARAMS
+    conditional : :py:obj:`~.CUDA_CONDITIONAL_NODE_PARAMS`
         Conditional node parameters.
     {{endif}}
     {{if 'CUgraphNodeParams_st.asBytes' in found_struct}}
@@ -24487,7 +24537,7 @@ cdef class CUcheckpointLockArgs_st:
         Reserved for future use, must be zero
     {{endif}}
     {{if 'CUcheckpointLockArgs_st.reserved1' in found_struct}}
-    reserved1 : list[cuuint64_t]
+    reserved1 : list[:py:obj:`~.cuuint64_t`]
         Reserved for future use, must be zeroed
     {{endif}}
 
@@ -24566,7 +24616,7 @@ cdef class CUcheckpointCheckpointArgs_st:
     Attributes
     ----------
     {{if 'CUcheckpointCheckpointArgs_st.reserved' in found_struct}}
-    reserved : list[cuuint64_t]
+    reserved : list[:py:obj:`~.cuuint64_t`]
         Reserved for future use, must be zeroed
     {{endif}}
 
@@ -24617,11 +24667,11 @@ cdef class CUcheckpointGpuPair_st:
     Attributes
     ----------
     {{if 'CUcheckpointGpuPair_st.oldUuid' in found_struct}}
-    oldUuid : CUuuid
+    oldUuid : :py:obj:`~.CUuuid`
         UUID of the GPU that was checkpointed
     {{endif}}
     {{if 'CUcheckpointGpuPair_st.newUuid' in found_struct}}
-    newUuid : CUuuid
+    newUuid : :py:obj:`~.CUuuid`
         UUID of the GPU to restore onto
     {{endif}}
 
@@ -24691,7 +24741,7 @@ cdef class CUcheckpointRestoreArgs_st:
     Attributes
     ----------
     {{if 'CUcheckpointRestoreArgs_st.gpuPairs' in found_struct}}
-    gpuPairs : CUcheckpointGpuPair
+    gpuPairs : :py:obj:`~.CUcheckpointGpuPair`
         Pointer to array of gpu pairs that indicate how to remap GPUs
         during restore
     {{endif}}
@@ -24704,11 +24754,11 @@ cdef class CUcheckpointRestoreArgs_st:
         Reserved for future use, must be zeroed
     {{endif}}
     {{if struct_field_types.get('CUcheckpointRestoreArgs_st.reserved') == 'cuuint64_t'}}
-    reserved : list[cuuint64_t]
+    reserved : list[:py:obj:`~.cuuint64_t`]
         Reserved for future use, must be zeroed
     {{endif}}
     {{if 'CUcheckpointRestoreArgs_st.reserved1' in found_struct}}
-    reserved1 : cuuint64_t
+    reserved1 : :py:obj:`~.cuuint64_t`
         Reserved for future use, must be zeroed
     {{endif}}
 
@@ -24854,7 +24904,7 @@ cdef class CUcheckpointUnlockArgs_st:
     Attributes
     ----------
     {{if 'CUcheckpointUnlockArgs_st.reserved' in found_struct}}
-    reserved : list[cuuint64_t]
+    reserved : list[:py:obj:`~.cuuint64_t`]
         Reserved for future use, must be zeroed
     {{endif}}
 
@@ -24908,34 +24958,36 @@ cdef class CUmemDecompressParams_st:
     {{if 'CUmemDecompressParams_st.srcNumBytes' in found_struct}}
     srcNumBytes : size_t
         The number of bytes to be read and decompressed from
-        CUmemDecompressParams_st.src.
+        :py:obj:`~.CUmemDecompressParams_st`.src.
     {{endif}}
     {{if 'CUmemDecompressParams_st.dstNumBytes' in found_struct}}
     dstNumBytes : size_t
         The number of bytes that the decompression operation will be
-        expected to write to CUmemDecompressParams_st.dst. This value is
-        optional; if present, it may be used by the CUDA driver as a
-        heuristic for scheduling the individual decompression operations.
+        expected to write to :py:obj:`~.CUmemDecompressParams_st`.dst. This
+        value is optional; if present, it may be used by the CUDA driver as
+        a heuristic for scheduling the individual decompression operations.
     {{endif}}
     {{if 'CUmemDecompressParams_st.dstActBytes' in found_struct}}
-    dstActBytes : cuuint32_t
+    dstActBytes : :py:obj:`~.cuuint32_t`
         After the decompression operation has completed, the actual number
-        of bytes written to CUmemDecompressParams.dst will be recorded as a
-        32-bit unsigned integer in the memory at this address.
+        of bytes written to :py:obj:`~.CUmemDecompressParams`.dst will be
+        recorded as a 32-bit unsigned integer in the memory at this
+        address.
     {{endif}}
     {{if 'CUmemDecompressParams_st.src' in found_struct}}
     src : Any
         Pointer to a buffer of at least
-        CUmemDecompressParams_st.srcNumBytes compressed bytes.
+        :py:obj:`~.CUmemDecompressParams_st`.srcNumBytes compressed bytes.
     {{endif}}
     {{if 'CUmemDecompressParams_st.dst' in found_struct}}
     dst : Any
         Pointer to a buffer where the decompressed data will be written.
         The number of bytes written to this location will be recorded in
-        the memory pointed to by CUmemDecompressParams_st.dstActBytes
+        the memory pointed to by
+        :py:obj:`~.CUmemDecompressParams_st`.dstActBytes
     {{endif}}
     {{if 'CUmemDecompressParams_st.algo' in found_struct}}
-    algo : CUmemDecompressAlgorithm
+    algo : :py:obj:`~.CUmemDecompressAlgorithm`
         The decompression algorithm to use.
     {{endif}}
     {{if 'CUmemDecompressParams_st.padding' in found_struct}}
@@ -25126,7 +25178,7 @@ cdef class anon_struct25:
     Attributes
     ----------
     {{if 'CUlogicalEndpointProp_struct.unicast.device' in found_struct}}
-    device : CUdevice
+    device : :py:obj:`~.CUdevice`
 
     {{endif}}
 
@@ -25233,8 +25285,9 @@ cdef class CUlogicalEndpointProp_struct:
     Attributes
     ----------
     {{if 'CUlogicalEndpointProp_struct.type' in found_struct}}
-    type : CUlogicalEndpointType
-        Type of the logical endpoint defined in CUlogicalEndpointType
+    type : :py:obj:`~.CUlogicalEndpointType`
+        Type of the logical endpoint defined in
+        :py:obj:`~.CUlogicalEndpointType`
     {{endif}}
     {{if 'CUlogicalEndpointProp_struct.unicast' in found_struct}}
     unicast : anon_struct25
@@ -25251,11 +25304,11 @@ cdef class CUlogicalEndpointProp_struct:
     {{if 'CUlogicalEndpointProp_struct.ipcHandleTypes' in found_struct}}
     ipcHandleTypes : unsigned int
         A bitmask of IPC handle types defined in
-        CUlogicalEndpointIpcHandleType
+        :py:obj:`~.CUlogicalEndpointIpcHandleType`
     {{endif}}
     {{if 'CUlogicalEndpointProp_struct.flags' in found_struct}}
     flags : unsigned int
-        A bitmask of flags defined in CUlogicalEndpointFlag
+        A bitmask of flags defined in :py:obj:`~.CUlogicalEndpointFlag`
     {{endif}}
 
     Methods
@@ -25398,7 +25451,7 @@ cdef class CUdevSmResource_st:
     {{if 'CUdevSmResource_st.flags' in found_struct}}
     flags : unsigned int
         The flags set on this SM resource. For possible values see
-        CUdevSmResourceGroup_flags.
+        :py:obj:`~.CUdevSmResourceGroup_flags`.
     {{endif}}
 
     Methods
@@ -25487,7 +25540,7 @@ cdef class CUdevWorkqueueConfigResource_st:
     Attributes
     ----------
     {{if 'CUdevWorkqueueConfigResource_st.device' in found_struct}}
-    device : CUdevice
+    device : :py:obj:`~.CUdevice`
         The device on which the workqueue resources are available
     {{endif}}
     {{if 'CUdevWorkqueueConfigResource_st.wqConcurrencyLimit' in found_struct}}
@@ -25495,7 +25548,7 @@ cdef class CUdevWorkqueueConfigResource_st:
         The expected maximum number of concurrent stream-ordered workloads
     {{endif}}
     {{if 'CUdevWorkqueueConfigResource_st.sharingScope' in found_struct}}
-    sharingScope : CUdevWorkqueueConfigScope
+    sharingScope : :py:obj:`~.CUdevWorkqueueConfigScope`
         The sharing scope for the workqueue resources
     {{endif}}
 
@@ -25651,7 +25704,7 @@ cdef class CU_DEV_SM_RESOURCE_GROUP_PARAMS_st:
     {{if 'CU_DEV_SM_RESOURCE_GROUP_PARAMS_st.flags' in found_struct}}
     flags : unsigned int
         The flags set on this SM resource group. For possible values see
-        CUdevSmResourceGroup_flags.
+        :py:obj:`~.CUdevSmResourceGroup_flags`.
     {{endif}}
     {{if 'CU_DEV_SM_RESOURCE_GROUP_PARAMS_st.reserved' in found_struct}}
     reserved : list[unsigned int]
@@ -25758,7 +25811,7 @@ cdef class CUdevResource_st:
     Attributes
     ----------
     {{if 'CUdevResource_st.type' in found_struct}}
-    type : CUdevResourceType
+    type : :py:obj:`~.CUdevResourceType`
         Type of resource, dictates which union field was last set
     {{endif}}
     {{if 'CUdevResource_st._internal_padding' in found_struct}}
@@ -25766,25 +25819,26 @@ cdef class CUdevResource_st:
 
     {{endif}}
     {{if 'CUdevResource_st.sm' in found_struct}}
-    sm : CUdevSmResource
-        Resource corresponding to CU_DEV_RESOURCE_TYPE_SM ``typename``.
+    sm : :py:obj:`~.CUdevSmResource`
+        Resource corresponding to :py:obj:`~.CU_DEV_RESOURCE_TYPE_SM`
+        ``typename``.
     {{endif}}
     {{if 'CUdevResource_st.wqConfig' in found_struct}}
-    wqConfig : CUdevWorkqueueConfigResource
-        Resource corresponding to CU_DEV_RESOURCE_TYPE_WORKQUEUE_CONFIG
-        ``typename``.
+    wqConfig : :py:obj:`~.CUdevWorkqueueConfigResource`
+        Resource corresponding to
+        :py:obj:`~.CU_DEV_RESOURCE_TYPE_WORKQUEUE_CONFIG` ``typename``.
     {{endif}}
     {{if 'CUdevResource_st.wq' in found_struct}}
-    wq : CUdevWorkqueueResource
-        Resource corresponding to CU_DEV_RESOURCE_TYPE_WORKQUEUE
-        ``typename``.
+    wq : :py:obj:`~.CUdevWorkqueueResource`
+        Resource corresponding to
+        :py:obj:`~.CU_DEV_RESOURCE_TYPE_WORKQUEUE` ``typename``.
     {{endif}}
     {{if 'CUdevResource_st._oversize' in found_struct}}
     _oversize : bytes
 
     {{endif}}
     {{if 'CUdevResource_st.nextResource' in found_struct}}
-    nextResource : CUdevResource_st
+    nextResource : :py:obj:`~.CUdevResource_st`
 
     {{endif}}
 
@@ -25954,7 +26008,7 @@ cdef class anon_union17:
     Attributes
     ----------
     {{if True}}
-    pArray : list[CUarray]
+    pArray : list[:py:obj:`~.CUarray`]
 
     {{endif}}
     {{if True}}
@@ -26059,15 +26113,15 @@ cdef class CUeglFrame_st:
         Number of channels for the plane
     {{endif}}
     {{if True}}
-    frameType : CUeglFrameType
+    frameType : :py:obj:`~.CUeglFrameType`
         Array or Pitch
     {{endif}}
     {{if True}}
-    eglColorFormat : CUeglColorFormat
+    eglColorFormat : :py:obj:`~.CUeglColorFormat`
         CUDA EGL Color Format
     {{endif}}
     {{if True}}
-    cuFormat : CUarray_format
+    cuFormat : :py:obj:`~.CUarray_format`
         CUDA Array Format
     {{endif}}
 
@@ -26300,7 +26354,7 @@ cdef class cuuint64_t:
 cdef class CUdeviceptr_v2:
     """
 
-    CUDA device pointer CUdeviceptr is defined as an unsigned integer type whose size matches the size of a pointer on the target platform.
+    CUDA device pointer :py:obj:`~.CUdeviceptr` is defined as an unsigned integer type whose size matches the size of a pointer on the target platform.
 
     Methods
     -------
@@ -26685,14 +26739,14 @@ def cuGetErrorString(error not None : CUresult):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     pStr : bytes
         Address of the string pointer.
 
     See Also
     --------
-    :py:obj:`~.CUresult`, :py:obj:`~.cudaGetErrorString`
+    :py:obj:`~.CUresult`, :func:`~.cudaGetErrorString`
     """
     cdef cydriver.CUresult cyerror = int(error)
     cdef const char* pStr = NULL
@@ -26721,14 +26775,14 @@ def cuGetErrorName(error not None : CUresult):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     pStr : bytes
         Address of the string pointer.
 
     See Also
     --------
-    :py:obj:`~.CUresult`, :py:obj:`~.cudaGetErrorName`
+    :py:obj:`~.CUresult`, :func:`~.cudaGetErrorName`
     """
     cdef cydriver.CUresult cyerror = int(error)
     cdef const char* pStr = NULL
@@ -26745,8 +26799,8 @@ def cuGetErrorName(error not None : CUresult):
 def cuInit(unsigned int Flags):
     """ Initialize the CUDA driver API Initializes the driver API and must be called before any other function from the driver API in the current process. Currently, the ``Flags`` parameter must be 0. If :py:obj:`~.cuInit()` has not been called, any function from the driver API will return :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`.
 
-    Note: cuInit preloads various libraries needed for JIT compilation. To
-    opt-out of this behavior, set the environment variable
+    Note: :func:`~.cuInit` preloads various libraries needed for JIT
+    compilation. To opt-out of this behavior, set the environment variable
     CUDA_FORCE_PRELOAD_LIBRARIES=0. CUDA will lazily load JIT libraries as
     needed. To disable JIT entirely, set the environment variable
     CUDA_DISABLE_JIT=1.
@@ -26758,7 +26812,7 @@ def cuInit(unsigned int Flags):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`, :py:obj:`~.CUDA_ERROR_SYSTEM_DRIVER_MISMATCH`, :py:obj:`~.CUDA_ERROR_COMPAT_NOT_SUPPORTED_ON_DEVICE`
     """
     with nogil:
@@ -26781,14 +26835,14 @@ def cuDriverGetVersion():
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     driverVersion : int
         Returns the CUDA driver version
 
     See Also
     --------
-    :py:obj:`~.cudaDriverGetVersion`, :py:obj:`~.cudaRuntimeGetVersion`
+    :func:`~.cudaDriverGetVersion`, :func:`~.cudaRuntimeGetVersion`
     """
     cdef int driverVersion = 0
     with nogil:
@@ -26814,7 +26868,7 @@ def cuDeviceGet(int ordinal):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`
     device : :py:obj:`~.CUdevice`
         Returned device handle
@@ -26843,14 +26897,14 @@ def cuDeviceGetCount():
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     count : int
         Returned number of compute-capable devices
 
     See Also
     --------
-    :py:obj:`~.cuDeviceGetAttribute`, :py:obj:`~.cuDeviceGetName`, :py:obj:`~.cuDeviceGetUuid`, :py:obj:`~.cuDeviceGetLuid`, :py:obj:`~.cuDeviceGet`, :py:obj:`~.cuDeviceTotalMem`, :py:obj:`~.cuDeviceGetExecAffinitySupport`, :py:obj:`~.cudaGetDeviceCount`
+    :py:obj:`~.cuDeviceGetAttribute`, :py:obj:`~.cuDeviceGetName`, :py:obj:`~.cuDeviceGetUuid`, :py:obj:`~.cuDeviceGetLuid`, :py:obj:`~.cuDeviceGet`, :py:obj:`~.cuDeviceTotalMem`, :py:obj:`~.cuDeviceGetExecAffinitySupport`, :func:`~.cudaGetDeviceCount`
     """
     cdef int count = 0
     with nogil:
@@ -26881,14 +26935,14 @@ def cuDeviceGetName(int length, dev):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`
     name : bytes
         Returned identifier string for the device
 
     See Also
     --------
-    :py:obj:`~.cuDeviceGetAttribute`, :py:obj:`~.cuDeviceGetUuid`, :py:obj:`~.cuDeviceGetLuid`, :py:obj:`~.cuDeviceGetCount`, :py:obj:`~.cuDeviceGet`, :py:obj:`~.cuDeviceTotalMem`, :py:obj:`~.cuDeviceGetExecAffinitySupport`, :py:obj:`~.cudaGetDeviceProperties`
+    :py:obj:`~.cuDeviceGetAttribute`, :py:obj:`~.cuDeviceGetUuid`, :py:obj:`~.cuDeviceGetLuid`, :py:obj:`~.cuDeviceGetCount`, :py:obj:`~.cuDeviceGet`, :py:obj:`~.cuDeviceTotalMem`, :py:obj:`~.cuDeviceGetExecAffinitySupport`, :func:`~.cudaGetDeviceProperties`
     """
     cdef cydriver.CUdevice cydev
     if dev is None:
@@ -26924,14 +26978,14 @@ def cuDeviceGetUuid(dev):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`
     uuid : :py:obj:`~.CUuuid`
         Returned UUID
 
     See Also
     --------
-    :py:obj:`~.cuDeviceGetAttribute`, :py:obj:`~.cuDeviceGetCount`, :py:obj:`~.cuDeviceGetName`, :py:obj:`~.cuDeviceGetLuid`, :py:obj:`~.cuDeviceGet`, :py:obj:`~.cuDeviceTotalMem`, :py:obj:`~.cudaGetDeviceProperties`
+    :py:obj:`~.cuDeviceGetAttribute`, :py:obj:`~.cuDeviceGetCount`, :py:obj:`~.cuDeviceGetName`, :py:obj:`~.cuDeviceGetLuid`, :py:obj:`~.cuDeviceGet`, :py:obj:`~.cuDeviceTotalMem`, :func:`~.cudaGetDeviceProperties`
     """
     cdef cydriver.CUdevice cydev
     if dev is None:
@@ -26965,7 +27019,7 @@ def cuDeviceGetLuid(dev):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`
     luid : bytes
         Returned LUID
@@ -26974,7 +27028,7 @@ def cuDeviceGetLuid(dev):
 
     See Also
     --------
-    :py:obj:`~.cuDeviceGetAttribute`, :py:obj:`~.cuDeviceGetCount`, :py:obj:`~.cuDeviceGetName`, :py:obj:`~.cuDeviceGet`, :py:obj:`~.cuDeviceTotalMem`, :py:obj:`~.cuDeviceGetExecAffinitySupport`, :py:obj:`~.cudaGetDeviceProperties`
+    :py:obj:`~.cuDeviceGetAttribute`, :py:obj:`~.cuDeviceGetCount`, :py:obj:`~.cuDeviceGetName`, :py:obj:`~.cuDeviceGet`, :py:obj:`~.cuDeviceTotalMem`, :py:obj:`~.cuDeviceGetExecAffinitySupport`, :func:`~.cudaGetDeviceProperties`
     """
     cdef cydriver.CUdevice cydev
     if dev is None:
@@ -27009,14 +27063,14 @@ def cuDeviceTotalMem(dev):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`
     numbytes : int
         Returned memory available on device in bytes
 
     See Also
     --------
-    :py:obj:`~.cuDeviceGetAttribute`, :py:obj:`~.cuDeviceGetCount`, :py:obj:`~.cuDeviceGetName`, :py:obj:`~.cuDeviceGetUuid`, :py:obj:`~.cuDeviceGet`, :py:obj:`~.cuDeviceGetExecAffinitySupport`, :py:obj:`~.cudaMemGetInfo`
+    :py:obj:`~.cuDeviceGetAttribute`, :py:obj:`~.cuDeviceGetCount`, :py:obj:`~.cuDeviceGetName`, :py:obj:`~.cuDeviceGetUuid`, :py:obj:`~.cuDeviceGet`, :py:obj:`~.cuDeviceGetExecAffinitySupport`, :func:`~.cudaMemGetInfo`
     """
     cdef cydriver.CUdevice cydev
     if dev is None:
@@ -27055,7 +27109,7 @@ def cuDeviceGetTexture1DLinearMaxWidth(pformat not None : CUarray_format, unsign
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`
     maxWidthInElements : int
         Returned maximum number of texture elements allocatable for given
@@ -27063,7 +27117,7 @@ def cuDeviceGetTexture1DLinearMaxWidth(pformat not None : CUarray_format, unsign
 
     See Also
     --------
-    :py:obj:`~.cuDeviceGetAttribute`, :py:obj:`~.cuDeviceGetCount`, :py:obj:`~.cuDeviceGetName`, :py:obj:`~.cuDeviceGetUuid`, :py:obj:`~.cuDeviceGet`, :py:obj:`~.cudaMemGetInfo`, :py:obj:`~.cuDeviceTotalMem`
+    :py:obj:`~.cuDeviceGetAttribute`, :py:obj:`~.cuDeviceGetCount`, :py:obj:`~.cuDeviceGetName`, :py:obj:`~.cuDeviceGetUuid`, :py:obj:`~.cuDeviceGet`, :func:`~.cudaMemGetInfo`, :py:obj:`~.cuDeviceTotalMem`
     """
     cdef cydriver.CUdevice cydev
     if dev is None:
@@ -27100,14 +27154,14 @@ def cuDeviceGetAttribute(attrib not None : CUdevice_attribute, dev):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`
     pi : int
         Returned device attribute value
 
     See Also
     --------
-    :py:obj:`~.cuDeviceGetCount`, :py:obj:`~.cuDeviceGetName`, :py:obj:`~.cuDeviceGetUuid`, :py:obj:`~.cuDeviceGet`, :py:obj:`~.cuDeviceTotalMem`, :py:obj:`~.cuDeviceGetExecAffinitySupport`, :py:obj:`~.cudaDeviceGetAttribute`, :py:obj:`~.cudaGetDeviceProperties`
+    :py:obj:`~.cuDeviceGetCount`, :py:obj:`~.cuDeviceGetName`, :py:obj:`~.cuDeviceGetUuid`, :py:obj:`~.cuDeviceGet`, :py:obj:`~.cuDeviceTotalMem`, :py:obj:`~.cuDeviceGetExecAffinitySupport`, :func:`~.cudaDeviceGetAttribute`, :func:`~.cudaGetDeviceProperties`
     """
     cdef cydriver.CUdevice cydev
     if dev is None:
@@ -27159,14 +27213,14 @@ def cuDeviceGetHostAtomicCapabilities(operations : Optional[tuple[CUatomicOperat
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     capabilities : list[unsigned int]
         Returned capability details of each requested operation
 
     See Also
     --------
-    :py:obj:`~.cuDeviceGetAttribute`, :py:obj:`~.cuDeviceGetP2PAtomicCapabilities`, :py:obj:`~.cudaDeviceGeHostAtomicCapabilities`
+    :py:obj:`~.cuDeviceGetAttribute`, :py:obj:`~.cuDeviceGetP2PAtomicCapabilities`, ``cudaDeviceGetHostAtomicCapabilities``
     """
     cdef cydriver.CUdevice cydev
     if dev is None:
@@ -27275,7 +27329,7 @@ def cuDeviceGetNvSciSyncAttributes(nvSciSyncAttrList, dev, int flags):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
 
 
     See Also
@@ -27318,7 +27372,7 @@ def cuDeviceSetMemPool(dev, pool):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
@@ -27369,7 +27423,7 @@ def cuDeviceGetMemPool(dev):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     pool : :py:obj:`~.CUmemoryPool`
         None
@@ -27410,7 +27464,7 @@ def cuDeviceGetDefaultMemPool(dev):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED` :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
     pool_out : :py:obj:`~.CUmemoryPool`
         None
@@ -27456,7 +27510,7 @@ def cuDeviceGetExecAffinitySupport(typename not None : CUexecAffinityType, dev):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`
     pi : int
         1 if the execution affinity type ``typename`` is supported by the
@@ -27511,7 +27565,7 @@ def cuFlushGPUDirectRDMAWrites(target not None : CUflushGPUDirectRDMAWritesTarge
     by using mechanisms outside the scope of CUDA.
 
     Users may query support for this API via
-    :py:obj:`~.CU_DEVICE_ATTRIBUTE_FLUSH_FLUSH_GPU_DIRECT_RDMA_OPTIONS`.
+    :py:obj:`~.CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS`.
 
     Parameters
     ----------
@@ -27524,7 +27578,7 @@ def cuFlushGPUDirectRDMAWrites(target not None : CUflushGPUDirectRDMAWritesTarge
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`,
     """
     cdef cydriver.CUflushGPUDirectRDMAWritesTarget cytarget = int(target)
@@ -27552,35 +27606,33 @@ def cuDeviceGetProperties(dev):
 
     where:
 
-    - :py:obj:`~.maxThreadsPerBlock` is the maximum number of threads per
-      block;
+    - ``maxThreadsPerBlock`` is the maximum number of threads per block;
 
-    - :py:obj:`~.maxThreadsDim` ``[3]`` is the maximum sizes of each
-      dimension of a block;
+    - ``maxThreadsDim[3]`` is the maximum size of each dimension of a
+      block;
 
-    - :py:obj:`~.maxGridSize` ``[3]`` is the maximum sizes of each
-      dimension of a grid;
+    - ``maxGridSize[3]`` is the maximum size of each dimension of a grid;
 
-    - :py:obj:`~.sharedMemPerBlock` is the total amount of shared memory
-      available per block in bytes;
+    - ``sharedMemPerBlock`` is the total amount of shared memory available
+      per block in bytes;
 
-    - :py:obj:`~.totalConstantMemory` is the total amount of constant
-      memory available on the device in bytes;
+    - ``totalConstantMemory`` is the total amount of constant memory
+      available on the device in bytes;
 
-    - :py:obj:`~.SIMDWidth` is the warp size;
+    - ``SIMDWidth`` is the warp size;
 
-    - :py:obj:`~.memPitch` is the maximum pitch allowed by the memory copy
+    - ``memPitch`` is the maximum pitch allowed by the memory copy
       functions that involve memory regions allocated through
       :py:obj:`~.cuMemAllocPitch()`;
 
-    - :py:obj:`~.regsPerBlock` is the total number of registers available
-      per block;
+    - ``regsPerBlock`` is the total number of registers available per
+      block;
 
-    - :py:obj:`~.clockRate` is the clock frequency in kilohertz;
+    - ``clockRate`` is the clock frequency in kilohertz;
 
-    - :py:obj:`~.textureAlign` is the alignment requirement; texture base
-      addresses that are aligned to :py:obj:`~.textureAlign` bytes do not
-      need an offset applied to texture fetches.
+    - ``textureAlign`` is the alignment requirement; texture base addresses
+      that are aligned to ``textureAlign`` bytes do not need an offset
+      applied to texture fetches.
 
     Parameters
     ----------
@@ -27589,7 +27641,7 @@ def cuDeviceGetProperties(dev):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`
     prop : :py:obj:`~.CUdevprop`
         Returned properties of device
@@ -27635,7 +27687,7 @@ def cuDeviceComputeCapability(dev):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`
     major : int
         Major revision number
@@ -27696,7 +27748,7 @@ def cuDevicePrimaryCtxRetain(dev):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`, :py:obj:`~.CUDA_ERROR_UNKNOWN`
     pctx : :py:obj:`~.CUcontext`
         Returned context handle of the new context
@@ -27747,7 +27799,7 @@ def cuDevicePrimaryCtxRelease(dev):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`
 
     See Also
@@ -27862,12 +27914,12 @@ def cuDevicePrimaryCtxSetFlags(dev, unsigned int flags):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`,
 
     See Also
     --------
-    :py:obj:`~.cuDevicePrimaryCtxRetain`, :py:obj:`~.cuDevicePrimaryCtxGetState`, :py:obj:`~.cuCtxCreate`, :py:obj:`~.cuCtxGetFlags`, :py:obj:`~.cuCtxSetFlags`, :py:obj:`~.cudaSetDeviceFlags`
+    :py:obj:`~.cuDevicePrimaryCtxRetain`, :py:obj:`~.cuDevicePrimaryCtxGetState`, :py:obj:`~.cuCtxCreate`, :py:obj:`~.cuCtxGetFlags`, :py:obj:`~.cuCtxSetFlags`, :func:`~.cudaSetDeviceFlags`
     """
     cdef cydriver.CUdevice cydev
     if dev is None:
@@ -27899,7 +27951,7 @@ def cuDevicePrimaryCtxGetState(dev):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`,
     flags : unsigned int
         Pointer to store flags
@@ -27908,7 +27960,7 @@ def cuDevicePrimaryCtxGetState(dev):
 
     See Also
     --------
-    :py:obj:`~.cuDevicePrimaryCtxSetFlags`, :py:obj:`~.cuCtxGetFlags`, :py:obj:`~.cuCtxSetFlags`, :py:obj:`~.cudaGetDeviceFlags`
+    :py:obj:`~.cuDevicePrimaryCtxSetFlags`, :py:obj:`~.cuCtxGetFlags`, :py:obj:`~.cuCtxSetFlags`, :func:`~.cudaGetDeviceFlags`
     """
     cdef cydriver.CUdevice cydev
     if dev is None:
@@ -27952,12 +28004,12 @@ def cuDevicePrimaryCtxReset(dev):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`, :py:obj:`~.CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE`
 
     See Also
     --------
-    :py:obj:`~.cuDevicePrimaryCtxRetain`, :py:obj:`~.cuDevicePrimaryCtxRelease`, :py:obj:`~.cuCtxGetApiVersion`, :py:obj:`~.cuCtxGetCacheConfig`, :py:obj:`~.cuCtxGetDevice`, :py:obj:`~.cuCtxGetFlags`, :py:obj:`~.cuCtxGetLimit`, :py:obj:`~.cuCtxPopCurrent`, :py:obj:`~.cuCtxPushCurrent`, :py:obj:`~.cuCtxSetCacheConfig`, :py:obj:`~.cuCtxSetLimit`, :py:obj:`~.cuCtxSynchronize`, :py:obj:`~.cudaDeviceReset`
+    :py:obj:`~.cuDevicePrimaryCtxRetain`, :py:obj:`~.cuDevicePrimaryCtxRelease`, :py:obj:`~.cuCtxGetApiVersion`, :py:obj:`~.cuCtxGetCacheConfig`, :py:obj:`~.cuCtxGetDevice`, :py:obj:`~.cuCtxGetFlags`, :py:obj:`~.cuCtxGetLimit`, :py:obj:`~.cuCtxPopCurrent`, :py:obj:`~.cuCtxPushCurrent`, :py:obj:`~.cuCtxSetCacheConfig`, :py:obj:`~.cuCtxSetLimit`, :py:obj:`~.cuCtxSynchronize`, :func:`~.cudaDeviceReset`
     """
     cdef cydriver.CUdevice cydev
     if dev is None:
@@ -27992,17 +28044,17 @@ def cuCtxCreate(ctxCreateParams : Optional[CUctxCreateParams], unsigned int flag
     A CUDA context can be created with execution affinity. The type and the
     amount of execution resource the context can use is limited by
     ``paramsArray`` and ``numExecAffinityParams`` in ``execAffinity``. The
-    ``paramsArray`` is an array of ``CUexecAffinityParam`` and the
+    ``paramsArray`` is an array of :py:obj:`~.CUexecAffinityParam` and the
     ``numExecAffinityParams`` describes the size of the paramsArray. If two
-    ``CUexecAffinityParam`` in the array have the same type, the latter
-    execution affinity parameter overrides the former execution affinity
-    parameter. The supported execution affinity types are:
+    :py:obj:`~.CUexecAffinityParam` in the array have the same type, the
+    latter execution affinity parameter overrides the former execution
+    affinity parameter. The supported execution affinity types are:
 
     - :py:obj:`~.CU_EXEC_AFFINITY_TYPE_SM_COUNT` limits the portion of SMs
       that the context can use. The portion of SMs is specified as the
-      number of SMs via ``CUexecAffinitySmCount``. This limit will be
-      internally rounded up to the next hardware-supported amount. Hence,
-      it is imperative to query the actual execution affinity of the
+      number of SMs via :py:obj:`~.CUexecAffinitySmCount`. This limit will
+      be internally rounded up to the next hardware-supported amount.
+      Hence, it is imperative to query the actual execution affinity of the
       context via :py:obj:`~.cuCtxGetExecAffinity` after context creation.
       Currently, this attribute is only supported under Volta+ MPS.
 
@@ -28136,7 +28188,7 @@ def cuCtxCreate(ctxCreateParams : Optional[CUctxCreateParams], unsigned int flag
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`, :py:obj:`~.CUDA_ERROR_UNKNOWN`
     pctx : :py:obj:`~.CUcontext`
         Returned context handle of the new context
@@ -28202,7 +28254,7 @@ def cuCtxDestroy(ctx):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
@@ -28247,7 +28299,7 @@ def cuCtxPushCurrent(ctx):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
@@ -28283,7 +28335,7 @@ def cuCtxPopCurrent():
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`
     pctx : :py:obj:`~.CUcontext`
         Returned popped context handle
@@ -28323,12 +28375,12 @@ def cuCtxSetCurrent(ctx):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`
 
     See Also
     --------
-    :py:obj:`~.cuCtxGetCurrent`, :py:obj:`~.cuCtxCreate`, :py:obj:`~.cuCtxDestroy`, :py:obj:`~.cudaSetDevice`
+    :py:obj:`~.cuCtxGetCurrent`, :py:obj:`~.cuCtxCreate`, :py:obj:`~.cuCtxDestroy`, :func:`~.cudaSetDevice`
     """
     cdef cydriver.CUcontext cyctx
     if ctx is None:
@@ -28355,14 +28407,14 @@ def cuCtxGetCurrent():
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`,
     pctx : :py:obj:`~.CUcontext`
         Returned context handle
 
     See Also
     --------
-    :py:obj:`~.cuCtxSetCurrent`, :py:obj:`~.cuCtxCreate`, :py:obj:`~.cuCtxDestroy`, :py:obj:`~.cudaGetDevice`
+    :py:obj:`~.cuCtxSetCurrent`, :py:obj:`~.cuCtxCreate`, :py:obj:`~.cuCtxDestroy`, :func:`~.cudaGetDevice`
     """
     cdef CUcontext pctx = CUcontext()
     with nogil:
@@ -28382,14 +28434,14 @@ def cuCtxGetDevice():
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`,
     device : :py:obj:`~.CUdevice`
         Returned device handle for the current context
 
     See Also
     --------
-    :py:obj:`~.cuCtxCreate`, :py:obj:`~.cuCtxDestroy`, :py:obj:`~.cuCtxGetApiVersion`, :py:obj:`~.cuCtxGetCacheConfig`, :py:obj:`~.cuCtxGetFlags`, :py:obj:`~.cuCtxGetLimit`, :py:obj:`~.cuCtxPopCurrent`, :py:obj:`~.cuCtxPushCurrent`, :py:obj:`~.cuCtxSetCacheConfig`, :py:obj:`~.cuCtxSetLimit`, :py:obj:`~.cuCtxSynchronize`, :py:obj:`~.cudaGetDevice`
+    :py:obj:`~.cuCtxCreate`, :py:obj:`~.cuCtxDestroy`, :py:obj:`~.cuCtxGetApiVersion`, :py:obj:`~.cuCtxGetCacheConfig`, :py:obj:`~.cuCtxGetFlags`, :py:obj:`~.cuCtxGetLimit`, :py:obj:`~.cuCtxPopCurrent`, :py:obj:`~.cuCtxPushCurrent`, :py:obj:`~.cuCtxSetCacheConfig`, :py:obj:`~.cuCtxSetLimit`, :py:obj:`~.cuCtxSynchronize`, :func:`~.cudaGetDevice`
     """
     cdef CUdevice device = CUdevice()
     with nogil:
@@ -28416,7 +28468,7 @@ def cuCtxGetDevice_v2(ctx):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     device : :py:obj:`~.CUdevice`
         Returned device handle for the specified context
@@ -28452,14 +28504,14 @@ def cuCtxGetFlags():
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`,
     flags : unsigned int
         Pointer to store flags of current context
 
     See Also
     --------
-    :py:obj:`~.cuCtxCreate`, :py:obj:`~.cuCtxGetApiVersion`, :py:obj:`~.cuCtxGetCacheConfig`, :py:obj:`~.cuCtxGetCurrent`, :py:obj:`~.cuCtxGetDevice`, :py:obj:`~.cuCtxGetLimit`, :py:obj:`~.cuCtxGetSharedMemConfig`, :py:obj:`~.cuCtxGetStreamPriorityRange`, :py:obj:`~.cuCtxSetFlags`, :py:obj:`~.cudaGetDeviceFlags`
+    :py:obj:`~.cuCtxCreate`, :py:obj:`~.cuCtxGetApiVersion`, :py:obj:`~.cuCtxGetCacheConfig`, :py:obj:`~.cuCtxGetCurrent`, :py:obj:`~.cuCtxGetDevice`, :py:obj:`~.cuCtxGetLimit`, :py:obj:`~.cuCtxGetSharedMemConfig`, :py:obj:`~.cuCtxGetStreamPriorityRange`, :py:obj:`~.cuCtxSetFlags`, :func:`~.cudaGetDeviceFlags`
     """
     cdef unsigned int flags = 0
     with nogil:
@@ -28485,12 +28537,12 @@ def cuCtxSetFlags(unsigned int flags):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`,
 
     See Also
     --------
-    :py:obj:`~.cuCtxCreate`, :py:obj:`~.cuCtxGetApiVersion`, :py:obj:`~.cuCtxGetCacheConfig`, :py:obj:`~.cuCtxGetCurrent`, :py:obj:`~.cuCtxGetDevice`, :py:obj:`~.cuCtxGetLimit`, :py:obj:`~.cuCtxGetSharedMemConfig`, :py:obj:`~.cuCtxGetStreamPriorityRange`, :py:obj:`~.cuCtxGetFlags`, :py:obj:`~.cudaGetDeviceFlags`, :py:obj:`~.cuDevicePrimaryCtxSetFlags`,
+    :py:obj:`~.cuCtxCreate`, :py:obj:`~.cuCtxGetApiVersion`, :py:obj:`~.cuCtxGetCacheConfig`, :py:obj:`~.cuCtxGetCurrent`, :py:obj:`~.cuCtxGetDevice`, :py:obj:`~.cuCtxGetLimit`, :py:obj:`~.cuCtxGetSharedMemConfig`, :py:obj:`~.cuCtxGetStreamPriorityRange`, :py:obj:`~.cuCtxGetFlags`, :func:`~.cudaGetDeviceFlags`, :py:obj:`~.cuDevicePrimaryCtxSetFlags`,
     """
     with nogil:
         err = cydriver.cuCtxSetFlags(flags)
@@ -28515,7 +28567,7 @@ def cuCtxGetId(ctx):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_CONTEXT_IS_DESTROYED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     ctxId : unsigned long long
         Pointer to store the Id of the context
@@ -28556,12 +28608,12 @@ def cuCtxSynchronize():
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED`
 
     See Also
     --------
-    :py:obj:`~.cuCtxCreate`, :py:obj:`~.cuCtxDestroy`, :py:obj:`~.cuCtxGetApiVersion`, :py:obj:`~.cuCtxGetCacheConfig`, :py:obj:`~.cuCtxGetDevice`, :py:obj:`~.cuCtxGetFlags`, :py:obj:`~.cuCtxGetLimit`, :py:obj:`~.cuCtxPopCurrent`, :py:obj:`~.cuCtxPushCurrent`, :py:obj:`~.cuCtxSetCacheConfig`, :py:obj:`~.cuCtxSetLimit`, :py:obj:`~.cudaDeviceSynchronize`
+    :py:obj:`~.cuCtxCreate`, :py:obj:`~.cuCtxDestroy`, :py:obj:`~.cuCtxGetApiVersion`, :py:obj:`~.cuCtxGetCacheConfig`, :py:obj:`~.cuCtxGetDevice`, :py:obj:`~.cuCtxGetFlags`, :py:obj:`~.cuCtxGetLimit`, :py:obj:`~.cuCtxPopCurrent`, :py:obj:`~.cuCtxPushCurrent`, :py:obj:`~.cuCtxSetCacheConfig`, :py:obj:`~.cuCtxSetLimit`, :func:`~.cudaDeviceSynchronize`
     """
     with nogil:
         err = cydriver.cuCtxSynchronize()
@@ -28593,12 +28645,12 @@ def cuCtxSynchronize_v2(ctx):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED`
 
     See Also
     --------
-    :py:obj:`~.cuCtxGetCurrent`, :py:obj:`~.cuCtxPopCurrent`, :py:obj:`~.cuCtxPushCurrent`, :py:obj:`~.cuGreenCtxCreate`, :py:obj:`~.cuCtxFromGreenCtx`, :py:obj:`~.cudaDeviceSynchronize`
+    :py:obj:`~.cuCtxGetCurrent`, :py:obj:`~.cuCtxPopCurrent`, :py:obj:`~.cuCtxPushCurrent`, :py:obj:`~.cuGreenCtxCreate`, :py:obj:`~.cuCtxFromGreenCtx`, :func:`~.cudaDeviceSynchronize`
     """
     cdef cydriver.CUcontext cyctx
     if ctx is None:
@@ -28637,33 +28689,31 @@ def cuCtxSetLimit(limit not None : CUlimit, size_t value):
       until all preceding requested tasks are complete.
 
     - :py:obj:`~.CU_LIMIT_PRINTF_FIFO_SIZE` controls the size in bytes of
-      the FIFO used by the :py:obj:`~.printf()` device system call. Setting
+      the FIFO used by the ``printf()`` device system call. Setting
       :py:obj:`~.CU_LIMIT_PRINTF_FIFO_SIZE` must be performed before
-      launching any kernel that uses the :py:obj:`~.printf()` device system
-      call, otherwise :py:obj:`~.CUDA_ERROR_INVALID_VALUE` will be
-      returned.
+      launching any kernel that uses the ``printf()`` device system call,
+      otherwise :py:obj:`~.CUDA_ERROR_INVALID_VALUE` will be returned.
 
     - :py:obj:`~.CU_LIMIT_MALLOC_HEAP_SIZE` controls the size in bytes of
-      the heap used by the :py:obj:`~.malloc()` and :py:obj:`~.free()`
-      device system calls. Setting :py:obj:`~.CU_LIMIT_MALLOC_HEAP_SIZE`
-      must be performed before launching any kernel that uses the
-      :py:obj:`~.malloc()` or :py:obj:`~.free()` device system calls,
-      otherwise :py:obj:`~.CUDA_ERROR_INVALID_VALUE` will be returned.
+      the heap used by the ``malloc()`` and ``free()`` device system calls.
+      Setting :py:obj:`~.CU_LIMIT_MALLOC_HEAP_SIZE` must be performed
+      before launching any kernel that uses the ``malloc()`` or ``free()``
+      device system calls, otherwise :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
+      will be returned.
 
     - :py:obj:`~.CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH` controls the maximum
       nesting depth of a grid at which a thread can safely call
-      :py:obj:`~.cudaDeviceSynchronize()`. Setting this limit must be
-      performed before any launch of a kernel that uses the device runtime
-      and calls :py:obj:`~.cudaDeviceSynchronize()` above the default sync
-      depth, two levels of grids. Calls to
-      :py:obj:`~.cudaDeviceSynchronize()` will fail with error code
-      :py:obj:`~.cudaErrorSyncDepthExceeded` if the limitation is violated.
-      This limit can be set smaller than the default or up the maximum
-      launch depth of 24. When setting this limit, keep in mind that
-      additional levels of sync depth require the driver to reserve large
-      amounts of device memory which can no longer be used for user
-      allocations. If these reservations of device memory fail,
-      :py:obj:`~.cuCtxSetLimit()` will return
+      :func:`~.cudaDeviceSynchronize`. Setting this limit must be performed
+      before any launch of a kernel that uses the device runtime and calls
+      :func:`~.cudaDeviceSynchronize` above the default sync depth, two
+      levels of grids. Calls to :func:`~.cudaDeviceSynchronize` will fail
+      with error code :py:obj:`~.cudaErrorSyncDepthExceeded` if the
+      limitation is violated. This limit can be set smaller than the
+      default or up to the maximum launch depth of 24. When setting this
+      limit, keep in mind that additional levels of sync depth require the
+      driver to reserve large amounts of device memory which can no longer
+      be used for user allocations. If these reservations of device memory
+      fail, :py:obj:`~.cuCtxSetLimit()` will return
       :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`, and the limit can be reset to a
       lower value. This limit is only applicable to devices of compute
       capability < 9.0. Attempting to set this limit on devices of other
@@ -28676,13 +28726,13 @@ def cuCtxSetLimit(limit not None : CUlimit, size_t value):
       of launch up until the grid is known to have been completed. Device
       runtime launches which violate this limitation fail and return
       :py:obj:`~.cudaErrorLaunchPendingCountExceeded` when
-      :py:obj:`~.cudaGetLastError()` is called after launch. If more
-      pending launches than the default (2048 launches) are needed for a
-      module using the device runtime, this limit can be increased. Keep in
-      mind that being able to sustain additional pending launches will
-      require the driver to reserve larger amounts of device memory upfront
-      which can no longer be used for allocations. If these reservations
-      fail, :py:obj:`~.cuCtxSetLimit()` will return
+      :func:`~.cudaGetLastError` is called after launch. If more pending
+      launches than the default (2048 launches) are needed for a module
+      using the device runtime, this limit can be increased. Keep in mind
+      that being able to sustain additional pending launches will require
+      the driver to reserve larger amounts of device memory upfront which
+      can no longer be used for allocations. If these reservations fail,
+      :py:obj:`~.cuCtxSetLimit()` will return
       :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`, and the limit can be reset to a
       lower value. This limit is only applicable to devices of compute
       capability 3.5 and higher. Attempting to set this limit on devices of
@@ -28707,12 +28757,12 @@ def cuCtxSetLimit(limit not None : CUlimit, size_t value):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_UNSUPPORTED_LIMIT`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`
 
     See Also
     --------
-    :py:obj:`~.cuCtxCreate`, :py:obj:`~.cuCtxDestroy`, :py:obj:`~.cuCtxGetApiVersion`, :py:obj:`~.cuCtxGetCacheConfig`, :py:obj:`~.cuCtxGetDevice`, :py:obj:`~.cuCtxGetFlags`, :py:obj:`~.cuCtxGetLimit`, :py:obj:`~.cuCtxPopCurrent`, :py:obj:`~.cuCtxPushCurrent`, :py:obj:`~.cuCtxSetCacheConfig`, :py:obj:`~.cuCtxSynchronize`, :py:obj:`~.cudaDeviceSetLimit`
+    :py:obj:`~.cuCtxCreate`, :py:obj:`~.cuCtxDestroy`, :py:obj:`~.cuCtxGetApiVersion`, :py:obj:`~.cuCtxGetCacheConfig`, :py:obj:`~.cuCtxGetDevice`, :py:obj:`~.cuCtxGetFlags`, :py:obj:`~.cuCtxGetLimit`, :py:obj:`~.cuCtxPopCurrent`, :py:obj:`~.cuCtxPushCurrent`, :py:obj:`~.cuCtxSetCacheConfig`, :py:obj:`~.cuCtxSynchronize`, :func:`~.cudaDeviceSetLimit`
     """
     cdef cydriver.CUlimit cylimit = int(limit)
     with nogil:
@@ -28733,15 +28783,14 @@ def cuCtxGetLimit(limit not None : CUlimit):
       thread.
 
     - :py:obj:`~.CU_LIMIT_PRINTF_FIFO_SIZE`: size in bytes of the FIFO used
-      by the :py:obj:`~.printf()` device system call.
+      by the ``printf()`` device system call.
 
     - :py:obj:`~.CU_LIMIT_MALLOC_HEAP_SIZE`: size in bytes of the heap used
-      by the :py:obj:`~.malloc()` and :py:obj:`~.free()` device system
-      calls.
+      by the ``malloc()`` and ``free()`` device system calls.
 
     - :py:obj:`~.CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH`: maximum grid depth at
       which a thread can issue the device runtime call
-      :py:obj:`~.cudaDeviceSynchronize()` to wait on child grid launches to
+      :func:`~.cudaDeviceSynchronize` to wait on child grid launches to
       complete.
 
     - :py:obj:`~.CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT`: maximum number
@@ -28761,7 +28810,7 @@ def cuCtxGetLimit(limit not None : CUlimit):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
 
     pvalue : int
         None
@@ -28807,14 +28856,14 @@ def cuCtxGetCacheConfig():
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     pconfig : :py:obj:`~.CUfunc_cache`
         Returned cache configuration
 
     See Also
     --------
-    :py:obj:`~.cuCtxCreate`, :py:obj:`~.cuCtxDestroy`, :py:obj:`~.cuCtxGetApiVersion`, :py:obj:`~.cuCtxGetDevice`, :py:obj:`~.cuCtxGetFlags`, :py:obj:`~.cuCtxGetLimit`, :py:obj:`~.cuCtxPopCurrent`, :py:obj:`~.cuCtxPushCurrent`, :py:obj:`~.cuCtxSetCacheConfig`, :py:obj:`~.cuCtxSetLimit`, :py:obj:`~.cuCtxSynchronize`, :py:obj:`~.cuFuncSetCacheConfig`, :py:obj:`~.cudaDeviceGetCacheConfig`
+    :py:obj:`~.cuCtxCreate`, :py:obj:`~.cuCtxDestroy`, :py:obj:`~.cuCtxGetApiVersion`, :py:obj:`~.cuCtxGetDevice`, :py:obj:`~.cuCtxGetFlags`, :py:obj:`~.cuCtxGetLimit`, :py:obj:`~.cuCtxPopCurrent`, :py:obj:`~.cuCtxPushCurrent`, :py:obj:`~.cuCtxSetCacheConfig`, :py:obj:`~.cuCtxSetLimit`, :py:obj:`~.cuCtxSynchronize`, :py:obj:`~.cuFuncSetCacheConfig`, :func:`~.cudaDeviceGetCacheConfig`
     """
     cdef cydriver.CUfunc_cache pconfig
     with nogil:
@@ -28870,12 +28919,12 @@ def cuCtxSetCacheConfig(config not None : CUfunc_cache):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
     --------
-    :py:obj:`~.cuCtxCreate`, :py:obj:`~.cuCtxDestroy`, :py:obj:`~.cuCtxGetApiVersion`, :py:obj:`~.cuCtxGetCacheConfig`, :py:obj:`~.cuCtxGetDevice`, :py:obj:`~.cuCtxGetFlags`, :py:obj:`~.cuCtxGetLimit`, :py:obj:`~.cuCtxPopCurrent`, :py:obj:`~.cuCtxPushCurrent`, :py:obj:`~.cuCtxSetLimit`, :py:obj:`~.cuCtxSynchronize`, :py:obj:`~.cuFuncSetCacheConfig`, :py:obj:`~.cudaDeviceSetCacheConfig`, :py:obj:`~.cuKernelSetCacheConfig`
+    :py:obj:`~.cuCtxCreate`, :py:obj:`~.cuCtxDestroy`, :py:obj:`~.cuCtxGetApiVersion`, :py:obj:`~.cuCtxGetCacheConfig`, :py:obj:`~.cuCtxGetDevice`, :py:obj:`~.cuCtxGetFlags`, :py:obj:`~.cuCtxGetLimit`, :py:obj:`~.cuCtxPopCurrent`, :py:obj:`~.cuCtxPushCurrent`, :py:obj:`~.cuCtxSetLimit`, :py:obj:`~.cuCtxSynchronize`, :py:obj:`~.cuFuncSetCacheConfig`, :func:`~.cudaDeviceSetCacheConfig`, :py:obj:`~.cuKernelSetCacheConfig`
     """
     cdef cydriver.CUfunc_cache cyconfig = int(config)
     with nogil:
@@ -28907,7 +28956,7 @@ def cuCtxGetApiVersion(ctx):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_UNKNOWN`
     version : unsigned int
         Pointer to version
@@ -28957,7 +29006,7 @@ def cuCtxGetStreamPriorityRange():
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`,
     leastPriority : int
         Pointer to an int in which the numerical value for least stream
@@ -28968,7 +29017,7 @@ def cuCtxGetStreamPriorityRange():
 
     See Also
     --------
-    :py:obj:`~.cuStreamCreateWithPriority`, :py:obj:`~.cuStreamGetPriority`, :py:obj:`~.cuCtxGetDevice`, :py:obj:`~.cuCtxGetFlags`, :py:obj:`~.cuCtxSetLimit`, :py:obj:`~.cuCtxSynchronize`, :py:obj:`~.cudaDeviceGetStreamPriorityRange`
+    :py:obj:`~.cuStreamCreateWithPriority`, :py:obj:`~.cuStreamGetPriority`, :py:obj:`~.cuCtxGetDevice`, :py:obj:`~.cuCtxGetFlags`, :py:obj:`~.cuCtxSetLimit`, :py:obj:`~.cuCtxSynchronize`, :func:`~.cudaDeviceGetStreamPriorityRange`
     """
     cdef int leastPriority = 0
     cdef int greatestPriority = 0
@@ -28990,7 +29039,7 @@ def cuCtxResetPersistingL2Cache():
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
 
     See Also
@@ -29021,7 +29070,7 @@ def cuCtxGetExecAffinity(typename not None : CUexecAffinityType):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY`
     pExecAffinity : :py:obj:`~.CUexecAffinityParam`
         Returned execution affinity
@@ -29067,7 +29116,7 @@ def cuCtxRecordEvent(hCtx, hEvent):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED`
 
     See Also
@@ -29124,7 +29173,7 @@ def cuCtxWaitEvent(hCtx, hEvent):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED`
 
     See Also
@@ -29183,7 +29232,7 @@ def cuCtxAttach(unsigned int flags):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     pctx : :py:obj:`~.CUcontext`
         Returned context handle of the current context
@@ -29222,7 +29271,7 @@ def cuCtxDetach(ctx):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`
 
     See Also
@@ -29268,14 +29317,14 @@ def cuCtxGetSharedMemConfig():
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     pConfig : :py:obj:`~.CUsharedconfig`
         returned shared memory configuration
 
     See Also
     --------
-    :py:obj:`~.cuCtxCreate`, :py:obj:`~.cuCtxDestroy`, :py:obj:`~.cuCtxGetApiVersion`, :py:obj:`~.cuCtxGetCacheConfig`, :py:obj:`~.cuCtxGetDevice`, :py:obj:`~.cuCtxGetFlags`, :py:obj:`~.cuCtxGetLimit`, :py:obj:`~.cuCtxPopCurrent`, :py:obj:`~.cuCtxPushCurrent`, :py:obj:`~.cuCtxSetLimit`, :py:obj:`~.cuCtxSynchronize`, :py:obj:`~.cuCtxGetSharedMemConfig`, :py:obj:`~.cuFuncSetCacheConfig`, :py:obj:`~.cudaDeviceGetSharedMemConfig`
+    :py:obj:`~.cuCtxCreate`, :py:obj:`~.cuCtxDestroy`, :py:obj:`~.cuCtxGetApiVersion`, :py:obj:`~.cuCtxGetCacheConfig`, :py:obj:`~.cuCtxGetDevice`, :py:obj:`~.cuCtxGetFlags`, :py:obj:`~.cuCtxGetLimit`, :py:obj:`~.cuCtxPopCurrent`, :py:obj:`~.cuCtxPushCurrent`, :py:obj:`~.cuCtxSetLimit`, :py:obj:`~.cuCtxSynchronize`, :py:obj:`~.cuCtxGetSharedMemConfig`, :py:obj:`~.cuFuncSetCacheConfig`, :func:`~.cudaDeviceGetSharedMemConfig`
     """
     cdef cydriver.CUsharedconfig pConfig
     with nogil:
@@ -29327,12 +29376,12 @@ def cuCtxSetSharedMemConfig(config not None : CUsharedconfig):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
     --------
-    :py:obj:`~.cuCtxCreate`, :py:obj:`~.cuCtxDestroy`, :py:obj:`~.cuCtxGetApiVersion`, :py:obj:`~.cuCtxGetCacheConfig`, :py:obj:`~.cuCtxGetDevice`, :py:obj:`~.cuCtxGetFlags`, :py:obj:`~.cuCtxGetLimit`, :py:obj:`~.cuCtxPopCurrent`, :py:obj:`~.cuCtxPushCurrent`, :py:obj:`~.cuCtxSetLimit`, :py:obj:`~.cuCtxSynchronize`, :py:obj:`~.cuCtxGetSharedMemConfig`, :py:obj:`~.cuFuncSetCacheConfig`, :py:obj:`~.cudaDeviceSetSharedMemConfig`
+    :py:obj:`~.cuCtxCreate`, :py:obj:`~.cuCtxDestroy`, :py:obj:`~.cuCtxGetApiVersion`, :py:obj:`~.cuCtxGetCacheConfig`, :py:obj:`~.cuCtxGetDevice`, :py:obj:`~.cuCtxGetFlags`, :py:obj:`~.cuCtxGetLimit`, :py:obj:`~.cuCtxPopCurrent`, :py:obj:`~.cuCtxPushCurrent`, :py:obj:`~.cuCtxSetLimit`, :py:obj:`~.cuCtxSynchronize`, :py:obj:`~.cuCtxGetSharedMemConfig`, :py:obj:`~.cuFuncSetCacheConfig`, :func:`~.cudaDeviceSetSharedMemConfig`
     """
     cdef cydriver.CUsharedconfig cyconfig = int(config)
     with nogil:
@@ -29362,7 +29411,7 @@ def cuModuleLoad(char* fname):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_PTX`, :py:obj:`~.CUDA_ERROR_UNSUPPORTED_PTX_VERSION`, :py:obj:`~.CUDA_ERROR_NOT_FOUND`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`, :py:obj:`~.CUDA_ERROR_FILE_NOT_FOUND`, :py:obj:`~.CUDA_ERROR_NO_BINARY_FOR_GPU`, :py:obj:`~.CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND`, :py:obj:`~.CUDA_ERROR_SHARED_OBJECT_INIT_FAILED`, :py:obj:`~.CUDA_ERROR_JIT_COMPILER_NOT_FOUND`
     module : :py:obj:`~.CUmodule`
         Returned module
@@ -29397,7 +29446,7 @@ def cuModuleLoadData(image):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_PTX`, :py:obj:`~.CUDA_ERROR_UNSUPPORTED_PTX_VERSION`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`, :py:obj:`~.CUDA_ERROR_NO_BINARY_FOR_GPU`, :py:obj:`~.CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND`, :py:obj:`~.CUDA_ERROR_SHARED_OBJECT_INIT_FAILED`, :py:obj:`~.CUDA_ERROR_JIT_COMPILER_NOT_FOUND`
     module : :py:obj:`~.CUmodule`
         Returned module
@@ -29441,7 +29490,7 @@ def cuModuleLoadDataEx(image, unsigned int numOptions, options : Optional[tuple[
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_PTX`, :py:obj:`~.CUDA_ERROR_UNSUPPORTED_PTX_VERSION`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`, :py:obj:`~.CUDA_ERROR_NO_BINARY_FOR_GPU`, :py:obj:`~.CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND`, :py:obj:`~.CUDA_ERROR_SHARED_OBJECT_INIT_FAILED`, :py:obj:`~.CUDA_ERROR_JIT_COMPILER_NOT_FOUND`
     module : :py:obj:`~.CUmodule`
         Returned module
@@ -29495,7 +29544,7 @@ def cuModuleLoadFatBinary(fatCubin):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_PTX`, :py:obj:`~.CUDA_ERROR_UNSUPPORTED_PTX_VERSION`, :py:obj:`~.CUDA_ERROR_NOT_FOUND`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`, :py:obj:`~.CUDA_ERROR_NO_BINARY_FOR_GPU`, :py:obj:`~.CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND`, :py:obj:`~.CUDA_ERROR_SHARED_OBJECT_INIT_FAILED`, :py:obj:`~.CUDA_ERROR_JIT_COMPILER_NOT_FOUND`
     module : :py:obj:`~.CUmodule`
         Returned module
@@ -29533,7 +29582,7 @@ def cuModuleUnload(hmod):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`
 
     See Also
@@ -29564,7 +29613,7 @@ def cuModuleGetLoadingMode():
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`,
     mode : :py:obj:`~.CUmoduleLoadingMode`
         Returns the lazy loading mode
@@ -29601,7 +29650,7 @@ def cuModuleGetFunction(hmod, char* name):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_FOUND`
     hfunc : :py:obj:`~.CUfunction`
         Returned function handle
@@ -29641,7 +29690,7 @@ def cuModuleGetFunctionCount(mod):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     count : unsigned int
         Number of functions found within the module
@@ -29671,12 +29720,11 @@ def cuModuleEnumerateFunctions(unsigned int numFunctions, mod):
     Returns in ``functions`` a maximum number of ``numFunctions`` function
     handles within ``mod``. When function loading mode is set to LAZY the
     function retrieved may be partially loaded. The loading state of a
-    function can be queried using :py:obj:`~.cuFunctionIsLoaded`. CUDA APIs
-    may load the function automatically when called with partially loaded
+    function can be queried using ``cuFunctionIsLoaded``. CUDA APIs may
+    load the function automatically when called with partially loaded
     function handle which may incur additional latency. Alternatively,
-    :py:obj:`~.cuFunctionLoad` can be used to explicitly load a function.
-    The returned function handles become invalid when the module is
-    unloaded.
+    ``cuFunctionLoad`` can be used to explicitly load a function. The
+    returned function handles become invalid when the module is unloaded.
 
     Parameters
     ----------
@@ -29687,7 +29735,7 @@ def cuModuleEnumerateFunctions(unsigned int numFunctions, mod):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     functions : list[:py:obj:`~.CUfunction`]
         Buffer where the function handles are returned to
@@ -29742,7 +29790,7 @@ def cuModuleGetGlobal(hmod, char* name):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_FOUND`
     dptr : :py:obj:`~.CUdeviceptr`
         Returned global device pointer
@@ -29751,7 +29799,7 @@ def cuModuleGetGlobal(hmod, char* name):
 
     See Also
     --------
-    :py:obj:`~.cuModuleGetFunction`, :py:obj:`~.cuModuleGetTexRef`, :py:obj:`~.cuModuleLoad`, :py:obj:`~.cuModuleLoadData`, :py:obj:`~.cuModuleLoadDataEx`, :py:obj:`~.cuModuleLoadFatBinary`, :py:obj:`~.cuModuleUnload`, :py:obj:`~.cudaGetSymbolAddress`, :py:obj:`~.cudaGetSymbolSize`
+    :py:obj:`~.cuModuleGetFunction`, :py:obj:`~.cuModuleGetTexRef`, :py:obj:`~.cuModuleLoad`, :py:obj:`~.cuModuleLoadData`, :py:obj:`~.cuModuleLoadDataEx`, :py:obj:`~.cuModuleLoadFatBinary`, :py:obj:`~.cuModuleUnload`, :func:`~.cudaGetSymbolAddress`, :func:`~.cudaGetSymbolSize`
     """
     cdef cydriver.CUmodule cyhmod
     if hmod is None:
@@ -29776,17 +29824,17 @@ def cuModuleGetGlobal(hmod, char* name):
 def cuLinkCreate(unsigned int numOptions, options : Optional[tuple[CUjit_option] | list[CUjit_option]], optionValues : Optional[tuple[Any] | list[Any]]):
     """ Creates a pending JIT linker invocation.
 
-    If the call is successful, the caller owns the returned CUlinkState,
-    which should eventually be destroyed with :py:obj:`~.cuLinkDestroy`.
-    The device code machine size (32 or 64 bit) will match the calling
-    application.
+    If the call is successful, the caller owns the returned
+    :py:obj:`~.CUlinkState`, which should eventually be destroyed with
+    :py:obj:`~.cuLinkDestroy`. The device code machine size (32 or 64 bit)
+    will match the calling application.
 
     Both linker and compiler options may be specified. Compiler options
     will be applied to inputs to this linker action which must be compiled
     from PTX. The options :py:obj:`~.CU_JIT_WALL_TIME`,
     :py:obj:`~.CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES`, and
     :py:obj:`~.CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES` will accumulate data
-    until the CUlinkState is destroyed.
+    until the :py:obj:`~.CUlinkState` is destroyed.
 
     The data passed in via :py:obj:`~.cuLinkAddData` and
     :py:obj:`~.cuLinkAddFile` will be treated as relocatable (-rdc=true to
@@ -29794,9 +29842,9 @@ def cuLinkCreate(unsigned int numOptions, options : Optional[tuple[CUjit_option]
     and will have similar consequences as offline relocatable device code
     linking.
 
-    ``optionValues`` must remain valid for the life of the CUlinkState if
-    output options are used. No other references to inputs are maintained
-    after this call returns.
+    ``optionValues`` must remain valid for the life of the
+    :py:obj:`~.CUlinkState` if output options are used. No other references
+    to inputs are maintained after this call returns.
 
     Parameters
     ----------
@@ -29809,11 +29857,11 @@ def cuLinkCreate(unsigned int numOptions, options : Optional[tuple[CUjit_option]
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`, :py:obj:`~.CUDA_ERROR_JIT_COMPILER_NOT_FOUND`
     stateOut : :py:obj:`~.CUlinkState`
-        On success, this will contain a CUlinkState to specify and complete
-        this action
+        On success, this will contain a :py:obj:`~.CUlinkState` to specify
+        and complete this action
 
     See Also
     --------
@@ -29881,7 +29929,7 @@ def cuLinkAddData(state, typename not None : CUjitInputType, data, size_t size,
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_IMAGE`, :py:obj:`~.CUDA_ERROR_INVALID_PTX`, :py:obj:`~.CUDA_ERROR_UNSUPPORTED_PTX_VERSION`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`, :py:obj:`~.CUDA_ERROR_NO_BINARY_FOR_GPU`
 
     See Also
@@ -29954,7 +30002,7 @@ def cuLinkAddFile(state, typename not None : CUjitInputType, char* path, unsigne
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_FILE_NOT_FOUND` :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_IMAGE`, :py:obj:`~.CUDA_ERROR_INVALID_PTX`, :py:obj:`~.CUDA_ERROR_UNSUPPORTED_PTX_VERSION`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`, :py:obj:`~.CUDA_ERROR_NO_BINARY_FOR_GPU`
 
     See Also
@@ -30008,7 +30056,7 @@ def cuLinkComplete(state):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`
     cubinOut : Any
         On success, this will point to the output image
@@ -30049,7 +30097,7 @@ def cuLinkDestroy(state):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`
 
     See Also
@@ -30093,7 +30141,7 @@ def cuModuleGetTexRef(hmod, char* name):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_FOUND`
     pTexRef : :py:obj:`~.CUtexref`
         Returned texture reference
@@ -30140,7 +30188,7 @@ def cuModuleGetSurfRef(hmod, char* name):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_FOUND`
     pSurfRef : :py:obj:`~.CUsurfref`
         Returned surface reference
@@ -30221,7 +30269,7 @@ def cuLibraryLoadData(code, jitOptions : Optional[tuple[CUjit_option] | list[CUj
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_PTX`, :py:obj:`~.CUDA_ERROR_UNSUPPORTED_PTX_VERSION`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`, :py:obj:`~.CUDA_ERROR_NO_BINARY_FOR_GPU`, :py:obj:`~.CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND`, :py:obj:`~.CUDA_ERROR_SHARED_OBJECT_INIT_FAILED`, :py:obj:`~.CUDA_ERROR_JIT_COMPILER_NOT_FOUND`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
     library : :py:obj:`~.CUlibrary`
         Returned library
@@ -30321,7 +30369,7 @@ def cuLibraryLoadFromFile(char* fileName, jitOptions : Optional[tuple[CUjit_opti
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_PTX`, :py:obj:`~.CUDA_ERROR_UNSUPPORTED_PTX_VERSION`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`, :py:obj:`~.CUDA_ERROR_NO_BINARY_FOR_GPU`, :py:obj:`~.CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND`, :py:obj:`~.CUDA_ERROR_SHARED_OBJECT_INIT_FAILED`, :py:obj:`~.CUDA_ERROR_JIT_COMPILER_NOT_FOUND`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
     library : :py:obj:`~.CUlibrary`
         Returned library
@@ -30377,7 +30425,7 @@ def cuLibraryUnload(library):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
@@ -30416,7 +30464,7 @@ def cuLibraryGetKernel(library, char* name):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_NOT_FOUND`
     pKernel : :py:obj:`~.CUkernel`
         Returned kernel handle
@@ -30456,7 +30504,7 @@ def cuLibraryGetKernelCount(lib):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     count : unsigned int
         Number of kernels found within the library
@@ -30496,7 +30544,7 @@ def cuLibraryEnumerateKernels(unsigned int numKernels, lib):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     kernels : list[:py:obj:`~.CUkernel`]
         Buffer where the kernel handles are returned to
@@ -30547,7 +30595,7 @@ def cuLibraryGetModule(library):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_NOT_FOUND`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_CONTEXT_IS_DESTROYED`
     pMod : :py:obj:`~.CUmodule`
         Returned module handle
@@ -30589,7 +30637,7 @@ def cuKernelGetFunction(kernel):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_NOT_FOUND`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_CONTEXT_IS_DESTROYED`
     pFunc : :py:obj:`~.CUfunction`
         Returned function handle
@@ -30630,7 +30678,7 @@ def cuKernelGetLibrary(kernel):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_NOT_FOUND`
     pLib : :py:obj:`~.CUlibrary`
         Returned library handle
@@ -30677,7 +30725,7 @@ def cuLibraryGetGlobal(library, char* name):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_NOT_FOUND`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_CONTEXT_IS_DESTROYED`
     dptr : :py:obj:`~.CUdeviceptr`
         Returned global device pointer for the requested context
@@ -30729,7 +30777,7 @@ def cuLibraryGetManaged(library, char* name):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_NOT_FOUND`
     dptr : :py:obj:`~.CUdeviceptr`
         Returned pointer to the managed memory
@@ -30779,7 +30827,7 @@ def cuLibraryGetUnifiedFunction(library, char* symbol):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_NOT_FOUND`
     fptr : Any
         Returned pointer to a unified function
@@ -30848,8 +30896,8 @@ def cuKernelGetAttribute(attrib not None : CUfunction_attribute, kernel, dev):
       value of 10 for legacy cubins that do not have a properly-encoded
       binary architecture version.
 
-    - :py:obj:`~.CU_FUNC_CACHE_MODE_CA`: The attribute to indicate whether
-      the kernel has been compiled with user specified option "-Xptxas
+    - ``CU_FUNC_CACHE_MODE_CA``: The attribute to indicate whether the
+      kernel has been compiled with user specified option "-Xptxas
       --dlcm=ca" set.
 
     - :py:obj:`~.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES`: The
@@ -30877,18 +30925,18 @@ def cuKernelGetAttribute(attrib not None : CUfunction_attribute, kernel, dev):
       cluster size. 1 is allowed, 0 is disallowed. A non-portable cluster
       size may only function on the specific SKUs the program is tested on.
       The launch might fail if the program is run on a different hardware
-      platform. CUDA API provides cudaOccupancyMaxActiveClusters to assist
-      with checking whether the desired size can be launched on the current
-      device. A portable cluster size is guaranteed to be functional on all
-      compute capabilities higher than the target compute capability. The
-      portable cluster size for sm_90 is 8 blocks per cluster. This value
-      may increase for future compute capabilities. The specific hardware
-      unit may support higher cluster sizes that’s not guaranteed to be
-      portable.
+      platform. CUDA API provides :func:`~.cudaOccupancyMaxActiveClusters`
+      to assist with checking whether the desired size can be launched on
+      the current device. A portable cluster size is guaranteed to be
+      functional on all compute capabilities higher than the target compute
+      capability. The portable cluster size for sm_90 is 8 blocks per
+      cluster. This value may increase for future compute capabilities. The
+      specific hardware unit may support higher cluster sizes that are not
+      guaranteed to be portable.
 
     - :py:obj:`~.CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE`:
       The block scheduling policy of a function. The value type is
-      CUclusterSchedulingPolicy.
+      :py:obj:`~.CUclusterSchedulingPolicy`.
 
     Parameters
     ----------
@@ -30901,7 +30949,7 @@ def cuKernelGetAttribute(attrib not None : CUfunction_attribute, kernel, dev):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`
     pi : int
         Returned attribute value
@@ -30947,11 +30995,11 @@ def cuKernelSetAttribute(attrib not None : CUfunction_attribute, int val, kernel
 
     This call sets the value of a specified attribute ``attrib`` on the
     kernel ``kernel`` for the requested device ``dev`` to an integer value
-    specified by ``val``. This function returns CUDA_SUCCESS if the new
-    value of the attribute could be successfully set. If the set fails,
-    this call will return an error. Not all attributes can have values set.
-    Attempting to set a value on a read-only attribute will result in an
-    error (CUDA_ERROR_INVALID_VALUE)
+    specified by ``val``. This function returns :py:obj:`~.CUDA_SUCCESS` if
+    the new value of the attribute could be successfully set. If the set
+    fails, this call will return an error. Not all attributes can have
+    values set. Attempting to set a value on a read-only attribute will
+    result in an error (:py:obj:`~.CUDA_ERROR_INVALID_VALUE`).
 
     Note that attributes set using :py:obj:`~.cuFuncSetAttribute()` will
     override the attribute set by this API irrespective of whether the call
@@ -30984,21 +31032,21 @@ def cuKernelSetAttribute(attrib not None : CUfunction_attribute, int val, kernel
       either all be 0 or all be positive. The validity of the cluster
       dimensions is checked at launch time. If the value is set during
       compile time, it cannot be set at runtime. Setting it at runtime will
-      return CUDA_ERROR_NOT_PERMITTED.
+      return :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`.
 
     - :py:obj:`~.CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT`: The required
       cluster height in blocks. The width, height, and depth values must
       either all be 0 or all be positive. The validity of the cluster
       dimensions is checked at launch time. If the value is set during
       compile time, it cannot be set at runtime. Setting it at runtime will
-      return CUDA_ERROR_NOT_PERMITTED.
+      return :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`.
 
     - :py:obj:`~.CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH`: The required
       cluster depth in blocks. The width, height, and depth values must
       either all be 0 or all be positive. The validity of the cluster
       dimensions is checked at launch time. If the value is set during
       compile time, it cannot be set at runtime. Setting it at runtime will
-      return CUDA_ERROR_NOT_PERMITTED.
+      return :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`.
 
     - :py:obj:`~.CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED`:
       Indicates whether the function can be launched with non-portable
@@ -31006,7 +31054,7 @@ def cuKernelSetAttribute(attrib not None : CUfunction_attribute, int val, kernel
 
     - :py:obj:`~.CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE`:
       The block scheduling policy of a function. The value type is
-      CUclusterSchedulingPolicy.
+      :py:obj:`~.CUclusterSchedulingPolicy`.
 
     Parameters
     ----------
@@ -31021,7 +31069,7 @@ def cuKernelSetAttribute(attrib not None : CUfunction_attribute, int val, kernel
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`
 
     See Also
@@ -31105,7 +31153,7 @@ def cuKernelSetCacheConfig(kernel, config not None : CUfunc_cache, dev):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`
 
     See Also
@@ -31159,7 +31207,7 @@ def cuKernelGetName(hfunc):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     name : bytes
         The returned name of the function
@@ -31191,10 +31239,10 @@ def cuKernelGetParamInfo(kernel, size_t paramIndex):
     offset and size, respectively, where the parameter will reside in the
     device-side parameter layout. This information can be used to update
     kernel node parameters from the device via
-    :py:obj:`~.cudaGraphKernelNodeSetParam()` and
-    :py:obj:`~.cudaGraphKernelNodeUpdatesApply()`. ``paramIndex`` must be
-    less than the number of parameters that ``kernel`` takes. ``paramSize``
-    can be set to NULL if only the parameter offset is desired.
+    ``cudaGraphKernelNodeSetParam()`` and
+    ``cudaGraphKernelNodeUpdatesApply()``. ``paramIndex`` must be less than
+    the number of parameters that ``kernel`` takes. ``paramSize`` can be
+    set to NULL if only the parameter offset is desired.
 
     Parameters
     ----------
@@ -31205,7 +31253,7 @@ def cuKernelGetParamInfo(kernel, size_t paramIndex):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`,
     paramOffset : int
         Returns the offset into the device-side parameter layout at which
@@ -31251,7 +31299,7 @@ def cuKernelGetParamCount(kernel):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     paramCount : int
         Returns the number of parameters used by the function
@@ -31301,7 +31349,7 @@ def cuMemGetInfo():
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     free : int
         Returned free memory in bytes
@@ -31310,7 +31358,7 @@ def cuMemGetInfo():
 
     See Also
     --------
-    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cudaMemGetInfo`
+    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :func:`~.cudaMemGetInfo`
     """
     cdef size_t free = 0
     cdef size_t total = 0
@@ -31340,14 +31388,14 @@ def cuMemAlloc(size_t bytesize):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY` :py:obj:`~.CUDA_ERROR_EXTERNAL_DEVICE`
     dptr : :py:obj:`~.CUdeviceptr`
         Returned device pointer
 
     See Also
     --------
-    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cudaMalloc`
+    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :func:`~.cudaMalloc`
     """
     cdef CUdeviceptr dptr = CUdeviceptr()
     with nogil:
@@ -31406,7 +31454,7 @@ def cuMemAllocPitch(size_t WidthInBytes, size_t Height, unsigned int ElementSize
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`
     dptr : :py:obj:`~.CUdeviceptr`
         Returned device pointer
@@ -31415,7 +31463,7 @@ def cuMemAllocPitch(size_t WidthInBytes, size_t Height, unsigned int ElementSize
 
     See Also
     --------
-    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cudaMallocPitch`
+    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :func:`~.cudaMallocPitch`
     """
     cdef CUdeviceptr dptr = CUdeviceptr()
     cdef size_t pPitch = 0
@@ -31454,12 +31502,12 @@ def cuMemFree(dptr):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
     --------
-    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemAllocManaged`, :py:obj:`~.cuMemAllocAsync`, :py:obj:`~.cuMemAllocFromPoolAsync`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemFreeAsync`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cudaFree`
+    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemAllocManaged`, :py:obj:`~.cuMemAllocAsync`, :py:obj:`~.cuMemAllocFromPoolAsync`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemFreeAsync`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :func:`~.cudaFree`
     """
     cdef cydriver.CUdeviceptr cydptr
     if dptr is None:
@@ -31492,7 +31540,7 @@ def cuMemGetAddressRange(dptr):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_NOT_FOUND`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     pbase : :py:obj:`~.CUdeviceptr`
         Returned base address
@@ -31532,7 +31580,7 @@ def cuMemAllocHost(size_t bytesize):
     functions such as :py:obj:`~.cuMemcpy()`. Since the memory can be
     accessed directly by the device, it can be read or written with much
     higher bandwidth than pageable memory obtained with functions such as
-    :py:obj:`~.malloc()`.
+    ``malloc()``.
 
     On systems where
     :py:obj:`~.CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES`
@@ -31550,8 +31598,9 @@ def cuMemAllocHost(size_t bytesize):
     which support unified addressing (as may be queried using
     :py:obj:`~.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING`). The device pointer
     that may be used to access this host memory from those contexts is
-    always equal to the returned host pointer ``*pp``. See Unified
-    Addressing for additional details.
+    always equal to the returned host pointer ``*pp``. See :ref:`Unified
+    Addressing <cuda-bindings-driver-group__cuda__unified>` for additional
+    details.
 
     Parameters
     ----------
@@ -31560,14 +31609,14 @@ def cuMemAllocHost(size_t bytesize):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY` :py:obj:`~.CUDA_ERROR_EXTERNAL_DEVICE`
     pp : Any
         Returned pointer to host memory
 
     See Also
     --------
-    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cudaMallocHost`
+    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :func:`~.cudaMallocHost`
     """
     cdef void_ptr pp = 0
     with nogil:
@@ -31593,12 +31642,12 @@ def cuMemFreeHost(p):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
     --------
-    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cudaFreeHost`
+    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :func:`~.cudaFreeHost`
     """
     cdef _HelperInputVoidPtrStruct cypHelper
     cdef void* cyp = _helper_input_void_ptr(p, &cypHelper)
@@ -31620,7 +31669,7 @@ def cuMemHostAlloc(size_t bytesize, unsigned int Flags):
     functions such as :py:obj:`~.cuMemcpyHtoD()`. Since the memory can be
     accessed directly by the device, it can be read or written with much
     higher bandwidth than pageable memory obtained with functions such as
-    :py:obj:`~.malloc()`.
+    ``malloc()``.
 
     On systems where
     :py:obj:`~.CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES`
@@ -31673,7 +31722,8 @@ def cuMemHostAlloc(size_t bytesize, unsigned int Flags):
     :py:obj:`~.CU_MEMHOSTALLOC_WRITECOMBINED` is specified, then the
     function :py:obj:`~.cuMemHostGetDevicePointer()` must be used to query
     the device pointer, even if the context supports unified addressing.
-    See Unified Addressing for additional details.
+    See :ref:`Unified Addressing
+    <cuda-bindings-driver-group__cuda__unified>` for additional details.
 
     Parameters
     ----------
@@ -31684,14 +31734,14 @@ def cuMemHostAlloc(size_t bytesize, unsigned int Flags):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY` :py:obj:`~.CUDA_ERROR_EXTERNAL_DEVICE`
     pp : Any
         Returned pointer to host memory
 
     See Also
     --------
-    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cudaHostAlloc`
+    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :func:`~.cudaHostAlloc`
     """
     cdef void_ptr pp = 0
     with nogil:
@@ -31745,14 +31795,14 @@ def cuMemHostGetDevicePointer(p, unsigned int Flags):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     pdptr : :py:obj:`~.CUdeviceptr`
         Returned device pointer
 
     See Also
     --------
-    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cudaHostGetDevicePointer`
+    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :func:`~.cudaHostGetDevicePointer`
     """
     cdef CUdeviceptr pdptr = CUdeviceptr()
     cdef _HelperInputVoidPtrStruct cypHelper
@@ -31785,14 +31835,14 @@ def cuMemHostGetFlags(p):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     pFlags : unsigned int
         Returned flags word
 
     See Also
     --------
-    :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cudaHostGetFlags`
+    :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemHostAlloc`, :func:`~.cudaHostGetFlags`
     """
     cdef unsigned int pFlags = 0
     cdef _HelperInputVoidPtrStruct cypHelper
@@ -31926,14 +31976,14 @@ def cuMemAllocManaged(size_t bytesize, unsigned int flags):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`
     dptr : :py:obj:`~.CUdeviceptr`
         Returned device pointer
 
     See Also
     --------
-    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cuDeviceGetAttribute`, :py:obj:`~.cuStreamAttachMemAsync`, :py:obj:`~.cudaMallocManaged`
+    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cuDeviceGetAttribute`, :py:obj:`~.cuStreamAttachMemAsync`, :func:`~.cudaMallocManaged`
     """
     cdef CUdeviceptr dptr = CUdeviceptr()
     with nogil:
@@ -31971,8 +32021,8 @@ def cuDeviceRegisterAsyncNotification(device, callbackFunc, userData):
     quickly (~10ms).   Any long running tasks should be queued for
     execution on an application thread.
 
-    Callbacks may not call cuDeviceRegisterAsyncNotification or
-    cuDeviceUnregisterAsyncNotification. Doing so will result in
+    Callbacks may not call :func:`~.cuDeviceRegisterAsyncNotification` or
+    :func:`~.cuDeviceUnregisterAsyncNotification`. Doing so will result in
     :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`. Async notification callbacks
     execute in an undefined order and may be serialized.
 
@@ -31991,7 +32041,7 @@ def cuDeviceRegisterAsyncNotification(device, callbackFunc, userData):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_UNKNOWN`
     callback : :py:obj:`~.CUasyncCallbackHandle`
         A handle representing the registered callback instance
@@ -32058,7 +32108,7 @@ def cuDeviceUnregisterAsyncNotification(device, callback):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_UNKNOWN`
 
     See Also
@@ -32107,14 +32157,14 @@ def cuDeviceGetByPCIBusId(char* pciBusId):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`
     dev : :py:obj:`~.CUdevice`
         Returned device handle
 
     See Also
     --------
-    :py:obj:`~.cuDeviceGet`, :py:obj:`~.cuDeviceGetAttribute`, :py:obj:`~.cuDeviceGetPCIBusId`, :py:obj:`~.cudaDeviceGetByPCIBusId`
+    :py:obj:`~.cuDeviceGet`, :py:obj:`~.cuDeviceGetAttribute`, :py:obj:`~.cuDeviceGetPCIBusId`, :func:`~.cudaDeviceGetByPCIBusId`
     """
     cdef CUdevice dev = CUdevice()
     with nogil:
@@ -32147,14 +32197,14 @@ def cuDeviceGetPCIBusId(int length, dev):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`
     pciBusId : bytes
         Returned identifier string for the device in the following format
 
     See Also
     --------
-    :py:obj:`~.cuDeviceGet`, :py:obj:`~.cuDeviceGetAttribute`, :py:obj:`~.cuDeviceGetByPCIBusId`, :py:obj:`~.cudaDeviceGetPCIBusId`
+    :py:obj:`~.cuDeviceGet`, :py:obj:`~.cuDeviceGetAttribute`, :py:obj:`~.cuDeviceGetByPCIBusId`, :func:`~.cudaDeviceGetPCIBusId`
     """
     cdef cydriver.CUdevice cydev
     if dev is None:
@@ -32208,15 +32258,15 @@ def cuIpcGetEventHandle(event):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`, :py:obj:`~.CUDA_ERROR_MAP_FAILED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     pHandle : :py:obj:`~.CUipcEventHandle`
-        Pointer to a user allocated CUipcEventHandle in which to return the
-        opaque event handle
+        Pointer to a user allocated :py:obj:`~.CUipcEventHandle` in which
+        to return the opaque event handle
 
     See Also
     --------
-    :py:obj:`~.cuEventCreate`, :py:obj:`~.cuEventDestroy`, :py:obj:`~.cuEventSynchronize`, :py:obj:`~.cuEventQuery`, :py:obj:`~.cuStreamWaitEvent`, :py:obj:`~.cuIpcOpenEventHandle`, :py:obj:`~.cuIpcGetMemHandle`, :py:obj:`~.cuIpcOpenMemHandle`, :py:obj:`~.cuIpcCloseMemHandle`, :py:obj:`~.cudaIpcGetEventHandle`
+    :py:obj:`~.cuEventCreate`, :py:obj:`~.cuEventDestroy`, :py:obj:`~.cuEventSynchronize`, :py:obj:`~.cuEventQuery`, :py:obj:`~.cuStreamWaitEvent`, :py:obj:`~.cuIpcOpenEventHandle`, :py:obj:`~.cuIpcGetMemHandle`, :py:obj:`~.cuIpcOpenMemHandle`, :py:obj:`~.cuIpcCloseMemHandle`, :func:`~.cudaIpcGetEventHandle`
     """
     cdef cydriver.CUevent cyevent
     if event is None:
@@ -32254,7 +32304,7 @@ def cuIpcOpenEventHandle(handle not None : CUipcEventHandle):
     addressing on Linux and Windows operating systems. IPC functionality on
     Windows is supported for compatibility purposes but not recommended as
     it comes with performance cost. Users can test their device for IPC
-    functionality by calling :py:obj:`~.cuapiDeviceGetAttribute` with
+    functionality by calling :py:obj:`~.cuDeviceGetAttribute` with
     :py:obj:`~.CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED`
 
     Parameters
@@ -32264,14 +32314,14 @@ def cuIpcOpenEventHandle(handle not None : CUipcEventHandle):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_MAP_FAILED`, :py:obj:`~.CUDA_ERROR_PEER_ACCESS_UNSUPPORTED`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     phEvent : :py:obj:`~.CUevent`
         Returns the imported event
 
     See Also
     --------
-    :py:obj:`~.cuEventCreate`, :py:obj:`~.cuEventDestroy`, :py:obj:`~.cuEventSynchronize`, :py:obj:`~.cuEventQuery`, :py:obj:`~.cuStreamWaitEvent`, :py:obj:`~.cuIpcGetEventHandle`, :py:obj:`~.cuIpcGetMemHandle`, :py:obj:`~.cuIpcOpenMemHandle`, :py:obj:`~.cuIpcCloseMemHandle`, :py:obj:`~.cudaIpcOpenEventHandle`
+    :py:obj:`~.cuEventCreate`, :py:obj:`~.cuEventDestroy`, :py:obj:`~.cuEventSynchronize`, :py:obj:`~.cuEventQuery`, :py:obj:`~.cuStreamWaitEvent`, :py:obj:`~.cuIpcGetEventHandle`, :py:obj:`~.cuIpcGetMemHandle`, :py:obj:`~.cuIpcOpenMemHandle`, :py:obj:`~.cuIpcCloseMemHandle`, :func:`~.cudaIpcOpenEventHandle`
     """
     cdef CUevent phEvent = CUevent()
     with nogil:
@@ -32301,7 +32351,7 @@ def cuIpcGetMemHandle(dptr):
     addressing on Linux and Windows operating systems. IPC functionality on
     Windows is supported for compatibility purposes but not recommended as
     it comes with performance cost. Users can test their device for IPC
-    functionality by calling :py:obj:`~.cuapiDeviceGetAttribute` with
+    functionality by calling :py:obj:`~.cuDeviceGetAttribute` with
     :py:obj:`~.CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED`
 
     Parameters
@@ -32311,7 +32361,7 @@ def cuIpcGetMemHandle(dptr):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`, :py:obj:`~.CUDA_ERROR_MAP_FAILED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     pHandle : :py:obj:`~.CUipcMemHandle`
         Pointer to user allocated :py:obj:`~.CUipcMemHandle` to return the
@@ -32319,7 +32369,7 @@ def cuIpcGetMemHandle(dptr):
 
     See Also
     --------
-    :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuIpcGetEventHandle`, :py:obj:`~.cuIpcOpenEventHandle`, :py:obj:`~.cuIpcOpenMemHandle`, :py:obj:`~.cuIpcCloseMemHandle`, :py:obj:`~.cudaIpcGetMemHandle`
+    :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuIpcGetEventHandle`, :py:obj:`~.cuIpcOpenEventHandle`, :py:obj:`~.cuIpcOpenMemHandle`, :py:obj:`~.cuIpcCloseMemHandle`, :func:`~.cudaIpcGetMemHandle`
     """
     cdef cydriver.CUdeviceptr cydptr
     if dptr is None:
@@ -32352,10 +32402,10 @@ def cuIpcOpenMemHandle(handle not None : CUipcMemHandle, unsigned int Flags):
     :py:obj:`~.cuDeviceCanAccessPeer` can determine if a mapping is
     possible.
 
-    Contexts that may open :py:obj:`~.CUipcMemHandles` are restricted in
-    the following way. :py:obj:`~.CUipcMemHandles` from each
-    :py:obj:`~.CUdevice` in a given process may only be opened by one
-    :py:obj:`~.CUcontext` per :py:obj:`~.CUdevice` per other process.
+    Contexts that may open ``CUipcMemHandles`` are restricted in the
+    following way. ``CUipcMemHandles`` from each :py:obj:`~.CUdevice` in a
+    given process may only be opened by one :py:obj:`~.CUcontext` per
+    :py:obj:`~.CUdevice` per other process.
 
     If the memory handle has already been opened by the current context,
     the reference count on the handle is incremented by 1 and the existing
@@ -32372,7 +32422,7 @@ def cuIpcOpenMemHandle(handle not None : CUipcMemHandle, unsigned int Flags):
     addressing on Linux and Windows operating systems. IPC functionality on
     Windows is supported for compatibility purposes but not recommended as
     it comes with performance cost. Users can test their device for IPC
-    functionality by calling :py:obj:`~.cuapiDeviceGetAttribute` with
+    functionality by calling :py:obj:`~.cuDeviceGetAttribute` with
     :py:obj:`~.CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED`
 
     Parameters
@@ -32385,14 +32435,14 @@ def cuIpcOpenMemHandle(handle not None : CUipcMemHandle, unsigned int Flags):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_MAP_FAILED`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_TOO_MANY_PEERS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     pdptr : :py:obj:`~.CUdeviceptr`
         Returned device pointer
 
     See Also
     --------
-    :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuIpcGetEventHandle`, :py:obj:`~.cuIpcOpenEventHandle`, :py:obj:`~.cuIpcGetMemHandle`, :py:obj:`~.cuIpcCloseMemHandle`, :py:obj:`~.cuCtxEnablePeerAccess`, :py:obj:`~.cuDeviceCanAccessPeer`, :py:obj:`~.cudaIpcOpenMemHandle`
+    :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuIpcGetEventHandle`, :py:obj:`~.cuIpcOpenEventHandle`, :py:obj:`~.cuIpcGetMemHandle`, :py:obj:`~.cuIpcCloseMemHandle`, :py:obj:`~.cuCtxEnablePeerAccess`, :py:obj:`~.cuDeviceCanAccessPeer`, :func:`~.cudaIpcOpenMemHandle`
 
     Notes
     -----
@@ -32425,7 +32475,7 @@ def cuIpcCloseMemHandle(dptr):
     addressing on Linux and Windows operating systems. IPC functionality on
     Windows is supported for compatibility purposes but not recommended as
     it comes with performance cost. Users can test their device for IPC
-    functionality by calling :py:obj:`~.cuapiDeviceGetAttribute` with
+    functionality by calling :py:obj:`~.cuDeviceGetAttribute` with
     :py:obj:`~.CU_DEVICE_ATTRIBUTE_IPC_EVENT_SUPPORTED`
 
     Parameters
@@ -32435,12 +32485,12 @@ def cuIpcCloseMemHandle(dptr):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_MAP_FAILED`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
     --------
-    :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuIpcGetEventHandle`, :py:obj:`~.cuIpcOpenEventHandle`, :py:obj:`~.cuIpcGetMemHandle`, :py:obj:`~.cuIpcOpenMemHandle`, :py:obj:`~.cudaIpcCloseMemHandle`
+    :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuIpcGetEventHandle`, :py:obj:`~.cuIpcOpenEventHandle`, :py:obj:`~.cuIpcGetMemHandle`, :py:obj:`~.cuIpcOpenMemHandle`, :func:`~.cudaIpcCloseMemHandle`
     """
     cdef cydriver.CUdeviceptr cydptr
     if dptr is None:
@@ -32503,7 +32553,8 @@ def cuMemHostRegister(p, size_t bytesize, unsigned int Flags):
       :py:obj:`~.CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED`.
       Using this flag with a current context associated with a device that
       does not have this attribute set will cause
-      :py:obj:`~.cuMemHostRegister` to error with CUDA_ERROR_NOT_SUPPORTED.
+      :py:obj:`~.cuMemHostRegister` to error with
+      :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`.
 
     All of these flags are orthogonal to one another: a developer may page-
     lock memory that is portable or mapped with no restrictions.
@@ -32546,12 +32597,12 @@ def cuMemHostRegister(p, size_t bytesize, unsigned int Flags):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`, :py:obj:`~.CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED` :py:obj:`~.CUDA_ERROR_EXTERNAL_DEVICE`
 
     See Also
     --------
-    :py:obj:`~.cuMemHostUnregister`, :py:obj:`~.cuMemHostGetFlags`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cudaHostRegister`
+    :py:obj:`~.cuMemHostUnregister`, :py:obj:`~.cuMemHostGetFlags`, :py:obj:`~.cuMemHostGetDevicePointer`, :func:`~.cudaHostRegister`
     """
     cdef _HelperInputVoidPtrStruct cypHelper
     cdef void* cyp = _helper_input_void_ptr(p, &cypHelper)
@@ -32565,7 +32616,7 @@ def cuMemHostRegister(p, size_t bytesize, unsigned int Flags):
 
 @cython.embedsignature(True)
 def cuMemHostUnregister(p):
-    """ Unregisters a memory range that was registered with cuMemHostRegister.
+    """ Unregisters a memory range that was registered with :py:obj:`~.cuMemHostRegister`.
 
     Unmaps the memory range whose base address is specified by ``p``, and
     makes it pageable again.
@@ -32580,12 +32631,12 @@ def cuMemHostUnregister(p):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`, :py:obj:`~.CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED`,
 
     See Also
     --------
-    :py:obj:`~.cuMemHostRegister`, :py:obj:`~.cudaHostUnregister`
+    :py:obj:`~.cuMemHostRegister`, :func:`~.cudaHostUnregister`
     """
     cdef _HelperInputVoidPtrStruct cypHelper
     cdef void* cyp = _helper_input_void_ptr(p, &cypHelper)
@@ -32619,12 +32670,12 @@ def cuMemcpy(dst, src, size_t ByteCount):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
     --------
-    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`
+    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :func:`~.cudaMemcpy`, :func:`~.cudaMemcpyToSymbol`, :func:`~.cudaMemcpyFromSymbol`
     """
     cdef cydriver.CUdeviceptr cysrc
     if src is None:
@@ -32674,12 +32725,12 @@ def cuMemcpyPeer(dstDevice, dstContext, srcDevice, srcContext, size_t ByteCount)
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
     --------
-    :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpy3DPeer`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyPeerAsync`, :py:obj:`~.cuMemcpy3DPeerAsync`, :py:obj:`~.cudaMemcpyPeer`
+    :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpy3DPeer`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyPeerAsync`, :py:obj:`~.cuMemcpy3DPeerAsync`, :func:`~.cudaMemcpyPeer`
     """
     cdef cydriver.CUcontext cysrcContext
     if srcContext is None:
@@ -32739,12 +32790,12 @@ def cuMemcpyHtoD(dstDevice, srcHost, size_t ByteCount):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
     --------
-    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpyToSymbol`
+    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :func:`~.cudaMemcpy`, :func:`~.cudaMemcpyToSymbol`
     """
     cdef cydriver.CUdeviceptr cydstDevice
     if dstDevice is None:
@@ -32783,12 +32834,12 @@ def cuMemcpyDtoH(dstHost, srcDevice, size_t ByteCount):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
     --------
-    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpyFromSymbol`
+    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :func:`~.cudaMemcpy`, :func:`~.cudaMemcpyFromSymbol`
     """
     cdef cydriver.CUdeviceptr cysrcDevice
     if srcDevice is None:
@@ -32827,12 +32878,12 @@ def cuMemcpyDtoD(dstDevice, srcDevice, size_t ByteCount):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
     --------
-    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`
+    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :func:`~.cudaMemcpy`, :func:`~.cudaMemcpyToSymbol`, :func:`~.cudaMemcpyFromSymbol`
     """
     cdef cydriver.CUdeviceptr cysrcDevice
     if srcDevice is None:
@@ -32879,12 +32930,12 @@ def cuMemcpyDtoA(dstArray, size_t dstOffset, srcDevice, size_t ByteCount):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
     --------
-    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cudaMemcpyToArray`
+    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :func:`~.cudaMemcpyToArray`
     """
     cdef cydriver.CUdeviceptr cysrcDevice
     if srcDevice is None:
@@ -32933,12 +32984,12 @@ def cuMemcpyAtoD(dstDevice, srcArray, size_t srcOffset, size_t ByteCount):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
     --------
-    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cudaMemcpyFromArray`
+    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :func:`~.cudaMemcpyFromArray`
     """
     cdef cydriver.CUarray cysrcArray
     if srcArray is None:
@@ -32985,12 +33036,12 @@ def cuMemcpyHtoA(dstArray, size_t dstOffset, srcHost, size_t ByteCount):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
     --------
-    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cudaMemcpyToArray`
+    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :func:`~.cudaMemcpyToArray`
     """
     cdef cydriver.CUarray cydstArray
     if dstArray is None:
@@ -33032,12 +33083,12 @@ def cuMemcpyAtoH(dstHost, srcArray, size_t srcOffset, size_t ByteCount):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
     --------
-    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cudaMemcpyFromArray`
+    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :func:`~.cudaMemcpyFromArray`
     """
     cdef cydriver.CUarray cysrcArray
     if srcArray is None:
@@ -33085,12 +33136,12 @@ def cuMemcpyAtoA(dstArray, size_t dstOffset, srcArray, size_t srcOffset, size_t
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
     --------
-    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cudaMemcpyArrayToArray`
+    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :func:`~.cudaMemcpyArrayToArray`
     """
     cdef cydriver.CUarray cysrcArray
     if srcArray is None:
@@ -33126,56 +33177,50 @@ def cuMemcpy2D(pCopy : Optional[CUDA_MEMCPY2D]):
 
     where:
 
-    - :py:obj:`~.srcMemoryType` and :py:obj:`~.dstMemoryType` specify the
-      type of memory of the source and destination, respectively;
+    - ``srcMemoryType`` and ``dstMemoryType`` specify the type of memory of
+      the source and destination, respectively;
       :py:obj:`~.CUmemorytype_enum` is defined as:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
-    If :py:obj:`~.srcMemoryType` is :py:obj:`~.CU_MEMORYTYPE_UNIFIED`,
-    :py:obj:`~.srcDevice` and :py:obj:`~.srcPitch` specify the (unified
-    virtual address space) base address of the source data and the bytes
-    per row to apply. :py:obj:`~.srcArray` is ignored. This value may be
-    used only if unified addressing is supported in the calling context.
-
-    If :py:obj:`~.srcMemoryType` is :py:obj:`~.CU_MEMORYTYPE_HOST`,
-    :py:obj:`~.srcHost` and :py:obj:`~.srcPitch` specify the (host) base
-    address of the source data and the bytes per row to apply.
-    :py:obj:`~.srcArray` is ignored.
-
-    If :py:obj:`~.srcMemoryType` is :py:obj:`~.CU_MEMORYTYPE_DEVICE`,
-    :py:obj:`~.srcDevice` and :py:obj:`~.srcPitch` specify the (device)
-    base address of the source data and the bytes per row to apply.
-    :py:obj:`~.srcArray` is ignored.
-
-    If :py:obj:`~.srcMemoryType` is :py:obj:`~.CU_MEMORYTYPE_ARRAY`,
-    :py:obj:`~.srcArray` specifies the handle of the source data.
-    :py:obj:`~.srcHost`, :py:obj:`~.srcDevice` and :py:obj:`~.srcPitch` are
-    ignored.
+    If ``srcMemoryType`` is :py:obj:`~.CU_MEMORYTYPE_UNIFIED`,
+    ``srcDevice`` and ``srcPitch`` specify the (unified virtual address
+    space) base address of the source data and the bytes per row to apply.
+    ``srcArray`` is ignored. This value may be used only if unified
+    addressing is supported in the calling context.
 
-    If :py:obj:`~.dstMemoryType` is :py:obj:`~.CU_MEMORYTYPE_HOST`,
-    :py:obj:`~.dstHost` and :py:obj:`~.dstPitch` specify the (host) base
-    address of the destination data and the bytes per row to apply.
-    :py:obj:`~.dstArray` is ignored.
-
-    If :py:obj:`~.dstMemoryType` is :py:obj:`~.CU_MEMORYTYPE_UNIFIED`,
-    :py:obj:`~.dstDevice` and :py:obj:`~.dstPitch` specify the (unified
-    virtual address space) base address of the source data and the bytes
-    per row to apply. :py:obj:`~.dstArray` is ignored. This value may be
-    used only if unified addressing is supported in the calling context.
-
-    If :py:obj:`~.dstMemoryType` is :py:obj:`~.CU_MEMORYTYPE_DEVICE`,
-    :py:obj:`~.dstDevice` and :py:obj:`~.dstPitch` specify the (device)
-    base address of the destination data and the bytes per row to apply.
-    :py:obj:`~.dstArray` is ignored.
-
-    If :py:obj:`~.dstMemoryType` is :py:obj:`~.CU_MEMORYTYPE_ARRAY`,
-    :py:obj:`~.dstArray` specifies the handle of the destination data.
-    :py:obj:`~.dstHost`, :py:obj:`~.dstDevice` and :py:obj:`~.dstPitch` are
-    ignored.
+    If ``srcMemoryType`` is :py:obj:`~.CU_MEMORYTYPE_HOST`, ``srcHost`` and
+    ``srcPitch`` specify the (host) base address of the source data and the
+    bytes per row to apply. ``srcArray`` is ignored.
+
+    If ``srcMemoryType`` is :py:obj:`~.CU_MEMORYTYPE_DEVICE`, ``srcDevice``
+    and ``srcPitch`` specify the (device) base address of the source data
+    and the bytes per row to apply. ``srcArray`` is ignored.
 
-    - :py:obj:`~.srcXInBytes` and :py:obj:`~.srcY` specify the base address
-      of the source data for the copy.
+    If ``srcMemoryType`` is :py:obj:`~.CU_MEMORYTYPE_ARRAY`, ``srcArray``
+    specifies the handle of the source data. ``srcHost``, ``srcDevice`` and
+    ``srcPitch`` are ignored.
+
+    If ``dstMemoryType`` is :py:obj:`~.CU_MEMORYTYPE_HOST`, ``dstHost`` and
+    ``dstPitch`` specify the (host) base address of the destination data
+    and the bytes per row to apply. ``dstArray`` is ignored.
+
+    If ``dstMemoryType`` is :py:obj:`~.CU_MEMORYTYPE_UNIFIED`,
+    ``dstDevice`` and ``dstPitch`` specify the (unified virtual address
+    space) base address of the destination data and the bytes per row to
+    apply. ``dstArray`` is ignored. This value may be used only if unified
+    addressing is supported in the calling context.
+
+    If ``dstMemoryType`` is :py:obj:`~.CU_MEMORYTYPE_DEVICE`, ``dstDevice``
+    and ``dstPitch`` specify the (device) base address of the destination
+    data and the bytes per row to apply. ``dstArray`` is ignored.
+
+    If ``dstMemoryType`` is :py:obj:`~.CU_MEMORYTYPE_ARRAY`, ``dstArray``
+    specifies the handle of the destination data. ``dstHost``,
+    ``dstDevice`` and ``dstPitch`` are ignored.
+
+    - ``srcXInBytes`` and ``srcY`` specify the base address of the source
+      data for the copy.
 
     For host pointers, the starting address is
 
@@ -33185,11 +33230,11 @@ def cuMemcpy2D(pCopy : Optional[CUDA_MEMCPY2D]):
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
-    For CUDA arrays, :py:obj:`~.srcXInBytes` must be evenly divisible by
-    the array element size.
+    For CUDA arrays, ``srcXInBytes`` must be evenly divisible by the array
+    element size.
 
-    - :py:obj:`~.dstXInBytes` and :py:obj:`~.dstY` specify the base address
-      of the destination data for the copy.
+    - ``dstXInBytes`` and ``dstY`` specify the base address of the
+      destination data for the copy.
 
     For host pointers, the base address is
 
@@ -33199,16 +33244,15 @@ def cuMemcpy2D(pCopy : Optional[CUDA_MEMCPY2D]):
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
-    For CUDA arrays, :py:obj:`~.dstXInBytes` must be evenly divisible by
-    the array element size.
+    For CUDA arrays, ``dstXInBytes`` must be evenly divisible by the array
+    element size.
 
-    - :py:obj:`~.WidthInBytes` and :py:obj:`~.Height` specify the width (in
-      bytes) and height of the 2D copy being performed.
+    - ``WidthInBytes`` and ``Height`` specify the width (in bytes) and
+      height of the 2D copy being performed.
 
-    - If specified, :py:obj:`~.srcPitch` must be greater than or equal to
-      :py:obj:`~.WidthInBytes` + :py:obj:`~.srcXInBytes`, and
-      :py:obj:`~.dstPitch` must be greater than or equal to
-      :py:obj:`~.WidthInBytes` + dstXInBytes.
+    - If specified, ``srcPitch`` must be greater than or equal to
+      ``WidthInBytes`` + ``srcXInBytes``, and ``dstPitch`` must be greater
+      than or equal to ``WidthInBytes`` + ``dstXInBytes``.
 
     :py:obj:`~.cuMemcpy2D()` returns an error if any pitch is greater than
     the maximum allowed (:py:obj:`~.CU_DEVICE_ATTRIBUTE_MAX_PITCH`).
@@ -33227,12 +33271,12 @@ def cuMemcpy2D(pCopy : Optional[CUDA_MEMCPY2D]):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
     --------
-    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpy2DFromArray`
+    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :func:`~.cudaMemcpy2D`, :func:`~.cudaMemcpy2DToArray`, :func:`~.cudaMemcpy2DFromArray`
     """
     cdef cydriver.CUDA_MEMCPY2D* cypCopy_ptr = <cydriver.CUDA_MEMCPY2D*>pCopy._pvt_ptr if pCopy is not None else NULL
     with nogil:
@@ -33253,56 +33297,50 @@ def cuMemcpy2DUnaligned(pCopy : Optional[CUDA_MEMCPY2D]):
 
     where:
 
-    - :py:obj:`~.srcMemoryType` and :py:obj:`~.dstMemoryType` specify the
-      type of memory of the source and destination, respectively;
+    - ``srcMemoryType`` and ``dstMemoryType`` specify the type of memory of
+      the source and destination, respectively;
       :py:obj:`~.CUmemorytype_enum` is defined as:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
-    If :py:obj:`~.srcMemoryType` is :py:obj:`~.CU_MEMORYTYPE_UNIFIED`,
-    :py:obj:`~.srcDevice` and :py:obj:`~.srcPitch` specify the (unified
-    virtual address space) base address of the source data and the bytes
-    per row to apply. :py:obj:`~.srcArray` is ignored. This value may be
-    used only if unified addressing is supported in the calling context.
-
-    If :py:obj:`~.srcMemoryType` is :py:obj:`~.CU_MEMORYTYPE_HOST`,
-    :py:obj:`~.srcHost` and :py:obj:`~.srcPitch` specify the (host) base
-    address of the source data and the bytes per row to apply.
-    :py:obj:`~.srcArray` is ignored.
-
-    If :py:obj:`~.srcMemoryType` is :py:obj:`~.CU_MEMORYTYPE_DEVICE`,
-    :py:obj:`~.srcDevice` and :py:obj:`~.srcPitch` specify the (device)
-    base address of the source data and the bytes per row to apply.
-    :py:obj:`~.srcArray` is ignored.
-
-    If :py:obj:`~.srcMemoryType` is :py:obj:`~.CU_MEMORYTYPE_ARRAY`,
-    :py:obj:`~.srcArray` specifies the handle of the source data.
-    :py:obj:`~.srcHost`, :py:obj:`~.srcDevice` and :py:obj:`~.srcPitch` are
-    ignored.
+    If ``srcMemoryType`` is :py:obj:`~.CU_MEMORYTYPE_UNIFIED`,
+    ``srcDevice`` and ``srcPitch`` specify the (unified virtual address
+    space) base address of the source data and the bytes per row to apply.
+    ``srcArray`` is ignored. This value may be used only if unified
+    addressing is supported in the calling context.
 
-    If :py:obj:`~.dstMemoryType` is :py:obj:`~.CU_MEMORYTYPE_UNIFIED`,
-    :py:obj:`~.dstDevice` and :py:obj:`~.dstPitch` specify the (unified
-    virtual address space) base address of the source data and the bytes
-    per row to apply. :py:obj:`~.dstArray` is ignored. This value may be
-    used only if unified addressing is supported in the calling context.
-
-    If :py:obj:`~.dstMemoryType` is :py:obj:`~.CU_MEMORYTYPE_HOST`,
-    :py:obj:`~.dstHost` and :py:obj:`~.dstPitch` specify the (host) base
-    address of the destination data and the bytes per row to apply.
-    :py:obj:`~.dstArray` is ignored.
-
-    If :py:obj:`~.dstMemoryType` is :py:obj:`~.CU_MEMORYTYPE_DEVICE`,
-    :py:obj:`~.dstDevice` and :py:obj:`~.dstPitch` specify the (device)
-    base address of the destination data and the bytes per row to apply.
-    :py:obj:`~.dstArray` is ignored.
-
-    If :py:obj:`~.dstMemoryType` is :py:obj:`~.CU_MEMORYTYPE_ARRAY`,
-    :py:obj:`~.dstArray` specifies the handle of the destination data.
-    :py:obj:`~.dstHost`, :py:obj:`~.dstDevice` and :py:obj:`~.dstPitch` are
-    ignored.
+    If ``srcMemoryType`` is :py:obj:`~.CU_MEMORYTYPE_HOST`, ``srcHost`` and
+    ``srcPitch`` specify the (host) base address of the source data and the
+    bytes per row to apply. ``srcArray`` is ignored.
 
-    - :py:obj:`~.srcXInBytes` and :py:obj:`~.srcY` specify the base address
-      of the source data for the copy.
+    If ``srcMemoryType`` is :py:obj:`~.CU_MEMORYTYPE_DEVICE`, ``srcDevice``
+    and ``srcPitch`` specify the (device) base address of the source data
+    and the bytes per row to apply. ``srcArray`` is ignored.
+
+    If ``srcMemoryType`` is :py:obj:`~.CU_MEMORYTYPE_ARRAY`, ``srcArray``
+    specifies the handle of the source data. ``srcHost``, ``srcDevice`` and
+    ``srcPitch`` are ignored.
+
+    If ``dstMemoryType`` is :py:obj:`~.CU_MEMORYTYPE_UNIFIED`,
+    ``dstDevice`` and ``dstPitch`` specify the (unified virtual address
+    space) base address of the destination data and the bytes per row to
+    apply. ``dstArray`` is ignored. This value may be used only if unified
+    addressing is supported in the calling context.
+
+    If ``dstMemoryType`` is :py:obj:`~.CU_MEMORYTYPE_HOST`, ``dstHost`` and
+    ``dstPitch`` specify the (host) base address of the destination data
+    and the bytes per row to apply. ``dstArray`` is ignored.
+
+    If ``dstMemoryType`` is :py:obj:`~.CU_MEMORYTYPE_DEVICE`, ``dstDevice``
+    and ``dstPitch`` specify the (device) base address of the destination
+    data and the bytes per row to apply. ``dstArray`` is ignored.
+
+    If ``dstMemoryType`` is :py:obj:`~.CU_MEMORYTYPE_ARRAY`, ``dstArray``
+    specifies the handle of the destination data. ``dstHost``,
+    ``dstDevice`` and ``dstPitch`` are ignored.
+
+    - ``srcXInBytes`` and ``srcY`` specify the base address of the source
+      data for the copy.
 
     For host pointers, the starting address is
 
@@ -33312,11 +33350,11 @@ def cuMemcpy2DUnaligned(pCopy : Optional[CUDA_MEMCPY2D]):
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
-    For CUDA arrays, :py:obj:`~.srcXInBytes` must be evenly divisible by
-    the array element size.
+    For CUDA arrays, ``srcXInBytes`` must be evenly divisible by the array
+    element size.
 
-    - :py:obj:`~.dstXInBytes` and :py:obj:`~.dstY` specify the base address
-      of the destination data for the copy.
+    - ``dstXInBytes`` and ``dstY`` specify the base address of the
+      destination data for the copy.
 
     For host pointers, the base address is
 
@@ -33326,16 +33364,15 @@ def cuMemcpy2DUnaligned(pCopy : Optional[CUDA_MEMCPY2D]):
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
-    For CUDA arrays, :py:obj:`~.dstXInBytes` must be evenly divisible by
-    the array element size.
+    For CUDA arrays, ``dstXInBytes`` must be evenly divisible by the array
+    element size.
 
-    - :py:obj:`~.WidthInBytes` and :py:obj:`~.Height` specify the width (in
-      bytes) and height of the 2D copy being performed.
+    - ``WidthInBytes`` and ``Height`` specify the width (in bytes) and
+      height of the 2D copy being performed.
 
-    - If specified, :py:obj:`~.srcPitch` must be greater than or equal to
-      :py:obj:`~.WidthInBytes` + :py:obj:`~.srcXInBytes`, and
-      :py:obj:`~.dstPitch` must be greater than or equal to
-      :py:obj:`~.WidthInBytes` + dstXInBytes.
+    - If specified, ``srcPitch`` must be greater than or equal to
+      ``WidthInBytes`` + ``srcXInBytes``, and ``dstPitch`` must be greater
+      than or equal to ``WidthInBytes`` + ``dstXInBytes``.
 
     :py:obj:`~.cuMemcpy2D()` returns an error if any pitch is greater than
     the maximum allowed (:py:obj:`~.CU_DEVICE_ATTRIBUTE_MAX_PITCH`).
@@ -33354,12 +33391,12 @@ def cuMemcpy2DUnaligned(pCopy : Optional[CUDA_MEMCPY2D]):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
     --------
-    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpy2DFromArray`
+    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :func:`~.cudaMemcpy2D`, :func:`~.cudaMemcpy2DToArray`, :func:`~.cudaMemcpy2DFromArray`
     """
     cdef cydriver.CUDA_MEMCPY2D* cypCopy_ptr = <cydriver.CUDA_MEMCPY2D*>pCopy._pvt_ptr if pCopy is not None else NULL
     with nogil:
@@ -33380,58 +33417,54 @@ def cuMemcpy3D(pCopy : Optional[CUDA_MEMCPY3D]):
 
     where:
 
-    - :py:obj:`~.srcMemoryType` and :py:obj:`~.dstMemoryType` specify the
-      type of memory of the source and destination, respectively;
+    - ``srcMemoryType`` and ``dstMemoryType`` specify the type of memory of
+      the source and destination, respectively;
       :py:obj:`~.CUmemorytype_enum` is defined as:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
-    If :py:obj:`~.srcMemoryType` is :py:obj:`~.CU_MEMORYTYPE_UNIFIED`,
-    :py:obj:`~.srcDevice` and :py:obj:`~.srcPitch` specify the (unified
-    virtual address space) base address of the source data and the bytes
-    per row to apply. :py:obj:`~.srcArray` is ignored. This value may be
-    used only if unified addressing is supported in the calling context.
-
-    If :py:obj:`~.srcMemoryType` is :py:obj:`~.CU_MEMORYTYPE_HOST`,
-    :py:obj:`~.srcHost`, :py:obj:`~.srcPitch` and :py:obj:`~.srcHeight`
-    specify the (host) base address of the source data, the bytes per row,
-    and the height of each 2D slice of the 3D array. :py:obj:`~.srcArray`
-    is ignored.
-
-    If :py:obj:`~.srcMemoryType` is :py:obj:`~.CU_MEMORYTYPE_DEVICE`,
-    :py:obj:`~.srcDevice`, :py:obj:`~.srcPitch` and :py:obj:`~.srcHeight`
-    specify the (device) base address of the source data, the bytes per
-    row, and the height of each 2D slice of the 3D array.
-    :py:obj:`~.srcArray` is ignored.
-
-    If :py:obj:`~.srcMemoryType` is :py:obj:`~.CU_MEMORYTYPE_ARRAY`,
-    :py:obj:`~.srcArray` specifies the handle of the source data.
-    :py:obj:`~.srcHost`, :py:obj:`~.srcDevice`, :py:obj:`~.srcPitch` and
-    :py:obj:`~.srcHeight` are ignored.
-
-    If :py:obj:`~.dstMemoryType` is :py:obj:`~.CU_MEMORYTYPE_UNIFIED`,
-    :py:obj:`~.dstDevice` and :py:obj:`~.dstPitch` specify the (unified
-    virtual address space) base address of the source data and the bytes
-    per row to apply. :py:obj:`~.dstArray` is ignored. This value may be
-    used only if unified addressing is supported in the calling context.
-
-    If :py:obj:`~.dstMemoryType` is :py:obj:`~.CU_MEMORYTYPE_HOST`,
-    :py:obj:`~.dstHost` and :py:obj:`~.dstPitch` specify the (host) base
-    address of the destination data, the bytes per row, and the height of
-    each 2D slice of the 3D array. :py:obj:`~.dstArray` is ignored.
-
-    If :py:obj:`~.dstMemoryType` is :py:obj:`~.CU_MEMORYTYPE_DEVICE`,
-    :py:obj:`~.dstDevice` and :py:obj:`~.dstPitch` specify the (device)
-    base address of the destination data, the bytes per row, and the height
-    of each 2D slice of the 3D array. :py:obj:`~.dstArray` is ignored.
-
-    If :py:obj:`~.dstMemoryType` is :py:obj:`~.CU_MEMORYTYPE_ARRAY`,
-    :py:obj:`~.dstArray` specifies the handle of the destination data.
-    :py:obj:`~.dstHost`, :py:obj:`~.dstDevice`, :py:obj:`~.dstPitch` and
-    :py:obj:`~.dstHeight` are ignored.
-
-    - :py:obj:`~.srcXInBytes`, :py:obj:`~.srcY` and :py:obj:`~.srcZ`
-      specify the base address of the source data for the copy.
+    If ``srcMemoryType`` is :py:obj:`~.CU_MEMORYTYPE_UNIFIED`,
+    ``srcDevice`` and ``srcPitch`` specify the (unified virtual address
+    space) base address of the source data and the bytes per row to apply.
+    ``srcArray`` is ignored. This value may be used only if unified
+    addressing is supported in the calling context.
+
+    If ``srcMemoryType`` is :py:obj:`~.CU_MEMORYTYPE_HOST`, ``srcHost``,
+    ``srcPitch`` and ``srcHeight`` specify the (host) base address of the
+    source data, the bytes per row, and the height of each 2D slice of the
+    3D array. ``srcArray`` is ignored.
+
+    If ``srcMemoryType`` is :py:obj:`~.CU_MEMORYTYPE_DEVICE`,
+    ``srcDevice``, ``srcPitch`` and ``srcHeight`` specify the (device) base
+    address of the source data, the bytes per row, and the height of each
+    2D slice of the 3D array. ``srcArray`` is ignored.
+
+    If ``srcMemoryType`` is :py:obj:`~.CU_MEMORYTYPE_ARRAY`, ``srcArray``
+    specifies the handle of the source data. ``srcHost``, ``srcDevice``,
+    ``srcPitch`` and ``srcHeight`` are ignored.
+
+    If ``dstMemoryType`` is :py:obj:`~.CU_MEMORYTYPE_UNIFIED`,
+    ``dstDevice`` and ``dstPitch`` specify the (unified virtual address
+    space) base address of the destination data and the bytes per row to
+    apply. ``dstArray`` is ignored. This value may be used only if unified
+    addressing is supported in the calling context.
+
+    If ``dstMemoryType`` is :py:obj:`~.CU_MEMORYTYPE_HOST`, ``dstHost``,
+    ``dstPitch`` and ``dstHeight`` specify the (host) base address of the
+    destination data, the bytes per row, and the height of each 2D slice of
+    the 3D array. ``dstArray`` is ignored.
+
+    If ``dstMemoryType`` is :py:obj:`~.CU_MEMORYTYPE_DEVICE`,
+    ``dstDevice``, ``dstPitch`` and ``dstHeight`` specify the (device) base
+    address of the destination data, the bytes per row, and the height of
+    each 2D slice of the 3D array. ``dstArray`` is ignored.
+
+    If ``dstMemoryType`` is :py:obj:`~.CU_MEMORYTYPE_ARRAY`, ``dstArray``
+    specifies the handle of the destination data. ``dstHost``,
+    ``dstDevice``, ``dstPitch`` and ``dstHeight`` are ignored.
+
+    - ``srcXInBytes``, ``srcY`` and ``srcZ`` specify the base address of
+      the source data for the copy.
 
     For host pointers, the starting address is
 
@@ -33441,11 +33474,11 @@ def cuMemcpy3D(pCopy : Optional[CUDA_MEMCPY3D]):
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
-    For CUDA arrays, :py:obj:`~.srcXInBytes` must be evenly divisible by
-    the array element size.
+    For CUDA arrays, ``srcXInBytes`` must be evenly divisible by the array
+    element size.
 
-    - dstXInBytes, :py:obj:`~.dstY` and :py:obj:`~.dstZ` specify the base
-      address of the destination data for the copy.
+    - ``dstXInBytes``, ``dstY`` and ``dstZ`` specify the base address of the
+      destination data for the copy.
 
     For host pointers, the base address is
 
@@ -33455,27 +33488,25 @@ def cuMemcpy3D(pCopy : Optional[CUDA_MEMCPY3D]):
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
-    For CUDA arrays, :py:obj:`~.dstXInBytes` must be evenly divisible by
-    the array element size.
+    For CUDA arrays, ``dstXInBytes`` must be evenly divisible by the array
+    element size.
 
-    - :py:obj:`~.WidthInBytes`, :py:obj:`~.Height` and :py:obj:`~.Depth`
-      specify the width (in bytes), height and depth of the 3D copy being
-      performed.
+    - ``WidthInBytes``, ``Height`` and ``Depth`` specify the width (in
+      bytes), height and depth of the 3D copy being performed.
 
-    - If specified, :py:obj:`~.srcPitch` must be greater than or equal to
-      :py:obj:`~.WidthInBytes` + :py:obj:`~.srcXInBytes`, and
-      :py:obj:`~.dstPitch` must be greater than or equal to
-      :py:obj:`~.WidthInBytes` + dstXInBytes.
+    - If specified, ``srcPitch`` must be greater than or equal to
+      ``WidthInBytes`` + ``srcXInBytes``, and ``dstPitch`` must be greater
+      than or equal to ``WidthInBytes`` + ``dstXInBytes``.
 
-    - If specified, :py:obj:`~.srcHeight` must be greater than or equal to
-      :py:obj:`~.Height` + :py:obj:`~.srcY`, and :py:obj:`~.dstHeight` must
-      be greater than or equal to :py:obj:`~.Height` + :py:obj:`~.dstY`.
+    - If specified, ``srcHeight`` must be greater than or equal to
+      ``Height`` + ``srcY``, and ``dstHeight`` must be greater than or
+      equal to ``Height`` + ``dstY``.
 
     :py:obj:`~.cuMemcpy3D()` returns an error if any pitch is greater than
     the maximum allowed (:py:obj:`~.CU_DEVICE_ATTRIBUTE_MAX_PITCH`).
 
-    The :py:obj:`~.srcLOD` and :py:obj:`~.dstLOD` members of the
-    :py:obj:`~.CUDA_MEMCPY3D` structure must be set to 0.
+    The ``srcLOD`` and ``dstLOD`` members of the :py:obj:`~.CUDA_MEMCPY3D`
+    structure must be set to 0.
 
     Parameters
     ----------
@@ -33484,12 +33515,12 @@ def cuMemcpy3D(pCopy : Optional[CUDA_MEMCPY3D]):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
     --------
-    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cudaMemcpy3D`
+    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :func:`~.cudaMemcpy3D`
     """
     cdef cydriver.CUDA_MEMCPY3D* cypCopy_ptr = <cydriver.CUDA_MEMCPY3D*>pCopy._pvt_ptr if pCopy is not None else NULL
     with nogil:
@@ -33514,12 +33545,12 @@ def cuMemcpy3DPeer(pCopy : Optional[CUDA_MEMCPY3D_PEER]):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
     --------
-    :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyPeer`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyPeerAsync`, :py:obj:`~.cuMemcpy3DPeerAsync`, :py:obj:`~.cudaMemcpy3DPeer`
+    :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyPeer`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyPeerAsync`, :py:obj:`~.cuMemcpy3DPeerAsync`, :func:`~.cudaMemcpy3DPeer`
     """
     cdef cydriver.CUDA_MEMCPY3D_PEER* cypCopy_ptr = <cydriver.CUDA_MEMCPY3D_PEER*>pCopy._pvt_ptr if pCopy is not None else NULL
     with nogil:
@@ -33553,12 +33584,12 @@ def cuMemcpyAsync(dst, src, size_t ByteCount, hStream):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`
 
     See Also
     --------
-    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D8Async`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D16Async`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD2D32Async`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD8Async`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD16Async`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cuMemsetD32Async`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`
+    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D8Async`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D16Async`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD2D32Async`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD8Async`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD16Async`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cuMemsetD32Async`, :func:`~.cudaMemcpyAsync`, :func:`~.cudaMemcpyToSymbolAsync`, :func:`~.cudaMemcpyFromSymbolAsync`
     """
     cdef cydriver.CUstream cyhStream
     if hStream is None:
@@ -33618,12 +33649,12 @@ def cuMemcpyPeerAsync(dstDevice, dstContext, srcDevice, srcContext, size_t ByteC
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`
 
     See Also
     --------
-    :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyPeer`, :py:obj:`~.cuMemcpy3DPeer`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpy3DPeerAsync`, :py:obj:`~.cudaMemcpyPeerAsync`
+    :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyPeer`, :py:obj:`~.cuMemcpy3DPeer`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpy3DPeerAsync`, :func:`~.cudaMemcpyPeerAsync`
     """
     cdef cydriver.CUstream cyhStream
     if hStream is None:
@@ -33693,12 +33724,12 @@ def cuMemcpyHtoDAsync(dstDevice, srcHost, size_t ByteCount, hStream):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`
 
     See Also
     --------
-    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D8Async`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D16Async`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD2D32Async`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD8Async`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD16Async`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cuMemsetD32Async`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`
+    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D8Async`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D16Async`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD2D32Async`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD8Async`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD16Async`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cuMemsetD32Async`, :func:`~.cudaMemcpyAsync`, :func:`~.cudaMemcpyToSymbolAsync`
     """
     cdef cydriver.CUstream cyhStream
     if hStream is None:
@@ -33747,12 +33778,12 @@ def cuMemcpyDtoHAsync(dstHost, srcDevice, size_t ByteCount, hStream):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`
 
     See Also
     --------
-    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D8Async`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D16Async`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD2D32Async`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD8Async`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD16Async`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cuMemsetD32Async`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`
+    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D8Async`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D16Async`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD2D32Async`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD8Async`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD16Async`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cuMemsetD32Async`, :func:`~.cudaMemcpyAsync`, :func:`~.cudaMemcpyFromSymbolAsync`
     """
     cdef cydriver.CUstream cyhStream
     if hStream is None:
@@ -33801,12 +33832,12 @@ def cuMemcpyDtoDAsync(dstDevice, srcDevice, size_t ByteCount, hStream):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`
 
     See Also
     --------
-    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D8Async`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D16Async`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD2D32Async`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD8Async`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD16Async`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cuMemsetD32Async`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`
+    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D8Async`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D16Async`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD2D32Async`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD8Async`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD16Async`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cuMemsetD32Async`, :func:`~.cudaMemcpyAsync`, :func:`~.cudaMemcpyToSymbolAsync`, :func:`~.cudaMemcpyFromSymbolAsync`
     """
     cdef cydriver.CUstream cyhStream
     if hStream is None:
@@ -33863,12 +33894,12 @@ def cuMemcpyHtoAAsync(dstArray, size_t dstOffset, srcHost, size_t ByteCount, hSt
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`
 
     See Also
     --------
-    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D8Async`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D16Async`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD2D32Async`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD8Async`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD16Async`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cuMemsetD32Async`, :py:obj:`~.cudaMemcpyToArrayAsync`
+    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D8Async`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D16Async`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD2D32Async`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD8Async`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD16Async`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cuMemsetD32Async`, :func:`~.cudaMemcpyToArrayAsync`
     """
     cdef cydriver.CUstream cyhStream
     if hStream is None:
@@ -33920,12 +33951,12 @@ def cuMemcpyAtoHAsync(dstHost, srcArray, size_t srcOffset, size_t ByteCount, hSt
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`
 
     See Also
     --------
-    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D8Async`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D16Async`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD2D32Async`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD8Async`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD16Async`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cuMemsetD32Async`, :py:obj:`~.cudaMemcpyFromArrayAsync`
+    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D8Async`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D16Async`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD2D32Async`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD8Async`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD16Async`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cuMemsetD32Async`, :func:`~.cudaMemcpyFromArrayAsync`
     """
     cdef cydriver.CUstream cyhStream
     if hStream is None:
@@ -33964,56 +33995,50 @@ def cuMemcpy2DAsync(pCopy : Optional[CUDA_MEMCPY2D], hStream):
 
     where:
 
-    - :py:obj:`~.srcMemoryType` and :py:obj:`~.dstMemoryType` specify the
-      type of memory of the source and destination, respectively;
+    - ``srcMemoryType`` and ``dstMemoryType`` specify the type of memory of
+      the source and destination, respectively;
       :py:obj:`~.CUmemorytype_enum` is defined as:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
-    If :py:obj:`~.srcMemoryType` is :py:obj:`~.CU_MEMORYTYPE_HOST`,
-    :py:obj:`~.srcHost` and :py:obj:`~.srcPitch` specify the (host) base
-    address of the source data and the bytes per row to apply.
-    :py:obj:`~.srcArray` is ignored.
-
-    If :py:obj:`~.srcMemoryType` is :py:obj:`~.CU_MEMORYTYPE_UNIFIED`,
-    :py:obj:`~.srcDevice` and :py:obj:`~.srcPitch` specify the (unified
-    virtual address space) base address of the source data and the bytes
-    per row to apply. :py:obj:`~.srcArray` is ignored. This value may be
-    used only if unified addressing is supported in the calling context.
-
-    If :py:obj:`~.srcMemoryType` is :py:obj:`~.CU_MEMORYTYPE_DEVICE`,
-    :py:obj:`~.srcDevice` and :py:obj:`~.srcPitch` specify the (device)
-    base address of the source data and the bytes per row to apply.
-    :py:obj:`~.srcArray` is ignored.
-
-    If :py:obj:`~.srcMemoryType` is :py:obj:`~.CU_MEMORYTYPE_ARRAY`,
-    :py:obj:`~.srcArray` specifies the handle of the source data.
-    :py:obj:`~.srcHost`, :py:obj:`~.srcDevice` and :py:obj:`~.srcPitch` are
-    ignored.
+    If ``srcMemoryType`` is :py:obj:`~.CU_MEMORYTYPE_HOST`, ``srcHost`` and
+    ``srcPitch`` specify the (host) base address of the source data and the
+    bytes per row to apply. ``srcArray`` is ignored.
 
-    If :py:obj:`~.dstMemoryType` is :py:obj:`~.CU_MEMORYTYPE_UNIFIED`,
-    :py:obj:`~.dstDevice` and :py:obj:`~.dstPitch` specify the (unified
-    virtual address space) base address of the source data and the bytes
-    per row to apply. :py:obj:`~.dstArray` is ignored. This value may be
-    used only if unified addressing is supported in the calling context.
-
-    If :py:obj:`~.dstMemoryType` is :py:obj:`~.CU_MEMORYTYPE_HOST`,
-    :py:obj:`~.dstHost` and :py:obj:`~.dstPitch` specify the (host) base
-    address of the destination data and the bytes per row to apply.
-    :py:obj:`~.dstArray` is ignored.
-
-    If :py:obj:`~.dstMemoryType` is :py:obj:`~.CU_MEMORYTYPE_DEVICE`,
-    :py:obj:`~.dstDevice` and :py:obj:`~.dstPitch` specify the (device)
-    base address of the destination data and the bytes per row to apply.
-    :py:obj:`~.dstArray` is ignored.
-
-    If :py:obj:`~.dstMemoryType` is :py:obj:`~.CU_MEMORYTYPE_ARRAY`,
-    :py:obj:`~.dstArray` specifies the handle of the destination data.
-    :py:obj:`~.dstHost`, :py:obj:`~.dstDevice` and :py:obj:`~.dstPitch` are
-    ignored.
+    If ``srcMemoryType`` is :py:obj:`~.CU_MEMORYTYPE_UNIFIED`,
+    ``srcDevice`` and ``srcPitch`` specify the (unified virtual address
+    space) base address of the source data and the bytes per row to apply.
+    ``srcArray`` is ignored. This value may be used only if unified
+    addressing is supported in the calling context.
+
+    If ``srcMemoryType`` is :py:obj:`~.CU_MEMORYTYPE_DEVICE`, ``srcDevice``
+    and ``srcPitch`` specify the (device) base address of the source data
+    and the bytes per row to apply. ``srcArray`` is ignored.
+
+    If ``srcMemoryType`` is :py:obj:`~.CU_MEMORYTYPE_ARRAY`, ``srcArray``
+    specifies the handle of the source data. ``srcHost``, ``srcDevice`` and
+    ``srcPitch`` are ignored.
+
+    If ``dstMemoryType`` is :py:obj:`~.CU_MEMORYTYPE_UNIFIED`,
+    ``dstDevice`` and ``dstPitch`` specify the (unified virtual address
+    space) base address of the destination data and the bytes per row to apply.
+    ``dstArray`` is ignored. This value may be used only if unified
+    addressing is supported in the calling context.
+
+    If ``dstMemoryType`` is :py:obj:`~.CU_MEMORYTYPE_HOST`, ``dstHost`` and
+    ``dstPitch`` specify the (host) base address of the destination data
+    and the bytes per row to apply. ``dstArray`` is ignored.
+
+    If ``dstMemoryType`` is :py:obj:`~.CU_MEMORYTYPE_DEVICE`, ``dstDevice``
+    and ``dstPitch`` specify the (device) base address of the destination
+    data and the bytes per row to apply. ``dstArray`` is ignored.
+
+    If ``dstMemoryType`` is :py:obj:`~.CU_MEMORYTYPE_ARRAY`, ``dstArray``
+    specifies the handle of the destination data. ``dstHost``,
+    ``dstDevice`` and ``dstPitch`` are ignored.
 
-    - :py:obj:`~.srcXInBytes` and :py:obj:`~.srcY` specify the base address
-      of the source data for the copy.
+    - ``srcXInBytes`` and ``srcY`` specify the base address of the source
+      data for the copy.
 
     For host pointers, the starting address is
 
@@ -34023,11 +34048,11 @@ def cuMemcpy2DAsync(pCopy : Optional[CUDA_MEMCPY2D], hStream):
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
-    For CUDA arrays, :py:obj:`~.srcXInBytes` must be evenly divisible by
-    the array element size.
+    For CUDA arrays, ``srcXInBytes`` must be evenly divisible by the array
+    element size.
 
-    - :py:obj:`~.dstXInBytes` and :py:obj:`~.dstY` specify the base address
-      of the destination data for the copy.
+    - ``dstXInBytes`` and ``dstY`` specify the base address of the
+      destination data for the copy.
 
     For host pointers, the base address is
 
@@ -34037,25 +34062,23 @@ def cuMemcpy2DAsync(pCopy : Optional[CUDA_MEMCPY2D], hStream):
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
-    For CUDA arrays, :py:obj:`~.dstXInBytes` must be evenly divisible by
-    the array element size.
+    For CUDA arrays, ``dstXInBytes`` must be evenly divisible by the array
+    element size.
 
-    - :py:obj:`~.WidthInBytes` and :py:obj:`~.Height` specify the width (in
-      bytes) and height of the 2D copy being performed.
+    - ``WidthInBytes`` and ``Height`` specify the width (in bytes) and
+      height of the 2D copy being performed.
 
-    - If specified, :py:obj:`~.srcPitch` must be greater than or equal to
-      :py:obj:`~.WidthInBytes` + :py:obj:`~.srcXInBytes`, and
-      :py:obj:`~.dstPitch` must be greater than or equal to
-      :py:obj:`~.WidthInBytes` + dstXInBytes.
+    - If specified, ``srcPitch`` must be greater than or equal to
+      ``WidthInBytes`` + ``srcXInBytes``, and ``dstPitch`` must be greater
+      than or equal to ``WidthInBytes`` + ``dstXInBytes``.
 
-    - If specified, :py:obj:`~.srcPitch` must be greater than or equal to
-      :py:obj:`~.WidthInBytes` + :py:obj:`~.srcXInBytes`, and
-      :py:obj:`~.dstPitch` must be greater than or equal to
-      :py:obj:`~.WidthInBytes` + dstXInBytes.
+    - If specified, ``srcPitch`` must be greater than or equal to
+      ``WidthInBytes`` + ``srcXInBytes``, and ``dstPitch`` must be greater
+      than or equal to ``WidthInBytes`` + ``dstXInBytes``.
 
-    - If specified, :py:obj:`~.srcHeight` must be greater than or equal to
-      :py:obj:`~.Height` + :py:obj:`~.srcY`, and :py:obj:`~.dstHeight` must
-      be greater than or equal to :py:obj:`~.Height` + :py:obj:`~.dstY`.
+    - If specified, ``srcHeight`` must be greater than or equal to
+      ``Height`` + ``srcY``, and ``dstHeight`` must be greater than or
+      equal to ``Height`` + ``dstY``.
 
     :py:obj:`~.cuMemcpy2DAsync()` returns an error if any pitch is greater
     than the maximum allowed (:py:obj:`~.CU_DEVICE_ATTRIBUTE_MAX_PITCH`).
@@ -34074,12 +34097,12 @@ def cuMemcpy2DAsync(pCopy : Optional[CUDA_MEMCPY2D], hStream):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`
 
     See Also
     --------
-    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D8Async`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D16Async`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD2D32Async`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD8Async`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD16Async`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cuMemsetD32Async`, :py:obj:`~.cudaMemcpy2DAsync`, :py:obj:`~.cudaMemcpy2DToArrayAsync`, :py:obj:`~.cudaMemcpy2DFromArrayAsync`
+    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D8Async`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D16Async`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD2D32Async`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD8Async`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD16Async`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cuMemsetD32Async`, :func:`~.cudaMemcpy2DAsync`, :func:`~.cudaMemcpy2DToArrayAsync`, :func:`~.cudaMemcpy2DFromArrayAsync`
     """
     cdef cydriver.CUstream cyhStream
     if hStream is None:
@@ -34108,58 +34131,54 @@ def cuMemcpy3DAsync(pCopy : Optional[CUDA_MEMCPY3D], hStream):
 
     where:
 
-    - :py:obj:`~.srcMemoryType` and :py:obj:`~.dstMemoryType` specify the
-      type of memory of the source and destination, respectively;
+    - ``srcMemoryType`` and ``dstMemoryType`` specify the type of memory of
+      the source and destination, respectively;
       :py:obj:`~.CUmemorytype_enum` is defined as:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
-    If :py:obj:`~.srcMemoryType` is :py:obj:`~.CU_MEMORYTYPE_UNIFIED`,
-    :py:obj:`~.srcDevice` and :py:obj:`~.srcPitch` specify the (unified
-    virtual address space) base address of the source data and the bytes
-    per row to apply. :py:obj:`~.srcArray` is ignored. This value may be
-    used only if unified addressing is supported in the calling context.
-
-    If :py:obj:`~.srcMemoryType` is :py:obj:`~.CU_MEMORYTYPE_HOST`,
-    :py:obj:`~.srcHost`, :py:obj:`~.srcPitch` and :py:obj:`~.srcHeight`
-    specify the (host) base address of the source data, the bytes per row,
-    and the height of each 2D slice of the 3D array. :py:obj:`~.srcArray`
-    is ignored.
-
-    If :py:obj:`~.srcMemoryType` is :py:obj:`~.CU_MEMORYTYPE_DEVICE`,
-    :py:obj:`~.srcDevice`, :py:obj:`~.srcPitch` and :py:obj:`~.srcHeight`
-    specify the (device) base address of the source data, the bytes per
-    row, and the height of each 2D slice of the 3D array.
-    :py:obj:`~.srcArray` is ignored.
-
-    If :py:obj:`~.srcMemoryType` is :py:obj:`~.CU_MEMORYTYPE_ARRAY`,
-    :py:obj:`~.srcArray` specifies the handle of the source data.
-    :py:obj:`~.srcHost`, :py:obj:`~.srcDevice`, :py:obj:`~.srcPitch` and
-    :py:obj:`~.srcHeight` are ignored.
-
-    If :py:obj:`~.dstMemoryType` is :py:obj:`~.CU_MEMORYTYPE_UNIFIED`,
-    :py:obj:`~.dstDevice` and :py:obj:`~.dstPitch` specify the (unified
-    virtual address space) base address of the source data and the bytes
-    per row to apply. :py:obj:`~.dstArray` is ignored. This value may be
-    used only if unified addressing is supported in the calling context.
-
-    If :py:obj:`~.dstMemoryType` is :py:obj:`~.CU_MEMORYTYPE_HOST`,
-    :py:obj:`~.dstHost` and :py:obj:`~.dstPitch` specify the (host) base
-    address of the destination data, the bytes per row, and the height of
-    each 2D slice of the 3D array. :py:obj:`~.dstArray` is ignored.
-
-    If :py:obj:`~.dstMemoryType` is :py:obj:`~.CU_MEMORYTYPE_DEVICE`,
-    :py:obj:`~.dstDevice` and :py:obj:`~.dstPitch` specify the (device)
-    base address of the destination data, the bytes per row, and the height
-    of each 2D slice of the 3D array. :py:obj:`~.dstArray` is ignored.
-
-    If :py:obj:`~.dstMemoryType` is :py:obj:`~.CU_MEMORYTYPE_ARRAY`,
-    :py:obj:`~.dstArray` specifies the handle of the destination data.
-    :py:obj:`~.dstHost`, :py:obj:`~.dstDevice`, :py:obj:`~.dstPitch` and
-    :py:obj:`~.dstHeight` are ignored.
-
-    - :py:obj:`~.srcXInBytes`, :py:obj:`~.srcY` and :py:obj:`~.srcZ`
-      specify the base address of the source data for the copy.
+    If ``srcMemoryType`` is :py:obj:`~.CU_MEMORYTYPE_UNIFIED`,
+    ``srcDevice`` and ``srcPitch`` specify the (unified virtual address
+    space) base address of the source data and the bytes per row to apply.
+    ``srcArray`` is ignored. This value may be used only if unified
+    addressing is supported in the calling context.
+
+    If ``srcMemoryType`` is :py:obj:`~.CU_MEMORYTYPE_HOST`, ``srcHost``,
+    ``srcPitch`` and ``srcHeight`` specify the (host) base address of the
+    source data, the bytes per row, and the height of each 2D slice of the
+    3D array. ``srcArray`` is ignored.
+
+    If ``srcMemoryType`` is :py:obj:`~.CU_MEMORYTYPE_DEVICE`,
+    ``srcDevice``, ``srcPitch`` and ``srcHeight`` specify the (device) base
+    address of the source data, the bytes per row, and the height of each
+    2D slice of the 3D array. ``srcArray`` is ignored.
+
+    If ``srcMemoryType`` is :py:obj:`~.CU_MEMORYTYPE_ARRAY`, ``srcArray``
+    specifies the handle of the source data. ``srcHost``, ``srcDevice``,
+    ``srcPitch`` and ``srcHeight`` are ignored.
+
+    If ``dstMemoryType`` is :py:obj:`~.CU_MEMORYTYPE_UNIFIED`,
+    ``dstDevice`` and ``dstPitch`` specify the (unified virtual address
+    space) base address of the destination data and the bytes per row to apply.
+    ``dstArray`` is ignored. This value may be used only if unified
+    addressing is supported in the calling context.
+
+    If ``dstMemoryType`` is :py:obj:`~.CU_MEMORYTYPE_HOST`, ``dstHost``,
+    ``dstPitch`` and ``dstHeight`` specify the (host) base address of the
+    destination data, the bytes per row, and the height of each 2D slice
+    of the 3D array. ``dstArray`` is ignored.
+
+    If ``dstMemoryType`` is :py:obj:`~.CU_MEMORYTYPE_DEVICE`,
+    ``dstDevice``, ``dstPitch`` and ``dstHeight`` specify the (device) base
+    address of the destination data, the bytes per row, and the height of
+    each 2D slice of the 3D array. ``dstArray`` is ignored.
+
+    If ``dstMemoryType`` is :py:obj:`~.CU_MEMORYTYPE_ARRAY`, ``dstArray``
+    specifies the handle of the destination data. ``dstHost``,
+    ``dstDevice``, ``dstPitch`` and ``dstHeight`` are ignored.
+
+    - ``srcXInBytes``, ``srcY`` and ``srcZ`` specify the base address of
+      the source data for the copy.
 
     For host pointers, the starting address is
 
@@ -34169,11 +34188,11 @@ def cuMemcpy3DAsync(pCopy : Optional[CUDA_MEMCPY3D], hStream):
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
-    For CUDA arrays, :py:obj:`~.srcXInBytes` must be evenly divisible by
-    the array element size.
+    For CUDA arrays, ``srcXInBytes`` must be evenly divisible by the array
+    element size.
 
-    - dstXInBytes, :py:obj:`~.dstY` and :py:obj:`~.dstZ` specify the base
-      address of the destination data for the copy.
+    - ``dstXInBytes``, ``dstY`` and ``dstZ`` specify the base address of the
+      destination data for the copy.
 
     For host pointers, the base address is
 
@@ -34183,27 +34202,25 @@ def cuMemcpy3DAsync(pCopy : Optional[CUDA_MEMCPY3D], hStream):
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
-    For CUDA arrays, :py:obj:`~.dstXInBytes` must be evenly divisible by
-    the array element size.
+    For CUDA arrays, ``dstXInBytes`` must be evenly divisible by the array
+    element size.
 
-    - :py:obj:`~.WidthInBytes`, :py:obj:`~.Height` and :py:obj:`~.Depth`
-      specify the width (in bytes), height and depth of the 3D copy being
-      performed.
+    - ``WidthInBytes``, ``Height`` and ``Depth`` specify the width (in
+      bytes), height and depth of the 3D copy being performed.
 
-    - If specified, :py:obj:`~.srcPitch` must be greater than or equal to
-      :py:obj:`~.WidthInBytes` + :py:obj:`~.srcXInBytes`, and
-      :py:obj:`~.dstPitch` must be greater than or equal to
-      :py:obj:`~.WidthInBytes` + dstXInBytes.
+    - If specified, ``srcPitch`` must be greater than or equal to
+      ``WidthInBytes`` + ``srcXInBytes``, and ``dstPitch`` must be greater
+      than or equal to ``WidthInBytes`` + ``dstXInBytes``.
 
-    - If specified, :py:obj:`~.srcHeight` must be greater than or equal to
-      :py:obj:`~.Height` + :py:obj:`~.srcY`, and :py:obj:`~.dstHeight` must
-      be greater than or equal to :py:obj:`~.Height` + :py:obj:`~.dstY`.
+    - If specified, ``srcHeight`` must be greater than or equal to
+      ``Height`` + ``srcY``, and ``dstHeight`` must be greater than or
+      equal to ``Height`` + ``dstY``.
 
     :py:obj:`~.cuMemcpy3DAsync()` returns an error if any pitch is greater
     than the maximum allowed (:py:obj:`~.CU_DEVICE_ATTRIBUTE_MAX_PITCH`).
 
-    The :py:obj:`~.srcLOD` and :py:obj:`~.dstLOD` members of the
-    :py:obj:`~.CUDA_MEMCPY3D` structure must be set to 0.
+    The ``srcLOD`` and ``dstLOD`` members of the :py:obj:`~.CUDA_MEMCPY3D`
+    structure must be set to 0.
 
     Parameters
     ----------
@@ -34214,12 +34231,12 @@ def cuMemcpy3DAsync(pCopy : Optional[CUDA_MEMCPY3D], hStream):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`
 
     See Also
     --------
-    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D8Async`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D16Async`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD2D32Async`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD8Async`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD16Async`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cuMemsetD32Async`, :py:obj:`~.cudaMemcpy3DAsync`
+    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D8Async`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D16Async`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD2D32Async`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD8Async`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD16Async`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cuMemsetD32Async`, :func:`~.cudaMemcpy3DAsync`
     """
     cdef cydriver.CUstream cyhStream
     if hStream is None:
@@ -34254,12 +34271,12 @@ def cuMemcpy3DPeerAsync(pCopy : Optional[CUDA_MEMCPY3D_PEER], hStream):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
     --------
-    :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyPeer`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyPeerAsync`, :py:obj:`~.cuMemcpy3DPeerAsync`, :py:obj:`~.cudaMemcpy3DPeerAsync`
+    :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyPeer`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyPeerAsync`, :py:obj:`~.cuMemcpy3DPeerAsync`, :func:`~.cudaMemcpy3DPeerAsync`
     """
     cdef cydriver.CUstream cyhStream
     if hStream is None:
@@ -34376,7 +34393,7 @@ def cuMemcpyBatchAsync(dsts : Optional[tuple[CUdeviceptr] | list[CUdeviceptr]],
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     """
     cdef cydriver.CUstream cyhStream
@@ -34475,31 +34492,32 @@ def cuMemcpy3DBatchAsync(size_t numOps, opList : Optional[tuple[CUDA_MEMCPY3D_BA
 
     For a given operand, if :py:obj:`~.CUmemcpy3DOperand.type` is specified
     as :py:obj:`~.CU_MEMCPY_OPERAND_TYPE_POINTER`, then
-    :py:obj:`~.CUmemcpy3DOperand.op.ptr` will be used. The
-    :py:obj:`~.CUmemcpy3DOperand.op.ptr.ptr` field must contain the pointer
-    where the copy should begin. The
-    :py:obj:`~.CUmemcpy3DOperand.op.ptr.rowLength` field specifies the
-    length of each row in elements and must either be zero or be greater
-    than or equal to the width of the copy specified in
-    :py:obj:`~.CUDA_MEMCPY3D_BATCH_OP.extent.width`. The
-    :py:obj:`~.CUmemcpy3DOperand.op.ptr.layerHeight` field specifies the
-    height of each layer and must either be zero or be greater than or
+    :py:obj:`~.CUmemcpy3DOperand`.op.ptr will be used. The
+    :py:obj:`~.CUmemcpy3DOperand`.op.ptr.ptr field must contain the
+    pointer where the copy should begin. The
+    :py:obj:`~.CUmemcpy3DOperand`.op.ptr.rowLength field specifies
+    the length of each row in elements and must either be zero or be
+    greater than or equal to the width of the copy specified in
+    :py:obj:`~.CUDA_MEMCPY3D_BATCH_OP`.extent.width. The
+    :py:obj:`~.CUmemcpy3DOperand`.op.ptr.layerHeight field specifies
+    the height of each layer and must either be zero or be greater than or
     equal to the height of the copy specified in
-    :py:obj:`~.CUDA_MEMCPY3D_BATCH_OP.extent.height`. When either of these
-    values is zero, that aspect of the operand is considered to be tightly
-    packed according to the copy extent. For managed memory pointers on
-    devices where :py:obj:`~.CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS`
-    is true or system-allocated pageable memory on devices where
+    :py:obj:`~.CUDA_MEMCPY3D_BATCH_OP`.extent.height. When either of
+    these values is zero, that aspect of the operand is considered to be
+    tightly packed according to the copy extent. For managed memory
+    pointers on devices where
+    :py:obj:`~.CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS` is true or
+    system-allocated pageable memory on devices where
     :py:obj:`~.CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS` is true, the
-    :py:obj:`~.CUmemcpy3DOperand.op.ptr.locHint` field can be used to hint
-    the location of the operand.
+    :py:obj:`~.CUmemcpy3DOperand`.op.ptr.locHint field can be used
+    to hint the location of the operand.
 
     If an operand's type is specified as
     :py:obj:`~.CU_MEMCPY_OPERAND_TYPE_ARRAY`, then
-    :py:obj:`~.CUmemcpy3DOperand.op.array` will be used. The
-    :py:obj:`~.CUmemcpy3DOperand.op.array.array` field specifies the CUDA
-    array and :py:obj:`~.CUmemcpy3DOperand.op.array.offset` specifies the
-    3D offset into that array where the copy begins.
+    :py:obj:`~.CUmemcpy3DOperand`.op.array will be used. The
+    :py:obj:`~.CUmemcpy3DOperand`.op.array.array field specifies the
+    CUDA array and :py:obj:`~.CUmemcpy3DOperand`.op.array.offset
+    specifies the 3D offset into that array where the copy begins.
 
     The :py:obj:`~.CUmemcpyAttributes.srcAccessOrder` indicates the source
     access ordering to be observed for copies associated with the
@@ -34546,7 +34564,7 @@ def cuMemcpy3DBatchAsync(size_t numOps, opList : Optional[tuple[CUDA_MEMCPY3D_BA
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     """
     cdef cydriver.CUstream cyhStream
@@ -34593,8 +34611,8 @@ def cuMemcpyWithAttributesAsync(dst, src, size_t size, attr : Optional[CUmemcpyA
     the operation in.
 
     For more information regarding the attributes, please refer to
-    :py:obj:`~.CUmemcpyAttributes` and it's usage desciption
-    in::cuMemcpyBatchAsync
+    :py:obj:`~.CUmemcpyAttributes` and its usage description in
+    :py:obj:`~.cuMemcpyBatchAsync`
 
     Parameters
     ----------
@@ -34611,7 +34629,7 @@ def cuMemcpyWithAttributesAsync(dst, src, size_t size, attr : Optional[CUmemcpyA
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
@@ -34661,8 +34679,8 @@ def cuMemcpy3DWithAttributesAsync(op : Optional[CUDA_MEMCPY3D_BATCH_OP], unsigne
     the operation in.
 
     For more information regarding the operation, please refer to
-    :py:obj:`~.CUDA_MEMCPY3D_BATCH_OP` and it's usage desciption
-    in::cuMemcpy3DBatchAsync
+    :py:obj:`~.CUDA_MEMCPY3D_BATCH_OP` and its usage description in
+    :py:obj:`~.cuMemcpy3DBatchAsync`
 
     Parameters
     ----------
@@ -34675,7 +34693,7 @@ def cuMemcpy3DWithAttributesAsync(op : Optional[CUDA_MEMCPY3D_BATCH_OP], unsigne
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
@@ -34716,12 +34734,12 @@ def cuMemsetD8(dstDevice, unsigned char uc, size_t N):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
     --------
-    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D8Async`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D16Async`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD2D32Async`, :py:obj:`~.cuMemsetD8Async`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD16Async`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cuMemsetD32Async`, :py:obj:`~.cudaMemset`
+    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D8Async`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D16Async`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD2D32Async`, :py:obj:`~.cuMemsetD8Async`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD16Async`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cuMemsetD32Async`, :func:`~.cudaMemset`
     """
     cdef cydriver.CUdeviceptr cydstDevice
     if dstDevice is None:
@@ -34756,12 +34774,12 @@ def cuMemsetD16(dstDevice, unsigned short us, size_t N):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
     --------
-    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D8Async`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D16Async`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD2D32Async`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD8Async`, :py:obj:`~.cuMemsetD16Async`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cuMemsetD32Async`, :py:obj:`~.cudaMemset`
+    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D8Async`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D16Async`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD2D32Async`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD8Async`, :py:obj:`~.cuMemsetD16Async`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cuMemsetD32Async`, :func:`~.cudaMemset`
     """
     cdef cydriver.CUdeviceptr cydstDevice
     if dstDevice is None:
@@ -34796,12 +34814,12 @@ def cuMemsetD32(dstDevice, unsigned int ui, size_t N):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
     --------
-    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D8Async`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D16Async`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD2D32Async`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD8Async`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD16Async`, :py:obj:`~.cuMemsetD32Async`, :py:obj:`~.cudaMemset`
+    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D8Async`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D16Async`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD2D32Async`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD8Async`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD16Async`, :py:obj:`~.cuMemsetD32Async`, :func:`~.cudaMemset`
     """
     cdef cydriver.CUdeviceptr cydstDevice
     if dstDevice is None:
@@ -34843,12 +34861,12 @@ def cuMemsetD2D8(dstDevice, size_t dstPitch, unsigned char uc, size_t Width, siz
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
     --------
-    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8Async`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D16Async`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD2D32Async`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD8Async`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD16Async`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cuMemsetD32Async`, :py:obj:`~.cudaMemset2D`
+    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8Async`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D16Async`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD2D32Async`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD8Async`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD16Async`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cuMemsetD32Async`, :func:`~.cudaMemset2D`
     """
     cdef cydriver.CUdeviceptr cydstDevice
     if dstDevice is None:
@@ -34891,12 +34909,12 @@ def cuMemsetD2D16(dstDevice, size_t dstPitch, unsigned short us, size_t Width, s
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
     --------
-    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D8Async`, :py:obj:`~.cuMemsetD2D16Async`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD2D32Async`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD8Async`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD16Async`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cuMemsetD32Async`, :py:obj:`~.cudaMemset2D`
+    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D8Async`, :py:obj:`~.cuMemsetD2D16Async`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD2D32Async`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD8Async`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD16Async`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cuMemsetD32Async`, :func:`~.cudaMemset2D`
     """
     cdef cydriver.CUdeviceptr cydstDevice
     if dstDevice is None:
@@ -34939,12 +34957,12 @@ def cuMemsetD2D32(dstDevice, size_t dstPitch, unsigned int ui, size_t Width, siz
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
     --------
-    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D8Async`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D16Async`, :py:obj:`~.cuMemsetD2D32Async`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD8Async`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD16Async`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cuMemsetD32Async`, :py:obj:`~.cudaMemset2D`
+    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D8Async`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D16Async`, :py:obj:`~.cuMemsetD2D32Async`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD8Async`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD16Async`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cuMemsetD32Async`, :func:`~.cudaMemset2D`
     """
     cdef cydriver.CUdeviceptr cydstDevice
     if dstDevice is None:
@@ -34981,12 +34999,12 @@ def cuMemsetD8Async(dstDevice, unsigned char uc, size_t N, hStream):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
     --------
-    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D8Async`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D16Async`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD2D32Async`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD16Async`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cuMemsetD32Async`, :py:obj:`~.cudaMemsetAsync`
+    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D8Async`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D16Async`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD2D32Async`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD16Async`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cuMemsetD32Async`, :func:`~.cudaMemsetAsync`
     """
     cdef cydriver.CUstream cyhStream
     if hStream is None:
@@ -35031,12 +35049,12 @@ def cuMemsetD16Async(dstDevice, unsigned short us, size_t N, hStream):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
     --------
-    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D8Async`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D16Async`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD2D32Async`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD8Async`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cuMemsetD32Async`, :py:obj:`~.cudaMemsetAsync`
+    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D8Async`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D16Async`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD2D32Async`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD8Async`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cuMemsetD32Async`, :func:`~.cudaMemsetAsync`
     """
     cdef cydriver.CUstream cyhStream
     if hStream is None:
@@ -35081,12 +35099,12 @@ def cuMemsetD32Async(dstDevice, unsigned int ui, size_t N, hStream):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
     --------
-    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D8Async`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D16Async`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD2D32Async`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD8Async`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD16Async`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cudaMemsetAsync`
+    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D8Async`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D16Async`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD2D32Async`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD8Async`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD16Async`, :py:obj:`~.cuMemsetD32`, :func:`~.cudaMemsetAsync`
     """
     cdef cydriver.CUstream cyhStream
     if hStream is None:
@@ -35138,12 +35156,12 @@ def cuMemsetD2D8Async(dstDevice, size_t dstPitch, unsigned char uc, size_t Width
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
     --------
-    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D16Async`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD2D32Async`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD8Async`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD16Async`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cuMemsetD32Async`, :py:obj:`~.cudaMemset2DAsync`
+    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D16Async`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD2D32Async`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD8Async`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD16Async`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cuMemsetD32Async`, :func:`~.cudaMemset2DAsync`
     """
     cdef cydriver.CUstream cyhStream
     if hStream is None:
@@ -35196,12 +35214,12 @@ def cuMemsetD2D16Async(dstDevice, size_t dstPitch, unsigned short us, size_t Wid
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
     --------
-    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D8Async`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD2D32Async`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD8Async`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD16Async`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cuMemsetD32Async`, :py:obj:`~.cudaMemset2DAsync`
+    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D8Async`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD2D32Async`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD8Async`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD16Async`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cuMemsetD32Async`, :func:`~.cudaMemset2DAsync`
     """
     cdef cydriver.CUstream cyhStream
     if hStream is None:
@@ -35254,12 +35272,12 @@ def cuMemsetD2D32Async(dstDevice, size_t dstPitch, unsigned int ui, size_t Width
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
     --------
-    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D8Async`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D16Async`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD8Async`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD16Async`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cuMemsetD32Async`, :py:obj:`~.cudaMemset2DAsync`
+    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D8Async`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D16Async`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD8Async`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD16Async`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cuMemsetD32Async`, :func:`~.cudaMemset2DAsync`
     """
     cdef cydriver.CUstream cyhStream
     if hStream is None:
@@ -35300,7 +35318,7 @@ def cuArrayCreate(pAllocateArray : Optional[CUDA_ARRAY_DESCRIPTOR]):
       (in elements); the CUDA array is one-dimensional if height is 0, two-
       dimensional otherwise;
 
-    - :py:obj:`~.Format` specifies the format of the elements;
+    - ``Format`` specifies the format of the elements;
       :py:obj:`~.CUarray_format` is defined as:
 
     - **View CUDA Toolkit Documentation for a C++ code example**
@@ -35335,14 +35353,14 @@ def cuArrayCreate(pAllocateArray : Optional[CUDA_ARRAY_DESCRIPTOR]):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`, :py:obj:`~.CUDA_ERROR_UNKNOWN`
     pHandle : :py:obj:`~.CUarray`
         Returned array
 
     See Also
     --------
-    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cudaMallocArray`
+    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :func:`~.cudaMallocArray`
     """
     cdef CUarray pHandle = CUarray()
     cdef cydriver.CUDA_ARRAY_DESCRIPTOR* cypAllocateArray_ptr = <cydriver.CUDA_ARRAY_DESCRIPTOR*>pAllocateArray._pvt_ptr if pAllocateArray is not None else NULL
@@ -35371,14 +35389,14 @@ def cuArrayGetDescriptor(hArray):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`
     pArrayDescriptor : :py:obj:`~.CUDA_ARRAY_DESCRIPTOR`
         Returned array descriptor
 
     See Also
     --------
-    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cudaArrayGetInfo`
+    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :func:`~.cudaArrayGetInfo`
     """
     cdef cydriver.CUarray cyhArray
     if hArray is None:
@@ -35427,7 +35445,7 @@ def cuArrayGetSparseProperties(array):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS` :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     sparseProperties : :py:obj:`~.CUDA_ARRAY_SPARSE_PROPERTIES`
         Pointer to :py:obj:`~.CUDA_ARRAY_SPARSE_PROPERTIES`
@@ -35485,7 +35503,7 @@ def cuMipmappedArrayGetSparseProperties(mipmap):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS` :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     sparseProperties : :py:obj:`~.CUDA_ARRAY_SPARSE_PROPERTIES`
         Pointer to :py:obj:`~.CUDA_ARRAY_SPARSE_PROPERTIES`
@@ -35535,7 +35553,7 @@ def cuArrayGetMemoryRequirements(array, device):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS` :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     memoryRequirements : :py:obj:`~.CUDA_ARRAY_MEMORY_REQUIREMENTS`
         Pointer to :py:obj:`~.CUDA_ARRAY_MEMORY_REQUIREMENTS`
@@ -35594,7 +35612,7 @@ def cuMipmappedArrayGetMemoryRequirements(mipmap, device):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS` :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     memoryRequirements : :py:obj:`~.CUDA_ARRAY_MEMORY_REQUIREMENTS`
         Pointer to :py:obj:`~.CUDA_ARRAY_MEMORY_REQUIREMENTS`
@@ -35658,14 +35676,14 @@ def cuArrayGetPlane(hArray, unsigned int planeIdx):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`
     pPlaneArray : :py:obj:`~.CUarray`
         Returned CUDA array referenced by the ``planeIdx``
 
     See Also
     --------
-    :py:obj:`~.cuArrayCreate`, :py:obj:`~.cudaArrayGetPlane`
+    :py:obj:`~.cuArrayCreate`, :func:`~.cudaArrayGetPlane`
     """
     cdef cydriver.CUarray cyhArray
     if hArray is None:
@@ -35698,12 +35716,12 @@ def cuArrayDestroy(hArray):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_ARRAY_IS_MAPPED`, :py:obj:`~.CUDA_ERROR_CONTEXT_IS_DESTROYED`
 
     See Also
     --------
-    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cudaFreeArray`
+    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :func:`~.cudaFreeArray`
     """
     cdef cydriver.CUarray cyhArray
     if hArray is None:
@@ -35770,7 +35788,7 @@ def cuArray3DCreate(pAllocateArray : Optional[CUDA_ARRAY3D_DESCRIPTOR]):
         layers represent the first cubemap, the next six layers form the
         second cubemap, and so on.
 
-    - :py:obj:`~.Format` specifies the format of the elements;
+    - ``Format`` specifies the format of the elements;
       :py:obj:`~.CUarray_format` is defined as:
 
     - **View CUDA Toolkit Documentation for a C++ code example**
@@ -35778,7 +35796,7 @@ def cuArray3DCreate(pAllocateArray : Optional[CUDA_ARRAY3D_DESCRIPTOR]):
     - ``NumChannels`` specifies the number of packed components per CUDA
       array element; it may be 1, 2, or 4;
 
-    - :py:obj:`~.Flags` may be set to
+    - ``Flags`` may be set to
 
       - :py:obj:`~.CUDA_ARRAY3D_LAYERED` to enable creation of layered CUDA
         arrays. If this flag is set, ``Depth`` specifies the number of
@@ -35835,14 +35853,14 @@ def cuArray3DCreate(pAllocateArray : Optional[CUDA_ARRAY3D_DESCRIPTOR]):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`, :py:obj:`~.CUDA_ERROR_UNKNOWN`
     pHandle : :py:obj:`~.CUarray`
         Returned array
 
     See Also
     --------
-    :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cudaMalloc3DArray`
+    :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :func:`~.cudaMalloc3DArray`
     """
     cdef CUarray pHandle = CUarray()
     cdef cydriver.CUDA_ARRAY3D_DESCRIPTOR* cypAllocateArray_ptr = <cydriver.CUDA_ARRAY3D_DESCRIPTOR*>pAllocateArray._pvt_ptr if pAllocateArray is not None else NULL
@@ -35865,8 +35883,8 @@ def cuArray3DGetDescriptor(hArray):
     the CUDA array parameters for validation or other purposes.
 
     This function may be called on 1D and 2D arrays, in which case the
-    ``Height`` and/or ``Depth`` members of the descriptor struct will be
-    set to 0.
+    ``Height`` and/or ``Depth`` members of the descriptor struct will be
+    set to 0.
 
     Parameters
     ----------
@@ -35875,14 +35893,14 @@ def cuArray3DGetDescriptor(hArray):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_CONTEXT_IS_DESTROYED`
     pArrayDescriptor : :py:obj:`~.CUDA_ARRAY3D_DESCRIPTOR`
         Returned 3D array descriptor
 
     See Also
     --------
-    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cudaArrayGetInfo`
+    :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :func:`~.cudaArrayGetInfo`
     """
     cdef cydriver.CUarray cyhArray
     if hArray is None:
@@ -35958,7 +35976,7 @@ def cuMipmappedArrayCreate(pMipmappedArrayDesc : Optional[CUDA_ARRAY3D_DESCRIPTO
         layers represent the first cubemap, the next six layers form the
         second cubemap, and so on.
 
-    - :py:obj:`~.Format` specifies the format of the elements;
+    - ``Format`` specifies the format of the elements;
       :py:obj:`~.CUarray_format` is defined as:
 
     - **View CUDA Toolkit Documentation for a C++ code example**
@@ -35966,7 +35984,7 @@ def cuMipmappedArrayCreate(pMipmappedArrayDesc : Optional[CUDA_ARRAY3D_DESCRIPTO
     - ``NumChannels`` specifies the number of packed components per CUDA
       array element; it may be 1, 2, or 4;
 
-    - :py:obj:`~.Flags` may be set to
+    - ``Flags`` may be set to
 
       - :py:obj:`~.CUDA_ARRAY3D_LAYERED` to enable creation of layered CUDA
         mipmapped arrays. If this flag is set, ``Depth`` specifies the
@@ -36006,14 +36024,14 @@ def cuMipmappedArrayCreate(pMipmappedArrayDesc : Optional[CUDA_ARRAY3D_DESCRIPTO
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`, :py:obj:`~.CUDA_ERROR_UNKNOWN`
     pHandle : :py:obj:`~.CUmipmappedArray`
         Returned mipmapped array
 
     See Also
     --------
-    :py:obj:`~.cuMipmappedArrayDestroy`, :py:obj:`~.cuMipmappedArrayGetLevel`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cudaMallocMipmappedArray`
+    :py:obj:`~.cuMipmappedArrayDestroy`, :py:obj:`~.cuMipmappedArrayGetLevel`, :py:obj:`~.cuArrayCreate`, :func:`~.cudaMallocMipmappedArray`
     """
     cdef CUmipmappedArray pHandle = CUmipmappedArray()
     cdef cydriver.CUDA_ARRAY3D_DESCRIPTOR* cypMipmappedArrayDesc_ptr = <cydriver.CUDA_ARRAY3D_DESCRIPTOR*>pMipmappedArrayDesc._pvt_ptr if pMipmappedArrayDesc is not None else NULL
@@ -36045,14 +36063,14 @@ def cuMipmappedArrayGetLevel(hMipmappedArray, unsigned int level):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`
     pLevelArray : :py:obj:`~.CUarray`
         Returned mipmap level CUDA array
 
     See Also
     --------
-    :py:obj:`~.cuMipmappedArrayCreate`, :py:obj:`~.cuMipmappedArrayDestroy`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cudaGetMipmappedArrayLevel`
+    :py:obj:`~.cuMipmappedArrayCreate`, :py:obj:`~.cuMipmappedArrayDestroy`, :py:obj:`~.cuArrayCreate`, :func:`~.cudaGetMipmappedArrayLevel`
     """
     cdef cydriver.CUmipmappedArray cyhMipmappedArray
     if hMipmappedArray is None:
@@ -36085,12 +36103,12 @@ def cuMipmappedArrayDestroy(hMipmappedArray):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_ARRAY_IS_MAPPED`, :py:obj:`~.CUDA_ERROR_CONTEXT_IS_DESTROYED`
 
     See Also
     --------
-    :py:obj:`~.cuMipmappedArrayCreate`, :py:obj:`~.cuMipmappedArrayGetLevel`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cudaFreeMipmappedArray`
+    :py:obj:`~.cuMipmappedArrayCreate`, :py:obj:`~.cuMipmappedArrayGetLevel`, :py:obj:`~.cuArrayCreate`, :func:`~.cudaFreeMipmappedArray`
     """
     cdef cydriver.CUmipmappedArray cyhMipmappedArray
     if hMipmappedArray is None:
@@ -36112,8 +36130,9 @@ def cuMemGetHandleForAddressRange(dptr, size_t size, handleType not None : CUmem
     """ Retrieve handle for an address range.
 
     Get a handle of the specified type to an address range. When requesting
-    CUmemRangeHandleType::CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, address
-    range obtained by a prior call to either :py:obj:`~.cuMemAlloc` or
+    :py:obj:`~.CUmemRangeHandleType`
+    :py:obj:`~.CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD`, address range obtained
+    by a prior call to either :py:obj:`~.cuMemAlloc` or
     :py:obj:`~.cuMemAddressReserve` is supported if the
     :py:obj:`~.CU_DEVICE_ATTRIBUTE_DMA_BUF_SUPPORTED` device attribute
     returns true. If the address range was obtained via
@@ -36140,17 +36159,18 @@ def cuMemGetHandleForAddressRange(dptr, size_t size, handleType not None : CUmem
     new handle every time the underlying physical allocation(s)
     corresponding to a previously queried VA range are changed.
 
-    For CUmemRangeHandleType::CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD, users
-    may set flags to
+    For :py:obj:`~.CUmemRangeHandleType`
+    :py:obj:`~.CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD`, users may set flags to
     :py:obj:`~.CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE`. Which when set
     on a supported platform, will give a DMA_BUF handle mapped via PCIE
     BAR1 or will return an error otherwise.
 
     If the device attribute
     :py:obj:`~.CU_DEVICE_ATTRIBUTE_DMA_BUF_MMAP_SUPPORTED` is set and a
-    CUmemRangeHandleType::CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD is requested
-    for a device memory range then the returned dmabuf file descriptor may
-    be passed as the file descriptor argument to the mmap() system call.
+    :py:obj:`~.CUmemRangeHandleType`
+    :py:obj:`~.CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD` is requested for a
+    device memory range then the returned dmabuf file descriptor may be
+    passed as the file descriptor argument to the mmap() system call.
 
     For device memory on x86 systems the mapping will be a write combined
     mapping. On coherent ARM platforms these mappings will be regular
@@ -36167,14 +36187,14 @@ def cuMemGetHandleForAddressRange(dptr, size_t size, handleType not None : CUmem
         Type of handle requested (defines type and size of the ``handle``
         output parameter)
     flags : unsigned long long
-        When requesting
-        CUmemRangeHandleType::CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD the value
-        could be :py:obj:`~.CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE`,
-        otherwise 0.
+        When requesting :py:obj:`~.CUmemRangeHandleType`
+        :py:obj:`~.CU_MEM_RANGE_HANDLE_TYPE_DMA_BUF_FD` the value could be
+        :py:obj:`~.CU_MEM_RANGE_FLAG_DMA_BUF_MAPPING_TYPE_PCIE`, otherwise
+        0.
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
     handle : Any
         Pointer to the location where the returned handle will be stored.
@@ -36250,7 +36270,7 @@ def cuMemBatchDecompressAsync(paramsArray : Optional[CUmemDecompressParams], siz
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`
     errorIndex : int
         The index into ``paramsArray`` of the decompression operation for
@@ -36310,7 +36330,7 @@ def cuMemAddressReserve(size_t size, size_t alignment, addr, unsigned long long
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
     ptr : :py:obj:`~.CUdeviceptr`
         Resulting pointer to start of virtual address range allocated
@@ -36341,9 +36361,10 @@ def cuMemAddressReserve(size_t size, size_t alignment, addr, unsigned long long
 def cuMemAddressFree(ptr, size_t size):
     """ Free an address range reservation.
 
-    Frees a virtual address range reserved by cuMemAddressReserve. The size
-    must match what was given to memAddressReserve and the ptr given must
-    match what was returned from memAddressReserve.
+    Frees a virtual address range reserved by
+    :func:`~.cuMemAddressReserve`. The size must match what was given to
+    :func:`~.cuMemAddressReserve` and the ptr given must match what was
+    returned from :func:`~.cuMemAddressReserve`.
 
     Parameters
     ----------
@@ -36354,7 +36375,7 @@ def cuMemAddressFree(ptr, size_t size):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
 
     See Also
@@ -36394,21 +36415,22 @@ def cuMemCreate(size_t size, prop : Optional[CUmemAllocationProp], unsigned long
     via :py:obj:`~.cuMemGetAllocationGranularity` with the
     :py:obj:`~.CU_MEM_ALLOC_GRANULARITY_MINIMUM` flag. To create a CPU
     allocation that doesn't target any specific NUMA nodes, applications
-    must set :py:obj:`~.CUmemAllocationProp.CUmemLocation.type` to
+    must set
+    :py:obj:`~.CUmemAllocationProp.CUmemLocation.type` to
     :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST`.
-    :py:obj:`~.CUmemAllocationProp.CUmemLocation.id` is ignored for HOST
-    allocations. HOST allocations are not IPC capable and
+    :py:obj:`~.CUmemAllocationProp.CUmemLocation.id` is
+    ignored for HOST allocations. HOST allocations are not IPC capable and
     :py:obj:`~.CUmemAllocationProp.requestedHandleTypes` must be 0, any
     other value will result in :py:obj:`~.CUDA_ERROR_INVALID_VALUE`. To
     create a CPU allocation targeting a specific host NUMA node,
     applications must set
-    :py:obj:`~.CUmemAllocationProp.CUmemLocation.type` to
+    :py:obj:`~.CUmemAllocationProp.CUmemLocation.type` to
     :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST_NUMA` and
-    :py:obj:`~.CUmemAllocationProp.CUmemLocation.id` must specify the NUMA
-    ID of the CPU. On systems where NUMA is not available
-    :py:obj:`~.CUmemAllocationProp.CUmemLocation.id` must be set to 0.
-    Specifying :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT` as the
-    :py:obj:`~.CUmemLocation.type` will result in
+    :py:obj:`~.CUmemAllocationProp.CUmemLocation.id` must
+    specify the NUMA ID of the CPU. On systems where NUMA is not available
+    :py:obj:`~.CUmemAllocationProp.CUmemLocation.id` must be
+    set to 0. Specifying :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT`
+    as the :py:obj:`~.CUmemLocation.type` will result in
     :py:obj:`~.CUDA_ERROR_INVALID_VALUE`.
 
     Applications that intend to use :py:obj:`~.CU_MEM_HANDLE_TYPE_FABRIC`
@@ -36431,7 +36453,7 @@ def cuMemCreate(size_t size, prop : Optional[CUmemAllocationProp], unsigned long
     /proc/devices users can execute the following command: ``mknod
     /dev/nvidia-caps-imex-channels/channel0 c <major number> 0``
 
-    If :py:obj:`~.CUmemAllocationProp.allocFlags.usage` contains
+    If :py:obj:`~.CUmemAllocationProp.allocFlags.usage` contains
     :py:obj:`~.CU_MEM_CREATE_USAGE_TILE_POOL` flag then the memory
     allocation is intended only to be used as backing tile pool for sparse
     CUDA arrays and sparse CUDA mipmapped arrays. (see
@@ -36448,7 +36470,7 @@ def cuMemCreate(size_t size, prop : Optional[CUmemAllocationProp], unsigned long
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
     handle : :py:obj:`~.CUmemGenericAllocationHandle`
         Value of handle returned. All operations on this allocation are to
@@ -36471,9 +36493,10 @@ def cuMemCreate(size_t size, prop : Optional[CUmemAllocationProp], unsigned long
 
 @cython.embedsignature(True)
 def cuMemRelease(handle):
-    """ Release a memory handle representing a memory allocation which was previously allocated through cuMemCreate.
+    """ Release a memory handle representing a memory allocation which was previously allocated through :func:`~.cuMemCreate`.
 
-    Frees the memory that was allocated on a device through cuMemCreate.
+    Frees the memory that was allocated on a device through
+    :func:`~.cuMemCreate`.
 
     The memory allocation will be freed when all outstanding mappings to
     the memory are unmapped and when all outstanding references to the
@@ -36487,11 +36510,12 @@ def cuMemRelease(handle):
     Parameters
     ----------
     handle : :py:obj:`~.CUmemGenericAllocationHandle`
-        Value of handle which was returned previously by cuMemCreate.
+        Value of handle which was returned previously by
+        :func:`~.cuMemCreate`.
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
 
     See Also
@@ -36527,17 +36551,17 @@ def cuMemMap(ptr, size_t size, size_t offset, handle, unsigned long long flags):
     :py:obj:`~.CU_MEM_ALLOC_GRANULARITY_MINIMUM` flag. If ``handle``
     represents a multicast object, ``ptr``, ``size`` and ``offset`` must be
     aligned to the value returned by :py:obj:`~.cuMulticastGetGranularity`
-    with the flag :py:obj:`~.CU_MULTICAST_MINIMUM_GRANULARITY`. For best
+    with the flag ``CU_MULTICAST_MINIMUM_GRANULARITY``. For best
     performance however, it is recommended that ``ptr``, ``size`` and
     ``offset`` be aligned to the value returned by
     :py:obj:`~.cuMulticastGetGranularity` with the flag
-    :py:obj:`~.CU_MULTICAST_RECOMMENDED_GRANULARITY`.
+    ``CU_MULTICAST_RECOMMENDED_GRANULARITY``.
 
     When ``handle`` represents a multicast object, this call may return
-    CUDA_ERROR_ILLEGAL_STATE if the system configuration is in an illegal
-    state. In such cases, to continue using multicast, verify that the
-    system configuration is in a valid state and all required driver
-    daemons are running properly.
+    :py:obj:`~.CUDA_ERROR_ILLEGAL_STATE` if the system configuration is in
+    an illegal state. In such cases, to continue using multicast, verify
+    that the system configuration is in a valid state and all required
+    driver daemons are running properly.
 
     Please note calling :py:obj:`~.cuMemMap` does not make the address
     accessible, the caller needs to update accessibility of a contiguous
@@ -36566,7 +36590,7 @@ def cuMemMap(ptr, size_t size, size_t offset, handle, unsigned long long flags):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`, :py:obj:`~.CUDA_ERROR_ILLEGAL_STATE`
 
     See Also
@@ -36610,21 +36634,22 @@ def cuMemMapArrayAsync(mapInfoList : Optional[tuple[CUarrayMapInfo] | list[CUarr
 
     where :py:obj:`~.CUarrayMapInfo.resourceType` specifies the type of
     resource to be operated on. If :py:obj:`~.CUarrayMapInfo.resourceType`
-    is set to :py:obj:`~.CUresourcetype.CU_RESOURCE_TYPE_ARRAY` then
-    :py:obj:`~.CUarrayMapInfo.resource.array` must be set to a valid sparse
-    CUDA array handle. The CUDA array must be either a 2D, 2D layered or 3D
-    CUDA array and must have been allocated using :py:obj:`~.cuArrayCreate`
-    or :py:obj:`~.cuArray3DCreate` with the flag
+    is set to
+    :py:obj:`~.CUresourcetype`:::py:obj:`~.CU_RESOURCE_TYPE_ARRAY` then
+    :py:obj:`~.CUarrayMapInfo.resource.array` must be set to a valid
+    sparse CUDA array handle. The CUDA array must be either a 2D, 2D
+    layered or 3D CUDA array and must have been allocated using
+    :py:obj:`~.cuArrayCreate` or :py:obj:`~.cuArray3DCreate` with the flag
     :py:obj:`~.CUDA_ARRAY3D_SPARSE` or
     :py:obj:`~.CUDA_ARRAY3D_DEFERRED_MAPPING`. For CUDA arrays obtained
     using :py:obj:`~.cuMipmappedArrayGetLevel`,
     :py:obj:`~.CUDA_ERROR_INVALID_VALUE` will be returned. If
     :py:obj:`~.CUarrayMapInfo.resourceType` is set to
-    :py:obj:`~.CUresourcetype.CU_RESOURCE_TYPE_MIPMAPPED_ARRAY` then
-    :py:obj:`~.CUarrayMapInfo.resource.mipmap` must be set to a valid
-    sparse CUDA mipmapped array handle. The CUDA mipmapped array must be
-    either a 2D, 2D layered or 3D CUDA mipmapped array and must have been
-    allocated using :py:obj:`~.cuMipmappedArrayCreate` with the flag
+    :py:obj:`~.CUresourcetype`:::py:obj:`~.CU_RESOURCE_TYPE_MIPMAPPED_ARRAY`
+    then :py:obj:`~.CUarrayMapInfo.resource.mipmap` must be set to a
+    valid sparse CUDA mipmapped array handle. The CUDA mipmapped array must
+    be either a 2D, 2D layered or 3D CUDA mipmapped array and must have
+    been allocated using :py:obj:`~.cuMipmappedArrayCreate` with the flag
     :py:obj:`~.CUDA_ARRAY3D_SPARSE` or
     :py:obj:`~.CUDA_ARRAY3D_DEFERRED_MAPPING`.
 
@@ -36635,56 +36660,58 @@ def cuMemMapArrayAsync(mapInfoList : Optional[tuple[CUarrayMapInfo] | list[CUarr
     **View CUDA Toolkit Documentation for a C++ code example**
 
     where
-    :py:obj:`~.CUarraySparseSubresourceType.CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL`
+    :py:obj:`~.CUarraySparseSubresourceType`:::py:obj:`~.CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL`
     indicates a sparse-miplevel which spans at least one tile in every
     dimension. The remaining miplevels which are too small to span at least
     one tile in any dimension constitute the mip tail region as indicated
     by
-    :py:obj:`~.CUarraySparseSubresourceType.CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL`
+    :py:obj:`~.CUarraySparseSubresourceType`:::py:obj:`~.CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL`
     subresource type.
 
     If :py:obj:`~.CUarrayMapInfo.subresourceType` is set to
-    :py:obj:`~.CUarraySparseSubresourceType.CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL`
-    then :py:obj:`~.CUarrayMapInfo.subresource.sparseLevel` struct must
-    contain valid array subregion offsets and extents. The
-    :py:obj:`~.CUarrayMapInfo.subresource.sparseLevel.offsetX`,
-    :py:obj:`~.CUarrayMapInfo.subresource.sparseLevel.offsetY` and
-    :py:obj:`~.CUarrayMapInfo.subresource.sparseLevel.offsetZ` must specify
-    valid X, Y and Z offsets respectively. The
-    :py:obj:`~.CUarrayMapInfo.subresource.sparseLevel.extentWidth`,
-    :py:obj:`~.CUarrayMapInfo.subresource.sparseLevel.extentHeight` and
-    :py:obj:`~.CUarrayMapInfo.subresource.sparseLevel.extentDepth` must
-    specify valid width, height and depth extents respectively. These
+    :py:obj:`~.CUarraySparseSubresourceType`:::py:obj:`~.CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL`
+    then :py:obj:`~.CUarrayMapInfo.subresource.sparseLevel` struct must
+    contain valid array subregion offsets and extents. The
+    :py:obj:`~.CUarrayMapInfo.subresource.sparseLevel.offsetX`,
+    :py:obj:`~.CUarrayMapInfo.subresource.sparseLevel.offsetY` and
+    :py:obj:`~.CUarrayMapInfo.subresource.sparseLevel.offsetZ` must
+    specify valid X, Y and Z offsets respectively. The
+    :py:obj:`~.CUarrayMapInfo.subresource.sparseLevel.extentWidth`,
+    :py:obj:`~.CUarrayMapInfo.subresource.sparseLevel.extentHeight`
+    and
+    :py:obj:`~.CUarrayMapInfo.subresource.sparseLevel.extentDepth`
+    must specify valid width, height and depth extents respectively. These
     offsets and extents must be aligned to the corresponding tile
     dimension. For CUDA mipmapped arrays
-    :py:obj:`~.CUarrayMapInfo.subresource.sparseLevel.level` must specify a
-    valid mip level index. Otherwise, must be zero. For layered CUDA arrays
-    and layered CUDA mipmapped arrays
-    :py:obj:`~.CUarrayMapInfo.subresource.sparseLevel.layer` must specify a
-    valid layer index. Otherwise, must be zero.
-    :py:obj:`~.CUarrayMapInfo.subresource.sparseLevel.offsetZ` must be zero
-    and :py:obj:`~.CUarrayMapInfo.subresource.sparseLevel.extentDepth` must
-    be set to 1 for 2D and 2D layered CUDA arrays and CUDA mipmapped
+    :py:obj:`~.CUarrayMapInfo.subresource.sparseLevel.level` must
+    specify a valid mip level index. Otherwise, must be zero. For layered
+    CUDA arrays and layered CUDA mipmapped arrays
+    :py:obj:`~.CUarrayMapInfo.subresource.sparseLevel.layer` must
+    specify a valid layer index. Otherwise, must be zero.
+    :py:obj:`~.CUarrayMapInfo.subresource.sparseLevel.offsetZ` must
+    be zero and
+    :py:obj:`~.CUarrayMapInfo.subresource.sparseLevel.extentDepth`
+    must be set to 1 for 2D and 2D layered CUDA arrays and CUDA mipmapped
     arrays. Tile extents can be obtained by calling
     :py:obj:`~.cuArrayGetSparseProperties` and
     :py:obj:`~.cuMipmappedArrayGetSparseProperties`
 
     If :py:obj:`~.CUarrayMapInfo.subresourceType` is set to
-    :py:obj:`~.CUarraySparseSubresourceType.CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL`
-    then :py:obj:`~.CUarrayMapInfo.subresource.miptail` struct must contain
-    valid mip tail offset in
-    :py:obj:`~.CUarrayMapInfo.subresource.miptail.offset` and size in
-    :py:obj:`~.CUarrayMapInfo.subresource.miptail.size`. Both, mip tail
-    offset and mip tail size must be aligned to the tile size. For layered
-    CUDA mipmapped arrays which don't have the flag
+    :py:obj:`~.CUarraySparseSubresourceType`:::py:obj:`~.CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL`
+    then :py:obj:`~.CUarrayMapInfo.subresource.miptail` struct must
+    contain valid mip tail offset in
+    :py:obj:`~.CUarrayMapInfo.subresource.miptail.offset` and size
+    in :py:obj:`~.CUarrayMapInfo.subresource.miptail.size`. Both,
+    mip tail offset and mip tail size must be aligned to the tile size. For
+    layered CUDA mipmapped arrays which don't have the flag
     :py:obj:`~.CU_ARRAY_SPARSE_PROPERTIES_SINGLE_MIPTAIL` set in
     :py:obj:`~.CUDA_ARRAY_SPARSE_PROPERTIES.flags` as returned by
     :py:obj:`~.cuMipmappedArrayGetSparseProperties`,
-    :py:obj:`~.CUarrayMapInfo.subresource.miptail.layer` must specify a
-    valid layer index. Otherwise, must be zero.
+    :py:obj:`~.CUarrayMapInfo.subresource.miptail.layer` must
+    specify a valid layer index. Otherwise, must be zero.
 
-    If :py:obj:`~.CUarrayMapInfo.resource.array` or
-    :py:obj:`~.CUarrayMapInfo.resource.mipmap` was created with
+    If :py:obj:`~.CUarrayMapInfo.resource.array` or
+    :py:obj:`~.CUarrayMapInfo.resource.mipmap` was created with
     :py:obj:`~.CUDA_ARRAY3D_DEFERRED_MAPPING` flag set the
     :py:obj:`~.CUarrayMapInfo.subresourceType` and the contents of
     :py:obj:`~.CUarrayMapInfo.subresource` will be ignored.
@@ -36695,28 +36722,28 @@ def cuMemMapArrayAsync(mapInfoList : Optional[tuple[CUarrayMapInfo] | list[CUarr
     **View CUDA Toolkit Documentation for a C++ code example**
 
     If :py:obj:`~.CUarrayMapInfo.memOperationType` is set to
-    :py:obj:`~.CUmemOperationType.CU_MEM_OPERATION_TYPE_MAP` then the
-    subresource will be mapped onto the tile pool memory specified by
-    :py:obj:`~.CUarrayMapInfo.memHandle` at offset
+    :py:obj:`~.CUmemOperationType`:::py:obj:`~.CU_MEM_OPERATION_TYPE_MAP`
+    then the subresource will be mapped onto the tile pool memory specified
+    by :py:obj:`~.CUarrayMapInfo.memHandle` at offset
     :py:obj:`~.CUarrayMapInfo.offset`. The tile pool allocation has to be
     created by specifying the :py:obj:`~.CU_MEM_CREATE_USAGE_TILE_POOL`
     flag when calling :py:obj:`~.cuMemCreate`. Also,
     :py:obj:`~.CUarrayMapInfo.memHandleType` must be set to
-    :py:obj:`~.CUmemHandleType.CU_MEM_HANDLE_TYPE_GENERIC`.
+    :py:obj:`~.CUmemHandleType`:::py:obj:`~.CU_MEM_HANDLE_TYPE_GENERIC`.
 
     If :py:obj:`~.CUarrayMapInfo.memOperationType` is set to
-    :py:obj:`~.CUmemOperationType.CU_MEM_OPERATION_TYPE_UNMAP` then an
-    unmapping operation is performed. :py:obj:`~.CUarrayMapInfo.memHandle`
-    must be NULL.
+    :py:obj:`~.CUmemOperationType`:::py:obj:`~.CU_MEM_OPERATION_TYPE_UNMAP`
+    then an unmapping operation is performed.
+    :py:obj:`~.CUarrayMapInfo.memHandle` must be NULL.
 
     :py:obj:`~.CUarrayMapInfo.deviceBitMask` specifies the list of devices
     that must map or unmap physical memory. Currently, this mask must have
     exactly one bit set, and the corresponding device must match the device
     associated with the stream. If
     :py:obj:`~.CUarrayMapInfo.memOperationType` is set to
-    :py:obj:`~.CUmemOperationType.CU_MEM_OPERATION_TYPE_MAP`, the device
-    must also match the device associated with the tile pool memory
-    allocation as specified by :py:obj:`~.CUarrayMapInfo.memHandle`.
+    :py:obj:`~.CUmemOperationType`:::py:obj:`~.CU_MEM_OPERATION_TYPE_MAP`,
+    the device must also match the device associated with the tile pool
+    memory allocation as specified by :py:obj:`~.CUarrayMapInfo.memHandle`.
 
     :py:obj:`~.CUarrayMapInfo.flags` and
     :py:obj:`~.CUarrayMapInfo.reserved` ``[]`` are unused and must be set
@@ -36733,7 +36760,7 @@ def cuMemMapArrayAsync(mapInfoList : Optional[tuple[CUarrayMapInfo] | list[CUarr
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`
 
     See Also
@@ -36796,7 +36823,7 @@ def cuMemUnmap(ptr, size_t size):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
 
     See Also
@@ -36829,16 +36856,16 @@ def cuMemSetAccess(ptr, size_t size, desc : Optional[tuple[CUmemAccessDesc] | li
     :py:obj:`~.cuMemMap` / :py:obj:`~.cuMemCreate`. Users cannot specify
     :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST_NUMA` accessibility for
     allocations created on with other location types. Note: When
-    :py:obj:`~.CUmemAccessDesc.CUmemLocation.type` is
+    :py:obj:`~.CUmemAccessDesc.CUmemLocation.type` is
     :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST_NUMA`,
-    :py:obj:`~.CUmemAccessDesc.CUmemLocation.id` is ignored. When setting
-    the access flags for a virtual address range mapping a multicast
-    object, ``ptr`` and ``size`` must be aligned to the value returned by
-    :py:obj:`~.cuMulticastGetGranularity` with the flag
-    :py:obj:`~.CU_MULTICAST_MINIMUM_GRANULARITY`. For best performance
-    however, it is recommended that ``ptr`` and ``size`` be aligned to the
-    value returned by :py:obj:`~.cuMulticastGetGranularity` with the flag
-    :py:obj:`~.CU_MULTICAST_RECOMMENDED_GRANULARITY`.
+    :py:obj:`~.CUmemAccessDesc.CUmemLocation.id` is ignored.
+    When setting the access flags for a virtual address range mapping a
+    multicast object, ``ptr`` and ``size`` must be aligned to the value
+    returned by :py:obj:`~.cuMulticastGetGranularity` with the flag
+    :py:obj:`~.CU_MULTICAST_GRANULARITY_MINIMUM`. For best performance however, it
+    is recommended that ``ptr`` and ``size`` be aligned to the value
+    returned by :py:obj:`~.cuMulticastGetGranularity` with the flag
+    :py:obj:`~.CU_MULTICAST_GRANULARITY_RECOMMENDED`.
 
     Parameters
     ----------
@@ -36854,12 +36881,12 @@ def cuMemSetAccess(ptr, size_t size, desc : Optional[tuple[CUmemAccessDesc] | li
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
 
     See Also
     --------
-    :py:obj:`~.cuMemSetAccess`, :py:obj:`~.cuMemCreate`, :py:obj:`~.py`:obj:`~.cuMemMap`
+    :py:obj:`~.cuMemSetAccess`, :py:obj:`~.cuMemCreate`, :py:obj:`~.cuMemMap`
     """
     desc = [] if desc is None else desc
     if not all(isinstance(_x, (CUmemAccessDesc,)) for _x in desc):
@@ -36904,7 +36931,7 @@ def cuMemGetAccess(location : Optional[CUmemLocation], ptr):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
     flags : unsigned long long
         Flags set for this location
@@ -36963,7 +36990,7 @@ def cuMemExportToShareableHandle(handle, handleType not None : CUmemAllocationHa
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
     shareableHandle : Any
         Pointer to the location in which to store the requested handle type
@@ -37016,7 +37043,7 @@ def cuMemImportFromShareableHandle(osHandle, shHandleType not None : CUmemAlloca
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
     handle : :py:obj:`~.CUmemGenericAllocationHandle`
         CUDA Memory handle for the memory allocation.
@@ -37027,7 +37054,7 @@ def cuMemImportFromShareableHandle(osHandle, shHandleType not None : CUmemAlloca
 
     Notes
     -----
-    Importing shareable handles exported from some graphics APIs(VUlkan, OpenGL, etc) created on devices under an SLI group may not be supported, and thus this API will return CUDA_ERROR_NOT_SUPPORTED. There is no guarantee that the contents of ``handle`` will be the same CUDA memory handle for the same given OS shareable handle, or the same underlying allocation.
+    Importing shareable handles exported from some graphics APIs (Vulkan, OpenGL, etc.) created on devices under an SLI group may not be supported, and thus this API will return :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`. There is no guarantee that the contents of ``handle`` will be the same CUDA memory handle for the same given OS shareable handle, or the same underlying allocation.
     """
     cdef CUmemGenericAllocationHandle handle = CUmemGenericAllocationHandle()
     cdef _HelperInputVoidPtrStruct cyosHandleHelper
@@ -37061,7 +37088,7 @@ def cuMemGetAllocationGranularity(prop : Optional[CUmemAllocationProp], option n
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
     granularity : int
         Returned granularity.
@@ -37093,7 +37120,7 @@ def cuMemGetAllocationPropertiesFromHandle(handle):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
     prop : :py:obj:`~.CUmemAllocationProp`
         Pointer to a properties structure which will hold the information
@@ -37137,7 +37164,7 @@ def cuMemRetainAllocationHandle(addr):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
     handle : :py:obj:`~.CUmemGenericAllocationHandle`
         CUDA Memory handle for the backing memory allocation.
@@ -37181,7 +37208,7 @@ def cuMemFreeAsync(dptr, hStream):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT` (default stream specified with no current context), :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
 
     Notes
@@ -37230,7 +37257,7 @@ def cuMemAllocAsync(size_t bytesize, hStream):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT` (default stream specified with no current context), :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`
     dptr : :py:obj:`~.CUdeviceptr`
         Returned device pointer
@@ -37287,7 +37314,7 @@ def cuMemPoolTrimTo(pool, size_t minBytesToKeep):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
@@ -37322,11 +37349,11 @@ def cuMemPoolSetAttribute(pool, attr not None : CUmemPool_attribute, value):
     Supported attributes are:
 
     - :py:obj:`~.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD`: (value type =
-      cuuint64_t) Amount of reserved memory in bytes to hold onto before
-      trying to release memory back to the OS. When more than the release
-      threshold bytes of memory are held by the memory pool, the allocator
-      will try to release memory back to the OS on the next call to stream,
-      event or context synchronize. (default 0)
+      :py:obj:`~.cuuint64_t`) Amount of reserved memory in bytes to hold
+      onto before trying to release memory back to the OS. When more than
+      the release threshold bytes of memory are held by the memory pool,
+      the allocator will try to release memory back to the OS on the next
+      call to stream, event or context synchronize. (default 0)
 
     - :py:obj:`~.CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES`: (value
       type = int) Allow :py:obj:`~.cuMemAllocAsync` to use memory
@@ -37346,13 +37373,13 @@ def cuMemPoolSetAttribute(pool, attr not None : CUmemPool_attribute, value):
       (default enabled).
 
     - :py:obj:`~.CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH`: (value type =
-      cuuint64_t) Reset the high watermark that tracks the amount of
-      backing memory that was allocated for the memory pool. It is illegal
-      to set this attribute to a non-zero value.
+      :py:obj:`~.cuuint64_t`) Reset the high watermark that tracks the
+      amount of backing memory that was allocated for the memory pool. It
+      is illegal to set this attribute to a non-zero value.
 
-    - :py:obj:`~.CU_MEMPOOL_ATTR_USED_MEM_HIGH`: (value type = cuuint64_t)
-      Reset the high watermark that tracks the amount of used memory that
-      was allocated for the memory pool.
+    - :py:obj:`~.CU_MEMPOOL_ATTR_USED_MEM_HIGH`: (value type =
+      :py:obj:`~.cuuint64_t`) Reset the high watermark that tracks the
+      amount of used memory that was allocated for the memory pool.
 
     Parameters
     ----------
@@ -37365,7 +37392,7 @@ def cuMemPoolSetAttribute(pool, attr not None : CUmemPool_attribute, value):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
@@ -37397,11 +37424,11 @@ def cuMemPoolGetAttribute(pool, attr not None : CUmemPool_attribute):
     Supported attributes are:
 
     - :py:obj:`~.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD`: (value type =
-      cuuint64_t) Amount of reserved memory in bytes to hold onto before
-      trying to release memory back to the OS. When more than the release
-      threshold bytes of memory are held by the memory pool, the allocator
-      will try to release memory back to the OS on the next call to stream,
-      event or context synchronize. (default 0)
+      :py:obj:`~.cuuint64_t`) Amount of reserved memory in bytes to hold
+      onto before trying to release memory back to the OS. When more than
+      the release threshold bytes of memory are held by the memory pool,
+      the allocator will try to release memory back to the OS on the next
+      call to stream, event or context synchronize. (default 0)
 
     - :py:obj:`~.CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES`: (value
       type = int) Allow :py:obj:`~.cuMemAllocAsync` to use memory
@@ -37421,48 +37448,50 @@ def cuMemPoolGetAttribute(pool, attr not None : CUmemPool_attribute):
       (default enabled).
 
     - :py:obj:`~.CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT`: (value type =
-      cuuint64_t) Amount of backing memory currently allocated for the
-      mempool
+      :py:obj:`~.cuuint64_t`) Amount of backing memory currently allocated
+      for the mempool
 
     - :py:obj:`~.CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH`: (value type =
-      cuuint64_t) High watermark of backing memory allocated for the
-      mempool since the last time it was reset.
+      :py:obj:`~.cuuint64_t`) High watermark of backing memory allocated
+      for the mempool since the last time it was reset.
 
     - :py:obj:`~.CU_MEMPOOL_ATTR_USED_MEM_CURRENT`: (value type =
-      cuuint64_t) Amount of memory from the pool that is currently in use
-      by the application.
+      :py:obj:`~.cuuint64_t`) Amount of memory from the pool that is
+      currently in use by the application.
 
-    - :py:obj:`~.CU_MEMPOOL_ATTR_USED_MEM_HIGH`: (value type = cuuint64_t)
-      High watermark of the amount of memory from the pool that was in use
-      by the application.
+    - :py:obj:`~.CU_MEMPOOL_ATTR_USED_MEM_HIGH`: (value type =
+      :py:obj:`~.cuuint64_t`) High watermark of the amount of memory from
+      the pool that was in use by the application.
 
     The following properties can be also be queried on imported and default
     pools:
 
     - :py:obj:`~.CU_MEMPOOL_ATTR_ALLOCATION_TYPE`: (value type =
-      CUmemAllocationType) The allocation type of the mempool
+      :py:obj:`~.CUmemAllocationType`) The allocation type of the mempool
 
     - :py:obj:`~.CU_MEMPOOL_ATTR_EXPORT_HANDLE_TYPES`: (value type =
-      CUmemAllocationHandleType) Available export handle types for the
-      mempool. For imported pools this value is always
-      CU_MEM_HANDLE_TYPE_NONE as an imported pool cannot be re-exported
+      :py:obj:`~.CUmemAllocationHandleType`) Available export handle types
+      for the mempool. For imported pools this value is always
+      :py:obj:`~.CU_MEM_HANDLE_TYPE_NONE` as an imported pool cannot be
+      re-exported
 
     - :py:obj:`~.CU_MEMPOOL_ATTR_LOCATION_ID`: (value type = int) The
       location id for the mempool. If the location type for this pool is
-      CU_MEM_LOCATION_TYPE_INVISIBLE then ID will be CU_DEVICE_INVALID.
+      :py:obj:`~.CU_MEM_LOCATION_TYPE_INVISIBLE` then ID will be
+      :py:obj:`~.CU_DEVICE_INVALID`.
 
     - :py:obj:`~.CU_MEMPOOL_ATTR_LOCATION_TYPE`: (value type =
-      CUmemLocationType) The location type for the mempool. For imported
-      memory pools where the device is not directly visible to the
+      :py:obj:`~.CUmemLocationType`) The location type for the mempool. For
+      imported memory pools where the device is not directly visible to the
       importing process or pools imported via fabric handles across nodes
-      this will be CU_MEM_LOCATION_TYPE_INVISIBLE.
+      this will be :py:obj:`~.CU_MEM_LOCATION_TYPE_INVISIBLE`.
 
-    - :py:obj:`~.CU_MEMPOOL_ATTR_MAX_POOL_SIZE`: (value type = cuuint64_t)
-      Maximum size of the pool in bytes, this value may be higher than what
-      was initially passed to cuMemPoolCreate due to alignment
-      requirements. A value of 0 indicates no maximum size. For
-      CU__MEM_ALLOCATION_TYPE_MANAGED and IPC imported pools this value
-      will be system dependent.
+    - :py:obj:`~.CU_MEMPOOL_ATTR_MAX_POOL_SIZE`: (value type =
+      :py:obj:`~.cuuint64_t`) Maximum size of the pool in bytes, this value
+      may be higher than what was initially passed to
+      :func:`~.cuMemPoolCreate` due to alignment requirements. A value of 0
+      indicates no maximum size. For CU_MEM_ALLOCATION_TYPE_MANAGED and
+      IPC imported pools this value will be system dependent.
 
     - :py:obj:`~.CU_MEMPOOL_ATTR_HW_DECOMPRESS_ENABLED`: (value type = int)
       Indicates whether the pool has hardware compresssion enabled
@@ -37476,7 +37505,7 @@ def cuMemPoolGetAttribute(pool, attr not None : CUmemPool_attribute):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
 
     value : Any
         None
@@ -37517,7 +37546,7 @@ def cuMemPoolSetAccess(pool, map : Optional[tuple[CUmemAccessDesc] | list[CUmemA
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
@@ -37570,7 +37599,7 @@ def cuMemPoolGetAccess(memPool, location : Optional[CUmemLocation]):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
 
     flags : :py:obj:`~.CUmemAccess_flags`
         the accessibility of the pool from the specified location
@@ -37608,20 +37637,21 @@ def cuMemPoolCreate(poolProps : Optional[CUmemPoolProps]):
 
     To create a memory pool for HOST memory not targeting a specific NUMA
     node, applications must set set
-    :py:obj:`~.CUmemPoolProps.CUmemLocation.type` to
+    :py:obj:`~.CUmemPoolProps.CUmemLocation.type` to
     :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST`.
-    :py:obj:`~.CUmemPoolProps.CUmemLocation.id` is ignored for such pools.
-    Pools created with the type :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST` are
-    not IPC capable and :py:obj:`~.CUmemPoolProps.handleTypes` must be 0,
-    any other values will result in :py:obj:`~.CUDA_ERROR_INVALID_VALUE`.
-    To create a memory pool targeting a specific host NUMA node,
-    applications must set :py:obj:`~.CUmemPoolProps.CUmemLocation.type` to
+    :py:obj:`~.CUmemPoolProps.CUmemLocation.id` is ignored
+    for such pools. Pools created with the type
+    :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST` are not IPC capable and
+    :py:obj:`~.CUmemPoolProps.handleTypes` must be 0, any other values will
+    result in :py:obj:`~.CUDA_ERROR_INVALID_VALUE`. To create a memory pool
+    targeting a specific host NUMA node, applications must set
+    :py:obj:`~.CUmemPoolProps.CUmemLocation.type` to
     :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST_NUMA` and
-    :py:obj:`~.CUmemPoolProps.CUmemLocation.id` must specify the NUMA ID of
-    the host memory node. Specifying
+    :py:obj:`~.CUmemPoolProps.CUmemLocation.id` must specify
+    the NUMA ID of the host memory node. Specifying
     :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT` as the
-    :py:obj:`~.CUmemPoolProps.CUmemLocation.type` will result in
-    :py:obj:`~.CUDA_ERROR_INVALID_VALUE`.
+    :py:obj:`~.CUmemPoolProps.CUmemLocation.type` will result
+    in :py:obj:`~.CUDA_ERROR_INVALID_VALUE`.
 
     By default, the pool's memory will be accessible from the device it is
     allocated on. In the case of pools created with
@@ -37653,19 +37683,20 @@ def cuMemPoolCreate(poolProps : Optional[CUmemPoolProps]):
     /dev/nvidia-caps-imex-channels/channel0 c <major number> 0``
 
     To create a managed memory pool, applications must set
-    :py:obj:`~.CUmemPoolProps.CUmemAllocationType` to
-    CU_MEM_ALLOCATION_TYPE_MANAGED.
-    :py:obj:`~.CUmemPoolProps.CUmemAllocationHandleType` must also be set
-    to CU_MEM_HANDLE_TYPE_NONE since IPC is not supported. For managed
-    memory pools, :py:obj:`~.CUmemPoolProps.CUmemLocation` will be treated
-    as the preferred location for all allocations created from the pool. An
-    application can also set CU_MEM_LOCATION_TYPE_NONE to indicate no
-    preferred location. :py:obj:`~.CUmemPoolProps.maxSize` must be set to
-    zero for managed memory pools. :py:obj:`~.CUmemPoolProps.usage` should
-    be zero as decompress for managed memory is not supported. For managed
-    memory pools, all devices on the system must have non-zero
-    :py:obj:`~.concurrentManagedAccess`. If not, this call returns
-    CUDA_ERROR_NOT_SUPPORTED
+    :py:obj:`~.CUmemPoolProps`:::py:obj:`~.CUmemAllocationType` to
+    :py:obj:`~.CU_MEM_ALLOCATION_TYPE_MANAGED`.
+    :py:obj:`~.CUmemPoolProps`:::py:obj:`~.CUmemAllocationHandleType` must
+    also be set to :py:obj:`~.CU_MEM_HANDLE_TYPE_NONE` since IPC is not
+    supported. For managed memory pools,
+    :py:obj:`~.CUmemPoolProps.CUmemLocation` will be treated as
+    the preferred location for all allocations created from the pool. An
+    application can also set :py:obj:`~.CU_MEM_LOCATION_TYPE_NONE` to
+    indicate no preferred location. :py:obj:`~.CUmemPoolProps.maxSize` must
+    be set to zero for managed memory pools.
+    :py:obj:`~.CUmemPoolProps.usage` should be zero as decompress for
+    managed memory is not supported. For managed memory pools, all devices
+    on the system must have non-zero ``concurrentManagedAccess``. If not,
+    this call returns :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
 
     Parameters
     ----------
@@ -37674,7 +37705,7 @@ def cuMemPoolCreate(poolProps : Optional[CUmemPoolProps]):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
     pool : :py:obj:`~.CUmemoryPool`
         None
@@ -37685,7 +37716,7 @@ def cuMemPoolCreate(poolProps : Optional[CUmemPoolProps]):
 
     Notes
     -----
-    Specifying CU_MEM_HANDLE_TYPE_NONE creates a memory pool that will not support IPC.
+    Specifying :py:obj:`~.CU_MEM_HANDLE_TYPE_NONE` creates a memory pool that will not support IPC.
     """
     cdef CUmemoryPool pool = CUmemoryPool()
     cdef cydriver.CUmemPoolProps* cypoolProps_ptr = <cydriver.CUmemPoolProps*>poolProps._pvt_ptr if poolProps is not None else NULL
@@ -37718,7 +37749,7 @@ def cuMemPoolDestroy(pool):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
@@ -37767,7 +37798,7 @@ def cuMemGetDefaultMemPool(location : Optional[CUmemLocation], typename not None
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED` :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
     pool_out : :py:obj:`~.CUmemoryPool`
         None
@@ -37821,7 +37852,7 @@ def cuMemGetMemPool(location : Optional[CUmemLocation], typename not None : CUme
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     pool : :py:obj:`~.CUmemoryPool`
         None
@@ -37856,8 +37887,8 @@ def cuMemSetMemPool(location : Optional[CUmemLocation], typename not None : CUme
     be :py:obj:`~.CU_MEM_LOCATION_TYPE_NONE` to indicate no preferred
     location for the managed memory pool.
     :py:obj:`~.CU_MEM_ALLOCATION_TYPE_MANAGED` can not be used with
-    :py:obj:`~.CU_MEM_LOCATION_TYPE_DEVICE_MEMORY_NODE`. In all other
-    cases, the call returns :py:obj:`~.CUDA_ERROR_INVALID_VALUE`.
+    ``CU_MEM_LOCATION_TYPE_DEVICE_MEMORY_NODE``. In all other cases, the
+    call returns :py:obj:`~.CUDA_ERROR_INVALID_VALUE`.
 
     When a memory pool is set as the current memory pool, the location
     parameter should be the same as the location of the pool. The location
@@ -37882,7 +37913,7 @@ def cuMemSetMemPool(location : Optional[CUmemLocation], typename not None : CUme
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
@@ -37930,7 +37961,7 @@ def cuMemAllocFromPoolAsync(size_t bytesize, pool, hStream):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT` (default stream specified with no current context), :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`
     dptr : :py:obj:`~.CUdeviceptr`
         Returned device pointer
@@ -37993,7 +38024,7 @@ def cuMemPoolExportToShareableHandle(pool, handleType not None : CUmemAllocation
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`
     handle_out : Any
         Returned OS handle
@@ -38004,7 +38035,7 @@ def cuMemPoolExportToShareableHandle(pool, handleType not None : CUmemAllocation
 
     Notes
     -----
-    : To create an IPC capable mempool, create a mempool with a CUmemAllocationHandleType other than CU_MEM_HANDLE_TYPE_NONE.
+    To create an IPC capable mempool, create a mempool with a :py:obj:`~.CUmemAllocationHandleType` other than :py:obj:`~.CU_MEM_HANDLE_TYPE_NONE`.
     """
     cdef cydriver.CUmemoryPool cypool
     if pool is None:
@@ -38031,7 +38062,7 @@ def cuMemPoolImportFromShareableHandle(handle, handleType not None : CUmemAlloca
     """ imports a memory pool from a shared handle.
 
     Specific allocations can be imported from the imported pool with
-    cuMemPoolImportPointer.
+    :func:`~.cuMemPoolImportPointer`.
 
     If ``handleType`` is :py:obj:`~.CU_MEM_HANDLE_TYPE_FABRIC` and the
     importer process has not been granted access to the same IMEX channel
@@ -38049,7 +38080,7 @@ def cuMemPoolImportFromShareableHandle(handle, handleType not None : CUmemAlloca
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`
     pool_out : :py:obj:`~.CUmemoryPool`
         Returned memory pool
@@ -38060,7 +38091,7 @@ def cuMemPoolImportFromShareableHandle(handle, handleType not None : CUmemAlloca
 
     Notes
     -----
-    Imported memory pools do not support creating new allocations. As such imported memory pools may not be used in cuDeviceSetMemPool or :py:obj:`~.cuMemAllocFromPoolAsync` calls.
+    Imported memory pools do not support creating new allocations. As such imported memory pools may not be used in :func:`~.cuDeviceSetMemPool` or :py:obj:`~.cuMemAllocFromPoolAsync` calls.
     """
     cdef CUmemoryPool pool_out = CUmemoryPool()
     cdef _HelperInputVoidPtrStruct cyhandleHelper
@@ -38092,7 +38123,7 @@ def cuMemPoolExportPointer(ptr):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`
     shareData_out : :py:obj:`~.CUmemPoolPtrExportData`
         Returned export data
@@ -38127,9 +38158,10 @@ def cuMemPoolImportPointer(pool, shareData : Optional[CUmemPoolPtrExportData]):
     memory must not be accessed before the allocation operation completes
     in the exporting process. The imported memory must be freed from all
     importing processes before being freed in the exporting process. The
-    pointer may be freed with cuMemFree or cuMemFreeAsync. If
-    cuMemFreeAsync is used, the free must be completed on the importing
-    process before the free operation on the exporting process.
+    pointer may be freed with :py:obj:`~.cuMemFree` or
+    :func:`~.cuMemFreeAsync`. If :func:`~.cuMemFreeAsync` is used, the free
+    must be completed on the importing process before the free operation on
+    the exporting process.
 
     Parameters
     ----------
@@ -38140,7 +38172,7 @@ def cuMemPoolImportPointer(pool, shareData : Optional[CUmemPoolPtrExportData]):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`
     ptr_out : :py:obj:`~.CUdeviceptr`
         pointer to imported memory
@@ -38151,7 +38183,7 @@ def cuMemPoolImportPointer(pool, shareData : Optional[CUmemPoolPtrExportData]):
 
     Notes
     -----
-    The cuMemFreeAsync api may be used in the exporting process before the cuMemFreeAsync operation completes in its stream as long as the cuMemFreeAsync in the exporting process specifies a stream with a stream dependency on the importing process's cuMemFreeAsync.
+    The :func:`~.cuMemFreeAsync` api may be used in the exporting process before the :func:`~.cuMemFreeAsync` operation completes in its stream as long as the :func:`~.cuMemFreeAsync` in the exporting process specifies a stream with a stream dependency on the importing process's :func:`~.cuMemFreeAsync`.
     """
     cdef cydriver.CUmemoryPool cypool
     if pool is None:
@@ -38186,11 +38218,11 @@ def cuMulticastCreate(prop : Optional[CUmulticastObjectProp]):
     :py:obj:`~.cuMulticastBindAddr`, or :py:obj:`~.cuMulticastBindAddr_v2`.
     and can be unbound via :py:obj:`~.cuMulticastUnbind`. The total amount
     of memory that can be bound per device is specified by
-    :py:obj:`~.py`:obj:`~.CUmulticastObjectProp.size`. This size must be a
-    multiple of the value returned by :py:obj:`~.cuMulticastGetGranularity`
-    with the flag :py:obj:`~.CU_MULTICAST_GRANULARITY_MINIMUM`. For best
-    performance however, the size should be aligned to the value returned
-    by :py:obj:`~.cuMulticastGetGranularity` with the flag
+    :py:obj:`~.CUmulticastObjectProp.size`. This size must be a multiple
+    of the value returned by :py:obj:`~.cuMulticastGetGranularity` with the
+    flag :py:obj:`~.CU_MULTICAST_GRANULARITY_MINIMUM`. For best performance
+    however, the size should be aligned to the value returned by
+    :py:obj:`~.cuMulticastGetGranularity` with the flag
     :py:obj:`~.CU_MULTICAST_GRANULARITY_RECOMMENDED`.
 
     After all participating devices have been added, multicast objects can
@@ -38211,7 +38243,7 @@ def cuMulticastCreate(prop : Optional[CUmulticastObjectProp]):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
     mcHandle : :py:obj:`~.CUmemGenericAllocationHandle`
         Value of handle returned.
@@ -38262,7 +38294,7 @@ def cuMulticastAddDevice(mcHandle, dev):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
 
     See Also
@@ -38306,7 +38338,7 @@ def cuMulticastBindMem(mcHandle, size_t mcOffset, memHandle, size_t memOffset, s
     :py:obj:`~.CU_MULTICAST_GRANULARITY_MINIMUM`. For best performance
     however, ``size``, ``mcOffset`` and ``memOffset`` should be aligned to
     the granularity of the memory allocation(see
-    :py:obj:`~.cuMemGetAllocationGranularity`) or to the value returned by
+    :func:`~.cuMemGetAllocationGranularity`) or to the value returned by
     :py:obj:`~.cuMulticastGetGranularity` with the flag
     :py:obj:`~.CU_MULTICAST_GRANULARITY_RECOMMENDED`.
 
@@ -38318,12 +38350,13 @@ def cuMulticastBindMem(mcHandle, size_t mcOffset, memHandle, size_t memOffset, s
     that was added to the multicast team via
     :py:obj:`~.cuMulticastAddDevice`. Externally shareable as well as
     imported multicast objects can be bound only to externally shareable
-    memory. Note that this call will return CUDA_ERROR_OUT_OF_MEMORY if
-    there are insufficient resources required to perform the bind. This
-    call may also return CUDA_ERROR_SYSTEM_NOT_READY if the necessary
-    system software is not initialized or running.
+    memory. Note that this call will return
+    :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY` if there are insufficient
+    resources required to perform the bind. This call may also return
+    :py:obj:`~.CUDA_ERROR_SYSTEM_NOT_READY` if the necessary system
+    software is not initialized or running.
 
-    This call may return CUDA_ERROR_ILLEGAL_STATE if the system
+    This call may return :py:obj:`~.CUDA_ERROR_ILLEGAL_STATE` if the system
     configuration is in an illegal state. In such cases, to continue using
     multicast, verify that the system configuration is in a valid state and
     all required driver daemons are running properly.
@@ -38345,7 +38378,7 @@ def cuMulticastBindMem(mcHandle, size_t mcOffset, memHandle, size_t memOffset, s
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`, :py:obj:`~.CUDA_ERROR_SYSTEM_NOT_READY`, :py:obj:`~.CUDA_ERROR_ILLEGAL_STATE`,
 
     See Also
@@ -38391,7 +38424,7 @@ def cuMulticastBindMem_v2(mcHandle, dev, size_t mcOffset, memHandle, size_t memO
     :py:obj:`~.CU_MULTICAST_GRANULARITY_MINIMUM`. For best performance
     however, ``size``, ``mcOffset`` and ``memOffset`` should be aligned to
     the granularity of the memory allocation(see
-    :py:obj:`~.cuMemGetAllocationGranularity`) or to the value returned by
+    :func:`~.cuMemGetAllocationGranularity`) or to the value returned by
     :py:obj:`~.cuMulticastGetGranularity` with the flag
     :py:obj:`~.CU_MULTICAST_GRANULARITY_RECOMMENDED`.
 
@@ -38412,12 +38445,13 @@ def cuMulticastBindMem_v2(mcHandle, dev, size_t mcOffset, memHandle, size_t memO
     named by ``dev`` must have been added to the multicast team via
     :py:obj:`~.cuMulticastAddDevice`. Externally shareable as well as
     imported multicast objects can be bound only to externally shareable
-    memory. Note that this call will return CUDA_ERROR_OUT_OF_MEMORY if
-    there are insufficient resources required to perform the bind. This
-    call may also return CUDA_ERROR_SYSTEM_NOT_READY if the necessary
-    system software is not initialized or running.
+    memory. Note that this call will return
+    :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY` if there are insufficient
+    resources required to perform the bind. This call may also return
+    :py:obj:`~.CUDA_ERROR_SYSTEM_NOT_READY` if the necessary system
+    software is not initialized or running.
 
-    This call may return CUDA_ERROR_ILLEGAL_STATE if the system
+    This call may return :py:obj:`~.CUDA_ERROR_ILLEGAL_STATE` if the system
     configuration is in an illegal state. In such cases, to continue using
     multicast, verify that the system configuration is in a valid state and
     all required driver daemons are running properly.
@@ -38442,7 +38476,7 @@ def cuMulticastBindMem_v2(mcHandle, dev, size_t mcOffset, memHandle, size_t memO
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`, :py:obj:`~.CUDA_ERROR_SYSTEM_NOT_READY`, :py:obj:`~.CUDA_ERROR_ILLEGAL_STATE`,
 
     See Also
@@ -38487,7 +38521,7 @@ def cuMulticastBindAddr(mcHandle, size_t mcOffset, memptr, size_t size, unsigned
     Binds a memory allocation specified by its mapped address ``memptr`` to
     a multicast object represented by ``mcHandle``. The memory must have
     been allocated via :py:obj:`~.cuMemCreate` or
-    :py:obj:`~.cudaMallocAsync`. The intended ``size`` of the bind, the
+    :func:`~.cudaMallocAsync`. The intended ``size`` of the bind, the
     offset in the multicast range ``mcOffset`` and ``memptr`` must be a
     multiple of the value returned by :py:obj:`~.cuMulticastGetGranularity`
     with the flag :py:obj:`~.CU_MULTICAST_GRANULARITY_MINIMUM`. For best
@@ -38503,12 +38537,13 @@ def cuMulticastBindAddr(mcHandle, size_t mcOffset, memptr, size_t size, unsigned
     that was added to the multicast team via
     :py:obj:`~.cuMulticastAddDevice`. Externally shareable as well as
     imported multicast objects can be bound only to externally shareable
-    memory. Note that this call will return CUDA_ERROR_OUT_OF_MEMORY if
-    there are insufficient resources required to perform the bind. This
-    call may also return CUDA_ERROR_SYSTEM_NOT_READY if the necessary
-    system software is not initialized or running.
+    memory. Note that this call will return
+    :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY` if there are insufficient
+    resources required to perform the bind. This call may also return
+    :py:obj:`~.CUDA_ERROR_SYSTEM_NOT_READY` if the necessary system
+    software is not initialized or running.
 
-    This call may return CUDA_ERROR_ILLEGAL_STATE if the system
+    This call may return :py:obj:`~.CUDA_ERROR_ILLEGAL_STATE` if the system
     configuration is in an illegal state. In such cases, to continue using
     multicast, verify that the system configuration is in a valid state and
     all required driver daemons are running properly.
@@ -38528,7 +38563,7 @@ def cuMulticastBindAddr(mcHandle, size_t mcOffset, memptr, size_t size, unsigned
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`, :py:obj:`~.CUDA_ERROR_SYSTEM_NOT_READY`, :py:obj:`~.CUDA_ERROR_ILLEGAL_STATE`,
 
     See Also
@@ -38567,9 +38602,9 @@ def cuMulticastBindAddr_v2(mcHandle, dev, size_t mcOffset, memptr, size_t size,
     Binds a memory allocation specified by its mapped address ``memptr`` to
     a multicast object represented by ``mcHandle``. The binding will be
     applicable for the device ``dev``. The memory must have been allocated
-    via :py:obj:`~.cuMemCreate` or :py:obj:`~.cudaMallocAsync`. The
-    intended ``size`` of the bind, the offset in the multicast range
-    ``mcOffset`` and ``memptr`` must be a multiple of the value returned by
+    via :py:obj:`~.cuMemCreate` or :func:`~.cudaMallocAsync`. The intended
+    ``size`` of the bind, the offset in the multicast range ``mcOffset``
+    and ``memptr`` must be a multiple of the value returned by
     :py:obj:`~.cuMulticastGetGranularity` with the flag
     :py:obj:`~.CU_MULTICAST_GRANULARITY_MINIMUM`. For best performance
     however, ``size``, ``mcOffset`` and ``memptr`` should be aligned to the
@@ -38591,12 +38626,13 @@ def cuMulticastBindAddr_v2(mcHandle, dev, size_t mcOffset, memptr, size_t size,
     named by ``dev`` must have been added to the multicast team via
     :py:obj:`~.cuMulticastAddDevice`. Externally shareable as well as
     imported multicast objects can be bound only to externally shareable
-    memory. Note that this call will return CUDA_ERROR_OUT_OF_MEMORY if
-    there are insufficient resources required to perform the bind. This
-    call may also return CUDA_ERROR_SYSTEM_NOT_READY if the necessary
-    system software is not initialized or running.
+    memory. Note that this call will return
+    :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY` if there are insufficient
+    resources required to perform the bind. This call may also return
+    :py:obj:`~.CUDA_ERROR_SYSTEM_NOT_READY` if the necessary system
+    software is not initialized or running.
 
-    This call may return CUDA_ERROR_ILLEGAL_STATE if the system
+    This call may return :py:obj:`~.CUDA_ERROR_ILLEGAL_STATE` if the system
     configuration is in an illegal state. In such cases, to continue using
     multicast, verify that the system configuration is in a valid state and
     all required driver daemons are running properly.
@@ -38619,7 +38655,7 @@ def cuMulticastBindAddr_v2(mcHandle, dev, size_t mcOffset, memptr, size_t size,
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`, :py:obj:`~.CUDA_ERROR_SYSTEM_NOT_READY`, :py:obj:`~.CUDA_ERROR_ILLEGAL_STATE`,
 
     See Also
@@ -38683,7 +38719,7 @@ def cuMulticastUnbind(mcHandle, dev, size_t mcOffset, size_t size):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
 
     See Also
@@ -38737,7 +38773,7 @@ def cuMulticastGetGranularity(prop : Optional[CUmulticastObjectProp], option not
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
     granularity : int
         Returned granularity.
@@ -38776,12 +38812,12 @@ def cuLogicalEndpointIdReserve(count):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`,
     baseLeId : :py:obj:`~.CUlogicalEndpointId`
-        If :py:obj:`~.cuLogicalEndpointIdReserve` returns CUDA_SUCCESS,
-        \\*baseLeId contains the base logical endpoint id of the reserved
-        logical endpoint id range.
+        If :py:obj:`~.cuLogicalEndpointIdReserve` returns
+        :py:obj:`~.CUDA_SUCCESS`, \\*baseLeId contains the base logical
+        endpoint id of the reserved logical endpoint id range.
 
     See Also
     --------
@@ -38823,7 +38859,7 @@ def cuLogicalEndpointIdRelease(baseLeId, count):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`,
 
     See Also
@@ -38900,7 +38936,7 @@ def cuLogicalEndpointCreate(leId, prop : Optional[CUlogicalEndpointProp]):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`,
 
     See Also
@@ -38930,7 +38966,7 @@ def cuLogicalEndpointAddDevice(leId, dev):
     Associates a device to a logical endpoint. The type of the logical
     endpoint must be :py:obj:`~.CU_LOGICAL_ENDPOINT_TYPE_MULTICAST`. The
     added device will be a part of the multicast team of size specified by
-    CUlogicalEndpointProp::multicast::numDevices during
+    :py:obj:`~.CUlogicalEndpointProp.multicast.numDevices` during
     :py:obj:`~.cuLogicalEndpointCreate`. The association of the device to
     the multicast logical endpoint is permanent during the life time of the
     multicast logical endpoint. All devices must be added to the multicast
@@ -38948,7 +38984,7 @@ def cuLogicalEndpointAddDevice(leId, dev):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`,
 
     See Also
@@ -38995,7 +39031,7 @@ def cuLogicalEndpointDestroy(leId):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`,
 
     See Also
@@ -39024,7 +39060,7 @@ def cuLogicalEndpointBindAddr(leId, dev, offset, ptr, size, unsigned long long f
     Binds the memory allocation specified by its mapped address ``ptr`` to
     a logical endpoint represented by ``leId`` at the offset ``offset``.
     The memory must have been allocated via :py:obj:`~.cuMemCreate` or
-    :py:obj:`~.cudaMallocAsync`. The intended ``size`` of the bind, the
+    :func:`~.cudaMallocAsync`. The intended ``size`` of the bind, the
     ``offset`` in the logical endpoint range and ``ptr`` must be multiples
     of the value for ``bindAlignment`` as returned by
     :py:obj:`~.cuLogicalEndpointGetLimits`.
@@ -39083,7 +39119,7 @@ def cuLogicalEndpointBindAddr(leId, dev, offset, ptr, size, unsigned long long f
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`, :py:obj:`~.CUDA_ERROR_SYSTEM_NOT_READY`, :py:obj:`~.CUDA_ERROR_ILLEGAL_STATE`
 
     See Also
@@ -39200,7 +39236,7 @@ def cuLogicalEndpointBindMem(leId, dev, offset, memHandle, memOffset, size, unsi
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`, :py:obj:`~.CUDA_ERROR_SYSTEM_NOT_READY`, :py:obj:`~.CUDA_ERROR_ILLEGAL_STATE`
 
     See Also
@@ -39285,7 +39321,7 @@ def cuLogicalEndpointUnbind(leId, dev, offset, size):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`, :py:obj:`~.CUDA_ERROR_SYSTEM_NOT_READY`, :py:obj:`~.CUDA_ERROR_ILLEGAL_STATE`
 
     See Also
@@ -39356,7 +39392,7 @@ def cuLogicalEndpointExport(leId, handleType not None : CUlogicalEndpointIpcHand
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`,
     handle : Any
         Pointer to the location in which to store the requested handle
@@ -39412,7 +39448,7 @@ def cuLogicalEndpointImport(leId, handle, handleType not None : CUlogicalEndpoin
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`,
 
     See Also
@@ -39455,7 +39491,7 @@ def cuLogicalEndpointGetLimits(prop : Optional[CUlogicalEndpointProp]):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`, :py:obj:`~.CUDA_ERROR_SYSTEM_NOT_READY`, :py:obj:`~.CUDA_ERROR_ILLEGAL_STATE`
     bindAlignment : :py:obj:`~.cuuint64_t`
         Minimum alignment granularity of the proposed logical endpoint.
@@ -39503,7 +39539,7 @@ def cuLogicalEndpointQuery(leId, count):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`, :py:obj:`~.CUDA_ERROR_SYSTEM_NOT_READY`, :py:obj:`~.CUDA_ERROR_ILLEGAL_STATE`
     queryStatus : int
         Status of the logical endpoints. Returns 0 if any logical endpoint
@@ -39609,17 +39645,16 @@ def cuPointerGetAttribute(attribute not None : CUpointer_attribute, ptr):
     - :py:obj:`~.CU_POINTER_ATTRIBUTE_P2P_TOKENS`:
 
     - Returns in ``*data`` two tokens for use with the nv-p2p.h Linux
-      kernel interface. ``data`` must be a struct of type
-      CUDA_POINTER_ATTRIBUTE_P2P_TOKENS.
+      kernel interface. ``data`` must be a struct of type
+      :py:obj:`~.CUDA_POINTER_ATTRIBUTE_P2P_TOKENS`.
 
     - ``ptr`` must be a pointer to memory obtained from
-      :py:obj:`~.py`:obj:`~.cuMemAlloc()`. Note that p2pToken and
-      vaSpaceToken are only valid for the lifetime of the source
-      allocation. A subsequent allocation at the same address may return
-      completely different tokens. Querying this attribute has a side
-      effect of setting the attribute
-      :py:obj:`~.CU_POINTER_ATTRIBUTE_SYNC_MEMOPS` for the region of memory
-      that ``ptr`` points to.
+      :py:obj:`~.cuMemAlloc`. Note that p2pToken and vaSpaceToken are
+      only valid for the lifetime of the source allocation. A subsequent
+      allocation at the same address may return completely different
+      tokens. Querying this attribute has a side effect of setting the
+      attribute :py:obj:`~.CU_POINTER_ATTRIBUTE_SYNC_MEMOPS` for the region
+      of memory that ``ptr`` points to.
 
     - :py:obj:`~.CU_POINTER_ATTRIBUTE_SYNC_MEMOPS`:
 
@@ -39656,7 +39691,7 @@ def cuPointerGetAttribute(attribute not None : CUpointer_attribute, ptr):
     - :py:obj:`~.CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE`:
 
     - Returns in ``*data`` a boolean that indicates if this pointer maps to
-      an allocation that is suitable for :py:obj:`~.cudaIpcGetMemHandle`.
+      an allocation that is suitable for :func:`~.cudaIpcGetMemHandle`.
 
     - :py:obj:`~.CU_POINTER_ATTRIBUTE_RANGE_START_ADDR`:
 
@@ -39726,14 +39761,14 @@ def cuPointerGetAttribute(attribute not None : CUpointer_attribute, ptr):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`
     data : Any
         Returned pointer attribute value
 
     See Also
     --------
-    :py:obj:`~.cuPointerSetAttribute`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostRegister`, :py:obj:`~.cuMemHostUnregister`, :py:obj:`~.cudaPointerGetAttributes`
+    :py:obj:`~.cuPointerSetAttribute`, :py:obj:`~.cuMemAlloc`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostRegister`, :py:obj:`~.cuMemHostUnregister`, :func:`~.cudaPointerGetAttributes`
     """
     cdef cydriver.CUdeviceptr cyptr
     if ptr is None:
@@ -39764,8 +39799,8 @@ def cuMemPrefetchAsync(devPtr, size_t count, location not None : CUmemLocation,
     specifies the destination location. ``count`` specifies the number of
     bytes to copy. ``hStream`` is the stream in which the operation is
     enqueued. The memory range must refer to managed memory allocated via
-    :py:obj:`~.cuMemAllocManaged`, via :py:obj:`~.cuMemAllocFromPool` from
-    a managed memory pool or declared via managed variables.
+    :py:obj:`~.cuMemAllocManaged`, via ``cuMemAllocFromPool`` from a
+    managed memory pool or declared via managed variables.
 
     Specifying :py:obj:`~.CU_MEM_LOCATION_TYPE_DEVICE` for
     :py:obj:`~.CUmemLocation.type` will prefetch memory to GPU specified by
@@ -39848,12 +39883,12 @@ def cuMemPrefetchAsync(devPtr, size_t count, location not None : CUmemLocation,
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`
 
     See Also
     --------
-    :py:obj:`~.cuMemcpy`, :py:obj:`~.cuMemcpyPeer`, :py:obj:`~.cuMemcpyAsync`, :py:obj:`~.cuMemcpy3DPeerAsync`, :py:obj:`~.cuMemAdvise`, :py:obj:`~.cudaMemPrefetchAsync`
+    :py:obj:`~.cuMemcpy`, :py:obj:`~.cuMemcpyPeer`, :py:obj:`~.cuMemcpyAsync`, :py:obj:`~.cuMemcpy3DPeerAsync`, :py:obj:`~.cuMemAdvise`, :func:`~.cudaMemPrefetchAsync`
     """
     cdef cydriver.CUstream cyhStream
     if hStream is None:
@@ -39978,7 +40013,8 @@ def cuMemAdvise(devPtr, size_t count, advice not None : CUmem_advise, location n
       created on that procesor as outlined in description for the advice
       :py:obj:`~.CU_MEM_ADVISE_SET_READ_MOSTLY`. If the memory region
       refers to valid system-allocated pageable memory, and
-      :py:obj:`~.CUmemLocation.type` is CU_MEM_LOCATION_TYPE_DEVICE then
+      :py:obj:`~.CUmemLocation.type` is
+      :py:obj:`~.CU_MEM_LOCATION_TYPE_DEVICE` then
       :py:obj:`~.CUmemLocation.id` must be a valid device that has a non-
       zero alue for the device attribute
       :py:obj:`~.CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS`.
@@ -40060,12 +40096,12 @@ def cuMemAdvise(devPtr, size_t count, advice not None : CUmem_advise, location n
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`
 
     See Also
     --------
-    :py:obj:`~.cuMemcpy`, :py:obj:`~.cuMemcpyPeer`, :py:obj:`~.cuMemcpyAsync`, :py:obj:`~.cuMemcpy3DPeerAsync`, :py:obj:`~.cuMemPrefetchAsync`, :py:obj:`~.cudaMemAdvise`
+    :py:obj:`~.cuMemcpy`, :py:obj:`~.cuMemcpyPeer`, :py:obj:`~.cuMemcpyAsync`, :py:obj:`~.cuMemcpy3DPeerAsync`, :py:obj:`~.cuMemPrefetchAsync`, :func:`~.cudaMemAdvise`
     """
     cdef cydriver.CUdeviceptr cydevPtr
     if devPtr is None:
@@ -40147,7 +40183,7 @@ def cuMemPrefetchBatchAsync(dptrs : Optional[tuple[CUdeviceptr] | list[CUdevicep
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
     """
     cdef cydriver.CUstream cyhStream
     if hStream is None:
@@ -40250,7 +40286,7 @@ def cuMemDiscardBatchAsync(dptrs : Optional[tuple[CUdeviceptr] | list[CUdevicept
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
     """
     cdef cydriver.CUstream cyhStream
     if hStream is None:
@@ -40358,7 +40394,7 @@ def cuMemDiscardAndPrefetchBatchAsync(dptrs : Optional[tuple[CUdeviceptr] | list
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
     """
     cdef cydriver.CUstream cyhStream
     if hStream is None:
@@ -40434,13 +40470,13 @@ def cuMemRangeGetAttribute(size_t dataSize, attribute not None : CUmem_range_att
       attribute is specified, ``data`` will be interpreted as a 32-bit
       integer, and ``dataSize`` must be 4. The result returned will be a
       GPU device id if all pages in the memory range have that GPU as their
-      preferred location, or it will be CU_DEVICE_CPU if all pages in the
-      memory range have the CPU as their preferred location, or it will be
-      CU_DEVICE_INVALID if either all the pages don't have the same
-      preferred location or some of the pages don't have a preferred
-      location at all. Note that the actual location of the pages in the
-      memory range at the time of the query may be different from the
-      preferred location.
+      preferred location, or it will be :py:obj:`~.CU_DEVICE_CPU` if all
+      pages in the memory range have the CPU as their preferred location,
+      or it will be :py:obj:`~.CU_DEVICE_INVALID` if either all the pages
+      don't have the same preferred location or some of the pages don't
+      have a preferred location at all. Note that the actual location of
+      the pages in the memory range at the time of the query may be
+      different from the preferred location.
 
     - :py:obj:`~.CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY`: If this attribute is
       specified, ``data`` will be interpreted as an array of 32-bit
@@ -40450,35 +40486,36 @@ def cuMemRangeGetAttribute(size_t dataSize, attribute not None : CUmem_range_att
       range. If any device does not have that advice set for the entire
       memory range, that device will not be included. If ``data`` is larger
       than the number of devices that have that advice set for that memory
-      range, CU_DEVICE_INVALID will be returned in all the extra space
-      provided. For ex., if ``dataSize`` is 12 (i.e. ``data`` has 3
-      elements) and only device 0 has the advice set, then the result
-      returned will be { 0, CU_DEVICE_INVALID, CU_DEVICE_INVALID }. If
-      ``data`` is smaller than the number of devices that have that advice
-      set, then only as many devices will be returned as can fit in the
-      array. There is no guarantee on which specific devices will be
-      returned, however.
+      range, :py:obj:`~.CU_DEVICE_INVALID` will be returned in all the
+      extra space provided. For ex., if ``dataSize`` is 12 (i.e. ``data``
+      has 3 elements) and only device 0 has the advice set, then the result
+      returned will be { 0, :py:obj:`~.CU_DEVICE_INVALID`,
+      :py:obj:`~.CU_DEVICE_INVALID` }. If ``data`` is smaller than the
+      number of devices that have that advice set, then only as many
+      devices will be returned as can fit in the array. There is no
+      guarantee on which specific devices will be returned, however.
 
     - :py:obj:`~.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION`: If this
       attribute is specified, ``data`` will be interpreted as a 32-bit
       integer, and ``dataSize`` must be 4. The result returned will be the
       last location to which all pages in the memory range were prefetched
       explicitly via :py:obj:`~.cuMemPrefetchAsync`. This will either be a
-      GPU id or CU_DEVICE_CPU depending on whether the last location for
-      prefetch was a GPU or the CPU respectively. If any page in the memory
-      range was never explicitly prefetched or if all pages were not
-      prefetched to the same location, CU_DEVICE_INVALID will be returned.
-      Note that this simply returns the last location that the application
-      requested to prefetch the memory range to. It gives no indication as
-      to whether the prefetch operation to that location has completed or
-      even begun.
+      GPU id or :py:obj:`~.CU_DEVICE_CPU` depending on whether the last
+      location for prefetch was a GPU or the CPU respectively. If any page
+      in the memory range was never explicitly prefetched or if all pages
+      were not prefetched to the same location,
+      :py:obj:`~.CU_DEVICE_INVALID` will be returned. Note that this simply
+      returns the last location that the application requested to prefetch
+      the memory range to. It gives no indication as to whether the
+      prefetch operation to that location has completed or even begun.
 
     - :py:obj:`~.CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION_TYPE`: If this
       attribute is specified, ``data`` will be interpreted as a
       :py:obj:`~.CUmemLocationType`, and ``dataSize`` must be
-      sizeof(CUmemLocationType). The :py:obj:`~.CUmemLocationType` returned
-      will be :py:obj:`~.CU_MEM_LOCATION_TYPE_DEVICE` if all pages in the
-      memory range have the same GPU as their preferred location, or
+      sizeof(:py:obj:`~.CUmemLocationType`). The
+      :py:obj:`~.CUmemLocationType` returned will be
+      :py:obj:`~.CU_MEM_LOCATION_TYPE_DEVICE` if all pages in the memory
+      range have the same GPU as their preferred location, or
       :py:obj:`~.CUmemLocationType` will be
       :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST` if all pages in the memory
       range have the CPU as their preferred location, or it will be
@@ -40504,9 +40541,9 @@ def cuMemRangeGetAttribute(size_t dataSize, attribute not None : CUmem_range_att
     - :py:obj:`~.CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION_TYPE`: If
       this attribute is specified, ``data`` will be interpreted as a
       :py:obj:`~.CUmemLocationType`, and ``dataSize`` must be
-      sizeof(CUmemLocationType). The result returned will be the last
-      location to which all pages in the memory range were prefetched
-      explicitly via :py:obj:`~.cuMemPrefetchAsync`. The
+      sizeof(:py:obj:`~.CUmemLocationType`). The result returned will be
+      the last location to which all pages in the memory range were
+      prefetched explicitly via :py:obj:`~.cuMemPrefetchAsync`. The
       :py:obj:`~.CUmemLocationType` returned will be
       :py:obj:`~.CU_MEM_LOCATION_TYPE_DEVICE` if the last prefetch location
       was a GPU or :py:obj:`~.CU_MEM_LOCATION_TYPE_HOST` if it was the CPU
@@ -40543,7 +40580,7 @@ def cuMemRangeGetAttribute(size_t dataSize, attribute not None : CUmem_range_att
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`
     data : Any
         A pointers to a memory location where the result of each attribute
@@ -40551,7 +40588,7 @@ def cuMemRangeGetAttribute(size_t dataSize, attribute not None : CUmem_range_att
 
     See Also
     --------
-    :py:obj:`~.cuMemRangeGetAttributes`, :py:obj:`~.cuMemPrefetchAsync`, :py:obj:`~.cuMemAdvise`, :py:obj:`~.cudaMemRangeGetAttribute`
+    :py:obj:`~.cuMemRangeGetAttributes`, :py:obj:`~.cuMemPrefetchAsync`, :py:obj:`~.cuMemAdvise`, :func:`~.cudaMemRangeGetAttribute`
     """
     cdef cydriver.CUdeviceptr cydevPtr
     if devPtr is None:
@@ -40621,7 +40658,7 @@ def cuMemRangeGetAttributes(dataSizes : tuple[int] | list[int], attributes : Opt
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`
     data : list[Any]
         A two-dimensional array containing pointers to memory locations
@@ -40629,7 +40666,7 @@ def cuMemRangeGetAttributes(dataSizes : tuple[int] | list[int], attributes : Opt
 
     See Also
     --------
-    :py:obj:`~.cuMemRangeGetAttribute`, :py:obj:`~.cuMemAdvise`, :py:obj:`~.cuMemPrefetchAsync`, :py:obj:`~.cudaMemRangeGetAttributes`
+    :py:obj:`~.cuMemRangeGetAttribute`, :py:obj:`~.cuMemAdvise`, :py:obj:`~.cuMemPrefetchAsync`, :func:`~.cudaMemRangeGetAttributes`
     """
     cdef cydriver.CUdeviceptr cydevPtr
     if devPtr is None:
@@ -40691,7 +40728,7 @@ def cuPointerSetAttribute(value, attribute not None : CUpointer_attribute, ptr):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`
 
     See Also
@@ -40757,7 +40794,7 @@ def cuPointerGetAttributes(unsigned int numAttributes, attributes : Optional[tup
     Unlike :py:obj:`~.cuPointerGetAttribute`, this function will not return
     an error when the ``ptr`` encountered is not a valid CUDA pointer.
     Instead, the attributes are assigned default NULL values and
-    CUDA_SUCCESS is returned.
+    :py:obj:`~.CUDA_SUCCESS` is returned.
 
     If ``ptr`` was not allocated by, mapped by, or registered with a
     :py:obj:`~.CUcontext` which uses UVA (Unified Virtual Addressing),
@@ -40775,7 +40812,7 @@ def cuPointerGetAttributes(unsigned int numAttributes, attributes : Optional[tup
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`
     data : list[Any]
         A two-dimensional array containing pointers to memory locations
@@ -40783,7 +40820,7 @@ def cuPointerGetAttributes(unsigned int numAttributes, attributes : Optional[tup
 
     See Also
     --------
-    :py:obj:`~.cuPointerGetAttribute`, :py:obj:`~.cuPointerSetAttribute`, :py:obj:`~.cudaPointerGetAttributes`
+    :py:obj:`~.cuPointerGetAttribute`, :py:obj:`~.cuPointerSetAttribute`, :func:`~.cudaPointerGetAttributes`
     """
     cdef cydriver.CUdeviceptr cyptr
     if ptr is None:
@@ -40833,14 +40870,14 @@ def cuStreamCreate(unsigned int Flags):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY` :py:obj:`~.CUDA_ERROR_EXTERNAL_DEVICE`
     phStream : :py:obj:`~.CUstream`
         Returned newly created stream
 
     See Also
     --------
-    :py:obj:`~.cuStreamDestroy`, :py:obj:`~.cuStreamCreateWithPriority`, :py:obj:`~.cuGreenCtxStreamCreate`, :py:obj:`~.cuStreamGetPriority`, :py:obj:`~.cuStreamGetFlags`, :py:obj:`~.cuStreamGetDevice` :py:obj:`~.cuStreamWaitEvent`, :py:obj:`~.cuStreamQuery`, :py:obj:`~.cuStreamSynchronize`, :py:obj:`~.cuStreamAddCallback`, :py:obj:`~.cudaStreamCreate`, :py:obj:`~.cudaStreamCreateWithFlags`
+    :py:obj:`~.cuStreamDestroy`, :py:obj:`~.cuStreamCreateWithPriority`, :py:obj:`~.cuGreenCtxStreamCreate`, :py:obj:`~.cuStreamGetPriority`, :py:obj:`~.cuStreamGetFlags`, :py:obj:`~.cuStreamGetDevice`, :py:obj:`~.cuStreamWaitEvent`, :py:obj:`~.cuStreamQuery`, :py:obj:`~.cuStreamSynchronize`, :py:obj:`~.cuStreamAddCallback`, :func:`~.cudaStreamCreate`, :func:`~.cudaStreamCreateWithFlags`
     """
     cdef CUstream phStream = CUstream()
     with nogil:
@@ -40882,14 +40919,14 @@ def cuStreamCreateWithPriority(unsigned int flags, int priority):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY` :py:obj:`~.CUDA_ERROR_EXTERNAL_DEVICE`
     phStream : :py:obj:`~.CUstream`
         Returned newly created stream
 
     See Also
     --------
-    :py:obj:`~.cuStreamDestroy`, :py:obj:`~.cuStreamCreate`, :py:obj:`~.cuGreenCtxStreamCreate`, :py:obj:`~.cuStreamGetPriority`, :py:obj:`~.cuCtxGetStreamPriorityRange`, :py:obj:`~.cuStreamGetFlags`, :py:obj:`~.cuStreamGetDevice`, :py:obj:`~.cuStreamWaitEvent`, :py:obj:`~.cuStreamQuery`, :py:obj:`~.cuStreamSynchronize`, :py:obj:`~.cuStreamAddCallback`, :py:obj:`~.cudaStreamCreateWithPriority`
+    :py:obj:`~.cuStreamDestroy`, :py:obj:`~.cuStreamCreate`, :py:obj:`~.cuGreenCtxStreamCreate`, :py:obj:`~.cuStreamGetPriority`, :py:obj:`~.cuCtxGetStreamPriorityRange`, :py:obj:`~.cuStreamGetFlags`, :py:obj:`~.cuStreamGetDevice`, :py:obj:`~.cuStreamWaitEvent`, :py:obj:`~.cuStreamQuery`, :py:obj:`~.cuStreamSynchronize`, :py:obj:`~.cuStreamAddCallback`, :func:`~.cudaStreamCreateWithPriority`
 
     Notes
     -----
@@ -40925,9 +40962,9 @@ def cuStreamBeginCaptureToCig(hStream, streamCigCaptureParams : Optional[CUstrea
     in order along with other graphics API commands in the command list.
 
     CIG stream capture may not be initiated if ``stream`` is
-    CU_STREAM_LEGACY. Capture must be ended on the same stream in which it
-    was initiated, and it may only be initiated if the stream is not
-    already in CIG capture mode.
+    :py:obj:`~.CU_STREAM_LEGACY`. Capture must be ended on the same stream
+    in which it was initiated, and it may only be initiated if the stream
+    is not already in CIG capture mode.
 
     The context must be also created in CIG mode previously, otherwise this
     operation will fail and :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT` will be
@@ -40956,11 +40993,11 @@ def cuStreamBeginCaptureToCig(hStream, streamCigCaptureParams : Optional[CUstrea
     CUDA work with regards to other CUDA work submitted under the same CIG
     context. Out-of-order execution can lead to device hangs or exceptions.
 
-    CIG capture mode operates similarly to ``cuStreamBeginCapture`` with
-    the ``CU_STREAM_CAPTURE_MODE_RELAXED`` option. There are additional
-    limitations to streams in CIG capture mode. The following functions are
-    not allowed for CIG streams whether directly or indirectly via a
-    recorded graph launch: :py:obj:`~.cuLaunchHostFunc`
+    CIG capture mode operates similarly to :func:`~.cuStreamBeginCapture`
+    with the :py:obj:`~.CU_STREAM_CAPTURE_MODE_RELAXED` option. There are
+    additional limitations to streams in CIG capture mode. The following
+    functions are not allowed for CIG streams whether directly or
+    indirectly via a recorded graph launch: :py:obj:`~.cuLaunchHostFunc`
     :py:obj:`~.cuStreamAddCallback` :py:obj:`~.cuStreamSynchronize`
     :py:obj:`~.cuStreamWaitValue32` :py:obj:`~.cuStreamWaitValue64`
     :py:obj:`~.cuStreamBatchMemOp` :py:obj:`~.cuStreamBeginCapture`
@@ -40976,7 +41013,7 @@ def cuStreamBeginCaptureToCig(hStream, streamCigCaptureParams : Optional[CUstrea
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`,
 
     See Also
@@ -41022,7 +41059,7 @@ def cuStreamEndCaptureToCig(hStream):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD`
 
     See Also
@@ -41064,7 +41101,7 @@ def cuStreamGetPriority(hStream):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`
     priority : int
         Pointer to a signed integer in which the stream's priority is
@@ -41072,7 +41109,7 @@ def cuStreamGetPriority(hStream):
 
     See Also
     --------
-    :py:obj:`~.cuStreamDestroy`, :py:obj:`~.cuStreamCreate`, :py:obj:`~.cuStreamCreateWithPriority`, :py:obj:`~.cuGreenCtxStreamCreate`, :py:obj:`~.cuCtxGetStreamPriorityRange`, :py:obj:`~.cuStreamGetFlags`, :py:obj:`~.cuStreamGetDevice`, :py:obj:`~.cudaStreamGetPriority`
+    :py:obj:`~.cuStreamDestroy`, :py:obj:`~.cuStreamCreate`, :py:obj:`~.cuStreamCreateWithPriority`, :py:obj:`~.cuGreenCtxStreamCreate`, :py:obj:`~.cuCtxGetStreamPriorityRange`, :py:obj:`~.cuStreamGetFlags`, :py:obj:`~.cuStreamGetDevice`, :func:`~.cudaStreamGetPriority`
     """
     cdef cydriver.CUstream cyhStream
     if hStream is None:
@@ -41105,7 +41142,7 @@ def cuStreamGetDevice(hStream):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`
     device : :py:obj:`~.CUdevice`
         Returns the device to which a stream belongs
@@ -41147,7 +41184,7 @@ def cuStreamGetFlags(hStream):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`
     flags : unsigned int
         Pointer to an unsigned integer in which the stream's flags are
@@ -41157,7 +41194,7 @@ def cuStreamGetFlags(hStream):
 
     See Also
     --------
-    :py:obj:`~.cuStreamDestroy`, :py:obj:`~.cuStreamCreate`, :py:obj:`~.cuGreenCtxStreamCreate`, :py:obj:`~.cuStreamGetPriority`, :py:obj:`~.cudaStreamGetFlags`, :py:obj:`~.cuStreamGetDevice`
+    :py:obj:`~.cuStreamDestroy`, :py:obj:`~.cuStreamCreate`, :py:obj:`~.cuGreenCtxStreamCreate`, :py:obj:`~.cuStreamGetPriority`, :func:`~.cudaStreamGetFlags`, :py:obj:`~.cuStreamGetDevice`
     """
     cdef cydriver.CUstream cyhStream
     if hStream is None:
@@ -41189,9 +41226,9 @@ def cuStreamGetId(hStream):
     - a stream created via any of the CUDA driver APIs such as
       :py:obj:`~.cuStreamCreate` and
       :py:obj:`~.cuStreamCreateWithPriority`, or their runtime API
-      equivalents such as :py:obj:`~.cudaStreamCreate`,
-      :py:obj:`~.cudaStreamCreateWithFlags` and
-      :py:obj:`~.cudaStreamCreateWithPriority`. Passing an invalid handle
+      equivalents such as :func:`~.cudaStreamCreate`,
+      :func:`~.cudaStreamCreateWithFlags` and
+      :func:`~.cudaStreamCreateWithPriority`. Passing an invalid handle
       will result in undefined behavior.
 
     - any of the special streams such as the NULL stream,
@@ -41207,14 +41244,14 @@ def cuStreamGetId(hStream):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`
     streamId : unsigned long long
         Pointer to store the Id of the stream
 
     See Also
     --------
-    :py:obj:`~.cuStreamDestroy`, :py:obj:`~.cuStreamCreate`, :py:obj:`~.cuStreamGetPriority`, :py:obj:`~.cudaStreamGetId`
+    :py:obj:`~.cuStreamDestroy`, :py:obj:`~.cuStreamCreate`, :py:obj:`~.cuStreamGetPriority`, :func:`~.cudaStreamGetId`
     """
     cdef cydriver.CUstream cyhStream
     if hStream is None:
@@ -41250,9 +41287,9 @@ def cuStreamGetCtx(hStream):
     - a stream created via any of the CUDA driver APIs such as
       :py:obj:`~.cuStreamCreate` and
       :py:obj:`~.cuStreamCreateWithPriority`, or their runtime API
-      equivalents such as :py:obj:`~.cudaStreamCreate`,
-      :py:obj:`~.cudaStreamCreateWithFlags` and
-      :py:obj:`~.cudaStreamCreateWithPriority`. The returned context is the
+      equivalents such as :func:`~.cudaStreamCreate`,
+      :func:`~.cudaStreamCreateWithFlags` and
+      :func:`~.cudaStreamCreateWithPriority`. The returned context is the
       context that was active in the calling thread when the stream was
       created. Passing an invalid handle will result in undefined behavior.
 
@@ -41272,14 +41309,14 @@ def cuStreamGetCtx(hStream):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
     pctx : :py:obj:`~.CUcontext`
         Returned context associated with the stream
 
     See Also
     --------
-    :py:obj:`~.cuStreamDestroy`, :py:obj:`~.cuStreamCreateWithPriority`, :py:obj:`~.cuStreamGetPriority`, :py:obj:`~.cuStreamGetFlags`, :py:obj:`~.cuStreamGetDevice` :py:obj:`~.cuStreamWaitEvent`, :py:obj:`~.cuStreamQuery`, :py:obj:`~.cuStreamSynchronize`, :py:obj:`~.cuStreamAddCallback`, :py:obj:`~.cudaStreamCreate`, :py:obj:`~.cudaStreamCreateWithFlags`
+    :py:obj:`~.cuStreamDestroy`, :py:obj:`~.cuStreamCreateWithPriority`, :py:obj:`~.cuStreamGetPriority`, :py:obj:`~.cuStreamGetFlags`, :py:obj:`~.cuStreamGetDevice`, :py:obj:`~.cuStreamWaitEvent`, :py:obj:`~.cuStreamQuery`, :py:obj:`~.cuStreamSynchronize`, :py:obj:`~.cuStreamAddCallback`, :func:`~.cudaStreamCreate`, :func:`~.cudaStreamCreateWithFlags`
     """
     cdef cydriver.CUstream cyhStream
     if hStream is None:
@@ -41317,9 +41354,9 @@ def cuStreamGetCtx_v2(hStream):
     - a stream created via any of the CUDA driver APIs such as
       :py:obj:`~.cuStreamCreate`, :py:obj:`~.cuStreamCreateWithPriority`
       and :py:obj:`~.cuGreenCtxStreamCreate`, or their runtime API
-      equivalents such as :py:obj:`~.cudaStreamCreate`,
-      :py:obj:`~.cudaStreamCreateWithFlags` and
-      :py:obj:`~.cudaStreamCreateWithPriority`. Passing an invalid handle
+      equivalents such as :func:`~.cudaStreamCreate`,
+      :func:`~.cudaStreamCreateWithFlags` and
+      :func:`~.cudaStreamCreateWithPriority`. Passing an invalid handle
       will result in undefined behavior.
 
     - any of the special streams such as the NULL stream,
@@ -41347,7 +41384,7 @@ def cuStreamGetCtx_v2(hStream):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`
     pCtx : :py:obj:`~.CUcontext`
         Returned regular context associated with the stream
@@ -41357,7 +41394,7 @@ def cuStreamGetCtx_v2(hStream):
 
     See Also
     --------
-    :py:obj:`~.cuStreamDestroy`, :py:obj:`~.cuStreamCreate` :py:obj:`~.cuStreamCreateWithPriority`, :py:obj:`~.cuGreenCtxStreamCreate`, :py:obj:`~.cuStreamGetPriority`, :py:obj:`~.cuStreamGetFlags`, :py:obj:`~.cuStreamGetDevice`, :py:obj:`~.cuStreamWaitEvent`, :py:obj:`~.cuStreamQuery`, :py:obj:`~.cuStreamSynchronize`, :py:obj:`~.cuStreamAddCallback`, :py:obj:`~.cudaStreamCreate`, :py:obj:`~.cudaStreamCreateWithFlags`,
+    :py:obj:`~.cuStreamDestroy`, :py:obj:`~.cuStreamCreate`, :py:obj:`~.cuStreamCreateWithPriority`, :py:obj:`~.cuGreenCtxStreamCreate`, :py:obj:`~.cuStreamGetPriority`, :py:obj:`~.cuStreamGetFlags`, :py:obj:`~.cuStreamGetDevice`, :py:obj:`~.cuStreamWaitEvent`, :py:obj:`~.cuStreamQuery`, :py:obj:`~.cuStreamSynchronize`, :py:obj:`~.cuStreamAddCallback`, :func:`~.cudaStreamCreate`, :func:`~.cudaStreamCreateWithFlags`
     """
     cdef cydriver.CUstream cyhStream
     if hStream is None:
@@ -41403,16 +41440,16 @@ def cuStreamWaitEvent(hStream, hEvent, unsigned int Flags):
     hEvent : :py:obj:`~.CUevent` or :py:obj:`~.cudaEvent_t`
         Event to wait on (may not be NULL)
     Flags : unsigned int
-        See :py:obj:`~.CUevent_capture_flags`
+        See ``CUevent_capture_flags``
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`,
 
     See Also
     --------
-    :py:obj:`~.cuStreamCreate`, :py:obj:`~.cuEventRecord`, :py:obj:`~.cuStreamQuery`, :py:obj:`~.cuStreamSynchronize`, :py:obj:`~.cuStreamAddCallback`, :py:obj:`~.cuStreamDestroy`, :py:obj:`~.cudaStreamWaitEvent`
+    :py:obj:`~.cuStreamCreate`, :py:obj:`~.cuEventRecord`, :py:obj:`~.cuStreamQuery`, :py:obj:`~.cuStreamSynchronize`, :py:obj:`~.cuStreamAddCallback`, :py:obj:`~.cuStreamDestroy`, :func:`~.cudaStreamWaitEvent`
     """
     cdef cydriver.CUevent cyhEvent
     if hEvent is None:
@@ -41455,9 +41492,10 @@ def cuStreamAddCallback(hStream, callback, userData, unsigned int flags):
     """ Add a callback to a compute stream.
 
     Adds a callback to be called on the host after all currently enqueued
-    items in the stream have completed. For each cuStreamAddCallback call,
-    the callback will be executed exactly once. The callback will block
-    later work in the stream until it is finished.
+    items in the stream have completed. For each
+    :func:`~.cuStreamAddCallback` call, the callback will be executed
+    exactly once. The callback will block later work in the stream until it
+    is finished.
 
     The callback may be passed :py:obj:`~.CUDA_SUCCESS` or an error code.
     In the event of a device error, all subsequently executed callbacks
@@ -41508,12 +41546,12 @@ def cuStreamAddCallback(hStream, callback, userData, unsigned int flags):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
 
     See Also
     --------
-    :py:obj:`~.cuStreamCreate`, :py:obj:`~.cuStreamQuery`, :py:obj:`~.cuStreamSynchronize`, :py:obj:`~.cuStreamWaitEvent`, :py:obj:`~.cuStreamDestroy`, :py:obj:`~.cuMemAllocManaged`, :py:obj:`~.cuStreamAttachMemAsync`, :py:obj:`~.cuLaunchHostFunc`, :py:obj:`~.cudaStreamAddCallback`
+    :py:obj:`~.cuStreamCreate`, :py:obj:`~.cuStreamQuery`, :py:obj:`~.cuStreamSynchronize`, :py:obj:`~.cuStreamWaitEvent`, :py:obj:`~.cuStreamDestroy`, :py:obj:`~.cuMemAllocManaged`, :py:obj:`~.cuStreamAttachMemAsync`, :py:obj:`~.cuLaunchHostFunc`, :func:`~.cudaStreamAddCallback`
 
     Notes
     -----
@@ -41563,11 +41601,12 @@ def cuStreamBeginCapture(hStream, mode not None : CUstreamCaptureMode):
     all operations pushed into the stream will not be executed, but will
     instead be captured into a graph, which will be returned via
     :py:obj:`~.cuStreamEndCapture`. Capture may not be initiated if
-    ``stream`` is CU_STREAM_LEGACY. Capture must be ended on the same
-    stream in which it was initiated, and it may only be initiated if the
-    stream is not already in capture mode. The capture mode may be queried
-    via :py:obj:`~.cuStreamIsCapturing`. A unique id representing the
-    capture sequence may be queried via :py:obj:`~.cuStreamGetCaptureInfo`.
+    ``stream`` is :py:obj:`~.CU_STREAM_LEGACY`. Capture must be ended on
+    the same stream in which it was initiated, and it may only be initiated
+    if the stream is not already in capture mode. The capture mode may be
+    queried via :py:obj:`~.cuStreamIsCapturing`. A unique id representing
+    the capture sequence may be queried via
+    :py:obj:`~.cuStreamGetCaptureInfo`.
 
     If ``mode`` is not :py:obj:`~.CU_STREAM_CAPTURE_MODE_RELAXED`,
     :py:obj:`~.cuStreamEndCapture` must be called on this stream from the
@@ -41584,7 +41623,7 @@ def cuStreamBeginCapture(hStream, mode not None : CUstreamCaptureMode):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
@@ -41627,13 +41666,13 @@ def cuStreamBeginRecaptureToGraph(hStream, mode not None : CUstreamCaptureMode,
     Any other node parameter mismatches during recapture can be configured
     to call the function provided in ``callbackFunc``. The recapture will
     fail immediately if the callback returns anything other than
-    CUDA_SUCCESS.
+    :py:obj:`~.CUDA_SUCCESS`.
 
     If the recapture fails for any reason, the ``graph`` will be in an
     undefined state and should be destroyed.
 
-    See cuStreamBeginCapture for additional detail on beginning the
-    capture.
+    See :py:obj:`~.cuStreamBeginCapture` for additional detail on beginning
+    the capture.
 
     Parameters
     ----------
@@ -41654,7 +41693,7 @@ def cuStreamBeginRecaptureToGraph(hStream, mode not None : CUstreamCaptureMode,
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`,
 
     See Also
@@ -41710,12 +41749,12 @@ def cuStreamBeginCaptureToGraph(hStream, hGraph, dependencies : Optional[tuple[C
     ``hGraph``. The graph will not be instantiable until the user calls
     :py:obj:`~.cuStreamEndCapture`.
 
-    Capture may not be initiated if ``stream`` is CU_STREAM_LEGACY. Capture
-    must be ended on the same stream in which it was initiated, and it may
-    only be initiated if the stream is not already in capture mode. The
-    capture mode may be queried via :py:obj:`~.cuStreamIsCapturing`. A
-    unique id representing the capture sequence may be queried via
-    :py:obj:`~.cuStreamGetCaptureInfo`.
+    Capture may not be initiated if ``stream`` is
+    :py:obj:`~.CU_STREAM_LEGACY`. Capture must be ended on the same stream
+    in which it was initiated, and it may only be initiated if the stream
+    is not already in capture mode. The capture mode may be queried via
+    :py:obj:`~.cuStreamIsCapturing`. A unique id representing the capture
+    sequence may be queried via :py:obj:`~.cuStreamGetCaptureInfo`.
 
     If ``mode`` is not :py:obj:`~.CU_STREAM_CAPTURE_MODE_RELAXED`,
     :py:obj:`~.cuStreamEndCapture` must be called on this stream from the
@@ -41741,7 +41780,7 @@ def cuStreamBeginCaptureToGraph(hStream, hGraph, dependencies : Optional[tuple[C
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
@@ -41819,8 +41858,8 @@ def cuThreadExchangeStreamCaptureMode(mode not None : CUstreamCaptureMode):
     **View CUDA Toolkit Documentation for a C++ code example**
 
     During stream capture (see :py:obj:`~.cuStreamBeginCapture`), some
-    actions, such as a call to :py:obj:`~.cudaMalloc`, may be unsafe. In
-    the case of :py:obj:`~.cudaMalloc`, the operation is not enqueued
+    actions, such as a call to :func:`~.cudaMalloc`, may be unsafe. In the
+    case of :func:`~.cudaMalloc`, the operation is not enqueued
     asynchronously to a stream, and is not observed by stream capture.
     Therefore, if the sequence of operations captured via
     :py:obj:`~.cuStreamBeginCapture` depended on the allocation being
@@ -41835,20 +41874,21 @@ def cuThreadExchangeStreamCaptureMode(mode not None : CUstreamCaptureMode):
 
     A thread's mode is one of the following:
 
-    - ``CU_STREAM_CAPTURE_MODE_GLOBAL:`` This is the default mode. If the
-      local thread has an ongoing capture sequence that was not initiated
-      with ``CU_STREAM_CAPTURE_MODE_RELAXED`` at ``cuStreamBeginCapture``,
-      or if any other thread has a concurrent capture sequence initiated
-      with ``CU_STREAM_CAPTURE_MODE_GLOBAL``, this thread is prohibited
+    - :py:obj:`~.CU_STREAM_CAPTURE_MODE_GLOBAL`: This is the default mode.
+      If the local thread has an ongoing capture sequence that was not
+      initiated with :py:obj:`~.CU_STREAM_CAPTURE_MODE_RELAXED` at
+      :func:`~.cuStreamBeginCapture`, or if any other thread has a
+      concurrent capture sequence initiated with
+      :py:obj:`~.CU_STREAM_CAPTURE_MODE_GLOBAL`, this thread is prohibited
       from potentially unsafe API calls.
 
-    - ``CU_STREAM_CAPTURE_MODE_THREAD_LOCAL:`` If the local thread has an
-      ongoing capture sequence not initiated with
-      ``CU_STREAM_CAPTURE_MODE_RELAXED``, it is prohibited from potentially
-      unsafe API calls. Concurrent capture sequences in other threads are
-      ignored.
+    - :py:obj:`~.CU_STREAM_CAPTURE_MODE_THREAD_LOCAL`: If the local thread
+      has an ongoing capture sequence not initiated with
+      :py:obj:`~.CU_STREAM_CAPTURE_MODE_RELAXED`, it is prohibited from
+      potentially unsafe API calls. Concurrent capture sequences in other
+      threads are ignored.
 
-    - ``CU_STREAM_CAPTURE_MODE_RELAXED:`` The local thread is not
+    - :py:obj:`~.CU_STREAM_CAPTURE_MODE_RELAXED`: The local thread is not
       prohibited from potentially unsafe API calls. Note that the thread is
       still prohibited from API calls which necessarily conflict with
       stream capture, for example, attempting :py:obj:`~.cuEventQuery` on
@@ -41861,7 +41901,7 @@ def cuThreadExchangeStreamCaptureMode(mode not None : CUstreamCaptureMode):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     mode : :py:obj:`~.CUstreamCaptureMode`
         Pointer to mode value to swap with the current mode
@@ -41901,7 +41941,7 @@ def cuStreamEndCapture(hStream):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD`
     phGraph : :py:obj:`~.CUgraph`
         The captured graph
@@ -41964,7 +42004,7 @@ def cuStreamIsCapturing(hStream):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_STREAM_CAPTURE_IMPLICIT`
     captureStatus : :py:obj:`~.CUstreamCaptureStatus`
         Returns the stream's capture status
@@ -42004,7 +42044,7 @@ def cuStreamGetCaptureInfo(hStream):
     Valid data (other than capture status) is returned only if both of the
     following are true:
 
-    - the call returns CUDA_SUCCESS
+    - the call returns :py:obj:`~.CUDA_SUCCESS`
 
     - the returned capture status is
       :py:obj:`~.CU_STREAM_CAPTURE_STATUS_ACTIVE`
@@ -42021,7 +42061,7 @@ def cuStreamGetCaptureInfo(hStream):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_STREAM_CAPTURE_IMPLICIT`, :py:obj:`~.CUDA_ERROR_LOSSY_QUERY`
     captureStatus_out : :py:obj:`~.CUstreamCaptureStatus`
         Location to return the capture status of the stream; required
@@ -42131,7 +42171,7 @@ def cuStreamUpdateCaptureDependencies(hStream, dependencies : Optional[tuple[CUg
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_ILLEGAL_STATE`
 
     See Also
@@ -42266,12 +42306,12 @@ def cuStreamAttachMemAsync(hStream, dptr, size_t length, unsigned int flags):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
 
     See Also
     --------
-    :py:obj:`~.cuStreamCreate`, :py:obj:`~.cuStreamQuery`, :py:obj:`~.cuStreamSynchronize`, :py:obj:`~.cuStreamWaitEvent`, :py:obj:`~.cuStreamDestroy`, :py:obj:`~.cuMemAllocManaged`, :py:obj:`~.cudaStreamAttachMemAsync`
+    :py:obj:`~.cuStreamCreate`, :py:obj:`~.cuStreamQuery`, :py:obj:`~.cuStreamSynchronize`, :py:obj:`~.cuStreamWaitEvent`, :py:obj:`~.cuStreamDestroy`, :py:obj:`~.cuMemAllocManaged`, :func:`~.cudaStreamAttachMemAsync`
     """
     cdef cydriver.CUdeviceptr cydptr
     if dptr is None:
@@ -42315,12 +42355,12 @@ def cuStreamQuery(hStream):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_NOT_READY`
 
     See Also
     --------
-    :py:obj:`~.cuStreamCreate`, :py:obj:`~.cuStreamWaitEvent`, :py:obj:`~.cuStreamDestroy`, :py:obj:`~.cuStreamSynchronize`, :py:obj:`~.cuStreamAddCallback`, :py:obj:`~.cudaStreamQuery`
+    :py:obj:`~.cuStreamCreate`, :py:obj:`~.cuStreamWaitEvent`, :py:obj:`~.cuStreamDestroy`, :py:obj:`~.cuStreamSynchronize`, :py:obj:`~.cuStreamAddCallback`, :func:`~.cudaStreamQuery`
     """
     cdef cydriver.CUstream cyhStream
     if hStream is None:
@@ -42355,12 +42395,12 @@ def cuStreamSynchronize(hStream):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`
 
     See Also
     --------
-    :py:obj:`~.cuStreamCreate`, :py:obj:`~.cuStreamDestroy`, :py:obj:`~.cuStreamWaitEvent`, :py:obj:`~.cuStreamQuery`, :py:obj:`~.cuStreamAddCallback`, :py:obj:`~.cudaStreamSynchronize`
+    :py:obj:`~.cuStreamCreate`, :py:obj:`~.cuStreamDestroy`, :py:obj:`~.cuStreamWaitEvent`, :py:obj:`~.cuStreamQuery`, :py:obj:`~.cuStreamAddCallback`, :func:`~.cudaStreamSynchronize`
     """
     cdef cydriver.CUstream cyhStream
     if hStream is None:
@@ -42396,12 +42436,12 @@ def cuStreamDestroy(hStream):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE` :py:obj:`~.CUDA_ERROR_EXTERNAL_DEVICE`
 
     See Also
     --------
-    :py:obj:`~.cuStreamCreate`, :py:obj:`~.cuStreamWaitEvent`, :py:obj:`~.cuStreamQuery`, :py:obj:`~.cuStreamSynchronize`, :py:obj:`~.cuStreamAddCallback`, :py:obj:`~.cudaStreamDestroy`
+    :py:obj:`~.cuStreamCreate`, :py:obj:`~.cuStreamWaitEvent`, :py:obj:`~.cuStreamQuery`, :py:obj:`~.cuStreamSynchronize`, :py:obj:`~.cuStreamAddCallback`, :func:`~.cudaStreamDestroy`
     """
     cdef cydriver.CUstream cyhStream
     if hStream is None:
@@ -42434,7 +42474,7 @@ def cuStreamCopyAttributes(dst, src):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
@@ -42480,7 +42520,7 @@ def cuStreamGetAttribute(hStream, attr not None : CUstreamAttrID):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`
     value_out : :py:obj:`~.CUstreamAttrValue`
 
@@ -42527,7 +42567,7 @@ def cuStreamSetAttribute(hStream, attr not None : CUstreamAttrID, value : Option
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`
 
     See Also
@@ -42583,14 +42623,14 @@ def cuEventCreate(unsigned int Flags):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`
     phEvent : :py:obj:`~.CUevent`
         Returns newly created event
 
     See Also
     --------
-    :py:obj:`~.cuEventRecord`, :py:obj:`~.cuEventQuery`, :py:obj:`~.cuEventSynchronize`, :py:obj:`~.cuEventDestroy`, :py:obj:`~.cuEventElapsedTime`, :py:obj:`~.cudaEventCreate`, :py:obj:`~.cudaEventCreateWithFlags`
+    :py:obj:`~.cuEventRecord`, :py:obj:`~.cuEventQuery`, :py:obj:`~.cuEventSynchronize`, :py:obj:`~.cuEventDestroy`, :py:obj:`~.cuEventElapsedTime`, :func:`~.cudaEventCreate`, :func:`~.cudaEventCreateWithFlags`
     """
     cdef CUevent phEvent = CUevent()
     with nogil:
@@ -42632,12 +42672,12 @@ def cuEventRecord(hEvent, hStream):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
     --------
-    :py:obj:`~.cuEventCreate`, :py:obj:`~.cuEventQuery`, :py:obj:`~.cuEventSynchronize`, :py:obj:`~.cuStreamWaitEvent`, :py:obj:`~.cuEventDestroy`, :py:obj:`~.cuEventElapsedTime`, :py:obj:`~.cudaEventRecord`, :py:obj:`~.cuEventRecordWithFlags`
+    :py:obj:`~.cuEventCreate`, :py:obj:`~.cuEventQuery`, :py:obj:`~.cuEventSynchronize`, :py:obj:`~.cuStreamWaitEvent`, :py:obj:`~.cuEventDestroy`, :py:obj:`~.cuEventElapsedTime`, :func:`~.cudaEventRecord`, :py:obj:`~.cuEventRecordWithFlags`
     """
     cdef cydriver.CUstream cyhStream
     if hStream is None:
@@ -42698,16 +42738,16 @@ def cuEventRecordWithFlags(hEvent, hStream, unsigned int flags):
     hStream : :py:obj:`~.CUstream` or :py:obj:`~.cudaStream_t`
         Stream to record event for
     flags : unsigned int
-        See :py:obj:`~.CUevent_capture_flags`
+        See ``CUevent_capture_flags``
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
     --------
-    :py:obj:`~.cuEventCreate`, :py:obj:`~.cuEventQuery`, :py:obj:`~.cuEventSynchronize`, :py:obj:`~.cuStreamWaitEvent`, :py:obj:`~.cuEventDestroy`, :py:obj:`~.cuEventElapsedTime`, :py:obj:`~.cuEventRecord`, :py:obj:`~.cudaEventRecord`
+    :py:obj:`~.cuEventCreate`, :py:obj:`~.cuEventQuery`, :py:obj:`~.cuEventSynchronize`, :py:obj:`~.cuStreamWaitEvent`, :py:obj:`~.cuEventDestroy`, :py:obj:`~.cuEventElapsedTime`, :py:obj:`~.cuEventRecord`, :func:`~.cudaEventRecord`
     """
     cdef cydriver.CUstream cyhStream
     if hStream is None:
@@ -42755,12 +42795,12 @@ def cuEventQuery(hEvent):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_READY`
 
     See Also
     --------
-    :py:obj:`~.cuEventCreate`, :py:obj:`~.cuEventRecord`, :py:obj:`~.cuEventSynchronize`, :py:obj:`~.cuEventDestroy`, :py:obj:`~.cuEventElapsedTime`, :py:obj:`~.cudaEventQuery`
+    :py:obj:`~.cuEventCreate`, :py:obj:`~.cuEventRecord`, :py:obj:`~.cuEventSynchronize`, :py:obj:`~.cuEventDestroy`, :py:obj:`~.cuEventElapsedTime`, :func:`~.cudaEventQuery`
     """
     cdef cydriver.CUevent cyhEvent
     if hEvent is None:
@@ -42799,12 +42839,12 @@ def cuEventSynchronize(hEvent):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`
 
     See Also
     --------
-    :py:obj:`~.cuEventCreate`, :py:obj:`~.cuEventRecord`, :py:obj:`~.cuEventQuery`, :py:obj:`~.cuEventDestroy`, :py:obj:`~.cuEventElapsedTime`, :py:obj:`~.cudaEventSynchronize`
+    :py:obj:`~.cuEventCreate`, :py:obj:`~.cuEventRecord`, :py:obj:`~.cuEventQuery`, :py:obj:`~.cuEventDestroy`, :py:obj:`~.cuEventElapsedTime`, :func:`~.cudaEventSynchronize`
     """
     cdef cydriver.CUevent cyhEvent
     if hEvent is None:
@@ -42840,12 +42880,12 @@ def cuEventDestroy(hEvent):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`
 
     See Also
     --------
-    :py:obj:`~.cuEventCreate`, :py:obj:`~.cuEventRecord`, :py:obj:`~.cuEventQuery`, :py:obj:`~.cuEventSynchronize`, :py:obj:`~.cuEventElapsedTime`, :py:obj:`~.cudaEventDestroy`
+    :py:obj:`~.cuEventCreate`, :py:obj:`~.cuEventRecord`, :py:obj:`~.cuEventQuery`, :py:obj:`~.cuEventSynchronize`, :py:obj:`~.cuEventElapsedTime`, :func:`~.cudaEventDestroy`
     """
     cdef cydriver.CUevent cyhEvent
     if hEvent is None:
@@ -42900,14 +42940,14 @@ def cuEventElapsedTime(hStart, hEnd):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_NOT_READY`, :py:obj:`~.CUDA_ERROR_UNKNOWN`
     pMilliseconds : float
         Time between ``hStart`` and ``hEnd`` in ms
 
     See Also
     --------
-    :py:obj:`~.cuEventCreate`, :py:obj:`~.cuEventRecord`, :py:obj:`~.cuEventQuery`, :py:obj:`~.cuEventSynchronize`, :py:obj:`~.cuEventDestroy`, :py:obj:`~.cudaEventElapsedTime`
+    :py:obj:`~.cuEventCreate`, :py:obj:`~.cuEventRecord`, :py:obj:`~.cuEventQuery`, :py:obj:`~.cuEventSynchronize`, :py:obj:`~.cuEventDestroy`, :func:`~.cudaEventElapsedTime`
     """
     cdef cydriver.CUevent cyhEnd
     if hEnd is None:
@@ -42956,84 +42996,93 @@ def cuImportExternalMemory(memHandleDesc : Optional[CUDA_EXTERNAL_MEMORY_HANDLE_
 
     If :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.type` is
     :py:obj:`~.CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD`, then
-    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.fd` must be a valid
-    file descriptor referencing a memory object. Ownership of the file
-    descriptor is transferred to the CUDA driver when the handle is
+    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.fd` must be a
+    valid file descriptor referencing a memory object. Ownership of the
+    file descriptor is transferred to the CUDA driver when the handle is
     imported successfully. Performing any operations on the file descriptor
     after it is imported results in undefined behavior.
 
     If :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.type` is
     :py:obj:`~.CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32`, then exactly
-    one of :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.handle`
-    and :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.name` must
-    not be NULL. If
-    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.handle` is not
-    NULL, then it must represent a valid shared NT handle that references a
-    memory object. Ownership of this handle is not transferred to CUDA
-    after the import operation, so the application must release the handle
-    using the appropriate system call. If
-    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.name` is not
-    NULL, then it must point to a NULL-terminated array of UTF-16
+    one of
+    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.handle`
+    and
+    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.name`
+    must not be NULL. If
+    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.handle`
+    is not NULL, then it must represent a valid shared NT handle that
+    references a memory object. Ownership of this handle is not transferred
+    to CUDA after the import operation, so the application must release the
+    handle using the appropriate system call. If
+    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.name`
+    is not NULL, then it must point to a NULL-terminated array of UTF-16
     characters that refers to a memory object.
 
     If :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.type` is
     :py:obj:`~.CU_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_KMT`, then
-    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.handle` must
-    be non-NULL and
-    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.name` must be
-    NULL. The handle specified must be a globally shared KMT handle. This
-    handle does not hold a reference to the underlying object, and thus
-    will be invalid when all references to the memory object are destroyed.
+    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.handle`
+    must be non-NULL and
+    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.name`
+    must be NULL. The handle specified must be a globally shared KMT
+    handle. This handle does not hold a reference to the underlying object,
+    and thus will be invalid when all references to the memory object are
+    destroyed.
 
     If :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.type` is
     :py:obj:`~.CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_HEAP`, then exactly one
-    of :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.handle` and
-    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.name` must not
-    be NULL. If
-    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.handle` is not
-    NULL, then it must represent a valid shared NT handle that is returned
-    by ID3D12Device::CreateSharedHandle when referring to a ID3D12Heap
-    object. This handle holds a reference to the underlying object. If
-    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.name` is not
-    NULL, then it must point to a NULL-terminated array of UTF-16
+    of
+    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.handle`
+    and
+    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.name`
+    must not be NULL. If
+    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.handle`
+    is not NULL, then it must represent a valid shared NT handle that is
+    returned by ID3D12Device::CreateSharedHandle when referring to a
+    ID3D12Heap object. This handle holds a reference to the underlying
+    object. If
+    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.name`
+    is not NULL, then it must point to a NULL-terminated array of UTF-16
     characters that refers to a ID3D12Heap object.
 
     If :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.type` is
     :py:obj:`~.CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D12_RESOURCE`, then exactly
-    one of :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.handle`
-    and :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.name` must
-    not be NULL. If
-    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.handle` is not
-    NULL, then it must represent a valid shared NT handle that is returned
-    by ID3D12Device::CreateSharedHandle when referring to a ID3D12Resource
-    object. This handle holds a reference to the underlying object. If
-    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.name` is not
-    NULL, then it must point to a NULL-terminated array of UTF-16
+    one of
+    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.handle`
+    and
+    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.name`
+    must not be NULL. If
+    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.handle`
+    is not NULL, then it must represent a valid shared NT handle that is
+    returned by ID3D12Device::CreateSharedHandle when referring to a
+    ID3D12Resource object. This handle holds a reference to the underlying
+    object. If
+    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.name`
+    is not NULL, then it must point to a NULL-terminated array of UTF-16
     characters that refers to a ID3D12Resource object.
 
     If :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.type` is
     :py:obj:`~.CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE`, then
-    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.handle` must
-    represent a valid shared NT handle that is returned by
+    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.handle`
+    must represent a valid shared NT handle that is returned by
     IDXGIResource1::CreateSharedHandle when referring to a ID3D11Resource
     object. If
-    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.name` is not
-    NULL, then it must point to a NULL-terminated array of UTF-16
+    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.name`
+    is not NULL, then it must point to a NULL-terminated array of UTF-16
     characters that refers to a ID3D11Resource object.
 
     If :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.type` is
     :py:obj:`~.CU_EXTERNAL_MEMORY_HANDLE_TYPE_D3D11_RESOURCE_KMT`, then
-    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.handle` must
-    represent a valid shared KMT handle that is returned by
+    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.handle`
+    must represent a valid shared KMT handle that is returned by
     IDXGIResource::GetSharedHandle when referring to a ID3D11Resource
     object and
-    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.name` must be
-    NULL.
+    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.win32.name`
+    must be NULL.
 
     If :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.type` is
     :py:obj:`~.CU_EXTERNAL_MEMORY_HANDLE_TYPE_NVSCIBUF`, then
-    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.nvSciBufObject` must
-    be non-NULL and reference a valid NvSciBuf object. If the NvSciBuf
+    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.nvSciBufObject`
+    must be non-NULL and reference a valid NvSciBuf object. If the NvSciBuf
     object imported into CUDA is also mapped by other drivers, then the
     application must use :py:obj:`~.cuWaitExternalSemaphoresAsync` or
     :py:obj:`~.cuSignalExternalSemaphoresAsync` as appropriate barriers to
@@ -43044,8 +43093,8 @@ def cuImportExternalMemory(memHandleDesc : Optional[CUDA_EXTERNAL_MEMORY_HANDLE_
 
     If :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.type` is
     :py:obj:`~.CU_EXTERNAL_MEMORY_HANDLE_TYPE_DMABUF_FD`, then
-    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.fd` must be a valid
-    file descriptor referencing a dma_buf object and
+    :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.handle.fd` must be a
+    valid file descriptor referencing a dma_buf object and
     :py:obj:`~.CUDA_EXTERNAL_MEMORY_HANDLE_DESC.flags` must be zero.
     Importing a dma_buf object is supported only on Tegra Jetson platform
     starting with Thor series. Mapping an imported dma_buf object as CUDA
@@ -43071,7 +43120,7 @@ def cuImportExternalMemory(memHandleDesc : Optional[CUDA_EXTERNAL_MEMORY_HANDLE_
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_OPERATING_SYSTEM`
     extMem_out : :py:obj:`~.CUexternalMemory`
         Returned handle to an external memory object
@@ -43138,7 +43187,7 @@ def cuExternalMemoryGetMappedBuffer(extMem, bufferDesc : Optional[CUDA_EXTERNAL_
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`
     devPtr : :py:obj:`~.CUdeviceptr`
         Returned device pointer to buffer
@@ -43190,7 +43239,7 @@ def cuExternalMemoryGetMappedMipmappedArray(extMem, mipmapDesc : Optional[CUDA_E
     the mipmapped array is bound as a color target in the graphics API,
     then the flag :py:obj:`~.CUDA_ARRAY3D_COLOR_ATTACHMENT` must be
     specified in
-    :py:obj:`~.CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC.arrayDesc.Flags`.
+    :py:obj:`~.CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC.arrayDesc.Flags`.
     :py:obj:`~.CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC.numLevels`
     specifies the total number of levels in the mipmap chain.
 
@@ -43214,7 +43263,7 @@ def cuExternalMemoryGetMappedMipmappedArray(extMem, mipmapDesc : Optional[CUDA_E
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`
     mipmap : :py:obj:`~.CUmipmappedArray`
         Returned CUDA mipmapped array
@@ -43258,7 +43307,7 @@ def cuDestroyExternalMemory(extMem):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`
 
     See Also
@@ -43301,8 +43350,8 @@ def cuImportExternalSemaphore(semHandleDesc : Optional[CUDA_EXTERNAL_SEMAPHORE_H
 
     If :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.type` is
     :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD`, then
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.fd` must be a
-    valid file descriptor referencing a synchronization object. Ownership
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.fd` must be
+    a valid file descriptor referencing a synchronization object. Ownership
     of the file descriptor is transferred to the CUDA driver when the
     handle is imported successfully. Performing any operations on the file
     descriptor after it is imported results in undefined behavior.
@@ -43310,96 +43359,99 @@ def cuImportExternalSemaphore(semHandleDesc : Optional[CUDA_EXTERNAL_SEMAPHORE_H
     If :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.type` is
     :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32`, then
     exactly one of
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.handle` and
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.name` must
-    not be NULL. If
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.handle` is
-    not NULL, then it must represent a valid shared NT handle that
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.handle`
+    and
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.name`
+    must not be NULL. If
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.handle`
+    is not NULL, then it must represent a valid shared NT handle that
     references a synchronization object. Ownership of this handle is not
     transferred to CUDA after the import operation, so the application must
     release the handle using the appropriate system call. If
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.name` is
-    not NULL, then it must name a valid synchronization object.
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.name`
+    is not NULL, then it must name a valid synchronization object.
 
     If :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.type` is
     :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_KMT`, then
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.handle`
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.handle`
     must be non-NULL and
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.name` must
-    be NULL. The handle specified must be a globally shared KMT handle.
-    This handle does not hold a reference to the underlying object, and
-    thus will be invalid when all references to the synchronization object
-    are destroyed.
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.name`
+    must be NULL. The handle specified must be a globally shared KMT
+    handle. This handle does not hold a reference to the underlying object,
+    and thus will be invalid when all references to the synchronization
+    object are destroyed.
 
     If :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.type` is
     :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D12_FENCE`, then exactly
     one of
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.handle` and
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.name` must
-    not be NULL. If
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.handle` is
-    not NULL, then it must represent a valid shared NT handle that is
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.handle`
+    and
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.name`
+    must not be NULL. If
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.handle`
+    is not NULL, then it must represent a valid shared NT handle that is
     returned by ID3D12Device::CreateSharedHandle when referring to a
     ID3D12Fence object. This handle holds a reference to the underlying
     object. If
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.name` is
-    not NULL, then it must name a valid synchronization object that refers
-    to a valid ID3D12Fence object.
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.name`
+    is not NULL, then it must name a valid synchronization object that
+    refers to a valid ID3D12Fence object.
 
     If :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.type` is
     :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_FENCE`, then
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.handle`
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.handle`
     represents a valid shared NT handle that is returned by
     ID3D11Fence::CreateSharedHandle. If
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.name` is
-    not NULL, then it must name a valid synchronization object that refers
-    to a valid ID3D11Fence object.
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.name`
+    is not NULL, then it must name a valid synchronization object that
+    refers to a valid ID3D11Fence object.
 
     If :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.type` is
     :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC`, then
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.nvSciSyncObj`
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.nvSciSyncObj`
     represents a valid NvSciSyncObj.
 
     :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX`, then
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.handle`
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.handle`
     represents a valid shared NT handle that is returned by
     IDXGIResource1::CreateSharedHandle when referring to a IDXGIKeyedMutex
     object. If
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.name` is
-    not NULL, then it must name a valid synchronization object that refers
-    to a valid IDXGIKeyedMutex object.
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.name`
+    is not NULL, then it must name a valid synchronization object that
+    refers to a valid IDXGIKeyedMutex object.
 
     If :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.type` is
     :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT`,
     then
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.handle`
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.handle`
     represents a valid shared KMT handle that is returned by
     IDXGIResource::GetSharedHandle when referring to a IDXGIKeyedMutex
     object and
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.name` must
-    be NULL.
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.name`
+    must be NULL.
 
     If :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.type` is
     :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD`,
-    then :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.fd` must be
-    a valid file descriptor referencing a synchronization object. Ownership
-    of the file descriptor is transferred to the CUDA driver when the
-    handle is imported successfully. Performing any operations on the file
-    descriptor after it is imported results in undefined behavior.
+    then :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.fd`
+    must be a valid file descriptor referencing a synchronization object.
+    Ownership of the file descriptor is transferred to the CUDA driver when
+    the handle is imported successfully. Performing any operations on the
+    file descriptor after it is imported results in undefined behavior.
 
     If :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.type` is
     :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32`,
     then exactly one of
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.handle` and
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.name` must
-    not be NULL. If
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.handle` is
-    not NULL, then it must represent a valid shared NT handle that
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.handle`
+    and
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.name`
+    must not be NULL. If
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.handle`
+    is not NULL, then it must represent a valid shared NT handle that
     references a synchronization object. Ownership of this handle is not
     transferred to CUDA after the import operation, so the application must
     release the handle using the appropriate system call. If
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.name` is
-    not NULL, then it must name a valid synchronization object.
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC.handle.win32.name`
+    is not NULL, then it must name a valid synchronization object.
 
     Parameters
     ----------
@@ -43408,7 +43460,7 @@ def cuImportExternalSemaphore(semHandleDesc : Optional[CUDA_EXTERNAL_SEMAPHORE_H
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_OPERATING_SYSTEM`
     extSem_out : :py:obj:`~.CUexternalSemaphore`
         Returned handle to an external semaphore
@@ -43451,15 +43503,15 @@ def cuSignalExternalSemaphoresAsync(extSemArray : Optional[tuple[CUexternalSemap
     :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_FD`,
     :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32`
     then the semaphore will be set to the value specified in
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS.params.fence.value`.
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS.params.fence.value`.
 
     If the semaphore object is of the type
     :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC` this API sets
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS.params.nvSciSync.fence`
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS.params.nvSciSync.fence`
     to a value that can be used by subsequent waiters of the same NvSciSync
     object to order operations with those currently submitted in
     ``stream``. Such an update will overwrite previous contents of
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS.params.nvSciSync.fence`.
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS.params.nvSciSync.fence`.
     By default, signaling such an external semaphore object causes
     appropriate memory synchronization operations to be performed over all
     external memory objects that are imported as
@@ -43474,9 +43526,9 @@ def cuSignalExternalSemaphoresAsync(extSemArray : Optional[tuple[CUexternalSemap
     the type :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC`, if
     the NvSciSyncAttrList used to create the NvSciSyncObj had not set the
     flags in :py:obj:`~.cuDeviceGetNvSciSyncAttributes` to
-    CUDA_NVSCISYNC_ATTR_SIGNAL, this API will return
-    CUDA_ERROR_NOT_SUPPORTED. NvSciSyncFence associated with semaphore
-    object of the type
+    :py:obj:`~.CUDA_NVSCISYNC_ATTR_SIGNAL`, this API will return
+    :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`. NvSciSyncFence associated with
+    semaphore object of the type
     :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC` can be
     deterministic. For this the NvSciSyncAttrList used to create the
     semaphore object must have value of
@@ -43513,7 +43565,7 @@ def cuSignalExternalSemaphoresAsync(extSemArray : Optional[tuple[CUexternalSemap
     :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX`,
     :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT`
     then the keyed mutex will be released with the key specified in
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_PARAMS.params.keyedmutex.key`.
+    ``CUDA_EXTERNAL_SEMAPHORE_PARAMS.params.keyedmutex.key``.
 
     Parameters
     ----------
@@ -43528,7 +43580,7 @@ def cuSignalExternalSemaphoresAsync(extSemArray : Optional[tuple[CUexternalSemap
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
 
     See Also
@@ -43608,12 +43660,12 @@ def cuWaitExternalSemaphoresAsync(extSemArray : Optional[tuple[CUexternalSemapho
     :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_TIMELINE_SEMAPHORE_WIN32`
     then waiting on the semaphore will wait until the value of the
     semaphore is greater than or equal to
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS.params.fence.value`.
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS.params.fence.value`.
 
     If the semaphore object is of the type
     :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC` then, waiting
     on the semaphore will wait until the
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS.params.nvSciSync.fence`
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS.params.nvSciSync.fence`
     is signaled by the signaler of the NvSciSyncObj that was associated
     with this semaphore object. By default, waiting on such an external
     semaphore object causes appropriate memory synchronization operations
@@ -43629,17 +43681,17 @@ def cuWaitExternalSemaphoresAsync(extSemArray : Optional[tuple[CUexternalSemapho
     the type :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_NVSCISYNC`, if
     the NvSciSyncAttrList used to create the NvSciSyncObj had not set the
     flags in :py:obj:`~.cuDeviceGetNvSciSyncAttributes` to
-    CUDA_NVSCISYNC_ATTR_WAIT, this API will return
-    CUDA_ERROR_NOT_SUPPORTED.
+    :py:obj:`~.CUDA_NVSCISYNC_ATTR_WAIT`, this API will return
+    :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`.
 
     If the semaphore object is any one of the following types:
     :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX`,
     :py:obj:`~.CU_EXTERNAL_SEMAPHORE_HANDLE_TYPE_D3D11_KEYED_MUTEX_KMT`
     then the keyed mutex will be acquired when it is released with the key
     specified in
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS.params.keyedmutex.key`
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS.params.keyedmutex.key`
     or until the timeout specified by
-    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS.params.keyedmutex.timeoutMs`
+    :py:obj:`~.CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS.params.keyedmutex.timeoutMs`
     has lapsed. The timeout interval can either be a finite value specified
     in milliseconds or an infinite value. In case an infinite value is
     specified the timeout never elapses. The windows INFINITE macro must be
@@ -43658,7 +43710,7 @@ def cuWaitExternalSemaphoresAsync(extSemArray : Optional[tuple[CUexternalSemapho
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`, :py:obj:`~.CUDA_ERROR_TIMEOUT`
 
     See Also
@@ -43726,7 +43778,7 @@ def cuDestroyExternalSemaphore(extSem):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`
 
     See Also
@@ -43763,9 +43815,9 @@ def cuStreamWaitValue32(stream, addr, value, unsigned int flags):
     :py:obj:`~.cuMemHostGetDevicePointer()`. This function cannot be used
     with managed memory (:py:obj:`~.cuMemAllocManaged`).
 
-    Support for CU_STREAM_WAIT_VALUE_NOR can be queried with
+    Support for :py:obj:`~.CU_STREAM_WAIT_VALUE_NOR` can be queried with
     :py:obj:`~.cuDeviceGetAttribute()` and
-    :py:obj:`~.CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR_V2`.
+    ``CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR_V2``.
 
     Parameters
     ----------
@@ -43780,7 +43832,7 @@ def cuStreamWaitValue32(stream, addr, value, unsigned int flags):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
 
     See Also
@@ -43852,7 +43904,7 @@ def cuStreamWaitValue64(stream, addr, value, unsigned int flags):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
 
     See Also
@@ -43918,7 +43970,7 @@ def cuStreamWriteValue32(stream, addr, value, unsigned int flags):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
 
     See Also
@@ -43982,7 +44034,7 @@ def cuStreamWriteValue64(stream, addr, value, unsigned int flags):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
 
     See Also
@@ -44052,7 +44104,7 @@ def cuStreamBatchMemOp(stream, unsigned int count, paramArray : Optional[tuple[C
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
 
     See Also
@@ -44134,8 +44186,8 @@ def cuFuncGetAttribute(attrib not None : CUfunction_attribute, hfunc):
       value of 10 for legacy cubins that do not have a properly-encoded
       binary architecture version.
 
-    - :py:obj:`~.CU_FUNC_CACHE_MODE_CA`: The attribute to indicate whether
-      the function has been compiled with user specified option "-Xptxas
+    - :py:obj:`~.CU_FUNC_ATTRIBUTE_CACHE_MODE_CA`: The attribute to indicate
+      whether the function has been compiled with user specified option "-Xptxas
       --dlcm=ca" set .
 
     - :py:obj:`~.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES`: The
@@ -44163,28 +44215,28 @@ def cuFuncGetAttribute(attrib not None : CUfunction_attribute, hfunc):
       cluster size. 1 is allowed, 0 is disallowed. A non-portable cluster
       size may only function on the specific SKUs the program is tested on.
       The launch might fail if the program is run on a different hardware
-      platform. CUDA API provides cudaOccupancyMaxActiveClusters to assist
-      with checking whether the desired size can be launched on the current
-      device. A portable cluster size is guaranteed to be functional on all
-      compute capabilities higher than the target compute capability. The
-      portable cluster size for sm_90 is 8 blocks per cluster. This value
-      may increase for future compute capabilities. The specific hardware
-      unit may support higher cluster sizes that’s not guaranteed to be
-      portable.
+      platform. CUDA API provides :func:`~.cudaOccupancyMaxActiveClusters`
+      to assist with checking whether the desired size can be launched on
+      the current device. A portable cluster size is guaranteed to be
+      functional on all compute capabilities higher than the target compute
+      capability. The portable cluster size for sm_90 is 8 blocks per
+      cluster. This value may increase for future compute capabilities. The
+      specific hardware unit may support higher cluster sizes that’s not
+      guaranteed to be portable.
 
     - :py:obj:`~.CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE`:
       The block scheduling policy of a function. The value type is
-      CUclusterSchedulingPolicy.
+      :py:obj:`~.CUclusterSchedulingPolicy`.
 
     With a few execeptions, function attributes may also be queried on
     unloaded function handles returned from
     :py:obj:`~.cuModuleEnumerateFunctions`.
     :py:obj:`~.CUDA_ERROR_FUNCTION_NOT_LOADED` is returned if the attribute
     requires a fully loaded function but the function is not loaded. The
-    loading state of a function may be queried using
-    :py:obj:`~.cuFuncIsloaded`. :py:obj:`~.cuFuncLoad` may be called to
-    explicitly load a function before querying the following attributes
-    that require the function to be loaded:
+    loading state of a function may be queried using :py:obj:`~.cuFuncIsLoaded`.
+    :py:obj:`~.cuFuncLoad` may be called to explicitly load a function
+    before querying the following attributes that require the function to
+    be loaded:
 
     - :py:obj:`~.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK`
 
@@ -44201,14 +44253,14 @@ def cuFuncGetAttribute(attrib not None : CUfunction_attribute, hfunc):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_FUNCTION_NOT_LOADED`
     pi : int
         Returned attribute value
 
     See Also
     --------
-    :py:obj:`~.cuCtxGetCacheConfig`, :py:obj:`~.cuCtxSetCacheConfig`, :py:obj:`~.cuFuncSetCacheConfig`, :py:obj:`~.cuLaunchKernel`, :py:obj:`~.cudaFuncGetAttributes`, :py:obj:`~.cudaFuncSetAttribute`, :py:obj:`~.cuFuncIsLoaded`, :py:obj:`~.cuFuncLoad`, :py:obj:`~.cuKernelGetAttribute`
+    :py:obj:`~.cuCtxGetCacheConfig`, :py:obj:`~.cuCtxSetCacheConfig`, :py:obj:`~.cuFuncSetCacheConfig`, :py:obj:`~.cuLaunchKernel`, :func:`~.cudaFuncGetAttributes`, :func:`~.cudaFuncSetAttribute`, :py:obj:`~.cuFuncIsLoaded`, :py:obj:`~.cuFuncLoad`, :py:obj:`~.cuKernelGetAttribute`
     """
     cdef cydriver.CUfunction cyhfunc
     if hfunc is None:
@@ -44235,12 +44287,13 @@ def cuFuncSetAttribute(hfunc, attrib not None : CUfunction_attribute, int value)
 
     This call sets the value of a specified attribute ``attrib`` on the
     kernel given by ``hfunc`` to an integer value specified by ``val`` This
-    function returns CUDA_SUCCESS if the new value of the attribute could
-    be successfully set. If the set fails, this call will return an error.
-    Not all attributes can have values set. Attempting to set a value on a
-    read-only attribute will result in an error (CUDA_ERROR_INVALID_VALUE)
+    function returns :py:obj:`~.CUDA_SUCCESS` if the new value of the
+    attribute could be successfully set. If the set fails, this call will
+    return an error. Not all attributes can have values set. Attempting to
+    set a value on a read-only attribute will result in an error
+    (:py:obj:`~.CUDA_ERROR_INVALID_VALUE`)
 
-    Supported attributes for the cuFuncSetAttribute call are:
+    Supported attributes for the :func:`~.cuFuncSetAttribute` call are:
 
     - :py:obj:`~.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES`: This
       maximum size in bytes of dynamically-allocated shared memory. The
@@ -44265,21 +44318,21 @@ def cuFuncSetAttribute(hfunc, attrib not None : CUfunction_attribute, int value)
       either all be 0 or all be positive. The validity of the cluster
       dimensions is checked at launch time. If the value is set during
       compile time, it cannot be set at runtime. Setting it at runtime will
-      return CUDA_ERROR_NOT_PERMITTED.
+      return :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`.
 
     - :py:obj:`~.CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT`: The required
       cluster height in blocks. The width, height, and depth values must
       either all be 0 or all be positive. The validity of the cluster
       dimensions is checked at launch time. If the value is set during
       compile time, it cannot be set at runtime. Setting it at runtime will
-      return CUDA_ERROR_NOT_PERMITTED.
+      return :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`.
 
     - :py:obj:`~.CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH`: The required
       cluster depth in blocks. The width, height, and depth values must
       either all be 0 or all be positive. The validity of the cluster
       dimensions is checked at launch time. If the value is set during
       compile time, it cannot be set at runtime. Setting it at runtime will
-      return CUDA_ERROR_NOT_PERMITTED.
+      return :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`.
 
     - :py:obj:`~.CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED`:
       Indicates whether the function can be launched with non-portable
@@ -44287,7 +44340,7 @@ def cuFuncSetAttribute(hfunc, attrib not None : CUfunction_attribute, int value)
 
     - :py:obj:`~.CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE`:
       The block scheduling policy of a function. The value type is
-      CUclusterSchedulingPolicy.
+      :py:obj:`~.CUclusterSchedulingPolicy`.
 
     Parameters
     ----------
@@ -44300,12 +44353,12 @@ def cuFuncSetAttribute(hfunc, attrib not None : CUfunction_attribute, int value)
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
     --------
-    :py:obj:`~.cuCtxGetCacheConfig`, :py:obj:`~.cuCtxSetCacheConfig`, :py:obj:`~.cuFuncSetCacheConfig`, :py:obj:`~.cuLaunchKernel`, :py:obj:`~.cudaFuncGetAttributes`, :py:obj:`~.cudaFuncSetAttribute`, :py:obj:`~.cuKernelSetAttribute`
+    :py:obj:`~.cuCtxGetCacheConfig`, :py:obj:`~.cuCtxSetCacheConfig`, :py:obj:`~.cuFuncSetCacheConfig`, :py:obj:`~.cuLaunchKernel`, :func:`~.cudaFuncGetAttributes`, :func:`~.cudaFuncSetAttribute`, :py:obj:`~.cuKernelSetAttribute`
     """
     cdef cydriver.CUfunction cyhfunc
     if hfunc is None:
@@ -44367,12 +44420,12 @@ def cuFuncSetCacheConfig(hfunc, config not None : CUfunc_cache):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`
 
     See Also
     --------
-    :py:obj:`~.cuCtxGetCacheConfig`, :py:obj:`~.cuCtxSetCacheConfig`, :py:obj:`~.cuFuncGetAttribute`, :py:obj:`~.cuLaunchKernel`, :py:obj:`~.cudaFuncSetCacheConfig`, :py:obj:`~.cuKernelSetCacheConfig`
+    :py:obj:`~.cuCtxGetCacheConfig`, :py:obj:`~.cuCtxSetCacheConfig`, :py:obj:`~.cuFuncGetAttribute`, :py:obj:`~.cuLaunchKernel`, :func:`~.cudaFuncSetCacheConfig`, :py:obj:`~.cuKernelSetCacheConfig`
     """
     cdef cydriver.CUfunction cyhfunc
     if hfunc is None:
@@ -44411,7 +44464,7 @@ def cuFuncGetModule(hfunc):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_FOUND`
     hmod : :py:obj:`~.CUmodule`
         Returned module handle
@@ -44453,7 +44506,7 @@ def cuFuncGetName(hfunc):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`,
     name : bytes
         The returned name of the function
@@ -44484,11 +44537,10 @@ def cuFuncGetParamInfo(func, size_t paramIndex):
     parameters, and returns in ``paramOffset`` and ``paramSize`` the offset
     and size, respectively, where the parameter will reside in the device-
     side parameter layout. This information can be used to update kernel
-    node parameters from the device via
-    :py:obj:`~.cudaGraphKernelNodeSetParam()` and
-    :py:obj:`~.cudaGraphKernelNodeUpdatesApply()`. ``paramIndex`` must be
-    less than the number of parameters that ``func`` takes. ``paramSize``
-    can be set to NULL if only the parameter offset is desired.
+    node parameters from the device via ``cudaGraphKernelNodeSetParam()``
+    and ``cudaGraphKernelNodeUpdatesApply()``. ``paramIndex`` must be less
+    than the number of parameters that ``func`` takes. ``paramSize`` can be
+    set to NULL if only the parameter offset is desired.
 
     Parameters
     ----------
@@ -44499,7 +44551,7 @@ def cuFuncGetParamInfo(func, size_t paramIndex):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`,
     paramOffset : int
         Returns the offset into the device-side parameter layout at which
@@ -44545,7 +44597,7 @@ def cuFuncGetParamCount(func):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     paramCount : int
         Returns the number of parameters used by the function
@@ -44585,7 +44637,7 @@ def cuFuncIsLoaded(function):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     state : :py:obj:`~.CUfunctionLoadingState`
         returned loading state
@@ -44626,7 +44678,7 @@ def cuFuncLoad(function):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
@@ -44749,12 +44801,12 @@ def cuLaunchKernel(f, unsigned int gridDimX, unsigned int gridDimY, unsigned int
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_INVALID_IMAGE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_LAUNCH_FAILED`, :py:obj:`~.CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES`, :py:obj:`~.CUDA_ERROR_LAUNCH_TIMEOUT`, :py:obj:`~.CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING`, :py:obj:`~.CUDA_ERROR_SHARED_OBJECT_INIT_FAILED`, :py:obj:`~.CUDA_ERROR_NOT_FOUND`
 
     See Also
     --------
-    :py:obj:`~.cuCtxGetCacheConfig`, :py:obj:`~.cuCtxSetCacheConfig`, :py:obj:`~.cuFuncSetCacheConfig`, :py:obj:`~.cuFuncGetAttribute`, :py:obj:`~.cudaLaunchKernel`, :py:obj:`~.cuLibraryGetKernel`, :py:obj:`~.cuKernelSetCacheConfig`, :py:obj:`~.cuKernelGetAttribute`, :py:obj:`~.cuKernelSetAttribute`
+    :py:obj:`~.cuCtxGetCacheConfig`, :py:obj:`~.cuCtxSetCacheConfig`, :py:obj:`~.cuFuncSetCacheConfig`, :py:obj:`~.cuFuncGetAttribute`, :func:`~.cudaLaunchKernel`, :py:obj:`~.cuLibraryGetKernel`, :py:obj:`~.cuKernelSetCacheConfig`, :py:obj:`~.cuKernelGetAttribute`, :py:obj:`~.cuKernelSetAttribute`
     """
     cdef cydriver.CUstream cyhStream
     if hStream is None:
@@ -44915,7 +44967,7 @@ def cuLaunchKernelEx(config : Optional[CUlaunchConfig], f, kernelParams, void_pt
     update the node's kernel parameters from within another kernel. For
     more information on the types of device updates that can be made, as
     well as the relevant limitations thereof, see
-    :py:obj:`~.cudaGraphKernelNodeUpdatesApply`.
+    ``cudaGraphKernelNodeUpdatesApply``.
 
     Kernel nodes which are device-updatable have additional restrictions
     compared to regular kernel nodes. Firstly, device-updatable nodes
@@ -44997,12 +45049,12 @@ def cuLaunchKernelEx(config : Optional[CUlaunchConfig], f, kernelParams, void_pt
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_INVALID_IMAGE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_LAUNCH_FAILED`, :py:obj:`~.CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES`, :py:obj:`~.CUDA_ERROR_LAUNCH_TIMEOUT`, :py:obj:`~.CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING`, :py:obj:`~.CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE`, :py:obj:`~.CUDA_ERROR_SHARED_OBJECT_INIT_FAILED`, :py:obj:`~.CUDA_ERROR_NOT_FOUND`
 
     See Also
     --------
-    :py:obj:`~.cuCtxGetCacheConfig`, :py:obj:`~.cuCtxSetCacheConfig`, :py:obj:`~.cuFuncSetCacheConfig`, :py:obj:`~.cuFuncGetAttribute`, :py:obj:`~.cudaLaunchKernel`, :py:obj:`~.cudaLaunchKernelEx`, :py:obj:`~.cuLibraryGetKernel`, :py:obj:`~.cuKernelSetCacheConfig`, :py:obj:`~.cuKernelGetAttribute`, :py:obj:`~.cuKernelSetAttribute`
+    :py:obj:`~.cuCtxGetCacheConfig`, :py:obj:`~.cuCtxSetCacheConfig`, :py:obj:`~.cuFuncSetCacheConfig`, :py:obj:`~.cuFuncGetAttribute`, :func:`~.cudaLaunchKernel`, ``cudaLaunchKernelEx``, :py:obj:`~.cuLibraryGetKernel`, :py:obj:`~.cuKernelSetCacheConfig`, :py:obj:`~.cuKernelGetAttribute`, :py:obj:`~.cuKernelSetAttribute`
     """
     cdef cydriver.CUfunction cyf
     if f is None:
@@ -45103,12 +45155,12 @@ def cuLaunchCooperativeKernel(f, unsigned int gridDimX, unsigned int gridDimY, u
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_INVALID_IMAGE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_LAUNCH_FAILED`, :py:obj:`~.CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES`, :py:obj:`~.CUDA_ERROR_LAUNCH_TIMEOUT`, :py:obj:`~.CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING`, :py:obj:`~.CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE`, :py:obj:`~.CUDA_ERROR_SHARED_OBJECT_INIT_FAILED`, :py:obj:`~.CUDA_ERROR_NOT_FOUND`
 
     See Also
     --------
-    :py:obj:`~.cuCtxGetCacheConfig`, :py:obj:`~.cuCtxSetCacheConfig`, :py:obj:`~.cuFuncSetCacheConfig`, :py:obj:`~.cuFuncGetAttribute`, :py:obj:`~.cuLaunchCooperativeKernelMultiDevice`, :py:obj:`~.cudaLaunchCooperativeKernel`, :py:obj:`~.cuLibraryGetKernel`, :py:obj:`~.cuKernelSetCacheConfig`, :py:obj:`~.cuKernelGetAttribute`, :py:obj:`~.cuKernelSetAttribute`
+    :py:obj:`~.cuCtxGetCacheConfig`, :py:obj:`~.cuCtxSetCacheConfig`, :py:obj:`~.cuFuncSetCacheConfig`, :py:obj:`~.cuFuncGetAttribute`, :py:obj:`~.cuLaunchCooperativeKernelMultiDevice`, :func:`~.cudaLaunchCooperativeKernel`, :py:obj:`~.cuLibraryGetKernel`, :py:obj:`~.cuKernelSetCacheConfig`, :py:obj:`~.cuKernelGetAttribute`, :py:obj:`~.cuKernelSetAttribute`
     """
     cdef cydriver.CUstream cyhStream
     if hStream is None:
@@ -45202,7 +45254,7 @@ def cuLaunchHostFunc(hStream, fn, userData):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
 
     See Also
@@ -45301,7 +45353,7 @@ def cuLaunchHostFunc_v2(hStream, fn, userData, unsigned int syncMode):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
 
     See Also
@@ -45356,7 +45408,7 @@ def cuFuncSetBlockShape(hfunc, int x, int y, int z):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
@@ -45397,7 +45449,7 @@ def cuFuncSetSharedSize(hfunc, unsigned int numbytes):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
@@ -45437,7 +45489,7 @@ def cuParamSetSize(hfunc, unsigned int numbytes):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
@@ -45480,7 +45532,7 @@ def cuParamSeti(hfunc, int offset, unsigned int value):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
@@ -45523,7 +45575,7 @@ def cuParamSetf(hfunc, int offset, float value):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
@@ -45568,7 +45620,7 @@ def cuParamSetv(hfunc, int offset, ptr, unsigned int numbytes):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
@@ -45611,9 +45663,9 @@ def cuLaunch(f):
 
     Launching a function via :py:obj:`~.cuLaunchKernel()` invalidates the
     function's block shape, dynamic shared memory size, and parameter
-    information. After launching via cuLaunchKernel, this state must be re-
-    initialized prior to calling this function. Failure to do so results in
-    undefined behavior.
+    information. After launching via :func:`~.cuLaunchKernel`, this state
+    must be re-initialized prior to calling this function. Failure to do so
+    results in undefined behavior.
 
     Parameters
     ----------
@@ -45622,7 +45674,7 @@ def cuLaunch(f):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_LAUNCH_FAILED`, :py:obj:`~.CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES`, :py:obj:`~.CUDA_ERROR_LAUNCH_TIMEOUT`, :py:obj:`~.CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING`, :py:obj:`~.CUDA_ERROR_SHARED_OBJECT_INIT_FAILED`
 
     See Also
@@ -45662,9 +45714,9 @@ def cuLaunchGrid(f, int grid_width, int grid_height):
 
     Launching a function via :py:obj:`~.cuLaunchKernel()` invalidates the
     function's block shape, dynamic shared memory size, and parameter
-    information. After launching via cuLaunchKernel, this state must be re-
-    initialized prior to calling this function. Failure to do so results in
-    undefined behavior.
+    information. After launching via :func:`~.cuLaunchKernel`, this state
+    must be re-initialized prior to calling this function. Failure to do so
+    results in undefined behavior.
 
     Parameters
     ----------
@@ -45677,7 +45729,7 @@ def cuLaunchGrid(f, int grid_width, int grid_height):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_LAUNCH_FAILED`, :py:obj:`~.CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES`, :py:obj:`~.CUDA_ERROR_LAUNCH_TIMEOUT`, :py:obj:`~.CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING`, :py:obj:`~.CUDA_ERROR_SHARED_OBJECT_INIT_FAILED`
 
     See Also
@@ -45717,9 +45769,9 @@ def cuLaunchGridAsync(f, int grid_width, int grid_height, hStream):
 
     Launching a function via :py:obj:`~.cuLaunchKernel()` invalidates the
     function's block shape, dynamic shared memory size, and parameter
-    information. After launching via cuLaunchKernel, this state must be re-
-    initialized prior to calling this function. Failure to do so results in
-    undefined behavior.
+    information. After launching via :func:`~.cuLaunchKernel`, this state
+    must be re-initialized prior to calling this function. Failure to do so
+    results in undefined behavior.
 
     \\note_null_stream
 
@@ -45736,7 +45788,7 @@ def cuLaunchGridAsync(f, int grid_width, int grid_height, hStream):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_LAUNCH_FAILED`, :py:obj:`~.CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES`, :py:obj:`~.CUDA_ERROR_LAUNCH_TIMEOUT`, :py:obj:`~.CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING`, :py:obj:`~.CUDA_ERROR_SHARED_OBJECT_INIT_FAILED`
 
     See Also
@@ -45916,12 +45968,12 @@ def cuLaunchCooperativeKernelMultiDevice(launchParamsList : Optional[tuple[CUDA_
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_INVALID_IMAGE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_LAUNCH_FAILED`, :py:obj:`~.CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES`, :py:obj:`~.CUDA_ERROR_LAUNCH_TIMEOUT`, :py:obj:`~.CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING`, :py:obj:`~.CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE`, :py:obj:`~.CUDA_ERROR_SHARED_OBJECT_INIT_FAILED`
 
     See Also
     --------
-    :py:obj:`~.cuCtxGetCacheConfig`, :py:obj:`~.cuCtxSetCacheConfig`, :py:obj:`~.cuFuncSetCacheConfig`, :py:obj:`~.cuFuncGetAttribute`, :py:obj:`~.cuLaunchCooperativeKernel`, :py:obj:`~.cudaLaunchCooperativeKernelMultiDevice`
+    :py:obj:`~.cuCtxGetCacheConfig`, :py:obj:`~.cuCtxSetCacheConfig`, :py:obj:`~.cuFuncSetCacheConfig`, :py:obj:`~.cuFuncGetAttribute`, :py:obj:`~.cuLaunchCooperativeKernel`, ``cudaLaunchCooperativeKernelMultiDevice``
     """
     launchParamsList = [] if launchParamsList is None else launchParamsList
     if not all(isinstance(_x, (CUDA_LAUNCH_PARAMS,)) for _x in launchParamsList):
@@ -45968,7 +46020,7 @@ def cuParamSetTexRef(hfunc, int texunit, hTexRef):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     """
     cdef cydriver.CUtexref cyhTexRef
@@ -46044,12 +46096,12 @@ def cuFuncSetSharedMemConfig(hfunc, config not None : CUsharedconfig):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`
 
     See Also
     --------
-    :py:obj:`~.cuCtxGetCacheConfig`, :py:obj:`~.cuCtxSetCacheConfig`, :py:obj:`~.cuCtxGetSharedMemConfig`, :py:obj:`~.cuCtxSetSharedMemConfig`, :py:obj:`~.cuFuncGetAttribute`, :py:obj:`~.cuLaunchKernel`, :py:obj:`~.cudaFuncSetSharedMemConfig`
+    :py:obj:`~.cuCtxGetCacheConfig`, :py:obj:`~.cuCtxSetCacheConfig`, :py:obj:`~.cuCtxGetSharedMemConfig`, :py:obj:`~.cuCtxSetSharedMemConfig`, :py:obj:`~.cuFuncGetAttribute`, :py:obj:`~.cuLaunchKernel`, :func:`~.cudaFuncSetSharedMemConfig`
     """
     cdef cydriver.CUfunction cyhfunc
     if hfunc is None:
@@ -46080,7 +46132,7 @@ def cuGraphCreate(unsigned int flags):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`
     phGraph : :py:obj:`~.CUgraph`
         Returns newly created graph
@@ -46110,7 +46162,7 @@ def cuGraphAddKernelNode(hGraph, dependencies : Optional[tuple[CUgraphNode] | li
     the root of the graph. ``dependencies`` may not have any duplicate
     entries. A handle to the new node will be returned in ``phGraphNode``.
 
-    The CUDA_KERNEL_NODE_PARAMS structure is defined as:
+    The :py:obj:`~.CUDA_KERNEL_NODE_PARAMS` structure is defined as:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
@@ -46139,7 +46191,7 @@ def cuGraphAddKernelNode(hGraph, dependencies : Optional[tuple[CUgraphNode] | li
     less commonly used arguments. ``extra`` specifies a list of names of
     extra settings and their corresponding values. Each extra setting name
     is immediately followed by the corresponding value. The list must be
-    terminated with either NULL or CU_LAUNCH_PARAM_END.
+    terminated with either NULL or :py:obj:`~.CU_LAUNCH_PARAM_END`.
 
     - :py:obj:`~.CU_LAUNCH_PARAM_END`, which indicates the end of the
       ``extra`` array;
@@ -46175,7 +46227,7 @@ def cuGraphAddKernelNode(hGraph, dependencies : Optional[tuple[CUgraphNode] | li
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     phGraphNode : :py:obj:`~.CUgraphNode`
         Returns newly created node
@@ -46245,7 +46297,7 @@ def cuGraphKernelNodeGetParams(hNode):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     nodeParams : :py:obj:`~.CUDA_KERNEL_NODE_PARAMS`
         Pointer to return the parameters
@@ -46287,7 +46339,7 @@ def cuGraphKernelNodeSetParams(hNode, nodeParams : Optional[CUDA_KERNEL_NODE_PAR
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`
 
     See Also
@@ -46349,7 +46401,7 @@ def cuGraphAddMemcpyNode(hGraph, dependencies : Optional[tuple[CUgraphNode] | li
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     phGraphNode : :py:obj:`~.CUgraphNode`
         Returns newly created node
@@ -46414,7 +46466,7 @@ def cuGraphMemcpyNodeGetParams(hNode):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     nodeParams : :py:obj:`~.CUDA_MEMCPY3D`
         Pointer to return the parameters
@@ -46456,7 +46508,7 @@ def cuGraphMemcpyNodeSetParams(hNode, nodeParams : Optional[CUDA_MEMCPY3D]):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`,
 
     See Also
@@ -46508,7 +46560,7 @@ def cuGraphAddMemsetNode(hGraph, dependencies : Optional[tuple[CUgraphNode] | li
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`
     phGraphNode : :py:obj:`~.CUgraphNode`
         Returns newly created node
@@ -46573,7 +46625,7 @@ def cuGraphMemsetNodeGetParams(hNode):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     nodeParams : :py:obj:`~.CUDA_MEMSET_NODE_PARAMS`
         Pointer to return the parameters
@@ -46615,7 +46667,7 @@ def cuGraphMemsetNodeSetParams(hNode, nodeParams : Optional[CUDA_MEMSET_NODE_PAR
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
@@ -46665,7 +46717,7 @@ def cuGraphAddHostNode(hGraph, dependencies : Optional[tuple[CUgraphNode] | list
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     phGraphNode : :py:obj:`~.CUgraphNode`
         Returns newly created node
@@ -46722,7 +46774,7 @@ def cuGraphHostNodeGetParams(hNode):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     nodeParams : :py:obj:`~.CUDA_HOST_NODE_PARAMS`
         Pointer to return the parameters
@@ -46764,7 +46816,7 @@ def cuGraphHostNodeSetParams(hNode, nodeParams : Optional[CUDA_HOST_NODE_PARAMS]
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
@@ -46817,7 +46869,7 @@ def cuGraphAddChildGraphNode(hGraph, dependencies : Optional[tuple[CUgraphNode]
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`,
     phGraphNode : :py:obj:`~.CUgraphNode`
         Returns newly created node
@@ -46886,7 +46938,7 @@ def cuGraphChildGraphNodeGetGraph(hNode):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`,
     phGraph : :py:obj:`~.CUgraph`
         Location to store a handle to the graph
@@ -46941,7 +46993,7 @@ def cuGraphAddEmptyNode(hGraph, dependencies : Optional[tuple[CUgraphNode] | lis
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`,
     phGraphNode : :py:obj:`~.CUgraphNode`
         Returns newly created node
@@ -47011,7 +47063,7 @@ def cuGraphAddEventRecordNode(hGraph, dependencies : Optional[tuple[CUgraphNode]
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     phGraphNode : :py:obj:`~.CUgraphNode`
         Returns newly created node
@@ -47075,7 +47127,7 @@ def cuGraphEventRecordNodeGetEvent(hNode):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     event_out : :py:obj:`~.CUevent`
         Pointer to return the event
@@ -47117,7 +47169,7 @@ def cuGraphEventRecordNodeSetEvent(hNode, event):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`
 
     See Also
@@ -47176,7 +47228,7 @@ def cuGraphAddEventWaitNode(hGraph, dependencies : Optional[tuple[CUgraphNode] |
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     phGraphNode : :py:obj:`~.CUgraphNode`
         Returns newly created node
@@ -47240,7 +47292,7 @@ def cuGraphEventWaitNodeGetEvent(hNode):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     event_out : :py:obj:`~.CUevent`
         Pointer to return the event
@@ -47282,7 +47334,7 @@ def cuGraphEventWaitNodeSetEvent(hNode, event):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`
 
     See Also
@@ -47340,7 +47392,7 @@ def cuGraphAddExternalSemaphoresSignalNode(hGraph, dependencies : Optional[tuple
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     phGraphNode : :py:obj:`~.CUgraphNode`
         Returns newly created node
@@ -47403,7 +47455,7 @@ def cuGraphExternalSemaphoresSignalNodeGetParams(hNode):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     params_out : :py:obj:`~.CUDA_EXT_SEM_SIGNAL_NODE_PARAMS`
         Pointer to return the parameters
@@ -47446,7 +47498,7 @@ def cuGraphExternalSemaphoresSignalNodeSetParams(hNode, nodeParams : Optional[CU
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`
 
     See Also
@@ -47497,7 +47549,7 @@ def cuGraphAddExternalSemaphoresWaitNode(hGraph, dependencies : Optional[tuple[C
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     phGraphNode : :py:obj:`~.CUgraphNode`
         Returns newly created node
@@ -47560,7 +47612,7 @@ def cuGraphExternalSemaphoresWaitNodeGetParams(hNode):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     params_out : :py:obj:`~.CUDA_EXT_SEM_WAIT_NODE_PARAMS`
         Pointer to return the parameters
@@ -47603,7 +47655,7 @@ def cuGraphExternalSemaphoresWaitNodeSetParams(hNode, nodeParams : Optional[CUDA
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`
 
     See Also
@@ -47653,7 +47705,7 @@ def cuGraphAddBatchMemOpNode(hGraph, dependencies : Optional[tuple[CUgraphNode]
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     phGraphNode : :py:obj:`~.CUgraphNode`
         Returns newly created node
@@ -47719,7 +47771,7 @@ def cuGraphBatchMemOpNodeGetParams(hNode):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     nodeParams_out : :py:obj:`~.CUDA_BATCH_MEM_OP_NODE_PARAMS`
         Pointer to return the parameters
@@ -47764,7 +47816,7 @@ def cuGraphBatchMemOpNodeSetParams(hNode, nodeParams : Optional[CUDA_BATCH_MEM_O
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`
 
     See Also
@@ -47800,8 +47852,8 @@ def cuGraphExecBatchMemOpNodeSetParams(hGraphExec, hNode, nodeParams : Optional[
     graph:
 
     op.waitValue.address op.waitValue.value[64] op.waitValue.flags bits
-    corresponding to wait type (i.e. CU_STREAM_WAIT_VALUE_FLUSH bit cannot
-    be modified) op.writeValue.address op.writeValue.value[64]
+    corresponding to wait type (i.e. :py:obj:`~.CU_STREAM_WAIT_VALUE_FLUSH`
+    bit cannot be modified) op.writeValue.address op.writeValue.value[64]
 
     Other fields, such as the context, count or type of operations, and
     other types of operations such as membars, may not be modified.
@@ -47827,7 +47879,7 @@ def cuGraphExecBatchMemOpNodeSetParams(hGraphExec, hNode, nodeParams : Optional[
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`,
 
     See Also
@@ -47926,7 +47978,7 @@ def cuGraphAddMemAllocNode(hGraph, dependencies : Optional[tuple[CUgraphNode] |
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     phGraphNode : :py:obj:`~.CUgraphNode`
         Returns newly created node
@@ -47986,7 +48038,7 @@ def cuGraphMemAllocNodeGetParams(hNode):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     params_out : :py:obj:`~.CUDA_MEM_ALLOC_NODE_PARAMS`
         Pointer to return the parameters
@@ -48058,7 +48110,7 @@ def cuGraphAddMemFreeNode(hGraph, dependencies : Optional[tuple[CUgraphNode] | l
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     phGraphNode : :py:obj:`~.CUgraphNode`
         Returns newly created node
@@ -48122,7 +48174,7 @@ def cuGraphMemFreeNodeGetParams(hNode):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     dptr_out : :py:obj:`~.CUdeviceptr`
         Pointer to return the device address
@@ -48164,7 +48216,7 @@ def cuDeviceGraphMemTrim(device):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`
 
     See Also
@@ -48216,7 +48268,7 @@ def cuDeviceGetGraphMemAttribute(device, attr not None : CUgraphMem_attribute):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`
     value : Any
         retrieved value
@@ -48270,7 +48322,7 @@ def cuDeviceSetGraphMemAttribute(device, attr not None : CUgraphMem_attribute, v
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`
 
     See Also
@@ -48314,7 +48366,7 @@ def cuGraphClone(originalGraph):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`
     phGraphClone : :py:obj:`~.CUgraph`
         Returns newly created cloned graph
@@ -48367,7 +48419,7 @@ def cuGraphNodeFindInClone(hOriginalNode, hClonedGraph):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`,
     phNode : :py:obj:`~.CUgraphNode`
         Returns handle to the cloned node
@@ -48415,7 +48467,7 @@ def cuGraphNodeGetType(hNode):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     typename : :py:obj:`~.CUgraphNodeType`
         Pointer to return the node type
@@ -48456,7 +48508,7 @@ def cuGraphNodeGetContainingGraph(hNode):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS` :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     \\*phGraph : :py:obj:`~.CUgraph`
         Pointer to return the containing graph
@@ -48498,7 +48550,7 @@ def cuGraphNodeGetLocalId(hNode):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS` :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     nodeId : unsigned int
         Pointer to return the nodeId
@@ -48536,7 +48588,7 @@ def cuGraphNodeGetToolsId(hNode):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS` :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     \\*toolsNodeId : unsigned long long
         Pointer to return the id used by tools
@@ -48577,7 +48629,7 @@ def cuGraphGetId(hGraph):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS` :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     \\*graphId : unsigned int
         Pointer to return the graphId
@@ -48619,7 +48671,7 @@ def cuGraphExecGetId(hGraphExec):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS` :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     \\*graphId : unsigned int
         Pointer to return the graphId
@@ -48666,7 +48718,7 @@ def cuGraphGetNodes(hGraph, size_t numNodes = 0):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     nodes : list[:py:obj:`~.CUgraphNode`]
         Pointer to return the nodes
@@ -48727,7 +48779,7 @@ def cuGraphGetRootNodes(hGraph, size_t numRootNodes = 0):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     rootNodes : list[:py:obj:`~.CUgraphNode`]
         Pointer to return the root nodes
@@ -48795,7 +48847,7 @@ def cuGraphGetEdges(hGraph, size_t numEdges = 0):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_LOSSY_QUERY`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     from : list[:py:obj:`~.CUgraphNode`]
         Location to return edge endpoints
@@ -48890,7 +48942,7 @@ def cuGraphNodeGetDependencies(hNode, size_t numDependencies = 0):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_LOSSY_QUERY`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     dependencies : list[:py:obj:`~.CUgraphNode`]
         Pointer to return the dependencies
@@ -48971,7 +49023,7 @@ def cuGraphNodeGetDependentNodes(hNode, size_t numDependentNodes = 0):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_LOSSY_QUERY`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     dependentNodes : list[:py:obj:`~.CUgraphNode`]
         Pointer to return the dependent nodes
@@ -49054,7 +49106,7 @@ def cuGraphAddDependencies(hGraph, from_ : Optional[tuple[CUgraphNode] | list[CU
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
@@ -49153,7 +49205,7 @@ def cuGraphRemoveDependencies(hGraph, from_ : Optional[tuple[CUgraphNode] | list
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
@@ -49236,7 +49288,7 @@ def cuGraphDestroyNode(hNode):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
@@ -49296,8 +49348,9 @@ def cuGraphInstantiate(hGraph, unsigned long long flags):
     same also applies if ``hGraph`` contains any device-updatable kernel
     nodes.
 
-    If ``hGraph`` contains kernels which call device-side cudaGraphLaunch()
-    from multiple contexts, this will result in an error.
+    If ``hGraph`` contains kernels which call device-side
+    :func:`~.cudaGraphLaunch` from multiple contexts, this will result in
+    an error.
 
     Graphs instantiated for launch on the device have additional
     restrictions which do not apply to host graphs:
@@ -49337,7 +49390,7 @@ def cuGraphInstantiate(hGraph, unsigned long long flags):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     phGraphExec : :py:obj:`~.CUgraphExec`
         Returns instantiated graph
@@ -49414,8 +49467,9 @@ def cuGraphInstantiateWithParams(hGraph, instantiateParams : Optional[CUDA_GRAPH
     same also applies if ``hGraph`` contains any device-updatable kernel
     nodes.
 
-    If ``hGraph`` contains kernels which call device-side cudaGraphLaunch()
-    from multiple contexts, this will result in an error.
+    If ``hGraph`` contains kernels which call device-side
+    :func:`~.cudaGraphLaunch` from multiple contexts, this will result in
+    an error.
 
     Graphs instantiated for launch on the device have additional
     restrictions which do not apply to host graphs:
@@ -49467,8 +49521,8 @@ def cuGraphInstantiateWithParams(hGraph, instantiateParams : Optional[CUDA_GRAPH
       the graph is instantiated for device launch but a node’s context
       differs from that of another node. This error can also be returned if
       a graph is not instantiated for device launch and it contains kernels
-      which call device-side cudaGraphLaunch() from multiple contexts.
-      ``hErrNode_out`` will be set to this node.
+      which call device-side :func:`~.cudaGraphLaunch` from multiple
+      contexts. ``hErrNode_out`` will be set to this node.
 
     If instantiation is successful, ``result_out`` will be set to
     :py:obj:`~.CUDA_GRAPH_INSTANTIATE_SUCCESS`, and ``hErrNode_out`` will
@@ -49483,7 +49537,7 @@ def cuGraphInstantiateWithParams(hGraph, instantiateParams : Optional[CUDA_GRAPH
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`,
     phGraphExec : :py:obj:`~.CUgraphExec`
         Returns instantiated graph
@@ -49527,7 +49581,7 @@ def cuGraphExecGetFlags(hGraphExec):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`,
     flags : :py:obj:`~.cuuint64_t`
         Returns the instantiation flags
@@ -49577,12 +49631,12 @@ def cuGraphExecKernelNodeSetParams(hGraphExec, hNode, nodeParams : Optional[CUDA
       calls.
 
     - If ``hGraphExec`` was not instantiated for device launch, a node
-      whose function originally did not use device-side cudaGraphLaunch()
-      cannot be updated to a function which uses device-side
-      cudaGraphLaunch() unless the node resides on the same context as
-      nodes which contained such calls at instantiate-time. If no such
-      calls were present at instantiation, these updates cannot be
-      performed at all.
+      whose function originally did not use device-side
+      :func:`~.cudaGraphLaunch` cannot be updated to a function which uses
+      device-side :func:`~.cudaGraphLaunch` unless the node resides on the
+      same context as nodes which contained such calls at instantiate-time.
+      If no such calls were present at instantiation, these updates cannot
+      be performed at all.
 
     The modifications only affect future launches of ``hGraphExec``.
     Already enqueued or running launches of ``hGraphExec`` are not affected
@@ -49605,7 +49659,7 @@ def cuGraphExecKernelNodeSetParams(hGraphExec, hNode, nodeParams : Optional[CUDA
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`,
 
     See Also
@@ -49655,8 +49709,8 @@ def cuGraphExecMemcpyNodeSetParams(hGraphExec, hNode, copyParams : Optional[CUDA
     Already enqueued or running launches of ``hGraphExec`` are not affected
     by this call. hNode is also not modified by this call.
 
-    Returns CUDA_ERROR_INVALID_VALUE if the memory operands' mappings
-    changed or either the original or new memory operands are
+    Returns :py:obj:`~.CUDA_ERROR_INVALID_VALUE` if the memory operands'
+    mappings changed or either the original or new memory operands are
     multidimensional.
 
     Parameters
@@ -49672,7 +49726,7 @@ def cuGraphExecMemcpyNodeSetParams(hGraphExec, hNode, copyParams : Optional[CUDA
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`,
 
     See Also
@@ -49752,7 +49806,7 @@ def cuGraphExecMemsetNodeSetParams(hGraphExec, hNode, memsetParams : Optional[CU
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`,
 
     See Also
@@ -49815,7 +49869,7 @@ def cuGraphExecHostNodeSetParams(hGraphExec, hNode, nodeParams : Optional[CUDA_H
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`,
 
     See Also
@@ -49878,7 +49932,7 @@ def cuGraphExecChildGraphNodeSetParams(hGraphExec, hNode, childGraph):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`,
 
     See Also
@@ -49941,7 +49995,7 @@ def cuGraphExecEventRecordNodeSetEvent(hGraphExec, hNode, event):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`,
 
     See Also
@@ -50004,7 +50058,7 @@ def cuGraphExecEventWaitNodeSetEvent(hGraphExec, hNode, event):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`,
 
     See Also
@@ -50071,7 +50125,7 @@ def cuGraphExecExternalSemaphoresSignalNodeSetParams(hGraphExec, hNode, nodePara
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`,
 
     See Also
@@ -50131,7 +50185,7 @@ def cuGraphExecExternalSemaphoresWaitNodeSetParams(hGraphExec, hNode, nodeParams
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`,
 
     See Also
@@ -50197,7 +50251,7 @@ def cuGraphNodeSetEnabled(hGraphExec, hNode, unsigned int isEnabled):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`,
 
     See Also
@@ -50252,7 +50306,7 @@ def cuGraphNodeGetEnabled(hGraphExec, hNode):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`,
     isEnabled : unsigned int
         Location to return the enabled status of the node
@@ -50312,7 +50366,7 @@ def cuGraphUpload(hGraphExec, hStream):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
@@ -50366,7 +50420,7 @@ def cuGraphLaunch(hGraphExec, hStream):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
@@ -50411,7 +50465,7 @@ def cuGraphExecDestroy(hGraphExec):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
@@ -50447,7 +50501,7 @@ def cuGraphDestroy(hGraph):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
@@ -50494,18 +50548,18 @@ def cuGraphExecUpdate(hGraphExec, hGraph):
         vice-versa.
 
       - If the graph was instantiated with
-        CUDA_GRAPH_INSTANTIATE_FLAG_USE_NODE_PRIORITY, the priority
-        attribute cannot change. Equality is checked on the originally
-        requested priority values, before they are clamped to the device's
-        supported range.
+        :py:obj:`~.CUDA_GRAPH_INSTANTIATE_FLAG_USE_NODE_PRIORITY`, the
+        priority attribute cannot change. Equality is checked on the
+        originally requested priority values, before they are clamped to
+        the device's supported range.
 
       - If ``hGraphExec`` was not instantiated for device launch, a node
-        whose function originally did not use device-side cudaGraphLaunch()
-        cannot be updated to a function which uses device-side
-        cudaGraphLaunch() unless the node resides on the same context as
-        nodes which contained such calls at instantiate-time. If no such
-        calls were present at instantiation, these updates cannot be
-        performed at all.
+        whose function originally did not use device-side
+        :func:`~.cudaGraphLaunch` cannot be updated to a function which
+        uses device-side :func:`~.cudaGraphLaunch` unless the node resides
+        on the same context as nodes which contained such calls at
+        instantiate-time. If no such calls were present at instantiation,
+        these updates cannot be performed at all.
 
       - Neither ``hGraph`` nor ``hGraphExec`` may contain device-updatable
         kernel nodes.
@@ -50527,7 +50581,8 @@ def cuGraphExecUpdate(hGraphExec, hGraph):
     - Additional memcpy node restrictions:
 
       - Changing either the source or destination memory type(i.e.
-        CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_ARRAY, etc.) is not supported.
+        :py:obj:`~.CU_MEMORYTYPE_DEVICE`, :py:obj:`~.CU_MEMORYTYPE_ARRAY`,
+        etc.) is not supported.
 
     - External semaphore wait nodes and record nodes:
 
@@ -50546,9 +50601,9 @@ def cuGraphExecUpdate(hGraphExec, hGraph):
     Note: The API may add further restrictions in future releases. The
     return code should always be checked.
 
-    cuGraphExecUpdate sets the result member of ``resultInfo`` to
-    CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED under the following
-    conditions:
+    :py:obj:`~.cuGraphExecUpdate` sets the result member of ``resultInfo``
+    to :py:obj:`~.CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED` under the
+    following conditions:
 
     - The count of nodes directly in ``hGraphExec`` and ``hGraph`` differ,
       in which case resultInfo->errorNode is set to NULL.
@@ -50568,42 +50623,44 @@ def cuGraphExecUpdate(hGraphExec, hGraph):
       not match when the nodes are already paired based on other edges
       examined in the graph.
 
-    cuGraphExecUpdate sets the result member of ``resultInfo`` to:
+    :py:obj:`~.cuGraphExecUpdate` sets the result member of ``resultInfo``
+    to:
 
-    - CU_GRAPH_EXEC_UPDATE_ERROR if passed an invalid value.
+    - :py:obj:`~.CU_GRAPH_EXEC_UPDATE_ERROR` if passed an invalid value.
 
-    - CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED if the graph topology
-      changed
+    - :py:obj:`~.CU_GRAPH_EXEC_UPDATE_ERROR_TOPOLOGY_CHANGED` if the graph
+      topology changed.
 
-    - CU_GRAPH_EXEC_UPDATE_ERROR_NODE_TYPE_CHANGED if the type of a node
-      changed, in which case ``hErrorNode_out`` is set to the node from
-      ``hGraph``.
+    - :py:obj:`~.CU_GRAPH_EXEC_UPDATE_ERROR_NODE_TYPE_CHANGED` if the type
+      of a node changed, in which case ``hErrorNode_out`` is set to the
+      node from ``hGraph``.
 
-    - CU_GRAPH_EXEC_UPDATE_ERROR_UNSUPPORTED_FUNCTION_CHANGE if the
-      function changed in an unsupported way(see note above), in which case
-      ``hErrorNode_out`` is set to the node from ``hGraph``
+    - :py:obj:`~.CU_GRAPH_EXEC_UPDATE_ERROR_UNSUPPORTED_FUNCTION_CHANGE` if
+      the function changed in an unsupported way (see note above), in which
+      case ``hErrorNode_out`` is set to the node from ``hGraph``.
 
-    - CU_GRAPH_EXEC_UPDATE_ERROR_PARAMETERS_CHANGED if any parameters to a
-      node changed in a way that is not supported, in which case
-      ``hErrorNode_out`` is set to the node from ``hGraph``.
+    - :py:obj:`~.CU_GRAPH_EXEC_UPDATE_ERROR_PARAMETERS_CHANGED` if any
+      parameters to a node changed in a way that is not supported, in which
+      case ``hErrorNode_out`` is set to the node from ``hGraph``.
 
-    - CU_GRAPH_EXEC_UPDATE_ERROR_ATTRIBUTES_CHANGED if any attributes of a
-      node changed in a way that is not supported, in which case
-      ``hErrorNode_out`` is set to the node from ``hGraph``.
+    - :py:obj:`~.CU_GRAPH_EXEC_UPDATE_ERROR_ATTRIBUTES_CHANGED` if any
+      attributes of a node changed in a way that is not supported, in which
+      case ``hErrorNode_out`` is set to the node from ``hGraph``.
 
-    - CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED if something about a node is
-      unsupported, like the node's type or configuration, in which case
-      ``hErrorNode_out`` is set to the node from ``hGraph``
+    - :py:obj:`~.CU_GRAPH_EXEC_UPDATE_ERROR_NOT_SUPPORTED` if something
+      about a node is unsupported, like the node's type or configuration,
+      in which case ``hErrorNode_out`` is set to the node from ``hGraph``.
 
     If the update fails for a reason not listed above, the result member of
-    ``resultInfo`` will be set to CU_GRAPH_EXEC_UPDATE_ERROR. If the update
-    succeeds, the result member will be set to
-    CU_GRAPH_EXEC_UPDATE_SUCCESS.
+    ``resultInfo`` will be set to :py:obj:`~.CU_GRAPH_EXEC_UPDATE_ERROR`.
+    If the update succeeds, the result member will be set to
+    :py:obj:`~.CU_GRAPH_EXEC_UPDATE_SUCCESS`.
 
-    cuGraphExecUpdate returns CUDA_SUCCESS when the updated was performed
-    successfully. It returns CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE if the
-    graph update was not performed because it included changes which
-    violated constraints specific to instantiated graph update.
+    :py:obj:`~.cuGraphExecUpdate` returns :py:obj:`~.CUDA_SUCCESS` when the
+    update was performed successfully. It returns
+    :py:obj:`~.CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE` if the graph update
+    was not performed because it included changes which violated
+    constraints specific to instantiated graph update.
 
     Parameters
     ----------
@@ -50614,7 +50671,7 @@ def cuGraphExecUpdate(hGraphExec, hGraph):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE`,
     resultInfo : :py:obj:`~.CUgraphExecUpdateResultInfo`
         the error info structure
@@ -50666,7 +50723,7 @@ def cuGraphKernelNodeCopyAttributes(dst, src):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
@@ -50712,7 +50769,7 @@ def cuGraphKernelNodeGetAttribute(hNode, attr not None : CUkernelNodeAttrID):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`
     value_out : :py:obj:`~.CUkernelNodeAttrValue`
 
@@ -50758,7 +50815,7 @@ def cuGraphKernelNodeSetAttribute(hNode, attr not None : CUkernelNodeAttrID, val
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`
 
     See Also
@@ -50799,12 +50856,12 @@ def cuGraphDebugDotPrint(hGraph, char* path, unsigned int flags):
     path : bytes
         The path to write the DOT file to
     flags : unsigned int
-        Flags from CUgraphDebugDot_flags for specifying which additional
-        node information to write
+        Flags from :py:obj:`~.CUgraphDebugDot_flags` for specifying which
+        additional node information to write
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_OPERATING_SYSTEM`
     """
     cdef cydriver.CUgraph cyhGraph
@@ -50855,7 +50912,7 @@ def cuUserObjectCreate(ptr, destroy, unsigned int initialRefcount, unsigned int
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     object_out : :py:obj:`~.CUuserObject`
         Location to return the user object handle
@@ -50905,7 +50962,7 @@ def cuUserObjectRetain(object, unsigned int count):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
@@ -50950,7 +51007,7 @@ def cuUserObjectRelease(object, unsigned int count):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
@@ -50998,7 +51055,7 @@ def cuGraphRetainUserObject(graph, object, unsigned int count, unsigned int flag
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
@@ -51049,7 +51106,7 @@ def cuGraphReleaseUserObject(graph, object, unsigned int count):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
@@ -51118,7 +51175,7 @@ def cuGraphAddNode(hGraph, dependencies : Optional[tuple[CUgraphNode] | list[CUg
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
     phGraphNode : :py:obj:`~.CUgraphNode`
         Returns newly created node
@@ -51185,7 +51242,8 @@ def cuGraphNodeSetParams(hNode, nodeParams : Optional[CUgraphNodeParams]):
     bytes (reserved, padding) zeroed.
 
     Modifying parameters is not supported for node types
-    CU_GRAPH_NODE_TYPE_MEM_ALLOC and CU_GRAPH_NODE_TYPE_MEM_FREE.
+    :py:obj:`~.CU_GRAPH_NODE_TYPE_MEM_ALLOC` and
+    :py:obj:`~.CU_GRAPH_NODE_TYPE_MEM_FREE`.
 
     Parameters
     ----------
@@ -51196,7 +51254,7 @@ def cuGraphNodeSetParams(hNode, nodeParams : Optional[CUgraphNodeParams]):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
 
     See Also
@@ -51231,12 +51289,12 @@ def cuGraphNodeGetParams(hNode):
     modified.
 
     The returned parameters are a description of the node, but may not be
-    identical to the struct provided at creation and may not be suitable
-    for direct creation of identical nodes. This is because parameters may
-    be partially unspecified and filled in by the driver at creation, may
-    reference non-copyable handles, or may describe ownership semantics or
-    other parameters that govern behavior of node creation but are not part
-    of the final functional descriptor.
+    identical to the struct provided at creation and may not be
+    suitable for direct creation of identical nodes. This is because
+    parameters may be partially unspecified and filled in by the driver at
+    creation, may reference non-copyable handles, or may describe ownership
+    semantics or other parameters that govern behavior of node creation but
+    are not part of the final functional descriptor.
 
     Parameters
     ----------
@@ -51245,7 +51303,7 @@ def cuGraphNodeGetParams(hNode):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     nodeParams : :py:obj:`~.CUgraphNodeParams`
         Pointer to return the parameters
@@ -51301,7 +51359,7 @@ def cuGraphExecNodeSetParams(hGraphExec, hNode, nodeParams : Optional[CUgraphNod
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
 
     See Also
@@ -51355,14 +51413,14 @@ def cuGraphConditionalHandleCreate(hGraph, ctx, unsigned int defaultLaunchValue,
         Context for the handle and associated conditional node.
     defaultLaunchValue : unsigned int
         Optional initial value for the conditional variable. Applied at the
-        beginning of each graph execution if CU_GRAPH_COND_ASSIGN_DEFAULT
-        is set in ``flags``.
+        beginning of each graph execution if
+        :py:obj:`~.CU_GRAPH_COND_ASSIGN_DEFAULT` is set in ``flags``.
     flags : unsigned int
-        Currently must be CU_GRAPH_COND_ASSIGN_DEFAULT or 0.
+        Currently must be :py:obj:`~.CU_GRAPH_COND_ASSIGN_DEFAULT` or 0.
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
     pHandle_out : :py:obj:`~.CUgraphConditionalHandle`
         Pointer used to return the handle to the caller.
@@ -51421,14 +51479,14 @@ def cuOccupancyMaxActiveBlocksPerMultiprocessor(func, int blockSize, size_t dyna
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_UNKNOWN`
     numBlocks : int
         Returned occupancy
 
     See Also
     --------
-    :py:obj:`~.cudaOccupancyMaxActiveBlocksPerMultiprocessor`
+    :func:`~.cudaOccupancyMaxActiveBlocksPerMultiprocessor`
     """
     cdef cydriver.CUfunction cyfunc
     if func is None:
@@ -51490,14 +51548,14 @@ def cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(func, int blockSize, si
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_UNKNOWN`
     numBlocks : int
         Returned occupancy
 
     See Also
     --------
-    :py:obj:`~.cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags`
+    :func:`~.cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags`
     """
     cdef cydriver.CUfunction cyfunc
     if func is None:
@@ -51566,7 +51624,7 @@ def cuOccupancyMaxPotentialBlockSize(func, blockSizeToDynamicSMemSize, size_t dy
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_UNKNOWN`
     minGridSize : int
         Returned minimum grid size needed to achieve the maximum occupancy
@@ -51575,7 +51633,7 @@ def cuOccupancyMaxPotentialBlockSize(func, blockSizeToDynamicSMemSize, size_t dy
 
     See Also
     --------
-    :py:obj:`~.cudaOccupancyMaxPotentialBlockSize`
+    ``cudaOccupancyMaxPotentialBlockSize``
     """
     cdef cydriver.CUoccupancyB2DSize cyblockSizeToDynamicSMemSize
     if blockSizeToDynamicSMemSize is None:
@@ -51652,7 +51710,7 @@ def cuOccupancyMaxPotentialBlockSizeWithFlags(func, blockSizeToDynamicSMemSize,
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_UNKNOWN`
     minGridSize : int
         Returned minimum grid size needed to achieve the maximum occupancy
@@ -51661,7 +51719,7 @@ def cuOccupancyMaxPotentialBlockSizeWithFlags(func, blockSizeToDynamicSMemSize,
 
     See Also
     --------
-    :py:obj:`~.cudaOccupancyMaxPotentialBlockSizeWithFlags`
+    ``cudaOccupancyMaxPotentialBlockSizeWithFlags``
     """
     cdef cydriver.CUoccupancyB2DSize cyblockSizeToDynamicSMemSize
     if blockSizeToDynamicSMemSize is None:
@@ -51714,7 +51772,7 @@ def cuOccupancyAvailableDynamicSMemPerBlock(func, int numBlocks, int blockSize):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_UNKNOWN`
     dynamicSmemSize : int
         Returned maximum dynamic shared memory
@@ -51742,7 +51800,7 @@ def cuOccupancyMaxPotentialClusterSize(func, config : Optional[CUlaunchConfig]):
     """ Given the kernel function (``func``) and launch configuration (``config``), return the maximum cluster size in ``*clusterSize``.
 
     The cluster dimensions in ``config`` are ignored. If func has a
-    required cluster size set (see :py:obj:`~.cudaFuncGetAttributes` /
+    required cluster size set (see :func:`~.cudaFuncGetAttributes` /
     :py:obj:`~.cuFuncGetAttribute`),``*clusterSize`` will reflect the
     required cluster size.
 
@@ -51768,7 +51826,7 @@ def cuOccupancyMaxPotentialClusterSize(func, config : Optional[CUlaunchConfig]):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_UNKNOWN`
     clusterSize : int
         Returned maximum cluster size that can be launched for the given
@@ -51776,7 +51834,7 @@ def cuOccupancyMaxPotentialClusterSize(func, config : Optional[CUlaunchConfig]):
 
     See Also
     --------
-    :py:obj:`~.cudaFuncGetAttributes`, :py:obj:`~.cuFuncGetAttribute`
+    :func:`~.cudaFuncGetAttributes`, :py:obj:`~.cuFuncGetAttribute`
     """
     cdef cydriver.CUfunction cyfunc
     if func is None:
@@ -51802,8 +51860,8 @@ def cuOccupancyMaxActiveClusters(func, config : Optional[CUlaunchConfig]):
     """ Given the kernel function (``func``) and launch configuration (``config``), return the maximum number of clusters that could co-exist on the target device in ``*numClusters``.
 
     If the function has required cluster size already set (see
-    :py:obj:`~.cudaFuncGetAttributes` / :py:obj:`~.cuFuncGetAttribute`),
-    the cluster size from config must either be unspecified or match the
+    :func:`~.cudaFuncGetAttributes` / :py:obj:`~.cuFuncGetAttribute`), the
+    cluster size from config must either be unspecified or match the
     required size. Without required sizes, the cluster size must be
     specified in config, else the function will return an error.
 
@@ -51828,7 +51886,7 @@ def cuOccupancyMaxActiveClusters(func, config : Optional[CUlaunchConfig]):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_CLUSTER_SIZE`, :py:obj:`~.CUDA_ERROR_UNKNOWN`
     numClusters : int
         Returned maximum number of clusters that could co-exist on the
@@ -51836,7 +51894,7 @@ def cuOccupancyMaxActiveClusters(func, config : Optional[CUlaunchConfig]):
 
     See Also
     --------
-    :py:obj:`~.cudaFuncGetAttributes`, :py:obj:`~.cuFuncGetAttribute`
+    :func:`~.cudaFuncGetAttributes`, :py:obj:`~.cuFuncGetAttribute`
     """
     cdef cydriver.CUfunction cyfunc
     if func is None:
@@ -51880,7 +51938,7 @@ def cuTexRefSetArray(hTexRef, hArray, unsigned int Flags):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
@@ -51933,7 +51991,7 @@ def cuTexRefSetMipmappedArray(hTexRef, hMipmappedArray, unsigned int Flags):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
@@ -51979,7 +52037,7 @@ def cuTexRefSetAddress(hTexRef, dptr, size_t numbytes):
     in ``*ByteOffset`` that must be applied to texture fetches in order to
     read from the desired memory. This offset must be divided by the texel
     size and passed to kernels that read from the texture so they can be
-    applied to the :py:obj:`~.tex1Dfetch()` function.
+    applied to the ``tex1Dfetch()`` function.
 
     If the device memory pointer was returned from
     :py:obj:`~.cuMemAlloc()`, the offset is guaranteed to be 0 and NULL may
@@ -52003,7 +52061,7 @@ def cuTexRefSetAddress(hTexRef, dptr, size_t numbytes):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     ByteOffset : int
         Returned byte offset
@@ -52049,8 +52107,8 @@ def cuTexRefSetAddress2D(hTexRef, desc : Optional[CUDA_ARRAY_DESCRIPTOR], dptr,
     reference is superseded by this function. Any memory previously bound
     to ``hTexRef`` is unbound.
 
-    Using a :py:obj:`~.tex2D()` function inside a kernel requires a call to
-    either :py:obj:`~.cuTexRefSetArray()` to bind the corresponding texture
+    Using a ``tex2D()`` function inside a kernel requires a call to either
+    :py:obj:`~.cuTexRefSetArray()` to bind the corresponding texture
     reference to an array, or :py:obj:`~.cuTexRefSetAddress2D()` to bind
     the texture reference to linear memory.
 
@@ -52088,7 +52146,7 @@ def cuTexRefSetAddress2D(hTexRef, desc : Optional[CUDA_ARRAY_DESCRIPTOR], dptr,
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
@@ -52127,7 +52185,7 @@ def cuTexRefSetFormat(hTexRef, fmt not None : CUarray_format, int NumPackedCompo
 
     Specifies the format of the data to be read by the texture reference
     ``hTexRef``. ``fmt`` and ``NumPackedComponents`` are exactly analogous
-    to the :py:obj:`~.Format` and :py:obj:`~.NumChannels` members of the
+    to the ``Format`` and ``NumChannels`` members of the
     :py:obj:`~.CUDA_ARRAY_DESCRIPTOR` structure: They specify the format of
     each component and the number of components per array element.
 
@@ -52142,12 +52200,12 @@ def cuTexRefSetFormat(hTexRef, fmt not None : CUarray_format, int NumPackedCompo
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
     --------
-    :py:obj:`~.cuTexRefSetAddress`, :py:obj:`~.cuTexRefSetAddress2D`, :py:obj:`~.cuTexRefSetAddressMode`, :py:obj:`~.cuTexRefSetArray`, :py:obj:`~.cuTexRefSetFilterMode`, :py:obj:`~.cuTexRefSetFlags`, :py:obj:`~.cuTexRefGetAddress`, :py:obj:`~.cuTexRefGetAddressMode`, :py:obj:`~.cuTexRefGetArray`, :py:obj:`~.cuTexRefGetFilterMode`, :py:obj:`~.cuTexRefGetFlags`, :py:obj:`~.cuTexRefGetFormat`, :py:obj:`~.cudaCreateChannelDesc`
+    :py:obj:`~.cuTexRefSetAddress`, :py:obj:`~.cuTexRefSetAddress2D`, :py:obj:`~.cuTexRefSetAddressMode`, :py:obj:`~.cuTexRefSetArray`, :py:obj:`~.cuTexRefSetFilterMode`, :py:obj:`~.cuTexRefSetFlags`, :py:obj:`~.cuTexRefGetAddress`, :py:obj:`~.cuTexRefGetAddressMode`, :py:obj:`~.cuTexRefGetArray`, :py:obj:`~.cuTexRefGetFilterMode`, :py:obj:`~.cuTexRefGetFlags`, :py:obj:`~.cuTexRefGetFormat`, :func:`~.cudaCreateChannelDesc`
     """
     cdef cydriver.CUtexref cyhTexRef
     if hTexRef is None:
@@ -52195,7 +52253,7 @@ def cuTexRefSetAddressMode(hTexRef, int dim, am not None : CUaddress_mode):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
@@ -52242,7 +52300,7 @@ def cuTexRefSetFilterMode(hTexRef, fm not None : CUfilter_mode):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
@@ -52289,7 +52347,7 @@ def cuTexRefSetMipmapFilterMode(hTexRef, fm not None : CUfilter_mode):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
@@ -52334,7 +52392,7 @@ def cuTexRefSetMipmapLevelBias(hTexRef, float bias):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
@@ -52380,7 +52438,7 @@ def cuTexRefSetMipmapLevelClamp(hTexRef, float minMipmapLevelClamp, float maxMip
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
@@ -52423,7 +52481,7 @@ def cuTexRefSetMaxAnisotropy(hTexRef, unsigned int maxAniso):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
@@ -52458,9 +52516,9 @@ def cuTexRefSetBorderColor(hTexRef, float pBorderColor):
     holds 'B' component pBorderColor[3] holds 'A' component
 
     Note that the color values can be set only when the Address mode is set
-    to CU_TR_ADDRESS_MODE_BORDER using :py:obj:`~.cuTexRefSetAddressMode`.
-    Applications using integer border color values have to
-    "reinterpret_cast" their values to float.
+    to :py:obj:`~.CU_TR_ADDRESS_MODE_BORDER` using
+    :py:obj:`~.cuTexRefSetAddressMode`. Applications using integer border
+    color values have to "reinterpret_cast" their values to float.
 
     Parameters
     ----------
@@ -52471,7 +52529,7 @@ def cuTexRefSetBorderColor(hTexRef, float pBorderColor):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
@@ -52530,7 +52588,7 @@ def cuTexRefSetFlags(hTexRef, unsigned int Flags):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
@@ -52569,7 +52627,7 @@ def cuTexRefGetAddress(hTexRef):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     pdptr : :py:obj:`~.CUdeviceptr`
         Returned device address
@@ -52613,7 +52671,7 @@ def cuTexRefGetArray(hTexRef):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     phArray : :py:obj:`~.CUarray`
         Returned array
@@ -52658,7 +52716,7 @@ def cuTexRefGetMipmappedArray(hTexRef):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     phMipmappedArray : :py:obj:`~.CUmipmappedArray`
         Returned mipmapped array
@@ -52704,7 +52762,7 @@ def cuTexRefGetAddressMode(hTexRef, int dim):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     pam : :py:obj:`~.CUaddress_mode`
         Returned addressing mode
@@ -52747,7 +52805,7 @@ def cuTexRefGetFilterMode(hTexRef):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     pfm : :py:obj:`~.CUfilter_mode`
         Returned filtering mode
@@ -52792,7 +52850,7 @@ def cuTexRefGetFormat(hTexRef):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     pFormat : :py:obj:`~.CUarray_format`
         Returned format
@@ -52838,7 +52896,7 @@ def cuTexRefGetMipmapFilterMode(hTexRef):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     pfm : :py:obj:`~.CUfilter_mode`
         Returned mipmap filtering mode
@@ -52882,7 +52940,7 @@ def cuTexRefGetMipmapLevelBias(hTexRef):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     pbias : float
         Returned mipmap level bias
@@ -52926,7 +52984,7 @@ def cuTexRefGetMipmapLevelClamp(hTexRef):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     pminMipmapLevelClamp : float
         Returned mipmap min level clamp
@@ -52972,7 +53030,7 @@ def cuTexRefGetMaxAnisotropy(hTexRef):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     pmaxAniso : int
         Returned maximum anisotropy
@@ -53018,7 +53076,7 @@ def cuTexRefGetBorderColor(hTexRef):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     hTexRef : float
         Texture reference
@@ -53060,7 +53118,7 @@ def cuTexRefGetFlags(hTexRef):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     pFlags : unsigned int
         Returned flags
@@ -53102,7 +53160,7 @@ def cuTexRefCreate():
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     pTexRef : :py:obj:`~.CUtexref`
         Returned texture reference
@@ -53136,7 +53194,7 @@ def cuTexRefDestroy(hTexRef):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
@@ -53182,7 +53240,7 @@ def cuSurfRefSetArray(hSurfRef, hArray, unsigned int Flags):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
@@ -53229,7 +53287,7 @@ def cuSurfRefGetArray(hSurfRef):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     phArray : :py:obj:`~.CUarray`
         Surface reference handle
@@ -53286,23 +53344,23 @@ def cuTexObjectCreate(pResDesc : Optional[CUDA_RESOURCE_DESC], pTexDesc : Option
 
     If :py:obj:`~.CUDA_RESOURCE_DESC.resType` is set to
     :py:obj:`~.CU_RESOURCE_TYPE_ARRAY`,
-    :py:obj:`~.CUDA_RESOURCE_DESC.res.array.hArray` must be set to a valid
-    CUDA array handle.
+    :py:obj:`~.CUDA_RESOURCE_DESC.res.array.hArray` must be set to a
+    valid CUDA array handle.
 
     If :py:obj:`~.CUDA_RESOURCE_DESC.resType` is set to
     :py:obj:`~.CU_RESOURCE_TYPE_MIPMAPPED_ARRAY`,
-    :py:obj:`~.CUDA_RESOURCE_DESC.res.mipmap.hMipmappedArray` must be set
-    to a valid CUDA mipmapped array handle.
+    :py:obj:`~.CUDA_RESOURCE_DESC.res.mipmap.hMipmappedArray` must
+    be set to a valid CUDA mipmapped array handle.
 
     If :py:obj:`~.CUDA_RESOURCE_DESC.resType` is set to
     :py:obj:`~.CU_RESOURCE_TYPE_LINEAR`,
-    :py:obj:`~.CUDA_RESOURCE_DESC.res.linear.devPtr` must be set to a valid
-    device pointer, that is aligned to
+    :py:obj:`~.CUDA_RESOURCE_DESC.res.linear.devPtr` must be set to
+    a valid device pointer, that is aligned to
     :py:obj:`~.CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT`.
-    :py:obj:`~.CUDA_RESOURCE_DESC.res.linear.format` and
-    :py:obj:`~.CUDA_RESOURCE_DESC.res.linear.numChannels` describe the
-    format of each component and the number of components per array
-    element. :py:obj:`~.CUDA_RESOURCE_DESC.res.linear.sizeInBytes`
+    :py:obj:`~.CUDA_RESOURCE_DESC.res.linear.format` and
+    :py:obj:`~.CUDA_RESOURCE_DESC.res.linear.numChannels` describe
+    the format of each component and the number of components per array
+    element. :py:obj:`~.CUDA_RESOURCE_DESC.res.linear.sizeInBytes`
     specifies the size of the array in bytes. The total number of elements
     in the linear address range cannot exceed
     :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH`. The
@@ -53311,25 +53369,26 @@ def cuTexObjectCreate(pResDesc : Optional[CUDA_RESOURCE_DESC], pTexDesc : Option
 
     If :py:obj:`~.CUDA_RESOURCE_DESC.resType` is set to
     :py:obj:`~.CU_RESOURCE_TYPE_PITCH2D`,
-    :py:obj:`~.CUDA_RESOURCE_DESC.res.pitch2D.devPtr` must be set to a
-    valid device pointer, that is aligned to
+    :py:obj:`~.CUDA_RESOURCE_DESC.res.pitch2D.devPtr` must be set to
+    a valid device pointer, that is aligned to
     :py:obj:`~.CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT`.
-    :py:obj:`~.CUDA_RESOURCE_DESC.res.pitch2D.format` and
-    :py:obj:`~.CUDA_RESOURCE_DESC.res.pitch2D.numChannels` describe the
-    format of each component and the number of components per array
-    element. :py:obj:`~.CUDA_RESOURCE_DESC.res.pitch2D.width` and
-    :py:obj:`~.CUDA_RESOURCE_DESC.res.pitch2D.height` specify the width and
-    height of the array in elements, and cannot exceed
+    :py:obj:`~.CUDA_RESOURCE_DESC.res.pitch2D.format` and
+    :py:obj:`~.CUDA_RESOURCE_DESC.res.pitch2D.numChannels` describe
+    the format of each component and the number of components per array
+    element. :py:obj:`~.CUDA_RESOURCE_DESC.res.pitch2D.width` and
+    :py:obj:`~.CUDA_RESOURCE_DESC.res.pitch2D.height` specify the
+    width and height of the array in elements, and cannot exceed
     :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH` and
     :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT`
-    respectively. :py:obj:`~.CUDA_RESOURCE_DESC.res.pitch2D.pitchInBytes`
+    respectively.
+    :py:obj:`~.CUDA_RESOURCE_DESC.res.pitch2D.pitchInBytes`
     specifies the pitch between two rows in bytes and has to be aligned to
     :py:obj:`~.CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT`. Pitch cannot
     exceed :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH`.
 
-    - :py:obj:`~.flags` must be set to zero.
+    - ``flags`` must be set to zero.
 
-    The :py:obj:`~.CUDA_TEXTURE_DESC` struct is defined as
+    The :py:obj:`~.CUDA_TEXTURE_DESC` struct is defined as
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
@@ -53347,8 +53406,8 @@ def cuTexObjectCreate(pResDesc : Optional[CUDA_RESOURCE_DESC], pTexDesc : Option
       supported address mode is :py:obj:`~.CU_TR_ADDRESS_MODE_CLAMP`.
 
     - :py:obj:`~.CUDA_TEXTURE_DESC.filterMode` specifies the filtering mode
-      to be used when fetching from the texture. CUfilter_mode is defined
-      as:
+      to be used when fetching from the texture. :py:obj:`~.CUfilter_mode`
+      is defined as:
 
     - **View CUDA Toolkit Documentation for a C++ code example**
 
@@ -53407,7 +53466,7 @@ def cuTexObjectCreate(pResDesc : Optional[CUDA_RESOURCE_DESC], pTexDesc : Option
     - :py:obj:`~.CUDA_TEXTURE_DESC.maxMipmapLevelClamp` specifies the upper
       end of the mipmap level range to clamp access to.
 
-    The :py:obj:`~.CUDA_RESOURCE_VIEW_DESC` struct is defined as
+    The :py:obj:`~.CUDA_RESOURCE_VIEW_DESC` struct is defined as
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
@@ -53473,14 +53532,14 @@ def cuTexObjectCreate(pResDesc : Optional[CUDA_RESOURCE_DESC], pTexDesc : Option
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     pTexObject : :py:obj:`~.CUtexObject`
         Texture object to create
 
     See Also
     --------
-    :py:obj:`~.cuTexObjectDestroy`, :py:obj:`~.cudaCreateTextureObject`
+    :py:obj:`~.cuTexObjectDestroy`, :func:`~.cudaCreateTextureObject`
     """
     cdef CUtexObject pTexObject = CUtexObject()
     cdef cydriver.CUDA_RESOURCE_DESC* cypResDesc_ptr = <cydriver.CUDA_RESOURCE_DESC*>pResDesc._pvt_ptr if pResDesc is not None else NULL
@@ -53508,12 +53567,12 @@ def cuTexObjectDestroy(texObject):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
     --------
-    :py:obj:`~.cuTexObjectCreate`, :py:obj:`~.cudaDestroyTextureObject`
+    :py:obj:`~.cuTexObjectCreate`, :func:`~.cudaDestroyTextureObject`
     """
     cdef cydriver.CUtexObject cytexObject
     if texObject is None:
@@ -53544,14 +53603,14 @@ def cuTexObjectGetResourceDesc(texObject):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     pResDesc : :py:obj:`~.CUDA_RESOURCE_DESC`
         Resource descriptor
 
     See Also
     --------
-    :py:obj:`~.cuTexObjectCreate`, :py:obj:`~.cudaGetTextureObjectResourceDesc`,
+    :py:obj:`~.cuTexObjectCreate`, :func:`~.cudaGetTextureObjectResourceDesc`
     """
     cdef cydriver.CUtexObject cytexObject
     if texObject is None:
@@ -53585,14 +53644,14 @@ def cuTexObjectGetTextureDesc(texObject):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     pTexDesc : :py:obj:`~.CUDA_TEXTURE_DESC`
         Texture descriptor
 
     See Also
     --------
-    :py:obj:`~.cuTexObjectCreate`, :py:obj:`~.cudaGetTextureObjectTextureDesc`
+    :py:obj:`~.cuTexObjectCreate`, :func:`~.cudaGetTextureObjectTextureDesc`
     """
     cdef cydriver.CUtexObject cytexObject
     if texObject is None:
@@ -53627,14 +53686,14 @@ def cuTexObjectGetResourceViewDesc(texObject):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     pResViewDesc : :py:obj:`~.CUDA_RESOURCE_VIEW_DESC`
         Resource view descriptor
 
     See Also
     --------
-    :py:obj:`~.cuTexObjectCreate`, :py:obj:`~.cudaGetTextureObjectResourceViewDesc`
+    :py:obj:`~.cuTexObjectCreate`, :func:`~.cudaGetTextureObjectResourceViewDesc`
     """
     cdef cydriver.CUtexObject cytexObject
     if texObject is None:
@@ -53662,9 +53721,9 @@ def cuSurfObjectCreate(pResDesc : Optional[CUDA_RESOURCE_DESC]):
     ``pResDesc`` describes the data to perform surface load/stores on.
     :py:obj:`~.CUDA_RESOURCE_DESC.resType` must be
     :py:obj:`~.CU_RESOURCE_TYPE_ARRAY` and
-    :py:obj:`~.CUDA_RESOURCE_DESC.res.array.hArray` must be set to a valid
-    CUDA array handle. :py:obj:`~.CUDA_RESOURCE_DESC.flags` must be set to
-    zero.
+    :py:obj:`~.CUDA_RESOURCE_DESC.res.array.hArray` must be set to a
+    valid CUDA array handle. :py:obj:`~.CUDA_RESOURCE_DESC.flags` must be
+    set to zero.
 
     Surface objects are only supported on devices of compute capability 3.0
     or higher. Additionally, a surface object is an opaque value, and, as
@@ -53677,14 +53736,14 @@ def cuSurfObjectCreate(pResDesc : Optional[CUDA_RESOURCE_DESC]):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     pSurfObject : :py:obj:`~.CUsurfObject`
         Surface object to create
 
     See Also
     --------
-    :py:obj:`~.cuSurfObjectDestroy`, :py:obj:`~.cudaCreateSurfaceObject`
+    :py:obj:`~.cuSurfObjectDestroy`, :func:`~.cudaCreateSurfaceObject`
     """
     cdef CUsurfObject pSurfObject = CUsurfObject()
     cdef cydriver.CUDA_RESOURCE_DESC* cypResDesc_ptr = <cydriver.CUDA_RESOURCE_DESC*>pResDesc._pvt_ptr if pResDesc is not None else NULL
@@ -53710,12 +53769,12 @@ def cuSurfObjectDestroy(surfObject):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
     --------
-    :py:obj:`~.cuSurfObjectCreate`, :py:obj:`~.cudaDestroySurfaceObject`
+    :py:obj:`~.cuSurfObjectCreate`, :func:`~.cudaDestroySurfaceObject`
     """
     cdef cydriver.CUsurfObject cysurfObject
     if surfObject is None:
@@ -53746,14 +53805,14 @@ def cuSurfObjectGetResourceDesc(surfObject):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     pResDesc : :py:obj:`~.CUDA_RESOURCE_DESC`
         Resource descriptor
 
     See Also
     --------
-    :py:obj:`~.cuSurfObjectCreate`, :py:obj:`~.cudaGetSurfaceObjectResourceDesc`
+    :py:obj:`~.cuSurfObjectCreate`, :func:`~.cudaGetSurfaceObjectResourceDesc`
     """
     cdef cydriver.CUsurfObject cysurfObject
     if surfObject is None:
@@ -53893,14 +53952,14 @@ def cuTensorMapEncodeTiled(tensorDataType not None : CUtensorMapDataType, tensor
       derived from ``tensorDataType``) must be less than or equal to the
       swizzle size.
 
-      - CU_TENSOR_MAP_SWIZZLE_32B requires the bounding box inner dimension
-        to be <= 32.
+      - :py:obj:`~.CU_TENSOR_MAP_SWIZZLE_32B` requires the bounding box
+        inner dimension to be <= 32.
 
-      - CU_TENSOR_MAP_SWIZZLE_64B requires the bounding box inner dimension
-        to be <= 64.
+      - :py:obj:`~.CU_TENSOR_MAP_SWIZZLE_64B` requires the bounding box
+        inner dimension to be <= 64.
 
-      - CU_TENSOR_MAP_SWIZZLE_128B* require the bounding box inner
-        dimension to be <= 128. Additionally, ``tensorDataType`` of
+      - :py:obj:`~.CU_TENSOR_MAP_SWIZZLE_128B`* require the bounding box
+        inner dimension to be <= 128. Additionally, ``tensorDataType`` of
         :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B` requires
         ``interleave`` to be :py:obj:`~.CU_TENSOR_MAP_INTERLEAVE_NONE`.
 
@@ -53922,22 +53981,22 @@ def cuTensorMapEncodeTiled(tensorDataType not None : CUtensorMapDataType, tensor
       :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B`, only the following
       swizzle modes are supported:
 
-      - CU_TENSOR_MAP_SWIZZLE_NONE (Load & Store)
+      - :py:obj:`~.CU_TENSOR_MAP_SWIZZLE_NONE` (Load & Store)
 
-      - CU_TENSOR_MAP_SWIZZLE_128B (Load & Store)
+      - :py:obj:`~.CU_TENSOR_MAP_SWIZZLE_128B` (Load & Store)
 
-      - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B (Load & Store)
+      - :py:obj:`~.CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B` (Load & Store)
 
-      - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_64B (Store only) When the
-        ``tensorDataType`` is
+      - :py:obj:`~.CU_TENSOR_MAP_SWIZZLE_128B_ATOM_64B` (Store only) When
+        the ``tensorDataType`` is
         :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B`, only the
         following swizzle modes are supported:
 
-      - CU_TENSOR_MAP_SWIZZLE_NONE (Load only)
+      - :py:obj:`~.CU_TENSOR_MAP_SWIZZLE_NONE` (Load only)
 
-      - CU_TENSOR_MAP_SWIZZLE_128B (Load only)
+      - :py:obj:`~.CU_TENSOR_MAP_SWIZZLE_128B` (Load only)
 
-      - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B (Load only)
+      - :py:obj:`~.CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B` (Load only)
 
     - ``l2Promotion`` specifies L2 fetch size which indicates the byte
       granurality at which L2 requests is filled from DRAM. It must be of
@@ -53992,7 +54051,7 @@ def cuTensorMapEncodeTiled(tensorDataType not None : CUtensorMapDataType, tensor
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     tensorMap : :py:obj:`~.CUtensorMap`
         Tensor map object to create
@@ -54226,14 +54285,14 @@ def cuTensorMapEncodeIm2col(tensorDataType not None : CUtensorMapDataType, tenso
       size in bytes derived from ``tensorDataType``) must be less than or
       equal to the swizzle size.
 
-      - CU_TENSOR_MAP_SWIZZLE_32B requires the bounding box inner dimension
-        to be <= 32.
+      - :py:obj:`~.CU_TENSOR_MAP_SWIZZLE_32B` requires the bounding box
+        inner dimension to be <= 32.
 
-      - CU_TENSOR_MAP_SWIZZLE_64B requires the bounding box inner dimension
-        to be <= 64.
+      - :py:obj:`~.CU_TENSOR_MAP_SWIZZLE_64B` requires the bounding box
+        inner dimension to be <= 64.
 
-      - CU_TENSOR_MAP_SWIZZLE_128B* require the bounding box inner
-        dimension to be <= 128. Additionally, ``tensorDataType`` of
+      - :py:obj:`~.CU_TENSOR_MAP_SWIZZLE_128B`* require the bounding box
+        inner dimension to be <= 128. Additionally, ``tensorDataType`` of
         :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B` requires
         ``interleave`` to be :py:obj:`~.CU_TENSOR_MAP_INTERLEAVE_NONE`.
 
@@ -54255,22 +54314,22 @@ def cuTensorMapEncodeIm2col(tensorDataType not None : CUtensorMapDataType, tenso
       :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B`, only the following
       swizzle modes are supported:
 
-      - CU_TENSOR_MAP_SWIZZLE_NONE (Load & Store)
+      - :py:obj:`~.CU_TENSOR_MAP_SWIZZLE_NONE` (Load & Store)
 
-      - CU_TENSOR_MAP_SWIZZLE_128B (Load & Store)
+      - :py:obj:`~.CU_TENSOR_MAP_SWIZZLE_128B` (Load & Store)
 
-      - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B (Load & Store)
+      - :py:obj:`~.CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B` (Load & Store)
 
-      - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_64B (Store only) When the
-        ``tensorDataType`` is
+      - :py:obj:`~.CU_TENSOR_MAP_SWIZZLE_128B_ATOM_64B` (Store only) When
+        the ``tensorDataType`` is
         :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B`, only the
         following swizzle modes are supported:
 
-      - CU_TENSOR_MAP_SWIZZLE_NONE (Load only)
+      - :py:obj:`~.CU_TENSOR_MAP_SWIZZLE_NONE` (Load only)
 
-      - CU_TENSOR_MAP_SWIZZLE_128B (Load only)
+      - :py:obj:`~.CU_TENSOR_MAP_SWIZZLE_128B` (Load only)
 
-      - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B (Load only)
+      - :py:obj:`~.CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B` (Load only)
 
     - ``l2Promotion`` specifies L2 fetch size which indicates the byte
       granularity at which L2 requests are filled from DRAM. It must be of
@@ -54329,7 +54388,7 @@ def cuTensorMapEncodeIm2col(tensorDataType not None : CUtensorMapDataType, tenso
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     tensorMap : :py:obj:`~.CUtensorMap`
         Tensor map object to create
@@ -54558,11 +54617,11 @@ def cuTensorMapEncodeIm2colWide(tensorDataType not None : CUtensorMapDataType, t
       size in bytes derived from ``tensorDataType``) must be less than or
       equal to the swizzle size.
 
-      - CU_TENSOR_MAP_SWIZZLE_64B requires the bounding box inner dimension
-        to be <= 64.
+      - :py:obj:`~.CU_TENSOR_MAP_SWIZZLE_64B` requires the bounding box
+        inner dimension to be <= 64.
 
-      - CU_TENSOR_MAP_SWIZZLE_128B* require the bounding box inner
-        dimension to be <= 128. Additionally, ``tensorDataType`` of
+      - :py:obj:`~.CU_TENSOR_MAP_SWIZZLE_128B`* require the bounding box
+        inner dimension to be <= 128. Additionally, ``tensorDataType`` of
         :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B` requires
         ``interleave`` to be :py:obj:`~.CU_TENSOR_MAP_INTERLEAVE_NONE`.
 
@@ -54591,21 +54650,21 @@ def cuTensorMapEncodeIm2colWide(tensorDataType not None : CUtensorMapDataType, t
       :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U6_ALIGN16B`, only the following
       swizzle modes are supported:
 
-      - CU_TENSOR_MAP_SWIZZLE_64B (Store only)
+      - :py:obj:`~.CU_TENSOR_MAP_SWIZZLE_64B` (Store only)
 
-      - CU_TENSOR_MAP_SWIZZLE_128B (Load & Store)
+      - :py:obj:`~.CU_TENSOR_MAP_SWIZZLE_128B` (Load & Store)
 
-      - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B (Load & Store) When the
-        ``tensorDataType`` is
+      - :py:obj:`~.CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B` (Load & Store) When
+        the ``tensorDataType`` is
         :py:obj:`~.CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B`, only the
         following swizzle modes are supported:
 
-      - CU_TENSOR_MAP_SWIZZLE_128B (Load only)
+      - :py:obj:`~.CU_TENSOR_MAP_SWIZZLE_128B` (Load only)
 
-      - CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B (Load only)
+      - :py:obj:`~.CU_TENSOR_MAP_SWIZZLE_128B_ATOM_32B` (Load only)
 
-    Additionally, :py:obj:`~.CU_TENSOR_MAP_SWIZZLE_96B` is supported only
-    when ``mode`` is :py:obj:`~.CU_TENSOR_MAP_IM2COL_WIDE_MODE_W`.
+    Additionally, ``CU_TENSOR_MAP_SWIZZLE_96B`` is supported only when
+    ``mode`` is :py:obj:`~.CU_TENSOR_MAP_IM2COL_WIDE_MODE_W`.
 
     - ``l2Promotion`` specifies L2 fetch size which indicates the byte
       granularity at which L2 requests are filled from DRAM. It must be of
@@ -54666,7 +54725,7 @@ def cuTensorMapEncodeIm2colWide(tensorDataType not None : CUtensorMapDataType, t
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     tensorMap : :py:obj:`~.CUtensorMap`
         Tensor map object to create
@@ -54781,7 +54840,7 @@ def cuTensorMapReplaceAddress(tensorMap : Optional[CUtensorMap], globalAddress):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
@@ -54820,14 +54879,14 @@ def cuDeviceCanAccessPeer(dev, peerDev):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`
     canAccessPeer : int
         Returned access capability
 
     See Also
     --------
-    :py:obj:`~.cuCtxEnablePeerAccess`, :py:obj:`~.cuCtxDisablePeerAccess`, :py:obj:`~.cudaDeviceCanAccessPeer`
+    :py:obj:`~.cuCtxEnablePeerAccess`, :py:obj:`~.cuCtxDisablePeerAccess`, :func:`~.cudaDeviceCanAccessPeer`
     """
     cdef cydriver.CUdevice cypeerDev
     if peerDev is None:
@@ -54864,7 +54923,8 @@ def cuCtxEnablePeerAccess(peerContext, unsigned int Flags):
     :py:obj:`~.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING`) and same major
     compute capability, then on success all allocations from
     ``peerContext`` will immediately be accessible by the current context.
-    See Unified Addressing for additional details.
+    See :ref:`Unified Addressing
+    <cuda-bindings-driver-group__cuda__unified>` for additional details.
 
     Note that access granted by this call is unidirectional and that in
     order to access memory from the current context in ``peerContext``, a
@@ -54903,12 +54963,12 @@ def cuCtxEnablePeerAccess(peerContext, unsigned int Flags):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED`, :py:obj:`~.CUDA_ERROR_TOO_MANY_PEERS`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_PEER_ACCESS_UNSUPPORTED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
     --------
-    :py:obj:`~.cuDeviceCanAccessPeer`, :py:obj:`~.cuCtxDisablePeerAccess`, :py:obj:`~.cudaDeviceEnablePeerAccess`
+    :py:obj:`~.cuDeviceCanAccessPeer`, :py:obj:`~.cuCtxDisablePeerAccess`, :func:`~.cudaDeviceEnablePeerAccess`
     """
     cdef cydriver.CUcontext cypeerContext
     if peerContext is None:
@@ -54943,12 +55003,12 @@ def cuCtxDisablePeerAccess(peerContext):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_PEER_ACCESS_NOT_ENABLED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`,
 
     See Also
     --------
-    :py:obj:`~.cuDeviceCanAccessPeer`, :py:obj:`~.cuCtxEnablePeerAccess`, :py:obj:`~.cudaDeviceDisablePeerAccess`
+    :py:obj:`~.cuDeviceCanAccessPeer`, :py:obj:`~.cuCtxEnablePeerAccess`, :func:`~.cudaDeviceDisablePeerAccess`
     """
     cdef cydriver.CUcontext cypeerContext
     if peerContext is None:
@@ -55008,14 +55068,14 @@ def cuDeviceGetP2PAttribute(attrib not None : CUdevice_P2PAttribute, srcDevice,
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     value : int
         Returned value of the requested attribute
 
     See Also
     --------
-    :py:obj:`~.cuCtxEnablePeerAccess`, :py:obj:`~.cuCtxDisablePeerAccess`, :py:obj:`~.cuDeviceCanAccessPeer`, :py:obj:`~.cuDeviceGetP2PAtomicCapabilities`, :py:obj:`~.cudaDeviceGetP2PAttribute`
+    :py:obj:`~.cuCtxEnablePeerAccess`, :py:obj:`~.cuCtxDisablePeerAccess`, :py:obj:`~.cuDeviceCanAccessPeer`, :py:obj:`~.cuDeviceGetP2PAtomicCapabilities`, :func:`~.cudaDeviceGetP2PAttribute`
     """
     cdef cydriver.CUdevice cydstDevice
     if dstDevice is None:
@@ -55078,14 +55138,14 @@ def cuDeviceGetP2PAtomicCapabilities(operations : Optional[tuple[CUatomicOperati
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     capabilities : list[unsigned int]
         Returned capability details of each requested operation
 
     See Also
     --------
-    :py:obj:`~.cuDeviceGetP2PAttribute`, :py:obj:`~.cudaDeviceGetP2PAttribute`, :py:obj:`~.cudaDeviceGetP2PAtomicCapabilities`
+    :py:obj:`~.cuDeviceGetP2PAttribute`, :func:`~.cudaDeviceGetP2PAttribute`, :func:`~.cudaDeviceGetP2PAtomicCapabilities`
     """
     cdef cydriver.CUdevice cydstDevice
     if dstDevice is None:
@@ -55144,12 +55204,12 @@ def cuGraphicsUnregisterResource(resource):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_UNKNOWN`
 
     See Also
     --------
-    :py:obj:`~.cuGraphicsD3D9RegisterResource`, :py:obj:`~.cuGraphicsD3D10RegisterResource`, :py:obj:`~.cuGraphicsD3D11RegisterResource`, :py:obj:`~.cuGraphicsGLRegisterBuffer`, :py:obj:`~.cuGraphicsGLRegisterImage`, :py:obj:`~.cudaGraphicsUnregisterResource`
+    ``cuGraphicsD3D9RegisterResource``, ``cuGraphicsD3D10RegisterResource``, ``cuGraphicsD3D11RegisterResource``, :py:obj:`~.cuGraphicsGLRegisterBuffer`, :py:obj:`~.cuGraphicsGLRegisterImage`, :func:`~.cudaGraphicsUnregisterResource`
     """
     cdef cydriver.CUgraphicsResource cyresource
     if resource is None:
@@ -55196,7 +55256,7 @@ def cuGraphicsSubResourceGetMappedArray(resource, unsigned int arrayIndex, unsig
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_NOT_MAPPED`, :py:obj:`~.CUDA_ERROR_NOT_MAPPED_AS_ARRAY`
     pArray : :py:obj:`~.CUarray`
         Returned array through which a subresource of ``resource`` may be
@@ -55204,7 +55264,7 @@ def cuGraphicsSubResourceGetMappedArray(resource, unsigned int arrayIndex, unsig
 
     See Also
     --------
-    :py:obj:`~.cuGraphicsResourceGetMappedPointer`, :py:obj:`~.cudaGraphicsSubResourceGetMappedArray`
+    :py:obj:`~.cuGraphicsResourceGetMappedPointer`, :func:`~.cudaGraphicsSubResourceGetMappedArray`
     """
     cdef cydriver.CUgraphicsResource cyresource
     if resource is None:
@@ -55244,14 +55304,14 @@ def cuGraphicsResourceGetMappedMipmappedArray(resource):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_NOT_MAPPED`, :py:obj:`~.CUDA_ERROR_NOT_MAPPED_AS_ARRAY`
     pMipmappedArray : :py:obj:`~.CUmipmappedArray`
         Returned mipmapped array through which ``resource`` may be accessed
 
     See Also
     --------
-    :py:obj:`~.cuGraphicsResourceGetMappedPointer`, :py:obj:`~.cudaGraphicsResourceGetMappedMipmappedArray`
+    :py:obj:`~.cuGraphicsResourceGetMappedPointer`, :func:`~.cudaGraphicsResourceGetMappedMipmappedArray`
     """
     cdef cydriver.CUgraphicsResource cyresource
     if resource is None:
@@ -55292,7 +55352,7 @@ def cuGraphicsResourceGetMappedPointer(resource):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
 
     pDevPtr : :py:obj:`~.CUdeviceptr`
         None
@@ -55332,14 +55392,13 @@ def cuGraphicsResourceSetMapFlags(resource, unsigned int flags):
       this resource will be read from and written to by CUDA kernels. This
       is the default value.
 
-    - :py:obj:`~.CU_GRAPHICS_MAP_RESOURCE_FLAGS_READONLY`: Specifies that
-      CUDA kernels which access this resource will not write to this
-      resource.
+    - ``CU_GRAPHICS_MAP_RESOURCE_FLAGS_READONLY``: Specifies that CUDA
+      kernels which access this resource will not write to this resource.
 
-    - :py:obj:`~.CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITEDISCARD`: Specifies
-      that CUDA kernels which access this resource will not read from this
-      resource and will write over the entire contents of the resource, so
-      none of the data previously stored in the resource will be preserved.
+    - ``CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITEDISCARD``: Specifies that CUDA
+      kernels which access this resource will not read from this resource
+      and will write over the entire contents of the resource, so none of
+      the data previously stored in the resource will be preserved.
 
     If ``resource`` is presently mapped for access by CUDA then
     :py:obj:`~.CUDA_ERROR_ALREADY_MAPPED` is returned. If ``flags`` is not
@@ -55355,12 +55414,12 @@ def cuGraphicsResourceSetMapFlags(resource, unsigned int flags):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_ALREADY_MAPPED`
 
     See Also
     --------
-    :py:obj:`~.cuGraphicsMapResources`, :py:obj:`~.cudaGraphicsResourceSetMapFlags`
+    :py:obj:`~.cuGraphicsMapResources`, :func:`~.cudaGraphicsResourceSetMapFlags`
     """
     cdef cydriver.CUgraphicsResource cyresource
     if resource is None:
@@ -55409,12 +55468,12 @@ def cuGraphicsMapResources(unsigned int count, resources, hStream):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_ALREADY_MAPPED`, :py:obj:`~.CUDA_ERROR_UNKNOWN`
 
     See Also
     --------
-    :py:obj:`~.cuGraphicsResourceGetMappedPointer`, :py:obj:`~.cuGraphicsSubResourceGetMappedArray`, :py:obj:`~.cuGraphicsUnmapResources`, :py:obj:`~.cudaGraphicsMapResources`
+    :py:obj:`~.cuGraphicsResourceGetMappedPointer`, :py:obj:`~.cuGraphicsSubResourceGetMappedArray`, :py:obj:`~.cuGraphicsUnmapResources`, :func:`~.cudaGraphicsMapResources`
     """
     cdef cydriver.CUstream cyhStream
     if hStream is None:
@@ -55470,12 +55529,12 @@ def cuGraphicsUnmapResources(unsigned int count, resources, hStream):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_NOT_MAPPED`, :py:obj:`~.CUDA_ERROR_UNKNOWN`
 
     See Also
     --------
-    :py:obj:`~.cuGraphicsMapResources`, :py:obj:`~.cudaGraphicsUnmapResources`
+    :py:obj:`~.cuGraphicsMapResources`, :func:`~.cudaGraphicsUnmapResources`
     """
     cdef cydriver.CUstream cyhStream
     if hStream is None:
@@ -55565,9 +55624,9 @@ def cuGetProcAddress(char* symbol, int cudaVersion, flags):
     ----------
     symbol : bytes
         The base name of the driver API function to look for. As an
-        example, for the driver API :py:obj:`~.cuMemAlloc_v2`, ``symbol``
-        would be cuMemAlloc and ``cudaVersion`` would be the ABI compatible
-        CUDA version for the _v2 variant.
+        example, for the driver API ``cuMemAlloc_v2``, ``symbol`` would be
+        ``cuMemAlloc`` and ``cudaVersion`` would be the ABI compatible CUDA
+        version for the _v2 variant.
     cudaVersion : int
         The CUDA version to look for the requested driver symbol
     flags : Any
@@ -55575,7 +55634,7 @@ def cuGetProcAddress(char* symbol, int cudaVersion, flags):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
     pfn : Any
         Location to return the function pointer to the requested driver
@@ -55587,7 +55646,7 @@ def cuGetProcAddress(char* symbol, int cudaVersion, flags):
 
     See Also
     --------
-    :py:obj:`~.cudaGetDriverEntryPointByVersion`
+    :func:`~.cudaGetDriverEntryPointByVersion`
     """
     cdef cydriver.cuuint64_t cyflags
     if flags is None:
@@ -55622,46 +55681,43 @@ def cuCoredumpGetAttribute(attrib not None : CUcoredumpSettings):
 
     The supported attributes are:
 
-    - :py:obj:`~.CU_COREDUMP_ENABLE_ON_EXCEPTION`: Bool where
-      :py:obj:`~.true` means that GPU exceptions from this context will
-      create a coredump at the location specified by
-      :py:obj:`~.CU_COREDUMP_FILE`. The default value is :py:obj:`~.false`
-      unless set to :py:obj:`~.true` globally or locally, or the
-      CU_CTX_USER_COREDUMP_ENABLE flag was set during context creation.
-
-    - :py:obj:`~.CU_COREDUMP_TRIGGER_HOST`: Bool where :py:obj:`~.true`
-      means that the host CPU will also create a coredump. The default
-      value is :py:obj:`~.true` unless set to :py:obj:`~.false` globally or
-      or locally. This value is deprecated as of CUDA 12.5 - raise the
+    - :py:obj:`~.CU_COREDUMP_ENABLE_ON_EXCEPTION`: Bool where ``true``
+      means that GPU exceptions from this context will create a coredump at
+      the location specified by :py:obj:`~.CU_COREDUMP_FILE`. The default
+      value is ``false`` unless set to ``true`` globally or locally, or the
+      :py:obj:`~.CU_CTX_USER_COREDUMP_ENABLE` flag was set during context
+      creation.
+
+    - :py:obj:`~.CU_COREDUMP_TRIGGER_HOST`: Bool where ``true`` means that
+      the host CPU will also create a coredump. The default value is
+      ``true`` unless set to ``false`` globally or locally. This value
+      is deprecated as of CUDA 12.5 - raise the
       :py:obj:`~.CU_COREDUMP_SKIP_ABORT` flag to disable host device
       abort() if needed.
 
-    - :py:obj:`~.CU_COREDUMP_LIGHTWEIGHT`: Bool where :py:obj:`~.true`
-      means that any resulting coredumps will not have a dump of GPU memory
-      or non-reloc ELF images. The default value is :py:obj:`~.false`
-      unless set to :py:obj:`~.true` globally or locally. This attribute is
-      deprecated as of CUDA 12.5, please use
-      :py:obj:`~.CU_COREDUMP_GENERATION_FLAGS` instead.
+    - :py:obj:`~.CU_COREDUMP_LIGHTWEIGHT`: Bool where ``true`` means that
+      any resulting coredumps will not have a dump of GPU memory or non-
+      reloc ELF images. The default value is ``false`` unless set to
+      ``true`` globally or locally. This attribute is deprecated as of CUDA
+      12.5, please use :py:obj:`~.CU_COREDUMP_GENERATION_FLAGS` instead.
 
-    - :py:obj:`~.CU_COREDUMP_ENABLE_USER_TRIGGER`: Bool where
-      :py:obj:`~.true` means that a coredump can be created by writing to
-      the system pipe specified by :py:obj:`~.CU_COREDUMP_PIPE`. The
-      default value is :py:obj:`~.false` unless set to :py:obj:`~.true`
-      globally or locally.
+    - :py:obj:`~.CU_COREDUMP_ENABLE_USER_TRIGGER`: Bool where ``true``
+      means that a coredump can be created by writing to the system pipe
+      specified by :py:obj:`~.CU_COREDUMP_PIPE`. The default value is
+      ``false`` unless set to ``true`` globally or locally.
 
     - :py:obj:`~.CU_COREDUMP_FILE`: String of up to 1023 characters that
       defines the location where any coredumps generated by this context
-      will be written. The default value is
-      :py:obj:`~.core`.cuda.HOSTNAME.PID where :py:obj:`~.HOSTNAME` is the
-      host name of the machine running the CUDA applications and
-      :py:obj:`~.PID` is the process ID of the CUDA application.
+      will be written. The default value is ``core.cuda.HOSTNAME.PID``
+      where ``HOSTNAME`` is the host name of the machine running the CUDA
+      applications and ``PID`` is the process ID of the CUDA application.
 
     - :py:obj:`~.CU_COREDUMP_PIPE`: String of up to 1023 characters that
       defines the name of the pipe that will be monitored if user-triggered
       coredumps are enabled. The default value is
-      :py:obj:`~.corepipe`.cuda.HOSTNAME.PID where :py:obj:`~.HOSTNAME` is
-      the host name of the machine running the CUDA application and
-      :py:obj:`~.PID` is the process ID of the CUDA application.
+      ``corepipe.cuda.HOSTNAME.PID`` where ``HOSTNAME`` is the host name of
+      the machine running the CUDA application and ``PID`` is the process
+      ID of the CUDA application.
 
     - :py:obj:`~.CU_COREDUMP_GENERATION_FLAGS`: An integer with values to
       allow granular control the data contained in a coredump specified as
@@ -55688,7 +55744,7 @@ def cuCoredumpGetAttribute(attrib not None : CUcoredumpSettings):
 
       - :py:obj:`~.CU_COREDUMP_LIGHTWEIGHT_FLAGS` - Enables all of the
         above options. Equiavlent to setting the
-        :py:obj:`~.CU_COREDUMP_LIGHTWEIGHT` attribute to :py:obj:`~.true`.
+        :py:obj:`~.CU_COREDUMP_LIGHTWEIGHT` attribute to ``true``.
 
       - :py:obj:`~.CU_COREDUMP_SKIP_ABORT` - If set, GPU exceptions will
         not raise an abort() in the host CPU process. Same functional goal
@@ -55726,10 +55782,10 @@ def cuCoredumpGetAttribute(attrib not None : CUcoredumpSettings):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_CONTEXT_IS_DESTROYED`
     value : Any
-        void* containing the requested data.
+        ``void*`` containing the requested data.
     size : int
         The size of the memory region ``value`` points to.
 
@@ -55764,42 +55820,41 @@ def cuCoredumpGetAttributeGlobal(attrib not None : CUcoredumpSettings):
 
     The supported attributes are:
 
-    - :py:obj:`~.CU_COREDUMP_ENABLE_ON_EXCEPTION`: Bool where
-      :py:obj:`~.true` means that GPU exceptions from this context will
-      create a coredump at the location specified by
-      :py:obj:`~.CU_COREDUMP_FILE`. The default value is :py:obj:`~.false`.
+    - :py:obj:`~.CU_COREDUMP_ENABLE_ON_EXCEPTION`: Bool where ``true``
+      means that GPU exceptions from this context will create a coredump at
+      the location specified by :py:obj:`~.CU_COREDUMP_FILE`. The default
+      value is ``false``.
 
-    - :py:obj:`~.CU_COREDUMP_TRIGGER_HOST`: Bool where :py:obj:`~.true`
-      means that the host CPU will also create a coredump. The default
-      value is :py:obj:`~.true` unless set to :py:obj:`~.false` globally or
-      or locally. This value is deprecated as of CUDA 12.5 - raise the
+    - :py:obj:`~.CU_COREDUMP_TRIGGER_HOST`: Bool where ``true`` means that
+      the host CPU will also create a coredump. The default value is
+      ``true`` unless set to ``false`` globally or locally. This value
+      is deprecated as of CUDA 12.5 - raise the
       :py:obj:`~.CU_COREDUMP_SKIP_ABORT` flag to disable host device
       abort() if needed.
 
-    - :py:obj:`~.CU_COREDUMP_LIGHTWEIGHT`: Bool where :py:obj:`~.true`
-      means that any resulting coredumps will not have a dump of GPU memory
-      or non-reloc ELF images. The default value is :py:obj:`~.false`. This
-      attribute is deprecated as of CUDA 12.5, please use
+    - :py:obj:`~.CU_COREDUMP_LIGHTWEIGHT`: Bool where ``true`` means that
+      any resulting coredumps will not have a dump of GPU memory or
+      non-reloc ELF images. The default value is ``false``. This attribute is
+      deprecated as of CUDA 12.5, please use
       :py:obj:`~.CU_COREDUMP_GENERATION_FLAGS` instead.
 
-    - :py:obj:`~.CU_COREDUMP_ENABLE_USER_TRIGGER`: Bool where
-      :py:obj:`~.true` means that a coredump can be created by writing to
-      the system pipe specified by :py:obj:`~.CU_COREDUMP_PIPE`. The
-      default value is :py:obj:`~.false`.
+    - :py:obj:`~.CU_COREDUMP_ENABLE_USER_TRIGGER`: Bool where ``true``
+      means that a coredump can be created by writing to the system pipe
+      specified by :py:obj:`~.CU_COREDUMP_PIPE`. The default value is
+      ``false``.
 
     - :py:obj:`~.CU_COREDUMP_FILE`: String of up to 1023 characters that
       defines the location where any coredumps generated by this context
-      will be written. The default value is
-      :py:obj:`~.core`.cuda.HOSTNAME.PID where :py:obj:`~.HOSTNAME` is the
-      host name of the machine running the CUDA applications and
-      :py:obj:`~.PID` is the process ID of the CUDA application.
+      will be written. The default value is ``core.cuda.HOSTNAME.PID``
+      where ``HOSTNAME`` is the host name of the machine running the CUDA
+      applications and ``PID`` is the process ID of the CUDA application.
 
     - :py:obj:`~.CU_COREDUMP_PIPE`: String of up to 1023 characters that
       defines the name of the pipe that will be monitored if user-triggered
       coredumps are enabled. The default value is
-      :py:obj:`~.corepipe`.cuda.HOSTNAME.PID where :py:obj:`~.HOSTNAME` is
-      the host name of the machine running the CUDA application and
-      :py:obj:`~.PID` is the process ID of the CUDA application.
+      ``corepipe.cuda.HOSTNAME.PID`` where ``HOSTNAME`` is the host name of
+      the machine running the CUDA application and ``PID`` is the process
+      ID of the CUDA application.
 
     - :py:obj:`~.CU_COREDUMP_GENERATION_FLAGS`: An integer with values to
       allow granular control the data contained in a coredump specified as
@@ -55826,7 +55881,7 @@ def cuCoredumpGetAttributeGlobal(attrib not None : CUcoredumpSettings):
 
       - :py:obj:`~.CU_COREDUMP_LIGHTWEIGHT_FLAGS` - Enables all of the
         above options. Equiavlent to setting the
-        :py:obj:`~.CU_COREDUMP_LIGHTWEIGHT` attribute to :py:obj:`~.true`.
+        :py:obj:`~.CU_COREDUMP_LIGHTWEIGHT` attribute to ``true``.
 
       - :py:obj:`~.CU_COREDUMP_SKIP_ABORT` - If set, GPU exceptions will
         not raise an abort() in the host CPU process. Same functional goal
@@ -55864,10 +55919,10 @@ def cuCoredumpGetAttributeGlobal(attrib not None : CUcoredumpSettings):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     value : Any
-        void* containing the requested data.
+        ``void*`` containing the requested data.
     size : int
         The size of the memory region ``value`` points to.
 
@@ -55921,30 +55976,29 @@ def cuCoredumpSetAttribute(attrib not None : CUcoredumpSettings, value):
 
     The supported attributes are:
 
-    - :py:obj:`~.CU_COREDUMP_ENABLE_ON_EXCEPTION`: Bool where
-      :py:obj:`~.true` means that GPU exceptions from this context will
-      create a coredump at the location specified by
-      :py:obj:`~.CU_COREDUMP_FILE`. The default value is :py:obj:`~.false`.
+    - :py:obj:`~.CU_COREDUMP_ENABLE_ON_EXCEPTION`: Bool where ``true``
+      means that GPU exceptions from this context will create a coredump at
+      the location specified by :py:obj:`~.CU_COREDUMP_FILE`. The default
+      value is ``false``.
 
-    - :py:obj:`~.CU_COREDUMP_TRIGGER_HOST`: Bool where :py:obj:`~.true`
-      means that the host CPU will also create a coredump. The default
-      value is :py:obj:`~.true` unless set to :py:obj:`~.false` globally or
-      or locally. This value is deprecated as of CUDA 12.5 - raise the
+    - :py:obj:`~.CU_COREDUMP_TRIGGER_HOST`: Bool where ``true`` means that
+      the host CPU will also create a coredump. The default value is
+      ``true`` unless set to ``false`` globally or locally. This value
+      is deprecated as of CUDA 12.5 - raise the
       :py:obj:`~.CU_COREDUMP_SKIP_ABORT` flag to disable host device
       abort() if needed.
 
-    - :py:obj:`~.CU_COREDUMP_LIGHTWEIGHT`: Bool where :py:obj:`~.true`
-      means that any resulting coredumps will not have a dump of GPU memory
-      or non-reloc ELF images. The default value is :py:obj:`~.false`. This
-      attribute is deprecated as of CUDA 12.5, please use
+    - :py:obj:`~.CU_COREDUMP_LIGHTWEIGHT`: Bool where ``true`` means that
+      any resulting coredumps will not have a dump of GPU memory or
+      non-reloc ELF images. The default value is ``false``. This attribute is
+      deprecated as of CUDA 12.5, please use
       :py:obj:`~.CU_COREDUMP_GENERATION_FLAGS` instead.
 
     - :py:obj:`~.CU_COREDUMP_FILE`: String of up to 1023 characters that
       defines the location where any coredumps generated by this context
-      will be written. The default value is
-      :py:obj:`~.core`.cuda.HOSTNAME.PID where :py:obj:`~.HOSTNAME` is the
-      host name of the machine running the CUDA applications and
-      :py:obj:`~.PID` is the process ID of the CUDA application.
+      will be written. The default value is ``core.cuda.HOSTNAME.PID``
+      where ``HOSTNAME`` is the host name of the machine running the CUDA
+      applications and ``PID`` is the process ID of the CUDA application.
 
     - :py:obj:`~.CU_COREDUMP_GENERATION_FLAGS`: An integer with values to
       allow granular control the data contained in a coredump specified as
@@ -55971,7 +56025,7 @@ def cuCoredumpSetAttribute(attrib not None : CUcoredumpSettings, value):
 
       - :py:obj:`~.CU_COREDUMP_LIGHTWEIGHT_FLAGS` - Enables all of the
         above options. Equiavlent to setting the
-        :py:obj:`~.CU_COREDUMP_LIGHTWEIGHT` attribute to :py:obj:`~.true`.
+        :py:obj:`~.CU_COREDUMP_LIGHTWEIGHT` attribute to ``true``.
 
       - :py:obj:`~.CU_COREDUMP_SKIP_ABORT` - If set, GPU exceptions will
         not raise an abort() in the host CPU process. Same functional goal
@@ -56005,13 +56059,13 @@ def cuCoredumpSetAttribute(attrib not None : CUcoredumpSettings, value):
     attrib : :py:obj:`~.CUcoredumpSettings`
         The enum defining which value to set.
     value : Any
-        void* containing the requested data.
+        ``void*`` containing the requested data.
     size : int
         The size of the memory region ``value`` points to.
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_CONTEXT_IS_DESTROYED`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
     size : int
         The size of the memory region ``value`` points to.
@@ -56059,44 +56113,42 @@ def cuCoredumpSetAttributeGlobal(attrib not None : CUcoredumpSettings, value):
 
     The supported attributes are:
 
-    - :py:obj:`~.CU_COREDUMP_ENABLE_ON_EXCEPTION`: Bool where
-      :py:obj:`~.true` means that GPU exceptions from this context will
-      create a coredump at the location specified by
-      :py:obj:`~.CU_COREDUMP_FILE`. The default value is :py:obj:`~.false`.
+    - :py:obj:`~.CU_COREDUMP_ENABLE_ON_EXCEPTION`: Bool where ``true``
+      means that GPU exceptions from this context will create a coredump at
+      the location specified by :py:obj:`~.CU_COREDUMP_FILE`. The default
+      value is ``false``.
 
-    - :py:obj:`~.CU_COREDUMP_TRIGGER_HOST`: Bool where :py:obj:`~.true`
-      means that the host CPU will also create a coredump. The default
-      value is :py:obj:`~.true` unless set to :py:obj:`~.false` globally or
-      or locally. This value is deprecated as of CUDA 12.5 - raise the
+    - :py:obj:`~.CU_COREDUMP_TRIGGER_HOST`: Bool where ``true`` means that
+      the host CPU will also create a coredump. The default value is
+      ``true`` unless set to ``false`` globally or locally. This value
+      is deprecated as of CUDA 12.5 - raise the
       :py:obj:`~.CU_COREDUMP_SKIP_ABORT` flag to disable host device
       abort() if needed.
 
-    - :py:obj:`~.CU_COREDUMP_LIGHTWEIGHT`: Bool where :py:obj:`~.true`
-      means that any resulting coredumps will not have a dump of GPU memory
-      or non-reloc ELF images. The default value is :py:obj:`~.false`. This
-      attribute is deprecated as of CUDA 12.5, please use
+    - :py:obj:`~.CU_COREDUMP_LIGHTWEIGHT`: Bool where ``true`` means that
+      any resulting coredumps will not have a dump of GPU memory or
+      non-reloc ELF images. The default value is ``false``. This attribute is
+      deprecated as of CUDA 12.5, please use
       :py:obj:`~.CU_COREDUMP_GENERATION_FLAGS` instead.
 
-    - :py:obj:`~.CU_COREDUMP_ENABLE_USER_TRIGGER`: Bool where
-      :py:obj:`~.true` means that a coredump can be created by writing to
-      the system pipe specified by :py:obj:`~.CU_COREDUMP_PIPE`. The
-      default value is :py:obj:`~.false`.
+    - :py:obj:`~.CU_COREDUMP_ENABLE_USER_TRIGGER`: Bool where ``true``
+      means that a coredump can be created by writing to the system pipe
+      specified by :py:obj:`~.CU_COREDUMP_PIPE`. The default value is
+      ``false``.
 
     - :py:obj:`~.CU_COREDUMP_FILE`: String of up to 1023 characters that
       defines the location where any coredumps generated by this context
-      will be written. The default value is
-      :py:obj:`~.core`.cuda.HOSTNAME.PID where :py:obj:`~.HOSTNAME` is the
-      host name of the machine running the CUDA applications and
-      :py:obj:`~.PID` is the process ID of the CUDA application.
+      will be written. The default value is ``core.cuda.HOSTNAME.PID``
+      where ``HOSTNAME`` is the host name of the machine running the CUDA
+      applications and ``PID`` is the process ID of the CUDA application.
 
     - :py:obj:`~.CU_COREDUMP_PIPE`: String of up to 1023 characters that
       defines the name of the pipe that will be monitored if user-triggered
       coredumps are enabled. This value may not be changed after
-      :py:obj:`~.CU_COREDUMP_ENABLE_USER_TRIGGER` is set to
-      :py:obj:`~.true`. The default value is
-      :py:obj:`~.corepipe`.cuda.HOSTNAME.PID where :py:obj:`~.HOSTNAME` is
-      the host name of the machine running the CUDA application and
-      :py:obj:`~.PID` is the process ID of the CUDA application.
+      :py:obj:`~.CU_COREDUMP_ENABLE_USER_TRIGGER` is set to ``true``. The
+      default value is ``corepipe.cuda.HOSTNAME.PID`` where ``HOSTNAME`` is
+      the host name of the machine running the CUDA application and ``PID``
+      is the process ID of the CUDA application.
 
     - :py:obj:`~.CU_COREDUMP_GENERATION_FLAGS`: An integer with values to
       allow granular control the data contained in a coredump specified as
@@ -56123,7 +56175,7 @@ def cuCoredumpSetAttributeGlobal(attrib not None : CUcoredumpSettings, value):
 
       - :py:obj:`~.CU_COREDUMP_LIGHTWEIGHT_FLAGS` - Enables all of the
         above options. Equiavlent to setting the
-        :py:obj:`~.CU_COREDUMP_LIGHTWEIGHT` attribute to :py:obj:`~.true`.
+        :py:obj:`~.CU_COREDUMP_LIGHTWEIGHT` attribute to ``true``.
 
       - :py:obj:`~.CU_COREDUMP_SKIP_ABORT` - If set, GPU exceptions will
         not raise an abort() in the host CPU process. Same functional goal
@@ -56157,13 +56209,13 @@ def cuCoredumpSetAttributeGlobal(attrib not None : CUcoredumpSettings, value):
     attrib : :py:obj:`~.CUcoredumpSettings`
         The enum defining which value to set.
     value : Any
-        void* containing the requested data.
+        ``void*`` containing the requested data.
     size : int
         The size of the memory region ``value`` points to.
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`
     size : int
         The size of the memory region ``value`` points to.
@@ -56206,7 +56258,7 @@ def cuCoredumpRegisterStartCallback(callback, userData):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`
     callbackOut : :py:obj:`~.CUcoredumpCallbackHandle`
         Location to store the callback handle (optional, may be NULL)
@@ -56259,7 +56311,7 @@ def cuCoredumpRegisterCompleteCallback(callback, userData):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`
     callbackOut : :py:obj:`~.CUcoredumpCallbackHandle`
         Location to store the callback handle (optional, may be NULL)
@@ -56308,7 +56360,7 @@ def cuCoredumpDeregisterStartCallback(callback):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
@@ -56349,7 +56401,7 @@ def cuCoredumpDeregisterCompleteCallback(callback):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
 
     See Also
@@ -56386,7 +56438,7 @@ def cuGetExportTable(pExportTableId : Optional[CUuuid]):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
 
     ppExportTable : Any
         None
@@ -56416,21 +56468,21 @@ def cuGreenCtxCreate(desc, dev, unsigned int flags):
 
     The API does not set the green context current. In order to set it
     current, you need to explicitly set it current by first converting the
-    green context to a CUcontext using :py:obj:`~.cuCtxFromGreenCtx` and
-    subsequently calling :py:obj:`~.cuCtxSetCurrent` /
-    :py:obj:`~.cuCtxPushCurrent`. It should be noted that a green context
-    can be current to only one thread at a time. There is no internal
-    synchronization to make API calls accessing the same green context from
-    multiple threads work.
+    green context to a :py:obj:`~.CUcontext` using
+    :py:obj:`~.cuCtxFromGreenCtx` and subsequently calling
+    :py:obj:`~.cuCtxSetCurrent` / :py:obj:`~.cuCtxPushCurrent`. It should
+    be noted that a green context can be current to only one thread at a
+    time. There is no internal synchronization to make API calls accessing
+    the same green context from multiple threads work.
 
     Note: The API is not supported on 32-bit platforms.
 
     The supported flags are:
 
-    - ``CU_GREEN_CTX_NONE`` : Default behavior.
+    - :py:obj:`~.CU_GREEN_CTX_NONE` : Default behavior.
 
-    - ``CU_GREEN_CTX_DEFAULT_STREAM`` : Creates a default stream to use
-      inside the green context.
+    - :py:obj:`~.CU_GREEN_CTX_DEFAULT_STREAM` : Creates a default stream to
+      use inside the green context.
 
     Parameters
     ----------
@@ -56444,7 +56496,7 @@ def cuGreenCtxCreate(desc, dev, unsigned int flags):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`
     phCtx : :py:obj:`~.CUgreenCtx`
         Pointer for the output handle to the green context
@@ -56507,7 +56559,7 @@ def cuGreenCtxDestroy(hCtx):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_CONTEXT_IS_DESTROYED`
 
     See Also
@@ -56570,7 +56622,7 @@ def cuCtxFromGreenCtx(hCtx):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     pContext : :py:obj:`~.CUcontext`
         Returned :py:obj:`~.CUcontext` with green context resources
@@ -56616,7 +56668,7 @@ def cuDeviceGetDevResource(device, typename not None : CUdevResourceType):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_RESOURCE_TYPE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`
     resource : :py:obj:`~.CUdevResource`
         Output pointer to a :py:obj:`~.CUdevResource` structure
@@ -56660,7 +56712,7 @@ def cuCtxGetDevResource(hCtx, typename not None : CUdevResourceType):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_RESOURCE_TYPE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`
     resource : :py:obj:`~.CUdevResource`
         Output pointer to a :py:obj:`~.CUdevResource` structure
@@ -56704,7 +56756,7 @@ def cuGreenCtxGetDevResource(hCtx, typename not None : CUdevResourceType):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS` :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_RESOURCE_TYPE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     resource : :py:obj:`~.CUdevResource`
         Output pointer to a :py:obj:`~.CUdevResource` structure
@@ -56734,9 +56786,9 @@ def cuGreenCtxGetDevResource(hCtx, typename not None : CUdevResourceType):
 
 @cython.embedsignature(True)
 def cuDevSmResourceSplitByCount(unsigned int nbGroups, input_ : Optional[CUdevResource], unsigned int flags, unsigned int minCount):
-    """ Splits ``CU_DEV_RESOURCE_TYPE_SM`` resources.
+    """ Splits :py:obj:`~.CU_DEV_RESOURCE_TYPE_SM` resources.
 
-    Splits ``CU_DEV_RESOURCE_TYPE_SM`` resources into ``nbGroups``,
+    Splits :py:obj:`~.CU_DEV_RESOURCE_TYPE_SM` resources into ``nbGroups``,
     adhering to the minimum SM count specified in ``minCount`` and the
     usage flags in ``flags``. If ``result`` is NULL, the API simulates a
     split and provides the amount of groups that would be created in
@@ -56772,30 +56824,31 @@ def cuDevSmResourceSplitByCount(unsigned int nbGroups, input_ : Optional[CUdevRe
 
     The following flags are supported:
 
-    - ``CU_DEV_SM_RESOURCE_SPLIT_IGNORE_SM_COSCHEDULING`` : Lower the
-      minimum SM count and alignment, and treat each SM independent of its
-      hierarchy. This allows more fine grained partitions but at the cost
-      of advanced features (such as large clusters on compute capability
-      9.0+).
+    - :py:obj:`~.CU_DEV_SM_RESOURCE_SPLIT_IGNORE_SM_COSCHEDULING` : Lower
+      the minimum SM count and alignment, and treat each SM independent of
+      its hierarchy. This allows more fine grained partitions but at the
+      cost of advanced features (such as large clusters on compute
+      capability 9.0+).
 
-    - ``CU_DEV_SM_RESOURCE_SPLIT_MAX_POTENTIAL_CLUSTER_SIZE`` : Compute
-      Capability 9.0+ only. Attempt to create groups that may allow for
-      maximally sized thread clusters. This can be queried post green
+    - :py:obj:`~.CU_DEV_SM_RESOURCE_SPLIT_MAX_POTENTIAL_CLUSTER_SIZE` :
+      Compute Capability 9.0+ only. Attempt to create groups that may allow
+      for maximally sized thread clusters. This can be queried post green
       context creation using
       :py:obj:`~.cuOccupancyMaxPotentialClusterSize`.
 
     A successful API call must either have:
 
     - A valid array of ``result`` pointers of size passed in ``nbGroups``,
-      with ``input`` of type ``CU_DEV_RESOURCE_TYPE_SM``. Value of
+      with ``input`` of type :py:obj:`~.CU_DEV_RESOURCE_TYPE_SM`. Value of
       ``minCount`` must be between 0 and the SM count specified in
       ``input``. ``remainder`` may be NULL.
 
     - NULL passed in for ``result``, with a valid integer pointer in
-      ``nbGroups`` and ``input`` of type ``CU_DEV_RESOURCE_TYPE_SM``. Value
-      of ``minCount`` must be between 0 and the SM count specified in
-      ``input``. ``remainder`` may be NULL. This queries the number of
-      groups that would be created by the API.
+      ``nbGroups`` and ``input`` of type
+      :py:obj:`~.CU_DEV_RESOURCE_TYPE_SM`. Value of ``minCount`` must be
+      between 0 and the SM count specified in ``input``. ``remainder`` may
+      be NULL. This queries the number of groups that would be created by
+      the API.
 
     Note: The API is not supported on 32-bit platforms.
 
@@ -56806,7 +56859,7 @@ def cuDevSmResourceSplitByCount(unsigned int nbGroups, input_ : Optional[CUdevRe
         should be created as described below.
     input : :py:obj:`~.CUdevResource`
         Input SM resource to be split. Must be a valid
-        ``CU_DEV_RESOURCE_TYPE_SM`` resource.
+        :py:obj:`~.CU_DEV_RESOURCE_TYPE_SM` resource.
     flags : unsigned int
         Flags specifying how these partitions are used or which constraints
         to abide by when splitting the input. Zero is valid for default
@@ -56816,11 +56869,11 @@ def cuDevSmResourceSplitByCount(unsigned int nbGroups, input_ : Optional[CUdevRe
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_RESOURCE_TYPE`, :py:obj:`~.CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION`
     result : list[:py:obj:`~.CUdevResource`]
-        Output array of ``CUdevResource`` resources. Can be NULL to query
-        the number of groups.
+        Output array of :py:obj:`~.CUdevResource` resources. Can be NULL to
+        query the number of groups.
     nbGroups : unsigned int
         This is a pointer, specifying the number of groups that would be or
         should be created as described below.
@@ -56858,7 +56911,7 @@ def cuDevSmResourceSplitByCount(unsigned int nbGroups, input_ : Optional[CUdevRe
 
 @cython.embedsignature(True)
 def cuDevSmResourceSplit(unsigned int nbGroups, input_ : Optional[CUdevResource], unsigned int flags, groupParams : Optional[tuple[CU_DEV_SM_RESOURCE_GROUP_PARAMS] | list[CU_DEV_SM_RESOURCE_GROUP_PARAMS]]):
-    """ Splits a ``CU_DEV_RESOURCE_TYPE_SM`` resource into structured groups.
+    """ Splits a :py:obj:`~.CU_DEV_RESOURCE_TYPE_SM` resource into structured groups.
 
     This API will split a resource of :py:obj:`~.CU_DEV_RESOURCE_TYPE_SM`
     into ``nbGroups`` structured device resource groups (the ``result``
@@ -56892,7 +56945,7 @@ def cuDevSmResourceSplit(unsigned int nbGroups, input_ : Optional[CUdevResource]
 
     For a valid call:
 
-    - ``result`` should point to a ``CUdevResource`` array of size
+    - ``result`` should point to a :py:obj:`~.CUdevResource` array of size
       ``nbGroups``, or alternatively, may be NULL, if the developer wishes
       for only the groupParams entries to be updated
 
@@ -56915,7 +56968,8 @@ def cuDevSmResourceSplit(unsigned int nbGroups, input_ : Optional[CUdevResource]
         results are expected by having the ``result`` parameter set),
         ``smCount`` cannot end up with 0 or a value less than
         ``coscheduledSmCount`` otherwise
-        CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION will be returned.
+        :py:obj:`~.CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION` will be
+        returned.
 
       - ``coscheduledSmCount:`` allows grouping SMs together in order to be
         able to launch clusters on Compute Architecture 9.0+. The default
@@ -56931,11 +56985,11 @@ def cuDevSmResourceSplit(unsigned int nbGroups, input_ : Optional[CUdevResource]
 
       - ``flags:``
 
-        - ``CU_DEV_SM_RESOURCE_GROUP_BACKFILL:`` lets ``smCount`` be a non-
-          multiple of ``coscheduledSmCount``, filling the difference
-          between SM count and already assigned co-scheduled groupings with
-          other SMs. This lets any resulting group behave similar to the
-          ``remainder`` group for example.
+        - :py:obj:`~.CU_DEV_SM_RESOURCE_GROUP_BACKFILL`: lets ``smCount``
+          be a non-multiple of ``coscheduledSmCount``, filling the
+          difference between SM count and already assigned co-scheduled
+          groupings with other SMs. This lets any resulting group behave
+          similar to the ``remainder`` group for example.
 
     Example params and their effect:
 
@@ -56959,7 +57013,7 @@ def cuDevSmResourceSplit(unsigned int nbGroups, input_ : Optional[CUdevResource]
       always need to adhere to a structure of coscheduledSmCount (even if
       its just 2), and therefore must always have enough coscheduled SMs to
       cover that requirement (even with the
-      ``CU_DEV_SM_RESOURCE_GROUP_BACKFILL`` flag enabled).
+      :py:obj:`~.CU_DEV_SM_RESOURCE_GROUP_BACKFILL` flag enabled).
 
     Splitting an input into N groups, can be accomplished by repeatedly
     splitting off 1 group and re-splitting the remainder (a bisect
@@ -56972,7 +57026,7 @@ def cuDevSmResourceSplit(unsigned int nbGroups, input_ : Optional[CUdevResource]
         Specifies the number of groups in ``result`` and ``groupParams``
     input : :py:obj:`~.CUdevResource`
         Input SM resource to be split. Must be a valid
-        ``CU_DEV_RESOURCE_TYPE_SM`` resource.
+        :py:obj:`~.CU_DEV_RESOURCE_TYPE_SM` resource.
     flags : unsigned int
         Flags specifying how the API should behave. The value should be 0
         for now.
@@ -56982,11 +57036,11 @@ def cuDevSmResourceSplit(unsigned int nbGroups, input_ : Optional[CUdevResource]
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_DEVICE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_RESOURCE_TYPE`, :py:obj:`~.CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION`
     result : list[:py:obj:`~.CUdevResource`]
-        Output array of ``CUdevResource`` resources. Can be NULL, alongside
-        an smCount of 0, for discovery purpose.
+        Output array of :py:obj:`~.CUdevResource` resources. Can be NULL,
+        alongside an smCount of 0, for discovery purpose.
     remainder : :py:obj:`~.CUdevResource`
         If splitting the input resource leaves any SMs, the remainder is
         placed in here.
@@ -57047,12 +57101,12 @@ def cuDevResourceGenerateDesc(resources : Optional[tuple[CUdevResource] | list[C
       valid array of ``resources`` pointers, with the array size passed in
       ``nbResources``. If multiple resources are provided in ``resources``,
       the device they came from must be the same, otherwise
-      CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION is returned. If multiple
-      resources are provided in ``resources`` and they are of type
+      :py:obj:`~.CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION` is returned. If
+      multiple resources are provided in ``resources`` and they are of type
       :py:obj:`~.CU_DEV_RESOURCE_TYPE_SM`, they must be outputs (whether
       ``result`` or ``remaining``) from the same split API instance and
       have the same smCoscheduledAlignment values, otherwise
-      CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION is returned.
+      :py:obj:`~.CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION` is returned.
 
     Note: The API is not supported on 32-bit platforms.
 
@@ -57065,7 +57119,7 @@ def cuDevResourceGenerateDesc(resources : Optional[tuple[CUdevResource] | list[C
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_RESOURCE_TYPE`, :py:obj:`~.CUDA_ERROR_INVALID_RESOURCE_CONFIGURATION`
     phDesc : :py:obj:`~.CUdevResourceDesc`
         Output descriptor
@@ -57120,7 +57174,7 @@ def cuGreenCtxRecordEvent(hCtx, hEvent):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED`
 
     See Also
@@ -57173,7 +57227,7 @@ def cuGreenCtxWaitEvent(hCtx, hEvent):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED`
 
     See Also
@@ -57221,17 +57275,18 @@ def cuStreamGetGreenCtx(hStream):
     - a stream created via any of the CUDA driver APIs such as
       :py:obj:`~.cuStreamCreate`, :py:obj:`~.cuStreamCreateWithPriority`
       and :py:obj:`~.cuGreenCtxStreamCreate`, or their runtime API
-      equivalents such as :py:obj:`~.cudaStreamCreate`,
-      :py:obj:`~.cudaStreamCreateWithFlags` and
-      :py:obj:`~.cudaStreamCreateWithPriority`. If during stream creation
-      the context that was active in the calling thread was obtained with
-      cuCtxFromGreenCtx, that green context is returned in ``phCtx``.
-      Otherwise, ``*phCtx`` is set to NULL instead.
+      equivalents such as :func:`~.cudaStreamCreate`,
+      :func:`~.cudaStreamCreateWithFlags` and
+      :func:`~.cudaStreamCreateWithPriority`. If during stream creation the
+      context that was active in the calling thread was obtained with
+      :func:`~.cuCtxFromGreenCtx`, that green context is returned in
+      ``phCtx``. Otherwise, ``*phCtx`` is set to NULL instead.
 
     - special stream such as the NULL stream or
       :py:obj:`~.CU_STREAM_LEGACY`. In that case if context that is active
-      in the calling thread was obtained with cuCtxFromGreenCtx, that green
-      context is returned. Otherwise, ``*phCtx`` is set to NULL instead.
+      in the calling thread was obtained with :func:`~.cuCtxFromGreenCtx`,
+      that green context is returned. Otherwise, ``*phCtx`` is set to NULL
+      instead.
 
     Passing an invalid handle will result in undefined behavior.
 
@@ -57242,14 +57297,14 @@ def cuStreamGetGreenCtx(hStream):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`,
     phCtx : :py:obj:`~.CUgreenCtx`
         Returned green context associated with the stream
 
     See Also
     --------
-    :py:obj:`~.cuStreamDestroy`, :py:obj:`~.cuStreamCreate`, :py:obj:`~.cuStreamCreateWithPriority`, :py:obj:`~.cuStreamGetCtx`, :py:obj:`~.cuGreenCtxStreamCreate`, :py:obj:`~.cuStreamGetPriority`, :py:obj:`~.cuStreamGetFlags`, :py:obj:`~.cuStreamGetDevice`, :py:obj:`~.cuStreamWaitEvent`, :py:obj:`~.cuStreamQuery`, :py:obj:`~.cuStreamSynchronize`, :py:obj:`~.cuStreamAddCallback`, :py:obj:`~.cudaStreamCreate`, :py:obj:`~.cudaStreamCreateWithFlags`
+    :py:obj:`~.cuStreamDestroy`, :py:obj:`~.cuStreamCreate`, :py:obj:`~.cuStreamCreateWithPriority`, :py:obj:`~.cuStreamGetCtx`, :py:obj:`~.cuGreenCtxStreamCreate`, :py:obj:`~.cuStreamGetPriority`, :py:obj:`~.cuStreamGetFlags`, :py:obj:`~.cuStreamGetDevice`, :py:obj:`~.cuStreamWaitEvent`, :py:obj:`~.cuStreamQuery`, :py:obj:`~.cuStreamSynchronize`, :py:obj:`~.cuStreamAddCallback`, :func:`~.cudaStreamCreate`, :func:`~.cudaStreamCreateWithFlags`
     """
     cdef cydriver.CUstream cyhStream
     if hStream is None:
@@ -57304,8 +57359,8 @@ def cuGreenCtxStreamCreate(greenCtx, unsigned int flags, int priority):
     greenCtx : :py:obj:`~.CUgreenCtx`
         Green context for which to create the stream for
     flags : unsigned int
-        Flags for stream creation. ``CU_STREAM_NON_BLOCKING`` must be
-        specified.
+        Flags for stream creation. :py:obj:`~.CU_STREAM_NON_BLOCKING` must
+        be specified.
     priority : int
         Stream priority. Lower numbers represent higher priorities. See
         :py:obj:`~.cuCtxGetStreamPriorityRange` for more information about
@@ -57313,14 +57368,14 @@ def cuGreenCtxStreamCreate(greenCtx, unsigned int flags, int priority):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`
     phStream : :py:obj:`~.CUstream`
         Returned newly created stream
 
     See Also
     --------
-    :py:obj:`~.cuStreamDestroy`, :py:obj:`~.cuGreenCtxCreate` :py:obj:`~.cuStreamCreate`, :py:obj:`~.cuStreamGetPriority`, :py:obj:`~.cuCtxGetStreamPriorityRange`, :py:obj:`~.cuStreamGetFlags`, :py:obj:`~.cuStreamGetDevice`, :py:obj:`~.cuStreamWaitEvent`, :py:obj:`~.cuStreamQuery`, :py:obj:`~.cuStreamSynchronize`, :py:obj:`~.cuStreamAddCallback`, :py:obj:`~.cudaStreamCreateWithPriority`
+    :py:obj:`~.cuStreamDestroy`, :py:obj:`~.cuGreenCtxCreate`, :py:obj:`~.cuStreamCreate`, :py:obj:`~.cuStreamGetPriority`, :py:obj:`~.cuCtxGetStreamPriorityRange`, :py:obj:`~.cuStreamGetFlags`, :py:obj:`~.cuStreamGetDevice`, :py:obj:`~.cuStreamWaitEvent`, :py:obj:`~.cuStreamQuery`, :py:obj:`~.cuStreamSynchronize`, :py:obj:`~.cuStreamAddCallback`, :func:`~.cudaStreamCreateWithPriority`
 
     Notes
     -----
@@ -57361,7 +57416,7 @@ def cuGreenCtxGetId(greenCtx):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_CONTEXT_IS_DESTROYED`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     greenCtxId : unsigned long long
         Pointer to store the Id of the green context
@@ -57396,8 +57451,8 @@ def cuStreamGetDevResource(hStream, typename not None : CUdevResourceType):
     them in ``resource``.
 
     Note: The API will return :py:obj:`~.CUDA_ERROR_INVALID_RESOURCE_TYPE`
-    is ``typename`` is ``CU_DEV_RESOURCE_TYPE_WORKQUEUE_CONFIG`` or
-    ``CU_DEV_RESOURCE_TYPE_WORKQUEUE``.
+    if ``typename`` is :py:obj:`~.CU_DEV_RESOURCE_TYPE_WORKQUEUE_CONFIG` or
+    :py:obj:`~.CU_DEV_RESOURCE_TYPE_WORKQUEUE`.
 
     Parameters
     ----------
@@ -57408,14 +57463,14 @@ def cuStreamGetDevResource(hStream, typename not None : CUdevResourceType):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_RESOURCE_TYPE`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`
     resource : :py:obj:`~.CUdevResource`
         Output pointer to a :py:obj:`~.CUdevResource` structure
 
     See Also
     --------
-    :py:obj:`~.cuGreenCtxCreate`, :py:obj:`~.cuGreenCtxStreamCreate`, :py:obj:`~.cuStreamCreate`, :py:obj:`~.cuDevSmResourceSplitByCount`, :py:obj:`~.cuDevResourceGenerateDesc`, :py:obj:`~.cudaStreamGetDevResource`
+    :py:obj:`~.cuGreenCtxCreate`, :py:obj:`~.cuGreenCtxStreamCreate`, :py:obj:`~.cuStreamCreate`, :py:obj:`~.cuDevSmResourceSplitByCount`, :py:obj:`~.cuDevResourceGenerateDesc`, :func:`~.cudaStreamGetDevResource`
     """
     cdef cydriver.CUstream cyhStream
     if hStream is None:
@@ -57462,7 +57517,7 @@ def cuLogsRegisterCallback(callbackFunc, userData):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS` :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     callback_out : :py:obj:`~.CUlogsCallbackHandle`
         Optional location to store the callback handle after it is
@@ -57512,7 +57567,7 @@ def cuLogsUnregisterCallback(callback):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS` :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     """
     cdef cydriver.CUlogsCallbackHandle cycallback
@@ -57544,7 +57599,7 @@ def cuLogsCurrent(unsigned int flags):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS` :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     iterator_out : :py:obj:`~.CUlogIterator`
         Location to store an iterator to the current tail of the logs
@@ -57579,7 +57634,7 @@ def cuLogsDumpToFile(iterator : Optional[CUlogIterator], char* pathToFile, unsig
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS` :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     iterator : :py:obj:`~.CUlogIterator`
         Optional auto-advancing iterator specifying the starting log to
@@ -57633,7 +57688,7 @@ def cuLogsDumpToMemory(iterator : Optional[CUlogIterator], char* buffer, size_t
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     iterator : :py:obj:`~.CUlogIterator`
         Optional auto-advancing iterator specifying the starting log to
@@ -57675,7 +57730,7 @@ def cuCheckpointProcessGetRestoreThreadId(int pid):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS` :py:obj:`~.CUDA_ERROR_INVALID_VALUE` :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED` :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
     tid : int
         Returned restore thread ID
@@ -57704,7 +57759,7 @@ def cuCheckpointProcessGetState(int pid):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS` :py:obj:`~.CUDA_ERROR_INVALID_VALUE` :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED` :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
     state : :py:obj:`~.CUprocessState`
         Returned CUDA process state
@@ -57740,7 +57795,7 @@ def cuCheckpointProcessLock(int pid, args : Optional[CUcheckpointLockArgs]):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS` :py:obj:`~.CUDA_ERROR_INVALID_VALUE` :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED` :py:obj:`~.CUDA_ERROR_ILLEGAL_STATE` :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED` :py:obj:`~.CUDA_ERROR_NOT_READY`
     """
     cdef cydriver.CUcheckpointLockArgs* cyargs_ptr = <cydriver.CUcheckpointLockArgs*>args._pvt_ptr if args is not None else NULL
@@ -57771,7 +57826,7 @@ def cuCheckpointProcessCheckpoint(int pid, args : Optional[CUcheckpointCheckpoin
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS` :py:obj:`~.CUDA_ERROR_INVALID_VALUE` :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED` :py:obj:`~.CUDA_ERROR_ILLEGAL_STATE` :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
     """
     cdef cydriver.CUcheckpointCheckpointArgs* cyargs_ptr = <cydriver.CUcheckpointCheckpointArgs*>args._pvt_ptr if args is not None else NULL
@@ -57808,7 +57863,7 @@ def cuCheckpointProcessRestore(int pid, args : Optional[CUcheckpointRestoreArgs]
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS` :py:obj:`~.CUDA_ERROR_INVALID_VALUE` :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED` :py:obj:`~.CUDA_ERROR_ILLEGAL_STATE` :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
 
     See Also
@@ -57841,7 +57896,7 @@ def cuCheckpointProcessUnlock(int pid, args : Optional[CUcheckpointUnlockArgs]):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS` :py:obj:`~.CUDA_ERROR_INVALID_VALUE` :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED` :py:obj:`~.CUDA_ERROR_ILLEGAL_STATE` :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
     """
     cdef cydriver.CUcheckpointUnlockArgs* cyargs_ptr = <cydriver.CUcheckpointUnlockArgs*>args._pvt_ptr if args is not None else NULL
@@ -57860,18 +57915,18 @@ def cuProfilerStart():
     context. If profiling is already enabled, then
     :py:obj:`~.cuProfilerStart()` has no effect.
 
-    cuProfilerStart and cuProfilerStop APIs are used to programmatically
-    control the profiling granularity by allowing profiling to be done only
-    on selective pieces of code.
+    :func:`~.cuProfilerStart` and :func:`~.cuProfilerStop` APIs are used to
+    programmatically control the profiling granularity by allowing
+    profiling to be done only on selective pieces of code.
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`
 
     See Also
     --------
-    :py:obj:`~.cuProfilerInitialize`, :py:obj:`~.cuProfilerStop`, :py:obj:`~.cudaProfilerStart`
+    :py:obj:`~.cuProfilerInitialize`, :py:obj:`~.cuProfilerStop`, :func:`~.cudaProfilerStart`
     """
     with nogil:
         err = cydriver.cuProfilerStart()
@@ -57888,18 +57943,18 @@ def cuProfilerStop():
     current context. If profiling is already disabled, then
     :py:obj:`~.cuProfilerStop()` has no effect.
 
-    cuProfilerStart and cuProfilerStop APIs are used to programmatically
-    control the profiling granularity by allowing profiling to be done only
-    on selective pieces of code.
+    :func:`~.cuProfilerStart` and :func:`~.cuProfilerStop` APIs are used to
+    programmatically control the profiling granularity by allowing
+    profiling to be done only on selective pieces of code.
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`
 
     See Also
     --------
-    :py:obj:`~.cuProfilerInitialize`, :py:obj:`~.cuProfilerStart`, :py:obj:`~.cudaProfilerStop`
+    :py:obj:`~.cuProfilerInitialize`, :py:obj:`~.cuProfilerStart`, :func:`~.cudaProfilerStop`
     """
     with nogil:
         err = cydriver.cuProfilerStop()
@@ -57912,11 +57967,11 @@ def cuProfilerStop():
 def cuGraphicsEGLRegisterImage(image, unsigned int flags):
     """ Registers an EGL image.
 
-    Registers the EGLImageKHR specified by ``image`` for access by CUDA. A
-    handle to the registered object is returned as ``pCudaResource``.
-    Additional Mapping/Unmapping is not required for the registered
-    resource and :py:obj:`~.cuGraphicsResourceGetMappedEglFrame` can be
-    directly called on the ``pCudaResource``.
+    Registers the :py:obj:`~.EGLImageKHR` specified by ``image`` for access
+    by CUDA. A handle to the registered object is returned as
+    ``pCudaResource``. Additional Mapping/Unmapping is not required for the
+    registered resource and :py:obj:`~.cuGraphicsResourceGetMappedEglFrame`
+    can be directly called on the ``pCudaResource``.
 
     The application will be responsible for synchronizing access to shared
     objects. The application must ensure that any pending operation which
@@ -57926,8 +57981,8 @@ def cuGraphicsEGLRegisterImage(image, unsigned int flags):
     will be also responsible for ensuring that any pending operation on the
     registered CUDA resource has completed prior to executing subsequent
     commands in other APIs accesing the same memory objects. This can be
-    accomplished by calling cuCtxSynchronize or cuEventSynchronize
-    (preferably).
+    accomplished by calling :func:`~.cuCtxSynchronize` or
+    :func:`~.cuEventSynchronize` (preferably).
 
     The surface's intended usage is specified using ``flags``, as follows:
 
@@ -57944,27 +57999,28 @@ def cuGraphicsEGLRegisterImage(image, unsigned int flags):
       entire contents of the resource, so none of the data previously
       stored in the resource will be preserved.
 
-    The EGLImageKHR is an object which can be used to create EGLImage
-    target resource. It is defined as a void pointer. typedef void*
-    EGLImageKHR
+    The :py:obj:`~.EGLImageKHR` is an object which can be used to create
+    EGLImage target resource. It is defined as a void pointer. typedef
+    void* :py:obj:`~.EGLImageKHR`
 
     Parameters
     ----------
     image : :py:obj:`~.EGLImageKHR`
-        An EGLImageKHR image which can be used to create target resource.
+        An :py:obj:`~.EGLImageKHR` image which can be used to create target
+        resource.
     flags : unsigned int
         Map flags
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_ALREADY_MAPPED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`,
     pCudaResource : :py:obj:`~.CUgraphicsResource`
         Pointer to the returned object handle
 
     See Also
     --------
-    :py:obj:`~.cuGraphicsEGLRegisterImage`, :py:obj:`~.cuGraphicsUnregisterResource`, :py:obj:`~.cuGraphicsResourceSetMapFlags`, :py:obj:`~.cuGraphicsMapResources`, :py:obj:`~.cuGraphicsUnmapResources`, :py:obj:`~.cudaGraphicsEGLRegisterImage`
+    :py:obj:`~.cuGraphicsEGLRegisterImage`, :py:obj:`~.cuGraphicsUnregisterResource`, :py:obj:`~.cuGraphicsResourceSetMapFlags`, :py:obj:`~.cuGraphicsMapResources`, :py:obj:`~.cuGraphicsUnmapResources`, :func:`~.cudaGraphicsEGLRegisterImage`
     """
     cdef cydriver.EGLImageKHR cyimage
     if image is None:
@@ -57988,26 +58044,27 @@ def cuGraphicsEGLRegisterImage(image, unsigned int flags):
 def cuEGLStreamConsumerConnect(stream):
     """ Connect CUDA to EGLStream as a consumer.
 
-    Connect CUDA as a consumer to EGLStreamKHR specified by ``stream``.
+    Connect CUDA as a consumer to :py:obj:`~.EGLStreamKHR` specified by
+    ``stream``.
 
-    The EGLStreamKHR is an EGL object that transfers a sequence of image
-    frames from one API to another.
+    The :py:obj:`~.EGLStreamKHR` is an EGL object that transfers a sequence
+    of image frames from one API to another.
 
     Parameters
     ----------
     stream : :py:obj:`~.EGLStreamKHR`
-        EGLStreamKHR handle
+        :py:obj:`~.EGLStreamKHR` handle
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`,
     conn : :py:obj:`~.CUeglStreamConnection`
         Pointer to the returned connection handle
 
     See Also
     --------
-    :py:obj:`~.cuEGLStreamConsumerConnect`, :py:obj:`~.cuEGLStreamConsumerDisconnect`, :py:obj:`~.cuEGLStreamConsumerAcquireFrame`, :py:obj:`~.cuEGLStreamConsumerReleaseFrame`, :py:obj:`~.cudaEGLStreamConsumerConnect`
+    :py:obj:`~.cuEGLStreamConsumerConnect`, :py:obj:`~.cuEGLStreamConsumerDisconnect`, :py:obj:`~.cuEGLStreamConsumerAcquireFrame`, :py:obj:`~.cuEGLStreamConsumerReleaseFrame`, :func:`~.cudaEGLStreamConsumerConnect`
     """
     cdef cydriver.EGLStreamKHR cystream
     if stream is None:
@@ -58031,8 +58088,9 @@ def cuEGLStreamConsumerConnect(stream):
 def cuEGLStreamConsumerConnectWithFlags(stream, unsigned int flags):
     """ Connect CUDA to EGLStream as a consumer with given flags.
 
-    Connect CUDA as a consumer to EGLStreamKHR specified by ``stream`` with
-    specified ``flags`` defined by CUeglResourceLocationFlags.
+    Connect CUDA as a consumer to :py:obj:`~.EGLStreamKHR` specified by
+    ``stream`` with specified ``flags`` defined by
+    :py:obj:`~.CUeglResourceLocationFlags`.
 
     The flags specify whether the consumer wants to access frames from
     system memory or video memory. Default is
@@ -58041,20 +58099,20 @@ def cuEGLStreamConsumerConnectWithFlags(stream, unsigned int flags):
     Parameters
     ----------
     stream : :py:obj:`~.EGLStreamKHR`
-        EGLStreamKHR handle
+        :py:obj:`~.EGLStreamKHR` handle
     flags : unsigned int
         Flags denote intended location - system or video.
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`,
     conn : :py:obj:`~.CUeglStreamConnection`
         Pointer to the returned connection handle
 
     See Also
     --------
-    :py:obj:`~.cuEGLStreamConsumerConnect`, :py:obj:`~.cuEGLStreamConsumerDisconnect`, :py:obj:`~.cuEGLStreamConsumerAcquireFrame`, :py:obj:`~.cuEGLStreamConsumerReleaseFrame`, :py:obj:`~.cudaEGLStreamConsumerConnectWithFlags`
+    :py:obj:`~.cuEGLStreamConsumerConnect`, :py:obj:`~.cuEGLStreamConsumerDisconnect`, :py:obj:`~.cuEGLStreamConsumerAcquireFrame`, :py:obj:`~.cuEGLStreamConsumerReleaseFrame`, :func:`~.cudaEGLStreamConsumerConnectWithFlags`
     """
     cdef cydriver.EGLStreamKHR cystream
     if stream is None:
@@ -58078,7 +58136,7 @@ def cuEGLStreamConsumerConnectWithFlags(stream, unsigned int flags):
 def cuEGLStreamConsumerDisconnect(conn):
     """ Disconnect CUDA as a consumer to EGLStream .
 
-    Disconnect CUDA as a consumer to EGLStreamKHR.
+    Disconnect CUDA as a consumer to :py:obj:`~.EGLStreamKHR`.
 
     Parameters
     ----------
@@ -58087,12 +58145,12 @@ def cuEGLStreamConsumerDisconnect(conn):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`,
 
     See Also
     --------
-    :py:obj:`~.cuEGLStreamConsumerConnect`, :py:obj:`~.cuEGLStreamConsumerDisconnect`, :py:obj:`~.cuEGLStreamConsumerAcquireFrame`, :py:obj:`~.cuEGLStreamConsumerReleaseFrame`, :py:obj:`~.cudaEGLStreamConsumerDisconnect`
+    :py:obj:`~.cuEGLStreamConsumerConnect`, :py:obj:`~.cuEGLStreamConsumerDisconnect`, :py:obj:`~.cuEGLStreamConsumerAcquireFrame`, :py:obj:`~.cuEGLStreamConsumerReleaseFrame`, :func:`~.cudaEGLStreamConsumerDisconnect`
     """
     cdef cydriver.CUeglStreamConnection *cyconn
     if conn is None:
@@ -58115,12 +58173,12 @@ def cuEGLStreamConsumerDisconnect(conn):
 def cuEGLStreamConsumerAcquireFrame(conn, pCudaResource, pStream, unsigned int timeout):
     """ Acquire an image frame from the EGLStream with CUDA as a consumer.
 
-    Acquire an image frame from EGLStreamKHR. This API can also acquire an
-    old frame presented by the producer unless explicitly disabled by
-    setting EGL_SUPPORT_REUSE_NV flag to EGL_FALSE during stream
-    initialization. By default, EGLStream is created with this flag set to
-    EGL_TRUE. :py:obj:`~.cuGraphicsResourceGetMappedEglFrame` can be called
-    on ``pCudaResource`` to get :py:obj:`~.CUeglFrame`.
+    Acquire an image frame from :py:obj:`~.EGLStreamKHR`. This API can also
+    acquire an old frame presented by the producer unless explicitly
+    disabled by setting EGL_SUPPORT_REUSE_NV flag to EGL_FALSE during
+    stream initialization. By default, EGLStream is created with this flag
+    set to EGL_TRUE. :py:obj:`~.cuGraphicsResourceGetMappedEglFrame` can be
+    called on ``pCudaResource`` to get :py:obj:`~.CUeglFrame`.
 
     Parameters
     ----------
@@ -58139,12 +58197,12 @@ def cuEGLStreamConsumerAcquireFrame(conn, pCudaResource, pStream, unsigned int t
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_LAUNCH_TIMEOUT`,
 
     See Also
     --------
-    :py:obj:`~.cuEGLStreamConsumerConnect`, :py:obj:`~.cuEGLStreamConsumerDisconnect`, :py:obj:`~.cuEGLStreamConsumerAcquireFrame`, :py:obj:`~.cuEGLStreamConsumerReleaseFrame`, :py:obj:`~.cudaEGLStreamConsumerAcquireFrame`
+    :py:obj:`~.cuEGLStreamConsumerConnect`, :py:obj:`~.cuEGLStreamConsumerDisconnect`, :py:obj:`~.cuEGLStreamConsumerAcquireFrame`, :py:obj:`~.cuEGLStreamConsumerReleaseFrame`, :func:`~.cudaEGLStreamConsumerAcquireFrame`
     """
     cdef cydriver.CUstream *cypStream
     if pStream is None:
@@ -58188,10 +58246,10 @@ def cuEGLStreamConsumerReleaseFrame(conn, pCudaResource, pStream):
     """ Releases the last frame acquired from the EGLStream.
 
     Release the acquired image frame specified by ``pCudaResource`` to
-    EGLStreamKHR. If EGL_SUPPORT_REUSE_NV flag is set to EGL_TRUE, at the
-    time of EGL creation this API doesn't release the last frame acquired
-    on the EGLStream. By default, EGLStream is created with this flag set
-    to EGL_TRUE.
+    :py:obj:`~.EGLStreamKHR`. If EGL_SUPPORT_REUSE_NV flag is set to
+    EGL_TRUE, at the time of EGL creation this API doesn't release the last
+    frame acquired on the EGLStream. By default, EGLStream is created with
+    this flag set to EGL_TRUE.
 
     Parameters
     ----------
@@ -58204,12 +58262,12 @@ def cuEGLStreamConsumerReleaseFrame(conn, pCudaResource, pStream):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`,
 
     See Also
     --------
-    :py:obj:`~.cuEGLStreamConsumerConnect`, :py:obj:`~.cuEGLStreamConsumerDisconnect`, :py:obj:`~.cuEGLStreamConsumerAcquireFrame`, :py:obj:`~.cuEGLStreamConsumerReleaseFrame`, :py:obj:`~.cudaEGLStreamConsumerReleaseFrame`
+    :py:obj:`~.cuEGLStreamConsumerConnect`, :py:obj:`~.cuEGLStreamConsumerDisconnect`, :py:obj:`~.cuEGLStreamConsumerAcquireFrame`, :py:obj:`~.cuEGLStreamConsumerReleaseFrame`, :func:`~.cudaEGLStreamConsumerReleaseFrame`
     """
     cdef cydriver.CUstream *cypStream
     if pStream is None:
@@ -58250,15 +58308,16 @@ def cuEGLStreamConsumerReleaseFrame(conn, pCudaResource, pStream):
 def cuEGLStreamProducerConnect(stream, width, height):
     """ Connect CUDA to EGLStream as a producer.
 
-    Connect CUDA as a producer to EGLStreamKHR specified by ``stream``.
+    Connect CUDA as a producer to :py:obj:`~.EGLStreamKHR` specified by
+    ``stream``.
 
-    The EGLStreamKHR is an EGL object that transfers a sequence of image
-    frames from one API to another.
+    The :py:obj:`~.EGLStreamKHR` is an EGL object that transfers a sequence
+    of image frames from one API to another.
 
     Parameters
     ----------
     stream : :py:obj:`~.EGLStreamKHR`
-        EGLStreamKHR handle
+        :py:obj:`~.EGLStreamKHR` handle
     width : :py:obj:`~.EGLint`
         width of the image to be submitted to the stream
     height : :py:obj:`~.EGLint`
@@ -58266,14 +58325,14 @@ def cuEGLStreamProducerConnect(stream, width, height):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`,
     conn : :py:obj:`~.CUeglStreamConnection`
         Pointer to the returned connection handle
 
     See Also
     --------
-    :py:obj:`~.cuEGLStreamProducerConnect`, :py:obj:`~.cuEGLStreamProducerDisconnect`, :py:obj:`~.cuEGLStreamProducerPresentFrame`, :py:obj:`~.cudaEGLStreamProducerConnect`
+    :py:obj:`~.cuEGLStreamProducerConnect`, :py:obj:`~.cuEGLStreamProducerDisconnect`, :py:obj:`~.cuEGLStreamProducerPresentFrame`, :func:`~.cudaEGLStreamProducerConnect`
     """
     cdef cydriver.EGLint cyheight
     if height is None:
@@ -58313,7 +58372,7 @@ def cuEGLStreamProducerConnect(stream, width, height):
 def cuEGLStreamProducerDisconnect(conn):
     """ Disconnect CUDA as a producer to EGLStream .
 
-    Disconnect CUDA as a producer to EGLStreamKHR.
+    Disconnect CUDA as a producer to :py:obj:`~.EGLStreamKHR`.
 
     Parameters
     ----------
@@ -58322,12 +58381,12 @@ def cuEGLStreamProducerDisconnect(conn):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`,
 
     See Also
     --------
-    :py:obj:`~.cuEGLStreamProducerConnect`, :py:obj:`~.cuEGLStreamProducerDisconnect`, :py:obj:`~.cuEGLStreamProducerPresentFrame`, :py:obj:`~.cudaEGLStreamProducerDisconnect`
+    :py:obj:`~.cuEGLStreamProducerConnect`, :py:obj:`~.cuEGLStreamProducerDisconnect`, :py:obj:`~.cuEGLStreamProducerPresentFrame`, :func:`~.cudaEGLStreamProducerDisconnect`
     """
     cdef cydriver.CUeglStreamConnection *cyconn
     if conn is None:
@@ -58382,12 +58441,12 @@ def cuEGLStreamProducerPresentFrame(conn, eglframe not None : CUeglFrame, pStrea
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`,
 
     See Also
     --------
-    :py:obj:`~.cuEGLStreamProducerConnect`, :py:obj:`~.cuEGLStreamProducerDisconnect`, :py:obj:`~.cuEGLStreamProducerReturnFrame`, :py:obj:`~.cudaEGLStreamProducerPresentFrame`
+    :py:obj:`~.cuEGLStreamProducerConnect`, :py:obj:`~.cuEGLStreamProducerDisconnect`, :py:obj:`~.cuEGLStreamProducerReturnFrame`, :func:`~.cudaEGLStreamProducerPresentFrame`
     """
     cdef cydriver.CUstream *cypStream
     if pStream is None:
@@ -58420,9 +58479,9 @@ def cuEGLStreamProducerPresentFrame(conn, eglframe not None : CUeglFrame, pStrea
 def cuEGLStreamProducerReturnFrame(conn, eglframe : Optional[CUeglFrame], pStream):
     """ Return the CUDA eglFrame to the EGLStream released by the consumer.
 
-    This API can potentially return CUDA_ERROR_LAUNCH_TIMEOUT if the
-    consumer has not returned a frame to EGL stream. If timeout is returned
-    the application can retry.
+    This API can potentially return :py:obj:`~.CUDA_ERROR_LAUNCH_TIMEOUT`
+    if the consumer has not returned a frame to EGL stream. If timeout is
+    returned the application can retry.
 
     Parameters
     ----------
@@ -58436,12 +58495,12 @@ def cuEGLStreamProducerReturnFrame(conn, eglframe : Optional[CUeglFrame], pStrea
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_LAUNCH_TIMEOUT`
 
     See Also
     --------
-    :py:obj:`~.cuEGLStreamProducerConnect`, :py:obj:`~.cuEGLStreamProducerDisconnect`, :py:obj:`~.cuEGLStreamProducerPresentFrame`, :py:obj:`~.cudaEGLStreamProducerReturnFrame`
+    :py:obj:`~.cuEGLStreamProducerConnect`, :py:obj:`~.cuEGLStreamProducerDisconnect`, :py:obj:`~.cuEGLStreamProducerPresentFrame`, :func:`~.cudaEGLStreamProducerReturnFrame`
     """
     cdef cydriver.CUstream *cypStream
     if pStream is None:
@@ -58497,7 +58556,7 @@ def cuGraphicsResourceGetMappedEglFrame(resource, unsigned int index, unsigned i
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
 
     eglFrame : :py:obj:`~.CUeglFrame`
         None
@@ -58524,8 +58583,8 @@ def cuGraphicsResourceGetMappedEglFrame(resource, unsigned int index, unsigned i
 def cuEventCreateFromEGLSync(eglSync, unsigned int flags):
     """ Creates an event from EGLSync object.
 
-    Creates an event \\*phEvent from an EGLSyncKHR eglSync with the flags
-    specified via ``flags``. Valid flags include:
+    Creates an event \\*phEvent from an :py:obj:`~.EGLSyncKHR` eglSync with
+    the flags specified via ``flags``. Valid flags include:
 
     - :py:obj:`~.CU_EVENT_DEFAULT`: Default event creation flag.
 
@@ -58540,8 +58599,8 @@ def cuEventCreateFromEGLSync(eglSync, unsigned int flags):
     :py:obj:`~.cuEventRecord` and TimingData are not supported for events
     created from EGLSync.
 
-    The EGLSyncKHR is an opaque handle to an EGL sync object. typedef void*
-    EGLSyncKHR
+    The :py:obj:`~.EGLSyncKHR` is an opaque handle to an EGL sync object.
+    typedef void* :py:obj:`~.EGLSyncKHR`
 
     Parameters
     ----------
@@ -58552,7 +58611,7 @@ def cuEventCreateFromEGLSync(eglSync, unsigned int flags):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`
     phEvent : :py:obj:`~.CUevent`
         Returns newly created event
@@ -58609,14 +58668,14 @@ def cuGraphicsGLRegisterBuffer(buffer, unsigned int Flags):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_ALREADY_MAPPED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_OPERATING_SYSTEM`
     pCudaResource : :py:obj:`~.CUgraphicsResource`
         Pointer to the returned object handle
 
     See Also
     --------
-    :py:obj:`~.cuGraphicsUnregisterResource`, :py:obj:`~.cuGraphicsMapResources`, :py:obj:`~.cuGraphicsResourceGetMappedPointer`, :py:obj:`~.cudaGraphicsGLRegisterBuffer`
+    :py:obj:`~.cuGraphicsUnregisterResource`, :py:obj:`~.cuGraphicsMapResources`, :py:obj:`~.cuGraphicsResourceGetMappedPointer`, :func:`~.cudaGraphicsGLRegisterBuffer`
     """
     cdef cydriver.GLuint cybuffer
     if buffer is None:
@@ -58645,9 +58704,8 @@ def cuGraphicsGLRegisterImage(image, target, unsigned int Flags):
     ``pCudaResource``.
 
     ``target`` must match the type of the object, and must be one of
-    :py:obj:`~.GL_TEXTURE_2D`, :py:obj:`~.GL_TEXTURE_RECTANGLE`,
-    :py:obj:`~.GL_TEXTURE_CUBE_MAP`, :py:obj:`~.GL_TEXTURE_3D`,
-    :py:obj:`~.GL_TEXTURE_2D_ARRAY`, or :py:obj:`~.GL_RENDERBUFFER`.
+    ``GL_TEXTURE_2D``, ``GL_TEXTURE_RECTANGLE``, ``GL_TEXTURE_CUBE_MAP``,
+    ``GL_TEXTURE_3D``, ``GL_TEXTURE_2D_ARRAY``, or ``GL_RENDERBUFFER``.
 
     The register flags ``Flags`` specify the intended usage, as follows:
 
@@ -58701,14 +58759,14 @@ def cuGraphicsGLRegisterImage(image, target, unsigned int Flags):
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_ALREADY_MAPPED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_OPERATING_SYSTEM`
     pCudaResource : :py:obj:`~.CUgraphicsResource`
         Pointer to the returned object handle
 
     See Also
     --------
-    :py:obj:`~.cuGraphicsUnregisterResource`, :py:obj:`~.cuGraphicsMapResources`, :py:obj:`~.cuGraphicsSubResourceGetMappedArray`, :py:obj:`~.cudaGraphicsGLRegisterImage`
+    :py:obj:`~.cuGraphicsUnregisterResource`, :py:obj:`~.cuGraphicsMapResources`, :py:obj:`~.cuGraphicsSubResourceGetMappedArray`, :func:`~.cudaGraphicsGLRegisterImage`
     """
     cdef cydriver.GLenum cytarget
     if target is None:
@@ -58815,20 +58873,20 @@ def cuVDPAUGetDevice(vdpDevice, vdpGetProcAddress):
     Parameters
     ----------
     vdpDevice : :py:obj:`~.VdpDevice`
-        A VdpDevice handle
+        A :py:obj:`~.VdpDevice` handle
     vdpGetProcAddress : :py:obj:`~.VdpGetProcAddress`
-        VDPAU's VdpGetProcAddress function pointer
+        VDPAU's :py:obj:`~.VdpGetProcAddress` function pointer
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`
     pDevice : :py:obj:`~.CUdevice`
         Device associated with vdpDevice
 
     See Also
     --------
-    :py:obj:`~.cuCtxCreate`, :py:obj:`~.cuVDPAUCtxCreate`, :py:obj:`~.cuGraphicsVDPAURegisterVideoSurface`, :py:obj:`~.cuGraphicsVDPAURegisterOutputSurface`, :py:obj:`~.cuGraphicsUnregisterResource`, :py:obj:`~.cuGraphicsResourceSetMapFlags`, :py:obj:`~.cuGraphicsMapResources`, :py:obj:`~.cuGraphicsUnmapResources`, :py:obj:`~.cuGraphicsSubResourceGetMappedArray`, :py:obj:`~.cudaVDPAUGetDevice`
+    :py:obj:`~.cuCtxCreate`, :py:obj:`~.cuVDPAUCtxCreate`, :py:obj:`~.cuGraphicsVDPAURegisterVideoSurface`, :py:obj:`~.cuGraphicsVDPAURegisterOutputSurface`, :py:obj:`~.cuGraphicsUnregisterResource`, :py:obj:`~.cuGraphicsResourceSetMapFlags`, :py:obj:`~.cuGraphicsMapResources`, :py:obj:`~.cuGraphicsUnmapResources`, :py:obj:`~.cuGraphicsSubResourceGetMappedArray`, :func:`~.cudaVDPAUGetDevice`
     """
     cdef cydriver.VdpGetProcAddress *cyvdpGetProcAddress
     if vdpGetProcAddress is None:
@@ -58875,13 +58933,13 @@ def cuVDPAUCtxCreate(unsigned int flags, device, vdpDevice, vdpGetProcAddress):
     device : :py:obj:`~.CUdevice`
         Device on which to create the context
     vdpDevice : :py:obj:`~.VdpDevice`
-        The VdpDevice to interop with
+        The :py:obj:`~.VdpDevice` to interop with
     vdpGetProcAddress : :py:obj:`~.VdpGetProcAddress`
-        VDPAU's VdpGetProcAddress function pointer
+        VDPAU's :py:obj:`~.VdpGetProcAddress` function pointer
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_DEINITIALIZED`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`
     pCtx : :py:obj:`~.CUcontext`
         Returned CUDA context
@@ -58928,10 +58986,10 @@ def cuVDPAUCtxCreate(unsigned int flags, device, vdpDevice, vdpGetProcAddress):
 
 @cython.embedsignature(True)
 def cuGraphicsVDPAURegisterVideoSurface(vdpSurface, unsigned int flags):
-    """ Registers a VDPAU VdpVideoSurface object.
+    """ Registers a VDPAU :py:obj:`~.VdpVideoSurface` object.
 
-    Registers the VdpVideoSurface specified by ``vdpSurface`` for access by
-    CUDA. A handle to the registered object is returned as
+    Registers the :py:obj:`~.VdpVideoSurface` specified by ``vdpSurface``
+    for access by CUDA. A handle to the registered object is returned as
     ``pCudaResource``. The surface's intended usage is specified using
     ``flags``, as follows:
 
@@ -58948,8 +59006,8 @@ def cuGraphicsVDPAURegisterVideoSurface(vdpSurface, unsigned int flags):
       entire contents of the resource, so none of the data previously
       stored in the resource will be preserved.
 
-    The VdpVideoSurface is presented as an array of subresources that may
-    be accessed using pointers returned by
+    The :py:obj:`~.VdpVideoSurface` is presented as an array of
+    subresources that may be accessed using pointers returned by
     :py:obj:`~.cuGraphicsSubResourceGetMappedArray`. The exact number of
     valid ``arrayIndex`` values depends on the VDPAU surface format. The
     mapping is shown in the table below. ``mipLevel`` must be 0.
@@ -58957,20 +59015,20 @@ def cuGraphicsVDPAURegisterVideoSurface(vdpSurface, unsigned int flags):
     Parameters
     ----------
     vdpSurface : :py:obj:`~.VdpVideoSurface`
-        The VdpVideoSurface to be registered
+        The :py:obj:`~.VdpVideoSurface` to be registered
     flags : unsigned int
         Map flags
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_ALREADY_MAPPED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`,
     pCudaResource : :py:obj:`~.CUgraphicsResource`
         Pointer to the returned object handle
 
     See Also
     --------
-    :py:obj:`~.cuCtxCreate`, :py:obj:`~.cuVDPAUCtxCreate`, :py:obj:`~.cuGraphicsVDPAURegisterOutputSurface`, :py:obj:`~.cuGraphicsUnregisterResource`, :py:obj:`~.cuGraphicsResourceSetMapFlags`, :py:obj:`~.cuGraphicsMapResources`, :py:obj:`~.cuGraphicsUnmapResources`, :py:obj:`~.cuGraphicsSubResourceGetMappedArray`, :py:obj:`~.cuVDPAUGetDevice`, :py:obj:`~.cudaGraphicsVDPAURegisterVideoSurface`
+    :py:obj:`~.cuCtxCreate`, :py:obj:`~.cuVDPAUCtxCreate`, :py:obj:`~.cuGraphicsVDPAURegisterOutputSurface`, :py:obj:`~.cuGraphicsUnregisterResource`, :py:obj:`~.cuGraphicsResourceSetMapFlags`, :py:obj:`~.cuGraphicsMapResources`, :py:obj:`~.cuGraphicsUnmapResources`, :py:obj:`~.cuGraphicsSubResourceGetMappedArray`, :py:obj:`~.cuVDPAUGetDevice`, :func:`~.cudaGraphicsVDPAURegisterVideoSurface`
     """
     cdef cydriver.VdpVideoSurface cyvdpSurface
     if vdpSurface is None:
@@ -58992,10 +59050,10 @@ def cuGraphicsVDPAURegisterVideoSurface(vdpSurface, unsigned int flags):
 
 @cython.embedsignature(True)
 def cuGraphicsVDPAURegisterOutputSurface(vdpSurface, unsigned int flags):
-    """ Registers a VDPAU VdpOutputSurface object.
+    """ Registers a VDPAU :py:obj:`~.VdpOutputSurface` object.
 
-    Registers the VdpOutputSurface specified by ``vdpSurface`` for access
-    by CUDA. A handle to the registered object is returned as
+    Registers the :py:obj:`~.VdpOutputSurface` specified by ``vdpSurface``
+    for access by CUDA. A handle to the registered object is returned as
     ``pCudaResource``. The surface's intended usage is specified using
     ``flags``, as follows:
 
@@ -59012,8 +59070,8 @@ def cuGraphicsVDPAURegisterOutputSurface(vdpSurface, unsigned int flags):
       entire contents of the resource, so none of the data previously
       stored in the resource will be preserved.
 
-    The VdpOutputSurface is presented as an array of subresources that may
-    be accessed using pointers returned by
+    The :py:obj:`~.VdpOutputSurface` is presented as an array of
+    subresources that may be accessed using pointers returned by
     :py:obj:`~.cuGraphicsSubResourceGetMappedArray`. The exact number of
     valid ``arrayIndex`` values depends on the VDPAU surface format. The
     mapping is shown in the table below. ``mipLevel`` must be 0.
@@ -59021,20 +59079,20 @@ def cuGraphicsVDPAURegisterOutputSurface(vdpSurface, unsigned int flags):
     Parameters
     ----------
     vdpSurface : :py:obj:`~.VdpOutputSurface`
-        The VdpOutputSurface to be registered
+        The :py:obj:`~.VdpOutputSurface` to be registered
     flags : unsigned int
         Map flags
 
     Returns
     -------
-    CUresult
+    :py:obj:`~.CUresult`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_HANDLE`, :py:obj:`~.CUDA_ERROR_ALREADY_MAPPED`, :py:obj:`~.CUDA_ERROR_INVALID_CONTEXT`,
     pCudaResource : :py:obj:`~.CUgraphicsResource`
         Pointer to the returned object handle
 
     See Also
     --------
-    :py:obj:`~.cuCtxCreate`, :py:obj:`~.cuVDPAUCtxCreate`, :py:obj:`~.cuGraphicsVDPAURegisterVideoSurface`, :py:obj:`~.cuGraphicsUnregisterResource`, :py:obj:`~.cuGraphicsResourceSetMapFlags`, :py:obj:`~.cuGraphicsMapResources`, :py:obj:`~.cuGraphicsUnmapResources`, :py:obj:`~.cuGraphicsSubResourceGetMappedArray`, :py:obj:`~.cuVDPAUGetDevice`, :py:obj:`~.cudaGraphicsVDPAURegisterOutputSurface`
+    :py:obj:`~.cuCtxCreate`, :py:obj:`~.cuVDPAUCtxCreate`, :py:obj:`~.cuGraphicsVDPAURegisterVideoSurface`, :py:obj:`~.cuGraphicsUnregisterResource`, :py:obj:`~.cuGraphicsResourceSetMapFlags`, :py:obj:`~.cuGraphicsMapResources`, :py:obj:`~.cuGraphicsUnmapResources`, :py:obj:`~.cuGraphicsSubResourceGetMappedArray`, :py:obj:`~.cuVDPAUGetDevice`, :func:`~.cudaGraphicsVDPAURegisterOutputSurface`
     """
     cdef cydriver.VdpOutputSurface cyvdpSurface
     if vdpSurface is None:
diff --git a/cuda_bindings/cuda/bindings/nvrtc.pxd b/cuda_bindings/cuda/bindings/nvrtc.pxd
index 760154ccbe..5e46a78342 100644
--- a/cuda_bindings/cuda/bindings/nvrtc.pxd
+++ b/cuda_bindings/cuda/bindings/nvrtc.pxd
@@ -7,9 +7,9 @@ cimport cuda.bindings.cynvrtc as cynvrtc
 include "_lib/utils.pxd"
 
 cdef class nvrtcProgram:
-    """ nvrtcProgram is the unit of compilation, and an opaque handle for a program.
+    """ :py:obj:`~.nvrtcProgram` is the unit of compilation, and an opaque handle for a program.
 
-    To compile a CUDA program string, an instance of nvrtcProgram must be created first with nvrtcCreateProgram, then compiled with nvrtcCompileProgram.
+    To compile a CUDA program string, an instance of :py:obj:`~.nvrtcProgram` must be created first with :func:`~.nvrtcCreateProgram`, then compiled with :func:`~.nvrtcCompileProgram`.
 
     Methods
     -------
diff --git a/cuda_bindings/cuda/bindings/nvrtc.pyx b/cuda_bindings/cuda/bindings/nvrtc.pyx
index 4576dfd3ad..fd9906cd02 100644
--- a/cuda_bindings/cuda/bindings/nvrtc.pyx
+++ b/cuda_bindings/cuda/bindings/nvrtc.pyx
@@ -43,23 +43,25 @@ ctypedef unsigned long long float_ptr
 ctypedef unsigned long long double_ptr
 ctypedef unsigned long long void_ptr
 
-#: Flags for nvrtcInstallBundledHeaders.Skip installation if version marker
-#: exists and version matches. This is the default behavior when flags=0.
+#: Flags for :func:`~.nvrtcInstallBundledHeaders`. Skip installation if
+#: version marker exists and version matches. This is the default behavior
+#: when flags=0.
 NVRTC_INSTALL_HEADERS_SKIP_IF_EXISTS = cynvrtc.NVRTC_INSTALL_HEADERS_SKIP_IF_EXISTS
 
 #: Clear existing directory contents before installation. Guarantees
 #: consistency by removing any existing files first.
 NVRTC_INSTALL_HEADERS_FORCE_OVERWRITE = cynvrtc.NVRTC_INSTALL_HEADERS_FORCE_OVERWRITE
 
-#: Return NVRTC_ERROR_BUSY immediately if installation is in progress by
-#: another process, instead of waiting for the lock. Can be combined with
-#: FORCE_OVERWRITE using bitwise OR.
+#: Return :py:obj:`~.NVRTC_ERROR_BUSY` immediately if installation is in
+#: progress by another process, instead of waiting for the lock. Can be
+#: combined with FORCE_OVERWRITE using bitwise OR.
 NVRTC_INSTALL_HEADERS_NO_WAIT = cynvrtc.NVRTC_INSTALL_HEADERS_NO_WAIT
 
 class nvrtcResult(_FastEnum):
     """
-    The enumerated type nvrtcResult defines API call result codes.
-    NVRTC API functions return nvrtcResult to indicate the call result.
+    The enumerated type :py:obj:`~.nvrtcResult` defines API call result
+    codes. NVRTC API functions return :py:obj:`~.nvrtcResult` to
+    indicate the call result.
     """
 
     NVRTC_SUCCESS = cynvrtc.nvrtcResult.NVRTC_SUCCESS
@@ -104,9 +106,9 @@ cdef object _nvrtcResult = nvrtcResult
 cdef object _nvrtcResult_SUCCESS = nvrtcResult.NVRTC_SUCCESS
 
 cdef class nvrtcProgram:
-    """ nvrtcProgram is the unit of compilation, and an opaque handle for a program.
+    """ :py:obj:`~.nvrtcProgram` is the unit of compilation, and an opaque handle for a program.
 
-    To compile a CUDA program string, an instance of nvrtcProgram must be created first with nvrtcCreateProgram, then compiled with nvrtcCompileProgram.
+    To compile a CUDA program string, an instance of :py:obj:`~.nvrtcProgram` must be created first with :func:`~.nvrtcCreateProgram`, then compiled with :func:`~.nvrtcCompileProgram`.
 
     Methods
     -------
@@ -316,7 +318,7 @@ cdef class nvrtcBundledHeadersInfo(anon_struct0):
 
 @cython.embedsignature(True)
 def nvrtcGetErrorString(result not None : nvrtcResult):
-    """ nvrtcGetErrorString is a helper function that returns a string describing the given nvrtcResult code, e.g., NVRTC_SUCCESS to ``"NVRTC_SUCCESS"``. For unrecognized enumeration values, it returns ``"NVRTC_ERROR unknown"``.
+    """ :func:`~.nvrtcGetErrorString` is a helper function that returns a string describing the given :py:obj:`~.nvrtcResult` code, e.g., :py:obj:`~.NVRTC_SUCCESS` to ``"NVRTC_SUCCESS"``. For unrecognized enumeration values, it returns ``"NVRTC_ERROR unknown"``.
 
     Parameters
     ----------
@@ -337,11 +339,11 @@ def nvrtcGetErrorString(result not None : nvrtcResult):
 
 @cython.embedsignature(True)
 def nvrtcVersion():
-    """ nvrtcVersion sets the output parameters ``major`` and ``minor`` with the CUDA Runtime Compilation version number.
+    """ :func:`~.nvrtcVersion` sets the output parameters ``major`` and ``minor`` with the CUDA Runtime Compilation version number.
 
     Returns
     -------
-    nvrtcResult
+    :py:obj:`~.nvrtcResult`
         - :py:obj:`~.NVRTC_SUCCESS`
         - :py:obj:`~.NVRTC_ERROR_INVALID_INPUT`
     major : int
@@ -359,13 +361,13 @@ def nvrtcVersion():
 
 @cython.embedsignature(True)
 def nvrtcGetNumSupportedArchs():
-    """ nvrtcGetNumSupportedArchs sets the output parameter ``numArchs`` with the number of architectures supported by NVRTC. This can then be used to pass an array to :py:obj:`~.nvrtcGetSupportedArchs` to get the supported architectures.
+    """ :func:`~.nvrtcGetNumSupportedArchs` sets the output parameter ``numArchs`` with the number of architectures supported by NVRTC. This can then be used to pass an array to :py:obj:`~.nvrtcGetSupportedArchs` to get the supported architectures.
 
     see :py:obj:`~.nvrtcGetSupportedArchs`
 
     Returns
     -------
-    nvrtcResult
+    :py:obj:`~.nvrtcResult`
         - :py:obj:`~.NVRTC_SUCCESS`
         - :py:obj:`~.NVRTC_ERROR_INVALID_INPUT`
     numArchs : int
@@ -380,13 +382,13 @@ def nvrtcGetNumSupportedArchs():
 
 @cython.embedsignature(True)
 def nvrtcGetSupportedArchs():
-    """ nvrtcGetSupportedArchs populates the array passed via the output parameter ``supportedArchs`` with the architectures supported by NVRTC. The array is sorted in the ascending order. The size of the array to be passed can be determined using :py:obj:`~.nvrtcGetNumSupportedArchs`.
+    """ :func:`~.nvrtcGetSupportedArchs` populates the array passed via the output parameter ``supportedArchs`` with the architectures supported by NVRTC. The array is sorted in the ascending order. The size of the array to be passed can be determined using :py:obj:`~.nvrtcGetNumSupportedArchs`.
 
     see :py:obj:`~.nvrtcGetNumSupportedArchs`
 
     Returns
     -------
-    nvrtcResult
+    :py:obj:`~.nvrtcResult`
         - :py:obj:`~.NVRTC_SUCCESS`
         - :py:obj:`~.NVRTC_ERROR_INVALID_INPUT`
     supportedArchs : list[int]
@@ -404,7 +406,7 @@ def nvrtcGetSupportedArchs():
 
 @cython.embedsignature(True)
 def nvrtcCreateProgram(char* src, char* name, int numHeaders, headers : Optional[tuple[bytes] | list[bytes]], includeNames : Optional[tuple[bytes] | list[bytes]]):
-    """ nvrtcCreateProgram creates an instance of nvrtcProgram with the given input parameters, and sets the output parameter ``prog`` with it.
+    """ :func:`~.nvrtcCreateProgram` creates an instance of :py:obj:`~.nvrtcProgram` with the given input parameters, and sets the output parameter ``prog`` with it.
 
     Parameters
     ----------
@@ -427,7 +429,7 @@ def nvrtcCreateProgram(char* src, char* name, int numHeaders, headers : Optional
 
     Returns
     -------
-    nvrtcResult
+    :py:obj:`~.nvrtcResult`
         - :py:obj:`~.NVRTC_SUCCESS`
         - :py:obj:`~.NVRTC_ERROR_OUT_OF_MEMORY`
         - :py:obj:`~.NVRTC_ERROR_PROGRAM_CREATION_FAILURE`
@@ -459,7 +461,7 @@ def nvrtcCreateProgram(char* src, char* name, int numHeaders, headers : Optional
 
 @cython.embedsignature(True)
 def nvrtcDestroyProgram(prog):
-    """ nvrtcDestroyProgram destroys the given program.
+    """ :func:`~.nvrtcDestroyProgram` destroys the given program.
 
     Parameters
     ----------
@@ -468,7 +470,7 @@ def nvrtcDestroyProgram(prog):
 
     Returns
     -------
-    nvrtcResult
+    :py:obj:`~.nvrtcResult`
         - :py:obj:`~.NVRTC_SUCCESS`
         - :py:obj:`~.NVRTC_ERROR_INVALID_PROGRAM`
 
@@ -492,9 +494,10 @@ def nvrtcDestroyProgram(prog):
 
 @cython.embedsignature(True)
 def nvrtcCompileProgram(prog, int numOptions, options : Optional[tuple[bytes] | list[bytes]]):
-    """ nvrtcCompileProgram compiles the given program.
+    """ :func:`~.nvrtcCompileProgram` compiles the given program.
 
-    It supports compile options listed in Supported Compile Options.
+    It supports compile options listed in :ref:`Supported Compile Options
+    <cuda-bindings-nvrtc-group__options>`.
 
     Parameters
     ----------
@@ -508,7 +511,7 @@ def nvrtcCompileProgram(prog, int numOptions, options : Optional[tuple[bytes] |
 
     Returns
     -------
-    nvrtcResult
+    :py:obj:`~.nvrtcResult`
         - :py:obj:`~.NVRTC_SUCCESS`
         - :py:obj:`~.NVRTC_ERROR_OUT_OF_MEMORY`
         - :py:obj:`~.NVRTC_ERROR_INVALID_INPUT`
@@ -538,7 +541,7 @@ def nvrtcCompileProgram(prog, int numOptions, options : Optional[tuple[bytes] |
 
 @cython.embedsignature(True)
 def nvrtcGetPTXSize(prog):
-    """ nvrtcGetPTXSize sets the value of ``ptxSizeRet`` with the size of the PTX generated by the previous compilation of ``prog`` (including the trailing ``NULL``).
+    """ :func:`~.nvrtcGetPTXSize` sets the value of ``ptxSizeRet`` with the size of the PTX generated by the previous compilation of ``prog`` (including the trailing ``NULL``).
 
     Parameters
     ----------
@@ -547,7 +550,7 @@ def nvrtcGetPTXSize(prog):
 
     Returns
     -------
-    nvrtcResult
+    :py:obj:`~.nvrtcResult`
         - :py:obj:`~.NVRTC_SUCCESS`
         - :py:obj:`~.NVRTC_ERROR_INVALID_INPUT`
         - :py:obj:`~.NVRTC_ERROR_INVALID_PROGRAM`
@@ -575,7 +578,7 @@ def nvrtcGetPTXSize(prog):
 
 @cython.embedsignature(True)
 def nvrtcGetPTX(prog, char* ptx):
-    """ nvrtcGetPTX stores the PTX generated by the previous compilation of ``prog`` in the memory pointed by ``ptx``.
+    """ :func:`~.nvrtcGetPTX` stores the PTX generated by the previous compilation of ``prog`` in the memory pointed by ``ptx``.
 
     Parameters
     ----------
@@ -586,7 +589,7 @@ def nvrtcGetPTX(prog, char* ptx):
 
     Returns
     -------
-    nvrtcResult
+    :py:obj:`~.nvrtcResult`
         - :py:obj:`~.NVRTC_SUCCESS`
         - :py:obj:`~.NVRTC_ERROR_INVALID_INPUT`
         - :py:obj:`~.NVRTC_ERROR_INVALID_PROGRAM`
@@ -609,7 +612,7 @@ def nvrtcGetPTX(prog, char* ptx):
 
 @cython.embedsignature(True)
 def nvrtcGetCUBINSize(prog):
-    """ nvrtcGetCUBINSize sets the value of ``cubinSizeRet`` with the size of the cubin generated by the previous compilation of ``prog``. The value of cubinSizeRet is set to 0 if the value specified to ``-arch`` is a virtual architecture instead of an actual architecture.
+    """ :func:`~.nvrtcGetCUBINSize` sets the value of ``cubinSizeRet`` with the size of the cubin generated by the previous compilation of ``prog``. The value of cubinSizeRet is set to 0 if the value specified to ``-arch`` is a virtual architecture instead of an actual architecture.
 
     Parameters
     ----------
@@ -618,7 +621,7 @@ def nvrtcGetCUBINSize(prog):
 
     Returns
     -------
-    nvrtcResult
+    :py:obj:`~.nvrtcResult`
         - :py:obj:`~.NVRTC_SUCCESS`
         - :py:obj:`~.NVRTC_ERROR_INVALID_INPUT`
         - :py:obj:`~.NVRTC_ERROR_INVALID_PROGRAM`
@@ -646,7 +649,7 @@ def nvrtcGetCUBINSize(prog):
 
 @cython.embedsignature(True)
 def nvrtcGetCUBIN(prog, char* cubin):
-    """ nvrtcGetCUBIN stores the cubin generated by the previous compilation of ``prog`` in the memory pointed by ``cubin``. No cubin is available if the value specified to ``-arch`` is a virtual architecture instead of an actual architecture. The cubin does not contain code for the Tile functions (``__tile__`` / ``__tile_global__``) or variables (``__tile__``); use ``nvrtcGetTileIR()`` to extract the cuda_tile IR generated for Tile code.
+    """ :func:`~.nvrtcGetCUBIN` stores the cubin generated by the previous compilation of ``prog`` in the memory pointed by ``cubin``. No cubin is available if the value specified to ``-arch`` is a virtual architecture instead of an actual architecture. The cubin does not contain code for the Tile functions (``__tile__`` / ``__tile_global__``) or variables (``__tile__``); use :func:`~.nvrtcGetTileIR` to extract the cuda_tile IR generated for Tile code.
 
     Parameters
     ----------
@@ -657,7 +660,7 @@ def nvrtcGetCUBIN(prog, char* cubin):
 
     Returns
     -------
-    nvrtcResult
+    :py:obj:`~.nvrtcResult`
         - :py:obj:`~.NVRTC_SUCCESS`
         - :py:obj:`~.NVRTC_ERROR_INVALID_INPUT`
         - :py:obj:`~.NVRTC_ERROR_INVALID_PROGRAM`
@@ -680,7 +683,7 @@ def nvrtcGetCUBIN(prog, char* cubin):
 
 @cython.embedsignature(True)
 def nvrtcGetLTOIRSize(prog):
-    """ nvrtcGetLTOIRSize sets the value of ``LTOIRSizeRet`` with the size of the LTO IR generated by the previous compilation of ``prog``. The value of LTOIRSizeRet is set to 0 if the program was not compiled with ``-dlto``.
+    """ :func:`~.nvrtcGetLTOIRSize` sets the value of ``LTOIRSizeRet`` with the size of the LTO IR generated by the previous compilation of ``prog``. The value of LTOIRSizeRet is set to 0 if the program was not compiled with ``-dlto``.
 
     Parameters
     ----------
@@ -689,7 +692,7 @@ def nvrtcGetLTOIRSize(prog):
 
     Returns
     -------
-    nvrtcResult
+    :py:obj:`~.nvrtcResult`
         - :py:obj:`~.NVRTC_SUCCESS`
         - :py:obj:`~.NVRTC_ERROR_INVALID_INPUT`
         - :py:obj:`~.NVRTC_ERROR_INVALID_PROGRAM`
@@ -717,7 +720,7 @@ def nvrtcGetLTOIRSize(prog):
 
 @cython.embedsignature(True)
 def nvrtcGetLTOIR(prog, char* LTOIR):
-    """ nvrtcGetLTOIR stores the LTO IR generated by the previous compilation of ``prog`` in the memory pointed by ``LTOIR``. No LTO IR is available if the program was compiled without ``-dlto``.
+    """ :func:`~.nvrtcGetLTOIR` stores the LTO IR generated by the previous compilation of ``prog`` in the memory pointed by ``LTOIR``. No LTO IR is available if the program was compiled without ``-dlto``.
 
     Parameters
     ----------
@@ -728,7 +731,7 @@ def nvrtcGetLTOIR(prog, char* LTOIR):
 
     Returns
     -------
-    nvrtcResult
+    :py:obj:`~.nvrtcResult`
         - :py:obj:`~.NVRTC_SUCCESS`
         - :py:obj:`~.NVRTC_ERROR_INVALID_INPUT`
         - :py:obj:`~.NVRTC_ERROR_INVALID_PROGRAM`
@@ -751,7 +754,7 @@ def nvrtcGetLTOIR(prog, char* LTOIR):
 
 @cython.embedsignature(True)
 def nvrtcGetOptiXIRSize(prog):
-    """ nvrtcGetOptiXIRSize sets the value of ``optixirSizeRet`` with the size of the OptiX IR generated by the previous compilation of ``prog``. The value of nvrtcGetOptiXIRSize is set to 0 if the program was compiled with options incompatible with OptiX IR generation.
+    """ :func:`~.nvrtcGetOptiXIRSize` sets the value of ``optixirSizeRet`` with the size of the OptiX IR generated by the previous compilation of ``prog``. The value of ``optixirSizeRet`` is set to 0 if the program was compiled with options incompatible with OptiX IR generation.
 
     Parameters
     ----------
@@ -760,7 +763,7 @@ def nvrtcGetOptiXIRSize(prog):
 
     Returns
     -------
-    nvrtcResult
+    :py:obj:`~.nvrtcResult`
         - :py:obj:`~.NVRTC_SUCCESS`
         - :py:obj:`~.NVRTC_ERROR_INVALID_INPUT`
         - :py:obj:`~.NVRTC_ERROR_INVALID_PROGRAM`
@@ -788,7 +791,7 @@ def nvrtcGetOptiXIRSize(prog):
 
 @cython.embedsignature(True)
 def nvrtcGetOptiXIR(prog, char* optixir):
-    """ nvrtcGetOptiXIR stores the OptiX IR generated by the previous compilation of ``prog`` in the memory pointed by ``optixir``. No OptiX IR is available if the program was compiled with options incompatible with OptiX IR generation.
+    """ :func:`~.nvrtcGetOptiXIR` stores the OptiX IR generated by the previous compilation of ``prog`` in the memory pointed by ``optixir``. No OptiX IR is available if the program was compiled with options incompatible with OptiX IR generation.
 
     Parameters
     ----------
@@ -799,7 +802,7 @@ def nvrtcGetOptiXIR(prog, char* optixir):
 
     Returns
     -------
-    nvrtcResult
+    :py:obj:`~.nvrtcResult`
         - :py:obj:`~.NVRTC_SUCCESS`
         - :py:obj:`~.NVRTC_ERROR_INVALID_INPUT`
         - :py:obj:`~.NVRTC_ERROR_INVALID_PROGRAM`
@@ -822,7 +825,7 @@ def nvrtcGetOptiXIR(prog, char* optixir):
 
 @cython.embedsignature(True)
 def nvrtcGetProgramLogSize(prog):
-    """ nvrtcGetProgramLogSize sets ``logSizeRet`` with the size of the log generated by the previous compilation of ``prog`` (including the trailing ``NULL``).
+    """ :func:`~.nvrtcGetProgramLogSize` sets ``logSizeRet`` with the size of the log generated by the previous compilation of ``prog`` (including the trailing ``NULL``).
 
     Note that compilation log may be generated with warnings and
     informative messages, even when the compilation of ``prog`` succeeds.
@@ -834,7 +837,7 @@ def nvrtcGetProgramLogSize(prog):
 
     Returns
     -------
-    nvrtcResult
+    :py:obj:`~.nvrtcResult`
         - :py:obj:`~.NVRTC_SUCCESS`
         - :py:obj:`~.NVRTC_ERROR_INVALID_INPUT`
         - :py:obj:`~.NVRTC_ERROR_INVALID_PROGRAM`
@@ -862,7 +865,7 @@ def nvrtcGetProgramLogSize(prog):
 
 @cython.embedsignature(True)
 def nvrtcGetProgramLog(prog, char* log):
-    """ nvrtcGetProgramLog stores the log generated by the previous compilation of ``prog`` in the memory pointed by ``log``.
+    """ :func:`~.nvrtcGetProgramLog` stores the log generated by the previous compilation of ``prog`` in the memory pointed by ``log``.
 
     Parameters
     ----------
@@ -873,7 +876,7 @@ def nvrtcGetProgramLog(prog, char* log):
 
     Returns
     -------
-    nvrtcResult
+    :py:obj:`~.nvrtcResult`
         - :py:obj:`~.NVRTC_SUCCESS`
         - :py:obj:`~.NVRTC_ERROR_INVALID_INPUT`
         - :py:obj:`~.NVRTC_ERROR_INVALID_PROGRAM`
@@ -896,10 +899,10 @@ def nvrtcGetProgramLog(prog, char* log):
 
 @cython.embedsignature(True)
 def nvrtcAddNameExpression(prog, char* name_expression):
-    """ nvrtcAddNameExpression notes the given name expression denoting the address of a global function or device/__constant__ variable.
+    """ :func:`~.nvrtcAddNameExpression` notes the given name expression denoting the address of a ``__global__`` function or ``__device__``/``__constant__`` variable.
 
     The identical name expression string must be provided on a subsequent
-    call to nvrtcGetLoweredName to extract the lowered name.
+    call to :func:`~.nvrtcGetLoweredName` to extract the lowered name.
 
     Parameters
     ----------
@@ -911,7 +914,7 @@ def nvrtcAddNameExpression(prog, char* name_expression):
 
     Returns
     -------
-    nvrtcResult
+    :py:obj:`~.nvrtcResult`
         - :py:obj:`~.NVRTC_SUCCESS`
         - :py:obj:`~.NVRTC_ERROR_INVALID_PROGRAM`
         - :py:obj:`~.NVRTC_ERROR_INVALID_INPUT`
@@ -981,7 +984,7 @@ def nvrtcGetPCHHeapSize():
 
     Returns
     -------
-    nvrtcResult
+    :py:obj:`~.nvrtcResult`
         - :py:obj:`~.NVRTC_SUCCESS`
         - :py:obj:`~.NVRTC_ERROR_INVALID_INPUT`
     ret : int
@@ -1009,7 +1012,7 @@ def nvrtcSetPCHHeapSize(size_t size):
 
     Returns
     -------
-    nvrtcResult
+    :py:obj:`~.nvrtcResult`
         - :py:obj:`~.NVRTC_SUCCESS`
     """
     with nogil:
@@ -1020,20 +1023,20 @@ def nvrtcSetPCHHeapSize(size_t size):
 def nvrtcGetPCHCreateStatus(prog):
     """ returns the PCH creation status.
 
-    NVRTC_SUCCESS indicates that the PCH was successfully created.
-    NVRTC_ERROR_NO_PCH_CREATE_ATTEMPTED indicates that no PCH creation was
-    attempted, either because PCH functionality was not requested during
-    the preceding nvrtcCompileProgram call, or automatic PCH processing was
-    requested, and compiler chose not to create a PCH file.
-    NVRTC_ERROR_PCH_CREATE_HEAP_EXHAUSTED indicates that a PCH file could
-    potentially have been created, but the compiler ran out space in the
-    PCH heap. In this scenario, the
+    :py:obj:`~.NVRTC_SUCCESS` indicates that the PCH was successfully
+    created. :py:obj:`~.NVRTC_ERROR_NO_PCH_CREATE_ATTEMPTED` indicates that
+    no PCH creation was attempted, either because PCH functionality was not
+    requested during the preceding :func:`~.nvrtcCompileProgram` call, or
+    automatic PCH processing was requested, and compiler chose not to
+    create a PCH file. :py:obj:`~.NVRTC_ERROR_PCH_CREATE_HEAP_EXHAUSTED`
+    indicates that a PCH file could potentially have been created, but the
+    compiler ran out space in the PCH heap. In this scenario, the
     :py:obj:`~.nvrtcGetPCHHeapSizeRequired()` can be used to query the
     required heap size, the heap can be reallocated for this size with
     :py:obj:`~.nvrtcSetPCHHeapSize()` and PCH creation may be reattempted
     again invoking :py:obj:`~.nvrtcCompileProgram()` with a new NVRTC
-    program instance. NVRTC_ERROR_PCH_CREATE indicates that an error
-    condition prevented the PCH file from being created.
+    program instance. :py:obj:`~.NVRTC_ERROR_PCH_CREATE` indicates that an
+    error condition prevented the PCH file from being created.
 
     Parameters
     ----------
@@ -1042,7 +1045,7 @@ def nvrtcGetPCHCreateStatus(prog):
 
     Returns
     -------
-    nvrtcResult
+    :py:obj:`~.nvrtcResult`
         - :py:obj:`~.NVRTC_SUCCESS`
         - :py:obj:`~.NVRTC_ERROR_NO_PCH_CREATE_ATTEMPTED`
         - :py:obj:`~.NVRTC_ERROR_PCH_CREATE`
@@ -1072,10 +1075,10 @@ def nvrtcGetPCHHeapSizeRequired(prog):
 
     Returns
     -------
-    nvrtcResult
+    :py:obj:`~.nvrtcResult`
         - :py:obj:`~.NVRTC_SUCCESS`
         - :py:obj:`~.NVRTC_ERROR_INVALID_PROGRAM`
-        - :py:obj:`~.NVRTC_ERROR_INVALID_INPUT` The size retrieved using this function is only valid if :py:obj:`~.nvrtcGetPCHCreateStatus()` returned NVRTC_SUCCESS or NVRTC_ERROR_PCH_CREATE_HEAP_EXHAUSTED
+        - :py:obj:`~.NVRTC_ERROR_INVALID_INPUT` The size retrieved using this function is only valid if :py:obj:`~.nvrtcGetPCHCreateStatus()` returned :py:obj:`~.NVRTC_SUCCESS` or :py:obj:`~.NVRTC_ERROR_PCH_CREATE_HEAP_EXHAUSTED`
     size : int
         pointer to location where the required size of the PCH Heap will be
         stored
@@ -1097,7 +1100,7 @@ def nvrtcGetPCHHeapSizeRequired(prog):
 
 @cython.embedsignature(True)
 def nvrtcSetFlowCallback(prog, callback, payload):
-    """ nvrtcSetFlowCallback registers a callback function that the compiler will invoke at different points during a call to nvrtcCompileProgram, and the callback function can decide whether to cancel compilation by returning specific values.
+    """ :func:`~.nvrtcSetFlowCallback` registers a callback function that the compiler will invoke at different points during a call to :func:`~.nvrtcCompileProgram`, and the callback function can decide whether to cancel compilation by returning specific values.
 
     The callback function must satisfy the following constraints:
 
@@ -1115,7 +1118,7 @@ def nvrtcSetFlowCallback(prog, callback, payload):
 
     (3) It must return consistent values. Once it returns 1 at one point,
     it must return 1 in all following invocations during the current
-    nvrtcCompileProgram call in progress.
+    :func:`~.nvrtcCompileProgram` call in progress.
 
     (4) It must be thread-safe.
 
@@ -1132,7 +1135,7 @@ def nvrtcSetFlowCallback(prog, callback, payload):
 
     Returns
     -------
-    nvrtcResult
+    :py:obj:`~.nvrtcResult`
         - :py:obj:`~.NVRTC_SUCCESS`
         - :py:obj:`~.NVRTC_ERROR_INVALID_PROGRAM`
         - :py:obj:`~.NVRTC_ERROR_INVALID_INPUT`
@@ -1157,7 +1160,7 @@ def nvrtcSetFlowCallback(prog, callback, payload):
 
 @cython.embedsignature(True)
 def nvrtcGetTileIRSize(prog):
-    """ nvrtcGetTileIRSize sets the value of ``TileIRSizeRet`` with the size of the cuda_tile IR generated by the previous compilation of ``prog``.
+    """ :func:`~.nvrtcGetTileIRSize` sets the value of ``TileIRSizeRet`` with the size of the cuda_tile IR generated by the previous compilation of ``prog``.
 
     Parameters
     ----------
@@ -1166,7 +1169,7 @@ def nvrtcGetTileIRSize(prog):
 
     Returns
     -------
-    nvrtcResult
+    :py:obj:`~.nvrtcResult`
         - :py:obj:`~.NVRTC_SUCCESS`
         - :py:obj:`~.NVRTC_ERROR_INVALID_INPUT`
         - :py:obj:`~.NVRTC_ERROR_INVALID_PROGRAM`
@@ -1194,7 +1197,7 @@ def nvrtcGetTileIRSize(prog):
 
 @cython.embedsignature(True)
 def nvrtcGetTileIR(prog, char* TileIR):
-    """ nvrtcGetTileIR stores the cuda_tile IR generated by the previous compilation of ``prog`` in the memory pointed by ``TileIR``.
+    """ :func:`~.nvrtcGetTileIR` stores the cuda_tile IR generated by the previous compilation of ``prog`` in the memory pointed by ``TileIR``.
 
     Parameters
     ----------
@@ -1205,7 +1208,7 @@ def nvrtcGetTileIR(prog, char* TileIR):
 
     Returns
     -------
-    nvrtcResult
+    :py:obj:`~.nvrtcResult`
         - :py:obj:`~.NVRTC_SUCCESS`
         - :py:obj:`~.NVRTC_ERROR_INVALID_INPUT`
         - :py:obj:`~.NVRTC_ERROR_INVALID_PROGRAM`
@@ -1228,7 +1231,7 @@ def nvrtcGetTileIR(prog, char* TileIR):
 
 @cython.embedsignature(True)
 def nvrtcInstallBundledHeaders(char* installPath, unsigned int flags):
-    """ nvrtcInstallBundledHeaders extracts CUDA headers bundled with NVRTC to a specified directory for use during compilation.
+    """ :func:`~.nvrtcInstallBundledHeaders` extracts CUDA headers bundled with NVRTC to a specified directory for use during compilation.
 
     NVRTC bundles a set of CUDA Toolkit headers and CUDA C++ Core Libraries
     (CCCL) within libnvrtc-builtins. This function extracts these headers
@@ -1238,7 +1241,7 @@ def nvrtcInstallBundledHeaders(char* installPath, unsigned int flags):
 
     After extraction, users can compile kernels by passing appropriate
     include paths (such as "-I<installPath>" and "-I<installPath>/cccl") to
-    nvrtcCompileProgram.
+    :func:`~.nvrtcCompileProgram`.
 
     A version marker file (.nvrtc_headers_version) is created in the
     installation directory to track the installed version.
@@ -1246,8 +1249,8 @@ def nvrtcInstallBundledHeaders(char* installPath, unsigned int flags):
     This function is thread-safe and process-safe. Concurrent calls from
     multiple threads or processes will be serialized using file locking. By
     default, the function waits for the lock; use
-    NVRTC_INSTALL_HEADERS_NO_WAIT to return immediately with
-    NVRTC_ERROR_BUSY if another process holds the lock.
+    :py:obj:`~.NVRTC_INSTALL_HEADERS_NO_WAIT` to return immediately with
+    :py:obj:`~.NVRTC_ERROR_BUSY` if another process holds the lock.
 
     Parameters
     ----------
@@ -1259,11 +1262,11 @@ def nvrtcInstallBundledHeaders(char* installPath, unsigned int flags):
 
     Returns
     -------
-    nvrtcResult
+    :py:obj:`~.nvrtcResult`
         - :py:obj:`~.NVRTC_SUCCESS`
         - :py:obj:`~.NVRTC_ERROR_INVALID_INPUT` (invalid path or conflicting flags like SKIP_IF_EXISTS | FORCE_OVERWRITE)
         - :py:obj:`~.NVRTC_ERROR_BUILTIN_OPERATION_FAILURE` (extraction failed or version mismatch)
-        - :py:obj:`~.NVRTC_ERROR_BUSY` (lock held by another process and NVRTC_INSTALL_HEADERS_NO_WAIT was specified)
+        - :py:obj:`~.NVRTC_ERROR_BUSY` (lock held by another process and :py:obj:`~.NVRTC_INSTALL_HEADERS_NO_WAIT` was specified)
     errorLog : bytes
         Optional pointer to receive detailed error message on failure. If
         non-NULL, ``*errorLog`` will be set to point to a string describing
@@ -1277,7 +1280,7 @@ def nvrtcInstallBundledHeaders(char* installPath, unsigned int flags):
 
     Notes
     -----
-    Use NVRTC_INSTALL_HEADERS_SKIP_IF_EXISTS to avoid reinstalling if headers already exist. Use NVRTC_INSTALL_HEADERS_FORCE_OVERWRITE to guarantee consistency by clearing the directory first.
+    Use :py:obj:`~.NVRTC_INSTALL_HEADERS_SKIP_IF_EXISTS` to avoid reinstalling if headers already exist. Use :py:obj:`~.NVRTC_INSTALL_HEADERS_FORCE_OVERWRITE` to guarantee consistency by clearing the directory first.
     """
     cdef const char* errorLog = NULL
     with nogil:
@@ -1288,15 +1291,15 @@ def nvrtcInstallBundledHeaders(char* installPath, unsigned int flags):
 
 @cython.embedsignature(True)
 def nvrtcGetBundledHeadersInfo():
-    """ nvrtcGetBundledHeadersInfo queries information about the bundled headers without extracting them.
+    """ :func:`~.nvrtcGetBundledHeadersInfo` queries information about the bundled headers without extracting them.
 
     This function allows users to determine if bundled headers are
     available and get size estimates before calling
-    nvrtcInstallBundledHeaders.
+    :func:`~.nvrtcInstallBundledHeaders`.
 
     Returns
     -------
-    nvrtcResult
+    :py:obj:`~.nvrtcResult`
         - :py:obj:`~.NVRTC_SUCCESS`
         - :py:obj:`~.NVRTC_ERROR_INVALID_INPUT` (info is NULL)
         - :py:obj:`~.NVRTC_ERROR_BUILTIN_OPERATION_FAILURE` (failed to query bundled headers)
@@ -1319,22 +1322,22 @@ def nvrtcGetBundledHeadersInfo():
 
 @cython.embedsignature(True)
 def nvrtcRemoveBundledHeaders(char* installPath):
-    """ nvrtcRemoveBundledHeaders removes previously installed bundled headers.
+    """ :func:`~.nvrtcRemoveBundledHeaders` removes previously installed bundled headers.
 
     This function removes the headers installed by
-    nvrtcInstallBundledHeaders, helping users manage disk space. It
-    recursively removes all files and subdirectories within the
+    :func:`~.nvrtcInstallBundledHeaders`, helping users manage disk space.
+    It recursively removes all files and subdirectories within the
     installation directory.
 
     Parameters
     ----------
     installPath : bytes
         Path where headers were previously installed. Must be the same path
-        used with nvrtcInstallBundledHeaders.
+        used with :func:`~.nvrtcInstallBundledHeaders`.
 
     Returns
     -------
-    nvrtcResult
+    :py:obj:`~.nvrtcResult`
         - :py:obj:`~.NVRTC_SUCCESS`
         - :py:obj:`~.NVRTC_ERROR_INVALID_INPUT` (invalid path)
         - :py:obj:`~.NVRTC_ERROR_BUILTIN_OPERATION_FAILURE` (removal failed)
diff --git a/cuda_bindings/cuda/bindings/runtime.pxd.in b/cuda_bindings/cuda/bindings/runtime.pxd.in
index beff0505f8..60484c79bc 100644
--- a/cuda_bindings/cuda/bindings/runtime.pxd.in
+++ b/cuda_bindings/cuda/bindings/runtime.pxd.in
@@ -29,7 +29,7 @@ cdef class cudaDevResourceDesc_t:
 cdef class cudaExecutionContext_t:
     """
 
-    An opaque handle to a CUDA execution context. It represents an execution context created via CUDA Runtime APIs such as cudaGreenCtxCreate.
+    An opaque handle to a CUDA execution context. It represents an execution context created via CUDA Runtime APIs such as :func:`~.cudaGreenCtxCreate`.
 
     Methods
     -------
@@ -415,7 +415,7 @@ cdef class cudaChannelFormatDesc:
         w
     {{endif}}
     {{if 'cudaChannelFormatDesc.f' in found_struct}}
-    f : cudaChannelFormatKind
+    f : :py:obj:`~.cudaChannelFormatKind`
         Channel format kind
     {{endif}}
 
@@ -475,7 +475,8 @@ cdef class cudaArraySparseProperties:
     {{endif}}
     {{if 'cudaArraySparseProperties.flags' in found_struct}}
     flags : unsigned int
-        Flags will either be zero or cudaArraySparsePropertiesSingleMipTail
+        Flags will either be zero or
+        :py:obj:`~.cudaArraySparsePropertiesSingleMipTail`
     {{endif}}
     {{if 'cudaArraySparseProperties.reserved' in found_struct}}
     reserved : list[unsigned int]
@@ -526,7 +527,7 @@ cdef class cudaArrayMemoryRequirements:
 
 cdef class cudaPitchedPtr:
     """
-    CUDA Pitched memory pointer  ``make_cudaPitchedPtr``
+    CUDA Pitched memory pointer  :func:`~.make_cudaPitchedPtr`
 
     Attributes
     ----------
@@ -562,7 +563,7 @@ cdef class cudaPitchedPtr:
 
 cdef class cudaExtent:
     """
-    CUDA extent  ``make_cudaExtent``
+    CUDA extent  :func:`~.make_cudaExtent`
 
     Attributes
     ----------
@@ -592,7 +593,7 @@ cdef class cudaExtent:
 
 cdef class cudaPos:
     """
-    CUDA 3D position  ``make_cudaPos``
+    CUDA 3D position  :func:`~.make_cudaPos`
 
     Attributes
     ----------
@@ -626,35 +627,35 @@ cdef class cudaMemcpy3DParms:
     Attributes
     ----------
     {{if 'cudaMemcpy3DParms.srcArray' in found_struct}}
-    srcArray : cudaArray_t
+    srcArray : :py:obj:`~.cudaArray_t`
         Source memory address
     {{endif}}
     {{if 'cudaMemcpy3DParms.srcPos' in found_struct}}
-    srcPos : cudaPos
+    srcPos : :py:obj:`~.cudaPos`
         Source position offset
     {{endif}}
     {{if 'cudaMemcpy3DParms.srcPtr' in found_struct}}
-    srcPtr : cudaPitchedPtr
+    srcPtr : :py:obj:`~.cudaPitchedPtr`
         Pitched source memory address
     {{endif}}
     {{if 'cudaMemcpy3DParms.dstArray' in found_struct}}
-    dstArray : cudaArray_t
+    dstArray : :py:obj:`~.cudaArray_t`
         Destination memory address
     {{endif}}
     {{if 'cudaMemcpy3DParms.dstPos' in found_struct}}
-    dstPos : cudaPos
+    dstPos : :py:obj:`~.cudaPos`
         Destination position offset
     {{endif}}
     {{if 'cudaMemcpy3DParms.dstPtr' in found_struct}}
-    dstPtr : cudaPitchedPtr
+    dstPtr : :py:obj:`~.cudaPitchedPtr`
         Pitched destination memory address
     {{endif}}
     {{if 'cudaMemcpy3DParms.extent' in found_struct}}
-    extent : cudaExtent
+    extent : :py:obj:`~.cudaExtent`
         Requested memory copy size
     {{endif}}
     {{if 'cudaMemcpy3DParms.kind' in found_struct}}
-    kind : cudaMemcpyKind
+    kind : :py:obj:`~.cudaMemcpyKind`
         Type of transfer
     {{endif}}
 
@@ -704,12 +705,12 @@ cdef class cudaMemcpyNodeParams:
         Must be zero
     {{endif}}
     {{if 'cudaMemcpyNodeParams.ctx' in found_struct}}
-    ctx : cudaExecutionContext_t
+    ctx : :py:obj:`~.cudaExecutionContext_t`
         Context in which to run the memcpy. If NULL will try to use the
         current context.
     {{endif}}
     {{if 'cudaMemcpyNodeParams.copyParams' in found_struct}}
-    copyParams : cudaMemcpy3DParms
+    copyParams : :py:obj:`~.cudaMemcpy3DParms`
         Parameters for the memory copy
     {{endif}}
 
@@ -736,15 +737,15 @@ cdef class cudaMemcpy3DPeerParms:
     Attributes
     ----------
     {{if 'cudaMemcpy3DPeerParms.srcArray' in found_struct}}
-    srcArray : cudaArray_t
+    srcArray : :py:obj:`~.cudaArray_t`
         Source memory address
     {{endif}}
     {{if 'cudaMemcpy3DPeerParms.srcPos' in found_struct}}
-    srcPos : cudaPos
+    srcPos : :py:obj:`~.cudaPos`
         Source position offset
     {{endif}}
     {{if 'cudaMemcpy3DPeerParms.srcPtr' in found_struct}}
-    srcPtr : cudaPitchedPtr
+    srcPtr : :py:obj:`~.cudaPitchedPtr`
         Pitched source memory address
     {{endif}}
     {{if 'cudaMemcpy3DPeerParms.srcDevice' in found_struct}}
@@ -752,15 +753,15 @@ cdef class cudaMemcpy3DPeerParms:
         Source device
     {{endif}}
     {{if 'cudaMemcpy3DPeerParms.dstArray' in found_struct}}
-    dstArray : cudaArray_t
+    dstArray : :py:obj:`~.cudaArray_t`
         Destination memory address
     {{endif}}
     {{if 'cudaMemcpy3DPeerParms.dstPos' in found_struct}}
-    dstPos : cudaPos
+    dstPos : :py:obj:`~.cudaPos`
         Destination position offset
     {{endif}}
     {{if 'cudaMemcpy3DPeerParms.dstPtr' in found_struct}}
-    dstPtr : cudaPitchedPtr
+    dstPtr : :py:obj:`~.cudaPitchedPtr`
         Pitched destination memory address
     {{endif}}
     {{if 'cudaMemcpy3DPeerParms.dstDevice' in found_struct}}
@@ -768,7 +769,7 @@ cdef class cudaMemcpy3DPeerParms:
         Destination device
     {{endif}}
     {{if 'cudaMemcpy3DPeerParms.extent' in found_struct}}
-    extent : cudaExtent
+    extent : :py:obj:`~.cudaExtent`
         Requested memory copy size
     {{endif}}
 
@@ -878,7 +879,7 @@ cdef class cudaMemsetParamsV2:
         Number of rows
     {{endif}}
     {{if 'cudaMemsetParamsV2.ctx' in found_struct}}
-    ctx : cudaExecutionContext_t
+    ctx : :py:obj:`~.cudaExecutionContext_t`
         Context in which to run the memset. If NULL will try to use the
         current context.
     {{endif}}
@@ -928,12 +929,12 @@ cdef class cudaAccessPolicyWindow:
         assigned missProp.
     {{endif}}
     {{if 'cudaAccessPolicyWindow.hitProp' in found_struct}}
-    hitProp : cudaAccessProperty
-        ``CUaccessProperty`` set for hit.
+    hitProp : :py:obj:`~.cudaAccessProperty`
+        :py:obj:`~.CUaccessProperty` set for hit.
     {{endif}}
     {{if 'cudaAccessPolicyWindow.missProp' in found_struct}}
-    missProp : cudaAccessProperty
-        ``CUaccessProperty`` set for miss. Must be either NORMAL or
+    missProp : :py:obj:`~.cudaAccessProperty`
+        :py:obj:`~.CUaccessProperty` set for miss. Must be either NORMAL or
         STREAMING.
     {{endif}}
 
@@ -957,7 +958,7 @@ cdef class cudaHostNodeParams:
     Attributes
     ----------
     {{if 'cudaHostNodeParams.fn' in found_struct}}
-    fn : cudaHostFn_t
+    fn : :py:obj:`~.cudaHostFn_t`
         The function to call when the node executes
     {{endif}}
     {{if 'cudaHostNodeParams.userData' in found_struct}}
@@ -988,7 +989,7 @@ cdef class cudaHostNodeParamsV2:
     Attributes
     ----------
     {{if 'cudaHostNodeParamsV2.fn' in found_struct}}
-    fn : cudaHostFn_t
+    fn : :py:obj:`~.cudaHostFn_t`
         The function to call when the node executes
     {{endif}}
     {{if 'cudaHostNodeParamsV2.userData' in found_struct}}
@@ -1021,7 +1022,7 @@ cdef class anon_struct1:
     Attributes
     ----------
     {{if 'cudaResourceDesc.res.array.array' in found_struct}}
-    array : cudaArray_t
+    array : :py:obj:`~.cudaArray_t`
 
     {{endif}}
 
@@ -1042,7 +1043,7 @@ cdef class anon_struct2:
     Attributes
     ----------
     {{if 'cudaResourceDesc.res.mipmap.mipmap' in found_struct}}
-    mipmap : cudaMipmappedArray_t
+    mipmap : :py:obj:`~.cudaMipmappedArray_t`
 
     {{endif}}
 
@@ -1067,7 +1068,7 @@ cdef class anon_struct3:
 
     {{endif}}
     {{if 'cudaResourceDesc.res.linear.desc' in found_struct}}
-    desc : cudaChannelFormatDesc
+    desc : :py:obj:`~.cudaChannelFormatDesc`
 
     {{endif}}
     {{if 'cudaResourceDesc.res.linear.sizeInBytes' in found_struct}}
@@ -1099,7 +1100,7 @@ cdef class anon_struct4:
 
     {{endif}}
     {{if 'cudaResourceDesc.res.pitch2D.desc' in found_struct}}
-    desc : cudaChannelFormatDesc
+    desc : :py:obj:`~.cudaChannelFormatDesc`
 
     {{endif}}
     {{if 'cudaResourceDesc.res.pitch2D.width' in found_struct}}
@@ -1204,7 +1205,7 @@ cdef class cudaResourceDesc:
     Attributes
     ----------
     {{if 'cudaResourceDesc.resType' in found_struct}}
-    resType : cudaResourceType
+    resType : :py:obj:`~.cudaResourceType`
         Resource type
     {{endif}}
     {{if 'cudaResourceDesc.res' in found_struct}}
@@ -1236,7 +1237,7 @@ cdef class cudaResourceViewDesc:
     Attributes
     ----------
     {{if 'cudaResourceViewDesc.format' in found_struct}}
-    format : cudaResourceViewFormat
+    format : :py:obj:`~.cudaResourceViewFormat`
         Resource view format
     {{endif}}
     {{if 'cudaResourceViewDesc.width' in found_struct}}
@@ -1289,19 +1290,21 @@ cdef class cudaPointerAttributes:
     Attributes
     ----------
     {{if 'cudaPointerAttributes.type' in found_struct}}
-    type : cudaMemoryType
-        The type of memory - cudaMemoryTypeUnregistered,
-        cudaMemoryTypeHost, cudaMemoryTypeDevice or cudaMemoryTypeManaged.
+    type : :py:obj:`~.cudaMemoryType`
+        The type of memory - :py:obj:`~.cudaMemoryTypeUnregistered`,
+        :py:obj:`~.cudaMemoryTypeHost`, :py:obj:`~.cudaMemoryTypeDevice` or
+        :py:obj:`~.cudaMemoryTypeManaged`.
     {{endif}}
     {{if 'cudaPointerAttributes.device' in found_struct}}
     device : int
         The device against which the memory was allocated or registered. If
-        the memory type is cudaMemoryTypeDevice then this identifies the
-        device on which the memory referred physically resides. If the
-        memory type is cudaMemoryTypeHost or::cudaMemoryTypeManaged then
-        this identifies the device which was current when the memory was
-        allocated or registered (and if that device is deinitialized then
-        this allocation will vanish with that device's state).
+        the memory type is :py:obj:`~.cudaMemoryTypeDevice` then this
+        identifies the device on which the memory referred physically
+        resides. If the memory type is :py:obj:`~.cudaMemoryTypeHost` or
+        :py:obj:`~.cudaMemoryTypeManaged` then this identifies the device
+        which was current when the memory was allocated or registered (and
+        if that device is deinitialized then this allocation will vanish
+        with that device's state).
     {{endif}}
     {{if 'cudaPointerAttributes.devicePointer' in found_struct}}
     devicePointer : Any
@@ -1397,9 +1400,10 @@ cdef class cudaFuncAttributes:
         On devices where the L1 cache and shared memory use the same
         hardware resources, this sets the shared memory carveout
         preference, in percent of the maximum shared memory. Refer to
-        cudaDevAttrMaxSharedMemoryPerMultiprocessor. This is only a hint,
-        and the driver can choose a different ratio if required to execute
-        the function. See cudaFuncSetAttribute
+        :py:obj:`~.cudaDevAttrMaxSharedMemoryPerMultiprocessor`. This is
+        only a hint, and the driver can choose a different ratio if
+        required to execute the function. See
+        :func:`~.cudaFuncSetAttribute`
     {{endif}}
     {{if 'cudaFuncAttributes.clusterDimMustBeSet' in found_struct}}
     clusterDimMustBeSet : int
@@ -1412,8 +1416,8 @@ cdef class cudaFuncAttributes:
         either all be 0 or all be positive. The validity of the cluster
         dimensions is otherwise checked at launch time.  If the value is
         set during compile time, it cannot be set at runtime. Setting it at
-        runtime should return cudaErrorNotPermitted. See
-        cudaFuncSetAttribute
+        runtime should return :py:obj:`~.cudaErrorNotPermitted`. See
+        :func:`~.cudaFuncSetAttribute`
     {{endif}}
     {{if 'cudaFuncAttributes.requiredClusterHeight' in found_struct}}
     requiredClusterHeight : int
@@ -1425,7 +1429,8 @@ cdef class cudaFuncAttributes:
     {{endif}}
     {{if 'cudaFuncAttributes.clusterSchedulingPolicyPreference' in found_struct}}
     clusterSchedulingPolicyPreference : int
-        The block scheduling policy of a function. See cudaFuncSetAttribute
+        The block scheduling policy of a function. See
+        :func:`~.cudaFuncSetAttribute`
     {{endif}}
     {{if 'cudaFuncAttributes.nonPortableClusterSizeAllowed' in found_struct}}
     nonPortableClusterSizeAllowed : int
@@ -1433,15 +1438,16 @@ cdef class cudaFuncAttributes:
         size. 1 is allowed, 0 is disallowed. A non-portable cluster size
         may only function on the specific SKUs the program is tested on.
         The launch might fail if the program is run on a different hardware
-        platform.  CUDA API provides cudaOccupancyMaxActiveClusters to
-        assist with checking whether the desired size can be launched on
-        the current device.  Portable Cluster Size  A portable cluster size
-        is guaranteed to be functional on all compute capabilities higher
-        than the target compute capability. The portable cluster size for
-        sm_90 is 8 blocks per cluster. This value may increase for future
-        compute capabilities.  The specific hardware unit may support
-        higher cluster sizes that’s not guaranteed to be portable. See
-        cudaFuncSetAttribute
+        platform.  CUDA API provides
+        :func:`~.cudaOccupancyMaxActiveClusters` to assist with checking
+        whether the desired size can be launched on the current device.
+        Portable Cluster Size  A portable cluster size is guaranteed to be
+        functional on all compute capabilities higher than the target
+        compute capability. The portable cluster size for sm_90 is 8 blocks
+        per cluster. This value may increase for future compute
+        capabilities.  The specific hardware unit may support higher
+        cluster sizes that are not guaranteed to be portable. See
+        :func:`~.cudaFuncSetAttribute`
     {{endif}}
     {{if 'cudaFuncAttributes.deviceNodeUpdateStatus' in found_struct}}
     deviceNodeUpdateStatus : int
@@ -1471,21 +1477,23 @@ cdef class cudaFuncAttributes:
 cdef class cudaMemLocation:
     """
     Specifies a memory location.  To specify a gpu, set type =
-    cudaMemLocationTypeDevice and set id = the gpu's device ordinal. To
-    specify a cpu NUMA node, set type = cudaMemLocationTypeHostNuma and
-    set id = host NUMA node id.
+    :py:obj:`~.cudaMemLocationTypeDevice` and set id = the gpu's device
+    ordinal. To specify a cpu NUMA node, set type =
+    :py:obj:`~.cudaMemLocationTypeHostNuma` and set id = host NUMA node
+    id.
 
     Attributes
     ----------
     {{if 'cudaMemLocation.type' in found_struct}}
-    type : cudaMemLocationType
+    type : :py:obj:`~.cudaMemLocationType`
         Specifies the location type, which modifies the meaning of id.
     {{endif}}
     {{if 'cudaMemLocation.id' in found_struct}}
     id : int
-        Identifier for cudaMemLocationType::cudaMemLocationTypeDevice,
-        cudaMemLocationType::cudaMemLocationTypeHost, or
-        cudaMemLocationType::cudaMemLocationTypeHostNuma.
+        Identifier for
+        :py:obj:`~.cudaMemLocationType.cudaMemLocationTypeDevice`,
+        :py:obj:`~.cudaMemLocationType.cudaMemLocationTypeHost`, or
+        :py:obj:`~.cudaMemLocationType.cudaMemLocationTypeHostNuma`.
     {{endif}}
 
     Methods
@@ -1505,11 +1513,11 @@ cdef class cudaMemAccessDesc:
     Attributes
     ----------
     {{if 'cudaMemAccessDesc.location' in found_struct}}
-    location : cudaMemLocation
+    location : :py:obj:`~.cudaMemLocation`
         Location on which the request is to change it's accessibility
     {{endif}}
     {{if 'cudaMemAccessDesc.flags' in found_struct}}
-    flags : cudaMemAccessFlags
+    flags : :py:obj:`~.cudaMemAccessFlags`
         ``CUmemProt`` accessibility flags to set on the request
     {{endif}}
 
@@ -1533,25 +1541,25 @@ cdef class cudaMemPoolProps:
     Attributes
     ----------
     {{if 'cudaMemPoolProps.allocType' in found_struct}}
-    allocType : cudaMemAllocationType
+    allocType : :py:obj:`~.cudaMemAllocationType`
         Allocation type. Currently must be specified as
-        cudaMemAllocationTypePinned
+        :py:obj:`~.cudaMemAllocationTypePinned`
     {{endif}}
     {{if 'cudaMemPoolProps.handleTypes' in found_struct}}
-    handleTypes : cudaMemAllocationHandleType
+    handleTypes : :py:obj:`~.cudaMemAllocationHandleType`
         Handle types that will be supported by allocations from the pool.
     {{endif}}
     {{if 'cudaMemPoolProps.location' in found_struct}}
-    location : cudaMemLocation
+    location : :py:obj:`~.cudaMemLocation`
         Location allocations should reside.
     {{endif}}
     {{if 'cudaMemPoolProps.win32SecurityAttributes' in found_struct}}
     win32SecurityAttributes : Any
         Windows-specific LPSECURITYATTRIBUTES required when
-        cudaMemHandleTypeWin32 is specified. This security attribute
-        defines the scope of which exported allocations may be tranferred
-        to other processes. In all other cases, this field is required to
-        be zero.
+        :py:obj:`~.cudaMemHandleTypeWin32` is specified. This security
+        attribute defines the scope of which exported allocations may be
+        transferred to other processes. In all other cases, this field is
+        required to be zero.
     {{endif}}
     {{if 'cudaMemPoolProps.maxSize' in found_struct}}
     maxSize : size_t
@@ -1611,14 +1619,14 @@ cdef class cudaMemAllocNodeParams:
     Attributes
     ----------
     {{if 'cudaMemAllocNodeParams.poolProps' in found_struct}}
-    poolProps : cudaMemPoolProps
+    poolProps : :py:obj:`~.cudaMemPoolProps`
         in: location where the allocation should reside (specified in
-        ``location``). ``handleTypes`` must be cudaMemHandleTypeNone. IPC
-        is not supported. in: array of memory access descriptors. Used to
-        describe peer GPU access
+        ``location``). ``handleTypes`` must be
+        :py:obj:`~.cudaMemHandleTypeNone`. IPC is not supported. in: array
+        of memory access descriptors. Used to describe peer GPU access
     {{endif}}
     {{if 'cudaMemAllocNodeParams.accessDescs' in found_struct}}
-    accessDescs : cudaMemAccessDesc
+    accessDescs : :py:obj:`~.cudaMemAccessDesc`
         in: number of memory access descriptors. Must not exceed the number
         of GPUs.
     {{endif}}
@@ -1662,14 +1670,14 @@ cdef class cudaMemAllocNodeParamsV2:
     Attributes
     ----------
     {{if 'cudaMemAllocNodeParamsV2.poolProps' in found_struct}}
-    poolProps : cudaMemPoolProps
+    poolProps : :py:obj:`~.cudaMemPoolProps`
         in: location where the allocation should reside (specified in
-        ``location``). ``handleTypes`` must be cudaMemHandleTypeNone. IPC
-        is not supported. in: array of memory access descriptors. Used to
-        describe peer GPU access
+        ``location``). ``handleTypes`` must be
+        :py:obj:`~.cudaMemHandleTypeNone`. IPC is not supported. in: array
+        of memory access descriptors. Used to describe peer GPU access
     {{endif}}
     {{if 'cudaMemAllocNodeParamsV2.accessDescs' in found_struct}}
-    accessDescs : cudaMemAccessDesc
+    accessDescs : :py:obj:`~.cudaMemAccessDesc`
         in: number of memory access descriptors. Must not exceed the number
         of GPUs.
     {{endif}}
@@ -1733,29 +1741,29 @@ cdef class cudaMemFreeNodeParams:
 cdef class cudaMemcpyAttributes:
     """
     Attributes specific to copies within a batch. For more details on
-    usage see cudaMemcpyBatchAsync.
+    usage see :func:`~.cudaMemcpyBatchAsync`.
 
     Attributes
     ----------
     {{if 'cudaMemcpyAttributes.srcAccessOrder' in found_struct}}
-    srcAccessOrder : cudaMemcpySrcAccessOrder
+    srcAccessOrder : :py:obj:`~.cudaMemcpySrcAccessOrder`
         Source access ordering to be observed for copies with this
         attribute.
     {{endif}}
     {{if 'cudaMemcpyAttributes.srcLocHint' in found_struct}}
-    srcLocHint : cudaMemLocation
+    srcLocHint : :py:obj:`~.cudaMemLocation`
         Hint location for the source operand. Ignored when the pointers are
         not managed memory or memory allocated outside CUDA.
     {{endif}}
     {{if 'cudaMemcpyAttributes.dstLocHint' in found_struct}}
-    dstLocHint : cudaMemLocation
+    dstLocHint : :py:obj:`~.cudaMemLocation`
         Hint location for the destination operand. Ignored when the
         pointers are not managed memory or memory allocated outside CUDA.
     {{endif}}
     {{if 'cudaMemcpyAttributes.flags' in found_struct}}
     flags : unsigned int
         Additional flags for copies with this attribute. See
-        cudaMemcpyFlags.
+        :py:obj:`~.cudaMemcpyFlags`.
     {{endif}}
 
     Methods
@@ -1776,7 +1784,8 @@ cdef class cudaMemcpyAttributes:
 
 cdef class cudaOffset3D:
     """
-    Struct representing offset into a cudaArray_t in elements
+    Struct representing offset into a :py:obj:`~.cudaArray_t` in
+    elements
 
     Attributes
     ----------
@@ -1820,7 +1829,7 @@ cdef class anon_struct6:
 
     {{endif}}
     {{if 'cudaMemcpy3DOperand.op.ptr.locHint' in found_struct}}
-    locHint : cudaMemLocation
+    locHint : :py:obj:`~.cudaMemLocation`
 
     {{endif}}
 
@@ -1844,11 +1853,11 @@ cdef class anon_struct7:
     Attributes
     ----------
     {{if 'cudaMemcpy3DOperand.op.array.array' in found_struct}}
-    array : cudaArray_t
+    array : :py:obj:`~.cudaArray_t`
 
     {{endif}}
     {{if 'cudaMemcpy3DOperand.op.array.offset' in found_struct}}
-    offset : cudaOffset3D
+    offset : :py:obj:`~.cudaOffset3D`
 
     {{endif}}
 
@@ -1897,12 +1906,13 @@ cdef class anon_union2:
 
 cdef class cudaMemcpy3DOperand:
     """
-    Struct representing an operand for copy with cudaMemcpy3DBatchAsync
+    Struct representing an operand for copy with
+    :func:`~.cudaMemcpy3DBatchAsync`
 
     Attributes
     ----------
     {{if 'cudaMemcpy3DOperand.type' in found_struct}}
-    type : cudaMemcpy3DOperandType
+    type : :py:obj:`~.cudaMemcpy3DOperandType`
 
     {{endif}}
     {{if 'cudaMemcpy3DOperand.op' in found_struct}}
@@ -1928,25 +1938,26 @@ cdef class cudaMemcpy3DBatchOp:
     Attributes
     ----------
     {{if 'cudaMemcpy3DBatchOp.src' in found_struct}}
-    src : cudaMemcpy3DOperand
+    src : :py:obj:`~.cudaMemcpy3DOperand`
         Source memcpy operand.
     {{endif}}
     {{if 'cudaMemcpy3DBatchOp.dst' in found_struct}}
-    dst : cudaMemcpy3DOperand
+    dst : :py:obj:`~.cudaMemcpy3DOperand`
         Destination memcpy operand.
     {{endif}}
     {{if 'cudaMemcpy3DBatchOp.extent' in found_struct}}
-    extent : cudaExtent
+    extent : :py:obj:`~.cudaExtent`
         Extents of the memcpy between src and dst. The width, height and
         depth components must not be 0.
     {{endif}}
     {{if 'cudaMemcpy3DBatchOp.srcAccessOrder' in found_struct}}
-    srcAccessOrder : cudaMemcpySrcAccessOrder
+    srcAccessOrder : :py:obj:`~.cudaMemcpySrcAccessOrder`
         Source access ordering to be observed for copy from src to dst.
     {{endif}}
     {{if 'cudaMemcpy3DBatchOp.flags' in found_struct}}
     flags : unsigned int
-        Additional flags for copy from src to dst. See cudaMemcpyFlags.
+        Additional flags for copy from src to dst. See
+        :py:obj:`~.cudaMemcpyFlags`.
     {{endif}}
 
     Methods
@@ -1998,7 +2009,7 @@ cdef class cudaDeviceProp:
         ASCII string identifying device
     {{endif}}
     {{if 'cudaDeviceProp.uuid' in found_struct}}
-    uuid : cudaUUID_t
+    uuid : :py:obj:`~.cudaUUID_t`
         16-byte unique identifier
     {{endif}}
     {{if 'cudaDeviceProp.luid' in found_struct}}
@@ -2075,7 +2086,7 @@ cdef class cudaDeviceProp:
     {{if 'cudaDeviceProp.canMapHostMemory' in found_struct}}
     canMapHostMemory : int
         Device can map host memory with
-        cudaHostAlloc/cudaHostGetDevicePointer
+        :func:`~.cudaHostAlloc`/:func:`~.cudaHostGetDevicePointer`
     {{endif}}
     {{if 'cudaDeviceProp.maxTexture1D' in found_struct}}
     maxTexture1D : int
@@ -2248,7 +2259,7 @@ cdef class cudaDeviceProp:
     {{if 'cudaDeviceProp.pageableMemoryAccess' in found_struct}}
     pageableMemoryAccess : int
         Device supports coherently accessing pageable memory without
-        calling cudaHostRegister on it
+        calling :func:`~.cudaHostRegister` on it
     {{endif}}
     {{if 'cudaDeviceProp.concurrentManagedAccess' in found_struct}}
     concurrentManagedAccess : int
@@ -2267,7 +2278,7 @@ cdef class cudaDeviceProp:
     {{if 'cudaDeviceProp.cooperativeLaunch' in found_struct}}
     cooperativeLaunch : int
         Device supports launching cooperative kernels via
-        cudaLaunchCooperativeKernel
+        :func:`~.cudaLaunchCooperativeKernel`
     {{endif}}
     {{if 'cudaDeviceProp.sharedMemPerBlockOptin' in found_struct}}
     sharedMemPerBlockOptin : size_t
@@ -2288,7 +2299,7 @@ cdef class cudaDeviceProp:
     {{endif}}
     {{if 'cudaDeviceProp.accessPolicyMaxWindowSize' in found_struct}}
     accessPolicyMaxWindowSize : int
-        The maximum value of cudaAccessPolicyWindow::num_bytes.
+        The maximum value of :py:obj:`~.cudaAccessPolicyWindow.num_bytes`.
     {{endif}}
     {{if 'cudaDeviceProp.reservedSharedMemPerBlock' in found_struct}}
     reservedSharedMemPerBlock : size_t
@@ -2296,7 +2307,8 @@ cdef class cudaDeviceProp:
     {{endif}}
     {{if 'cudaDeviceProp.hostRegisterSupported' in found_struct}}
     hostRegisterSupported : int
-        Device supports host memory registration via cudaHostRegister.
+        Device supports host memory registration via
+        :func:`~.cudaHostRegister`.
     {{endif}}
     {{if 'cudaDeviceProp.sparseCudaArraySupported' in found_struct}}
     sparseCudaArraySupported : int
@@ -2305,9 +2317,9 @@ cdef class cudaDeviceProp:
     {{endif}}
     {{if 'cudaDeviceProp.hostRegisterReadOnlySupported' in found_struct}}
     hostRegisterReadOnlySupported : int
-        Device supports using the cudaHostRegister flag
-        cudaHostRegisterReadOnly to register memory that must be mapped as
-        read-only to the GPU
+        Device supports using the :func:`~.cudaHostRegister` flag
+        :py:obj:`~.cudaHostRegisterReadOnly` to register memory that must
+        be mapped as read-only to the GPU
     {{endif}}
     {{if 'cudaDeviceProp.timelineSemaphoreInteropSupported' in found_struct}}
     timelineSemaphoreInteropSupported : int
@@ -2315,8 +2327,8 @@ cdef class cudaDeviceProp:
     {{endif}}
     {{if 'cudaDeviceProp.memoryPoolsSupported' in found_struct}}
     memoryPoolsSupported : int
-        1 if the device supports using the cudaMallocAsync and cudaMemPool
-        family of APIs, 0 otherwise
+        1 if the device supports using the :func:`~.cudaMallocAsync` and
+        cudaMemPool family of APIs, 0 otherwise
     {{endif}}
     {{if 'cudaDeviceProp.gpuDirectRDMASupported' in found_struct}}
     gpuDirectRDMASupported : int
@@ -2325,11 +2337,12 @@ cdef class cudaDeviceProp:
     {{if 'cudaDeviceProp.gpuDirectRDMAFlushWritesOptions' in found_struct}}
     gpuDirectRDMAFlushWritesOptions : unsigned int
         Bitmask to be interpreted according to the
-        cudaFlushGPUDirectRDMAWritesOptions enum
+        :py:obj:`~.cudaFlushGPUDirectRDMAWritesOptions` enum
     {{endif}}
     {{if 'cudaDeviceProp.gpuDirectRDMAWritesOrdering' in found_struct}}
     gpuDirectRDMAWritesOrdering : int
-        See the cudaGPUDirectRDMAWritesOrdering enum for numerical values
+        See the :py:obj:`~.cudaGPUDirectRDMAWritesOrdering` enum for
+        numerical values
     {{endif}}
     {{if 'cudaDeviceProp.memoryPoolSupportedHandleTypes' in found_struct}}
     memoryPoolSupportedHandleTypes : unsigned int
@@ -2355,7 +2368,7 @@ cdef class cudaDeviceProp:
     {{if 'cudaDeviceProp.deviceNumaConfig' in found_struct}}
     deviceNumaConfig : int
         NUMA configuration of a device: value is of type
-        cudaDeviceNumaConfig enum
+        :py:obj:`~.cudaDeviceNumaConfig` enum
     {{endif}}
     {{if 'cudaDeviceProp.deviceNumaId' in found_struct}}
     deviceNumaId : int
@@ -2530,7 +2543,7 @@ cdef class cudaExternalMemoryHandleDesc:
     Attributes
     ----------
     {{if 'cudaExternalMemoryHandleDesc.type' in found_struct}}
-    type : cudaExternalMemoryHandleType
+    type : :py:obj:`~.cudaExternalMemoryHandleType`
         Type of the handle
     {{endif}}
     {{if 'cudaExternalMemoryHandleDesc.handle' in found_struct}}
@@ -2543,7 +2556,8 @@ cdef class cudaExternalMemoryHandleDesc:
     {{endif}}
     {{if 'cudaExternalMemoryHandleDesc.flags' in found_struct}}
     flags : unsigned int
-        Flags must either be zero or cudaExternalMemoryDedicated
+        Flags must either be zero or
+        :py:obj:`~.cudaExternalMemoryDedicated`
     {{endif}}
     {{if 'cudaExternalMemoryHandleDesc.reserved' in found_struct}}
     reserved : list[unsigned int]
@@ -2608,17 +2622,17 @@ cdef class cudaExternalMemoryMipmappedArrayDesc:
         chain is.
     {{endif}}
     {{if 'cudaExternalMemoryMipmappedArrayDesc.formatDesc' in found_struct}}
-    formatDesc : cudaChannelFormatDesc
+    formatDesc : :py:obj:`~.cudaChannelFormatDesc`
         Format of base level of the mipmap chain
     {{endif}}
     {{if 'cudaExternalMemoryMipmappedArrayDesc.extent' in found_struct}}
-    extent : cudaExtent
+    extent : :py:obj:`~.cudaExtent`
         Dimensions of base level of the mipmap chain
     {{endif}}
     {{if 'cudaExternalMemoryMipmappedArrayDesc.flags' in found_struct}}
     flags : unsigned int
         Flags associated with CUDA mipmapped arrays. See
-        cudaMallocMipmappedArray
+        :func:`~.cudaMallocMipmappedArray`
     {{endif}}
     {{if 'cudaExternalMemoryMipmappedArrayDesc.numLevels' in found_struct}}
     numLevels : unsigned int
@@ -2712,7 +2726,7 @@ cdef class cudaExternalSemaphoreHandleDesc:
     Attributes
     ----------
     {{if 'cudaExternalSemaphoreHandleDesc.type' in found_struct}}
-    type : cudaExternalSemaphoreHandleType
+    type : :py:obj:`~.cudaExternalSemaphoreHandleType`
         Type of the handle
     {{endif}}
     {{if 'cudaExternalSemaphoreHandleDesc.handle' in found_struct}}
@@ -2853,14 +2867,16 @@ cdef class cudaExternalSemaphoreSignalParams:
     {{endif}}
     {{if 'cudaExternalSemaphoreSignalParams.flags' in found_struct}}
     flags : unsigned int
-        Only when cudaExternalSemaphoreSignalParams is used to signal a
-        cudaExternalSemaphore_t of type
-        cudaExternalSemaphoreHandleTypeNvSciSync, the valid flag is
-        cudaExternalSemaphoreSignalSkipNvSciBufMemSync: which indicates
-        that while signaling the cudaExternalSemaphore_t, no memory
-        synchronization operations should be performed for any external
-        memory object imported as cudaExternalMemoryHandleTypeNvSciBuf. For
-        all other types of cudaExternalSemaphore_t, flags must be zero.
+        Only when :py:obj:`~.cudaExternalSemaphoreSignalParams` is used to
+        signal a :py:obj:`~.cudaExternalSemaphore_t` of type
+        :py:obj:`~.cudaExternalSemaphoreHandleTypeNvSciSync`, the valid
+        flag is :py:obj:`~.cudaExternalSemaphoreSignalSkipNvSciBufMemSync`:
+        which indicates that while signaling the
+        :py:obj:`~.cudaExternalSemaphore_t`, no memory synchronization
+        operations should be performed for any external memory object
+        imported as :py:obj:`~.cudaExternalMemoryHandleTypeNvSciBuf`. For
+        all other types of :py:obj:`~.cudaExternalSemaphore_t`, flags must
+        be zero.
     {{endif}}
     {{if 'cudaExternalSemaphoreSignalParams.reserved' in found_struct}}
     reserved : list[unsigned int]
@@ -2996,14 +3012,16 @@ cdef class cudaExternalSemaphoreWaitParams:
     {{endif}}
     {{if 'cudaExternalSemaphoreWaitParams.flags' in found_struct}}
     flags : unsigned int
-        Only when cudaExternalSemaphoreSignalParams is used to signal a
-        cudaExternalSemaphore_t of type
-        cudaExternalSemaphoreHandleTypeNvSciSync, the valid flag is
-        cudaExternalSemaphoreSignalSkipNvSciBufMemSync: which indicates
-        that while waiting for the cudaExternalSemaphore_t, no memory
-        synchronization operations should be performed for any external
-        memory object imported as cudaExternalMemoryHandleTypeNvSciBuf. For
-        all other types of cudaExternalSemaphore_t, flags must be zero.
+        Only when :py:obj:`~.cudaExternalSemaphoreSignalParams` is used to
+        signal a :py:obj:`~.cudaExternalSemaphore_t` of type
+        :py:obj:`~.cudaExternalSemaphoreHandleTypeNvSciSync`, the valid
+        flag is :py:obj:`~.cudaExternalSemaphoreSignalSkipNvSciBufMemSync`:
+        which indicates that while waiting for the
+        :py:obj:`~.cudaExternalSemaphore_t`, no memory synchronization
+        operations should be performed for any external memory object
+        imported as :py:obj:`~.cudaExternalMemoryHandleTypeNvSciBuf`. For
+        all other types of :py:obj:`~.cudaExternalSemaphore_t`, flags must
+        be zero.
     {{endif}}
     {{if 'cudaExternalSemaphoreWaitParams.reserved' in found_struct}}
     reserved : list[unsigned int]
@@ -3049,7 +3067,7 @@ cdef class cudaDevSmResource:
     {{if 'cudaDevSmResource.flags' in found_struct}}
     flags : unsigned int
         The flags set on this SM resource. For available flags see
-        cudaDevSmResourceGroup_flags.
+        :py:obj:`~.cudaDevSmResourceGroup_flags`.
     {{endif}}
 
     Methods
@@ -3077,7 +3095,7 @@ cdef class cudaDevWorkqueueConfigResource:
         The expected maximum number of concurrent stream-ordered workloads
     {{endif}}
     {{if 'cudaDevWorkqueueConfigResource.sharingScope' in found_struct}}
-    sharingScope : cudaDevWorkqueueConfigScope
+    sharingScope : :py:obj:`~.cudaDevWorkqueueConfigScope`
         The sharing scope for the workqueue resources
     {{endif}}
 
@@ -3134,8 +3152,8 @@ cdef class cudaDevSmResourceGroupParams_st:
     {{endif}}
     {{if 'cudaDevSmResourceGroupParams_st.flags' in found_struct}}
     flags : unsigned int
-        Combination of ``cudaDevSmResourceGroup_flags`` values to indicate
-        this this group is created.
+        Combination of :py:obj:`~.cudaDevSmResourceGroup_flags` values to
+        indicate how this group is created.
     {{endif}}
     {{if 'cudaDevSmResourceGroupParams_st.reserved' in found_struct}}
     reserved : list[unsigned int]
@@ -3160,20 +3178,22 @@ cdef class cudaDevResource_st:
     union structcudaDevSmResourcesm;
     structcudaDevWorkqueueConfigResourcewqConfig;
     structcudaDevWorkqueueResourcewq; ; ;  - If ``typename`` is
-    ``cudaDevResourceTypeInvalid``, this resoure is not valid and
-    cannot be further accessed.    - If ``typename`` is
-    ``cudaDevResourceTypeSm``, the cudaDevSmResource structure ``sm``
-    is filled in. For example, ``sm.smCount`` will reflect the amount
-    of streaming multiprocessors available in this resource.    - If
-    ``typename`` is ``cudaDevResourceTypeWorkqueueConfig``, the
-    cudaDevWorkqueueConfigResource structure ``wqConfig`` is filled in.
-    - If ``typename`` is ``cudaDevResourceTypeWorkqueue``, the
-    cudaDevWorkqueueResource structure ``wq`` is filled in.
+    :py:obj:`~.cudaDevResourceTypeInvalid`, this resource is not valid
+    and cannot be further accessed.    - If ``typename`` is
+    :py:obj:`~.cudaDevResourceTypeSm`, the
+    :py:obj:`~.cudaDevSmResource` structure ``sm`` is filled in. For
+    example, ``sm.smCount`` will reflect the amount of streaming
+    multiprocessors available in this resource.    - If ``typename`` is
+    :py:obj:`~.cudaDevResourceTypeWorkqueueConfig`, the
+    :py:obj:`~.cudaDevWorkqueueConfigResource` structure ``wqConfig``
+    is filled in.    - If ``typename`` is
+    :py:obj:`~.cudaDevResourceTypeWorkqueue`, the
+    :py:obj:`~.cudaDevWorkqueueResource` structure ``wq`` is filled in.
 
     Attributes
     ----------
     {{if 'cudaDevResource_st.type' in found_struct}}
-    type : cudaDevResourceType
+    type : :py:obj:`~.cudaDevResourceType`
         Type of resource, dictates which union field was last set
     {{endif}}
     {{if 'cudaDevResource_st._internal_padding' in found_struct}}
@@ -3181,17 +3201,18 @@ cdef class cudaDevResource_st:
 
     {{endif}}
     {{if 'cudaDevResource_st.sm' in found_struct}}
-    sm : cudaDevSmResource
-        Resource corresponding to cudaDevResourceTypeSm ``typename``.
+    sm : :py:obj:`~.cudaDevSmResource`
+        Resource corresponding to :py:obj:`~.cudaDevResourceTypeSm`
+        ``typename``.
     {{endif}}
     {{if 'cudaDevResource_st.wqConfig' in found_struct}}
-    wqConfig : cudaDevWorkqueueConfigResource
-        Resource corresponding to cudaDevResourceTypeWorkqueueConfig
-        ``typename``.
+    wqConfig : :py:obj:`~.cudaDevWorkqueueConfigResource`
+        Resource corresponding to
+        :py:obj:`~.cudaDevResourceTypeWorkqueueConfig` ``typename``.
     {{endif}}
     {{if 'cudaDevResource_st.wq' in found_struct}}
-    wq : cudaDevWorkqueueResource
-        Resource corresponding to cudaDevResourceTypeWorkqueue
+    wq : :py:obj:`~.cudaDevWorkqueueResource`
+        Resource corresponding to :py:obj:`~.cudaDevResourceTypeWorkqueue`
         ``typename``.
     {{endif}}
     {{if 'cudaDevResource_st._oversize' in found_struct}}
@@ -3199,7 +3220,7 @@ cdef class cudaDevResource_st:
 
     {{endif}}
     {{if 'cudaDevResource_st.nextResource' in found_struct}}
-    nextResource : cudaDevResource_st
+    nextResource : :py:obj:`~.cudaDevResource_st`
 
     {{endif}}
 
@@ -3274,11 +3295,11 @@ cdef class cudaKernelNodeParams:
         Kernel to launch
     {{endif}}
     {{if 'cudaKernelNodeParams.gridDim' in found_struct}}
-    gridDim : dim3
+    gridDim : :py:obj:`~.dim3`
         Grid dimensions
     {{endif}}
     {{if 'cudaKernelNodeParams.blockDim' in found_struct}}
-    blockDim : dim3
+    blockDim : :py:obj:`~.dim3`
         Block dimensions
     {{endif}}
     {{if 'cudaKernelNodeParams.sharedMemBytes' in found_struct}}
@@ -3327,19 +3348,19 @@ cdef class cudaKernelNodeParamsV2:
         functionType = cudaKernelFucntionTypeDevice
     {{endif}}
     {{if 'cudaKernelNodeParamsV2.kern' in found_struct}}
-    kern : cudaKernel_t
+    kern : :py:obj:`~.cudaKernel_t`
         functionType = cudaKernelFucntionTypeKernel
     {{endif}}
     {{if 'cudaKernelNodeParamsV2.cuFunc' in found_struct}}
-    cuFunc : cudaFunction_t
+    cuFunc : :py:obj:`~.cudaFunction_t`
         functionType = cudaKernelFucntionTypeFunction
     {{endif}}
     {{if 'cudaKernelNodeParamsV2.gridDim' in found_struct}}
-    gridDim : dim3
+    gridDim : :py:obj:`~.dim3`
         Grid dimensions
     {{endif}}
     {{if 'cudaKernelNodeParamsV2.blockDim' in found_struct}}
-    blockDim : dim3
+    blockDim : :py:obj:`~.dim3`
         Block dimensions
     {{endif}}
     {{if 'cudaKernelNodeParamsV2.sharedMemBytes' in found_struct}}
@@ -3355,12 +3376,12 @@ cdef class cudaKernelNodeParamsV2:
         Pointer to kernel arguments in the "extra" format
     {{endif}}
     {{if 'cudaKernelNodeParamsV2.ctx' in found_struct}}
-    ctx : cudaExecutionContext_t
+    ctx : :py:obj:`~.cudaExecutionContext_t`
         Context in which to run the kernel. If NULL will try to use the
         current context.
     {{endif}}
     {{if 'cudaKernelNodeParamsV2.functionType' in found_struct}}
-    functionType : cudaKernelFunctionType
+    functionType : :py:obj:`~.cudaKernelFunctionType`
         Type of handle passed in the func/kern/cuFunc union above
     {{endif}}
 
@@ -3402,11 +3423,11 @@ cdef class cudaExternalSemaphoreSignalNodeParams:
     Attributes
     ----------
     {{if 'cudaExternalSemaphoreSignalNodeParams.extSemArray' in found_struct}}
-    extSemArray : cudaExternalSemaphore_t
+    extSemArray : :py:obj:`~.cudaExternalSemaphore_t`
         Array of external semaphore handles.
     {{endif}}
     {{if 'cudaExternalSemaphoreSignalNodeParams.paramsArray' in found_struct}}
-    paramsArray : cudaExternalSemaphoreSignalParams
+    paramsArray : :py:obj:`~.cudaExternalSemaphoreSignalParams`
         Array of external semaphore signal parameters.
     {{endif}}
     {{if 'cudaExternalSemaphoreSignalNodeParams.numExtSems' in found_struct}}
@@ -3440,11 +3461,11 @@ cdef class cudaExternalSemaphoreSignalNodeParamsV2:
     Attributes
     ----------
     {{if 'cudaExternalSemaphoreSignalNodeParamsV2.extSemArray' in found_struct}}
-    extSemArray : cudaExternalSemaphore_t
+    extSemArray : :py:obj:`~.cudaExternalSemaphore_t`
         Array of external semaphore handles.
     {{endif}}
     {{if 'cudaExternalSemaphoreSignalNodeParamsV2.paramsArray' in found_struct}}
-    paramsArray : cudaExternalSemaphoreSignalParams
+    paramsArray : :py:obj:`~.cudaExternalSemaphoreSignalParams`
         Array of external semaphore signal parameters.
     {{endif}}
     {{if 'cudaExternalSemaphoreSignalNodeParamsV2.numExtSems' in found_struct}}
@@ -3478,11 +3499,11 @@ cdef class cudaExternalSemaphoreWaitNodeParams:
     Attributes
     ----------
     {{if 'cudaExternalSemaphoreWaitNodeParams.extSemArray' in found_struct}}
-    extSemArray : cudaExternalSemaphore_t
+    extSemArray : :py:obj:`~.cudaExternalSemaphore_t`
         Array of external semaphore handles.
     {{endif}}
     {{if 'cudaExternalSemaphoreWaitNodeParams.paramsArray' in found_struct}}
-    paramsArray : cudaExternalSemaphoreWaitParams
+    paramsArray : :py:obj:`~.cudaExternalSemaphoreWaitParams`
         Array of external semaphore wait parameters.
     {{endif}}
     {{if 'cudaExternalSemaphoreWaitNodeParams.numExtSems' in found_struct}}
@@ -3516,11 +3537,11 @@ cdef class cudaExternalSemaphoreWaitNodeParamsV2:
     Attributes
     ----------
     {{if 'cudaExternalSemaphoreWaitNodeParamsV2.extSemArray' in found_struct}}
-    extSemArray : cudaExternalSemaphore_t
+    extSemArray : :py:obj:`~.cudaExternalSemaphore_t`
         Array of external semaphore handles.
     {{endif}}
     {{if 'cudaExternalSemaphoreWaitNodeParamsV2.paramsArray' in found_struct}}
-    paramsArray : cudaExternalSemaphoreWaitParams
+    paramsArray : :py:obj:`~.cudaExternalSemaphoreWaitParams`
         Array of external semaphore wait parameters.
     {{endif}}
     {{if 'cudaExternalSemaphoreWaitNodeParamsV2.numExtSems' in found_struct}}
@@ -3554,22 +3575,23 @@ cdef class cudaConditionalNodeParams:
     Attributes
     ----------
     {{if 'cudaConditionalNodeParams.handle' in found_struct}}
-    handle : cudaGraphConditionalHandle
+    handle : :py:obj:`~.cudaGraphConditionalHandle`
         Conditional node handle. Handles must be created in advance of
-        creating the node using cudaGraphConditionalHandleCreate.
+        creating the node using :func:`~.cudaGraphConditionalHandleCreate`.
     {{endif}}
     {{if 'cudaConditionalNodeParams.type' in found_struct}}
-    type : cudaGraphConditionalNodeType
+    type : :py:obj:`~.cudaGraphConditionalNodeType`
         Type of conditional node.
     {{endif}}
     {{if 'cudaConditionalNodeParams.size' in found_struct}}
     size : unsigned int
         Size of graph output array. Allowed values are 1 for
-        cudaGraphCondTypeWhile, 1 or 2 for cudaGraphCondTypeIf, or any
-        value greater than zero for cudaGraphCondTypeSwitch.
+        :py:obj:`~.cudaGraphCondTypeWhile`, 1 or 2 for
+        :py:obj:`~.cudaGraphCondTypeIf`, or any value greater than zero for
+        :py:obj:`~.cudaGraphCondTypeSwitch`.
     {{endif}}
     {{if 'cudaConditionalNodeParams.phGraph_out' in found_struct}}
-    phGraph_out : cudaGraph_t
+    phGraph_out : :py:obj:`~.cudaGraph_t`
         CUDA-owned array populated with conditional node child graphs
         during creation of the node. Valid for the lifetime of the
         conditional node. The contents of the graph(s) are subject to the
@@ -3579,16 +3601,18 @@ cdef class cudaConditionalNodeParams:
         - All kernels, including kernels in nested conditionals or child
         graphs at any level, must belong to the same CUDA context.
         These graphs may be populated using graph node creation APIs or
-        cudaStreamBeginCaptureToGraph. cudaGraphCondTypeIf: phGraph_out[0]
-        is executed when the condition is non-zero. If ``size`` == 2,
-        phGraph_out[1] will be executed when the condition is zero.
-        cudaGraphCondTypeWhile: phGraph_out[0] is executed as long as the
-        condition is non-zero. cudaGraphCondTypeSwitch: phGraph_out[n] is
-        executed when the condition is equal to n. If the condition >=
-        ``size``, no body graph is executed.
+        :func:`~.cudaStreamBeginCaptureToGraph`.
+        :py:obj:`~.cudaGraphCondTypeIf`: phGraph_out[0] is executed when
+        the condition is non-zero. If ``size`` == 2, phGraph_out[1] will be
+        executed when the condition is zero.
+        :py:obj:`~.cudaGraphCondTypeWhile`: phGraph_out[0] is executed as
+        long as the condition is non-zero.
+        :py:obj:`~.cudaGraphCondTypeSwitch`: phGraph_out[n] is executed
+        when the condition is equal to n. If the condition >= ``size``, no
+        body graph is executed.
     {{endif}}
     {{if 'cudaConditionalNodeParams.ctx' in found_struct}}
-    ctx : cudaExecutionContext_t
+    ctx : :py:obj:`~.cudaExecutionContext_t`
         CUDA Execution Context
     {{endif}}
 
@@ -3619,7 +3643,7 @@ cdef class cudaChildGraphNodeParams:
     Attributes
     ----------
     {{if 'cudaChildGraphNodeParams.graph' in found_struct}}
-    graph : cudaGraph_t
+    graph : :py:obj:`~.cudaGraph_t`
         The child graph to clone into the node for node creation, or a
         handle to the graph owned by the node for node query. The graph
         must not contain conditional nodes. Graphs containing memory
@@ -3627,7 +3651,7 @@ cdef class cudaChildGraphNodeParams:
         to the parent.
     {{endif}}
     {{if 'cudaChildGraphNodeParams.ownership' in found_struct}}
-    ownership : cudaGraphChildGraphNodeOwnership
+    ownership : :py:obj:`~.cudaGraphChildGraphNodeOwnership`
         The ownership relationship of the child graph node.
     {{endif}}
 
@@ -3651,7 +3675,7 @@ cdef class cudaEventRecordNodeParams:
     Attributes
     ----------
     {{if 'cudaEventRecordNodeParams.event' in found_struct}}
-    event : cudaEvent_t
+    event : :py:obj:`~.cudaEvent_t`
         The event to record when the node executes
     {{endif}}
 
@@ -3675,7 +3699,7 @@ cdef class cudaEventWaitNodeParams:
     Attributes
     ----------
     {{if 'cudaEventWaitNodeParams.event' in found_struct}}
-    event : cudaEvent_t
+    event : :py:obj:`~.cudaEvent_t`
         The event to wait on from the node
     {{endif}}
 
@@ -3694,12 +3718,12 @@ cdef class cudaEventWaitNodeParams:
 
 cdef class cudaGraphNodeParams:
     """
-    Graph node parameters. See cudaGraphAddNode.
+    Graph node parameters. See :func:`~.cudaGraphAddNode`.
 
     Attributes
     ----------
     {{if 'cudaGraphNodeParams.type' in found_struct}}
-    type : cudaGraphNodeType
+    type : :py:obj:`~.cudaGraphNodeType`
         Type of the node
     {{endif}}
     {{if 'cudaGraphNodeParams.reserved0' in found_struct}}
@@ -3711,51 +3735,51 @@ cdef class cudaGraphNodeParams:
         Padding. Unused bytes must be zero.
     {{endif}}
     {{if 'cudaGraphNodeParams.kernel' in found_struct}}
-    kernel : cudaKernelNodeParamsV2
+    kernel : :py:obj:`~.cudaKernelNodeParamsV2`
         Kernel node parameters.
     {{endif}}
     {{if 'cudaGraphNodeParams.memcpy' in found_struct}}
-    memcpy : cudaMemcpyNodeParams
+    memcpy : :py:obj:`~.cudaMemcpyNodeParams`
         Memcpy node parameters.
     {{endif}}
     {{if 'cudaGraphNodeParams.memset' in found_struct}}
-    memset : cudaMemsetParamsV2
+    memset : :py:obj:`~.cudaMemsetParamsV2`
         Memset node parameters.
     {{endif}}
     {{if 'cudaGraphNodeParams.host' in found_struct}}
-    host : cudaHostNodeParamsV2
+    host : :py:obj:`~.cudaHostNodeParamsV2`
         Host node parameters.
     {{endif}}
     {{if 'cudaGraphNodeParams.graph' in found_struct}}
-    graph : cudaChildGraphNodeParams
+    graph : :py:obj:`~.cudaChildGraphNodeParams`
         Child graph node parameters.
     {{endif}}
     {{if 'cudaGraphNodeParams.eventWait' in found_struct}}
-    eventWait : cudaEventWaitNodeParams
+    eventWait : :py:obj:`~.cudaEventWaitNodeParams`
         Event wait node parameters.
     {{endif}}
     {{if 'cudaGraphNodeParams.eventRecord' in found_struct}}
-    eventRecord : cudaEventRecordNodeParams
+    eventRecord : :py:obj:`~.cudaEventRecordNodeParams`
         Event record node parameters.
     {{endif}}
     {{if 'cudaGraphNodeParams.extSemSignal' in found_struct}}
-    extSemSignal : cudaExternalSemaphoreSignalNodeParamsV2
+    extSemSignal : :py:obj:`~.cudaExternalSemaphoreSignalNodeParamsV2`
         External semaphore signal node parameters.
     {{endif}}
     {{if 'cudaGraphNodeParams.extSemWait' in found_struct}}
-    extSemWait : cudaExternalSemaphoreWaitNodeParamsV2
+    extSemWait : :py:obj:`~.cudaExternalSemaphoreWaitNodeParamsV2`
         External semaphore wait node parameters.
     {{endif}}
     {{if 'cudaGraphNodeParams.alloc' in found_struct}}
-    alloc : cudaMemAllocNodeParamsV2
+    alloc : :py:obj:`~.cudaMemAllocNodeParamsV2`
         Memory allocation node parameters.
     {{endif}}
     {{if 'cudaGraphNodeParams.free' in found_struct}}
-    free : cudaMemFreeNodeParams
+    free : :py:obj:`~.cudaMemFreeNodeParams`
         Memory free node parameters.
     {{endif}}
     {{if 'cudaGraphNodeParams.conditional' in found_struct}}
-    conditional : cudaConditionalNodeParams
+    conditional : :py:obj:`~.cudaConditionalNodeParams`
         Conditional node parameters.
     {{endif}}
     {{if 'cudaGraphNodeParams.reserved2' in found_struct}}
@@ -3813,8 +3837,8 @@ cdef class cudaGraphEdgeData_st:
     """
     Optional annotation for edges in a CUDA graph. Note, all edges
     implicitly have annotations and default to a zero-initialized value
-    if not specified. A zero-initialized struct indicates a standard
-    full serialization of two nodes with memory visibility.
+    if not specified. A zero-initialized struct indicates a
+    standard full serialization of two nodes with memory visibility.
 
     Attributes
     ----------
@@ -3826,9 +3850,9 @@ cdef class cudaGraphEdgeData_st:
         memory visibility to the downstream node or portion thereof
         (indicated by ``to_port``).   Only kernel nodes define non-zero
         ports. A kernel node can use the following output port types:
-        cudaGraphKernelNodePortDefault,
-        cudaGraphKernelNodePortProgrammatic, or
-        cudaGraphKernelNodePortLaunchCompletion.
+        :py:obj:`~.cudaGraphKernelNodePortDefault`,
+        :py:obj:`~.cudaGraphKernelNodePortProgrammatic`, or
+        :py:obj:`~.cudaGraphKernelNodePortLaunchCompletion`.
     {{endif}}
     {{if 'cudaGraphEdgeData_st.to_port' in found_struct}}
     to_port : bytes
@@ -3841,9 +3865,10 @@ cdef class cudaGraphEdgeData_st:
     {{endif}}
     {{if 'cudaGraphEdgeData_st.type' in found_struct}}
     type : bytes
-        This should be populated with a value from cudaGraphDependencyType.
-        (It is typed as char due to compiler-specific layout of bitfields.)
-        See cudaGraphDependencyType.
+        This should be populated with a value from
+        :py:obj:`~.cudaGraphDependencyType`. (It is typed as char due to
+        compiler-specific layout of bitfields.) See
+        :py:obj:`~.cudaGraphDependencyType`.
     {{endif}}
     {{if 'cudaGraphEdgeData_st.reserved' in found_struct}}
     reserved : bytes
@@ -3872,15 +3897,15 @@ cdef class cudaGraphInstantiateParams_st:
         Instantiation flags
     {{endif}}
     {{if 'cudaGraphInstantiateParams_st.uploadStream' in found_struct}}
-    uploadStream : cudaStream_t
+    uploadStream : :py:obj:`~.cudaStream_t`
         Upload stream
     {{endif}}
     {{if 'cudaGraphInstantiateParams_st.errNode_out' in found_struct}}
-    errNode_out : cudaGraphNode_t
+    errNode_out : :py:obj:`~.cudaGraphNode_t`
         The node which caused instantiation to fail, if any
     {{endif}}
     {{if 'cudaGraphInstantiateParams_st.result_out' in found_struct}}
-    result_out : cudaGraphInstantiateResult
+    result_out : :py:obj:`~.cudaGraphInstantiateResult`
         Whether instantiation was successful. If it failed, the reason why
     {{endif}}
 
@@ -3902,22 +3927,22 @@ cdef class cudaGraphInstantiateParams_st:
 
 cdef class cudaGraphExecUpdateResultInfo_st:
     """
-    Result information returned by cudaGraphExecUpdate
+    Result information returned by :func:`~.cudaGraphExecUpdate`
 
     Attributes
     ----------
     {{if 'cudaGraphExecUpdateResultInfo_st.result' in found_struct}}
-    result : cudaGraphExecUpdateResult
+    result : :py:obj:`~.cudaGraphExecUpdateResult`
         Gives more specific detail when a cuda graph update fails.
     {{endif}}
     {{if 'cudaGraphExecUpdateResultInfo_st.errorNode' in found_struct}}
-    errorNode : cudaGraphNode_t
+    errorNode : :py:obj:`~.cudaGraphNode_t`
         The "to node" of the error edge when the topologies do not match.
         The error node when the error is associated with a specific node.
         NULL when the error is generic.
     {{endif}}
     {{if 'cudaGraphExecUpdateResultInfo_st.errorFromNode' in found_struct}}
-    errorFromNode : cudaGraphNode_t
+    errorFromNode : :py:obj:`~.cudaGraphNode_t`
         The from node of error edge when the topologies do not match.
         Otherwise NULL.
     {{endif}}
@@ -3972,7 +3997,7 @@ cdef class anon_union10:
     Attributes
     ----------
     {{if 'cudaGraphKernelNodeUpdate.updateData.gridDim' in found_struct}}
-    gridDim : dim3
+    gridDim : :py:obj:`~.dim3`
 
     {{endif}}
     {{if 'cudaGraphKernelNodeUpdate.updateData.param' in found_struct}}
@@ -4007,11 +4032,11 @@ cdef class cudaGraphKernelNodeUpdate:
     Attributes
     ----------
     {{if 'cudaGraphKernelNodeUpdate.node' in found_struct}}
-    node : cudaGraphDeviceNode_t
+    node : :py:obj:`~.cudaGraphDeviceNode_t`
         Node to update
     {{endif}}
     {{if 'cudaGraphKernelNodeUpdate.field' in found_struct}}
-    field : cudaGraphKernelNodeField
+    field : :py:obj:`~.cudaGraphKernelNodeField`
         Which type of update to apply. Determines how updateData is
         interpreted
     {{endif}}
@@ -4038,13 +4063,15 @@ cdef class cudaGraphKernelNodeUpdate:
 
 cdef class cudaLaunchMemSyncDomainMap_st:
     """
-    Memory Synchronization Domain map  See cudaLaunchMemSyncDomain.  By
-    default, kernels are launched in domain 0. Kernel launched with
-    cudaLaunchMemSyncDomainRemote will have a different domain ID. User
-    may also alter the domain ID with cudaLaunchMemSyncDomainMap for a
-    specific stream / graph node / kernel launch. See
-    cudaLaunchAttributeMemSyncDomainMap.  Domain ID range is available
-    through cudaDevAttrMemSyncDomainCount.
+    Memory Synchronization Domain map  See
+    :py:obj:`~.cudaLaunchMemSyncDomain`.  By default, kernels are
+    launched in domain 0. Kernel launched with
+    :py:obj:`~.cudaLaunchMemSyncDomainRemote` will have a different
+    domain ID. User may also alter the domain ID with
+    :py:obj:`~.cudaLaunchMemSyncDomainMap` for a specific stream /
+    graph node / kernel launch. See
+    :py:obj:`~.cudaLaunchAttributeMemSyncDomainMap`.  Domain ID range
+    is available through :py:obj:`~.cudaDevAttrMemSyncDomainCount`.
 
     Attributes
     ----------
@@ -4098,7 +4125,7 @@ cdef class anon_struct18:
     Attributes
     ----------
     {{if 'cudaLaunchAttributeValue.programmaticEvent.event' in found_struct}}
-    event : cudaEvent_t
+    event : :py:obj:`~.cudaEvent_t`
 
     {{endif}}
     {{if 'cudaLaunchAttributeValue.programmaticEvent.flags' in found_struct}}
@@ -4153,7 +4180,7 @@ cdef class anon_struct20:
     Attributes
     ----------
     {{if 'cudaLaunchAttributeValue.launchCompletionEvent.event' in found_struct}}
-    event : cudaEvent_t
+    event : :py:obj:`~.cudaEvent_t`
 
     {{endif}}
     {{if 'cudaLaunchAttributeValue.launchCompletionEvent.flags' in found_struct}}
@@ -4182,7 +4209,7 @@ cdef class anon_struct21:
 
     {{endif}}
     {{if 'cudaLaunchAttributeValue.deviceUpdatableKernelNode.devNode' in found_struct}}
-    devNode : cudaGraphDeviceNode_t
+    devNode : :py:obj:`~.cudaGraphDeviceNode_t`
 
     {{endif}}
 
@@ -4200,7 +4227,8 @@ cdef class anon_struct21:
 
 cdef class cudaLaunchAttributeValue:
     """
-    Launch attributes union; used as value field of cudaLaunchAttribute
+    Launch attributes union; used as value field of
+    :py:obj:`~.cudaLaunchAttribute`
 
     Attributes
     ----------
@@ -4209,116 +4237,126 @@ cdef class cudaLaunchAttributeValue:
 
     {{endif}}
     {{if 'cudaLaunchAttributeValue.accessPolicyWindow' in found_struct}}
-    accessPolicyWindow : cudaAccessPolicyWindow
-        Value of launch attribute cudaLaunchAttributeAccessPolicyWindow.
+    accessPolicyWindow : :py:obj:`~.cudaAccessPolicyWindow`
+        Value of launch attribute
+        :py:obj:`~.cudaLaunchAttributeAccessPolicyWindow`.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.cooperative' in found_struct}}
     cooperative : int
-        Value of launch attribute cudaLaunchAttributeCooperative. Nonzero
-        indicates a cooperative kernel (see cudaLaunchCooperativeKernel).
+        Value of launch attribute
+        :py:obj:`~.cudaLaunchAttributeCooperative`. Nonzero indicates a
+        cooperative kernel (see :func:`~.cudaLaunchCooperativeKernel`).
     {{endif}}
     {{if 'cudaLaunchAttributeValue.syncPolicy' in found_struct}}
-    syncPolicy : cudaSynchronizationPolicy
-        Value of launch attribute cudaLaunchAttributeSynchronizationPolicy.
-        cudaSynchronizationPolicy for work queued up in this stream.
+    syncPolicy : :py:obj:`~.cudaSynchronizationPolicy`
+        Value of launch attribute
+        :py:obj:`~.cudaLaunchAttributeSynchronizationPolicy`.
+        :py:obj:`~.cudaSynchronizationPolicy` for work queued up in this
+        stream.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.clusterDim' in found_struct}}
     clusterDim : anon_struct17
-        Value of launch attribute cudaLaunchAttributeClusterDimension that
-        represents the desired cluster dimensions for the kernel. Opaque
-        type with the following fields: - ``x`` - The X dimension of the
-        cluster, in blocks. Must be a divisor of the grid X dimension.    -
-        ``y`` - The Y dimension of the cluster, in blocks. Must be a
-        divisor of the grid Y dimension.    - ``z`` - The Z dimension of
-        the cluster, in blocks. Must be a divisor of the grid Z dimension.
+        Value of launch attribute
+        :py:obj:`~.cudaLaunchAttributeClusterDimension` that represents the
+        desired cluster dimensions for the kernel. Opaque type with the
+        following fields: - ``x`` - The X dimension of the cluster, in
+        blocks. Must be a divisor of the grid X dimension.    - ``y`` - The
+        Y dimension of the cluster, in blocks. Must be a divisor of the
+        grid Y dimension.    - ``z`` - The Z dimension of the cluster, in
+        blocks. Must be a divisor of the grid Z dimension.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.clusterSchedulingPolicyPreference' in found_struct}}
-    clusterSchedulingPolicyPreference : cudaClusterSchedulingPolicy
+    clusterSchedulingPolicyPreference : :py:obj:`~.cudaClusterSchedulingPolicy`
         Value of launch attribute
-        cudaLaunchAttributeClusterSchedulingPolicyPreference. Cluster
-        scheduling policy preference for the kernel.
+        :py:obj:`~.cudaLaunchAttributeClusterSchedulingPolicyPreference`.
+        Cluster scheduling policy preference for the kernel.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.programmaticStreamSerializationAllowed' in found_struct}}
     programmaticStreamSerializationAllowed : int
         Value of launch attribute
-        cudaLaunchAttributeProgrammaticStreamSerialization.
+        :py:obj:`~.cudaLaunchAttributeProgrammaticStreamSerialization`.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.programmaticEvent' in found_struct}}
     programmaticEvent : anon_struct18
-        Value of launch attribute cudaLaunchAttributeProgrammaticEvent with
-        the following fields: - ``cudaEvent_t`` event - Event to fire when
-        all blocks trigger it.    - ``int`` flags; - Event record flags,
-        see cudaEventRecordWithFlags. Does not accept
-        cudaEventRecordExternal.    - ``int`` triggerAtBlockStart - If this
-        is set to non-0, each block launch will automatically trigger the
-        event.
+        Value of launch attribute
+        :py:obj:`~.cudaLaunchAttributeProgrammaticEvent` with the following
+        fields: - :py:obj:`~.cudaEvent_t` event - Event to fire when all
+        blocks trigger it.    - ``int`` flags - Event record flags, see
+        :func:`~.cudaEventRecordWithFlags`. Does not accept
+        :py:obj:`~.cudaEventRecordExternal`.    - ``int``
+        triggerAtBlockStart - If this is set to non-0, each block launch
+        will automatically trigger the event.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.priority' in found_struct}}
     priority : int
-        Value of launch attribute cudaLaunchAttributePriority. Execution
-        priority of the kernel.
+        Value of launch attribute :py:obj:`~.cudaLaunchAttributePriority`.
+        Execution priority of the kernel.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.memSyncDomainMap' in found_struct}}
-    memSyncDomainMap : cudaLaunchMemSyncDomainMap
-        Value of launch attribute cudaLaunchAttributeMemSyncDomainMap. See
-        cudaLaunchMemSyncDomainMap.
+    memSyncDomainMap : :py:obj:`~.cudaLaunchMemSyncDomainMap`
+        Value of launch attribute
+        :py:obj:`~.cudaLaunchAttributeMemSyncDomainMap`. See
+        :py:obj:`~.cudaLaunchMemSyncDomainMap`.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.memSyncDomain' in found_struct}}
-    memSyncDomain : cudaLaunchMemSyncDomain
-        Value of launch attribute cudaLaunchAttributeMemSyncDomain. See
-        cudaLaunchMemSyncDomain.
+    memSyncDomain : :py:obj:`~.cudaLaunchMemSyncDomain`
+        Value of launch attribute
+        :py:obj:`~.cudaLaunchAttributeMemSyncDomain`. See
+        :py:obj:`~.cudaLaunchMemSyncDomain`.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.preferredClusterDim' in found_struct}}
     preferredClusterDim : anon_struct19
         Value of launch attribute
-        cudaLaunchAttributePreferredClusterDimension that represents the
-        desired preferred cluster dimensions for the kernel. Opaque type
-        with the following fields: - ``x`` - The X dimension of the
-        preferred cluster, in blocks. Must be a divisor of the grid X
+        :py:obj:`~.cudaLaunchAttributePreferredClusterDimension` that
+        represents the desired preferred cluster dimensions for the kernel.
+        Opaque type with the following fields: - ``x`` - The X dimension of
+        the preferred cluster, in blocks. Must be a divisor of the grid X
         dimension, and must be a multiple of the ``x`` field of
-        cudaLaunchAttributeValue::clusterDim.    - ``y`` - The Y dimension
-        of the preferred cluster, in blocks. Must be a divisor of the grid
-        Y dimension, and must be a multiple of the ``y`` field of
-        cudaLaunchAttributeValue::clusterDim.    - ``z`` - The Z dimension
-        of the preferred cluster, in blocks. Must be equal to the ``z``
-        field of cudaLaunchAttributeValue::clusterDim.
+        :py:obj:`~.cudaLaunchAttributeValue.clusterDim`.    - ``y`` - The Y
+        dimension of the preferred cluster, in blocks. Must be a divisor of
+        the grid Y dimension, and must be a multiple of the ``y`` field of
+        :py:obj:`~.cudaLaunchAttributeValue.clusterDim`.    - ``z`` - The Z
+        dimension of the preferred cluster, in blocks. Must be equal to the
+        ``z`` field of :py:obj:`~.cudaLaunchAttributeValue.clusterDim`.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.launchCompletionEvent' in found_struct}}
     launchCompletionEvent : anon_struct20
-        Value of launch attribute cudaLaunchAttributeLaunchCompletionEvent
-        with the following fields: - ``cudaEvent_t`` event - Event to fire
+        Value of launch attribute
+        :py:obj:`~.cudaLaunchAttributeLaunchCompletionEvent` with the
+        following fields: - :py:obj:`~.cudaEvent_t` event - Event to fire
         when the last block launches.    - ``int`` flags - Event record
-        flags, see cudaEventRecordWithFlags. Does not accept
-        cudaEventRecordExternal.
+        flags, see :func:`~.cudaEventRecordWithFlags`. Does not accept
+        :py:obj:`~.cudaEventRecordExternal`.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.deviceUpdatableKernelNode' in found_struct}}
     deviceUpdatableKernelNode : anon_struct21
         Value of launch attribute
-        cudaLaunchAttributeDeviceUpdatableKernelNode with the following
-        fields: - ``int`` deviceUpdatable - Whether or not the resulting
-        kernel node should be device-updatable.    -
-        ``cudaGraphDeviceNode_t`` devNode - Returns a handle to pass to the
-        various device-side update functions.
+        :py:obj:`~.cudaLaunchAttributeDeviceUpdatableKernelNode` with the
+        following fields: - ``int`` deviceUpdatable - Whether or not the
+        resulting kernel node should be device-updatable.    -
+        :py:obj:`~.cudaGraphDeviceNode_t` devNode - Returns a handle to
+        pass to the various device-side update functions.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.sharedMemCarveout' in found_struct}}
     sharedMemCarveout : unsigned int
         Value of launch attribute
-        cudaLaunchAttributePreferredSharedMemoryCarveout.
+        :py:obj:`~.cudaLaunchAttributePreferredSharedMemoryCarveout`.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.nvlinkUtilCentricScheduling' in found_struct}}
     nvlinkUtilCentricScheduling : unsigned int
         Value of launch attribute
-        cudaLaunchAttributeNvlinkUtilCentricScheduling.
+        :py:obj:`~.cudaLaunchAttributeNvlinkUtilCentricScheduling`.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.portableClusterSizeMode' in found_struct}}
-    portableClusterSizeMode : cudaLaunchAttributePortableClusterMode
+    portableClusterSizeMode : :py:obj:`~.cudaLaunchAttributePortableClusterMode`
         Value of launch attribute
-        cudaLaunchAttributePortableClusterSizeMode
+        :py:obj:`~.cudaLaunchAttributePortableClusterSizeMode`
     {{endif}}
     {{if 'cudaLaunchAttributeValue.sharedMemoryMode' in found_struct}}
-    sharedMemoryMode : cudaSharedMemoryMode
-        Value of launch attribute cudaLaunchAttributeSharedMemoryMode. See
-        cudaSharedMemoryMode for acceptable values.
+    sharedMemoryMode : :py:obj:`~.cudaSharedMemoryMode`
+        Value of launch attribute
+        :py:obj:`~.cudaLaunchAttributeSharedMemoryMode`. See
+        :py:obj:`~.cudaSharedMemoryMode` for acceptable values.
     {{endif}}
 
     Methods
@@ -4359,11 +4397,11 @@ cdef class cudaLaunchAttribute_st:
     Attributes
     ----------
     {{if 'cudaLaunchAttribute_st.id' in found_struct}}
-    id : cudaLaunchAttributeID
+    id : :py:obj:`~.cudaLaunchAttributeID`
         Attribute to set
     {{endif}}
     {{if 'cudaLaunchAttribute_st.val' in found_struct}}
-    val : cudaLaunchAttributeValue
+    val : :py:obj:`~.cudaLaunchAttributeValue`
         Value of the attribute
     {{endif}}
 
@@ -4426,7 +4464,7 @@ cdef class cudaAsyncNotificationInfo:
     Attributes
     ----------
     {{if 'cudaAsyncNotificationInfo.type' in found_struct}}
-    type : cudaAsyncNotificationType
+    type : :py:obj:`~.cudaAsyncNotificationType`
         The type of notification being sent
     {{endif}}
     {{if 'cudaAsyncNotificationInfo.info' in found_struct}}
@@ -4455,15 +4493,15 @@ cdef class cudaTextureDesc:
     Attributes
     ----------
     {{if 'cudaTextureDesc.addressMode' in found_struct}}
-    addressMode : list[cudaTextureAddressMode]
+    addressMode : list[:py:obj:`~.cudaTextureAddressMode`]
         Texture address mode for up to 3 dimensions
     {{endif}}
     {{if 'cudaTextureDesc.filterMode' in found_struct}}
-    filterMode : cudaTextureFilterMode
+    filterMode : :py:obj:`~.cudaTextureFilterMode`
         Texture filter mode
     {{endif}}
     {{if 'cudaTextureDesc.readMode' in found_struct}}
-    readMode : cudaTextureReadMode
+    readMode : :py:obj:`~.cudaTextureReadMode`
         Texture read mode
     {{endif}}
     {{if 'cudaTextureDesc.sRGB' in found_struct}}
@@ -4483,7 +4521,7 @@ cdef class cudaTextureDesc:
         Limit to the anisotropy ratio
     {{endif}}
     {{if 'cudaTextureDesc.mipmapFilterMode' in found_struct}}
-    mipmapFilterMode : cudaTextureFilterMode
+    mipmapFilterMode : :py:obj:`~.cudaTextureFilterMode`
         Mipmap filter mode
     {{endif}}
     {{if 'cudaTextureDesc.mipmapLevelBias' in found_struct}}
@@ -4525,7 +4563,7 @@ cdef class cudaGraphRecaptureCallbackData:
     Attributes
     ----------
     {{if 'cudaGraphRecaptureCallbackData.callbackFunc' in found_struct}}
-    callbackFunc : cudaGraphRecaptureCallback_t
+    callbackFunc : :py:obj:`~.cudaGraphRecaptureCallback_t`
         Callback function that will be invoked
     {{endif}}
     {{if 'cudaGraphRecaptureCallbackData.userData' in found_struct}}
@@ -4577,7 +4615,7 @@ cdef class cudaEglPlaneDesc_st:
         Number of channels for the plane
     {{endif}}
     {{if True}}
-    channelDesc : cudaChannelFormatDesc
+    channelDesc : :py:obj:`~.cudaChannelFormatDesc`
         Channel Format Descriptor
     {{endif}}
     {{if True}}
@@ -4603,11 +4641,11 @@ cdef class anon_union12:
     Attributes
     ----------
     {{if True}}
-    pArray : list[cudaArray_t]
+    pArray : list[:py:obj:`~.cudaArray_t`]
 
     {{endif}}
     {{if True}}
-    pPitch : list[cudaPitchedPtr]
+    pPitch : list[:py:obj:`~.cudaPitchedPtr`]
 
     {{endif}}
 
@@ -4625,11 +4663,11 @@ cdef class cudaEglFrame_st:
     CUDA EGLFrame Descriptor - structure defining one frame of EGL.
     Each frame may contain one or more planes depending on whether the
     surface is Multiplanar or not. Each plane of EGLFrame is
-    represented by cudaEglPlaneDesc which is defined as:
+    represented by :py:obj:`~.cudaEglPlaneDesc` which is defined as:
     typedefstructcudaEglPlaneDesc_st unsignedintwidth;
     unsignedintheight; unsignedintdepth; unsignedintpitch;
     unsignedintnumChannels; structcudaChannelFormatDescchannelDesc;
-    unsignedintreserved[4]; cudaEglPlaneDesc;
+    unsignedintreserved[4]; :py:obj:`~.cudaEglPlaneDesc`;
 
     Attributes
     ----------
@@ -4638,19 +4676,19 @@ cdef class cudaEglFrame_st:
 
     {{endif}}
     {{if True}}
-    planeDesc : list[cudaEglPlaneDesc]
-        CUDA EGL Plane Descriptor cudaEglPlaneDesc
+    planeDesc : list[:py:obj:`~.cudaEglPlaneDesc`]
+        CUDA EGL Plane Descriptor :py:obj:`~.cudaEglPlaneDesc`
     {{endif}}
     {{if True}}
     planeCount : unsigned int
         Number of planes
     {{endif}}
     {{if True}}
-    frameType : cudaEglFrameType
+    frameType : :py:obj:`~.cudaEglFrameType`
         Array or Pitch
     {{endif}}
     {{if True}}
-    eglColorFormat : cudaEglColorFormat
+    eglColorFormat : :py:obj:`~.cudaEglColorFormat`
         CUDA EGL Color Format
     {{endif}}
 
@@ -4783,8 +4821,8 @@ cdef class cudaDevSmResourceGroupParams(cudaDevSmResourceGroupParams_st):
     {{endif}}
     {{if 'cudaDevSmResourceGroupParams_st.flags' in found_struct}}
     flags : unsigned int
-        Combination of ``cudaDevSmResourceGroup_flags`` values to indicate
-        this this group is created.
+        Combination of :py:obj:`~.cudaDevSmResourceGroup_flags` values to
+        indicate how this group is created.
     {{endif}}
     {{if 'cudaDevSmResourceGroupParams_st.reserved' in found_struct}}
     reserved : list[unsigned int]
@@ -4808,20 +4846,22 @@ cdef class cudaDevResource(cudaDevResource_st):
     union structcudaDevSmResourcesm;
     structcudaDevWorkqueueConfigResourcewqConfig;
     structcudaDevWorkqueueResourcewq; ; ;  - If ``typename`` is
-    ``cudaDevResourceTypeInvalid``, this resoure is not valid and
-    cannot be further accessed.    - If ``typename`` is
-    ``cudaDevResourceTypeSm``, the cudaDevSmResource structure ``sm``
-    is filled in. For example, ``sm.smCount`` will reflect the amount
-    of streaming multiprocessors available in this resource.    - If
-    ``typename`` is ``cudaDevResourceTypeWorkqueueConfig``, the
-    cudaDevWorkqueueConfigResource structure ``wqConfig`` is filled in.
-    - If ``typename`` is ``cudaDevResourceTypeWorkqueue``, the
-    cudaDevWorkqueueResource structure ``wq`` is filled in.
+    :py:obj:`~.cudaDevResourceTypeInvalid`, this resource is not valid
+    and cannot be further accessed.    - If ``typename`` is
+    :py:obj:`~.cudaDevResourceTypeSm`, the
+    :py:obj:`~.cudaDevSmResource` structure ``sm`` is filled in. For
+    example, ``sm.smCount`` will reflect the amount of streaming
+    multiprocessors available in this resource.    - If ``typename`` is
+    :py:obj:`~.cudaDevResourceTypeWorkqueueConfig`, the
+    :py:obj:`~.cudaDevWorkqueueConfigResource` structure ``wqConfig``
+    is filled in.    - If ``typename`` is
+    :py:obj:`~.cudaDevResourceTypeWorkqueue`, the
+    :py:obj:`~.cudaDevWorkqueueResource` structure ``wq`` is filled in.
 
     Attributes
     ----------
     {{if 'cudaDevResource_st.type' in found_struct}}
-    type : cudaDevResourceType
+    type : :py:obj:`~.cudaDevResourceType`
         Type of resource, dictates which union field was last set
     {{endif}}
     {{if 'cudaDevResource_st._internal_padding' in found_struct}}
@@ -4829,17 +4869,18 @@ cdef class cudaDevResource(cudaDevResource_st):
 
     {{endif}}
     {{if 'cudaDevResource_st.sm' in found_struct}}
-    sm : cudaDevSmResource
-        Resource corresponding to cudaDevResourceTypeSm ``typename``.
+    sm : :py:obj:`~.cudaDevSmResource`
+        Resource corresponding to :py:obj:`~.cudaDevResourceTypeSm`
+        ``typename``.
     {{endif}}
     {{if 'cudaDevResource_st.wqConfig' in found_struct}}
-    wqConfig : cudaDevWorkqueueConfigResource
-        Resource corresponding to cudaDevResourceTypeWorkqueueConfig
-        ``typename``.
+    wqConfig : :py:obj:`~.cudaDevWorkqueueConfigResource`
+        Resource corresponding to
+        :py:obj:`~.cudaDevResourceTypeWorkqueueConfig` ``typename``.
     {{endif}}
     {{if 'cudaDevResource_st.wq' in found_struct}}
-    wq : cudaDevWorkqueueResource
-        Resource corresponding to cudaDevResourceTypeWorkqueue
+    wq : :py:obj:`~.cudaDevWorkqueueResource`
+        Resource corresponding to :py:obj:`~.cudaDevResourceTypeWorkqueue`
         ``typename``.
     {{endif}}
     {{if 'cudaDevResource_st._oversize' in found_struct}}
@@ -4847,7 +4888,7 @@ cdef class cudaDevResource(cudaDevResource_st):
 
     {{endif}}
     {{if 'cudaDevResource_st.nextResource' in found_struct}}
-    nextResource : cudaDevResource_st
+    nextResource : :py:obj:`~.cudaDevResource_st`
 
     {{endif}}
 
@@ -4864,8 +4905,8 @@ cdef class cudaGraphEdgeData(cudaGraphEdgeData_st):
     """
     Optional annotation for edges in a CUDA graph. Note, all edges
     implicitly have annotations and default to a zero-initialized value
-    if not specified. A zero-initialized struct indicates a standard
-    full serialization of two nodes with memory visibility.
+    if not specified. A zero-initialized struct indicates a
+    standard full serialization of two nodes with memory visibility.
 
     Attributes
     ----------
@@ -4877,9 +4918,9 @@ cdef class cudaGraphEdgeData(cudaGraphEdgeData_st):
         memory visibility to the downstream node or portion thereof
         (indicated by ``to_port``).   Only kernel nodes define non-zero
         ports. A kernel node can use the following output port types:
-        cudaGraphKernelNodePortDefault,
-        cudaGraphKernelNodePortProgrammatic, or
-        cudaGraphKernelNodePortLaunchCompletion.
+        :py:obj:`~.cudaGraphKernelNodePortDefault`,
+        :py:obj:`~.cudaGraphKernelNodePortProgrammatic`, or
+        :py:obj:`~.cudaGraphKernelNodePortLaunchCompletion`.
     {{endif}}
     {{if 'cudaGraphEdgeData_st.to_port' in found_struct}}
     to_port : bytes
@@ -4892,9 +4933,10 @@ cdef class cudaGraphEdgeData(cudaGraphEdgeData_st):
     {{endif}}
     {{if 'cudaGraphEdgeData_st.type' in found_struct}}
     type : bytes
-        This should be populated with a value from cudaGraphDependencyType.
-        (It is typed as char due to compiler-specific layout of bitfields.)
-        See cudaGraphDependencyType.
+        This should be populated with a value from
+        :py:obj:`~.cudaGraphDependencyType`. (It is typed as char due to
+        compiler-specific layout of bitfields.) See
+        :py:obj:`~.cudaGraphDependencyType`.
     {{endif}}
     {{if 'cudaGraphEdgeData_st.reserved' in found_struct}}
     reserved : bytes
@@ -4922,15 +4964,15 @@ cdef class cudaGraphInstantiateParams(cudaGraphInstantiateParams_st):
         Instantiation flags
     {{endif}}
     {{if 'cudaGraphInstantiateParams_st.uploadStream' in found_struct}}
-    uploadStream : cudaStream_t
+    uploadStream : :py:obj:`~.cudaStream_t`
         Upload stream
     {{endif}}
     {{if 'cudaGraphInstantiateParams_st.errNode_out' in found_struct}}
-    errNode_out : cudaGraphNode_t
+    errNode_out : :py:obj:`~.cudaGraphNode_t`
         The node which caused instantiation to fail, if any
     {{endif}}
     {{if 'cudaGraphInstantiateParams_st.result_out' in found_struct}}
-    result_out : cudaGraphInstantiateResult
+    result_out : :py:obj:`~.cudaGraphInstantiateResult`
         Whether instantiation was successful. If it failed, the reason why
     {{endif}}
 
@@ -4945,22 +4987,22 @@ cdef class cudaGraphInstantiateParams(cudaGraphInstantiateParams_st):
 
 cdef class cudaGraphExecUpdateResultInfo(cudaGraphExecUpdateResultInfo_st):
     """
-    Result information returned by cudaGraphExecUpdate
+    Result information returned by :func:`~.cudaGraphExecUpdate`
 
     Attributes
     ----------
     {{if 'cudaGraphExecUpdateResultInfo_st.result' in found_struct}}
-    result : cudaGraphExecUpdateResult
+    result : :py:obj:`~.cudaGraphExecUpdateResult`
         Gives more specific detail when a cuda graph update fails.
     {{endif}}
     {{if 'cudaGraphExecUpdateResultInfo_st.errorNode' in found_struct}}
-    errorNode : cudaGraphNode_t
+    errorNode : :py:obj:`~.cudaGraphNode_t`
         The "to node" of the error edge when the topologies do not match.
         The error node when the error is associated with a specific node.
         NULL when the error is generic.
     {{endif}}
     {{if 'cudaGraphExecUpdateResultInfo_st.errorFromNode' in found_struct}}
-    errorFromNode : cudaGraphNode_t
+    errorFromNode : :py:obj:`~.cudaGraphNode_t`
         The from node of error edge when the topologies do not match.
         Otherwise NULL.
     {{endif}}
@@ -4976,13 +5018,15 @@ cdef class cudaGraphExecUpdateResultInfo(cudaGraphExecUpdateResultInfo_st):
 
 cdef class cudaLaunchMemSyncDomainMap(cudaLaunchMemSyncDomainMap_st):
     """
-    Memory Synchronization Domain map  See cudaLaunchMemSyncDomain.  By
-    default, kernels are launched in domain 0. Kernel launched with
-    cudaLaunchMemSyncDomainRemote will have a different domain ID. User
-    may also alter the domain ID with cudaLaunchMemSyncDomainMap for a
-    specific stream / graph node / kernel launch. See
-    cudaLaunchAttributeMemSyncDomainMap.  Domain ID range is available
-    through cudaDevAttrMemSyncDomainCount.
+    Memory Synchronization Domain map  See
+    :py:obj:`~.cudaLaunchMemSyncDomain`.  By default, kernels are
+    launched in domain 0. Kernel launched with
+    :py:obj:`~.cudaLaunchMemSyncDomainRemote` will have a different
+    domain ID. User may also alter the domain ID with
+    :py:obj:`~.cudaLaunchMemSyncDomainMap` for a specific stream /
+    graph node / kernel launch. See
+    :py:obj:`~.cudaLaunchAttributeMemSyncDomainMap`.  Domain ID range
+    is available through :py:obj:`~.cudaDevAttrMemSyncDomainCount`.
 
     Attributes
     ----------
@@ -5011,11 +5055,11 @@ cdef class cudaLaunchAttribute(cudaLaunchAttribute_st):
     Attributes
     ----------
     {{if 'cudaLaunchAttribute_st.id' in found_struct}}
-    id : cudaLaunchAttributeID
+    id : :py:obj:`~.cudaLaunchAttributeID`
         Attribute to set
     {{endif}}
     {{if 'cudaLaunchAttribute_st.val' in found_struct}}
-    val : cudaLaunchAttributeValue
+    val : :py:obj:`~.cudaLaunchAttributeValue`
         Value of the attribute
     {{endif}}
 
@@ -5035,7 +5079,7 @@ cdef class cudaAsyncNotificationInfo_t(cudaAsyncNotificationInfo):
     Attributes
     ----------
     {{if 'cudaAsyncNotificationInfo.type' in found_struct}}
-    type : cudaAsyncNotificationType
+    type : :py:obj:`~.cudaAsyncNotificationType`
         The type of notification being sent
     {{endif}}
     {{if 'cudaAsyncNotificationInfo.info' in found_struct}}
@@ -5055,7 +5099,8 @@ cdef class cudaAsyncNotificationInfo_t(cudaAsyncNotificationInfo):
 
 cdef class cudaStreamAttrValue(cudaLaunchAttributeValue):
     """
-    Launch attributes union; used as value field of cudaLaunchAttribute
+    Launch attributes union; used as value field of
+    :py:obj:`~.cudaLaunchAttribute`
 
     Attributes
     ----------
@@ -5064,116 +5109,126 @@ cdef class cudaStreamAttrValue(cudaLaunchAttributeValue):
 
     {{endif}}
     {{if 'cudaLaunchAttributeValue.accessPolicyWindow' in found_struct}}
-    accessPolicyWindow : cudaAccessPolicyWindow
-        Value of launch attribute cudaLaunchAttributeAccessPolicyWindow.
+    accessPolicyWindow : :py:obj:`~.cudaAccessPolicyWindow`
+        Value of launch attribute
+        :py:obj:`~.cudaLaunchAttributeAccessPolicyWindow`.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.cooperative' in found_struct}}
     cooperative : int
-        Value of launch attribute cudaLaunchAttributeCooperative. Nonzero
-        indicates a cooperative kernel (see cudaLaunchCooperativeKernel).
+        Value of launch attribute
+        :py:obj:`~.cudaLaunchAttributeCooperative`. Nonzero indicates a
+        cooperative kernel (see :func:`~.cudaLaunchCooperativeKernel`).
     {{endif}}
     {{if 'cudaLaunchAttributeValue.syncPolicy' in found_struct}}
-    syncPolicy : cudaSynchronizationPolicy
-        Value of launch attribute cudaLaunchAttributeSynchronizationPolicy.
-        cudaSynchronizationPolicy for work queued up in this stream.
+    syncPolicy : :py:obj:`~.cudaSynchronizationPolicy`
+        Value of launch attribute
+        :py:obj:`~.cudaLaunchAttributeSynchronizationPolicy`.
+        :py:obj:`~.cudaSynchronizationPolicy` for work queued up in this
+        stream.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.clusterDim' in found_struct}}
     clusterDim : anon_struct17
-        Value of launch attribute cudaLaunchAttributeClusterDimension that
-        represents the desired cluster dimensions for the kernel. Opaque
-        type with the following fields: - ``x`` - The X dimension of the
-        cluster, in blocks. Must be a divisor of the grid X dimension.    -
-        ``y`` - The Y dimension of the cluster, in blocks. Must be a
-        divisor of the grid Y dimension.    - ``z`` - The Z dimension of
-        the cluster, in blocks. Must be a divisor of the grid Z dimension.
+        Value of launch attribute
+        :py:obj:`~.cudaLaunchAttributeClusterDimension` that represents the
+        desired cluster dimensions for the kernel. Opaque type with the
+        following fields: - ``x`` - The X dimension of the cluster, in
+        blocks. Must be a divisor of the grid X dimension.    - ``y`` - The
+        Y dimension of the cluster, in blocks. Must be a divisor of the
+        grid Y dimension.    - ``z`` - The Z dimension of the cluster, in
+        blocks. Must be a divisor of the grid Z dimension.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.clusterSchedulingPolicyPreference' in found_struct}}
-    clusterSchedulingPolicyPreference : cudaClusterSchedulingPolicy
+    clusterSchedulingPolicyPreference : :py:obj:`~.cudaClusterSchedulingPolicy`
         Value of launch attribute
-        cudaLaunchAttributeClusterSchedulingPolicyPreference. Cluster
-        scheduling policy preference for the kernel.
+        :py:obj:`~.cudaLaunchAttributeClusterSchedulingPolicyPreference`.
+        Cluster scheduling policy preference for the kernel.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.programmaticStreamSerializationAllowed' in found_struct}}
     programmaticStreamSerializationAllowed : int
         Value of launch attribute
-        cudaLaunchAttributeProgrammaticStreamSerialization.
+        :py:obj:`~.cudaLaunchAttributeProgrammaticStreamSerialization`.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.programmaticEvent' in found_struct}}
     programmaticEvent : anon_struct18
-        Value of launch attribute cudaLaunchAttributeProgrammaticEvent with
-        the following fields: - ``cudaEvent_t`` event - Event to fire when
-        all blocks trigger it.    - ``int`` flags; - Event record flags,
-        see cudaEventRecordWithFlags. Does not accept
-        cudaEventRecordExternal.    - ``int`` triggerAtBlockStart - If this
-        is set to non-0, each block launch will automatically trigger the
-        event.
+        Value of launch attribute
+        :py:obj:`~.cudaLaunchAttributeProgrammaticEvent` with the following
+        fields: - :py:obj:`~.cudaEvent_t` event - Event to fire when all
+        blocks trigger it.    - ``int`` flags - Event record flags, see
+        :func:`~.cudaEventRecordWithFlags`. Does not accept
+        :py:obj:`~.cudaEventRecordExternal`.    - ``int``
+        triggerAtBlockStart - If this is set to non-0, each block launch
+        will automatically trigger the event.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.priority' in found_struct}}
     priority : int
-        Value of launch attribute cudaLaunchAttributePriority. Execution
-        priority of the kernel.
+        Value of launch attribute :py:obj:`~.cudaLaunchAttributePriority`.
+        Execution priority of the kernel.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.memSyncDomainMap' in found_struct}}
-    memSyncDomainMap : cudaLaunchMemSyncDomainMap
-        Value of launch attribute cudaLaunchAttributeMemSyncDomainMap. See
-        cudaLaunchMemSyncDomainMap.
+    memSyncDomainMap : :py:obj:`~.cudaLaunchMemSyncDomainMap`
+        Value of launch attribute
+        :py:obj:`~.cudaLaunchAttributeMemSyncDomainMap`. See
+        :py:obj:`~.cudaLaunchMemSyncDomainMap`.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.memSyncDomain' in found_struct}}
-    memSyncDomain : cudaLaunchMemSyncDomain
-        Value of launch attribute cudaLaunchAttributeMemSyncDomain. See
-        cudaLaunchMemSyncDomain.
+    memSyncDomain : :py:obj:`~.cudaLaunchMemSyncDomain`
+        Value of launch attribute
+        :py:obj:`~.cudaLaunchAttributeMemSyncDomain`. See
+        :py:obj:`~.cudaLaunchMemSyncDomain`.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.preferredClusterDim' in found_struct}}
     preferredClusterDim : anon_struct19
         Value of launch attribute
-        cudaLaunchAttributePreferredClusterDimension that represents the
-        desired preferred cluster dimensions for the kernel. Opaque type
-        with the following fields: - ``x`` - The X dimension of the
-        preferred cluster, in blocks. Must be a divisor of the grid X
+        :py:obj:`~.cudaLaunchAttributePreferredClusterDimension` that
+        represents the desired preferred cluster dimensions for the kernel.
+        Opaque type with the following fields: - ``x`` - The X dimension of
+        the preferred cluster, in blocks. Must be a divisor of the grid X
         dimension, and must be a multiple of the ``x`` field of
-        cudaLaunchAttributeValue::clusterDim.    - ``y`` - The Y dimension
-        of the preferred cluster, in blocks. Must be a divisor of the grid
-        Y dimension, and must be a multiple of the ``y`` field of
-        cudaLaunchAttributeValue::clusterDim.    - ``z`` - The Z dimension
-        of the preferred cluster, in blocks. Must be equal to the ``z``
-        field of cudaLaunchAttributeValue::clusterDim.
+        :py:obj:`~.cudaLaunchAttributeValue.clusterDim`.    - ``y`` - The Y
+        dimension of the preferred cluster, in blocks. Must be a divisor of
+        the grid Y dimension, and must be a multiple of the ``y`` field of
+        :py:obj:`~.cudaLaunchAttributeValue.clusterDim`.    - ``z`` - The Z
+        dimension of the preferred cluster, in blocks. Must be equal to the
+        ``z`` field of :py:obj:`~.cudaLaunchAttributeValue.clusterDim`.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.launchCompletionEvent' in found_struct}}
     launchCompletionEvent : anon_struct20
-        Value of launch attribute cudaLaunchAttributeLaunchCompletionEvent
-        with the following fields: - ``cudaEvent_t`` event - Event to fire
+        Value of launch attribute
+        :py:obj:`~.cudaLaunchAttributeLaunchCompletionEvent` with the
+        following fields: - :py:obj:`~.cudaEvent_t` event - Event to fire
         when the last block launches.    - ``int`` flags - Event record
-        flags, see cudaEventRecordWithFlags. Does not accept
-        cudaEventRecordExternal.
+        flags, see :func:`~.cudaEventRecordWithFlags`. Does not accept
+        :py:obj:`~.cudaEventRecordExternal`.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.deviceUpdatableKernelNode' in found_struct}}
     deviceUpdatableKernelNode : anon_struct21
         Value of launch attribute
-        cudaLaunchAttributeDeviceUpdatableKernelNode with the following
-        fields: - ``int`` deviceUpdatable - Whether or not the resulting
-        kernel node should be device-updatable.    -
-        ``cudaGraphDeviceNode_t`` devNode - Returns a handle to pass to the
-        various device-side update functions.
+        :py:obj:`~.cudaLaunchAttributeDeviceUpdatableKernelNode` with the
+        following fields: - ``int`` deviceUpdatable - Whether or not the
+        resulting kernel node should be device-updatable.    -
+        :py:obj:`~.cudaGraphDeviceNode_t` devNode - Returns a handle to
+        pass to the various device-side update functions.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.sharedMemCarveout' in found_struct}}
     sharedMemCarveout : unsigned int
         Value of launch attribute
-        cudaLaunchAttributePreferredSharedMemoryCarveout.
+        :py:obj:`~.cudaLaunchAttributePreferredSharedMemoryCarveout`.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.nvlinkUtilCentricScheduling' in found_struct}}
     nvlinkUtilCentricScheduling : unsigned int
         Value of launch attribute
-        cudaLaunchAttributeNvlinkUtilCentricScheduling.
+        :py:obj:`~.cudaLaunchAttributeNvlinkUtilCentricScheduling`.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.portableClusterSizeMode' in found_struct}}
-    portableClusterSizeMode : cudaLaunchAttributePortableClusterMode
+    portableClusterSizeMode : :py:obj:`~.cudaLaunchAttributePortableClusterMode`
         Value of launch attribute
-        cudaLaunchAttributePortableClusterSizeMode
+        :py:obj:`~.cudaLaunchAttributePortableClusterSizeMode`
     {{endif}}
     {{if 'cudaLaunchAttributeValue.sharedMemoryMode' in found_struct}}
-    sharedMemoryMode : cudaSharedMemoryMode
-        Value of launch attribute cudaLaunchAttributeSharedMemoryMode. See
-        cudaSharedMemoryMode for acceptable values.
+    sharedMemoryMode : :py:obj:`~.cudaSharedMemoryMode`
+        Value of launch attribute
+        :py:obj:`~.cudaLaunchAttributeSharedMemoryMode`. See
+        :py:obj:`~.cudaSharedMemoryMode` for acceptable values.
     {{endif}}
 
     Methods
@@ -5187,7 +5242,8 @@ cdef class cudaStreamAttrValue(cudaLaunchAttributeValue):
 
 cdef class cudaKernelNodeAttrValue(cudaLaunchAttributeValue):
     """
-    Launch attributes union; used as value field of cudaLaunchAttribute
+    Launch attributes union; used as value field of
+    :py:obj:`~.cudaLaunchAttribute`
 
     Attributes
     ----------
@@ -5196,116 +5252,126 @@ cdef class cudaKernelNodeAttrValue(cudaLaunchAttributeValue):
 
     {{endif}}
     {{if 'cudaLaunchAttributeValue.accessPolicyWindow' in found_struct}}
-    accessPolicyWindow : cudaAccessPolicyWindow
-        Value of launch attribute cudaLaunchAttributeAccessPolicyWindow.
+    accessPolicyWindow : :py:obj:`~.cudaAccessPolicyWindow`
+        Value of launch attribute
+        :py:obj:`~.cudaLaunchAttributeAccessPolicyWindow`.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.cooperative' in found_struct}}
     cooperative : int
-        Value of launch attribute cudaLaunchAttributeCooperative. Nonzero
-        indicates a cooperative kernel (see cudaLaunchCooperativeKernel).
+        Value of launch attribute
+        :py:obj:`~.cudaLaunchAttributeCooperative`. Nonzero indicates a
+        cooperative kernel (see :func:`~.cudaLaunchCooperativeKernel`).
     {{endif}}
     {{if 'cudaLaunchAttributeValue.syncPolicy' in found_struct}}
-    syncPolicy : cudaSynchronizationPolicy
-        Value of launch attribute cudaLaunchAttributeSynchronizationPolicy.
-        cudaSynchronizationPolicy for work queued up in this stream.
+    syncPolicy : :py:obj:`~.cudaSynchronizationPolicy`
+        Value of launch attribute
+        :py:obj:`~.cudaLaunchAttributeSynchronizationPolicy`.
+        :py:obj:`~.cudaSynchronizationPolicy` for work queued up in this
+        stream.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.clusterDim' in found_struct}}
     clusterDim : anon_struct17
-        Value of launch attribute cudaLaunchAttributeClusterDimension that
-        represents the desired cluster dimensions for the kernel. Opaque
-        type with the following fields: - ``x`` - The X dimension of the
-        cluster, in blocks. Must be a divisor of the grid X dimension.    -
-        ``y`` - The Y dimension of the cluster, in blocks. Must be a
-        divisor of the grid Y dimension.    - ``z`` - The Z dimension of
-        the cluster, in blocks. Must be a divisor of the grid Z dimension.
+        Value of launch attribute
+        :py:obj:`~.cudaLaunchAttributeClusterDimension` that represents the
+        desired cluster dimensions for the kernel. Opaque type with the
+        following fields: - ``x`` - The X dimension of the cluster, in
+        blocks. Must be a divisor of the grid X dimension.    - ``y`` - The
+        Y dimension of the cluster, in blocks. Must be a divisor of the
+        grid Y dimension.    - ``z`` - The Z dimension of the cluster, in
+        blocks. Must be a divisor of the grid Z dimension.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.clusterSchedulingPolicyPreference' in found_struct}}
-    clusterSchedulingPolicyPreference : cudaClusterSchedulingPolicy
+    clusterSchedulingPolicyPreference : :py:obj:`~.cudaClusterSchedulingPolicy`
         Value of launch attribute
-        cudaLaunchAttributeClusterSchedulingPolicyPreference. Cluster
-        scheduling policy preference for the kernel.
+        :py:obj:`~.cudaLaunchAttributeClusterSchedulingPolicyPreference`.
+        Cluster scheduling policy preference for the kernel.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.programmaticStreamSerializationAllowed' in found_struct}}
     programmaticStreamSerializationAllowed : int
         Value of launch attribute
-        cudaLaunchAttributeProgrammaticStreamSerialization.
+        :py:obj:`~.cudaLaunchAttributeProgrammaticStreamSerialization`.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.programmaticEvent' in found_struct}}
     programmaticEvent : anon_struct18
-        Value of launch attribute cudaLaunchAttributeProgrammaticEvent with
-        the following fields: - ``cudaEvent_t`` event - Event to fire when
-        all blocks trigger it.    - ``int`` flags; - Event record flags,
-        see cudaEventRecordWithFlags. Does not accept
-        cudaEventRecordExternal.    - ``int`` triggerAtBlockStart - If this
-        is set to non-0, each block launch will automatically trigger the
-        event.
+        Value of launch attribute
+        :py:obj:`~.cudaLaunchAttributeProgrammaticEvent` with the following
+        fields: - :py:obj:`~.cudaEvent_t` event - Event to fire when all
+        blocks trigger it.    - ``int`` flags - Event record flags, see
+        :func:`~.cudaEventRecordWithFlags`. Does not accept
+        :py:obj:`~.cudaEventRecordExternal`.    - ``int``
+        triggerAtBlockStart - If this is set to non-0, each block launch
+        will automatically trigger the event.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.priority' in found_struct}}
     priority : int
-        Value of launch attribute cudaLaunchAttributePriority. Execution
-        priority of the kernel.
+        Value of launch attribute :py:obj:`~.cudaLaunchAttributePriority`.
+        Execution priority of the kernel.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.memSyncDomainMap' in found_struct}}
-    memSyncDomainMap : cudaLaunchMemSyncDomainMap
-        Value of launch attribute cudaLaunchAttributeMemSyncDomainMap. See
-        cudaLaunchMemSyncDomainMap.
+    memSyncDomainMap : :py:obj:`~.cudaLaunchMemSyncDomainMap`
+        Value of launch attribute
+        :py:obj:`~.cudaLaunchAttributeMemSyncDomainMap`. See
+        :py:obj:`~.cudaLaunchMemSyncDomainMap`.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.memSyncDomain' in found_struct}}
-    memSyncDomain : cudaLaunchMemSyncDomain
-        Value of launch attribute cudaLaunchAttributeMemSyncDomain. See
-        cudaLaunchMemSyncDomain.
+    memSyncDomain : :py:obj:`~.cudaLaunchMemSyncDomain`
+        Value of launch attribute
+        :py:obj:`~.cudaLaunchAttributeMemSyncDomain`. See
+        :py:obj:`~.cudaLaunchMemSyncDomain`.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.preferredClusterDim' in found_struct}}
     preferredClusterDim : anon_struct19
         Value of launch attribute
-        cudaLaunchAttributePreferredClusterDimension that represents the
-        desired preferred cluster dimensions for the kernel. Opaque type
-        with the following fields: - ``x`` - The X dimension of the
-        preferred cluster, in blocks. Must be a divisor of the grid X
+        :py:obj:`~.cudaLaunchAttributePreferredClusterDimension` that
+        represents the desired preferred cluster dimensions for the kernel.
+        Opaque type with the following fields: - ``x`` - The X dimension of
+        the preferred cluster, in blocks. Must be a divisor of the grid X
         dimension, and must be a multiple of the ``x`` field of
-        cudaLaunchAttributeValue::clusterDim.    - ``y`` - The Y dimension
-        of the preferred cluster, in blocks. Must be a divisor of the grid
-        Y dimension, and must be a multiple of the ``y`` field of
-        cudaLaunchAttributeValue::clusterDim.    - ``z`` - The Z dimension
-        of the preferred cluster, in blocks. Must be equal to the ``z``
-        field of cudaLaunchAttributeValue::clusterDim.
+        :py:obj:`~.cudaLaunchAttributeValue.clusterDim`.    - ``y`` - The Y
+        dimension of the preferred cluster, in blocks. Must be a divisor of
+        the grid Y dimension, and must be a multiple of the ``y`` field of
+        :py:obj:`~.cudaLaunchAttributeValue.clusterDim`.    - ``z`` - The Z
+        dimension of the preferred cluster, in blocks. Must be equal to the
+        ``z`` field of :py:obj:`~.cudaLaunchAttributeValue.clusterDim`.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.launchCompletionEvent' in found_struct}}
     launchCompletionEvent : anon_struct20
-        Value of launch attribute cudaLaunchAttributeLaunchCompletionEvent
-        with the following fields: - ``cudaEvent_t`` event - Event to fire
+        Value of launch attribute
+        :py:obj:`~.cudaLaunchAttributeLaunchCompletionEvent` with the
+        following fields: - :py:obj:`~.cudaEvent_t` event - Event to fire
         when the last block launches.    - ``int`` flags - Event record
-        flags, see cudaEventRecordWithFlags. Does not accept
-        cudaEventRecordExternal.
+        flags, see :func:`~.cudaEventRecordWithFlags`. Does not accept
+        :py:obj:`~.cudaEventRecordExternal`.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.deviceUpdatableKernelNode' in found_struct}}
     deviceUpdatableKernelNode : anon_struct21
         Value of launch attribute
-        cudaLaunchAttributeDeviceUpdatableKernelNode with the following
-        fields: - ``int`` deviceUpdatable - Whether or not the resulting
-        kernel node should be device-updatable.    -
-        ``cudaGraphDeviceNode_t`` devNode - Returns a handle to pass to the
-        various device-side update functions.
+        :py:obj:`~.cudaLaunchAttributeDeviceUpdatableKernelNode` with the
+        following fields: - ``int`` deviceUpdatable - Whether or not the
+        resulting kernel node should be device-updatable.    -
+        :py:obj:`~.cudaGraphDeviceNode_t` devNode - Returns a handle to
+        pass to the various device-side update functions.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.sharedMemCarveout' in found_struct}}
     sharedMemCarveout : unsigned int
         Value of launch attribute
-        cudaLaunchAttributePreferredSharedMemoryCarveout.
+        :py:obj:`~.cudaLaunchAttributePreferredSharedMemoryCarveout`.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.nvlinkUtilCentricScheduling' in found_struct}}
     nvlinkUtilCentricScheduling : unsigned int
         Value of launch attribute
-        cudaLaunchAttributeNvlinkUtilCentricScheduling.
+        :py:obj:`~.cudaLaunchAttributeNvlinkUtilCentricScheduling`.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.portableClusterSizeMode' in found_struct}}
-    portableClusterSizeMode : cudaLaunchAttributePortableClusterMode
+    portableClusterSizeMode : :py:obj:`~.cudaLaunchAttributePortableClusterMode`
         Value of launch attribute
-        cudaLaunchAttributePortableClusterSizeMode
+        :py:obj:`~.cudaLaunchAttributePortableClusterSizeMode`
     {{endif}}
     {{if 'cudaLaunchAttributeValue.sharedMemoryMode' in found_struct}}
-    sharedMemoryMode : cudaSharedMemoryMode
-        Value of launch attribute cudaLaunchAttributeSharedMemoryMode. See
-        cudaSharedMemoryMode for acceptable values.
+    sharedMemoryMode : :py:obj:`~.cudaSharedMemoryMode`
+        Value of launch attribute
+        :py:obj:`~.cudaLaunchAttributeSharedMemoryMode`. See
+        :py:obj:`~.cudaSharedMemoryMode` for acceptable values.
     {{endif}}
 
     Methods
@@ -5345,7 +5411,7 @@ cdef class cudaEglPlaneDesc(cudaEglPlaneDesc_st):
         Number of channels for the plane
     {{endif}}
     {{if True}}
-    channelDesc : cudaChannelFormatDesc
+    channelDesc : :py:obj:`~.cudaChannelFormatDesc`
         Channel Format Descriptor
     {{endif}}
     {{if True}}
@@ -5367,11 +5433,11 @@ cdef class cudaEglFrame(cudaEglFrame_st):
     CUDA EGLFrame Descriptor - structure defining one frame of EGL.
     Each frame may contain one or more planes depending on whether the
     surface is Multiplanar or not. Each plane of EGLFrame is
-    represented by cudaEglPlaneDesc which is defined as:
-    typedefstructcudaEglPlaneDesc_st unsignedintwidth;
-    unsignedintheight; unsignedintdepth; unsignedintpitch;
-    unsignedintnumChannels; structcudaChannelFormatDescchannelDesc;
-    unsignedintreserved[4]; cudaEglPlaneDesc;
+    represented by :py:obj:`~.cudaEglPlaneDesc` which is defined as:
+    ``typedef struct cudaEglPlaneDesc_st { unsigned int width; unsigned int
+    height; unsigned int depth; unsigned int pitch; unsigned int
+    numChannels; struct cudaChannelFormatDesc channelDesc; unsigned int
+    reserved[4]; } cudaEglPlaneDesc;``
 
     Attributes
     ----------
@@ -5380,19 +5446,19 @@ cdef class cudaEglFrame(cudaEglFrame_st):
 
     {{endif}}
     {{if True}}
-    planeDesc : list[cudaEglPlaneDesc]
-        CUDA EGL Plane Descriptor cudaEglPlaneDesc
+    planeDesc : list[:py:obj:`~.cudaEglPlaneDesc`]
+        CUDA EGL Plane Descriptor :py:obj:`~.cudaEglPlaneDesc`
     {{endif}}
     {{if True}}
     planeCount : unsigned int
         Number of planes
     {{endif}}
     {{if True}}
-    frameType : cudaEglFrameType
+    frameType : :py:obj:`~.cudaEglFrameType`
         Array or Pitch
     {{endif}}
     {{if True}}
-    eglColorFormat : cudaEglColorFormat
+    eglColorFormat : :py:obj:`~.cudaEglColorFormat`
         CUDA EGL Color Format
     {{endif}}
 
diff --git a/cuda_bindings/cuda/bindings/runtime.pyx.in b/cuda_bindings/cuda/bindings/runtime.pyx.in
index 6644048c22..80aed12b5d 100644
--- a/cuda_bindings/cuda/bindings/runtime.pyx.in
+++ b/cuda_bindings/cuda/bindings/runtime.pyx.in
@@ -83,16 +83,16 @@ cudaStreamNonBlocking = cyruntime.cudaStreamNonBlocking
 
 #: Legacy stream handle
 #:
-#: Stream handle that can be passed as a cudaStream_t to use an implicit
-#: stream with legacy synchronization behavior.
+#: Stream handle that can be passed as a :py:obj:`~.cudaStream_t` to use an
+#: implicit stream with legacy synchronization behavior.
 #:
-#: See details of the \link_sync_behavior
+#: See details of the synchronization behavior.
 cudaStreamLegacy = cyruntime.cudaStreamLegacy
 
 #: Per-thread stream handle
 #:
-#: Stream handle that can be passed as a cudaStream_t to use an implicit
-#: stream with per-thread synchronization behavior.
+#: Stream handle that can be passed as a :py:obj:`~.cudaStream_t` to use an
+#: implicit stream with per-thread synchronization behavior.
 #:
-#: See details of the \link_sync_behavior
+#: See details of the synchronization behavior.
 cudaStreamPerThread = cyruntime.cudaStreamPerThread
@@ -106,8 +106,8 @@ cudaEventBlockingSync = cyruntime.cudaEventBlockingSync
 #: Event will not record timing data
 cudaEventDisableTiming = cyruntime.cudaEventDisableTiming
 
-#: Event is suitable for interprocess use. cudaEventDisableTiming must be
-#: set
+#: Event is suitable for interprocess use.
+#: :py:obj:`~.cudaEventDisableTiming` must be set
 cudaEventInterprocess = cyruntime.cudaEventInterprocess
 
 #: Default event record flag
@@ -158,32 +158,34 @@ cudaDeviceMask = cyruntime.cudaDeviceMask
 #: Default CUDA array allocation flag
 cudaArrayDefault = cyruntime.cudaArrayDefault
 
-#: Must be set in cudaMalloc3DArray to create a layered CUDA array
+#: Must be set in :func:`~.cudaMalloc3DArray` to create a layered CUDA
+#: array
 cudaArrayLayered = cyruntime.cudaArrayLayered
 
-#: Must be set in cudaMallocArray or cudaMalloc3DArray in order to bind
-#: surfaces to the CUDA array
+#: Must be set in :func:`~.cudaMallocArray` or :func:`~.cudaMalloc3DArray`
+#: in order to bind surfaces to the CUDA array
 cudaArraySurfaceLoadStore = cyruntime.cudaArraySurfaceLoadStore
 
-#: Must be set in cudaMalloc3DArray to create a cubemap CUDA array
+#: Must be set in :func:`~.cudaMalloc3DArray` to create a cubemap CUDA
+#: array
 cudaArrayCubemap = cyruntime.cudaArrayCubemap
 
-#: Must be set in cudaMallocArray or cudaMalloc3DArray in order to perform
-#: texture gather operations on the CUDA array
+#: Must be set in :func:`~.cudaMallocArray` or :func:`~.cudaMalloc3DArray`
+#: in order to perform texture gather operations on the CUDA array
 cudaArrayTextureGather = cyruntime.cudaArrayTextureGather
 
-#: Must be set in cudaExternalMemoryGetMappedMipmappedArray if the
-#: mipmapped array is used as a color target in a graphics API
+#: Must be set in :func:`~.cudaExternalMemoryGetMappedMipmappedArray` if
+#: the mipmapped array is used as a color target in a graphics API
 cudaArrayColorAttachment = cyruntime.cudaArrayColorAttachment
 
-#: Must be set in cudaMallocArray, cudaMalloc3DArray or
-#: cudaMallocMipmappedArray in order to create a sparse CUDA array or CUDA
-#: mipmapped array
+#: Must be set in :func:`~.cudaMallocArray`, :func:`~.cudaMalloc3DArray` or
+#: :func:`~.cudaMallocMipmappedArray` in order to create a sparse CUDA
+#: array or CUDA mipmapped array
 cudaArraySparse = cyruntime.cudaArraySparse
 
-#: Must be set in cudaMallocArray, cudaMalloc3DArray or
-#: cudaMallocMipmappedArray in order to create a deferred mapping CUDA
-#: array or CUDA mipmapped array
+#: Must be set in :func:`~.cudaMallocArray`, :func:`~.cudaMalloc3DArray` or
+#: :func:`~.cudaMallocMipmappedArray` in order to create a deferred mapping
+#: CUDA array or CUDA mipmapped array
 cudaArrayDeferredMapping = cyruntime.cudaArrayDeferredMapping
 
 #: Automatically enable peer access between remote devices as needed
@@ -210,8 +212,8 @@ cudaCpuDeviceId = cyruntime.cudaCpuDeviceId
 #: Device id that represents an invalid device
 cudaInvalidDeviceId = cyruntime.cudaInvalidDeviceId
 
-#: Tell the CUDA runtime that DeviceFlags is being set in cudaInitDevice
-#: call
+#: Tell the CUDA runtime that DeviceFlags is being set in
+#: :func:`~.cudaInitDevice` call
 cudaInitDeviceFlagsAreValid = cyruntime.cudaInitDeviceFlagsAreValid
 
 #: Indicates that the layered sparse CUDA array or CUDA mipmapped array has
@@ -464,7 +466,7 @@ class cudaError_t(_FastEnum):
     cudaErrorInvalidTextureBinding = (
         cyruntime.cudaError.cudaErrorInvalidTextureBinding,
         'This indicates that the texture binding is not valid. This occurs if you\n'
-        'call :py:obj:`~.cudaGetTextureAlignmentOffset()` with an unbound texture.\n'
+        'call ``cudaGetTextureAlignmentOffset()`` with an unbound texture.\n'
     ){{endif}}
     {{if 'cudaErrorInvalidChannelDescriptor' in found_values}}
 
@@ -618,7 +620,8 @@ class cudaError_t(_FastEnum):
         'Driver context was created using an older version of the API, because the\n'
         'Runtime API call expects a primary driver context and the Driver context is\n'
         'not primary, or because the Driver context has been destroyed. Please see\n'
-        'Interactions with the CUDA Driver API for more information.\n'
+        ':ref:`Interactions with the CUDA Driver API\n'
+        '<cuda-bindings-runtime-group__cudart__driver>` for more information.\n'
     ){{endif}}
     {{if 'cudaErrorMissingConfiguration' in found_values}}
 
@@ -626,7 +629,7 @@ class cudaError_t(_FastEnum):
         cyruntime.cudaError.cudaErrorMissingConfiguration,
         'The device function being invoked (usually via\n'
         ':py:obj:`~.cudaLaunchKernel()`) was not previously configured via the\n'
-        ':py:obj:`~.cudaConfigureCall()` function.\n'
+        '``cudaConfigureCall()`` function.\n'
     ){{endif}}
     {{if 'cudaErrorPriorLaunchFailure' in found_values}}
 
@@ -750,9 +753,9 @@ class cudaError_t(_FastEnum):
         'This most frequently indicates that there is no context bound to the\n'
         'current thread. This can also be returned if the context passed to an API\n'
         'call is not a valid handle (such as a context that has had\n'
-        ':py:obj:`~.cuCtxDestroy()` invoked on it). This can also be returned if a\n'
-        'user mixes different API versions (i.e. 3010 context with 3020 API calls).\n'
-        'See :py:obj:`~.cuCtxGetApiVersion()` for more details.\n'
+        ':func:`~.cuCtxDestroy` invoked on it). This can also be returned if a user\n'
+        'mixes different API versions (i.e. 3010 context with 3020 API calls). See\n'
+        ':func:`~.cuCtxGetApiVersion` for more details.\n'
     ){{endif}}
     {{if 'cudaErrorMapBufferObjectFailed' in found_values}}
 
@@ -900,7 +903,7 @@ class cudaError_t(_FastEnum):
     cudaErrorUnsupportedDevSideSync = (
         cyruntime.cudaError.cudaErrorUnsupportedDevSideSync,
         'This indicates that the code to be compiled by the PTX JIT contains\n'
-        'unsupported call to cudaDeviceSynchronize.\n'
+        'unsupported call to :func:`~.cudaDeviceSynchronize`.\n'
     ){{endif}}
     {{if 'cudaErrorContained' in found_values}}
 
@@ -1044,9 +1047,8 @@ class cudaError_t(_FastEnum):
     cudaErrorSetOnActiveProcess = (
         cyruntime.cudaError.cudaErrorSetOnActiveProcess,
         'This indicates that the user has called :py:obj:`~.cudaSetValidDevices()`,\n'
-        ':py:obj:`~.cudaSetDeviceFlags()`, :py:obj:`~.cudaD3D9SetDirect3DDevice()`,\n'
-        ':py:obj:`~.cudaD3D10SetDirect3DDevice`,\n'
-        ':py:obj:`~.cudaD3D11SetDirect3DDevice()`, or\n'
+        ':py:obj:`~.cudaSetDeviceFlags()`, ``cudaD3D9SetDirect3DDevice()``,\n'
+        '``cudaD3D10SetDirect3DDevice``, ``cudaD3D11SetDirect3DDevice()``, or\n'
         ':py:obj:`~.cudaVDPAUSetVDPAUDevice()` after initializing the CUDA runtime\n'
         'by calling non-device management operations (allocating memory and\n'
         'launching kernels are examples of non-device management operations). This\n'
@@ -1058,8 +1060,8 @@ class cudaError_t(_FastEnum):
     cudaErrorContextIsDestroyed = (
         cyruntime.cudaError.cudaErrorContextIsDestroyed,
         'This error indicates that the context current to the calling thread has\n'
-        'been destroyed using :py:obj:`~.cuCtxDestroy`, or is a primary context\n'
-        'which has not yet been initialized.\n'
+        'been destroyed using :func:`~.cuCtxDestroy`, or is a primary context which\n'
+        'has not yet been initialized.\n'
     ){{endif}}
     {{if 'cudaErrorAssert' in found_values}}
 
@@ -1075,7 +1077,7 @@ class cudaError_t(_FastEnum):
         cyruntime.cudaError.cudaErrorTooManyPeers,
         'This error indicates that the hardware resources required to enable peer\n'
         'access have been exhausted for one or more of the devices passed to\n'
-        ':py:obj:`~.cudaEnablePeerAccess()`.\n'
+        '``cudaEnablePeerAccess()``.\n'
     ){{endif}}
     {{if 'cudaErrorHostMemoryAlreadyRegistered' in found_values}}
 
@@ -1318,7 +1320,7 @@ class cudaError_t(_FastEnum):
     cudaErrorStreamCaptureImplicit = (
         cyruntime.cudaError.cudaErrorStreamCaptureImplicit,
         'The operation would have resulted in a disallowed implicit dependency on a\n'
-        'current capture sequence from cudaStreamLegacy.\n'
+        'current capture sequence from :py:obj:`~.cudaStreamLegacy`.\n'
     ){{endif}}
     {{if 'cudaErrorCapturedEvent' in found_values}}
 
@@ -1424,7 +1426,7 @@ class cudaError_t(_FastEnum):
 class cudaSharedMemoryMode(_FastEnum):
     """
     Shared memory related attributes for use with
-    :py:obj:`~.cuLaunchKernelEx`
+    :func:`~.cuLaunchKernelEx`
     """
     {{if 'cudaSharedMemoryModeDefault' in found_values}}
 
@@ -1537,7 +1539,7 @@ class cudaLaunchMemSyncDomain(_FastEnum):
     :py:obj:`~.cudaLaunchMemSyncDomainMap` for a specific stream /
     graph node / kernel launch. See
     :py:obj:`~.cudaLaunchAttributeMemSyncDomain`,
-    :py:obj:`~.cudaStreamSetAttribute`, :py:obj:`~.cudaLaunchKernelEx`,
+    :py:obj:`~.cudaStreamSetAttribute`, ``cudaLaunchKernelEx``,
     :py:obj:`~.cudaGraphKernelNodeSetAttribute`.  Memory operations
     done in kernels launched in different domains are considered
     system-scope distanced. In other words, a GPU scoped memory
@@ -1564,7 +1566,7 @@ class cudaLaunchMemSyncDomain(_FastEnum):
 class cudaLaunchAttributePortableClusterMode(_FastEnum):
     """
     Enum for defining applicability of portable cluster size, used with
-    :py:obj:`~.cudaLaunchKernelEx`
+    ``cudaLaunchKernelEx``
     """
     {{if 'cudaLaunchPortableClusterModeDefault' in found_values}}
 
@@ -1756,16 +1758,15 @@ class cudaLaunchAttributeID(_FastEnum):
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeDeviceUpdatableKernelNode,
         'Valid for graph nodes, launches. This attribute is graphs-only, and passing\n'
         'it to a launch in a non-capturing stream will result in an error.\n'
-        ' :cudaLaunchAttributeValue::deviceUpdatableKernelNode::deviceUpdatable can\n'
-        'only be set to 0 or 1. Setting the field to 1 indicates that the\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.deviceUpdatableKernelNode.deviceUpdatable`\n'
+        'can only be set to 0 or 1. Setting the field to 1 indicates that the\n'
         'corresponding kernel node should be device-updatable. On success, a handle\n'
         'will be returned via\n'
         ':py:obj:`~.cudaLaunchAttributeValue.deviceUpdatableKernelNode.devNode`\n'
         'which can be passed to the various device-side update functions to update\n'
         "the node's kernel parameters from within another kernel. For more\n"
         'information on the types of device updates that can be made, as well as the\n'
-        'relevant limitations thereof, see\n'
-        ':py:obj:`~.cudaGraphKernelNodeUpdatesApply`.\n'
+        'relevant limitations thereof, see ``cudaGraphKernelNodeUpdatesApply``.\n'
         ' Nodes which are device-updatable have additional restrictions compared to\n'
         'regular kernel nodes. Firstly, device-updatable nodes cannot be removed\n'
         'from their graph via :py:obj:`~.cudaGraphDestroyNode`. Additionally, once\n'
@@ -1778,7 +1779,7 @@ class cudaLaunchAttributeID(_FastEnum):
         'can be passed to :py:obj:`~.cudaGraphExecUpdate`.\n'
         ' If a graph contains device-updatable nodes and updates those nodes from\n'
         'the device from within the graph, the graph must be uploaded with\n'
-        ':py:obj:`~.cuGraphUpload` before it is launched. For such a graph, if host-\n'
-        'side executable graph updates are made to the device-updatable nodes, the\n'
+        ':func:`~.cuGraphUpload` before it is launched. For such a graph, if\n'
+        'host-side executable graph updates are made to the device-updatable nodes, the\n'
         'graph must be uploaded before it is launched again.\n'
     ){{endif}}
@@ -1952,8 +1953,8 @@ class cudaEmulationStrategy(_FastEnum):
     CUDA_EMULATION_STRATEGY_DEFAULT = (
         cyruntime.cudaEmulationStrategy_t.CUDA_EMULATION_STRATEGY_DEFAULT,
         'The default emulation strategy. For emulated computations, this is\n'
-        'equivalent to CUDA_EMULATION_STRATEGY_PERFORMANT, unless a library\n'
-        'dependent environment variable is set\n'
+        'equivalent to :py:obj:`~.CUDA_EMULATION_STRATEGY_PERFORMANT`, unless a\n'
+        'library dependent environment variable is set\n'
     ){{endif}}
     {{if 'CUDA_EMULATION_STRATEGY_PERFORMANT' in found_values}}
 
@@ -3886,7 +3887,7 @@ class cudaSharedMemConfig(_FastEnum):
 class cudaSharedCarveout(_FastEnum):
     """
     Shared memory carveout configurations. These may be passed to
-    cudaFuncSetAttribute
+    :func:`~.cudaFuncSetAttribute`
     """
     {{if 'cudaSharedmemCarveoutDefault' in found_values}}
 
@@ -4132,7 +4133,7 @@ class cudaGPUDirectRDMAWritesOrdering(_FastEnum):
     cudaGPUDirectRDMAWritesOrderingNone = (
         cyruntime.cudaGPUDirectRDMAWritesOrdering.cudaGPUDirectRDMAWritesOrderingNone,
         'The device does not natively support ordering of GPUDirect RDMA writes.\n'
-        ':py:obj:`~.cudaFlushGPUDirectRDMAWrites()` can be leveraged if supported.\n'
+        '``cudaFlushGPUDirectRDMAWrites()`` can be leveraged if supported.\n'
     ){{endif}}
     {{if 'cudaGPUDirectRDMAWritesOrderingOwner' in found_values}}
 
@@ -4455,13 +4456,13 @@ class cudaDeviceAttr(_FastEnum):
 
     cudaDevAttrMaxTexture2DGatherWidth = (
         cyruntime.cudaDeviceAttr.cudaDevAttrMaxTexture2DGatherWidth,
-        'Maximum 2D texture width if cudaArrayTextureGather is set\n'
+        'Maximum 2D texture width if :py:obj:`~.cudaArrayTextureGather` is set\n'
     ){{endif}}
     {{if 'cudaDevAttrMaxTexture2DGatherHeight' in found_values}}
 
     cudaDevAttrMaxTexture2DGatherHeight = (
         cyruntime.cudaDeviceAttr.cudaDevAttrMaxTexture2DGatherHeight,
-        'Maximum 2D texture height if cudaArrayTextureGather is set\n'
+        'Maximum 2D texture height if :py:obj:`~.cudaArrayTextureGather` is set\n'
     ){{endif}}
     {{if 'cudaDevAttrMaxTexture3DWidthAlt' in found_values}}
 
@@ -4715,7 +4716,7 @@ class cudaDeviceAttr(_FastEnum):
     cudaDevAttrPageableMemoryAccess = (
         cyruntime.cudaDeviceAttr.cudaDevAttrPageableMemoryAccess,
         'Device supports coherently accessing pageable memory without calling\n'
-        'cudaHostRegister on it\n'
+        ':func:`~.cudaHostRegister` on it\n'
     ){{endif}}
     {{if 'cudaDevAttrConcurrentManagedAccess' in found_values}}
 
@@ -4817,8 +4818,8 @@ class cudaDeviceAttr(_FastEnum):
     cudaDevAttrHostRegisterReadOnlySupported = (
         cyruntime.cudaDeviceAttr.cudaDevAttrHostRegisterReadOnlySupported,
         'Device supports using the :py:obj:`~.cudaHostRegister` flag\n'
-        'cudaHostRegisterReadOnly to register memory that must be mapped as read-\n'
-        'only to the GPU\n'
+        ':py:obj:`~.cudaHostRegisterReadOnly` to register memory that must be mapped\n'
+        'as read-only to the GPU\n'
     ){{endif}}
     {{if 'cudaDevAttrTimelineSemaphoreInteropSupported' in found_values}}
 
@@ -4830,8 +4831,8 @@ class cudaDeviceAttr(_FastEnum):
 
     cudaDevAttrMemoryPoolsSupported = (
         cyruntime.cudaDeviceAttr.cudaDevAttrMemoryPoolsSupported,
-        'Device supports using the :py:obj:`~.cudaMallocAsync` and\n'
-        ':py:obj:`~.cudaMemPool` family of APIs\n'
+        'Device supports using the :py:obj:`~.cudaMallocAsync` and ``cudaMemPool``\n'
+        'family of APIs\n'
     ){{endif}}
     {{if 'cudaDevAttrGPUDirectRDMASupported' in found_values}}
 
@@ -4958,7 +4959,7 @@ class cudaDeviceAttr(_FastEnum):
     cudaDevAttrHostNumaMemoryPoolsSupported = (
         cyruntime.cudaDeviceAttr.cudaDevAttrHostNumaMemoryPoolsSupported,
         'Device supports HOST_NUMA location with the :py:obj:`~.cudaMallocAsync` and\n'
-        ':py:obj:`~.cudaMemPool` family of APIs\n'
+        '``cudaMemPool`` family of APIs\n'
     ){{endif}}
     {{if 'cudaDevAttrHostNumaMultinodeIpcSupported' in found_values}}
 
@@ -4970,8 +4971,8 @@ class cudaDeviceAttr(_FastEnum):
 
     cudaDevAttrHostMemoryPoolsSupported = (
         cyruntime.cudaDeviceAttr.cudaDevAttrHostMemoryPoolsSupported,
-        'Device suports HOST location with the :py:obj:`~.cuMemAllocAsync` and\n'
-        ':py:obj:`~.cuMemPool` family of APIs\n'
+        'Device supports HOST location with the :func:`~.cuMemAllocAsync` and\n'
+        '``cuMemPool`` family of APIs\n'
     ){{endif}}
     {{if 'cudaDevAttrReserved145' in found_values}}
     cudaDevAttrReserved145 = cyruntime.cudaDeviceAttr.cudaDevAttrReserved145{{endif}}
@@ -5009,11 +5010,11 @@ class cudaMemPoolAttr(_FastEnum):
 
     cudaMemPoolReuseFollowEventDependencies = (
         cyruntime.cudaMemPoolAttr.cudaMemPoolReuseFollowEventDependencies,
-        '(value type = int) Allow cuMemAllocAsync to use memory asynchronously freed\n'
-        'in another streams as long as a stream ordering dependency of the\n'
-        'allocating stream on the free action exists. Cuda events and null stream\n'
-        'interactions can create the required stream ordered dependencies. (default\n'
-        'enabled)\n'
+        '(value type = int) Allow :func:`~.cuMemAllocAsync` to use memory\n'
+        'asynchronously freed in other streams as long as a stream ordering\n'
+        'dependency of the allocating stream on the free action exists. Cuda events\n'
+        'and null stream interactions can create the required stream ordered\n'
+        'dependencies. (default enabled)\n'
     ){{endif}}
     {{if 'cudaMemPoolReuseAllowOpportunistic' in found_values}}
 
@@ -5026,90 +5027,91 @@ class cudaMemPoolAttr(_FastEnum):
 
     cudaMemPoolReuseAllowInternalDependencies = (
         cyruntime.cudaMemPoolAttr.cudaMemPoolReuseAllowInternalDependencies,
-        '(value type = int) Allow cuMemAllocAsync to insert new stream dependencies\n'
-        'in order to establish the stream ordering required to reuse a piece of\n'
-        'memory released by cuFreeAsync (default enabled).\n'
+        '(value type = int) Allow :func:`~.cuMemAllocAsync` to insert new stream\n'
+        'dependencies in order to establish the stream ordering required to reuse a\n'
+        'piece of memory released by :func:`~.cuMemFreeAsync` (default enabled).\n'
     ){{endif}}
     {{if 'cudaMemPoolAttrReleaseThreshold' in found_values}}
 
     cudaMemPoolAttrReleaseThreshold = (
         cyruntime.cudaMemPoolAttr.cudaMemPoolAttrReleaseThreshold,
-        '(value type = cuuint64_t) Amount of reserved memory in bytes to hold onto\n'
-        'before trying to release memory back to the OS. When more than the release\n'
-        'threshold bytes of memory are held by the memory pool, the allocator will\n'
-        'try to release memory back to the OS on the next call to stream, event or\n'
-        'context synchronize. (default 0)\n'
+        '(value type = :py:obj:`~.cuuint64_t`) Amount of reserved memory in bytes to\n'
+        'hold onto before trying to release memory back to the OS. When more than\n'
+        'the release threshold bytes of memory are held by the memory pool, the\n'
+        'allocator will try to release memory back to the OS on the next call to\n'
+        'stream, event or context synchronize. (default 0)\n'
     ){{endif}}
     {{if 'cudaMemPoolAttrReservedMemCurrent' in found_values}}
 
     cudaMemPoolAttrReservedMemCurrent = (
         cyruntime.cudaMemPoolAttr.cudaMemPoolAttrReservedMemCurrent,
-        '(value type = cuuint64_t) Amount of backing memory currently allocated for\n'
-        'the mempool.\n'
+        '(value type = :py:obj:`~.cuuint64_t`) Amount of backing memory currently\n'
+        'allocated for the mempool.\n'
     ){{endif}}
     {{if 'cudaMemPoolAttrReservedMemHigh' in found_values}}
 
     cudaMemPoolAttrReservedMemHigh = (
         cyruntime.cudaMemPoolAttr.cudaMemPoolAttrReservedMemHigh,
-        '(value type = cuuint64_t) High watermark of backing memory allocated for\n'
-        'the mempool since the last time it was reset. High watermark can only be\n'
-        'reset to zero.\n'
+        '(value type = :py:obj:`~.cuuint64_t`) High watermark of backing memory\n'
+        'allocated for the mempool since the last time it was reset. High watermark\n'
+        'can only be reset to zero.\n'
     ){{endif}}
     {{if 'cudaMemPoolAttrUsedMemCurrent' in found_values}}
 
     cudaMemPoolAttrUsedMemCurrent = (
         cyruntime.cudaMemPoolAttr.cudaMemPoolAttrUsedMemCurrent,
-        '(value type = cuuint64_t) Amount of memory from the pool that is currently\n'
-        'in use by the application.\n'
+        '(value type = :py:obj:`~.cuuint64_t`) Amount of memory from the pool that\n'
+        'is currently in use by the application.\n'
     ){{endif}}
     {{if 'cudaMemPoolAttrUsedMemHigh' in found_values}}
 
     cudaMemPoolAttrUsedMemHigh = (
         cyruntime.cudaMemPoolAttr.cudaMemPoolAttrUsedMemHigh,
-        '(value type = cuuint64_t) High watermark of the amount of memory from the\n'
-        'pool that was in use by the application since the last time it was reset.\n'
-        'High watermark can only be reset to zero.\n'
+        '(value type = :py:obj:`~.cuuint64_t`) High watermark of the amount of\n'
+        'memory from the pool that was in use by the application since the last time\n'
+        'it was reset. High watermark can only be reset to zero.\n'
     ){{endif}}
     {{if 'cudaMemPoolAttrAllocationType' in found_values}}
 
     cudaMemPoolAttrAllocationType = (
         cyruntime.cudaMemPoolAttr.cudaMemPoolAttrAllocationType,
-        '(value type = cudaMemAllocationType) The allocation type of the mempool\n'
+        '(value type = :py:obj:`~.cudaMemAllocationType`) The allocation type of the\n'
+        'mempool\n'
     ){{endif}}
     {{if 'cudaMemPoolAttrExportHandleTypes' in found_values}}
 
     cudaMemPoolAttrExportHandleTypes = (
         cyruntime.cudaMemPoolAttr.cudaMemPoolAttrExportHandleTypes,
-        '(value type = cudaMemAllocationHandleType) Available export handle types\n'
-        'for the mempool. For imported pools this value is always\n'
-        'cudaMemHandleTypeNone as an imported pool cannot be re-exported\n'
+        '(value type = :py:obj:`~.cudaMemAllocationHandleType`) Available export\n'
+        'handle types for the mempool. For imported pools this value is always\n'
+        ':py:obj:`~.cudaMemHandleTypeNone` as an imported pool cannot be re-exported\n'
     ){{endif}}
     {{if 'cudaMemPoolAttrLocationId' in found_values}}
 
     cudaMemPoolAttrLocationId = (
         cyruntime.cudaMemPoolAttr.cudaMemPoolAttrLocationId,
         '(value type = int) The location id for the mempool. If the location type\n'
-        'for this pool is cudaMemLocationTypeInvisible then ID will be\n'
-        'cudaInvalidDeviceId\n'
+        'for this pool is :py:obj:`~.cudaMemLocationTypeInvisible` then ID will be\n'
+        ':py:obj:`~.cudaInvalidDeviceId`\n'
     ){{endif}}
     {{if 'cudaMemPoolAttrLocationType' in found_values}}
 
     cudaMemPoolAttrLocationType = (
         cyruntime.cudaMemPoolAttr.cudaMemPoolAttrLocationType,
-        '(value type = cudaMemLocationType) The location type for the mempool. For\n'
-        'imported memory pools where the device is not directly visible to the\n'
-        'importing process or pools imported via fabric handles across nodes this\n'
-        'will be cudaMemLocationTypeInvisible\n'
+        '(value type = :py:obj:`~.cudaMemLocationType`) The location type for the\n'
+        'mempool. For imported memory pools where the device is not directly visible\n'
+        'to the importing process or pools imported via fabric handles across nodes\n'
+        'this will be :py:obj:`~.cudaMemLocationTypeInvisible`\n'
     ){{endif}}
     {{if 'cudaMemPoolAttrMaxPoolSize' in found_values}}
 
     cudaMemPoolAttrMaxPoolSize = (
         cyruntime.cudaMemPoolAttr.cudaMemPoolAttrMaxPoolSize,
-        '(value type = cuuint64_t) Maximum size of the pool in bytes, this value may\n'
-        'be higher than what was initially passed to cudaMemPoolCreate due to\n'
-        'alignment requirements. A value of 0 indicates no maximum size. For\n'
-        'cudaMemAllocationTypeManaged and IPC imported pools this value will be\n'
-        'system dependent.\n'
+        '(value type = :py:obj:`~.cuuint64_t`) Maximum size of the pool in bytes,\n'
+        'this value may be higher than what was initially passed to\n'
+        ':func:`~.cudaMemPoolCreate` due to alignment requirements. A value of 0\n'
+        'indicates no maximum size. For :py:obj:`~.cudaMemAllocationTypeManaged` and\n'
+        'IPC imported pools this value will be system dependent.\n'
     ){{endif}}
     {{if 'cudaMemPoolAttrHwDecompressEnabled' in found_values}}
 
@@ -5165,7 +5167,7 @@ class cudaMemLocationType(_FastEnum):
     cudaMemLocationTypeInvisible = (
         cyruntime.cudaMemLocationType.cudaMemLocationTypeInvisible,
         'Location is not visible but device is accessible, id is always\n'
-        'cudaInvalidDeviceId\n'
+        ':py:obj:`~.cudaInvalidDeviceId`\n'
     ){{endif}}
 
 {{endif}}
@@ -5255,7 +5257,8 @@ class cudaMemAllocationHandleType(_FastEnum):
 
     cudaMemHandleTypeFabric = (
         cyruntime.cudaMemAllocationHandleType.cudaMemHandleTypeFabric,
-        'Allows a fabric handle to be used for exporting. (cudaMemFabricHandle_t)\n'
+        'Allows a fabric handle to be used for exporting.\n'
+        '(:py:obj:`~.cudaMemFabricHandle_t`)\n'
     ){{endif}}
 
 {{endif}}
@@ -5269,30 +5272,30 @@ class cudaGraphMemAttributeType(_FastEnum):
 
     cudaGraphMemAttrUsedMemCurrent = (
         cyruntime.cudaGraphMemAttributeType.cudaGraphMemAttrUsedMemCurrent,
-        '(value type = cuuint64_t) Amount of memory, in bytes, currently associated\n'
-        'with graphs.\n'
+        '(value type = :py:obj:`~.cuuint64_t`) Amount of memory, in bytes, currently\n'
+        'associated with graphs.\n'
     ){{endif}}
     {{if 'cudaGraphMemAttrUsedMemHigh' in found_values}}
 
     cudaGraphMemAttrUsedMemHigh = (
         cyruntime.cudaGraphMemAttributeType.cudaGraphMemAttrUsedMemHigh,
-        '(value type = cuuint64_t) High watermark of memory, in bytes, associated\n'
-        'with graphs since the last time it was reset. High watermark can only be\n'
-        'reset to zero.\n'
+        '(value type = :py:obj:`~.cuuint64_t`) High watermark of memory, in bytes,\n'
+        'associated with graphs since the last time it was reset. High watermark can\n'
+        'only be reset to zero.\n'
     ){{endif}}
     {{if 'cudaGraphMemAttrReservedMemCurrent' in found_values}}
 
     cudaGraphMemAttrReservedMemCurrent = (
         cyruntime.cudaGraphMemAttributeType.cudaGraphMemAttrReservedMemCurrent,
-        '(value type = cuuint64_t) Amount of memory, in bytes, currently allocated\n'
-        'for use by the CUDA graphs asynchronous allocator.\n'
+        '(value type = :py:obj:`~.cuuint64_t`) Amount of memory, in bytes, currently\n'
+        'allocated for use by the CUDA graphs asynchronous allocator.\n'
     ){{endif}}
     {{if 'cudaGraphMemAttrReservedMemHigh' in found_values}}
 
     cudaGraphMemAttrReservedMemHigh = (
         cyruntime.cudaGraphMemAttributeType.cudaGraphMemAttrReservedMemHigh,
-        '(value type = cuuint64_t) High watermark of memory, in bytes, currently\n'
-        'allocated for use by the CUDA graphs asynchronous allocator.\n'
+        '(value type = :py:obj:`~.cuuint64_t`) High watermark of memory, in bytes,\n'
+        'currently allocated for use by the CUDA graphs asynchronous allocator.\n'
     ){{endif}}
 
 {{endif}}
@@ -5377,7 +5380,7 @@ class cudaMemcpy3DOperandType(_FastEnum):
 
     cudaMemcpyOperandTypeArray = (
         cyruntime.cudaMemcpy3DOperandType.cudaMemcpyOperandTypeArray,
-        'Memcpy operand is a CUarray.\n'
+        'Memcpy operand is a :py:obj:`~.CUarray`.\n'
     ){{endif}}
     {{if 'cudaMemcpyOperandTypeMax' in found_values}}
     cudaMemcpyOperandTypeMax = cyruntime.cudaMemcpy3DOperandType.cudaMemcpyOperandTypeMax{{endif}}
@@ -5605,7 +5608,7 @@ class cudaExternalSemaphoreHandleType(_FastEnum):
 
 class cudaDevSmResourceGroup_flags(_FastEnum):
     """
-    Flags for a CUdevSmResource group
+    Flags for a :py:obj:`~.CUdevSmResource` group
     """
     {{if 'cudaDevSmResourceGroupDefault' in found_values}}
     cudaDevSmResourceGroupDefault = cyruntime.cudaDevSmResourceGroup_flags.cudaDevSmResourceGroupDefault{{endif}}
@@ -5976,13 +5979,13 @@ class cudaKernelFunctionType(_FastEnum):
 
     cudaKernelFunctionTypeKernel = (
         cyruntime.cudaKernelFunctionType.cudaKernelFunctionTypeKernel,
-        'Function handle is a cudaKernel_t\n'
+        'Function handle is a :py:obj:`~.cudaKernel_t`\n'
     ){{endif}}
     {{if 'cudaKernelFunctionTypeFunction' in found_values}}
 
     cudaKernelFunctionTypeFunction = (
         cyruntime.cudaKernelFunctionType.cudaKernelFunctionTypeFunction,
-        'Function handle is a cudaFunction_t\n'
+        'Function handle is a :py:obj:`~.cudaFunction_t`\n'
     ){{endif}}
 
 {{endif}}
@@ -6123,7 +6126,7 @@ class cudaGraphNodeType(_FastEnum):
         '                                   Handles must be created in advance of\n'
         'creating the node\n'
         '                                   using\n'
-        ':py:obj:`~.cudaGraphConditionalHandleCreate`.\n'
+        ':func:`~.cudaGraphConditionalHandleCreate`.\n'
         '                                   The following restrictions apply to\n'
         'graphs which contain conditional nodes:\n'
         '                                     The graph cannot be used in a child\n'
@@ -6133,8 +6136,8 @@ class cudaGraphNodeType(_FastEnum):
         '                                     The graph cannot be cloned.\n'
         '                                   To set the control value, supply a\n'
         'default value when creating the handle and/or\n'
-        '                                   call :py:obj:`~.cudaGraphSetConditional`\n'
-        'from device code.\n'
+        '                                   call ``cudaGraphSetConditional`` from\n'
+        'device code.\n'
     ){{endif}}
     {{if 'cudaGraphNodeTypeReserved16' in found_values}}
 
@@ -6176,8 +6179,8 @@ class cudaGraphChildGraphNodeOwnership(_FastEnum):
         'The following restrictions apply to child graphs after they have been\n'
         'moved: Cannot be independently instantiated or destroyed; Cannot be added\n'
         'as a child graph of a separate parent graph; Cannot be used as an argument\n'
-        'to cudaGraphExecUpdate; Cannot have additional memory allocation or free\n'
-        'nodes added.\n'
+        'to :func:`~.cudaGraphExecUpdate`; Cannot have additional memory allocation\n'
+        'or free nodes added.\n'
     ){{endif}}
 
 {{endif}}
@@ -6287,7 +6290,7 @@ class cudaGetDriverEntryPointFlags(_FastEnum):
     """
     Flags to specify search options to be used with
     :py:obj:`~.cudaGetDriverEntryPoint` For more details see
-    :py:obj:`~.cuGetProcAddress`
+    :func:`~.cuGetProcAddress`
     """
     {{if 'cudaEnableDefault' in found_values}}
 
@@ -6314,7 +6317,7 @@ class cudaGetDriverEntryPointFlags(_FastEnum):
 class cudaDriverEntryPointQueryResult(_FastEnum):
     """
     Enum for status from obtaining driver entry points, used with
-    :py:obj:`~.cudaApiGetDriverEntryPoint`
+    ``cudaApiGetDriverEntryPoint``
     """
     {{if 'cudaDriverEntryPointSuccess' in found_values}}
 
@@ -6376,7 +6379,7 @@ class cudaGraphDebugDotFlags(_FastEnum):
 
     cudaGraphDebugDotFlagsEventNodeParams = (
         cyruntime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsEventNodeParams,
-        'Adds cudaEvent_t handle from record and wait nodes to output\n'
+        'Adds :py:obj:`~.cudaEvent_t` handle from record and wait nodes to output\n'
     ){{endif}}
     {{if 'cudaGraphDebugDotFlagsExtSemasSignalNodeParams' in found_values}}
 
@@ -6394,7 +6397,7 @@ class cudaGraphDebugDotFlags(_FastEnum):
 
     cudaGraphDebugDotFlagsKernelNodeAttributes = (
         cyruntime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsKernelNodeAttributes,
-        'Adds cudaKernelNodeAttrID values to output\n'
+        'Adds :py:obj:`~.cudaKernelNodeAttrID` values to output\n'
     ){{endif}}
     {{if 'cudaGraphDebugDotFlagsHandles' in found_values}}
 
@@ -6437,7 +6440,8 @@ class cudaGraphInstantiateFlags(_FastEnum):
         cyruntime.cudaGraphInstantiateFlags.cudaGraphInstantiateFlagDeviceLaunch,
         'Instantiate the graph to be launchable from the device. This flag can only\n'
         ' be used on platforms which support unified addressing. This flag cannot be\n'
-        ' used in conjunction with cudaGraphInstantiateFlagAutoFreeOnLaunch.\n'
+        ' used in conjunction with\n'
+        ':py:obj:`~.cudaGraphInstantiateFlagAutoFreeOnLaunch`.\n'
     ){{endif}}
     {{if 'cudaGraphInstantiateFlagUseNodePriority' in found_values}}
 
@@ -6464,7 +6468,7 @@ class cudaDeviceNumaConfig(_FastEnum):
 
     cudaDeviceNumaConfigNumaNode = (
         cyruntime.cudaDeviceNumaConfig.cudaDeviceNumaConfigNumaNode,
-        'The GPU is a NUMA node, cudaDevAttrNumaId contains its NUMA ID\n'
+        'The GPU is a NUMA node, :py:obj:`~.cudaDevAttrNumaId` contains its NUMA ID\n'
     ){{endif}}
 
 {{endif}}
@@ -6860,16 +6864,15 @@ class cudaStreamAttrID(_FastEnum):
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeDeviceUpdatableKernelNode,
         'Valid for graph nodes, launches. This attribute is graphs-only, and passing\n'
         'it to a launch in a non-capturing stream will result in an error.\n'
-        ' :cudaLaunchAttributeValue::deviceUpdatableKernelNode::deviceUpdatable can\n'
-        'only be set to 0 or 1. Setting the field to 1 indicates that the\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.deviceUpdatableKernelNode.deviceUpdatable`\n'
+        'can only be set to 0 or 1. Setting the field to 1 indicates that the\n'
         'corresponding kernel node should be device-updatable. On success, a handle\n'
         'will be returned via\n'
         ':py:obj:`~.cudaLaunchAttributeValue.deviceUpdatableKernelNode.devNode`\n'
         'which can be passed to the various device-side update functions to update\n'
         "the node's kernel parameters from within another kernel. For more\n"
         'information on the types of device updates that can be made, as well as the\n'
-        'relevant limitations thereof, see\n'
-        ':py:obj:`~.cudaGraphKernelNodeUpdatesApply`.\n'
+        'relevant limitations thereof, see ``cudaGraphKernelNodeUpdatesApply``.\n'
         ' Nodes which are device-updatable have additional restrictions compared to\n'
         'regular kernel nodes. Firstly, device-updatable nodes cannot be removed\n'
         'from their graph via :py:obj:`~.cudaGraphDestroyNode`. Additionally, once\n'
@@ -6882,7 +6885,7 @@ class cudaStreamAttrID(_FastEnum):
         'can be passed to :py:obj:`~.cudaGraphExecUpdate`.\n'
         ' If a graph contains device-updatable nodes and updates those nodes from\n'
         'the device from within the graph, the graph must be uploaded with\n'
-        ':py:obj:`~.cuGraphUpload` before it is launched. For such a graph, if host-\n'
+        ':func:`~.cuGraphUpload` before it is launched. For such a graph, if host-\n'
         'side executable graph updates are made to the device-updatable nodes, the\n'
         'graph must be uploaded before it is launched again.\n'
     ){{endif}}
@@ -7108,16 +7111,15 @@ class cudaKernelNodeAttrID(_FastEnum):
         cyruntime.cudaLaunchAttributeID.cudaLaunchAttributeDeviceUpdatableKernelNode,
         'Valid for graph nodes, launches. This attribute is graphs-only, and passing\n'
         'it to a launch in a non-capturing stream will result in an error.\n'
-        ' :cudaLaunchAttributeValue::deviceUpdatableKernelNode::deviceUpdatable can\n'
-        'only be set to 0 or 1. Setting the field to 1 indicates that the\n'
+        ':py:obj:`~.cudaLaunchAttributeValue.deviceUpdatableKernelNode.deviceUpdatable`\n'
+        'can only be set to 0 or 1. Setting the field to 1 indicates that the\n'
         'corresponding kernel node should be device-updatable. On success, a handle\n'
         'will be returned via\n'
         ':py:obj:`~.cudaLaunchAttributeValue.deviceUpdatableKernelNode.devNode`\n'
         'which can be passed to the various device-side update functions to update\n'
         "the node's kernel parameters from within another kernel. For more\n"
         'information on the types of device updates that can be made, as well as the\n'
-        'relevant limitations thereof, see\n'
-        ':py:obj:`~.cudaGraphKernelNodeUpdatesApply`.\n'
+        'relevant limitations thereof, see ``cudaGraphKernelNodeUpdatesApply``.\n'
         ' Nodes which are device-updatable have additional restrictions compared to\n'
         'regular kernel nodes. Firstly, device-updatable nodes cannot be removed\n'
         'from their graph via :py:obj:`~.cudaGraphDestroyNode`. Additionally, once\n'
@@ -7130,7 +7132,7 @@ class cudaKernelNodeAttrID(_FastEnum):
         'can be passed to :py:obj:`~.cudaGraphExecUpdate`.\n'
         ' If a graph contains device-updatable nodes and updates those nodes from\n'
         'the device from within the graph, the graph must be uploaded with\n'
-        ':py:obj:`~.cuGraphUpload` before it is launched. For such a graph, if host-\n'
+        ':func:`~.cuGraphUpload` before it is launched. For such a graph, if host-\n'
         'side executable graph updates are made to the device-updatable nodes, the\n'
         'graph must be uploaded before it is launched again.\n'
     ){{endif}}
@@ -7230,7 +7232,7 @@ cdef class cudaDevResourceDesc_t:
 cdef class cudaExecutionContext_t:
     """
 
-    An opaque handle to a CUDA execution context. It represents an execution context created via CUDA Runtime APIs such as cudaGreenCtxCreate.
+    An opaque handle to a CUDA execution context. It represents an execution context created via CUDA Runtime APIs such as :func:`~.cudaGreenCtxCreate`.
 
     Methods
     -------
@@ -8063,7 +8065,7 @@ cdef class cudaChannelFormatDesc:
         w
     {{endif}}
     {{if 'cudaChannelFormatDesc.f' in found_struct}}
-    f : cudaChannelFormatKind
+    f : :py:obj:`~.cudaChannelFormatKind`
         Channel format kind
     {{endif}}
 
@@ -8264,7 +8266,8 @@ cdef class cudaArraySparseProperties:
     {{endif}}
     {{if 'cudaArraySparseProperties.flags' in found_struct}}
     flags : unsigned int
-        Flags will either be zero or cudaArraySparsePropertiesSingleMipTail
+        Flags will either be zero or
+        :py:obj:`~.cudaArraySparsePropertiesSingleMipTail`
     {{endif}}
     {{if 'cudaArraySparseProperties.reserved' in found_struct}}
     reserved : list[unsigned int]
@@ -8457,7 +8460,7 @@ cdef class cudaArrayMemoryRequirements:
 
 cdef class cudaPitchedPtr:
     """
-    CUDA Pitched memory pointer  ``make_cudaPitchedPtr``
+    CUDA Pitched memory pointer  :func:`~.make_cudaPitchedPtr`
 
     Attributes
     ----------
@@ -8562,7 +8565,7 @@ cdef class cudaPitchedPtr:
 
 cdef class cudaExtent:
     """
-    CUDA extent  ``make_cudaExtent``
+    CUDA extent  :func:`~.make_cudaExtent`
 
     Attributes
     ----------
@@ -8649,7 +8652,7 @@ cdef class cudaExtent:
 
 cdef class cudaPos:
     """
-    CUDA 3D position  ``make_cudaPos``
+    CUDA 3D position  :func:`~.make_cudaPos`
 
     Attributes
     ----------
@@ -8740,35 +8743,35 @@ cdef class cudaMemcpy3DParms:
     Attributes
     ----------
     {{if 'cudaMemcpy3DParms.srcArray' in found_struct}}
-    srcArray : cudaArray_t
+    srcArray : :py:obj:`~.cudaArray_t`
         Source memory address
     {{endif}}
     {{if 'cudaMemcpy3DParms.srcPos' in found_struct}}
-    srcPos : cudaPos
+    srcPos : :py:obj:`~.cudaPos`
         Source position offset
     {{endif}}
     {{if 'cudaMemcpy3DParms.srcPtr' in found_struct}}
-    srcPtr : cudaPitchedPtr
+    srcPtr : :py:obj:`~.cudaPitchedPtr`
         Pitched source memory address
     {{endif}}
     {{if 'cudaMemcpy3DParms.dstArray' in found_struct}}
-    dstArray : cudaArray_t
+    dstArray : :py:obj:`~.cudaArray_t`
         Destination memory address
     {{endif}}
     {{if 'cudaMemcpy3DParms.dstPos' in found_struct}}
-    dstPos : cudaPos
+    dstPos : :py:obj:`~.cudaPos`
         Destination position offset
     {{endif}}
     {{if 'cudaMemcpy3DParms.dstPtr' in found_struct}}
-    dstPtr : cudaPitchedPtr
+    dstPtr : :py:obj:`~.cudaPitchedPtr`
         Pitched destination memory address
     {{endif}}
     {{if 'cudaMemcpy3DParms.extent' in found_struct}}
-    extent : cudaExtent
+    extent : :py:obj:`~.cudaExtent`
         Requested memory copy size
     {{endif}}
     {{if 'cudaMemcpy3DParms.kind' in found_struct}}
-    kind : cudaMemcpyKind
+    kind : :py:obj:`~.cudaMemcpyKind`
         Type of transfer
     {{endif}}
 
@@ -8963,12 +8966,12 @@ cdef class cudaMemcpyNodeParams:
         Must be zero
     {{endif}}
     {{if 'cudaMemcpyNodeParams.ctx' in found_struct}}
-    ctx : cudaExecutionContext_t
+    ctx : :py:obj:`~.cudaExecutionContext_t`
         Context in which to run the memcpy. If NULL will try to use the
         current context.
     {{endif}}
     {{if 'cudaMemcpyNodeParams.copyParams' in found_struct}}
-    copyParams : cudaMemcpy3DParms
+    copyParams : :py:obj:`~.cudaMemcpy3DParms`
         Parameters for the memory copy
     {{endif}}
 
@@ -9075,15 +9078,15 @@ cdef class cudaMemcpy3DPeerParms:
     Attributes
     ----------
     {{if 'cudaMemcpy3DPeerParms.srcArray' in found_struct}}
-    srcArray : cudaArray_t
+    srcArray : :py:obj:`~.cudaArray_t`
         Source memory address
     {{endif}}
     {{if 'cudaMemcpy3DPeerParms.srcPos' in found_struct}}
-    srcPos : cudaPos
+    srcPos : :py:obj:`~.cudaPos`
         Source position offset
     {{endif}}
     {{if 'cudaMemcpy3DPeerParms.srcPtr' in found_struct}}
-    srcPtr : cudaPitchedPtr
+    srcPtr : :py:obj:`~.cudaPitchedPtr`
         Pitched source memory address
     {{endif}}
     {{if 'cudaMemcpy3DPeerParms.srcDevice' in found_struct}}
@@ -9091,15 +9094,15 @@ cdef class cudaMemcpy3DPeerParms:
         Source device
     {{endif}}
     {{if 'cudaMemcpy3DPeerParms.dstArray' in found_struct}}
-    dstArray : cudaArray_t
+    dstArray : :py:obj:`~.cudaArray_t`
         Destination memory address
     {{endif}}
     {{if 'cudaMemcpy3DPeerParms.dstPos' in found_struct}}
-    dstPos : cudaPos
+    dstPos : :py:obj:`~.cudaPos`
         Destination position offset
     {{endif}}
     {{if 'cudaMemcpy3DPeerParms.dstPtr' in found_struct}}
-    dstPtr : cudaPitchedPtr
+    dstPtr : :py:obj:`~.cudaPitchedPtr`
         Pitched destination memory address
     {{endif}}
     {{if 'cudaMemcpy3DPeerParms.dstDevice' in found_struct}}
@@ -9107,7 +9110,7 @@ cdef class cudaMemcpy3DPeerParms:
         Destination device
     {{endif}}
     {{if 'cudaMemcpy3DPeerParms.extent' in found_struct}}
-    extent : cudaExtent
+    extent : :py:obj:`~.cudaExtent`
         Requested memory copy size
     {{endif}}
 
@@ -9473,7 +9476,7 @@ cdef class cudaMemsetParamsV2:
         Number of rows
     {{endif}}
     {{if 'cudaMemsetParamsV2.ctx' in found_struct}}
-    ctx : cudaExecutionContext_t
+    ctx : :py:obj:`~.cudaExecutionContext_t`
         Context in which to run the memset. If NULL will try to use the
         current context.
     {{endif}}
@@ -9643,12 +9646,12 @@ cdef class cudaAccessPolicyWindow:
         assigned missProp.
     {{endif}}
     {{if 'cudaAccessPolicyWindow.hitProp' in found_struct}}
-    hitProp : cudaAccessProperty
-        ``CUaccessProperty`` set for hit.
+    hitProp : :py:obj:`~.cudaAccessProperty`
+        :py:obj:`~.CUaccessProperty` set for hit.
     {{endif}}
     {{if 'cudaAccessPolicyWindow.missProp' in found_struct}}
-    missProp : cudaAccessProperty
-        ``CUaccessProperty`` set for miss. Must be either NORMAL or
+    missProp : :py:obj:`~.cudaAccessProperty`
+        :py:obj:`~.CUaccessProperty` set for miss. Must be either NORMAL or
         STREAMING.
     {{endif}}
 
@@ -9755,7 +9758,7 @@ cdef class cudaHostNodeParams:
     Attributes
     ----------
     {{if 'cudaHostNodeParams.fn' in found_struct}}
-    fn : cudaHostFn_t
+    fn : :py:obj:`~.cudaHostFn_t`
         The function to call when the node executes
     {{endif}}
     {{if 'cudaHostNodeParams.userData' in found_struct}}
@@ -9836,7 +9839,7 @@ cdef class cudaHostNodeParamsV2:
     Attributes
     ----------
     {{if 'cudaHostNodeParamsV2.fn' in found_struct}}
-    fn : cudaHostFn_t
+    fn : :py:obj:`~.cudaHostFn_t`
         The function to call when the node executes
     {{endif}}
     {{if 'cudaHostNodeParamsV2.userData' in found_struct}}
@@ -9933,7 +9936,7 @@ cdef class anon_struct1:
     Attributes
     ----------
     {{if 'cudaResourceDesc.res.array.array' in found_struct}}
-    array : cudaArray_t
+    array : :py:obj:`~.cudaArray_t`
 
     {{endif}}
 
@@ -9991,7 +9994,7 @@ cdef class anon_struct2:
     Attributes
     ----------
     {{if 'cudaResourceDesc.res.mipmap.mipmap' in found_struct}}
-    mipmap : cudaMipmappedArray_t
+    mipmap : :py:obj:`~.cudaMipmappedArray_t`
 
     {{endif}}
 
@@ -10053,7 +10056,7 @@ cdef class anon_struct3:
 
     {{endif}}
     {{if 'cudaResourceDesc.res.linear.desc' in found_struct}}
-    desc : cudaChannelFormatDesc
+    desc : :py:obj:`~.cudaChannelFormatDesc`
 
     {{endif}}
     {{if 'cudaResourceDesc.res.linear.sizeInBytes' in found_struct}}
@@ -10139,7 +10142,7 @@ cdef class anon_struct4:
 
     {{endif}}
     {{if 'cudaResourceDesc.res.pitch2D.desc' in found_struct}}
-    desc : cudaChannelFormatDesc
+    desc : :py:obj:`~.cudaChannelFormatDesc`
 
     {{endif}}
     {{if 'cudaResourceDesc.res.pitch2D.width' in found_struct}}
@@ -10438,7 +10441,7 @@ cdef class cudaResourceDesc:
     Attributes
     ----------
     {{if 'cudaResourceDesc.resType' in found_struct}}
-    resType : cudaResourceType
+    resType : :py:obj:`~.cudaResourceType`
         Resource type
     {{endif}}
     {{if 'cudaResourceDesc.res' in found_struct}}
@@ -10529,7 +10532,7 @@ cdef class cudaResourceViewDesc:
     Attributes
     ----------
     {{if 'cudaResourceViewDesc.format' in found_struct}}
-    format : cudaResourceViewFormat
+    format : :py:obj:`~.cudaResourceViewFormat`
         Resource view format
     {{endif}}
     {{if 'cudaResourceViewDesc.width' in found_struct}}
@@ -10723,19 +10726,21 @@ cdef class cudaPointerAttributes:
     Attributes
     ----------
     {{if 'cudaPointerAttributes.type' in found_struct}}
-    type : cudaMemoryType
-        The type of memory - cudaMemoryTypeUnregistered,
-        cudaMemoryTypeHost, cudaMemoryTypeDevice or cudaMemoryTypeManaged.
+    type : :py:obj:`~.cudaMemoryType`
+        The type of memory - :py:obj:`~.cudaMemoryTypeUnregistered`,
+        :py:obj:`~.cudaMemoryTypeHost`, :py:obj:`~.cudaMemoryTypeDevice` or
+        :py:obj:`~.cudaMemoryTypeManaged`.
     {{endif}}
     {{if 'cudaPointerAttributes.device' in found_struct}}
     device : int
         The device against which the memory was allocated or registered. If
-        the memory type is cudaMemoryTypeDevice then this identifies the
-        device on which the memory referred physically resides. If the
-        memory type is cudaMemoryTypeHost or::cudaMemoryTypeManaged then
-        this identifies the device which was current when the memory was
-        allocated or registered (and if that device is deinitialized then
-        this allocation will vanish with that device's state).
+        the memory type is :py:obj:`~.cudaMemoryTypeDevice` then this
+        identifies the device on which the memory referred physically
+        resides. If the memory type is :py:obj:`~.cudaMemoryTypeHost` or
+        :py:obj:`~.cudaMemoryTypeManaged` then this identifies the device
+        which was current when the memory was allocated or registered (and
+        if that device is deinitialized then this allocation will vanish
+        with that device's state).
     {{endif}}
     {{if 'cudaPointerAttributes.devicePointer' in found_struct}}
     devicePointer : Any
@@ -10912,9 +10917,10 @@ cdef class cudaFuncAttributes:
         On devices where the L1 cache and shared memory use the same
         hardware resources, this sets the shared memory carveout
         preference, in percent of the maximum shared memory. Refer to
-        cudaDevAttrMaxSharedMemoryPerMultiprocessor. This is only a hint,
-        and the driver can choose a different ratio if required to execute
-        the function. See cudaFuncSetAttribute
+        :py:obj:`~.cudaDevAttrMaxSharedMemoryPerMultiprocessor`. This is
+        only a hint, and the driver can choose a different ratio if
+        required to execute the function. See
+        :func:`~.cudaFuncSetAttribute`
     {{endif}}
     {{if 'cudaFuncAttributes.clusterDimMustBeSet' in found_struct}}
     clusterDimMustBeSet : int
@@ -10927,8 +10933,8 @@ cdef class cudaFuncAttributes:
         either all be 0 or all be positive. The validity of the cluster
         dimensions is otherwise checked at launch time.  If the value is
         set during compile time, it cannot be set at runtime. Setting it at
-        runtime should return cudaErrorNotPermitted. See
-        cudaFuncSetAttribute
+        runtime should return :py:obj:`~.cudaErrorNotPermitted`. See
+        :func:`~.cudaFuncSetAttribute`
     {{endif}}
     {{if 'cudaFuncAttributes.requiredClusterHeight' in found_struct}}
     requiredClusterHeight : int
@@ -10940,7 +10946,8 @@ cdef class cudaFuncAttributes:
     {{endif}}
     {{if 'cudaFuncAttributes.clusterSchedulingPolicyPreference' in found_struct}}
     clusterSchedulingPolicyPreference : int
-        The block scheduling policy of a function. See cudaFuncSetAttribute
+        The block scheduling policy of a function. See
+        :func:`~.cudaFuncSetAttribute`
     {{endif}}
     {{if 'cudaFuncAttributes.nonPortableClusterSizeAllowed' in found_struct}}
     nonPortableClusterSizeAllowed : int
@@ -10948,15 +10955,16 @@ cdef class cudaFuncAttributes:
         size. 1 is allowed, 0 is disallowed. A non-portable cluster size
         may only function on the specific SKUs the program is tested on.
         The launch might fail if the program is run on a different hardware
-        platform.  CUDA API provides cudaOccupancyMaxActiveClusters to
-        assist with checking whether the desired size can be launched on
-        the current device.  Portable Cluster Size  A portable cluster size
-        is guaranteed to be functional on all compute capabilities higher
-        than the target compute capability. The portable cluster size for
-        sm_90 is 8 blocks per cluster. This value may increase for future
-        compute capabilities.  The specific hardware unit may support
-        higher cluster sizes that’s not guaranteed to be portable. See
-        cudaFuncSetAttribute
+        platform.  CUDA API provides
+        :func:`~.cudaOccupancyMaxActiveClusters` to assist with checking
+        whether the desired size can be launched on the current device.
+        Portable Cluster Size  A portable cluster size is guaranteed to be
+        functional on all compute capabilities higher than the target
+        compute capability. The portable cluster size for sm_90 is 8 blocks
+        per cluster. This value may increase for future compute
+        capabilities.  The specific hardware unit may support higher
+        cluster sizes that are not guaranteed to be portable. See
+        :func:`~.cudaFuncSetAttribute`
     {{endif}}
     {{if 'cudaFuncAttributes.deviceNodeUpdateStatus' in found_struct}}
     deviceNodeUpdateStatus : int
@@ -11267,21 +11275,23 @@ cdef class cudaFuncAttributes:
 cdef class cudaMemLocation:
     """
     Specifies a memory location.  To specify a gpu, set type =
-    cudaMemLocationTypeDevice and set id = the gpu's device ordinal. To
-    specify a cpu NUMA node, set type = cudaMemLocationTypeHostNuma and
-    set id = host NUMA node id.
+    :py:obj:`~.cudaMemLocationTypeDevice` and set id = the gpu's device
+    ordinal. To specify a cpu NUMA node, set type =
+    :py:obj:`~.cudaMemLocationTypeHostNuma` and set id = host NUMA node
+    id.
 
     Attributes
     ----------
     {{if 'cudaMemLocation.type' in found_struct}}
-    type : cudaMemLocationType
+    type : :py:obj:`~.cudaMemLocationType`
         Specifies the location type, which modifies the meaning of id.
     {{endif}}
     {{if 'cudaMemLocation.id' in found_struct}}
     id : int
-        Identifier for cudaMemLocationType::cudaMemLocationTypeDevice,
-        cudaMemLocationType::cudaMemLocationTypeHost, or
-        cudaMemLocationType::cudaMemLocationTypeHostNuma.
+        Identifier for
+        :py:obj:`~.cudaMemLocationType.cudaMemLocationTypeDevice`,
+        :py:obj:`~.cudaMemLocationType.cudaMemLocationTypeHost`, or
+        :py:obj:`~.cudaMemLocationType.cudaMemLocationTypeHostNuma`.
     {{endif}}
 
     Methods
@@ -11346,11 +11356,11 @@ cdef class cudaMemAccessDesc:
     Attributes
     ----------
     {{if 'cudaMemAccessDesc.location' in found_struct}}
-    location : cudaMemLocation
+    location : :py:obj:`~.cudaMemLocation`
         Location on which the request is to change it's accessibility
     {{endif}}
     {{if 'cudaMemAccessDesc.flags' in found_struct}}
-    flags : cudaMemAccessFlags
+    flags : :py:obj:`~.cudaMemAccessFlags`
         ``CUmemProt`` accessibility flags to set on the request
     {{endif}}
 
@@ -11417,25 +11427,25 @@ cdef class cudaMemPoolProps:
     Attributes
     ----------
     {{if 'cudaMemPoolProps.allocType' in found_struct}}
-    allocType : cudaMemAllocationType
+    allocType : :py:obj:`~.cudaMemAllocationType`
         Allocation type. Currently must be specified as
-        cudaMemAllocationTypePinned
+        :py:obj:`~.cudaMemAllocationTypePinned`
     {{endif}}
     {{if 'cudaMemPoolProps.handleTypes' in found_struct}}
-    handleTypes : cudaMemAllocationHandleType
+    handleTypes : :py:obj:`~.cudaMemAllocationHandleType`
         Handle types that will be supported by allocations from the pool.
     {{endif}}
     {{if 'cudaMemPoolProps.location' in found_struct}}
-    location : cudaMemLocation
+    location : :py:obj:`~.cudaMemLocation`
         Location allocations should reside.
     {{endif}}
     {{if 'cudaMemPoolProps.win32SecurityAttributes' in found_struct}}
     win32SecurityAttributes : Any
         Windows-specific LPSECURITYATTRIBUTES required when
-        cudaMemHandleTypeWin32 is specified. This security attribute
-        defines the scope of which exported allocations may be tranferred
-        to other processes. In all other cases, this field is required to
-        be zero.
+        :py:obj:`~.cudaMemHandleTypeWin32` is specified. This security
+        attribute defines the scope of which exported allocations may be
+        transferred to other processes. In all other cases, this field is
+        required to be zero.
     {{endif}}
     {{if 'cudaMemPoolProps.maxSize' in found_struct}}
     maxSize : size_t
@@ -11641,14 +11651,14 @@ cdef class cudaMemAllocNodeParams:
     Attributes
     ----------
     {{if 'cudaMemAllocNodeParams.poolProps' in found_struct}}
-    poolProps : cudaMemPoolProps
+    poolProps : :py:obj:`~.cudaMemPoolProps`
         in: location where the allocation should reside (specified in
-        ``location``). ``handleTypes`` must be cudaMemHandleTypeNone. IPC
-        is not supported. in: array of memory access descriptors. Used to
-        describe peer GPU access
+        ``location``). ``handleTypes`` must be
+        :py:obj:`~.cudaMemHandleTypeNone`. IPC is not supported. in: array
+        of memory access descriptors. Used to describe peer GPU access
     {{endif}}
     {{if 'cudaMemAllocNodeParams.accessDescs' in found_struct}}
-    accessDescs : cudaMemAccessDesc
+    accessDescs : :py:obj:`~.cudaMemAccessDesc`
         in: number of memory access descriptors. Must not exceed the number
         of GPUs.
     {{endif}}
@@ -11792,14 +11802,14 @@ cdef class cudaMemAllocNodeParamsV2:
     Attributes
     ----------
     {{if 'cudaMemAllocNodeParamsV2.poolProps' in found_struct}}
-    poolProps : cudaMemPoolProps
+    poolProps : :py:obj:`~.cudaMemPoolProps`
         in: location where the allocation should reside (specified in
-        ``location``). ``handleTypes`` must be cudaMemHandleTypeNone. IPC
-        is not supported. in: array of memory access descriptors. Used to
-        describe peer GPU access
+        ``location``). ``handleTypes`` must be
+        :py:obj:`~.cudaMemHandleTypeNone`. IPC is not supported. in: array
+        of memory access descriptors. Used to describe peer GPU access
     {{endif}}
     {{if 'cudaMemAllocNodeParamsV2.accessDescs' in found_struct}}
-    accessDescs : cudaMemAccessDesc
+    accessDescs : :py:obj:`~.cudaMemAccessDesc`
         in: number of memory access descriptors. Must not exceed the number
         of GPUs.
     {{endif}}
@@ -11990,29 +12000,29 @@ cdef class cudaMemFreeNodeParams:
 cdef class cudaMemcpyAttributes:
     """
     Attributes specific to copies within a batch. For more details on
-    usage see cudaMemcpyBatchAsync.
+    usage see :func:`~.cudaMemcpyBatchAsync`.
 
     Attributes
     ----------
     {{if 'cudaMemcpyAttributes.srcAccessOrder' in found_struct}}
-    srcAccessOrder : cudaMemcpySrcAccessOrder
+    srcAccessOrder : :py:obj:`~.cudaMemcpySrcAccessOrder`
         Source access ordering to be observed for copies with this
         attribute.
     {{endif}}
     {{if 'cudaMemcpyAttributes.srcLocHint' in found_struct}}
-    srcLocHint : cudaMemLocation
+    srcLocHint : :py:obj:`~.cudaMemLocation`
         Hint location for the source operand. Ignored when the pointers are
         not managed memory or memory allocated outside CUDA.
     {{endif}}
     {{if 'cudaMemcpyAttributes.dstLocHint' in found_struct}}
-    dstLocHint : cudaMemLocation
+    dstLocHint : :py:obj:`~.cudaMemLocation`
         Hint location for the destination operand. Ignored when the
         pointers are not managed memory or memory allocated outside CUDA.
     {{endif}}
     {{if 'cudaMemcpyAttributes.flags' in found_struct}}
     flags : unsigned int
         Additional flags for copies with this attribute. See
-        cudaMemcpyFlags.
+        :py:obj:`~.cudaMemcpyFlags`.
     {{endif}}
 
     Methods
@@ -12104,7 +12114,8 @@ cdef class cudaMemcpyAttributes:
 
 cdef class cudaOffset3D:
     """
-    Struct representing offset into a cudaArray_t in elements
+    Struct representing offset into a :py:obj:`~.cudaArray_t` in
+    elements
 
     Attributes
     ----------
@@ -12205,7 +12216,7 @@ cdef class anon_struct6:
 
     {{endif}}
     {{if 'cudaMemcpy3DOperand.op.ptr.locHint' in found_struct}}
-    locHint : cudaMemLocation
+    locHint : :py:obj:`~.cudaMemLocation`
 
     {{endif}}
 
@@ -12297,11 +12308,11 @@ cdef class anon_struct7:
     Attributes
     ----------
     {{if 'cudaMemcpy3DOperand.op.array.array' in found_struct}}
-    array : cudaArray_t
+    array : :py:obj:`~.cudaArray_t`
 
     {{endif}}
     {{if 'cudaMemcpy3DOperand.op.array.offset' in found_struct}}
-    offset : cudaOffset3D
+    offset : :py:obj:`~.cudaOffset3D`
 
     {{endif}}
 
@@ -12443,12 +12454,13 @@ cdef class anon_union2:
 
 cdef class cudaMemcpy3DOperand:
     """
-    Struct representing an operand for copy with cudaMemcpy3DBatchAsync
+    Struct representing an operand for copy with
+    :func:`~.cudaMemcpy3DBatchAsync`
 
     Attributes
     ----------
     {{if 'cudaMemcpy3DOperand.type' in found_struct}}
-    type : cudaMemcpy3DOperandType
+    type : :py:obj:`~.cudaMemcpy3DOperandType`
 
     {{endif}}
     {{if 'cudaMemcpy3DOperand.op' in found_struct}}
@@ -12519,25 +12531,26 @@ cdef class cudaMemcpy3DBatchOp:
     Attributes
     ----------
     {{if 'cudaMemcpy3DBatchOp.src' in found_struct}}
-    src : cudaMemcpy3DOperand
+    src : :py:obj:`~.cudaMemcpy3DOperand`
         Source memcpy operand.
     {{endif}}
     {{if 'cudaMemcpy3DBatchOp.dst' in found_struct}}
-    dst : cudaMemcpy3DOperand
+    dst : :py:obj:`~.cudaMemcpy3DOperand`
         Destination memcpy operand.
     {{endif}}
     {{if 'cudaMemcpy3DBatchOp.extent' in found_struct}}
-    extent : cudaExtent
+    extent : :py:obj:`~.cudaExtent`
         Extents of the memcpy between src and dst. The width, height and
         depth components must not be 0.
     {{endif}}
     {{if 'cudaMemcpy3DBatchOp.srcAccessOrder' in found_struct}}
-    srcAccessOrder : cudaMemcpySrcAccessOrder
+    srcAccessOrder : :py:obj:`~.cudaMemcpySrcAccessOrder`
         Source access ordering to be observed for copy from src to dst.
     {{endif}}
     {{if 'cudaMemcpy3DBatchOp.flags' in found_struct}}
     flags : unsigned int
-        Additional flags for copy from src to dst. See cudaMemcpyFlags.
+        Additional flags for copy from src to dst. See
+        :py:obj:`~.cudaMemcpyFlags`.
     {{endif}}
 
     Methods
@@ -12700,7 +12713,7 @@ cdef class cudaDeviceProp:
         ASCII string identifying device
     {{endif}}
     {{if 'cudaDeviceProp.uuid' in found_struct}}
-    uuid : cudaUUID_t
+    uuid : :py:obj:`~.cudaUUID_t`
         16-byte unique identifier
     {{endif}}
     {{if 'cudaDeviceProp.luid' in found_struct}}
@@ -12777,7 +12790,7 @@ cdef class cudaDeviceProp:
     {{if 'cudaDeviceProp.canMapHostMemory' in found_struct}}
     canMapHostMemory : int
         Device can map host memory with
-        cudaHostAlloc/cudaHostGetDevicePointer
+        :func:`~.cudaHostAlloc`/:func:`~.cudaHostGetDevicePointer`
     {{endif}}
     {{if 'cudaDeviceProp.maxTexture1D' in found_struct}}
     maxTexture1D : int
@@ -12950,7 +12963,7 @@ cdef class cudaDeviceProp:
     {{if 'cudaDeviceProp.pageableMemoryAccess' in found_struct}}
     pageableMemoryAccess : int
         Device supports coherently accessing pageable memory without
-        calling cudaHostRegister on it
+        calling :func:`~.cudaHostRegister` on it
     {{endif}}
     {{if 'cudaDeviceProp.concurrentManagedAccess' in found_struct}}
     concurrentManagedAccess : int
@@ -12969,7 +12982,7 @@ cdef class cudaDeviceProp:
     {{if 'cudaDeviceProp.cooperativeLaunch' in found_struct}}
     cooperativeLaunch : int
         Device supports launching cooperative kernels via
-        cudaLaunchCooperativeKernel
+        :func:`~.cudaLaunchCooperativeKernel`
     {{endif}}
     {{if 'cudaDeviceProp.sharedMemPerBlockOptin' in found_struct}}
     sharedMemPerBlockOptin : size_t
@@ -12990,7 +13003,7 @@ cdef class cudaDeviceProp:
     {{endif}}
     {{if 'cudaDeviceProp.accessPolicyMaxWindowSize' in found_struct}}
     accessPolicyMaxWindowSize : int
-        The maximum value of cudaAccessPolicyWindow::num_bytes.
+        The maximum value of :py:obj:`~.cudaAccessPolicyWindow.num_bytes`.
     {{endif}}
     {{if 'cudaDeviceProp.reservedSharedMemPerBlock' in found_struct}}
     reservedSharedMemPerBlock : size_t
@@ -12998,7 +13011,8 @@ cdef class cudaDeviceProp:
     {{endif}}
     {{if 'cudaDeviceProp.hostRegisterSupported' in found_struct}}
     hostRegisterSupported : int
-        Device supports host memory registration via cudaHostRegister.
+        Device supports host memory registration via
+        :func:`~.cudaHostRegister`.
     {{endif}}
     {{if 'cudaDeviceProp.sparseCudaArraySupported' in found_struct}}
     sparseCudaArraySupported : int
@@ -13007,9 +13021,9 @@ cdef class cudaDeviceProp:
     {{endif}}
     {{if 'cudaDeviceProp.hostRegisterReadOnlySupported' in found_struct}}
     hostRegisterReadOnlySupported : int
-        Device supports using the cudaHostRegister flag
-        cudaHostRegisterReadOnly to register memory that must be mapped as
-        read-only to the GPU
+        Device supports using the :func:`~.cudaHostRegister` flag
+        :py:obj:`~.cudaHostRegisterReadOnly` to register memory that must
+        be mapped as read-only to the GPU
     {{endif}}
     {{if 'cudaDeviceProp.timelineSemaphoreInteropSupported' in found_struct}}
     timelineSemaphoreInteropSupported : int
@@ -13017,8 +13031,8 @@ cdef class cudaDeviceProp:
     {{endif}}
     {{if 'cudaDeviceProp.memoryPoolsSupported' in found_struct}}
     memoryPoolsSupported : int
-        1 if the device supports using the cudaMallocAsync and cudaMemPool
-        family of APIs, 0 otherwise
+        1 if the device supports using the :func:`~.cudaMallocAsync` and
+        cudaMemPool family of APIs, 0 otherwise
     {{endif}}
     {{if 'cudaDeviceProp.gpuDirectRDMASupported' in found_struct}}
     gpuDirectRDMASupported : int
@@ -13027,11 +13041,12 @@ cdef class cudaDeviceProp:
     {{if 'cudaDeviceProp.gpuDirectRDMAFlushWritesOptions' in found_struct}}
     gpuDirectRDMAFlushWritesOptions : unsigned int
         Bitmask to be interpreted according to the
-        cudaFlushGPUDirectRDMAWritesOptions enum
+        :py:obj:`~.cudaFlushGPUDirectRDMAWritesOptions` enum
     {{endif}}
     {{if 'cudaDeviceProp.gpuDirectRDMAWritesOrdering' in found_struct}}
     gpuDirectRDMAWritesOrdering : int
-        See the cudaGPUDirectRDMAWritesOrdering enum for numerical values
+        See the :py:obj:`~.cudaGPUDirectRDMAWritesOrdering` enum for
+        numerical values
     {{endif}}
     {{if 'cudaDeviceProp.memoryPoolSupportedHandleTypes' in found_struct}}
     memoryPoolSupportedHandleTypes : unsigned int
@@ -13057,7 +13072,7 @@ cdef class cudaDeviceProp:
     {{if 'cudaDeviceProp.deviceNumaConfig' in found_struct}}
     deviceNumaConfig : int
         NUMA configuration of a device: value is of type
-        cudaDeviceNumaConfig enum
+        :py:obj:`~.cudaDeviceNumaConfig` enum
     {{endif}}
     {{if 'cudaDeviceProp.deviceNumaId' in found_struct}}
     deviceNumaId : int
@@ -14773,7 +14788,7 @@ cdef class cudaExternalMemoryHandleDesc:
     Attributes
     ----------
     {{if 'cudaExternalMemoryHandleDesc.type' in found_struct}}
-    type : cudaExternalMemoryHandleType
+    type : :py:obj:`~.cudaExternalMemoryHandleType`
         Type of the handle
     {{endif}}
     {{if 'cudaExternalMemoryHandleDesc.handle' in found_struct}}
@@ -14786,7 +14801,8 @@ cdef class cudaExternalMemoryHandleDesc:
     {{endif}}
     {{if 'cudaExternalMemoryHandleDesc.flags' in found_struct}}
     flags : unsigned int
-        Flags must either be zero or cudaExternalMemoryDedicated
+        Flags must either be zero or
+        :py:obj:`~.cudaExternalMemoryDedicated`
     {{endif}}
     {{if 'cudaExternalMemoryHandleDesc.reserved' in found_struct}}
     reserved : list[unsigned int]
@@ -15009,17 +15025,17 @@ cdef class cudaExternalMemoryMipmappedArrayDesc:
         chain is.
     {{endif}}
     {{if 'cudaExternalMemoryMipmappedArrayDesc.formatDesc' in found_struct}}
-    formatDesc : cudaChannelFormatDesc
+    formatDesc : :py:obj:`~.cudaChannelFormatDesc`
         Format of base level of the mipmap chain
     {{endif}}
     {{if 'cudaExternalMemoryMipmappedArrayDesc.extent' in found_struct}}
-    extent : cudaExtent
+    extent : :py:obj:`~.cudaExtent`
         Dimensions of base level of the mipmap chain
     {{endif}}
     {{if 'cudaExternalMemoryMipmappedArrayDesc.flags' in found_struct}}
     flags : unsigned int
         Flags associated with CUDA mipmapped arrays. See
-        cudaMallocMipmappedArray
+        :func:`~.cudaMallocMipmappedArray`
     {{endif}}
     {{if 'cudaExternalMemoryMipmappedArrayDesc.numLevels' in found_struct}}
     numLevels : unsigned int
@@ -15304,7 +15320,7 @@ cdef class cudaExternalSemaphoreHandleDesc:
     Attributes
     ----------
     {{if 'cudaExternalSemaphoreHandleDesc.type' in found_struct}}
-    type : cudaExternalSemaphoreHandleType
+    type : :py:obj:`~.cudaExternalSemaphoreHandleType`
         Type of the handle
     {{endif}}
     {{if 'cudaExternalSemaphoreHandleDesc.handle' in found_struct}}
@@ -15684,14 +15700,16 @@ cdef class cudaExternalSemaphoreSignalParams:
     {{endif}}
     {{if 'cudaExternalSemaphoreSignalParams.flags' in found_struct}}
     flags : unsigned int
-        Only when cudaExternalSemaphoreSignalParams is used to signal a
-        cudaExternalSemaphore_t of type
-        cudaExternalSemaphoreHandleTypeNvSciSync, the valid flag is
-        cudaExternalSemaphoreSignalSkipNvSciBufMemSync: which indicates
-        that while signaling the cudaExternalSemaphore_t, no memory
-        synchronization operations should be performed for any external
-        memory object imported as cudaExternalMemoryHandleTypeNvSciBuf. For
-        all other types of cudaExternalSemaphore_t, flags must be zero.
+        Only when :py:obj:`~.cudaExternalSemaphoreSignalParams` is used to
+        signal a :py:obj:`~.cudaExternalSemaphore_t` of type
+        :py:obj:`~.cudaExternalSemaphoreHandleTypeNvSciSync`, the valid
+        flag is :py:obj:`~.cudaExternalSemaphoreSignalSkipNvSciBufMemSync`:
+        which indicates that while signaling the
+        :py:obj:`~.cudaExternalSemaphore_t`, no memory synchronization
+        operations should be performed for any external memory object
+        imported as :py:obj:`~.cudaExternalMemoryHandleTypeNvSciBuf`. For
+        all other types of :py:obj:`~.cudaExternalSemaphore_t`, flags must
+        be zero.
     {{endif}}
     {{if 'cudaExternalSemaphoreSignalParams.reserved' in found_struct}}
     reserved : list[unsigned int]
@@ -16064,14 +16082,16 @@ cdef class cudaExternalSemaphoreWaitParams:
     {{endif}}
     {{if 'cudaExternalSemaphoreWaitParams.flags' in found_struct}}
     flags : unsigned int
-        Only when cudaExternalSemaphoreSignalParams is used to signal a
-        cudaExternalSemaphore_t of type
-        cudaExternalSemaphoreHandleTypeNvSciSync, the valid flag is
-        cudaExternalSemaphoreSignalSkipNvSciBufMemSync: which indicates
-        that while waiting for the cudaExternalSemaphore_t, no memory
-        synchronization operations should be performed for any external
-        memory object imported as cudaExternalMemoryHandleTypeNvSciBuf. For
-        all other types of cudaExternalSemaphore_t, flags must be zero.
+        Only when :py:obj:`~.cudaExternalSemaphoreWaitParams` is used to
+        wait on a :py:obj:`~.cudaExternalSemaphore_t` of type
+        :py:obj:`~.cudaExternalSemaphoreHandleTypeNvSciSync`, the valid
+        flag is :py:obj:`~.cudaExternalSemaphoreWaitSkipNvSciBufMemSync`:
+        which indicates that while waiting for the
+        :py:obj:`~.cudaExternalSemaphore_t`, no memory synchronization
+        operations should be performed for any external memory object
+        imported as :py:obj:`~.cudaExternalMemoryHandleTypeNvSciBuf`. For
+        all other types of :py:obj:`~.cudaExternalSemaphore_t`, flags must
+        be zero.
     {{endif}}
     {{if 'cudaExternalSemaphoreWaitParams.reserved' in found_struct}}
     reserved : list[unsigned int]
@@ -16174,7 +16194,7 @@ cdef class cudaDevSmResource:
     {{if 'cudaDevSmResource.flags' in found_struct}}
     flags : unsigned int
         The flags set on this SM resource. For available flags see
-        cudaDevSmResourceGroup_flags.
+        :py:obj:`~.cudaDevSmResourceGroup_flags`.
     {{endif}}
 
     Methods
@@ -16273,7 +16293,7 @@ cdef class cudaDevWorkqueueConfigResource:
         The expected maximum number of concurrent stream-ordered workloads
     {{endif}}
     {{if 'cudaDevWorkqueueConfigResource.sharingScope' in found_struct}}
-    sharingScope : cudaDevWorkqueueConfigScope
+    sharingScope : :py:obj:`~.cudaDevWorkqueueConfigScope`
         The sharing scope for the workqueue resources
     {{endif}}
 
@@ -16419,8 +16439,8 @@ cdef class cudaDevSmResourceGroupParams_st:
     {{endif}}
     {{if 'cudaDevSmResourceGroupParams_st.flags' in found_struct}}
     flags : unsigned int
-        Combination of ``cudaDevSmResourceGroup_flags`` values to indicate
-        this this group is created.
+        Combination of :py:obj:`~.cudaDevSmResourceGroup_flags` values to
+        indicate how this group is created.
     {{endif}}
     {{if 'cudaDevSmResourceGroupParams_st.reserved' in found_struct}}
     reserved : list[unsigned int]
@@ -16530,20 +16550,22 @@ cdef class cudaDevResource_st:
     union structcudaDevSmResourcesm;
     structcudaDevWorkqueueConfigResourcewqConfig;
     structcudaDevWorkqueueResourcewq; ; ;  - If ``typename`` is
-    ``cudaDevResourceTypeInvalid``, this resoure is not valid and
-    cannot be further accessed.    - If ``typename`` is
-    ``cudaDevResourceTypeSm``, the cudaDevSmResource structure ``sm``
-    is filled in. For example, ``sm.smCount`` will reflect the amount
-    of streaming multiprocessors available in this resource.    - If
-    ``typename`` is ``cudaDevResourceTypeWorkqueueConfig``, the
-    cudaDevWorkqueueConfigResource structure ``wqConfig`` is filled in.
-    - If ``typename`` is ``cudaDevResourceTypeWorkqueue``, the
-    cudaDevWorkqueueResource structure ``wq`` is filled in.
+    :py:obj:`~.cudaDevResourceTypeInvalid`, this resource is not valid
+    and cannot be further accessed.    - If ``typename`` is
+    :py:obj:`~.cudaDevResourceTypeSm`, the
+    :py:obj:`~.cudaDevSmResource` structure ``sm`` is filled in. For
+    example, ``sm.smCount`` will reflect the amount of streaming
+    multiprocessors available in this resource.    - If ``typename`` is
+    :py:obj:`~.cudaDevResourceTypeWorkqueueConfig`, the
+    :py:obj:`~.cudaDevWorkqueueConfigResource` structure ``wqConfig``
+    is filled in.    - If ``typename`` is
+    :py:obj:`~.cudaDevResourceTypeWorkqueue`, the
+    :py:obj:`~.cudaDevWorkqueueResource` structure ``wq`` is filled in.
 
     Attributes
     ----------
     {{if 'cudaDevResource_st.type' in found_struct}}
-    type : cudaDevResourceType
+    type : :py:obj:`~.cudaDevResourceType`
         Type of resource, dictates which union field was last set
     {{endif}}
     {{if 'cudaDevResource_st._internal_padding' in found_struct}}
@@ -16551,17 +16573,18 @@ cdef class cudaDevResource_st:
 
     {{endif}}
     {{if 'cudaDevResource_st.sm' in found_struct}}
-    sm : cudaDevSmResource
-        Resource corresponding to cudaDevResourceTypeSm ``typename``.
+    sm : :py:obj:`~.cudaDevSmResource`
+        Resource corresponding to :py:obj:`~.cudaDevResourceTypeSm`
+        ``typename``.
     {{endif}}
     {{if 'cudaDevResource_st.wqConfig' in found_struct}}
-    wqConfig : cudaDevWorkqueueConfigResource
-        Resource corresponding to cudaDevResourceTypeWorkqueueConfig
-        ``typename``.
+    wqConfig : :py:obj:`~.cudaDevWorkqueueConfigResource`
+        Resource corresponding to
+        :py:obj:`~.cudaDevResourceTypeWorkqueueConfig` ``typename``.
     {{endif}}
     {{if 'cudaDevResource_st.wq' in found_struct}}
-    wq : cudaDevWorkqueueResource
-        Resource corresponding to cudaDevResourceTypeWorkqueue
+    wq : :py:obj:`~.cudaDevWorkqueueResource`
+        Resource corresponding to :py:obj:`~.cudaDevResourceTypeWorkqueue`
         ``typename``.
     {{endif}}
     {{if 'cudaDevResource_st._oversize' in found_struct}}
@@ -16569,7 +16592,7 @@ cdef class cudaDevResource_st:
 
     {{endif}}
     {{if 'cudaDevResource_st.nextResource' in found_struct}}
-    nextResource : cudaDevResource_st
+    nextResource : :py:obj:`~.cudaDevResource_st`
 
     {{endif}}
 
@@ -16849,11 +16872,11 @@ cdef class cudaKernelNodeParams:
         Kernel to launch
     {{endif}}
     {{if 'cudaKernelNodeParams.gridDim' in found_struct}}
-    gridDim : dim3
+    gridDim : :py:obj:`~.dim3`
         Grid dimensions
     {{endif}}
     {{if 'cudaKernelNodeParams.blockDim' in found_struct}}
-    blockDim : dim3
+    blockDim : :py:obj:`~.dim3`
         Block dimensions
     {{endif}}
     {{if 'cudaKernelNodeParams.sharedMemBytes' in found_struct}}
@@ -16997,19 +17020,19 @@ cdef class cudaKernelNodeParamsV2:
         functionType = cudaKernelFucntionTypeDevice
     {{endif}}
     {{if 'cudaKernelNodeParamsV2.kern' in found_struct}}
-    kern : cudaKernel_t
+    kern : :py:obj:`~.cudaKernel_t`
         functionType = cudaKernelFucntionTypeKernel
     {{endif}}
     {{if 'cudaKernelNodeParamsV2.cuFunc' in found_struct}}
-    cuFunc : cudaFunction_t
+    cuFunc : :py:obj:`~.cudaFunction_t`
         functionType = cudaKernelFucntionTypeFunction
     {{endif}}
     {{if 'cudaKernelNodeParamsV2.gridDim' in found_struct}}
-    gridDim : dim3
+    gridDim : :py:obj:`~.dim3`
         Grid dimensions
     {{endif}}
     {{if 'cudaKernelNodeParamsV2.blockDim' in found_struct}}
-    blockDim : dim3
+    blockDim : :py:obj:`~.dim3`
         Block dimensions
     {{endif}}
     {{if 'cudaKernelNodeParamsV2.sharedMemBytes' in found_struct}}
@@ -17025,12 +17048,12 @@ cdef class cudaKernelNodeParamsV2:
         Pointer to kernel arguments in the "extra" format
     {{endif}}
     {{if 'cudaKernelNodeParamsV2.ctx' in found_struct}}
-    ctx : cudaExecutionContext_t
+    ctx : :py:obj:`~.cudaExecutionContext_t`
         Context in which to run the kernel. If NULL will try to use the
         current context.
     {{endif}}
     {{if 'cudaKernelNodeParamsV2.functionType' in found_struct}}
-    functionType : cudaKernelFunctionType
+    functionType : :py:obj:`~.cudaKernelFunctionType`
         Type of handle passed in the func/kern/cuFunc union above
     {{endif}}
 
@@ -17252,11 +17275,11 @@ cdef class cudaExternalSemaphoreSignalNodeParams:
     Attributes
     ----------
     {{if 'cudaExternalSemaphoreSignalNodeParams.extSemArray' in found_struct}}
-    extSemArray : cudaExternalSemaphore_t
+    extSemArray : :py:obj:`~.cudaExternalSemaphore_t`
         Array of external semaphore handles.
     {{endif}}
     {{if 'cudaExternalSemaphoreSignalNodeParams.paramsArray' in found_struct}}
-    paramsArray : cudaExternalSemaphoreSignalParams
+    paramsArray : :py:obj:`~.cudaExternalSemaphoreSignalParams`
         Array of external semaphore signal parameters.
     {{endif}}
     {{if 'cudaExternalSemaphoreSignalNodeParams.numExtSems' in found_struct}}
@@ -17381,11 +17404,11 @@ cdef class cudaExternalSemaphoreSignalNodeParamsV2:
     Attributes
     ----------
     {{if 'cudaExternalSemaphoreSignalNodeParamsV2.extSemArray' in found_struct}}
-    extSemArray : cudaExternalSemaphore_t
+    extSemArray : :py:obj:`~.cudaExternalSemaphore_t`
         Array of external semaphore handles.
     {{endif}}
     {{if 'cudaExternalSemaphoreSignalNodeParamsV2.paramsArray' in found_struct}}
-    paramsArray : cudaExternalSemaphoreSignalParams
+    paramsArray : :py:obj:`~.cudaExternalSemaphoreSignalParams`
         Array of external semaphore signal parameters.
     {{endif}}
     {{if 'cudaExternalSemaphoreSignalNodeParamsV2.numExtSems' in found_struct}}
@@ -17510,11 +17533,11 @@ cdef class cudaExternalSemaphoreWaitNodeParams:
     Attributes
     ----------
     {{if 'cudaExternalSemaphoreWaitNodeParams.extSemArray' in found_struct}}
-    extSemArray : cudaExternalSemaphore_t
+    extSemArray : :py:obj:`~.cudaExternalSemaphore_t`
         Array of external semaphore handles.
     {{endif}}
     {{if 'cudaExternalSemaphoreWaitNodeParams.paramsArray' in found_struct}}
-    paramsArray : cudaExternalSemaphoreWaitParams
+    paramsArray : :py:obj:`~.cudaExternalSemaphoreWaitParams`
         Array of external semaphore wait parameters.
     {{endif}}
     {{if 'cudaExternalSemaphoreWaitNodeParams.numExtSems' in found_struct}}
@@ -17639,11 +17662,11 @@ cdef class cudaExternalSemaphoreWaitNodeParamsV2:
     Attributes
     ----------
     {{if 'cudaExternalSemaphoreWaitNodeParamsV2.extSemArray' in found_struct}}
-    extSemArray : cudaExternalSemaphore_t
+    extSemArray : :py:obj:`~.cudaExternalSemaphore_t`
         Array of external semaphore handles.
     {{endif}}
     {{if 'cudaExternalSemaphoreWaitNodeParamsV2.paramsArray' in found_struct}}
-    paramsArray : cudaExternalSemaphoreWaitParams
+    paramsArray : :py:obj:`~.cudaExternalSemaphoreWaitParams`
         Array of external semaphore wait parameters.
     {{endif}}
     {{if 'cudaExternalSemaphoreWaitNodeParamsV2.numExtSems' in found_struct}}
@@ -17768,22 +17791,23 @@ cdef class cudaConditionalNodeParams:
     Attributes
     ----------
     {{if 'cudaConditionalNodeParams.handle' in found_struct}}
-    handle : cudaGraphConditionalHandle
+    handle : :py:obj:`~.cudaGraphConditionalHandle`
         Conditional node handle. Handles must be created in advance of
-        creating the node using cudaGraphConditionalHandleCreate.
+        creating the node using :func:`~.cudaGraphConditionalHandleCreate`.
     {{endif}}
     {{if 'cudaConditionalNodeParams.type' in found_struct}}
-    type : cudaGraphConditionalNodeType
+    type : :py:obj:`~.cudaGraphConditionalNodeType`
         Type of conditional node.
     {{endif}}
     {{if 'cudaConditionalNodeParams.size' in found_struct}}
     size : unsigned int
         Size of graph output array. Allowed values are 1 for
-        cudaGraphCondTypeWhile, 1 or 2 for cudaGraphCondTypeIf, or any
-        value greater than zero for cudaGraphCondTypeSwitch.
+        :py:obj:`~.cudaGraphCondTypeWhile`, 1 or 2 for
+        :py:obj:`~.cudaGraphCondTypeIf`, or any value greater than zero for
+        :py:obj:`~.cudaGraphCondTypeSwitch`.
     {{endif}}
     {{if 'cudaConditionalNodeParams.phGraph_out' in found_struct}}
-    phGraph_out : cudaGraph_t
+    phGraph_out : :py:obj:`~.cudaGraph_t`
         CUDA-owned array populated with conditional node child graphs
         during creation of the node. Valid for the lifetime of the
         conditional node. The contents of the graph(s) are subject to the
@@ -17793,16 +17817,18 @@ cdef class cudaConditionalNodeParams:
         - All kernels, including kernels in nested conditionals or child
         graphs at any level, must belong to the same CUDA context.
         These graphs may be populated using graph node creation APIs or
-        cudaStreamBeginCaptureToGraph. cudaGraphCondTypeIf: phGraph_out[0]
-        is executed when the condition is non-zero. If ``size`` == 2,
-        phGraph_out[1] will be executed when the condition is zero.
-        cudaGraphCondTypeWhile: phGraph_out[0] is executed as long as the
-        condition is non-zero. cudaGraphCondTypeSwitch: phGraph_out[n] is
-        executed when the condition is equal to n. If the condition >=
-        ``size``, no body graph is executed.
+        :func:`~.cudaStreamBeginCaptureToGraph`.
+        :py:obj:`~.cudaGraphCondTypeIf`: phGraph_out[0] is executed when
+        the condition is non-zero. If ``size`` == 2, phGraph_out[1] will be
+        executed when the condition is zero.
+        :py:obj:`~.cudaGraphCondTypeWhile`: phGraph_out[0] is executed as
+        long as the condition is non-zero.
+        :py:obj:`~.cudaGraphCondTypeSwitch`: phGraph_out[n] is executed
+        when the condition is equal to n. If the condition >= ``size``, no
+        body graph is executed.
     {{endif}}
     {{if 'cudaConditionalNodeParams.ctx' in found_struct}}
-    ctx : cudaExecutionContext_t
+    ctx : :py:obj:`~.cudaExecutionContext_t`
         CUDA Execution Context
     {{endif}}
 
@@ -17931,7 +17957,7 @@ cdef class cudaChildGraphNodeParams:
     Attributes
     ----------
     {{if 'cudaChildGraphNodeParams.graph' in found_struct}}
-    graph : cudaGraph_t
+    graph : :py:obj:`~.cudaGraph_t`
         The child graph to clone into the node for node creation, or a
         handle to the graph owned by the node for node query. The graph
         must not contain conditional nodes. Graphs containing memory
@@ -17939,7 +17965,7 @@ cdef class cudaChildGraphNodeParams:
         to the parent.
     {{endif}}
     {{if 'cudaChildGraphNodeParams.ownership' in found_struct}}
-    ownership : cudaGraphChildGraphNodeOwnership
+    ownership : :py:obj:`~.cudaGraphChildGraphNodeOwnership`
         The ownership relationship of the child graph node.
     {{endif}}
 
@@ -18015,7 +18041,7 @@ cdef class cudaEventRecordNodeParams:
     Attributes
     ----------
     {{if 'cudaEventRecordNodeParams.event' in found_struct}}
-    event : cudaEvent_t
+    event : :py:obj:`~.cudaEvent_t`
         The event to record when the node executes
     {{endif}}
 
@@ -18077,7 +18103,7 @@ cdef class cudaEventWaitNodeParams:
     Attributes
     ----------
     {{if 'cudaEventWaitNodeParams.event' in found_struct}}
-    event : cudaEvent_t
+    event : :py:obj:`~.cudaEvent_t`
         The event to wait on from the node
     {{endif}}
 
@@ -18134,12 +18160,12 @@ cdef class cudaEventWaitNodeParams:
 
 cdef class cudaGraphNodeParams:
     """
-    Graph node parameters. See cudaGraphAddNode.
+    Graph node parameters. See :func:`~.cudaGraphAddNode`.
 
     Attributes
     ----------
     {{if 'cudaGraphNodeParams.type' in found_struct}}
-    type : cudaGraphNodeType
+    type : :py:obj:`~.cudaGraphNodeType`
         Type of the node
     {{endif}}
     {{if 'cudaGraphNodeParams.reserved0' in found_struct}}
@@ -18151,51 +18177,51 @@ cdef class cudaGraphNodeParams:
         Padding. Unused bytes must be zero.
     {{endif}}
     {{if 'cudaGraphNodeParams.kernel' in found_struct}}
-    kernel : cudaKernelNodeParamsV2
+    kernel : :py:obj:`~.cudaKernelNodeParamsV2`
         Kernel node parameters.
     {{endif}}
     {{if 'cudaGraphNodeParams.memcpy' in found_struct}}
-    memcpy : cudaMemcpyNodeParams
+    memcpy : :py:obj:`~.cudaMemcpyNodeParams`
         Memcpy node parameters.
     {{endif}}
     {{if 'cudaGraphNodeParams.memset' in found_struct}}
-    memset : cudaMemsetParamsV2
+    memset : :py:obj:`~.cudaMemsetParamsV2`
         Memset node parameters.
     {{endif}}
     {{if 'cudaGraphNodeParams.host' in found_struct}}
-    host : cudaHostNodeParamsV2
+    host : :py:obj:`~.cudaHostNodeParamsV2`
         Host node parameters.
     {{endif}}
     {{if 'cudaGraphNodeParams.graph' in found_struct}}
-    graph : cudaChildGraphNodeParams
+    graph : :py:obj:`~.cudaChildGraphNodeParams`
         Child graph node parameters.
     {{endif}}
     {{if 'cudaGraphNodeParams.eventWait' in found_struct}}
-    eventWait : cudaEventWaitNodeParams
+    eventWait : :py:obj:`~.cudaEventWaitNodeParams`
         Event wait node parameters.
     {{endif}}
     {{if 'cudaGraphNodeParams.eventRecord' in found_struct}}
-    eventRecord : cudaEventRecordNodeParams
+    eventRecord : :py:obj:`~.cudaEventRecordNodeParams`
         Event record node parameters.
     {{endif}}
     {{if 'cudaGraphNodeParams.extSemSignal' in found_struct}}
-    extSemSignal : cudaExternalSemaphoreSignalNodeParamsV2
+    extSemSignal : :py:obj:`~.cudaExternalSemaphoreSignalNodeParamsV2`
         External semaphore signal node parameters.
     {{endif}}
     {{if 'cudaGraphNodeParams.extSemWait' in found_struct}}
-    extSemWait : cudaExternalSemaphoreWaitNodeParamsV2
+    extSemWait : :py:obj:`~.cudaExternalSemaphoreWaitNodeParamsV2`
         External semaphore wait node parameters.
     {{endif}}
     {{if 'cudaGraphNodeParams.alloc' in found_struct}}
-    alloc : cudaMemAllocNodeParamsV2
+    alloc : :py:obj:`~.cudaMemAllocNodeParamsV2`
         Memory allocation node parameters.
     {{endif}}
     {{if 'cudaGraphNodeParams.free' in found_struct}}
-    free : cudaMemFreeNodeParams
+    free : :py:obj:`~.cudaMemFreeNodeParams`
         Memory free node parameters.
     {{endif}}
     {{if 'cudaGraphNodeParams.conditional' in found_struct}}
-    conditional : cudaConditionalNodeParams
+    conditional : :py:obj:`~.cudaConditionalNodeParams`
         Conditional node parameters.
     {{endif}}
     {{if 'cudaGraphNodeParams.reserved2' in found_struct}}
@@ -18494,8 +18520,8 @@ cdef class cudaGraphEdgeData_st:
     """
     Optional annotation for edges in a CUDA graph. Note, all edges
     implicitly have annotations and default to a zero-initialized value
-    if not specified. A zero-initialized struct indicates a standard
-    full serialization of two nodes with memory visibility.
+    if not specified. A zero-initialized struct indicates a
+    standard full serialization of two nodes with memory visibility.
 
     Attributes
     ----------
@@ -18507,9 +18533,9 @@ cdef class cudaGraphEdgeData_st:
         memory visibility to the downstream node or portion thereof
         (indicated by ``to_port``).   Only kernel nodes define non-zero
         ports. A kernel node can use the following output port types:
-        cudaGraphKernelNodePortDefault,
-        cudaGraphKernelNodePortProgrammatic, or
-        cudaGraphKernelNodePortLaunchCompletion.
+        :py:obj:`~.cudaGraphKernelNodePortDefault`,
+        :py:obj:`~.cudaGraphKernelNodePortProgrammatic`, or
+        :py:obj:`~.cudaGraphKernelNodePortLaunchCompletion`.
     {{endif}}
     {{if 'cudaGraphEdgeData_st.to_port' in found_struct}}
     to_port : bytes
@@ -18522,9 +18548,10 @@ cdef class cudaGraphEdgeData_st:
     {{endif}}
     {{if 'cudaGraphEdgeData_st.type' in found_struct}}
     type : bytes
-        This should be populated with a value from cudaGraphDependencyType.
-        (It is typed as char due to compiler-specific layout of bitfields.)
-        See cudaGraphDependencyType.
+        This should be populated with a value from
+        :py:obj:`~.cudaGraphDependencyType`. (It is typed as char due to
+        compiler-specific layout of bitfields.) See
+        :py:obj:`~.cudaGraphDependencyType`.
     {{endif}}
     {{if 'cudaGraphEdgeData_st.reserved' in found_struct}}
     reserved : bytes
@@ -18627,15 +18654,15 @@ cdef class cudaGraphInstantiateParams_st:
         Instantiation flags
     {{endif}}
     {{if 'cudaGraphInstantiateParams_st.uploadStream' in found_struct}}
-    uploadStream : cudaStream_t
+    uploadStream : :py:obj:`~.cudaStream_t`
         Upload stream
     {{endif}}
     {{if 'cudaGraphInstantiateParams_st.errNode_out' in found_struct}}
-    errNode_out : cudaGraphNode_t
+    errNode_out : :py:obj:`~.cudaGraphNode_t`
         The node which caused instantiation to fail, if any
     {{endif}}
     {{if 'cudaGraphInstantiateParams_st.result_out' in found_struct}}
-    result_out : cudaGraphInstantiateResult
+    result_out : :py:obj:`~.cudaGraphInstantiateResult`
         Whether instantiation was successful. If it failed, the reason why
     {{endif}}
 
@@ -18746,22 +18773,22 @@ cdef class cudaGraphInstantiateParams_st:
 
 cdef class cudaGraphExecUpdateResultInfo_st:
     """
-    Result information returned by cudaGraphExecUpdate
+    Result information returned by :func:`~.cudaGraphExecUpdate`
 
     Attributes
     ----------
     {{if 'cudaGraphExecUpdateResultInfo_st.result' in found_struct}}
-    result : cudaGraphExecUpdateResult
+    result : :py:obj:`~.cudaGraphExecUpdateResult`
         Gives more specific detail when a cuda graph update fails.
     {{endif}}
     {{if 'cudaGraphExecUpdateResultInfo_st.errorNode' in found_struct}}
-    errorNode : cudaGraphNode_t
+    errorNode : :py:obj:`~.cudaGraphNode_t`
         The "to node" of the error edge when the topologies do not match.
         The error node when the error is associated with a specific node.
         NULL when the error is generic.
     {{endif}}
     {{if 'cudaGraphExecUpdateResultInfo_st.errorFromNode' in found_struct}}
-    errorFromNode : cudaGraphNode_t
+    errorFromNode : :py:obj:`~.cudaGraphNode_t`
         The from node of error edge when the topologies do not match.
         Otherwise NULL.
     {{endif}}
@@ -18945,7 +18972,7 @@ cdef class anon_union10:
     Attributes
     ----------
     {{if 'cudaGraphKernelNodeUpdate.updateData.gridDim' in found_struct}}
-    gridDim : dim3
+    gridDim : :py:obj:`~.dim3`
 
     {{endif}}
     {{if 'cudaGraphKernelNodeUpdate.updateData.param' in found_struct}}
@@ -19036,11 +19063,11 @@ cdef class cudaGraphKernelNodeUpdate:
     Attributes
     ----------
     {{if 'cudaGraphKernelNodeUpdate.node' in found_struct}}
-    node : cudaGraphDeviceNode_t
+    node : :py:obj:`~.cudaGraphDeviceNode_t`
         Node to update
     {{endif}}
     {{if 'cudaGraphKernelNodeUpdate.field' in found_struct}}
-    field : cudaGraphKernelNodeField
+    field : :py:obj:`~.cudaGraphKernelNodeField`
         Which type of update to apply. Determines how updateData is
         interpreted
     {{endif}}
@@ -19135,13 +19162,15 @@ cdef class cudaGraphKernelNodeUpdate:
 
 cdef class cudaLaunchMemSyncDomainMap_st:
     """
-    Memory Synchronization Domain map  See cudaLaunchMemSyncDomain.  By
-    default, kernels are launched in domain 0. Kernel launched with
-    cudaLaunchMemSyncDomainRemote will have a different domain ID. User
-    may also alter the domain ID with cudaLaunchMemSyncDomainMap for a
-    specific stream / graph node / kernel launch. See
-    cudaLaunchAttributeMemSyncDomainMap.  Domain ID range is available
-    through cudaDevAttrMemSyncDomainCount.
+    Memory Synchronization Domain map. See
+    :py:obj:`~.cudaLaunchMemSyncDomain`.  By default, kernels are
+    launched in domain 0. Kernel launched with
+    :py:obj:`~.cudaLaunchMemSyncDomainRemote` will have a different
+    domain ID. User may also alter the domain ID with
+    :py:obj:`~.cudaLaunchMemSyncDomainMap` for a specific stream /
+    graph node / kernel launch. See
+    :py:obj:`~.cudaLaunchAttributeMemSyncDomainMap`.  Domain ID range
+    is available through :py:obj:`~.cudaDevAttrMemSyncDomainCount`.
 
     Attributes
     ----------
@@ -19294,7 +19323,7 @@ cdef class anon_struct18:
     Attributes
     ----------
     {{if 'cudaLaunchAttributeValue.programmaticEvent.event' in found_struct}}
-    event : cudaEvent_t
+    event : :py:obj:`~.cudaEvent_t`
 
     {{endif}}
     {{if 'cudaLaunchAttributeValue.programmaticEvent.flags' in found_struct}}
@@ -19470,7 +19499,7 @@ cdef class anon_struct20:
     Attributes
     ----------
     {{if 'cudaLaunchAttributeValue.launchCompletionEvent.event' in found_struct}}
-    event : cudaEvent_t
+    event : :py:obj:`~.cudaEvent_t`
 
     {{endif}}
     {{if 'cudaLaunchAttributeValue.launchCompletionEvent.flags' in found_struct}}
@@ -19550,7 +19579,7 @@ cdef class anon_struct21:
 
     {{endif}}
     {{if 'cudaLaunchAttributeValue.deviceUpdatableKernelNode.devNode' in found_struct}}
-    devNode : cudaGraphDeviceNode_t
+    devNode : :py:obj:`~.cudaGraphDeviceNode_t`
 
     {{endif}}
 
@@ -19619,7 +19648,8 @@ cdef class anon_struct21:
 
 cdef class cudaLaunchAttributeValue:
     """
-    Launch attributes union; used as value field of cudaLaunchAttribute
+    Launch attributes union; used as value field of
+    :py:obj:`~.cudaLaunchAttribute`
 
     Attributes
     ----------
@@ -19628,116 +19658,126 @@ cdef class cudaLaunchAttributeValue:
 
     {{endif}}
     {{if 'cudaLaunchAttributeValue.accessPolicyWindow' in found_struct}}
-    accessPolicyWindow : cudaAccessPolicyWindow
-        Value of launch attribute cudaLaunchAttributeAccessPolicyWindow.
+    accessPolicyWindow : :py:obj:`~.cudaAccessPolicyWindow`
+        Value of launch attribute
+        :py:obj:`~.cudaLaunchAttributeAccessPolicyWindow`.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.cooperative' in found_struct}}
     cooperative : int
-        Value of launch attribute cudaLaunchAttributeCooperative. Nonzero
-        indicates a cooperative kernel (see cudaLaunchCooperativeKernel).
+        Value of launch attribute
+        :py:obj:`~.cudaLaunchAttributeCooperative`. Nonzero indicates a
+        cooperative kernel (see :func:`~.cudaLaunchCooperativeKernel`).
     {{endif}}
     {{if 'cudaLaunchAttributeValue.syncPolicy' in found_struct}}
-    syncPolicy : cudaSynchronizationPolicy
-        Value of launch attribute cudaLaunchAttributeSynchronizationPolicy.
-        cudaSynchronizationPolicy for work queued up in this stream.
+    syncPolicy : :py:obj:`~.cudaSynchronizationPolicy`
+        Value of launch attribute
+        :py:obj:`~.cudaLaunchAttributeSynchronizationPolicy`.
+        :py:obj:`~.cudaSynchronizationPolicy` for work queued up in this
+        stream.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.clusterDim' in found_struct}}
     clusterDim : anon_struct17
-        Value of launch attribute cudaLaunchAttributeClusterDimension that
-        represents the desired cluster dimensions for the kernel. Opaque
-        type with the following fields: - ``x`` - The X dimension of the
-        cluster, in blocks. Must be a divisor of the grid X dimension.    -
-        ``y`` - The Y dimension of the cluster, in blocks. Must be a
-        divisor of the grid Y dimension.    - ``z`` - The Z dimension of
-        the cluster, in blocks. Must be a divisor of the grid Z dimension.
+        Value of launch attribute
+        :py:obj:`~.cudaLaunchAttributeClusterDimension` that represents the
+        desired cluster dimensions for the kernel. Opaque type with the
+        following fields: - ``x`` - The X dimension of the cluster, in
+        blocks. Must be a divisor of the grid X dimension.    - ``y`` - The
+        Y dimension of the cluster, in blocks. Must be a divisor of the
+        grid Y dimension.    - ``z`` - The Z dimension of the cluster, in
+        blocks. Must be a divisor of the grid Z dimension.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.clusterSchedulingPolicyPreference' in found_struct}}
-    clusterSchedulingPolicyPreference : cudaClusterSchedulingPolicy
+    clusterSchedulingPolicyPreference : :py:obj:`~.cudaClusterSchedulingPolicy`
         Value of launch attribute
-        cudaLaunchAttributeClusterSchedulingPolicyPreference. Cluster
-        scheduling policy preference for the kernel.
+        :py:obj:`~.cudaLaunchAttributeClusterSchedulingPolicyPreference`.
+        Cluster scheduling policy preference for the kernel.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.programmaticStreamSerializationAllowed' in found_struct}}
     programmaticStreamSerializationAllowed : int
         Value of launch attribute
-        cudaLaunchAttributeProgrammaticStreamSerialization.
+        :py:obj:`~.cudaLaunchAttributeProgrammaticStreamSerialization`.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.programmaticEvent' in found_struct}}
     programmaticEvent : anon_struct18
-        Value of launch attribute cudaLaunchAttributeProgrammaticEvent with
-        the following fields: - ``cudaEvent_t`` event - Event to fire when
-        all blocks trigger it.    - ``int`` flags; - Event record flags,
-        see cudaEventRecordWithFlags. Does not accept
-        cudaEventRecordExternal.    - ``int`` triggerAtBlockStart - If this
-        is set to non-0, each block launch will automatically trigger the
-        event.
+        Value of launch attribute
+        :py:obj:`~.cudaLaunchAttributeProgrammaticEvent` with the following
+        fields: - :py:obj:`~.cudaEvent_t` event - Event to fire when all
+        blocks trigger it.    - ``int`` flags; - Event record flags, see
+        :func:`~.cudaEventRecordWithFlags`. Does not accept
+        :py:obj:`~.cudaEventRecordExternal`.    - ``int``
+        triggerAtBlockStart - If this is set to non-0, each block launch
+        will automatically trigger the event.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.priority' in found_struct}}
     priority : int
-        Value of launch attribute cudaLaunchAttributePriority. Execution
-        priority of the kernel.
+        Value of launch attribute :py:obj:`~.cudaLaunchAttributePriority`.
+        Execution priority of the kernel.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.memSyncDomainMap' in found_struct}}
-    memSyncDomainMap : cudaLaunchMemSyncDomainMap
-        Value of launch attribute cudaLaunchAttributeMemSyncDomainMap. See
-        cudaLaunchMemSyncDomainMap.
+    memSyncDomainMap : :py:obj:`~.cudaLaunchMemSyncDomainMap`
+        Value of launch attribute
+        :py:obj:`~.cudaLaunchAttributeMemSyncDomainMap`. See
+        :py:obj:`~.cudaLaunchMemSyncDomainMap`.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.memSyncDomain' in found_struct}}
-    memSyncDomain : cudaLaunchMemSyncDomain
-        Value of launch attribute cudaLaunchAttributeMemSyncDomain. See
-        cudaLaunchMemSyncDomain.
+    memSyncDomain : :py:obj:`~.cudaLaunchMemSyncDomain`
+        Value of launch attribute
+        :py:obj:`~.cudaLaunchAttributeMemSyncDomain`. See
+        :py:obj:`~.cudaLaunchMemSyncDomain`.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.preferredClusterDim' in found_struct}}
     preferredClusterDim : anon_struct19
         Value of launch attribute
-        cudaLaunchAttributePreferredClusterDimension that represents the
-        desired preferred cluster dimensions for the kernel. Opaque type
-        with the following fields: - ``x`` - The X dimension of the
-        preferred cluster, in blocks. Must be a divisor of the grid X
+        :py:obj:`~.cudaLaunchAttributePreferredClusterDimension` that
+        represents the desired preferred cluster dimensions for the kernel.
+        Opaque type with the following fields: - ``x`` - The X dimension of
+        the preferred cluster, in blocks. Must be a divisor of the grid X
         dimension, and must be a multiple of the ``x`` field of
-        cudaLaunchAttributeValue::clusterDim.    - ``y`` - The Y dimension
-        of the preferred cluster, in blocks. Must be a divisor of the grid
-        Y dimension, and must be a multiple of the ``y`` field of
-        cudaLaunchAttributeValue::clusterDim.    - ``z`` - The Z dimension
-        of the preferred cluster, in blocks. Must be equal to the ``z``
-        field of cudaLaunchAttributeValue::clusterDim.
+        :py:obj:`~.cudaLaunchAttributeValue.clusterDim`.    - ``y`` - The Y
+        dimension of the preferred cluster, in blocks. Must be a divisor of
+        the grid Y dimension, and must be a multiple of the ``y`` field of
+        :py:obj:`~.cudaLaunchAttributeValue.clusterDim`.    - ``z`` - The Z
+        dimension of the preferred cluster, in blocks. Must be equal to the
+        ``z`` field of :py:obj:`~.cudaLaunchAttributeValue.clusterDim`.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.launchCompletionEvent' in found_struct}}
     launchCompletionEvent : anon_struct20
-        Value of launch attribute cudaLaunchAttributeLaunchCompletionEvent
-        with the following fields: - ``cudaEvent_t`` event - Event to fire
+        Value of launch attribute
+        :py:obj:`~.cudaLaunchAttributeLaunchCompletionEvent` with the
+        following fields: - :py:obj:`~.cudaEvent_t` event - Event to fire
         when the last block launches.    - ``int`` flags - Event record
-        flags, see cudaEventRecordWithFlags. Does not accept
-        cudaEventRecordExternal.
+        flags, see :func:`~.cudaEventRecordWithFlags`. Does not accept
+        :py:obj:`~.cudaEventRecordExternal`.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.deviceUpdatableKernelNode' in found_struct}}
     deviceUpdatableKernelNode : anon_struct21
         Value of launch attribute
-        cudaLaunchAttributeDeviceUpdatableKernelNode with the following
-        fields: - ``int`` deviceUpdatable - Whether or not the resulting
-        kernel node should be device-updatable.    -
-        ``cudaGraphDeviceNode_t`` devNode - Returns a handle to pass to the
-        various device-side update functions.
+        :py:obj:`~.cudaLaunchAttributeDeviceUpdatableKernelNode` with the
+        following fields: - ``int`` deviceUpdatable - Whether or not the
+        resulting kernel node should be device-updatable.    -
+        :py:obj:`~.cudaGraphDeviceNode_t` devNode - Returns a handle to
+        pass to the various device-side update functions.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.sharedMemCarveout' in found_struct}}
     sharedMemCarveout : unsigned int
         Value of launch attribute
-        cudaLaunchAttributePreferredSharedMemoryCarveout.
+        :py:obj:`~.cudaLaunchAttributePreferredSharedMemoryCarveout`.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.nvlinkUtilCentricScheduling' in found_struct}}
     nvlinkUtilCentricScheduling : unsigned int
         Value of launch attribute
-        cudaLaunchAttributeNvlinkUtilCentricScheduling.
+        :py:obj:`~.cudaLaunchAttributeNvlinkUtilCentricScheduling`.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.portableClusterSizeMode' in found_struct}}
-    portableClusterSizeMode : cudaLaunchAttributePortableClusterMode
+    portableClusterSizeMode : :py:obj:`~.cudaLaunchAttributePortableClusterMode`
         Value of launch attribute
-        cudaLaunchAttributePortableClusterSizeMode
+        :py:obj:`~.cudaLaunchAttributePortableClusterSizeMode`.
     {{endif}}
     {{if 'cudaLaunchAttributeValue.sharedMemoryMode' in found_struct}}
-    sharedMemoryMode : cudaSharedMemoryMode
-        Value of launch attribute cudaLaunchAttributeSharedMemoryMode. See
-        cudaSharedMemoryMode for acceptable values.
+    sharedMemoryMode : :py:obj:`~.cudaSharedMemoryMode`
+        Value of launch attribute
+        :py:obj:`~.cudaLaunchAttributeSharedMemoryMode`. See
+        :py:obj:`~.cudaSharedMemoryMode` for acceptable values.
     {{endif}}
 
     Methods
@@ -20056,11 +20096,11 @@ cdef class cudaLaunchAttribute_st:
     Attributes
     ----------
     {{if 'cudaLaunchAttribute_st.id' in found_struct}}
-    id : cudaLaunchAttributeID
+    id : :py:obj:`~.cudaLaunchAttributeID`
         Attribute to set
     {{endif}}
     {{if 'cudaLaunchAttribute_st.val' in found_struct}}
-    val : cudaLaunchAttributeValue
+    val : :py:obj:`~.cudaLaunchAttributeValue`
         Value of the attribute
     {{endif}}
 
@@ -20222,7 +20262,7 @@ cdef class cudaAsyncNotificationInfo:
     Attributes
     ----------
     {{if 'cudaAsyncNotificationInfo.type' in found_struct}}
-    type : cudaAsyncNotificationType
+    type : :py:obj:`~.cudaAsyncNotificationType`
         The type of notification being sent
     {{endif}}
     {{if 'cudaAsyncNotificationInfo.info' in found_struct}}
@@ -20296,15 +20336,15 @@ cdef class cudaTextureDesc:
     Attributes
     ----------
     {{if 'cudaTextureDesc.addressMode' in found_struct}}
-    addressMode : list[cudaTextureAddressMode]
+    addressMode : list[:py:obj:`~.cudaTextureAddressMode`]
         Texture address mode for up to 3 dimensions
     {{endif}}
     {{if 'cudaTextureDesc.filterMode' in found_struct}}
-    filterMode : cudaTextureFilterMode
+    filterMode : :py:obj:`~.cudaTextureFilterMode`
         Texture filter mode
     {{endif}}
     {{if 'cudaTextureDesc.readMode' in found_struct}}
-    readMode : cudaTextureReadMode
+    readMode : :py:obj:`~.cudaTextureReadMode`
         Texture read mode
     {{endif}}
     {{if 'cudaTextureDesc.sRGB' in found_struct}}
@@ -20324,7 +20364,7 @@ cdef class cudaTextureDesc:
         Limit to the anisotropy ratio
     {{endif}}
     {{if 'cudaTextureDesc.mipmapFilterMode' in found_struct}}
-    mipmapFilterMode : cudaTextureFilterMode
+    mipmapFilterMode : :py:obj:`~.cudaTextureFilterMode`
         Mipmap filter mode
     {{endif}}
     {{if 'cudaTextureDesc.mipmapLevelBias' in found_struct}}
@@ -20563,7 +20603,7 @@ cdef class cudaGraphRecaptureCallbackData:
     Attributes
     ----------
     {{if 'cudaGraphRecaptureCallbackData.callbackFunc' in found_struct}}
-    callbackFunc : cudaGraphRecaptureCallback_t
+    callbackFunc : :py:obj:`~.cudaGraphRecaptureCallback_t`
         Callback function that will be invoked
     {{endif}}
     {{if 'cudaGraphRecaptureCallbackData.userData' in found_struct}}
@@ -20665,7 +20705,7 @@ cdef class cudaEglPlaneDesc_st:
         Number of channels for the plane
     {{endif}}
     {{if True}}
-    channelDesc : cudaChannelFormatDesc
+    channelDesc : :py:obj:`~.cudaChannelFormatDesc`
         Channel Format Descriptor
     {{endif}}
     {{if True}}
@@ -20804,11 +20844,11 @@ cdef class anon_union12:
     Attributes
     ----------
     {{if True}}
-    pArray : list[cudaArray_t]
+    pArray : list[:py:obj:`~.cudaArray_t`]
 
     {{endif}}
     {{if True}}
-    pPitch : list[cudaPitchedPtr]
+    pPitch : list[:py:obj:`~.cudaPitchedPtr`]
 
     {{endif}}
 
@@ -20880,11 +20920,11 @@ cdef class cudaEglFrame_st:
     CUDA EGLFrame Descriptor - structure defining one frame of EGL.
     Each frame may contain one or more planes depending on whether the
     surface is Multiplanar or not. Each plane of EGLFrame is
-    represented by cudaEglPlaneDesc which is defined as:
+    represented by :py:obj:`~.cudaEglPlaneDesc` which is defined as:
     typedefstructcudaEglPlaneDesc_st unsignedintwidth;
     unsignedintheight; unsignedintdepth; unsignedintpitch;
     unsignedintnumChannels; structcudaChannelFormatDescchannelDesc;
-    unsignedintreserved[4]; cudaEglPlaneDesc;
+    unsignedintreserved[4]; :py:obj:`~.cudaEglPlaneDesc`;
 
     Attributes
     ----------
@@ -20893,19 +20933,19 @@ cdef class cudaEglFrame_st:
 
     {{endif}}
     {{if True}}
-    planeDesc : list[cudaEglPlaneDesc]
-        CUDA EGL Plane Descriptor cudaEglPlaneDesc
+    planeDesc : list[:py:obj:`~.cudaEglPlaneDesc`]
+        CUDA EGL Plane Descriptor :py:obj:`~.cudaEglPlaneDesc`
     {{endif}}
     {{if True}}
     planeCount : unsigned int
         Number of planes
     {{endif}}
     {{if True}}
-    frameType : cudaEglFrameType
+    frameType : :py:obj:`~.cudaEglFrameType`
         Array or Pitch
     {{endif}}
     {{if True}}
-    eglColorFormat : cudaEglColorFormat
+    eglColorFormat : :py:obj:`~.cudaEglColorFormat`
         CUDA EGL Color Format
     {{endif}}
 
@@ -21342,7 +21382,7 @@ def cudaDeviceReset():
     :py:obj:`~.cudaEvent_t`, :py:obj:`~.cudaArray_t`,
     :py:obj:`~.cudaMipmappedArray_t`, :py:obj:`~.cudaPitchedPtr`,
     :py:obj:`~.cudaTextureObject_t`, :py:obj:`~.cudaSurfaceObject_t`,
-    :py:obj:`~.textureReference`, :py:obj:`~.surfaceReference`,
+    ``textureReference``, ``surfaceReference``,
     :py:obj:`~.cudaExternalMemory_t`, :py:obj:`~.cudaExternalSemaphore_t`
     and :py:obj:`~.cudaGraphicsResource_t`. These resources also include
     memory allocations by :py:obj:`~.cudaMalloc`,
@@ -21357,7 +21397,7 @@ def cudaDeviceReset():
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`
 
     See Also
@@ -21390,12 +21430,12 @@ def cudaDeviceSynchronize():
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorStreamCaptureUnsupported`
 
     See Also
     --------
-    :py:obj:`~.cudaDeviceReset`, :py:obj:`~.cuCtxSynchronize`
+    :py:obj:`~.cudaDeviceReset`, :func:`~.cuCtxSynchronize`
     """
     with nogil:
         err = cyruntime.cudaDeviceSynchronize()
@@ -21422,18 +21462,17 @@ def cudaDeviceSetLimit(limit not None : cudaLimit, size_t value):
       each GPU thread.
 
     - :py:obj:`~.cudaLimitPrintfFifoSize` controls the size in bytes of the
-      shared FIFO used by the :py:obj:`~.printf()` device system call.
-      Setting :py:obj:`~.cudaLimitPrintfFifoSize` must not be performed
-      after launching any kernel that uses the :py:obj:`~.printf()` device
-      system call - in such case :py:obj:`~.cudaErrorInvalidValue` will be
-      returned.
+      shared FIFO used by the ``printf()`` device system call. Setting
+      :py:obj:`~.cudaLimitPrintfFifoSize` must not be performed after
+      launching any kernel that uses the ``printf()`` device system call -
+      in such case :py:obj:`~.cudaErrorInvalidValue` will be returned.
 
     - :py:obj:`~.cudaLimitMallocHeapSize` controls the size in bytes of the
-      heap used by the :py:obj:`~.malloc()` and :py:obj:`~.free()` device
-      system calls. Setting :py:obj:`~.cudaLimitMallocHeapSize` must not be
-      performed after launching any kernel that uses the
-      :py:obj:`~.malloc()` or :py:obj:`~.free()` device system calls - in
-      such case :py:obj:`~.cudaErrorInvalidValue` will be returned.
+      heap used by the ``malloc()`` and ``free()`` device system calls.
+      Setting :py:obj:`~.cudaLimitMallocHeapSize` must not be performed
+      after launching any kernel that uses the ``malloc()`` or ``free()``
+      device system calls - in such case :py:obj:`~.cudaErrorInvalidValue`
+      will be returned.
 
     - :py:obj:`~.cudaLimitDevRuntimeSyncDepth` controls the maximum nesting
       depth of a grid at which a thread can safely call
@@ -21492,12 +21531,12 @@ def cudaDeviceSetLimit(limit not None : cudaLimit, size_t value):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorUnsupportedLimit`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorMemoryAllocation`
 
     See Also
     --------
-    :py:obj:`~.cudaDeviceGetLimit`, :py:obj:`~.cuCtxSetLimit`
+    :py:obj:`~.cudaDeviceGetLimit`, :func:`~.cuCtxSetLimit`
     """
     cdef cyruntime.cudaLimit cylimit = int(limit)
     with nogil:
@@ -21518,11 +21557,10 @@ def cudaDeviceGetLimit(limit not None : cudaLimit):
       thread.
 
     - :py:obj:`~.cudaLimitPrintfFifoSize` is the size in bytes of the
-      shared FIFO used by the :py:obj:`~.printf()` device system call.
+      shared FIFO used by the ``printf()`` device system call.
 
     - :py:obj:`~.cudaLimitMallocHeapSize` is the size in bytes of the heap
-      used by the :py:obj:`~.malloc()` and :py:obj:`~.free()` device system
-      calls.
+      used by the ``malloc()`` and ``free()`` device system calls.
 
     - :py:obj:`~.cudaLimitDevRuntimeSyncDepth` is the maximum grid depth at
       which a thread can isssue the device runtime call
@@ -21547,14 +21585,14 @@ def cudaDeviceGetLimit(limit not None : cudaLimit):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorUnsupportedLimit`, :py:obj:`~.cudaErrorInvalidValue`
     pValue : int
         Returned size of the limit
 
     See Also
     --------
-    :py:obj:`~.cudaDeviceSetLimit`, :py:obj:`~.cuCtxGetLimit`
+    :py:obj:`~.cudaDeviceSetLimit`, :func:`~.cuCtxGetLimit`
     """
     cdef size_t pValue = 0
     cdef cyruntime.cudaLimit cylimit = int(limit)
@@ -21584,7 +21622,7 @@ def cudaDeviceGetTexture1DLinearMaxWidth(fmtDesc : Optional[cudaChannelFormatDes
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorUnsupportedLimit`, :py:obj:`~.cudaErrorInvalidValue`
     maxWidthInElements : int
         Returns maximum number of texture elements allocatable for given
@@ -21592,7 +21630,7 @@ def cudaDeviceGetTexture1DLinearMaxWidth(fmtDesc : Optional[cudaChannelFormatDes
 
     See Also
     --------
-    :py:obj:`~.cuDeviceGetTexture1DLinearMaxWidth`
+    :func:`~.cuDeviceGetTexture1DLinearMaxWidth`
     """
     cdef size_t maxWidthInElements = 0
     cdef cyruntime.cudaChannelFormatDesc* cyfmtDesc_ptr = <cyruntime.cudaChannelFormatDesc*>fmtDesc._pvt_ptr if fmtDesc is not None else NULL
@@ -21636,14 +21674,14 @@ def cudaDeviceGetCacheConfig():
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`
     pCacheConfig : :py:obj:`~.cudaFuncCache`
         Returned cache configuration
 
     See Also
     --------
-    :py:obj:`~.cudaDeviceSetCacheConfig`, :py:obj:`~.cudaFuncSetCacheConfig (C API)`, cudaFuncSetCacheConfig (C++ API), :py:obj:`~.cuCtxGetCacheConfig`
+    :py:obj:`~.cudaDeviceSetCacheConfig`, :py:obj:`~.cudaFuncSetCacheConfig` (C API), :func:`~.cudaFuncSetCacheConfig` (C++ API), :func:`~.cuCtxGetCacheConfig`
     """
     cdef cyruntime.cudaFuncCache pCacheConfig
     with nogil:
@@ -21678,7 +21716,7 @@ def cudaDeviceGetStreamPriorityRange():
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`
     leastPriority : int
         Pointer to an int in which the numerical value for least stream
@@ -21689,7 +21727,7 @@ def cudaDeviceGetStreamPriorityRange():
 
     See Also
     --------
-    :py:obj:`~.cudaStreamCreateWithPriority`, :py:obj:`~.cudaStreamGetPriority`, :py:obj:`~.cuCtxGetStreamPriorityRange`
+    :py:obj:`~.cudaStreamCreateWithPriority`, :py:obj:`~.cudaStreamGetPriority`, :func:`~.cuCtxGetStreamPriorityRange`
     """
     cdef int leastPriority = 0
     cdef int greatestPriority = 0
@@ -21712,11 +21750,12 @@ def cudaDeviceSetCacheConfig(cacheConfig not None : cudaFuncCache):
     runtime will use the requested configuration if possible, but it is
     free to choose a different configuration if required to execute the
     function. Any function preference set via
-    :py:obj:`~.cudaFuncSetCacheConfig (C API)` or cudaFuncSetCacheConfig
-    (C++ API) will be preferred over this device-wide setting. Setting the
-    device-wide cache configuration to :py:obj:`~.cudaFuncCachePreferNone`
-    will cause subsequent kernel launches to prefer to not change the cache
-    configuration unless required to launch the kernel.
+    :py:obj:`~.cudaFuncSetCacheConfig` (C API) or
+    :func:`~.cudaFuncSetCacheConfig` (C++ API) will be preferred over this
+    device-wide setting. Setting the device-wide cache configuration to
+    :py:obj:`~.cudaFuncCachePreferNone` will cause subsequent kernel
+    launches to prefer to not change the cache configuration unless
+    required to launch the kernel.
 
     This setting does nothing on devices where the size of the L1 cache and
     shared memory are fixed.
@@ -21745,12 +21784,12 @@ def cudaDeviceSetCacheConfig(cacheConfig not None : cudaFuncCache):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`
 
     See Also
     --------
-    :py:obj:`~.cudaDeviceGetCacheConfig`, :py:obj:`~.cudaFuncSetCacheConfig (C API)`, cudaFuncSetCacheConfig (C++ API), :py:obj:`~.cuCtxSetCacheConfig`
+    :py:obj:`~.cudaDeviceGetCacheConfig`, :py:obj:`~.cudaFuncSetCacheConfig` (C API), :func:`~.cudaFuncSetCacheConfig` (C++ API), :func:`~.cuCtxSetCacheConfig`
     """
     cdef cyruntime.cudaFuncCache cycacheConfig = int(cacheConfig)
     with nogil:
@@ -21776,14 +21815,14 @@ def cudaDeviceGetByPCIBusId(char* pciBusId):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidDevice`
     device : int
         Returned device ordinal
 
     See Also
     --------
-    :py:obj:`~.cudaDeviceGetPCIBusId`, :py:obj:`~.cuDeviceGetByPCIBusId`
+    :py:obj:`~.cudaDeviceGetPCIBusId`, :func:`~.cuDeviceGetByPCIBusId`
     """
     cdef int device = 0
     with nogil:
@@ -21816,14 +21855,14 @@ def cudaDeviceGetPCIBusId(int length, int device):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidDevice`
     pciBusId : bytes
         Returned identifier string for the device in the following format
 
     See Also
     --------
-    :py:obj:`~.cudaDeviceGetByPCIBusId`, :py:obj:`~.cuDeviceGetPCIBusId`
+    :py:obj:`~.cudaDeviceGetByPCIBusId`, :func:`~.cuDeviceGetPCIBusId`
     """
     pypciBusId = b" " * length
     cdef char* pciBusId = pypciBusId
@@ -21869,7 +21908,7 @@ def cudaIpcGetEventHandle(event):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidResourceHandle`, :py:obj:`~.cudaErrorMemoryAllocation`, :py:obj:`~.cudaErrorMapBufferObjectFailed`, :py:obj:`~.cudaErrorNotSupported`, :py:obj:`~.cudaErrorInvalidValue`
     handle : :py:obj:`~.cudaIpcEventHandle_t`
         Pointer to a user allocated cudaIpcEventHandle in which to return
@@ -21877,7 +21916,7 @@ def cudaIpcGetEventHandle(event):
 
     See Also
     --------
-    :py:obj:`~.cudaEventCreate`, :py:obj:`~.cudaEventDestroy`, :py:obj:`~.cudaEventSynchronize`, :py:obj:`~.cudaEventQuery`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaIpcOpenEventHandle`, :py:obj:`~.cudaIpcGetMemHandle`, :py:obj:`~.cudaIpcOpenMemHandle`, :py:obj:`~.cudaIpcCloseMemHandle`, :py:obj:`~.cuIpcGetEventHandle`
+    :py:obj:`~.cudaEventCreate`, :py:obj:`~.cudaEventDestroy`, :py:obj:`~.cudaEventSynchronize`, :py:obj:`~.cudaEventQuery`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaIpcOpenEventHandle`, :py:obj:`~.cudaIpcGetMemHandle`, :py:obj:`~.cudaIpcOpenMemHandle`, :py:obj:`~.cudaIpcCloseMemHandle`, :func:`~.cuIpcGetEventHandle`
     """
     cdef cyruntime.cudaEvent_t cyevent
     if event is None:
@@ -21925,14 +21964,14 @@ def cudaIpcOpenEventHandle(handle not None : cudaIpcEventHandle_t):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorMapBufferObjectFailed`, :py:obj:`~.cudaErrorNotSupported`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorDeviceUninitialized`
     event : :py:obj:`~.cudaEvent_t`
         Returns the imported event
 
     See Also
     --------
-    :py:obj:`~.cudaEventCreate`, :py:obj:`~.cudaEventDestroy`, :py:obj:`~.cudaEventSynchronize`, :py:obj:`~.cudaEventQuery`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaIpcGetEventHandle`, :py:obj:`~.cudaIpcGetMemHandle`, :py:obj:`~.cudaIpcOpenMemHandle`, :py:obj:`~.cudaIpcCloseMemHandle`, :py:obj:`~.cuIpcOpenEventHandle`
+    :py:obj:`~.cudaEventCreate`, :py:obj:`~.cudaEventDestroy`, :py:obj:`~.cudaEventSynchronize`, :py:obj:`~.cudaEventQuery`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaIpcGetEventHandle`, :py:obj:`~.cudaIpcGetMemHandle`, :py:obj:`~.cudaIpcOpenMemHandle`, :py:obj:`~.cudaIpcCloseMemHandle`, :func:`~.cuIpcOpenEventHandle`
     """
     cdef cudaEvent_t event = cudaEvent_t()
     with nogil:
@@ -21972,15 +22011,15 @@ def cudaIpcGetMemHandle(devPtr):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorMemoryAllocation`, :py:obj:`~.cudaErrorMapBufferObjectFailed`, :py:obj:`~.cudaErrorNotSupported`, :py:obj:`~.cudaErrorInvalidValue`
     handle : :py:obj:`~.cudaIpcMemHandle_t`
-        Pointer to user allocated :py:obj:`~.cudaIpcMemHandle` to return
-        the handle in.
+        Pointer to user allocated ``cudaIpcMemHandle`` to return the handle
+        in.
 
     See Also
     --------
-    :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaIpcGetEventHandle`, :py:obj:`~.cudaIpcOpenEventHandle`, :py:obj:`~.cudaIpcOpenMemHandle`, :py:obj:`~.cudaIpcCloseMemHandle`, :py:obj:`~.cuIpcGetMemHandle`
+    :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaIpcGetEventHandle`, :py:obj:`~.cudaIpcOpenEventHandle`, :py:obj:`~.cudaIpcOpenMemHandle`, :py:obj:`~.cudaIpcCloseMemHandle`, :func:`~.cuIpcGetMemHandle`
     """
     cdef cudaIpcMemHandle_t handle = cudaIpcMemHandle_t()
     cdef _HelperInputVoidPtrStruct cydevPtrHelper
@@ -22011,10 +22050,9 @@ def cudaIpcOpenMemHandle(handle not None : cudaIpcMemHandle_t, unsigned int flag
     :py:obj:`~.cudaIpcOpenMemHandle` can open handles to devices that may
     not be visible in the process calling the API.
 
-    Contexts that may open :py:obj:`~.cudaIpcMemHandles` are restricted in
-    the following way. :py:obj:`~.cudaIpcMemHandles` from each device in a
-    given process may only be opened by one context per device per other
-    process.
+    Contexts that may open ``cudaIpcMemHandles`` are restricted in the
+    following way. ``cudaIpcMemHandles`` from each device in a given
+    process may only be opened by one context per device per other process.
 
     If the memory handle has already been opened by the current context,
     the reference count on the handle is incremented by 1 and the existing
@@ -22037,21 +22075,21 @@ def cudaIpcOpenMemHandle(handle not None : cudaIpcMemHandle_t, unsigned int flag
     Parameters
     ----------
     handle : :py:obj:`~.cudaIpcMemHandle_t`
-        :py:obj:`~.cudaIpcMemHandle` to open
+        ``cudaIpcMemHandle`` to open
     flags : unsigned int
         Flags for this operation. Must be specified as
         :py:obj:`~.cudaIpcMemLazyEnablePeerAccess`
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorMapBufferObjectFailed`, :py:obj:`~.cudaErrorInvalidResourceHandle`, :py:obj:`~.cudaErrorDeviceUninitialized`, :py:obj:`~.cudaErrorTooManyPeers`, :py:obj:`~.cudaErrorNotSupported`, :py:obj:`~.cudaErrorInvalidValue`
     devPtr : Any
         Returned device pointer
 
     See Also
     --------
-    :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaIpcGetEventHandle`, :py:obj:`~.cudaIpcOpenEventHandle`, :py:obj:`~.cudaIpcGetMemHandle`, :py:obj:`~.cudaIpcCloseMemHandle`, :py:obj:`~.cudaDeviceEnablePeerAccess`, :py:obj:`~.cudaDeviceCanAccessPeer`, :py:obj:`~.cuIpcOpenMemHandle`
+    :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaIpcGetEventHandle`, :py:obj:`~.cudaIpcOpenEventHandle`, :py:obj:`~.cudaIpcGetMemHandle`, :py:obj:`~.cudaIpcCloseMemHandle`, :py:obj:`~.cudaDeviceEnablePeerAccess`, :py:obj:`~.cudaDeviceCanAccessPeer`, :func:`~.cuIpcOpenMemHandle`
 
     Notes
     -----
@@ -22070,7 +22108,7 @@ def cudaIpcOpenMemHandle(handle not None : cudaIpcMemHandle_t, unsigned int flag
 
 @cython.embedsignature(True)
 def cudaIpcCloseMemHandle(devPtr):
-    """ Attempts to close memory mapped with cudaIpcOpenMemHandle.
+    """ Attempts to close memory mapped with :func:`~.cudaIpcOpenMemHandle`.
 
     Decrements the reference count of the memory returnd by
     :py:obj:`~.cudaIpcOpenMemHandle` by 1. When the reference count reaches
@@ -22095,12 +22133,12 @@ def cudaIpcCloseMemHandle(devPtr):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorMapBufferObjectFailed`, :py:obj:`~.cudaErrorNotSupported`, :py:obj:`~.cudaErrorInvalidValue`
 
     See Also
     --------
-    :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaIpcGetEventHandle`, :py:obj:`~.cudaIpcOpenEventHandle`, :py:obj:`~.cudaIpcGetMemHandle`, :py:obj:`~.cudaIpcOpenMemHandle`, :py:obj:`~.cuIpcCloseMemHandle`
+    :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaIpcGetEventHandle`, :py:obj:`~.cudaIpcOpenEventHandle`, :py:obj:`~.cudaIpcGetMemHandle`, :py:obj:`~.cudaIpcOpenMemHandle`, :func:`~.cuIpcCloseMemHandle`
     """
     cdef _HelperInputVoidPtrStruct cydevPtrHelper
     cdef void* cydevPtr = _helper_input_void_ptr(devPtr, &cydevPtrHelper)
@@ -22133,18 +22171,20 @@ def cudaDeviceFlushGPUDirectRDMAWrites(target not None : cudaFlushGPUDirectRDMAW
     Parameters
     ----------
     target : :py:obj:`~.cudaFlushGPUDirectRDMAWritesTarget`
-        The target of the operation, see cudaFlushGPUDirectRDMAWritesTarget
+        The target of the operation, see
+        :py:obj:`~.cudaFlushGPUDirectRDMAWritesTarget`
     scope : :py:obj:`~.cudaFlushGPUDirectRDMAWritesScope`
-        The scope of the operation, see cudaFlushGPUDirectRDMAWritesScope
+        The scope of the operation, see
+        :py:obj:`~.cudaFlushGPUDirectRDMAWritesScope`
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorNotSupported`,
 
     See Also
     --------
-    :py:obj:`~.cuFlushGPUDirectRDMAWrites`
+    :func:`~.cuFlushGPUDirectRDMAWrites`
     """
     cdef cyruntime.cudaFlushGPUDirectRDMAWritesTarget cytarget = int(target)
     cdef cyruntime.cudaFlushGPUDirectRDMAWritesScope cyscope = int(scope)
@@ -22181,10 +22221,10 @@ def cudaDeviceRegisterAsyncNotification(int device, callbackFunc, userData):
     quickly (~10ms). Any long running tasks should be queued for execution
     on an application thread.
 
-    Callbacks may not call cudaDeviceRegisterAsyncNotification or
-    cudaDeviceUnregisterAsyncNotification. Doing so will result in
-    :py:obj:`~.cudaErrorNotPermitted`. Async notification callbacks execute
-    in an undefined order and may be serialized.
+    Callbacks may not call :func:`~.cudaDeviceRegisterAsyncNotification` or
+    :func:`~.cudaDeviceUnregisterAsyncNotification`. Doing so will result
+    in :py:obj:`~.cudaErrorNotPermitted`. Async notification callbacks
+    execute in an undefined order and may be serialized.
 
     Returns in ``*callback`` a handle representing the registered callback
     instance.
@@ -22201,7 +22241,7 @@ def cudaDeviceRegisterAsyncNotification(int device, callbackFunc, userData):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess` :py:obj:`~.cudaErrorNotSupported` :py:obj:`~.cudaErrorInvalidDevice` :py:obj:`~.cudaErrorInvalidValue` :py:obj:`~.cudaErrorNotPermitted` :py:obj:`~.cudaErrorUnknown`
     callback : :py:obj:`~.cudaAsyncCallbackHandle_t`
         A handle representing the registered callback instance
@@ -22261,7 +22301,7 @@ def cudaDeviceUnregisterAsyncNotification(int device, callback):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess` :py:obj:`~.cudaErrorNotSupported` :py:obj:`~.cudaErrorInvalidDevice` :py:obj:`~.cudaErrorInvalidValue` :py:obj:`~.cudaErrorNotPermitted` :py:obj:`~.cudaErrorUnknown`
 
     See Also
@@ -22311,14 +22351,14 @@ def cudaDeviceGetSharedMemConfig():
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
     pConfig : :py:obj:`~.cudaSharedMemConfig`
         Returned cache configuration
 
     See Also
     --------
-    :py:obj:`~.cudaDeviceSetCacheConfig`, :py:obj:`~.cudaDeviceGetCacheConfig`, :py:obj:`~.cudaDeviceSetSharedMemConfig`, :py:obj:`~.cudaFuncSetCacheConfig`, :py:obj:`~.cuCtxGetSharedMemConfig`
+    :py:obj:`~.cudaDeviceSetCacheConfig`, :py:obj:`~.cudaDeviceGetCacheConfig`, :py:obj:`~.cudaDeviceSetSharedMemConfig`, :py:obj:`~.cudaFuncSetCacheConfig`, :func:`~.cuCtxGetSharedMemConfig`
     """
     cdef cyruntime.cudaSharedMemConfig pConfig
     with nogil:
@@ -22372,12 +22412,12 @@ def cudaDeviceSetSharedMemConfig(config not None : cudaSharedMemConfig):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
 
     See Also
     --------
-    :py:obj:`~.cudaDeviceSetCacheConfig`, :py:obj:`~.cudaDeviceGetCacheConfig`, :py:obj:`~.cudaDeviceGetSharedMemConfig`, :py:obj:`~.cudaFuncSetCacheConfig`, :py:obj:`~.cuCtxSetSharedMemConfig`
+    :py:obj:`~.cudaDeviceSetCacheConfig`, :py:obj:`~.cudaDeviceGetCacheConfig`, :py:obj:`~.cudaDeviceGetSharedMemConfig`, :py:obj:`~.cudaFuncSetCacheConfig`, :func:`~.cuCtxSetSharedMemConfig`
     """
     cdef cyruntime.cudaSharedMemConfig cyconfig = int(config)
     with nogil:
@@ -22401,7 +22441,7 @@ def cudaGetLastError():
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorMissingConfiguration`, :py:obj:`~.cudaErrorMemoryAllocation`, :py:obj:`~.cudaErrorInitializationError`, :py:obj:`~.cudaErrorLaunchFailure`, :py:obj:`~.cudaErrorLaunchTimeout`, :py:obj:`~.cudaErrorLaunchOutOfResources`, :py:obj:`~.cudaErrorInvalidDeviceFunction`, :py:obj:`~.cudaErrorInvalidConfiguration`, :py:obj:`~.cudaErrorInvalidDevice`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidPitchValue`, :py:obj:`~.cudaErrorInvalidSymbol`, :py:obj:`~.cudaErrorUnmapBufferObjectFailed`, :py:obj:`~.cudaErrorInvalidDevicePointer`, :py:obj:`~.cudaErrorInvalidTexture`, :py:obj:`~.cudaErrorInvalidTextureBinding`, :py:obj:`~.cudaErrorInvalidChannelDescriptor`, :py:obj:`~.cudaErrorInvalidMemcpyDirection`, :py:obj:`~.cudaErrorInvalidFilterSetting`, :py:obj:`~.cudaErrorInvalidNormSetting`, :py:obj:`~.cudaErrorUnknown`, :py:obj:`~.cudaErrorInvalidResourceHandle`, :py:obj:`~.cudaErrorInsufficientDriver`, :py:obj:`~.cudaErrorNoDevice`, :py:obj:`~.cudaErrorSetOnActiveProcess`, :py:obj:`~.cudaErrorStartupFailure`, :py:obj:`~.cudaErrorInvalidPtx`, :py:obj:`~.cudaErrorUnsupportedPtxVersion`, :py:obj:`~.cudaErrorNoKernelImageForDevice`, :py:obj:`~.cudaErrorJitCompilerNotFound`, :py:obj:`~.cudaErrorJitCompilationDisabled`
 
     See Also
@@ -22430,7 +22470,7 @@ def cudaPeekAtLastError():
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorMissingConfiguration`, :py:obj:`~.cudaErrorMemoryAllocation`, :py:obj:`~.cudaErrorInitializationError`, :py:obj:`~.cudaErrorLaunchFailure`, :py:obj:`~.cudaErrorLaunchTimeout`, :py:obj:`~.cudaErrorLaunchOutOfResources`, :py:obj:`~.cudaErrorInvalidDeviceFunction`, :py:obj:`~.cudaErrorInvalidConfiguration`, :py:obj:`~.cudaErrorInvalidDevice`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidPitchValue`, :py:obj:`~.cudaErrorInvalidSymbol`, :py:obj:`~.cudaErrorUnmapBufferObjectFailed`, :py:obj:`~.cudaErrorInvalidDevicePointer`, :py:obj:`~.cudaErrorInvalidTexture`, :py:obj:`~.cudaErrorInvalidTextureBinding`, :py:obj:`~.cudaErrorInvalidChannelDescriptor`, :py:obj:`~.cudaErrorInvalidMemcpyDirection`, :py:obj:`~.cudaErrorInvalidFilterSetting`, :py:obj:`~.cudaErrorInvalidNormSetting`, :py:obj:`~.cudaErrorUnknown`, :py:obj:`~.cudaErrorInvalidResourceHandle`, :py:obj:`~.cudaErrorInsufficientDriver`, :py:obj:`~.cudaErrorNoDevice`, :py:obj:`~.cudaErrorSetOnActiveProcess`, :py:obj:`~.cudaErrorStartupFailure`, :py:obj:`~.cudaErrorInvalidPtx`, :py:obj:`~.cudaErrorUnsupportedPtxVersion`, :py:obj:`~.cudaErrorNoKernelImageForDevice`, :py:obj:`~.cudaErrorJitCompilerNotFound`, :py:obj:`~.cudaErrorJitCompilationDisabled`
 
     See Also
@@ -22466,7 +22506,7 @@ def cudaGetErrorName(error not None : cudaError_t):
 
     See Also
     --------
-    :py:obj:`~.cudaGetErrorString`, :py:obj:`~.cudaGetLastError`, :py:obj:`~.cudaPeekAtLastError`, :py:obj:`~.cudaError`, :py:obj:`~.cuGetErrorName`
+    :py:obj:`~.cudaGetErrorString`, :py:obj:`~.cudaGetLastError`, :py:obj:`~.cudaPeekAtLastError`, :py:obj:`~.cudaError`, :func:`~.cuGetErrorName`
     """
     cdef cyruntime.cudaError_t cyerror = int(error)
     with nogil:
@@ -22497,7 +22537,7 @@ def cudaGetErrorString(error not None : cudaError_t):
 
     See Also
     --------
-    :py:obj:`~.cudaGetErrorName`, :py:obj:`~.cudaGetLastError`, :py:obj:`~.cudaPeekAtLastError`, :py:obj:`~.cudaError`, :py:obj:`~.cuGetErrorString`
+    :py:obj:`~.cudaGetErrorName`, :py:obj:`~.cudaGetLastError`, :py:obj:`~.cudaPeekAtLastError`, :py:obj:`~.cudaError`, :func:`~.cuGetErrorString`
     """
     cdef cyruntime.cudaError_t cyerror = int(error)
     with nogil:
@@ -22516,7 +22556,7 @@ def cudaGetDeviceCount():
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`
     count : int
         Returns the number of devices with compute capability greater or
@@ -22524,7 +22564,7 @@ def cudaGetDeviceCount():
 
     See Also
     --------
-    :py:obj:`~.cudaGetDevice`, :py:obj:`~.cudaSetDevice`, :py:obj:`~.cudaGetDeviceProperties`, :py:obj:`~.cudaChooseDevice`, :py:obj:`~.cudaInitDevice`, :py:obj:`~.cuDeviceGetCount`
+    :py:obj:`~.cudaGetDevice`, :py:obj:`~.cudaSetDevice`, :py:obj:`~.cudaGetDeviceProperties`, :py:obj:`~.cudaChooseDevice`, :py:obj:`~.cudaInitDevice`, :func:`~.cuDeviceGetCount`
     """
     cdef int count = 0
     with nogil:
@@ -22549,14 +22589,14 @@ def cudaGetDeviceProperties(int device):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidDevice`
     prop : :py:obj:`~.cudaDeviceProp`
         Properties for the specified device
 
     See Also
     --------
-    :py:obj:`~.cudaGetDeviceCount`, :py:obj:`~.cudaGetDevice`, :py:obj:`~.cudaSetDevice`, :py:obj:`~.cudaChooseDevice`, :py:obj:`~.cudaDeviceGetAttribute`, :py:obj:`~.cudaInitDevice`, :py:obj:`~.cuDeviceGetAttribute`, :py:obj:`~.cuDeviceGetName`
+    :py:obj:`~.cudaGetDeviceCount`, :py:obj:`~.cudaGetDevice`, :py:obj:`~.cudaSetDevice`, :py:obj:`~.cudaChooseDevice`, :py:obj:`~.cudaDeviceGetAttribute`, :py:obj:`~.cudaInitDevice`, :func:`~.cuDeviceGetAttribute`, :func:`~.cuDeviceGetName`
     """
     cdef cudaDeviceProp prop = cudaDeviceProp()
     with nogil:
@@ -22584,14 +22624,14 @@ def cudaDeviceGetAttribute(attr not None : cudaDeviceAttr, int device):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidDevice`, :py:obj:`~.cudaErrorInvalidValue`
     value : int
         Returned device attribute value
 
     See Also
     --------
-    :py:obj:`~.cudaGetDeviceCount`, :py:obj:`~.cudaGetDevice`, :py:obj:`~.cudaSetDevice`, :py:obj:`~.cudaChooseDevice`, :py:obj:`~.cudaGetDeviceProperties`, :py:obj:`~.cudaInitDevice`, :py:obj:`~.cuDeviceGetAttribute`
+    :py:obj:`~.cudaGetDeviceCount`, :py:obj:`~.cudaGetDevice`, :py:obj:`~.cudaSetDevice`, :py:obj:`~.cudaChooseDevice`, :py:obj:`~.cudaGetDeviceProperties`, :py:obj:`~.cudaInitDevice`, :func:`~.cuDeviceGetAttribute`
     """
     cdef int value = 0
     cdef cyruntime.cudaDeviceAttr cyattr = int(attr)
@@ -22635,14 +22675,14 @@ def cudaDeviceGetHostAtomicCapabilities(operations : Optional[tuple[cudaAtomicOp
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidDevice`, :py:obj:`~.cudaErrorInvalidValue`
     capabilities : list[unsigned int]
         Returned capability details of each requested operation
 
     See Also
     --------
-    :py:obj:`~.cudaDeviceGetAttribute`, :py:obj:`~.cudaDeviceGetP2PAtomicCapabilities`, :py:obj:`~.cuDeviceGeHostAtomicCapabilities`
+    :py:obj:`~.cudaDeviceGetAttribute`, :py:obj:`~.cudaDeviceGetP2PAtomicCapabilities`, :func:`~.cuDeviceGetHostAtomicCapabilities`
     """
     operations = [] if operations is None else operations
     if not all(isinstance(_x, (cudaAtomicOperation)) for _x in operations):
@@ -22682,14 +22722,14 @@ def cudaDeviceGetDefaultMemPool(int device):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidDevice`, :py:obj:`~.cudaErrorInvalidValue` :py:obj:`~.cudaErrorNotSupported`
     memPool : :py:obj:`~.cudaMemPool_t`
         None
 
     See Also
     --------
-    :py:obj:`~.cuDeviceGetDefaultMemPool`, :py:obj:`~.cudaMallocAsync`, :py:obj:`~.cudaMemPoolTrimTo`, :py:obj:`~.cudaMemPoolGetAttribute`, :py:obj:`~.cudaDeviceSetMemPool`, :py:obj:`~.cudaMemPoolSetAttribute`, :py:obj:`~.cudaMemPoolSetAccess`
+    :func:`~.cuDeviceGetDefaultMemPool`, :py:obj:`~.cudaMallocAsync`, :py:obj:`~.cudaMemPoolTrimTo`, :py:obj:`~.cudaMemPoolGetAttribute`, :py:obj:`~.cudaDeviceSetMemPool`, :py:obj:`~.cudaMemPoolSetAttribute`, :py:obj:`~.cudaMemPoolSetAccess`
     """
     cdef cudaMemPool_t memPool = cudaMemPool_t()
     with nogil:
@@ -22720,12 +22760,12 @@ def cudaDeviceSetMemPool(int device, memPool):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` :py:obj:`~.cudaErrorInvalidDevice` :py:obj:`~.cudaErrorNotSupported`
 
     See Also
     --------
-    :py:obj:`~.cuDeviceSetMemPool`, :py:obj:`~.cudaDeviceGetMemPool`, :py:obj:`~.cudaDeviceGetDefaultMemPool`, :py:obj:`~.cudaMemPoolCreate`, :py:obj:`~.cudaMemPoolDestroy`, :py:obj:`~.cudaMallocFromPoolAsync`
+    :func:`~.cuDeviceSetMemPool`, :py:obj:`~.cudaDeviceGetMemPool`, :py:obj:`~.cudaDeviceGetDefaultMemPool`, :py:obj:`~.cudaMemPoolCreate`, :py:obj:`~.cudaMemPoolDestroy`, :py:obj:`~.cudaMallocFromPoolAsync`
 
     Notes
     -----
@@ -22754,7 +22794,7 @@ def cudaDeviceGetMemPool(int device):
     this device or the device's default memory pool if
     :py:obj:`~.cudaDeviceSetMemPool` has never been called. By default the
     current mempool is the default mempool for a device, otherwise the
-    returned pool must have been set with :py:obj:`~.cuDeviceSetMemPool` or
+    returned pool must have been set with :func:`~.cuDeviceSetMemPool` or
     :py:obj:`~.cudaDeviceSetMemPool`.
 
     Parameters
@@ -22764,14 +22804,14 @@ def cudaDeviceGetMemPool(int device):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` :py:obj:`~.cudaErrorNotSupported`
     memPool : :py:obj:`~.cudaMemPool_t`
         None
 
     See Also
     --------
-    :py:obj:`~.cuDeviceGetMemPool`, :py:obj:`~.cudaDeviceGetDefaultMemPool`, :py:obj:`~.cudaDeviceSetMemPool`
+    :func:`~.cuDeviceGetMemPool`, :py:obj:`~.cudaDeviceGetDefaultMemPool`, :py:obj:`~.cudaDeviceSetMemPool`
     """
     cdef cudaMemPool_t memPool = cudaMemPool_t()
     with nogil:
@@ -22797,7 +22837,7 @@ def cudaDeviceGetNvSciSyncAttributes(nvSciSyncAttrList, int device, int flags):
 
     The applications should set ``nvSciSyncAttrList`` to a valid
     NvSciSyncAttrList failing which this API will return
-    :py:obj:`~.cudaErrorInvalidHandle`.
+    ``cudaErrorInvalidHandle``.
 
     The ``flags`` controls how applications intends to use the NvSciSync
     created from the ``nvSciSyncAttrList``. The valid flags are:
@@ -22836,11 +22876,11 @@ def cudaDeviceGetNvSciSyncAttributes(nvSciSyncAttrList, int device, int flags):
 
     - NvSciSyncAttrValPrimitiveType_SysmemSemaphorePayload64b if ``device``
       is GA10X+. NvSciSyncAttrKey_GpuId is set to the same UUID that is
-      returned in ``cudaDeviceProp.uuid`` from
-      :py:obj:`~.cudaDeviceGetProperties` for this ``device``.
+      returned in :py:obj:`~.cudaDeviceProp.uuid` from
+      ``cudaDeviceGetProperties`` for this ``device``.
 
     :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorDeviceUninitialized`,
-    :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidHandle`,
+    :py:obj:`~.cudaErrorInvalidValue`, ``cudaErrorInvalidHandle``,
     :py:obj:`~.cudaErrorInvalidDevice`, :py:obj:`~.cudaErrorNotSupported`,
     :py:obj:`~.cudaErrorMemoryAllocation`
 
@@ -22855,7 +22895,7 @@ def cudaDeviceGetNvSciSyncAttributes(nvSciSyncAttrList, int device, int flags):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
 
 
     See Also
@@ -22917,14 +22957,14 @@ def cudaDeviceGetP2PAttribute(attr not None : cudaDeviceP2PAttr, int srcDevice,
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidDevice`, :py:obj:`~.cudaErrorInvalidValue`
     value : int
         Returned value of the requested attribute
 
     See Also
     --------
-    :py:obj:`~.cudaDeviceEnablePeerAccess`, :py:obj:`~.cudaDeviceDisablePeerAccess`, :py:obj:`~.cudaDeviceCanAccessPeer`, :py:obj:`~.cuDeviceGetP2PAttribute` :py:obj:`~.cudaDeviceGetP2PAtomicCapabilities`
+    :py:obj:`~.cudaDeviceEnablePeerAccess`, :py:obj:`~.cudaDeviceDisablePeerAccess`, :py:obj:`~.cudaDeviceCanAccessPeer`, :func:`~.cuDeviceGetP2PAttribute`, :py:obj:`~.cudaDeviceGetP2PAtomicCapabilities`
     """
     cdef int value = 0
     cdef cyruntime.cudaDeviceP2PAttr cyattr = int(attr)
@@ -22971,14 +23011,14 @@ def cudaDeviceGetP2PAtomicCapabilities(operations : Optional[tuple[cudaAtomicOpe
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidDevice`, :py:obj:`~.cudaErrorInvalidValue`
     capabilities : list[unsigned int]
         Returned capability details of each requested operation
 
     See Also
     --------
-    :py:obj:`~.cudaDeviceGetP2PAttribute`, :py:obj:`~.cuDeviceGetP2PAttribute`, :py:obj:`~.cuDeviceGetP2PAtomicCapabilities`
+    :py:obj:`~.cudaDeviceGetP2PAttribute`, :func:`~.cuDeviceGetP2PAttribute`, :func:`~.cuDeviceGetP2PAtomicCapabilities`
     """
     operations = [] if operations is None else operations
     if not all(isinstance(_x, (cudaAtomicOperation)) for _x in operations):
@@ -23018,7 +23058,7 @@ def cudaChooseDevice(prop : Optional[cudaDeviceProp]):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
     device : int
         Device with best match
@@ -23067,12 +23107,12 @@ def cudaInitDevice(int device, unsigned int deviceFlags, unsigned int flags):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidDevice`,
 
     See Also
     --------
-    :py:obj:`~.cudaGetDeviceCount`, :py:obj:`~.cudaGetDevice`, :py:obj:`~.cudaGetDeviceProperties`, :py:obj:`~.cudaChooseDevice`, :py:obj:`~.cudaSetDevice` :py:obj:`~.cuCtxSetCurrent`
+    :py:obj:`~.cudaGetDeviceCount`, :py:obj:`~.cudaGetDevice`, :py:obj:`~.cudaGetDeviceProperties`, :py:obj:`~.cudaChooseDevice`, :py:obj:`~.cudaSetDevice`, :func:`~.cuCtxSetCurrent`
     """
     with nogil:
         err = cyruntime.cudaInitDevice(device, deviceFlags, flags)
@@ -23123,12 +23163,12 @@ def cudaSetDevice(int device):
 
     Returns
     -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidDevice`, :py:obj:`~.cudaErrorDeviceUnavailable`,
+    :py:obj:`~.cudaError_t`
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidDevice`, ``cudaErrorDeviceUnavailable``,
 
     See Also
     --------
-    :py:obj:`~.cudaGetDeviceCount`, :py:obj:`~.cudaGetDevice`, :py:obj:`~.cudaGetDeviceProperties`, :py:obj:`~.cudaChooseDevice`, :py:obj:`~.cudaInitDevice`, :py:obj:`~.cuCtxSetCurrent`
+    :py:obj:`~.cudaGetDeviceCount`, :py:obj:`~.cudaGetDevice`, :py:obj:`~.cudaGetDeviceProperties`, :py:obj:`~.cudaChooseDevice`, :py:obj:`~.cudaInitDevice`, :func:`~.cuCtxSetCurrent`
     """
     with nogil:
         err = cyruntime.cudaSetDevice(device)
@@ -23145,15 +23185,15 @@ def cudaGetDevice():
 
     Returns
     -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorDeviceUnavailable`,
+    :py:obj:`~.cudaError_t`
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, ``cudaErrorDeviceUnavailable``,
     device : int
         Returns the device on which the active host thread executes the
         device code.
 
     See Also
     --------
-    :py:obj:`~.cudaGetDeviceCount`, :py:obj:`~.cudaSetDevice`, :py:obj:`~.cudaGetDeviceProperties`, :py:obj:`~.cudaChooseDevice`, :py:obj:`~.cuCtxGetCurrent`
+    :py:obj:`~.cudaGetDeviceCount`, :py:obj:`~.cudaSetDevice`, :py:obj:`~.cudaGetDeviceProperties`, :py:obj:`~.cudaChooseDevice`, :func:`~.cuCtxGetCurrent`
     """
     cdef int device = 0
     with nogil:
@@ -23237,12 +23277,12 @@ def cudaSetDeviceFlags(unsigned int flags):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
 
     See Also
     --------
-    :py:obj:`~.cudaGetDeviceFlags`, :py:obj:`~.cudaGetDeviceCount`, :py:obj:`~.cudaGetDevice`, :py:obj:`~.cudaGetDeviceProperties`, :py:obj:`~.cudaSetDevice`, :py:obj:`~.cudaSetValidDevices`, :py:obj:`~.cudaInitDevice`, :py:obj:`~.cudaChooseDevice`, :py:obj:`~.cuDevicePrimaryCtxSetFlags`
+    :py:obj:`~.cudaGetDeviceFlags`, :py:obj:`~.cudaGetDeviceCount`, :py:obj:`~.cudaGetDevice`, :py:obj:`~.cudaGetDeviceProperties`, :py:obj:`~.cudaSetDevice`, :py:obj:`~.cudaSetValidDevices`, :py:obj:`~.cudaInitDevice`, :py:obj:`~.cudaChooseDevice`, :func:`~.cuDevicePrimaryCtxSetFlags`
     """
     with nogil:
         err = cyruntime.cudaSetDeviceFlags(flags)
@@ -23282,14 +23322,14 @@ def cudaGetDeviceFlags():
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidDevice`
     flags : unsigned int
         Pointer to store the device flags
 
     See Also
     --------
-    :py:obj:`~.cudaGetDevice`, :py:obj:`~.cudaGetDeviceProperties`, :py:obj:`~.cudaSetDevice`, :py:obj:`~.cudaSetDeviceFlags`, :py:obj:`~.cudaInitDevice`, :py:obj:`~.cuCtxGetFlags`, :py:obj:`~.cuDevicePrimaryCtxGetState`
+    :py:obj:`~.cudaGetDevice`, :py:obj:`~.cudaGetDeviceProperties`, :py:obj:`~.cudaSetDevice`, :py:obj:`~.cudaSetDeviceFlags`, :py:obj:`~.cudaInitDevice`, :func:`~.cuCtxGetFlags`, :func:`~.cuDevicePrimaryCtxGetState`
     """
     cdef unsigned int flags = 0
     with nogil:
@@ -23312,14 +23352,14 @@ def cudaStreamCreate():
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` :py:obj:`~.cudaErrorExternalDevice`
     pStream : :py:obj:`~.cudaStream_t`
         Pointer to new stream identifier
 
     See Also
     --------
-    :py:obj:`~.cudaStreamCreateWithPriority`, :py:obj:`~.cudaStreamCreateWithFlags`, :py:obj:`~.cudaStreamGetPriority`, :py:obj:`~.cudaStreamGetFlags`, :py:obj:`~.cudaStreamGetDevice`, :py:obj:`~.cudaStreamGetDevResource`, :py:obj:`~.cudaStreamQuery`, :py:obj:`~.cudaStreamSynchronize`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaStreamAddCallback`, :py:obj:`~.cudaSetDevice`, :py:obj:`~.cudaStreamDestroy`, :py:obj:`~.cuStreamCreate`
+    :py:obj:`~.cudaStreamCreateWithPriority`, :py:obj:`~.cudaStreamCreateWithFlags`, :py:obj:`~.cudaStreamGetPriority`, :py:obj:`~.cudaStreamGetFlags`, :py:obj:`~.cudaStreamGetDevice`, :py:obj:`~.cudaStreamGetDevResource`, :py:obj:`~.cudaStreamQuery`, :py:obj:`~.cudaStreamSynchronize`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaStreamAddCallback`, :py:obj:`~.cudaSetDevice`, :py:obj:`~.cudaStreamDestroy`, :func:`~.cuStreamCreate`
     """
     cdef cudaStream_t pStream = cudaStream_t()
     with nogil:
@@ -23356,14 +23396,14 @@ def cudaStreamCreateWithFlags(unsigned int flags):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` :py:obj:`~.cudaErrorExternalDevice`
     pStream : :py:obj:`~.cudaStream_t`
         Pointer to new stream identifier
 
     See Also
     --------
-    :py:obj:`~.cudaStreamCreate`, :py:obj:`~.cudaStreamCreateWithPriority`, :py:obj:`~.cudaStreamGetFlags`, :py:obj:`~.cudaStreamGetDevice`, :py:obj:`~.cudaStreamGetDevResource`, :py:obj:`~.cudaStreamQuery`, :py:obj:`~.cudaStreamSynchronize`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaStreamAddCallback`, :py:obj:`~.cudaSetDevice`, :py:obj:`~.cudaStreamDestroy`, :py:obj:`~.cuStreamCreate`
+    :py:obj:`~.cudaStreamCreate`, :py:obj:`~.cudaStreamCreateWithPriority`, :py:obj:`~.cudaStreamGetFlags`, :py:obj:`~.cudaStreamGetDevice`, :py:obj:`~.cudaStreamGetDevResource`, :py:obj:`~.cudaStreamQuery`, :py:obj:`~.cudaStreamSynchronize`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaStreamAddCallback`, :py:obj:`~.cudaSetDevice`, :py:obj:`~.cudaStreamDestroy`, :func:`~.cuStreamCreate`
     """
     cdef cudaStream_t pStream = cudaStream_t()
     with nogil:
@@ -23411,14 +23451,14 @@ def cudaStreamCreateWithPriority(unsigned int flags, int priority):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` :py:obj:`~.cudaErrorExternalDevice`
     pStream : :py:obj:`~.cudaStream_t`
         Pointer to new stream identifier
 
     See Also
     --------
-    :py:obj:`~.cudaStreamCreate`, :py:obj:`~.cudaStreamCreateWithFlags`, :py:obj:`~.cudaDeviceGetStreamPriorityRange`, :py:obj:`~.cudaStreamGetPriority`, :py:obj:`~.cudaStreamQuery`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaStreamAddCallback`, :py:obj:`~.cudaStreamSynchronize`, :py:obj:`~.cudaSetDevice`, :py:obj:`~.cudaStreamDestroy`, :py:obj:`~.cuStreamCreateWithPriority`
+    :py:obj:`~.cudaStreamCreate`, :py:obj:`~.cudaStreamCreateWithFlags`, :py:obj:`~.cudaDeviceGetStreamPriorityRange`, :py:obj:`~.cudaStreamGetPriority`, :py:obj:`~.cudaStreamQuery`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaStreamAddCallback`, :py:obj:`~.cudaStreamSynchronize`, :py:obj:`~.cudaSetDevice`, :py:obj:`~.cudaStreamDestroy`, :func:`~.cuStreamCreateWithPriority`
 
     Notes
     -----
@@ -23454,7 +23494,7 @@ def cudaStreamGetPriority(hStream):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidResourceHandle`
     priority : int
         Pointer to a signed integer in which the stream's priority is
@@ -23462,7 +23502,7 @@ def cudaStreamGetPriority(hStream):
 
     See Also
     --------
-    :py:obj:`~.cudaStreamCreateWithPriority`, :py:obj:`~.cudaDeviceGetStreamPriorityRange`, :py:obj:`~.cudaStreamGetFlags`, :py:obj:`~.cudaStreamGetDevice`, :py:obj:`~.cudaStreamGetDevResource`, :py:obj:`~.cuStreamGetPriority`
+    :py:obj:`~.cudaStreamCreateWithPriority`, :py:obj:`~.cudaDeviceGetStreamPriorityRange`, :py:obj:`~.cudaStreamGetFlags`, :py:obj:`~.cudaStreamGetDevice`, :py:obj:`~.cudaStreamGetDevResource`, :func:`~.cuStreamGetPriority`
     """
     cdef cyruntime.cudaStream_t cyhStream
     if hStream is None:
@@ -23496,7 +23536,7 @@ def cudaStreamGetFlags(hStream):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidResourceHandle`
     flags : unsigned int
         Pointer to an unsigned integer in which the stream's flags are
@@ -23504,7 +23544,7 @@ def cudaStreamGetFlags(hStream):
 
     See Also
     --------
-    :py:obj:`~.cudaStreamCreateWithPriority`, :py:obj:`~.cudaStreamCreateWithFlags`, :py:obj:`~.cudaStreamGetPriority`, :py:obj:`~.cudaStreamGetDevice`, :py:obj:`~.cuStreamGetFlags`
+    :py:obj:`~.cudaStreamCreateWithPriority`, :py:obj:`~.cudaStreamCreateWithFlags`, :py:obj:`~.cudaStreamGetPriority`, :py:obj:`~.cudaStreamGetDevice`, :func:`~.cuStreamGetFlags`
     """
     cdef cyruntime.cudaStream_t cyhStream
     if hStream is None:
@@ -23536,9 +23576,9 @@ def cudaStreamGetId(hStream):
     - a stream created via any of the CUDA runtime APIs such as
       :py:obj:`~.cudaStreamCreate`, :py:obj:`~.cudaStreamCreateWithFlags`
       and :py:obj:`~.cudaStreamCreateWithPriority`, or their driver API
-      equivalents such as :py:obj:`~.cuStreamCreate` or
-      :py:obj:`~.cuStreamCreateWithPriority`. Passing an invalid handle
-      will result in undefined behavior.
+      equivalents such as :func:`~.cuStreamCreate` or
+      :func:`~.cuStreamCreateWithPriority`. Passing an invalid handle will
+      result in undefined behavior.
 
     - any of the special streams such as the NULL stream,
       :py:obj:`~.cudaStreamLegacy` and :py:obj:`~.cudaStreamPerThread`
@@ -23553,14 +23593,14 @@ def cudaStreamGetId(hStream):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidResourceHandle`
     streamId : unsigned long long
         Pointer to an unsigned long long in which the stream Id is returned
 
     See Also
     --------
-    :py:obj:`~.cudaStreamCreateWithPriority`, :py:obj:`~.cudaStreamCreateWithFlags`, :py:obj:`~.cudaStreamGetPriority`, :py:obj:`~.cudaStreamGetFlags`, :py:obj:`~.cuStreamGetId`
+    :py:obj:`~.cudaStreamCreateWithPriority`, :py:obj:`~.cudaStreamCreateWithFlags`, :py:obj:`~.cudaStreamGetPriority`, :py:obj:`~.cudaStreamGetFlags`, :func:`~.cuStreamGetId`
     """
     cdef cyruntime.cudaStream_t cyhStream
     if hStream is None:
@@ -23593,14 +23633,14 @@ def cudaStreamGetDevice(hStream):
 
     Returns
     -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorDeviceUnavailable`,
+    :py:obj:`~.cudaError_t`
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, ``cudaErrorDeviceUnavailable``,
     device : int
         Returns the device to which the stream belongs
 
     See Also
     --------
-    :py:obj:`~.cudaSetDevice`, :py:obj:`~.cudaGetDevice`, :py:obj:`~.cudaStreamCreate`, :py:obj:`~.cudaStreamGetPriority`, :py:obj:`~.cudaStreamGetFlags`, :py:obj:`~.cuStreamGetId`
+    :py:obj:`~.cudaSetDevice`, :py:obj:`~.cudaGetDevice`, :py:obj:`~.cudaStreamCreate`, :py:obj:`~.cudaStreamGetPriority`, :py:obj:`~.cudaStreamGetFlags`, :func:`~.cuStreamGetId`
     """
     cdef cyruntime.cudaStream_t cyhStream
     if hStream is None:
@@ -23629,7 +23669,7 @@ def cudaCtxResetPersistingL2Cache():
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`,
 
     See Also
@@ -23659,7 +23699,7 @@ def cudaStreamCopyAttributes(dst, src):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorNotSupported`
 
     See Also
@@ -23705,7 +23745,7 @@ def cudaStreamGetAttribute(hStream, attr not None : cudaStreamAttrID):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidResourceHandle`
     value_out : :py:obj:`~.cudaStreamAttrValue`
 
@@ -23752,7 +23792,7 @@ def cudaStreamSetAttribute(hStream, attr not None : cudaStreamAttrID, value : Op
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidResourceHandle`
 
     See Also
@@ -23795,12 +23835,12 @@ def cudaStreamDestroy(stream):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidResourceHandle` :py:obj:`~.cudaErrorExternalDevice`
 
     See Also
     --------
-    :py:obj:`~.cudaStreamCreate`, :py:obj:`~.cudaStreamCreateWithFlags`, :py:obj:`~.cudaStreamQuery`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaStreamSynchronize`, :py:obj:`~.cudaStreamAddCallback`, :py:obj:`~.cuStreamDestroy`
+    :py:obj:`~.cudaStreamCreate`, :py:obj:`~.cudaStreamCreateWithFlags`, :py:obj:`~.cudaStreamQuery`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaStreamSynchronize`, :py:obj:`~.cudaStreamAddCallback`, :func:`~.cuStreamDestroy`
     """
     cdef cyruntime.cudaStream_t cystream
     if stream is None:
@@ -23845,12 +23885,12 @@ def cudaStreamWaitEvent(stream, event, unsigned int flags):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidResourceHandle`
 
     See Also
     --------
-    :py:obj:`~.cudaStreamCreate`, :py:obj:`~.cudaStreamCreateWithFlags`, :py:obj:`~.cudaStreamQuery`, :py:obj:`~.cudaStreamSynchronize`, :py:obj:`~.cudaStreamAddCallback`, :py:obj:`~.cudaStreamDestroy`, :py:obj:`~.cuStreamWaitEvent`
+    :py:obj:`~.cudaStreamCreate`, :py:obj:`~.cudaStreamCreateWithFlags`, :py:obj:`~.cudaStreamQuery`, :py:obj:`~.cudaStreamSynchronize`, :py:obj:`~.cudaStreamAddCallback`, :py:obj:`~.cudaStreamDestroy`, :func:`~.cuStreamWaitEvent`
     """
     cdef cyruntime.cudaEvent_t cyevent
     if event is None:
@@ -23893,9 +23933,10 @@ def cudaStreamAddCallback(stream, callback, userData, unsigned int flags):
     """ Add a callback to a compute stream.
 
     Adds a callback to be called on the host after all currently enqueued
-    items in the stream have completed. For each cudaStreamAddCallback
-    call, a callback will be executed exactly once. The callback will block
-    later work in the stream until it is finished.
+    items in the stream have completed. For each
+    :func:`~.cudaStreamAddCallback` call, a callback will be executed
+    exactly once. The callback will block later work in the stream until it
+    is finished.
 
     The callback may be passed :py:obj:`~.cudaSuccess` or an error code. In
     the event of a device error, all subsequently executed callbacks will
@@ -23946,12 +23987,12 @@ def cudaStreamAddCallback(stream, callback, userData, unsigned int flags):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidResourceHandle`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorNotSupported`
 
     See Also
     --------
-    :py:obj:`~.cudaStreamCreate`, :py:obj:`~.cudaStreamCreateWithFlags`, :py:obj:`~.cudaStreamQuery`, :py:obj:`~.cudaStreamSynchronize`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaStreamDestroy`, :py:obj:`~.cudaMallocManaged`, :py:obj:`~.cudaStreamAttachMemAsync`, :py:obj:`~.cudaLaunchHostFunc`, :py:obj:`~.cuStreamAddCallback`
+    :py:obj:`~.cudaStreamCreate`, :py:obj:`~.cudaStreamCreateWithFlags`, :py:obj:`~.cudaStreamQuery`, :py:obj:`~.cudaStreamSynchronize`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaStreamDestroy`, :py:obj:`~.cudaMallocManaged`, :py:obj:`~.cudaStreamAttachMemAsync`, :py:obj:`~.cudaLaunchHostFunc`, :func:`~.cuStreamAddCallback`
 
     Notes
     -----
@@ -24010,12 +24051,12 @@ def cudaStreamSynchronize(stream):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidResourceHandle`
 
     See Also
     --------
-    :py:obj:`~.cudaStreamCreate`, :py:obj:`~.cudaStreamCreateWithFlags`, :py:obj:`~.cudaStreamQuery`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaStreamAddCallback`, :py:obj:`~.cudaStreamDestroy`, :py:obj:`~.cuStreamSynchronize`
+    :py:obj:`~.cudaStreamCreate`, :py:obj:`~.cudaStreamCreateWithFlags`, :py:obj:`~.cudaStreamQuery`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaStreamAddCallback`, :py:obj:`~.cudaStreamDestroy`, :func:`~.cuStreamSynchronize`
     """
     cdef cyruntime.cudaStream_t cystream
     if stream is None:
@@ -24050,12 +24091,12 @@ def cudaStreamQuery(stream):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorNotReady`, :py:obj:`~.cudaErrorInvalidResourceHandle`
 
     See Also
     --------
-    :py:obj:`~.cudaStreamCreate`, :py:obj:`~.cudaStreamCreateWithFlags`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaStreamSynchronize`, :py:obj:`~.cudaStreamAddCallback`, :py:obj:`~.cudaStreamDestroy`, :py:obj:`~.cuStreamQuery`
+    :py:obj:`~.cudaStreamCreate`, :py:obj:`~.cudaStreamCreateWithFlags`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaStreamSynchronize`, :py:obj:`~.cudaStreamAddCallback`, :py:obj:`~.cudaStreamDestroy`, :func:`~.cuStreamQuery`
     """
     cdef cyruntime.cudaStream_t cystream
     if stream is None:
@@ -24159,12 +24200,12 @@ def cudaStreamAttachMemAsync(stream, devPtr, size_t length, unsigned int flags):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorNotReady`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidResourceHandle`
 
     See Also
     --------
-    :py:obj:`~.cudaStreamCreate`, :py:obj:`~.cudaStreamCreateWithFlags`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaStreamSynchronize`, :py:obj:`~.cudaStreamAddCallback`, :py:obj:`~.cudaStreamDestroy`, :py:obj:`~.cudaMallocManaged`, :py:obj:`~.cuStreamAttachMemAsync`
+    :py:obj:`~.cudaStreamCreate`, :py:obj:`~.cudaStreamCreateWithFlags`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaStreamSynchronize`, :py:obj:`~.cudaStreamAddCallback`, :py:obj:`~.cudaStreamDestroy`, :py:obj:`~.cudaMallocManaged`, :func:`~.cuStreamAttachMemAsync`
     """
     cdef cyruntime.cudaStream_t cystream
     if stream is None:
@@ -24214,7 +24255,7 @@ def cudaStreamBeginCapture(stream, mode not None : cudaStreamCaptureMode):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
 
     See Also
@@ -24257,13 +24298,13 @@ def cudaStreamBeginRecaptureToGraph(stream, mode not None : cudaStreamCaptureMod
     Any other node parameter mismatches during recapture can be configured
     to call the function provided in ``callbackFunc``. The recapture will
     fail immediately if the callback returns anything other than
-    cudaSuccess.
+    :py:obj:`~.cudaSuccess`.
 
     If the recapture fails for any reason, the ``graph`` will be in an
     undefined state and should be destroyed.
 
-    See cudaStreamBeginCapture for additional detail on beginning the
-    capture.
+    See :func:`~.cudaStreamBeginCapture` for additional detail on beginning
+    the capture.
 
     Parameters
     ----------
@@ -24276,13 +24317,13 @@ def cudaStreamBeginRecaptureToGraph(stream, mode not None : cudaStreamCaptureMod
     graph : :py:obj:`~.CUgraph` or :py:obj:`~.cudaGraph_t`
         Existing CUDA graph to be captured into
     callbackData : :py:obj:`~.cudaGraphRecaptureCallbackData`
-        Optional struct of callback data that will be invoked for all
+        Optional struct of callback data that will be invoked for all
         parameter mismatches from the original graph
 
     Returns
     -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorDeinitialized`, :py:obj:`~.cudaErrorNotInitialized`, :py:obj:`~.cudaErrorInvalidValue`,
+    :py:obj:`~.cudaError_t`
+        :py:obj:`~.cudaSuccess`, ``cudaErrorDeinitialized``, ``cudaErrorNotInitialized``, :py:obj:`~.cudaErrorInvalidValue`,
 
     See Also
     --------
@@ -24357,7 +24398,7 @@ def cudaStreamBeginCaptureToGraph(stream, graph, dependencies : Optional[tuple[c
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
 
     See Also
@@ -24451,24 +24492,25 @@ def cudaThreadExchangeStreamCaptureMode(mode not None : cudaStreamCaptureMode):
 
     A thread's mode is one of the following:
 
-    - ``cudaStreamCaptureModeGlobal:`` This is the default mode. If the
-      local thread has an ongoing capture sequence that was not initiated
-      with ``cudaStreamCaptureModeRelaxed`` at ``cuStreamBeginCapture``, or
-      if any other thread has a concurrent capture sequence initiated with
-      ``cudaStreamCaptureModeGlobal``, this thread is prohibited from
-      potentially unsafe API calls.
+    - :py:obj:`~.cudaStreamCaptureModeGlobal`: This is the default mode. If
+      the local thread has an ongoing capture sequence that was not
+      initiated with :py:obj:`~.cudaStreamCaptureModeRelaxed` at
+      :func:`~.cuStreamBeginCapture`, or if any other thread has a
+      concurrent capture sequence initiated with
+      :py:obj:`~.cudaStreamCaptureModeGlobal`, this thread is prohibited
+      from potentially unsafe API calls.
 
-    - ``cudaStreamCaptureModeThreadLocal:`` If the local thread has an
-      ongoing capture sequence not initiated with
-      ``cudaStreamCaptureModeRelaxed``, it is prohibited from potentially
-      unsafe API calls. Concurrent capture sequences in other threads are
-      ignored.
+    - :py:obj:`~.cudaStreamCaptureModeThreadLocal`: If the local thread has
+      an ongoing capture sequence not initiated with
+      :py:obj:`~.cudaStreamCaptureModeRelaxed`, it is prohibited from
+      potentially unsafe API calls. Concurrent capture sequences in other
+      threads are ignored.
 
-    - ``cudaStreamCaptureModeRelaxed:`` The local thread is not prohibited
-      from potentially unsafe API calls. Note that the thread is still
-      prohibited from API calls which necessarily conflict with stream
-      capture, for example, attempting :py:obj:`~.cudaEventQuery` on an
-      event that was last recorded inside a capture sequence.
+    - :py:obj:`~.cudaStreamCaptureModeRelaxed`: The local thread is not
+      prohibited from potentially unsafe API calls. Note that the thread is
+      still prohibited from API calls which necessarily conflict with
+      stream capture, for example, attempting :py:obj:`~.cudaEventQuery` on
+      an event that was last recorded inside a capture sequence.
 
     Parameters
     ----------
@@ -24477,7 +24519,7 @@ def cudaThreadExchangeStreamCaptureMode(mode not None : cudaStreamCaptureMode):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
     mode : :py:obj:`~.cudaStreamCaptureMode`
         Pointer to mode value to swap with the current mode
@@ -24517,7 +24559,7 @@ def cudaStreamEndCapture(stream):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorStreamCaptureWrongThread`
     pGraph : :py:obj:`~.cudaGraph_t`
         The captured graph
@@ -24579,7 +24621,7 @@ def cudaStreamIsCapturing(stream):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorStreamCaptureImplicit`
     pCaptureStatus : :py:obj:`~.cudaStreamCaptureStatus`
         Returns the stream's capture status
@@ -24619,7 +24661,7 @@ def cudaStreamGetCaptureInfo(stream):
     Valid data (other than capture status) is returned only if both of the
     following are true:
 
-    - the call returns cudaSuccess
+    - the call returns :py:obj:`~.cudaSuccess`
 
     - the returned capture status is
       :py:obj:`~.cudaStreamCaptureStatusActive`
@@ -24636,7 +24678,7 @@ def cudaStreamGetCaptureInfo(stream):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorStreamCaptureImplicit`, :py:obj:`~.cudaErrorLossyQuery`
     captureStatus_out : :py:obj:`~.cudaStreamCaptureStatus`
         Location to return the capture status of the stream; required
@@ -24746,7 +24788,7 @@ def cudaStreamUpdateCaptureDependencies(stream, dependencies : Optional[tuple[cu
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorIllegalState`
 
     See Also
@@ -24806,14 +24848,14 @@ def cudaEventCreate():
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorLaunchFailure`, :py:obj:`~.cudaErrorMemoryAllocation`
     event : :py:obj:`~.cudaEvent_t`
         Newly created event
 
     See Also
     --------
-    cudaEventCreate (C++ API), :py:obj:`~.cudaEventCreateWithFlags`, :py:obj:`~.cudaEventRecord`, :py:obj:`~.cudaEventQuery`, :py:obj:`~.cudaEventSynchronize`, :py:obj:`~.cudaEventDestroy`, :py:obj:`~.cudaEventElapsedTime`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cuEventCreate`
+    :func:`~.cudaEventCreate` (C++ API), :py:obj:`~.cudaEventCreateWithFlags`, :py:obj:`~.cudaEventRecord`, :py:obj:`~.cudaEventQuery`, :py:obj:`~.cudaEventSynchronize`, :py:obj:`~.cudaEventDestroy`, :py:obj:`~.cudaEventElapsedTime`, :py:obj:`~.cudaStreamWaitEvent`, :func:`~.cuEventCreate`
     """
     cdef cudaEvent_t event = cudaEvent_t()
     with nogil:
@@ -24858,14 +24900,14 @@ def cudaEventCreateWithFlags(unsigned int flags):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorLaunchFailure`, :py:obj:`~.cudaErrorMemoryAllocation`
     event : :py:obj:`~.cudaEvent_t`
         Newly created event
 
     See Also
     --------
-    :py:obj:`~.cudaEventCreate (C API)`, :py:obj:`~.cudaEventSynchronize`, :py:obj:`~.cudaEventDestroy`, :py:obj:`~.cudaEventElapsedTime`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cuEventCreate`
+    :py:obj:`~.cudaEventCreate` (C API), :py:obj:`~.cudaEventSynchronize`, :py:obj:`~.cudaEventDestroy`, :py:obj:`~.cudaEventElapsedTime`, :py:obj:`~.cudaStreamWaitEvent`, :func:`~.cuEventCreate`
     """
     cdef cudaEvent_t event = cudaEvent_t()
     with nogil:
@@ -24907,12 +24949,12 @@ def cudaEventRecord(event, stream):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidResourceHandle`, :py:obj:`~.cudaErrorLaunchFailure`
 
     See Also
     --------
-    :py:obj:`~.cudaEventCreate (C API)`, :py:obj:`~.cudaEventCreateWithFlags`, :py:obj:`~.cudaEventQuery`, :py:obj:`~.cudaEventSynchronize`, :py:obj:`~.cudaEventDestroy`, :py:obj:`~.cudaEventElapsedTime`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaEventRecordWithFlags`, :py:obj:`~.cuEventRecord`
+    :py:obj:`~.cudaEventCreate` (C API), :py:obj:`~.cudaEventCreateWithFlags`, :py:obj:`~.cudaEventQuery`, :py:obj:`~.cudaEventSynchronize`, :py:obj:`~.cudaEventDestroy`, :py:obj:`~.cudaEventElapsedTime`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaEventRecordWithFlags`, :func:`~.cuEventRecord`
     """
     cdef cyruntime.cudaStream_t cystream
     if stream is None:
@@ -24976,12 +25018,12 @@ def cudaEventRecordWithFlags(event, stream, unsigned int flags):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidResourceHandle`, :py:obj:`~.cudaErrorLaunchFailure`
 
     See Also
     --------
-    :py:obj:`~.cudaEventCreate (C API)`, :py:obj:`~.cudaEventCreateWithFlags`, :py:obj:`~.cudaEventQuery`, :py:obj:`~.cudaEventSynchronize`, :py:obj:`~.cudaEventDestroy`, :py:obj:`~.cudaEventElapsedTime`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaEventRecord`, :py:obj:`~.cuEventRecord`,
+    :py:obj:`~.cudaEventCreate` (C API), :py:obj:`~.cudaEventCreateWithFlags`, :py:obj:`~.cudaEventQuery`, :py:obj:`~.cudaEventSynchronize`, :py:obj:`~.cudaEventDestroy`, :py:obj:`~.cudaEventElapsedTime`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaEventRecord`, :func:`~.cuEventRecord`
     """
     cdef cyruntime.cudaStream_t cystream
     if stream is None:
@@ -25029,12 +25071,12 @@ def cudaEventQuery(event):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorNotReady`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidResourceHandle`, :py:obj:`~.cudaErrorLaunchFailure`
 
     See Also
     --------
-    :py:obj:`~.cudaEventCreate (C API)`, :py:obj:`~.cudaEventCreateWithFlags`, :py:obj:`~.cudaEventRecord`, :py:obj:`~.cudaEventSynchronize`, :py:obj:`~.cudaEventDestroy`, :py:obj:`~.cudaEventElapsedTime`, :py:obj:`~.cuEventQuery`
+    :py:obj:`~.cudaEventCreate (C API)`, :py:obj:`~.cudaEventCreateWithFlags`, :py:obj:`~.cudaEventRecord`, :py:obj:`~.cudaEventSynchronize`, :py:obj:`~.cudaEventDestroy`, :py:obj:`~.cudaEventElapsedTime`, :func:`~.cuEventQuery`
     """
     cdef cyruntime.cudaEvent_t cyevent
     if event is None:
@@ -25073,12 +25115,12 @@ def cudaEventSynchronize(event):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidResourceHandle`, :py:obj:`~.cudaErrorLaunchFailure`
 
     See Also
     --------
-    :py:obj:`~.cudaEventCreate (C API)`, :py:obj:`~.cudaEventCreateWithFlags`, :py:obj:`~.cudaEventRecord`, :py:obj:`~.cudaEventQuery`, :py:obj:`~.cudaEventDestroy`, :py:obj:`~.cudaEventElapsedTime`, :py:obj:`~.cuEventSynchronize`
+    :py:obj:`~.cudaEventCreate (C API)`, :py:obj:`~.cudaEventCreateWithFlags`, :py:obj:`~.cudaEventRecord`, :py:obj:`~.cudaEventQuery`, :py:obj:`~.cudaEventDestroy`, :py:obj:`~.cudaEventElapsedTime`, :func:`~.cuEventSynchronize`
     """
     cdef cyruntime.cudaEvent_t cyevent
     if event is None:
@@ -25114,12 +25156,12 @@ def cudaEventDestroy(event):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidResourceHandle`, :py:obj:`~.cudaErrorLaunchFailure`
 
     See Also
     --------
-    :py:obj:`~.cudaEventCreate (C API)`, :py:obj:`~.cudaEventCreateWithFlags`, :py:obj:`~.cudaEventQuery`, :py:obj:`~.cudaEventSynchronize`, :py:obj:`~.cudaEventRecord`, :py:obj:`~.cudaEventElapsedTime`, :py:obj:`~.cuEventDestroy`
+    :py:obj:`~.cudaEventCreate (C API)`, :py:obj:`~.cudaEventCreateWithFlags`, :py:obj:`~.cudaEventQuery`, :py:obj:`~.cudaEventSynchronize`, :py:obj:`~.cudaEventRecord`, :py:obj:`~.cudaEventElapsedTime`, :func:`~.cuEventDestroy`
     """
     cdef cyruntime.cudaEvent_t cyevent
     if event is None:
@@ -25174,14 +25216,14 @@ def cudaEventElapsedTime(start, end):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorNotReady`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidResourceHandle`, :py:obj:`~.cudaErrorLaunchFailure`, :py:obj:`~.cudaErrorUnknown`
     ms : float
         Time between ``start`` and ``end`` in ms
 
     See Also
     --------
-    :py:obj:`~.cudaEventCreate (C API)`, :py:obj:`~.cudaEventCreateWithFlags`, :py:obj:`~.cudaEventQuery`, :py:obj:`~.cudaEventSynchronize`, :py:obj:`~.cudaEventDestroy`, :py:obj:`~.cudaEventRecord`, :py:obj:`~.cuEventElapsedTime`
+    :py:obj:`~.cudaEventCreate (C API)`, :py:obj:`~.cudaEventCreateWithFlags`, :py:obj:`~.cudaEventQuery`, :py:obj:`~.cudaEventSynchronize`, :py:obj:`~.cudaEventDestroy`, :py:obj:`~.cudaEventRecord`, :func:`~.cuEventElapsedTime`
     """
     cdef cyruntime.cudaEvent_t cyend
     if end is None:
@@ -25333,7 +25375,7 @@ def cudaImportExternalMemory(memHandleDesc : Optional[cudaExternalMemoryHandleDe
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidResourceHandle`, :py:obj:`~.cudaErrorOperatingSystem`
     extMem_out : :py:obj:`~.cudaExternalMemory_t`
         Returned handle to an external memory object
@@ -25399,7 +25441,7 @@ def cudaExternalMemoryGetMappedBuffer(extMem, bufferDesc : Optional[cudaExternal
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidResourceHandle`
     devPtr : Any
         Returned device pointer to buffer
@@ -25467,7 +25509,7 @@ def cudaExternalMemoryGetMappedMipmappedArray(extMem, mipmapDesc : Optional[cuda
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidResourceHandle`
     mipmap : :py:obj:`~.cudaMipmappedArray_t`
         Returned CUDA mipmapped array
@@ -25515,7 +25557,7 @@ def cudaDestroyExternalMemory(extMem):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidResourceHandle`
 
     See Also
@@ -25664,7 +25706,7 @@ def cudaImportExternalSemaphore(semHandleDesc : Optional[cudaExternalSemaphoreHa
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidResourceHandle`, :py:obj:`~.cudaErrorOperatingSystem`
     extSem_out : :py:obj:`~.cudaExternalSemaphore_t`
         Returned handle to an external semaphore
@@ -25730,7 +25772,8 @@ def cudaSignalExternalSemaphoresAsync(extSemArray : Optional[tuple[cudaExternalS
     the type :py:obj:`~.cudaExternalSemaphoreHandleTypeNvSciSync`, if the
     NvSciSyncAttrList used to create the NvSciSyncObj had not set the flags
     in :py:obj:`~.cudaDeviceGetNvSciSyncAttributes` to
-    cudaNvSciSyncAttrSignal, this API will return cudaErrorNotSupported.
+    :py:obj:`~.cudaNvSciSyncAttrSignal`, this API will return
+    :py:obj:`~.cudaErrorNotSupported`.
 
     :py:obj:`~.cudaExternalSemaphoreSignalParams.params.nvSciSync.fence`
     associated with semaphore object of the type
@@ -25787,7 +25830,7 @@ def cudaSignalExternalSemaphoresAsync(extSemArray : Optional[tuple[cudaExternalS
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidResourceHandle`
 
     See Also
@@ -25888,7 +25931,8 @@ def cudaWaitExternalSemaphoresAsync(extSemArray : Optional[tuple[cudaExternalSem
     :py:obj:`~.cudaExternalSemaphoreHandleTypeNvSciSync`, if the
     NvSciSyncAttrList used to create the NvSciSyncObj had not set the flags
     in :py:obj:`~.cudaDeviceGetNvSciSyncAttributes` to
-    cudaNvSciSyncAttrWait, this API will return cudaErrorNotSupported.
+    :py:obj:`~.cudaNvSciSyncAttrWait`, this API will return
+    :py:obj:`~.cudaErrorNotSupported`.
 
     If the semaphore object is any one of the following types:
     :py:obj:`~.cudaExternalSemaphoreHandleTypeKeyedMutex`,
@@ -25915,7 +25959,7 @@ def cudaWaitExternalSemaphoresAsync(extSemArray : Optional[tuple[cudaExternalSem
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidResourceHandle` :py:obj:`~.cudaErrorTimeout`
 
     See Also
@@ -25983,7 +26027,7 @@ def cudaDestroyExternalSemaphore(extSem):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidResourceHandle`
 
     See Also
@@ -26051,16 +26095,16 @@ def cudaFuncSetCacheConfig(func, cacheConfig not None : cudaFuncCache):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidDeviceFunction` 2
 
     See Also
     --------
-    cudaFuncSetCacheConfig (C++ API), :py:obj:`~.cudaFuncGetAttributes (C API)`, :py:obj:`~.cudaLaunchKernel (C API)`, :py:obj:`~.cuFuncSetCacheConfig`
+    :func:`~.cudaFuncSetCacheConfig` (C++ API), :py:obj:`~.cudaFuncGetAttributes (C API)`, :py:obj:`~.cudaLaunchKernel (C API)`, :func:`~.cuFuncSetCacheConfig`
 
     Notes
     -----
-    This API does not accept a :py:obj:`~.cudaKernel_t` casted as void*. If cache config modification is required for a :py:obj:`~.cudaKernel_t` (or a global function), it can be replaced with a call to :py:obj:`~.cudaFuncSetAttributes` with the attribute :py:obj:`~.cudaFuncAttributePreferredSharedMemoryCarveout` to specify a more granular L1 cache and shared memory split configuration.
+    This API does not accept a :py:obj:`~.cudaKernel_t` casted as ``void*``. If cache config modification is required for a :py:obj:`~.cudaKernel_t` (or a global function), it can be replaced with a call to ``cudaFuncSetAttributes`` with the attribute :py:obj:`~.cudaFuncAttributePreferredSharedMemoryCarveout` to specify a more granular L1 cache and shared memory split configuration.
     """
     cdef _HelperInputVoidPtrStruct cyfuncHelper
     cdef void* cyfunc = _helper_input_void_ptr(func, &cyfuncHelper)
@@ -26096,14 +26140,14 @@ def cudaFuncGetAttributes(func):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidDeviceFunction` 2
     attr : :py:obj:`~.cudaFuncAttributes`
         Return pointer to function's attributes
 
     See Also
     --------
-    :py:obj:`~.cudaFuncSetCacheConfig (C API)`, cudaFuncGetAttributes (C++ API), :py:obj:`~.cudaLaunchKernel (C API)`, :py:obj:`~.cuFuncGetAttribute`
+    :py:obj:`~.cudaFuncSetCacheConfig (C API)`, :func:`~.cudaFuncGetAttributes` (C++ API), :py:obj:`~.cudaLaunchKernel (C API)`, :func:`~.cuFuncGetAttribute`
     """
     cdef cudaFuncAttributes attr = cudaFuncAttributes()
     cdef _HelperInputVoidPtrStruct cyfuncHelper
@@ -26136,7 +26180,7 @@ def cudaFuncSetAttribute(func, attr not None : cudaFuncAttribute, int value):
     - :py:obj:`~.cudaFuncAttributeMaxDynamicSharedMemorySize` - The
       requested maximum size in bytes of dynamically-allocated shared
       memory. The sum of this value and the function attribute
-      :py:obj:`~.sharedSizeBytes` cannot exceed the device attribute
+      ``sharedSizeBytes`` cannot exceed the device attribute
       :py:obj:`~.cudaDevAttrMaxSharedMemoryPerBlockOptin`. The maximal size
       of requestable dynamic shared memory may differ by GPU architecture.
 
@@ -26153,21 +26197,21 @@ def cudaFuncSetAttribute(func, attr not None : cudaFuncAttribute, int value):
       either all be 0 or all be positive. The validity of the cluster
       dimensions is checked at launch time. If the value is set during
       compile time, it cannot be set at runtime. Setting it at runtime will
-      return cudaErrorNotPermitted.
+      return :py:obj:`~.cudaErrorNotPermitted`.
 
     - :py:obj:`~.cudaFuncAttributeRequiredClusterHeight`: The required
       cluster height in blocks. The width, height, and depth values must
       either all be 0 or all be positive. The validity of the cluster
       dimensions is checked at launch time. If the value is set during
       compile time, it cannot be set at runtime. Setting it at runtime will
-      return cudaErrorNotPermitted.
+      return :py:obj:`~.cudaErrorNotPermitted`.
 
     - :py:obj:`~.cudaFuncAttributeRequiredClusterDepth`: The required
       cluster depth in blocks. The width, height, and depth values must
       either all be 0 or all be positive. The validity of the cluster
       dimensions is checked at launch time. If the value is set during
       compile time, it cannot be set at runtime. Setting it at runtime will
-      return cudaErrorNotPermitted.
+      return :py:obj:`~.cudaErrorNotPermitted`.
 
     - :py:obj:`~.cudaFuncAttributeNonPortableClusterSizeAllowed`: Indicates
       whether the function can be launched with non-portable cluster size.
@@ -26175,10 +26219,10 @@ def cudaFuncSetAttribute(func, attr not None : cudaFuncAttribute, int value):
 
     - :py:obj:`~.cudaFuncAttributeClusterSchedulingPolicyPreference`: The
       block scheduling policy of a function. The value type is
-      cudaClusterSchedulingPolicy.
+      :py:obj:`~.cudaClusterSchedulingPolicy`.
 
-    cudaLaunchKernel (C++ API), cudaFuncSetCacheConfig (C++ API),
-    :py:obj:`~.cudaFuncGetAttributes (C API)`,
+    :func:`~.cudaLaunchKernel` (C++ API), :func:`~.cudaFuncSetCacheConfig`
+    (C++ API), :py:obj:`~.cudaFuncGetAttributes (C API)`,
 
     Parameters
     ----------
@@ -26191,7 +26235,7 @@ def cudaFuncSetAttribute(func, attr not None : cudaFuncAttribute, int value):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidDeviceFunction`, :py:obj:`~.cudaErrorInvalidValue`
     """
     cdef _HelperInputVoidPtrStruct cyfuncHelper
@@ -26219,7 +26263,7 @@ def cudaFuncGetParamCount(func):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`,
     paramCount : int
         Returns the number of parameters used by the function
@@ -26290,9 +26334,8 @@ def cudaLaunchHostFunc(stream, fn, userData):
       Thus, for example, stream synchronization can be done by signaling
       from a host function at the end of the stream.
 
-    Note that, in constrast to :py:obj:`~.cuStreamAddCallback`, the
-    function will not be called in the event of an error in the CUDA
-    context.
+    Note that, in contrast to :func:`~.cuStreamAddCallback`, the function
+    will not be called in the event of an error in the CUDA context.
 
     Parameters
     ----------
@@ -26305,12 +26348,12 @@ def cudaLaunchHostFunc(stream, fn, userData):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidResourceHandle`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorNotSupported`
 
     See Also
     --------
-    :py:obj:`~.cudaStreamCreate`, :py:obj:`~.cudaStreamQuery`, :py:obj:`~.cudaStreamSynchronize`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaStreamDestroy`, :py:obj:`~.cudaMallocManaged`, :py:obj:`~.cudaStreamAttachMemAsync`, :py:obj:`~.cudaStreamAddCallback`, :py:obj:`~.cuLaunchHostFunc`
+    :py:obj:`~.cudaStreamCreate`, :py:obj:`~.cudaStreamQuery`, :py:obj:`~.cudaStreamSynchronize`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaStreamDestroy`, :py:obj:`~.cudaMallocManaged`, :py:obj:`~.cudaStreamAttachMemAsync`, :py:obj:`~.cudaStreamAddCallback`, :func:`~.cuLaunchHostFunc`
     """
     cdef cyruntime.cudaHostFn_t cyfn
     if fn is None:
@@ -26389,9 +26432,8 @@ def cudaLaunchHostFunc_v2(stream, fn, userData, unsigned int syncMode):
       Thus, for example, stream synchronization can be done by signaling
       from a host function at the end of the stream.
 
-    Note that, in constrast to :py:obj:`~.cuStreamAddCallback`, the
-    function will not be called in the event of an error in the CUDA
-    context.
+    Note that, in contrast to :func:`~.cuStreamAddCallback`, the function
+    will not be called in the event of an error in the CUDA context.
 
     Parameters
     ----------
@@ -26406,12 +26448,12 @@ def cudaLaunchHostFunc_v2(stream, fn, userData, unsigned int syncMode):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidResourceHandle`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorNotSupported`
 
     See Also
     --------
-    :py:obj:`~.cudaStreamCreate`, :py:obj:`~.cudaStreamQuery`, :py:obj:`~.cudaStreamSynchronize`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaStreamDestroy`, :py:obj:`~.cudaMallocManaged`, :py:obj:`~.cudaStreamAttachMemAsync`, :py:obj:`~.cudaStreamAddCallback`, :py:obj:`~.cuLaunchHostFunc`
+    :py:obj:`~.cudaStreamCreate`, :py:obj:`~.cudaStreamQuery`, :py:obj:`~.cudaStreamSynchronize`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cudaStreamDestroy`, :py:obj:`~.cudaMallocManaged`, :py:obj:`~.cudaStreamAttachMemAsync`, :py:obj:`~.cudaStreamAddCallback`, :func:`~.cuLaunchHostFunc`
     """
     cdef cyruntime.cudaHostFn_t cyfn
     if fn is None:
@@ -26490,12 +26532,12 @@ def cudaFuncSetSharedMemConfig(func, config not None : cudaSharedMemConfig):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidDeviceFunction`, :py:obj:`~.cudaErrorInvalidValue`,2
 
     See Also
     --------
-    :py:obj:`~.cudaDeviceSetSharedMemConfig`, :py:obj:`~.cudaDeviceGetSharedMemConfig`, :py:obj:`~.cudaDeviceSetCacheConfig`, :py:obj:`~.cudaDeviceGetCacheConfig`, :py:obj:`~.cudaFuncSetCacheConfig`, :py:obj:`~.cuFuncSetSharedMemConfig`
+    :py:obj:`~.cudaDeviceSetSharedMemConfig`, :py:obj:`~.cudaDeviceGetSharedMemConfig`, :py:obj:`~.cudaDeviceSetCacheConfig`, :py:obj:`~.cudaDeviceGetCacheConfig`, :py:obj:`~.cudaFuncSetCacheConfig`, :func:`~.cuFuncSetSharedMemConfig`
     """
     cdef _HelperInputVoidPtrStruct cyfuncHelper
     cdef void* cyfunc = _helper_input_void_ptr(func, &cyfuncHelper)
@@ -26526,14 +26568,14 @@ def cudaOccupancyMaxActiveBlocksPerMultiprocessor(func, int blockSize, size_t dy
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidDevice`, :py:obj:`~.cudaErrorInvalidDeviceFunction`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorUnknown`,
     numBlocks : int
         Returned occupancy
 
     See Also
     --------
-    :py:obj:`~.cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags`, cudaOccupancyMaxPotentialBlockSize (C++ API), cudaOccupancyMaxPotentialBlockSizeWithFlags (C++ API), cudaOccupancyMaxPotentialBlockSizeVariableSMem (C++ API), cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags (C++ API), cudaOccupancyAvailableDynamicSMemPerBlock (C++ API), :py:obj:`~.cuOccupancyMaxActiveBlocksPerMultiprocessor`
+    :py:obj:`~.cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags`, cudaOccupancyMaxPotentialBlockSize (C++ API), cudaOccupancyMaxPotentialBlockSizeWithFlags (C++ API), cudaOccupancyMaxPotentialBlockSizeVariableSMem (C++ API), cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags (C++ API), :func:`~.cudaOccupancyAvailableDynamicSMemPerBlock` (C++ API), :func:`~.cuOccupancyMaxActiveBlocksPerMultiprocessor`
     """
     cdef int numBlocks = 0
     cdef _HelperInputVoidPtrStruct cyfuncHelper
@@ -26566,7 +26608,7 @@ def cudaOccupancyAvailableDynamicSMemPerBlock(func, int numBlocks, int blockSize
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidDevice`, :py:obj:`~.cudaErrorInvalidDeviceFunction`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorUnknown`,
     dynamicSmemSize : int
         Returned maximum dynamic shared memory
@@ -26623,14 +26665,14 @@ def cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(func, int blockSize,
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidDevice`, :py:obj:`~.cudaErrorInvalidDeviceFunction`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorUnknown`,
     numBlocks : int
         Returned occupancy
 
     See Also
     --------
-    :py:obj:`~.cudaOccupancyMaxActiveBlocksPerMultiprocessor`, cudaOccupancyMaxPotentialBlockSize (C++ API), cudaOccupancyMaxPotentialBlockSizeWithFlags (C++ API), cudaOccupancyMaxPotentialBlockSizeVariableSMem (C++ API), cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags (C++ API), cudaOccupancyAvailableDynamicSMemPerBlock (C++ API), :py:obj:`~.cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags`
+    :py:obj:`~.cudaOccupancyMaxActiveBlocksPerMultiprocessor`, cudaOccupancyMaxPotentialBlockSize (C++ API), cudaOccupancyMaxPotentialBlockSizeWithFlags (C++ API), cudaOccupancyMaxPotentialBlockSizeVariableSMem (C++ API), cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags (C++ API), :func:`~.cudaOccupancyAvailableDynamicSMemPerBlock` (C++ API), :func:`~.cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags`
     """
     cdef int numBlocks = 0
     cdef _HelperInputVoidPtrStruct cyfuncHelper
@@ -26760,14 +26802,14 @@ def cudaMallocManaged(size_t size, unsigned int flags):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorMemoryAllocation`, :py:obj:`~.cudaErrorNotSupported`, :py:obj:`~.cudaErrorInvalidValue`
     devPtr : Any
         Pointer to allocated device memory
 
     See Also
     --------
-    :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaMallocArray`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, :py:obj:`~.cudaDeviceGetAttribute`, :py:obj:`~.cudaStreamAttachMemAsync`, :py:obj:`~.cuMemAllocManaged`
+    :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaMallocArray`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, :py:obj:`~.cudaDeviceGetAttribute`, :py:obj:`~.cudaStreamAttachMemAsync`, :func:`~.cuMemAllocManaged`
     """
     cdef void_ptr devPtr = 0
     with nogil:
@@ -26799,14 +26841,14 @@ def cudaMalloc(size_t size):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorMemoryAllocation`, :py:obj:`~.cudaErrorExternalDevice`
     devPtr : Any
         Pointer to allocated device memory
 
     See Also
     --------
-    :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaMallocArray`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, :py:obj:`~.cuMemAlloc`
+    :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaMallocArray`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, :func:`~.cuMemAlloc`
     """
     cdef void_ptr devPtr = 0
     with nogil:
@@ -26828,11 +26870,10 @@ def cudaMallocHost(size_t size):
     functions such as :py:obj:`~.cudaMemcpy`\\*(). Since the memory can be
     accessed directly by the device, it can be read or written with much
     higher bandwidth than pageable memory obtained with functions such as
-    :py:obj:`~.malloc()`.
+    ``malloc()``.
 
-    On systems where :py:obj:`~.pageableMemoryAccessUsesHostPageTables` is
-    true, :py:obj:`~.cudaMallocHost` may not page-lock the allocated
-    memory.
+    On systems where ``pageableMemoryAccessUsesHostPageTables`` is true,
+    :py:obj:`~.cudaMallocHost` may not page-lock the allocated memory.
 
     Page-locking excessive amounts of memory with
     :py:obj:`~.cudaMallocHost()` may degrade system performance, since it
@@ -26847,14 +26888,14 @@ def cudaMallocHost(size_t size):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorMemoryAllocation`, :py:obj:`~.cudaErrorExternalDevice`
     ptr : Any
         Pointer to allocated host memory
 
     See Also
     --------
-    :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaMallocArray`, :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaHostAlloc`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaFreeArray`, cudaMallocHost (C++ API), :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, :py:obj:`~.cuMemAllocHost`
+    :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaMallocArray`, :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaHostAlloc`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaFreeArray`, :func:`~.cudaMallocHost` (C++ API), :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, :func:`~.cuMemAllocHost`
     """
     cdef void_ptr ptr = 0
     with nogil:
@@ -26900,7 +26941,7 @@ def cudaMallocPitch(size_t width, size_t height):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorMemoryAllocation`
     devPtr : Any
         Pointer to allocated pitched device memory
@@ -26909,7 +26950,7 @@ def cudaMallocPitch(size_t width, size_t height):
 
     See Also
     --------
-    :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaMallocArray`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaHostAlloc`, :py:obj:`~.cuMemAllocPitch`
+    :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaMallocArray`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaHostAlloc`, :func:`~.cuMemAllocPitch`
     """
     cdef void_ptr devPtr = 0
     cdef size_t pitch = 0
@@ -26954,14 +26995,14 @@ def cudaMallocArray(desc : Optional[cudaChannelFormatDesc], size_t width, size_t
     - :py:obj:`~.cudaArraySparse`: Allocates a CUDA array without physical
       backing memory. The subregions within this sparse array can later be
       mapped onto a physical memory allocation by calling
-      :py:obj:`~.cuMemMapArrayAsync`. The physical backing memory must be
-      allocated via :py:obj:`~.cuMemCreate`.
+      :func:`~.cuMemMapArrayAsync`. The physical backing memory must be
+      allocated via :func:`~.cuMemCreate`.
 
     - :py:obj:`~.cudaArrayDeferredMapping`: Allocates a CUDA array without
       physical backing memory. The entire array can later be mapped onto a
-      physical memory allocation by calling :py:obj:`~.cuMemMapArrayAsync`.
+      physical memory allocation by calling :func:`~.cuMemMapArrayAsync`.
       The physical backing memory must be allocated via
-      :py:obj:`~.cuMemCreate`.
+      :func:`~.cuMemCreate`.
 
     ``width`` and ``height`` must meet certain size requirements. See
     :py:obj:`~.cudaMalloc3DArray()` for more details.
@@ -26979,14 +27020,14 @@ def cudaMallocArray(desc : Optional[cudaChannelFormatDesc], size_t width, size_t
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorMemoryAllocation`
     array : :py:obj:`~.cudaArray_t`
         Pointer to allocated array in device memory
 
     See Also
     --------
-    :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaHostAlloc`, :py:obj:`~.cuArrayCreate`
+    :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaHostAlloc`, :func:`~.cuArrayCreate`
     """
     cdef cudaArray_t array = cudaArray_t()
     cdef cyruntime.cudaChannelFormatDesc* cydesc_ptr = <cyruntime.cudaChannelFormatDesc*>desc._pvt_ptr if desc is not None else NULL
@@ -27020,8 +27061,7 @@ def cudaFree(devPtr):
 
     If :py:obj:`~.cudaFree`(``devPtr``) has already been called before, an
     error is returned. If ``devPtr`` is 0, no operation is performed.
-    :py:obj:`~.cudaFree()` returns :py:obj:`~.cudaErrorValue` in case of
-    failure.
+    :py:obj:`~.cudaFree()` returns ``cudaErrorValue`` in case of failure.
 
     The device version of :py:obj:`~.cudaFree` cannot be used with a
     ``*devPtr`` allocated using the host API, and vice versa.
@@ -27033,12 +27073,12 @@ def cudaFree(devPtr):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
 
     See Also
     --------
-    :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaMallocManaged`, :py:obj:`~.cudaMallocArray`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocAsync`, :py:obj:`~.cudaMallocFromPoolAsync` :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaFreeAsync` :py:obj:`~.cudaHostAlloc`, :py:obj:`~.cuMemFree`
+    :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaMallocManaged`, :py:obj:`~.cudaMallocArray`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocAsync`, :py:obj:`~.cudaMallocFromPoolAsync` :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaFreeAsync` :py:obj:`~.cudaHostAlloc`, :func:`~.cuMemFree`
     """
     cdef _HelperInputVoidPtrStruct cydevPtrHelper
     cdef void* cydevPtr = _helper_input_void_ptr(devPtr, &cydevPtrHelper)
@@ -27065,12 +27105,12 @@ def cudaFreeHost(ptr):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
 
     See Also
     --------
-    :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaMallocArray`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaHostAlloc`, :py:obj:`~.cuMemFreeHost`
+    :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaMallocArray`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaHostAlloc`, :func:`~.cuMemFreeHost`
     """
     cdef _HelperInputVoidPtrStruct cyptrHelper
     cdef void* cyptr = _helper_input_void_ptr(ptr, &cyptrHelper)
@@ -27097,12 +27137,12 @@ def cudaFreeArray(array):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
 
     See Also
     --------
-    :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaMallocArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, :py:obj:`~.cuArrayDestroy`
+    :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaMallocArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, :func:`~.cuArrayDestroy`
     """
     cdef cyruntime.cudaArray_t cyarray
     if array is None:
@@ -27134,12 +27174,12 @@ def cudaFreeMipmappedArray(mipmappedArray):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
 
     See Also
     --------
-    :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaMallocArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, :py:obj:`~.cuMipmappedArrayDestroy`
+    :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaMallocArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, :func:`~.cuMipmappedArrayDestroy`
     """
     cdef cyruntime.cudaMipmappedArray_t cymipmappedArray
     if mipmappedArray is None:
@@ -27166,11 +27206,11 @@ def cudaHostAlloc(size_t size, unsigned int flags):
     functions such as :py:obj:`~.cudaMemcpy()`. Since the memory can be
     accessed directly by the device, it can be read or written with much
     higher bandwidth than pageable memory obtained with functions such as
-    :py:obj:`~.malloc()`. Allocating excessive amounts of pinned memory may
-    degrade system performance, since it reduces the amount of memory
-    available to the system for paging. As a result, this function is best
-    used sparingly to allocate staging areas for data exchange between host
-    and device.
+    ``malloc()``. Allocating excessive amounts of pinned memory may degrade
+    system performance, since it reduces the amount of memory available to
+    the system for paging. As a result, this function is best used
+    sparingly to allocate staging areas for data exchange between host and
+    device.
 
     The ``flags`` parameter enables different options to be specified that
     affect the allocation, as follows.
@@ -27222,14 +27262,14 @@ def cudaHostAlloc(size_t size, unsigned int flags):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorMemoryAllocation`, :py:obj:`~.cudaErrorExternalDevice`
     pHost : Any
         Device pointer to allocated memory
 
     See Also
     --------
-    :py:obj:`~.cudaSetDeviceFlags`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaGetDeviceFlags`, :py:obj:`~.cuMemHostAlloc`
+    :py:obj:`~.cudaSetDeviceFlags`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaGetDeviceFlags`, :func:`~.cuMemHostAlloc`
     """
     cdef void_ptr pHost = 0
     with nogil:
@@ -27257,8 +27297,8 @@ def cudaHostRegister(ptr, size_t size, unsigned int flags):
     function is best used sparingly to register staging areas for data
     exchange between host and device.
 
-    On systems where :py:obj:`~.pageableMemoryAccessUsesHostPageTables` is
-    true, :py:obj:`~.cudaHostRegister` will not page-lock the memory range
+    On systems where ``pageableMemoryAccessUsesHostPageTables`` is true,
+    :py:obj:`~.cudaHostRegister` will not page-lock the memory range
     specified by ``ptr`` but only populate unpopulated pages.
 
     :py:obj:`~.cudaHostRegister` is supported only on I/O coherent devices
@@ -27296,14 +27336,14 @@ def cudaHostRegister(ptr, size_t size, unsigned int flags):
       :py:obj:`~.cudaDevAttrHostRegisterReadOnlySupported`. Using this flag
       with a current context associated with a device that does not have
       this attribute set will cause :py:obj:`~.cudaHostRegister` to error
-      with cudaErrorNotSupported.
+      with :py:obj:`~.cudaErrorNotSupported`.
 
     All of these flags are orthogonal to one another: a developer may page-
     lock memory that is portable or mapped with no restrictions.
 
-    The CUDA context must have been created with the
-    :py:obj:`~.cudaMapHost` flag in order for the
-    :py:obj:`~.cudaHostRegisterMapped` flag to have any effect.
+    The CUDA context must have been created with the ``cudaMapHost`` flag
+    in order for the :py:obj:`~.cudaHostRegisterMapped` flag to have any
+    effect.
 
     The :py:obj:`~.cudaHostRegisterMapped` flag may be specified on CUDA
     contexts for devices that do not support mapped pinned memory. The
@@ -27342,12 +27382,12 @@ def cudaHostRegister(ptr, size_t size, unsigned int flags):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorMemoryAllocation`, :py:obj:`~.cudaErrorHostMemoryAlreadyRegistered`, :py:obj:`~.cudaErrorNotSupported`, :py:obj:`~.cudaErrorExternalDevice`
 
     See Also
     --------
-    :py:obj:`~.cudaHostUnregister`, :py:obj:`~.cudaHostGetFlags`, :py:obj:`~.cudaHostGetDevicePointer`, :py:obj:`~.cuMemHostRegister`
+    :py:obj:`~.cudaHostUnregister`, :py:obj:`~.cudaHostGetFlags`, :py:obj:`~.cudaHostGetDevicePointer`, :func:`~.cuMemHostRegister`
     """
     cdef _HelperInputVoidPtrStruct cyptrHelper
     cdef void* cyptr = _helper_input_void_ptr(ptr, &cyptrHelper)
@@ -27361,7 +27401,7 @@ def cudaHostRegister(ptr, size_t size, unsigned int flags):
 
 @cython.embedsignature(True)
 def cudaHostUnregister(ptr):
-    """ Unregisters a memory range that was registered with cudaHostRegister.
+    """ Unregisters a memory range that was registered with :func:`~.cudaHostRegister`.
 
     Unmaps the memory range whose base address is specified by ``ptr``, and
     makes it pageable again.
@@ -27376,12 +27416,12 @@ def cudaHostUnregister(ptr):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorHostMemoryNotRegistered`
 
     See Also
     --------
-    :py:obj:`~.cudaHostUnregister`, :py:obj:`~.cuMemHostUnregister`
+    :py:obj:`~.cudaHostUnregister`, :func:`~.cuMemHostUnregister`
     """
     cdef _HelperInputVoidPtrStruct cyptrHelper
     cdef void* cyptr = _helper_input_void_ptr(ptr, &cyptrHelper)
@@ -27395,7 +27435,7 @@ def cudaHostUnregister(ptr):
 
 @cython.embedsignature(True)
 def cudaHostGetDevicePointer(pHost, unsigned int flags):
-    """ Passes back device pointer of mapped host memory allocated by cudaHostAlloc or registered by cudaHostRegister.
+    """ Passes back device pointer of mapped host memory allocated by :func:`~.cudaHostAlloc` or registered by :func:`~.cudaHostRegister`.
 
     Passes back the device pointer corresponding to the mapped, pinned host
     buffer allocated by :py:obj:`~.cudaHostAlloc()` or registered by
@@ -27435,14 +27475,14 @@ def cudaHostGetDevicePointer(pHost, unsigned int flags):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorMemoryAllocation`
     pDevice : Any
         Returned device pointer for mapped memory
 
     See Also
     --------
-    :py:obj:`~.cudaSetDeviceFlags`, :py:obj:`~.cudaHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`
+    :py:obj:`~.cudaSetDeviceFlags`, :py:obj:`~.cudaHostAlloc`, :func:`~.cuMemHostGetDevicePointer`
     """
     cdef void_ptr pDevice = 0
     cdef _HelperInputVoidPtrStruct cypHostHelper
@@ -27459,7 +27499,7 @@ def cudaHostGetDevicePointer(pHost, unsigned int flags):
 
 @cython.embedsignature(True)
 def cudaHostGetFlags(pHost):
-    """ Passes back flags used to allocate pinned host memory allocated by cudaHostAlloc.
+    """ Passes back flags used to allocate pinned host memory allocated by :func:`~.cudaHostAlloc`.
 
     :py:obj:`~.cudaHostGetFlags()` will fail if the input pointer does not
     reside in an address range allocated by :py:obj:`~.cudaHostAlloc()`.
@@ -27471,14 +27511,14 @@ def cudaHostGetFlags(pHost):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
     pFlags : unsigned int
         Returned flags word
 
     See Also
     --------
-    :py:obj:`~.cudaHostAlloc`, :py:obj:`~.cuMemHostGetFlags`
+    :py:obj:`~.cudaHostAlloc`, :func:`~.cuMemHostGetFlags`
     """
     cdef unsigned int pFlags = 0
     cdef _HelperInputVoidPtrStruct cypHostHelper
@@ -27523,14 +27563,14 @@ def cudaMalloc3D(extent not None : cudaExtent):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorMemoryAllocation`
     pitchedDevPtr : :py:obj:`~.cudaPitchedPtr`
         Pointer to allocated pitched device memory
 
     See Also
     --------
-    :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaMemcpy3D`, :py:obj:`~.cudaMemset3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaMallocArray`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, :py:obj:`~.make_cudaPitchedPtr`, :py:obj:`~.make_cudaExtent`, :py:obj:`~.cuMemAllocPitch`
+    :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaMemcpy3D`, :py:obj:`~.cudaMemset3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaMallocArray`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, :func:`~.make_cudaPitchedPtr`, :func:`~.make_cudaExtent`, :func:`~.cuMemAllocPitch`
     """
     cdef cudaPitchedPtr pitchedDevPtr = cudaPitchedPtr()
     with nogil:
@@ -27569,27 +27609,27 @@ def cudaMalloc3DArray(desc : Optional[cudaChannelFormatDesc], extent not None :
     - A 3D array is allocated if all three extents are non-zero.
 
     - A 1D layered CUDA array is allocated if only the height extent is
-      zero and the cudaArrayLayered flag is set. Each layer is a 1D array.
-      The number of layers is determined by the depth extent.
+      zero and the :py:obj:`~.cudaArrayLayered` flag is set. Each layer is
+      a 1D array. The number of layers is determined by the depth extent.
 
     - A 2D layered CUDA array is allocated if all three extents are non-
-      zero and the cudaArrayLayered flag is set. Each layer is a 2D array.
-      The number of layers is determined by the depth extent.
+      zero and the :py:obj:`~.cudaArrayLayered` flag is set. Each layer is
+      a 2D array. The number of layers is determined by the depth extent.
 
     - A cubemap CUDA array is allocated if all three extents are non-zero
-      and the cudaArrayCubemap flag is set. Width must be equal to height,
-      and depth must be six. A cubemap is a special type of 2D layered CUDA
-      array, where the six layers represent the six faces of a cube. The
-      order of the six layers in memory is the same as that listed in
-      :py:obj:`~.cudaGraphicsCubeFace`.
+      and the :py:obj:`~.cudaArrayCubemap` flag is set. Width must be equal
+      to height, and depth must be six. A cubemap is a special type of 2D
+      layered CUDA array, where the six layers represent the six faces of a
+      cube. The order of the six layers in memory is the same as that
+      listed in :py:obj:`~.cudaGraphicsCubeFace`.
 
     - A cubemap layered CUDA array is allocated if all three extents are
-      non-zero, and both, cudaArrayCubemap and cudaArrayLayered flags are
-      set. Width must be equal to height, and depth must be a multiple of
-      six. A cubemap layered CUDA array is a special type of 2D layered
-      CUDA array that consists of a collection of cubemaps. The first six
-      layers represent the first cubemap, the next six layers form the
-      second cubemap, and so on.
+      non-zero, and both, :py:obj:`~.cudaArrayCubemap` and
+      :py:obj:`~.cudaArrayLayered` flags are set. Width must be equal to
+      height, and depth must be a multiple of six. A cubemap layered CUDA
+      array is a special type of 2D layered CUDA array that consists of a
+      collection of cubemaps. The first six layers represent the first
+      cubemap, the next six layers form the second cubemap, and so on.
 
     The ``flags`` parameter enables different options to be specified that
     affect the allocation, as follows.
@@ -27602,7 +27642,8 @@ def cudaMalloc3DArray(desc : Optional[cudaChannelFormatDesc], extent not None :
 
     - :py:obj:`~.cudaArrayCubemap`: Allocates a cubemap CUDA array. Width
       must be equal to height, and depth must be six. If the
-      cudaArrayLayered flag is also set, depth must be a multiple of six.
+      :py:obj:`~.cudaArrayLayered` flag is also set, depth must be a
+      multiple of six.
 
     - :py:obj:`~.cudaArraySurfaceLoadStore`: Allocates a CUDA array that
       could be read from or written to using a surface reference.
@@ -27614,15 +27655,15 @@ def cudaMalloc3DArray(desc : Optional[cudaChannelFormatDesc], extent not None :
     - :py:obj:`~.cudaArraySparse`: Allocates a CUDA array without physical
       backing memory. The subregions within this sparse array can later be
       mapped onto a physical memory allocation by calling
-      :py:obj:`~.cuMemMapArrayAsync`. This flag can only be used for
-      creating 2D, 3D or 2D layered sparse CUDA arrays. The physical
-      backing memory must be allocated via :py:obj:`~.cuMemCreate`.
+      :func:`~.cuMemMapArrayAsync`. This flag can only be used for creating
+      2D, 3D or 2D layered sparse CUDA arrays. The physical backing memory
+      must be allocated via :func:`~.cuMemCreate`.
 
     - :py:obj:`~.cudaArrayDeferredMapping`: Allocates a CUDA array without
       physical backing memory. The entire array can later be mapped onto a
-      physical memory allocation by calling :py:obj:`~.cuMemMapArrayAsync`.
+      physical memory allocation by calling :func:`~.cuMemMapArrayAsync`.
       The physical backing memory must be allocated via
-      :py:obj:`~.cuMemCreate`.
+      :func:`~.cuMemCreate`.
 
     The width, height and depth extents must meet certain size requirements
     as listed in the following table. All values are specified in elements.
@@ -27645,14 +27686,14 @@ def cudaMalloc3DArray(desc : Optional[cudaChannelFormatDesc], extent not None :
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorMemoryAllocation`
     array : :py:obj:`~.cudaArray_t`
         Pointer to allocated array in device memory
 
     See Also
     --------
-    :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, :py:obj:`~.make_cudaExtent`, :py:obj:`~.cuArray3DCreate`
+    :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, :func:`~.make_cudaExtent`, :func:`~.cuArray3DCreate`
     """
     cdef cudaArray_t array = cudaArray_t()
     cdef cyruntime.cudaChannelFormatDesc* cydesc_ptr = <cyruntime.cudaChannelFormatDesc*>desc._pvt_ptr if desc is not None else NULL
@@ -27695,28 +27736,29 @@ def cudaMallocMipmappedArray(desc : Optional[cudaChannelFormatDesc], extent not
     - A 3D mipmapped array is allocated if all three extents are non-zero.
 
     - A 1D layered CUDA mipmapped array is allocated if only the height
-      extent is zero and the cudaArrayLayered flag is set. Each layer is a
-      1D mipmapped array. The number of layers is determined by the depth
-      extent.
+      extent is zero and the :py:obj:`~.cudaArrayLayered` flag is set. Each
+      layer is a 1D mipmapped array. The number of layers is determined by
+      the depth extent.
 
     - A 2D layered CUDA mipmapped array is allocated if all three extents
-      are non-zero and the cudaArrayLayered flag is set. Each layer is a 2D
-      mipmapped array. The number of layers is determined by the depth
-      extent.
+      are non-zero and the :py:obj:`~.cudaArrayLayered` flag is set. Each
+      layer is a 2D mipmapped array. The number of layers is determined by
+      the depth extent.
 
     - A cubemap CUDA mipmapped array is allocated if all three extents are
-      non-zero and the cudaArrayCubemap flag is set. Width must be equal to
-      height, and depth must be six. The order of the six layers in memory
-      is the same as that listed in :py:obj:`~.cudaGraphicsCubeFace`.
+      non-zero and the :py:obj:`~.cudaArrayCubemap` flag is set. Width must
+      be equal to height, and depth must be six. The order of the six
+      layers in memory is the same as that listed in
+      :py:obj:`~.cudaGraphicsCubeFace`.
 
     - A cubemap layered CUDA mipmapped array is allocated if all three
-      extents are non-zero, and both, cudaArrayCubemap and cudaArrayLayered
-      flags are set. Width must be equal to height, and depth must be a
-      multiple of six. A cubemap layered CUDA mipmapped array is a special
-      type of 2D layered CUDA mipmapped array that consists of a collection
-      of cubemap mipmapped arrays. The first six layers represent the first
-      cubemap mipmapped array, the next six layers form the second cubemap
-      mipmapped array, and so on.
+      extents are non-zero, and both, :py:obj:`~.cudaArrayCubemap` and
+      :py:obj:`~.cudaArrayLayered` flags are set. Width must be equal to
+      height, and depth must be a multiple of six. A cubemap layered CUDA
+      mipmapped array is a special type of 2D layered CUDA mipmapped array
+      that consists of a collection of cubemap mipmapped arrays. The first
+      six layers represent the first cubemap mipmapped array, the next six
+      layers form the second cubemap mipmapped array, and so on.
 
     The ``flags`` parameter enables different options to be specified that
     affect the allocation, as follows.
@@ -27729,7 +27771,8 @@ def cudaMallocMipmappedArray(desc : Optional[cudaChannelFormatDesc], extent not
 
     - :py:obj:`~.cudaArrayCubemap`: Allocates a cubemap CUDA mipmapped
       array. Width must be equal to height, and depth must be six. If the
-      cudaArrayLayered flag is also set, depth must be a multiple of six.
+      :py:obj:`~.cudaArrayLayered` flag is also set, depth must be a
+      multiple of six.
 
     - :py:obj:`~.cudaArraySurfaceLoadStore`: This flag indicates that
       individual mipmap levels of the CUDA mipmapped array will be read
@@ -27743,16 +27786,15 @@ def cudaMallocMipmappedArray(desc : Optional[cudaChannelFormatDesc], extent not
     - :py:obj:`~.cudaArraySparse`: Allocates a CUDA mipmapped array without
       physical backing memory. The subregions within this sparse array can
       later be mapped onto a physical memory allocation by calling
-      :py:obj:`~.cuMemMapArrayAsync`. This flag can only be used for
-      creating 2D, 3D or 2D layered sparse CUDA mipmapped arrays. The
-      physical backing memory must be allocated via
-      :py:obj:`~.cuMemCreate`.
+      :func:`~.cuMemMapArrayAsync`. This flag can only be used for creating
+      2D, 3D or 2D layered sparse CUDA mipmapped arrays. The physical
+      backing memory must be allocated via :func:`~.cuMemCreate`.
 
     - :py:obj:`~.cudaArrayDeferredMapping`: Allocates a CUDA mipmapped
       array without physical backing memory. The entire array can later be
       mapped onto a physical memory allocation by calling
-      :py:obj:`~.cuMemMapArrayAsync`. The physical backing memory must be
-      allocated via :py:obj:`~.cuMemCreate`.
+      :func:`~.cuMemMapArrayAsync`. The physical backing memory must be
+      allocated via :func:`~.cuMemCreate`.
 
     The width, height and depth extents must meet certain size requirements
     as listed in the following table. All values are specified in elements.
@@ -27772,14 +27814,14 @@ def cudaMallocMipmappedArray(desc : Optional[cudaChannelFormatDesc], extent not
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorMemoryAllocation`
     mipmappedArray : :py:obj:`~.cudaMipmappedArray_t`
         Pointer to allocated mipmapped array in device memory
 
     See Also
     --------
-    :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, :py:obj:`~.make_cudaExtent`, :py:obj:`~.cuMipmappedArrayCreate`
+    :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, :func:`~.make_cudaExtent`, :func:`~.cuMipmappedArrayCreate`
     """
     cdef cudaMipmappedArray_t mipmappedArray = cudaMipmappedArray_t()
     cdef cyruntime.cudaChannelFormatDesc* cydesc_ptr = <cyruntime.cudaChannelFormatDesc*>desc._pvt_ptr if desc is not None else NULL
@@ -27814,14 +27856,14 @@ def cudaGetMipmappedArrayLevel(mipmappedArray, unsigned int level):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` :py:obj:`~.cudaErrorInvalidResourceHandle`
     levelArray : :py:obj:`~.cudaArray_t`
         Returned mipmap level CUDA array
 
     See Also
     --------
-    :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, :py:obj:`~.make_cudaExtent`, :py:obj:`~.cuMipmappedArrayGetLevel`
+    :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc`, :py:obj:`~.cudaMallocPitch`, :py:obj:`~.cudaFree`, :py:obj:`~.cudaFreeArray`, :py:obj:`~.cudaMallocHost (C API)`, :py:obj:`~.cudaFreeHost`, :py:obj:`~.cudaHostAlloc`, :func:`~.make_cudaExtent`, :func:`~.cuMipmappedArrayGetLevel`
     """
     cdef cyruntime.cudaMipmappedArray_const_t cymipmappedArray
     if mipmappedArray is None:
@@ -27850,12 +27892,12 @@ def cudaMemcpy3D(p : Optional[cudaMemcpy3DParms]):
     :py:obj:`~.cudaMemcpy3D()` copies data betwen two 3D objects. The
     source and destination objects may be in either host memory, device
     memory, or a CUDA array. The source, destination, extent, and kind of
-    copy performed is specified by the :py:obj:`~.cudaMemcpy3DParms` struct
-    which should be initialized to zero before use:
+    copy performed is specified by the :py:obj:`~.cudaMemcpy3DParms`
+    struct which should be initialized to zero before use:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
-    The struct passed to :py:obj:`~.cudaMemcpy3D()` must specify one of
+    The struct passed to :py:obj:`~.cudaMemcpy3D()` must specify one of
     ``srcArray`` or ``srcPtr`` and one of ``dstArray`` or ``dstPtr``.
     Passing more than one non-zero source or destination will cause
     :py:obj:`~.cudaMemcpy3D()` to return an error.
@@ -27880,11 +27922,12 @@ def cudaMemcpy3D(p : Optional[cudaMemcpy3DParms]):
     :py:obj:`~.cudaMemcpyDefault` is only allowed on systems that support
     unified virtual addressing. For :py:obj:`~.cudaMemcpyHostToHost` or
     :py:obj:`~.cudaMemcpyHostToDevice` or
-    :py:obj:`~.cudaMemcpyDeviceToHost` passed as kind and cudaArray type
-    passed as source or destination, if the kind implies cudaArray type to
-    be present on the host, :py:obj:`~.cudaMemcpy3D()` will disregard that
-    implication and silently correct the kind based on the fact that
-    cudaArray type can only be present on the device.
+    :py:obj:`~.cudaMemcpyDeviceToHost` passed as kind and
+    :py:obj:`~.cudaArray_t` type passed as source or destination, if the
+    kind implies :py:obj:`~.cudaArray_t` type to be present on the host,
+    :py:obj:`~.cudaMemcpy3D()` will disregard that implication and silently
+    correct the kind based on the fact that :py:obj:`~.cudaArray_t` type
+    can only be present on the device.
 
     If the source and destination are both arrays,
     :py:obj:`~.cudaMemcpy3D()` will return an error if they do not have the
@@ -27910,12 +27953,12 @@ def cudaMemcpy3D(p : Optional[cudaMemcpy3DParms]):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidPitchValue`, :py:obj:`~.cudaErrorInvalidMemcpyDirection`
 
     See Also
     --------
-    :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaMemset3D`, :py:obj:`~.cudaMemcpy3DAsync`, :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpy2DFromArray`, :py:obj:`~.cudaMemcpy2DArrayToArray`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy2DAsync`, :py:obj:`~.cudaMemcpy2DToArrayAsync`, :py:obj:`~.cudaMemcpy2DFromArrayAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`, :py:obj:`~.make_cudaExtent`, :py:obj:`~.make_cudaPos`, :py:obj:`~.cuMemcpy3D`
+    :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaMemset3D`, :py:obj:`~.cudaMemcpy3DAsync`, :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpy2DFromArray`, :py:obj:`~.cudaMemcpy2DArrayToArray`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy2DAsync`, :py:obj:`~.cudaMemcpy2DToArrayAsync`, :py:obj:`~.cudaMemcpy2DFromArrayAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`, :func:`~.make_cudaExtent`, :func:`~.make_cudaPos`, :func:`~.cuMemcpy3D`
     """
     cdef cyruntime.cudaMemcpy3DParms* cyp_ptr = <cyruntime.cudaMemcpy3DParms*>p._pvt_ptr if p is not None else NULL
     with nogil:
@@ -27947,12 +27990,12 @@ def cudaMemcpy3DPeer(p : Optional[cudaMemcpy3DPeerParms]):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidDevice`, :py:obj:`~.cudaErrorInvalidPitchValue`
 
     See Also
     --------
-    :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpyPeer`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpyPeerAsync`, :py:obj:`~.cudaMemcpy3DPeerAsync`, :py:obj:`~.cuMemcpy3DPeer`
+    :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpyPeer`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpyPeerAsync`, :py:obj:`~.cudaMemcpy3DPeerAsync`, :func:`~.cuMemcpy3DPeer`
     """
     cdef cyruntime.cudaMemcpy3DPeerParms* cyp_ptr = <cyruntime.cudaMemcpy3DPeerParms*>p._pvt_ptr if p is not None else NULL
     with nogil:
@@ -27971,15 +28014,15 @@ def cudaMemcpy3DAsync(p : Optional[cudaMemcpy3DParms], stream):
     :py:obj:`~.cudaMemcpy3DAsync()` copies data betwen two 3D objects. The
     source and destination objects may be in either host memory, device
     memory, or a CUDA array. The source, destination, extent, and kind of
-    copy performed is specified by the :py:obj:`~.cudaMemcpy3DParms` struct
-    which should be initialized to zero before use:
+    copy performed is specified by the :py:obj:`~.cudaMemcpy3DParms`
+    struct which should be initialized to zero before use:
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
-    The struct passed to :py:obj:`~.cudaMemcpy3DAsync()` must specify one
-    of ``srcArray`` or ``srcPtr`` and one of ``dstArray`` or ``dstPtr``.
-    Passing more than one non-zero source or destination will cause
-    :py:obj:`~.cudaMemcpy3DAsync()` to return an error.
+    The struct passed to :py:obj:`~.cudaMemcpy3DAsync()` must specify
+    one of ``srcArray`` or ``srcPtr`` and one of ``dstArray`` or
+    ``dstPtr``. Passing more than one non-zero source or destination will
+    cause :py:obj:`~.cudaMemcpy3DAsync()` to return an error.
 
     The ``srcPos`` and ``dstPos`` fields are optional offsets into the
     source and destination objects and are defined in units of each
@@ -28002,11 +28045,12 @@ def cudaMemcpy3DAsync(p : Optional[cudaMemcpy3DParms], stream):
     :py:obj:`~.cudaMemcpyDefault` is only allowed on systems that support
     unified virtual addressing. For :py:obj:`~.cudaMemcpyHostToHost` or
     :py:obj:`~.cudaMemcpyHostToDevice` or
-    :py:obj:`~.cudaMemcpyDeviceToHost` passed as kind and cudaArray type
-    passed as source or destination, if the kind implies cudaArray type to
-    be present on the host, :py:obj:`~.cudaMemcpy3DAsync()` will disregard
-    that implication and silently correct the kind based on the fact that
-    cudaArray type can only be present on the device.
+    :py:obj:`~.cudaMemcpyDeviceToHost` passed as kind and
+    :py:obj:`~.cudaArray_t` type passed as source or destination, if the
+    kind implies :py:obj:`~.cudaArray_t` type to be present on the host,
+    :py:obj:`~.cudaMemcpy3DAsync()` will disregard that implication and
+    silently correct the kind based on the fact that
+    :py:obj:`~.cudaArray_t` type can only be present on the device.
 
     If the source and destination are both arrays,
     :py:obj:`~.cudaMemcpy3DAsync()` will return an error if they do not
@@ -28044,12 +28088,12 @@ def cudaMemcpy3DAsync(p : Optional[cudaMemcpy3DParms], stream):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidPitchValue`, :py:obj:`~.cudaErrorInvalidMemcpyDirection`
 
     See Also
     --------
-    :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaMemset3D`, :py:obj:`~.cudaMemcpy3D`, :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpy2DToArray`, ::::py:obj:`~.cudaMemcpy2DFromArray`, :py:obj:`~.cudaMemcpy2DArrayToArray`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy2DAsync`, :py:obj:`~.cudaMemcpy2DToArrayAsync`, :py:obj:`~.cudaMemcpy2DFromArrayAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`, :py:obj:`~.make_cudaExtent`, :py:obj:`~.make_cudaPos`, :py:obj:`~.cuMemcpy3DAsync`
+    :py:obj:`~.cudaMalloc3D`, :py:obj:`~.cudaMalloc3DArray`, :py:obj:`~.cudaMemset3D`, :py:obj:`~.cudaMemcpy3D`, :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpy2DFromArray`, :py:obj:`~.cudaMemcpy2DArrayToArray`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy2DAsync`, :py:obj:`~.cudaMemcpy2DToArrayAsync`, :py:obj:`~.cudaMemcpy2DFromArrayAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`, :func:`~.make_cudaExtent`, :func:`~.make_cudaPos`, :func:`~.cuMemcpy3DAsync`
     """
     cdef cyruntime.cudaStream_t cystream
     if stream is None:
@@ -28084,12 +28128,12 @@ def cudaMemcpy3DPeerAsync(p : Optional[cudaMemcpy3DPeerParms], stream):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidDevice`, :py:obj:`~.cudaErrorInvalidPitchValue`
 
     See Also
     --------
-    :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpyPeer`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpyPeerAsync`, :py:obj:`~.cudaMemcpy3DPeerAsync`, :py:obj:`~.cuMemcpy3DPeerAsync`
+    :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpyPeer`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpyPeerAsync`, :py:obj:`~.cudaMemcpy3DPeerAsync`, :func:`~.cuMemcpy3DPeerAsync`
     """
     cdef cyruntime.cudaStream_t cystream
     if stream is None:
@@ -28130,7 +28174,7 @@ def cudaMemGetInfo():
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorLaunchFailure`
     free : int
         Returned free memory in bytes
@@ -28139,7 +28183,7 @@ def cudaMemGetInfo():
 
     See Also
     --------
-    :py:obj:`~.cuMemGetInfo`
+    :func:`~.cuMemGetInfo`
     """
     cdef size_t free = 0
     cdef size_t total = 0
@@ -28154,7 +28198,7 @@ def cudaMemGetInfo():
 
 @cython.embedsignature(True)
 def cudaArrayGetInfo(array):
-    """ Gets info about the specified cudaArray.
+    """ Gets info about the specified :py:obj:`~.cudaArray_t`.
 
     Returns in ``*desc``, ``*extent`` and ``*flags`` respectively, the
     type, shape and flags of ``array``.
@@ -28164,11 +28208,11 @@ def cudaArrayGetInfo(array):
     Parameters
     ----------
     array : :py:obj:`~.cudaArray_t`
-        The :py:obj:`~.cudaArray` to get info for
+        The :py:obj:`~.cudaArray_t` to get info for
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
     desc : :py:obj:`~.cudaChannelFormatDesc`
         Returned array type
@@ -28179,7 +28223,7 @@ def cudaArrayGetInfo(array):
 
     See Also
     --------
-    :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuArray3DGetDescriptor`
+    :func:`~.cuArrayGetDescriptor`, :func:`~.cuArray3DGetDescriptor`
     """
     cdef cyruntime.cudaArray_t cyarray
     if array is None:
@@ -28231,14 +28275,14 @@ def cudaArrayGetPlane(hArray, unsigned int planeIdx):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue` :py:obj:`~.cudaErrorInvalidResourceHandle`
     pPlaneArray : :py:obj:`~.cudaArray_t`
         Returned CUDA array referenced by the ``planeIdx``
 
     See Also
     --------
-    :py:obj:`~.cuArrayGetPlane`
+    :func:`~.cuArrayGetPlane`
     """
     cdef cyruntime.cudaArray_t cyhArray
     if hArray is None:
@@ -28281,7 +28325,7 @@ def cudaArrayGetMemoryRequirements(array, int device):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess` :py:obj:`~.cudaErrorInvalidValue`
     memoryRequirements : :py:obj:`~.cudaArrayMemoryRequirements`
         Pointer to :py:obj:`~.cudaArrayMemoryRequirements`
@@ -28331,7 +28375,7 @@ def cudaMipmappedArrayGetMemoryRequirements(mipmap, int device):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess` :py:obj:`~.cudaErrorInvalidValue`
     memoryRequirements : :py:obj:`~.cudaArrayMemoryRequirements`
         Pointer to :py:obj:`~.cudaArrayMemoryRequirements`
@@ -28374,7 +28418,7 @@ def cudaArrayGetSparseProperties(array):
     in :py:obj:`~.cudaArraySparseProperties.miptailFirstLevel` is always
     zero. Note that the ``array`` must have been allocated using
     :py:obj:`~.cudaMallocArray` or :py:obj:`~.cudaMalloc3DArray`. For CUDA
-    arrays obtained using :py:obj:`~.cudaMipmappedArrayGetLevel`,
+    arrays obtained using ``cudaMipmappedArrayGetLevel``,
     :py:obj:`~.cudaErrorInvalidValue` will be returned. Instead,
     :py:obj:`~.cudaMipmappedArrayGetSparseProperties` must be used to
     obtain the sparse properties of the entire CUDA mipmapped array to
@@ -28387,14 +28431,14 @@ def cudaArrayGetSparseProperties(array):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess` :py:obj:`~.cudaErrorInvalidValue`
     sparseProperties : :py:obj:`~.cudaArraySparseProperties`
         Pointer to return the :py:obj:`~.cudaArraySparseProperties`
 
     See Also
     --------
-    :py:obj:`~.cudaMipmappedArrayGetSparseProperties`, :py:obj:`~.cuMemMapArrayAsync`
+    :py:obj:`~.cudaMipmappedArrayGetSparseProperties`, :func:`~.cuMemMapArrayAsync`
     """
     cdef cyruntime.cudaArray_t cyarray
     if array is None:
@@ -28443,14 +28487,14 @@ def cudaMipmappedArrayGetSparseProperties(mipmap):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess` :py:obj:`~.cudaErrorInvalidValue`
     sparseProperties : :py:obj:`~.cudaArraySparseProperties`
         Pointer to return :py:obj:`~.cudaArraySparseProperties`
 
     See Also
     --------
-    :py:obj:`~.cudaArrayGetSparseProperties`, :py:obj:`~.cuMemMapArrayAsync`
+    :py:obj:`~.cudaArrayGetSparseProperties`, :func:`~.cuMemMapArrayAsync`
     """
     cdef cyruntime.cudaMipmappedArray_t cymipmap
     if mipmap is None:
@@ -28502,12 +28546,12 @@ def cudaMemcpy(dst, src, size_t count, kind not None : cudaMemcpyKind):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidMemcpyDirection`
 
     See Also
     --------
-    :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpy2DFromArray`, :py:obj:`~.cudaMemcpy2DArrayToArray`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy2DAsync`, :py:obj:`~.cudaMemcpy2DToArrayAsync`, :py:obj:`~.cudaMemcpy2DFromArrayAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpy`
+    :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpy2DFromArray`, :py:obj:`~.cudaMemcpy2DArrayToArray`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy2DAsync`, :py:obj:`~.cudaMemcpy2DToArrayAsync`, :py:obj:`~.cudaMemcpy2DFromArrayAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`, :func:`~.cuMemcpyDtoH`, :func:`~.cuMemcpyHtoD`, :func:`~.cuMemcpyDtoD`, :func:`~.cuMemcpy`
     """
     cdef _HelperInputVoidPtrStruct cydstHelper
     cdef void* cydst = _helper_input_void_ptr(dst, &cydstHelper)
@@ -28553,12 +28597,12 @@ def cudaMemcpyPeer(dst, int dstDevice, src, int srcDevice, size_t count):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidDevice`
 
     See Also
     --------
-    :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpyPeerAsync`, :py:obj:`~.cudaMemcpy3DPeerAsync`, :py:obj:`~.cuMemcpyPeer`
+    :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpyPeerAsync`, :py:obj:`~.cudaMemcpy3DPeerAsync`, :func:`~.cuMemcpyPeer`
     """
     cdef _HelperInputVoidPtrStruct cydstHelper
     cdef void* cydst = _helper_input_void_ptr(dst, &cydstHelper)
@@ -28614,12 +28658,12 @@ def cudaMemcpy2D(dst, size_t dpitch, src, size_t spitch, size_t width, size_t he
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidPitchValue`, :py:obj:`~.cudaErrorInvalidMemcpyDirection`
 
     See Also
     --------
-    :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpy2DFromArray`, :py:obj:`~.cudaMemcpy2DArrayToArray`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy2DAsync`, :py:obj:`~.cudaMemcpy2DToArrayAsync`, :py:obj:`~.cudaMemcpy2DFromArrayAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DUnaligned`
+    :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpy2DFromArray`, :py:obj:`~.cudaMemcpy2DArrayToArray`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy2DAsync`, :py:obj:`~.cudaMemcpy2DToArrayAsync`, :py:obj:`~.cudaMemcpy2DFromArrayAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`, :func:`~.cuMemcpy2D`, :func:`~.cuMemcpy2DUnaligned`
     """
     cdef _HelperInputVoidPtrStruct cydstHelper
     cdef void* cydst = _helper_input_void_ptr(dst, &cydstHelper)
@@ -28677,12 +28721,12 @@ def cudaMemcpy2DToArray(dst, size_t wOffset, size_t hOffset, src, size_t spitch,
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidPitchValue`, :py:obj:`~.cudaErrorInvalidMemcpyDirection`
 
     See Also
     --------
-    :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpy2DFromArray`, :py:obj:`~.cudaMemcpy2DArrayToArray`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy2DAsync`, :py:obj:`~.cudaMemcpy2DToArrayAsync`, :py:obj:`~.cudaMemcpy2DFromArrayAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DUnaligned`
+    :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpy2DFromArray`, :py:obj:`~.cudaMemcpy2DArrayToArray`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy2DAsync`, :py:obj:`~.cudaMemcpy2DToArrayAsync`, :py:obj:`~.cudaMemcpy2DFromArrayAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`, :func:`~.cuMemcpy2D`, :func:`~.cuMemcpy2DUnaligned`
     """
     cdef cyruntime.cudaArray_t cydst
     if dst is None:
@@ -28745,12 +28789,12 @@ def cudaMemcpy2DFromArray(dst, size_t dpitch, src, size_t wOffset, size_t hOffse
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidPitchValue`, :py:obj:`~.cudaErrorInvalidMemcpyDirection`
 
     See Also
     --------
-    :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpy2DArrayToArray`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy2DAsync`, :py:obj:`~.cudaMemcpy2DToArrayAsync`, :py:obj:`~.cudaMemcpy2DFromArrayAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DUnaligned`
+    :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpy2DArrayToArray`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy2DAsync`, :py:obj:`~.cudaMemcpy2DToArrayAsync`, :py:obj:`~.cudaMemcpy2DFromArrayAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`, :func:`~.cuMemcpy2D`, :func:`~.cuMemcpy2DUnaligned`
     """
     cdef cyruntime.cudaArray_const_t cysrc
     if src is None:
@@ -28813,12 +28857,12 @@ def cudaMemcpy2DArrayToArray(dst, size_t wOffsetDst, size_t hOffsetDst, src, siz
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidMemcpyDirection`
 
     See Also
     --------
-    :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpy2DFromArray`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy2DAsync`, :py:obj:`~.cudaMemcpy2DToArrayAsync`, :py:obj:`~.cudaMemcpy2DFromArrayAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DUnaligned`
+    :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpy2DFromArray`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy2DAsync`, :py:obj:`~.cudaMemcpy2DToArrayAsync`, :py:obj:`~.cudaMemcpy2DFromArrayAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`, :func:`~.cuMemcpy2D`, :func:`~.cuMemcpy2DUnaligned`
     """
     cdef cyruntime.cudaArray_const_t cysrc
     if src is None:
@@ -28888,12 +28932,12 @@ def cudaMemcpyAsync(dst, src, size_t count, kind not None : cudaMemcpyKind, stre
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidMemcpyDirection`
 
     See Also
     --------
-    :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpy2DFromArray`, :py:obj:`~.cudaMemcpy2DArrayToArray`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaMemcpy2DAsync`, :py:obj:`~.cudaMemcpy2DToArrayAsync`, :py:obj:`~.cudaMemcpy2DFromArrayAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`, :py:obj:`~.cuMemcpyAsync`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemcpyDtoDAsync`
+    :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpy2DFromArray`, :py:obj:`~.cudaMemcpy2DArrayToArray`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaMemcpy2DAsync`, :py:obj:`~.cudaMemcpy2DToArrayAsync`, :py:obj:`~.cudaMemcpy2DFromArrayAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`, :func:`~.cuMemcpyAsync`, :func:`~.cuMemcpyDtoHAsync`, :func:`~.cuMemcpyHtoDAsync`, :func:`~.cuMemcpyDtoDAsync`
     """
     cdef cyruntime.cudaStream_t cystream
     if stream is None:
@@ -28947,12 +28991,12 @@ def cudaMemcpyPeerAsync(dst, int dstDevice, src, int srcDevice, size_t count, st
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidDevice`
 
     See Also
     --------
-    :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpyPeer`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy3DPeerAsync`, :py:obj:`~.cuMemcpyPeerAsync`
+    :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpyPeer`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy3DPeerAsync`, :func:`~.cuMemcpyPeerAsync`
     """
     cdef cyruntime.cudaStream_t cystream
     if stream is None:
@@ -29074,7 +29118,7 @@ def cudaMemcpyBatchAsync(dsts : Optional[tuple[Any] | list[Any]], srcs : Optiona
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess` :py:obj:`~.cudaErrorInvalidValue`
     """
     cdef cyruntime.cudaStream_t cystream
@@ -29222,7 +29266,7 @@ def cudaMemcpy3DBatchAsync(size_t numOps, opList : Optional[tuple[cudaMemcpy3DBa
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess` :py:obj:`~.cudaErrorInvalidValue`
     """
     cdef cyruntime.cudaStream_t cystream
@@ -29269,8 +29313,8 @@ def cudaMemcpyWithAttributesAsync(dst, src, size_t size, attr : Optional[cudaMem
     the operation in.
 
     For more information regarding the attributes, please refer to
-    :py:obj:`~.cudaMemcpyAttributes` and it's usage desciption
-    in::cudaMemcpyBatchAsync
+    :py:obj:`~.cudaMemcpyAttributes` and its usage description in
+    :func:`~.cudaMemcpyBatchAsync`
 
     Parameters
     ----------
@@ -29287,7 +29331,7 @@ def cudaMemcpyWithAttributesAsync(dst, src, size_t size, attr : Optional[cudaMem
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
 
     See Also
@@ -29327,8 +29371,8 @@ def cudaMemcpy3DWithAttributesAsync(op : Optional[cudaMemcpy3DBatchOp], unsigned
     the operation in.
 
     For more information regarding the operation, please refer to
-    :py:obj:`~.cudaMemcpy3DBatchOp` and it's usage desciption
-    in::cudaMemcpy3DBatchAsync
+    :py:obj:`~.cudaMemcpy3DBatchOp` and its usage description in
+    :func:`~.cudaMemcpy3DBatchAsync`
 
     Parameters
     ----------
@@ -29341,7 +29385,7 @@ def cudaMemcpy3DWithAttributesAsync(op : Optional[cudaMemcpy3DBatchOp], unsigned
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
 
     See Also
@@ -29419,12 +29463,12 @@ def cudaMemcpy2DAsync(dst, size_t dpitch, src, size_t spitch, size_t width, size
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidPitchValue`, :py:obj:`~.cudaErrorInvalidMemcpyDirection`
 
     See Also
     --------
-    :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpy2DFromArray`, :py:obj:`~.cudaMemcpy2DArrayToArray`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy2DToArrayAsync`, :py:obj:`~.cudaMemcpy2DFromArrayAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`, :py:obj:`~.cuMemcpy2DAsync`
+    :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpy2DFromArray`, :py:obj:`~.cudaMemcpy2DArrayToArray`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy2DToArrayAsync`, :py:obj:`~.cudaMemcpy2DFromArrayAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`, :func:`~.cuMemcpy2DAsync`
     """
     cdef cyruntime.cudaStream_t cystream
     if stream is None:
@@ -29478,7 +29522,7 @@ def cudaMemcpy2DToArrayAsync(dst, size_t wOffset, size_t hOffset, src, size_t sp
 
     :py:obj:`~.cudaMemcpy2DFromArrayAsync`,
     :py:obj:`~.cudaMemcpyToSymbolAsync`,
-    :py:obj:`~.cudaMemcpyFromSymbolAsync`, :py:obj:`~.cuMemcpy2DAsync`
+    :py:obj:`~.cudaMemcpyFromSymbolAsync`, :func:`~.cuMemcpy2DAsync`
 
     Parameters
     ----------
@@ -29503,7 +29547,7 @@ def cudaMemcpy2DToArrayAsync(dst, size_t wOffset, size_t hOffset, src, size_t sp
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidPitchValue`, :py:obj:`~.cudaErrorInvalidMemcpyDirection`
 
     See Also
@@ -29566,7 +29610,7 @@ def cudaMemcpy2DFromArrayAsync(dst, size_t dpitch, src, size_t wOffset, size_t h
     copy may overlap with operations in other streams.
 
     :py:obj:`~.cudaMemcpyToSymbolAsync`,
-    :py:obj:`~.cudaMemcpyFromSymbolAsync`, :py:obj:`~.cuMemcpy2DAsync`
+    :py:obj:`~.cudaMemcpyFromSymbolAsync`, :func:`~.cuMemcpy2DAsync`
 
     Parameters
     ----------
@@ -29591,7 +29635,7 @@ def cudaMemcpy2DFromArrayAsync(dst, size_t dpitch, src, size_t wOffset, size_t h
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidPitchValue`, :py:obj:`~.cudaErrorInvalidMemcpyDirection`
 
     See Also
@@ -29646,12 +29690,12 @@ def cudaMemset(devPtr, int value, size_t count):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`,
 
     See Also
     --------
-    :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`
+    :func:`~.cuMemsetD8`, :func:`~.cuMemsetD16`, :func:`~.cuMemsetD32`
     """
     cdef _HelperInputVoidPtrStruct cydevPtrHelper
     cdef void* cydevPtr = _helper_input_void_ptr(devPtr, &cydevPtrHelper)
@@ -29692,12 +29736,12 @@ def cudaMemset2D(devPtr, size_t pitch, int value, size_t width, size_t height):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`,
 
     See Also
     --------
-    :py:obj:`~.cudaMemset`, :py:obj:`~.cudaMemset3D`, :py:obj:`~.cudaMemsetAsync`, :py:obj:`~.cudaMemset2DAsync`, :py:obj:`~.cudaMemset3DAsync`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`
+    :py:obj:`~.cudaMemset`, :py:obj:`~.cudaMemset3D`, :py:obj:`~.cudaMemsetAsync`, :py:obj:`~.cudaMemset2DAsync`, :py:obj:`~.cudaMemset3DAsync`, :func:`~.cuMemsetD2D8`, :func:`~.cuMemsetD2D16`, :func:`~.cuMemsetD2D32`
     """
     cdef _HelperInputVoidPtrStruct cydevPtrHelper
     cdef void* cydevPtr = _helper_input_void_ptr(devPtr, &cydevPtrHelper)
@@ -29750,12 +29794,12 @@ def cudaMemset3D(pitchedDevPtr not None : cudaPitchedPtr, int value, extent not
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`,
 
     See Also
     --------
-    :py:obj:`~.cudaMemset`, :py:obj:`~.cudaMemset2D`, :py:obj:`~.cudaMemsetAsync`, :py:obj:`~.cudaMemset2DAsync`, :py:obj:`~.cudaMemset3DAsync`, :py:obj:`~.cudaMalloc3D`, :py:obj:`~.make_cudaPitchedPtr`, :py:obj:`~.make_cudaExtent`
+    :py:obj:`~.cudaMemset`, :py:obj:`~.cudaMemset2D`, :py:obj:`~.cudaMemsetAsync`, :py:obj:`~.cudaMemset2DAsync`, :py:obj:`~.cudaMemset3DAsync`, :py:obj:`~.cudaMalloc3D`, :func:`~.make_cudaPitchedPtr`, :func:`~.make_cudaExtent`
     """
     with nogil:
         err = cyruntime.cudaMemset3D(pitchedDevPtr._pvt_ptr[0], value, extent._pvt_ptr[0])
@@ -29793,12 +29837,12 @@ def cudaMemsetAsync(devPtr, int value, size_t count, stream):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`,
 
     See Also
     --------
-    :py:obj:`~.cudaMemset`, :py:obj:`~.cudaMemset2D`, :py:obj:`~.cudaMemset3D`, :py:obj:`~.cudaMemset2DAsync`, :py:obj:`~.cudaMemset3DAsync`, :py:obj:`~.cuMemsetD8Async`, :py:obj:`~.cuMemsetD16Async`, :py:obj:`~.cuMemsetD32Async`
+    :py:obj:`~.cudaMemset`, :py:obj:`~.cudaMemset2D`, :py:obj:`~.cudaMemset3D`, :py:obj:`~.cudaMemset2DAsync`, :py:obj:`~.cudaMemset3DAsync`, :func:`~.cuMemsetD8Async`, :func:`~.cuMemsetD16Async`, :func:`~.cuMemsetD32Async`
     """
     cdef cyruntime.cudaStream_t cystream
     if stream is None:
@@ -29855,12 +29899,12 @@ def cudaMemset2DAsync(devPtr, size_t pitch, int value, size_t width, size_t heig
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`,
 
     See Also
     --------
-    :py:obj:`~.cudaMemset`, :py:obj:`~.cudaMemset2D`, :py:obj:`~.cudaMemset3D`, :py:obj:`~.cudaMemsetAsync`, :py:obj:`~.cudaMemset3DAsync`, :py:obj:`~.cuMemsetD2D8Async`, :py:obj:`~.cuMemsetD2D16Async`, :py:obj:`~.cuMemsetD2D32Async`
+    :py:obj:`~.cudaMemset`, :py:obj:`~.cudaMemset2D`, :py:obj:`~.cudaMemset3D`, :py:obj:`~.cudaMemsetAsync`, :py:obj:`~.cudaMemset3DAsync`, :func:`~.cuMemsetD2D8Async`, :func:`~.cuMemsetD2D16Async`, :func:`~.cuMemsetD2D32Async`
     """
     cdef cyruntime.cudaStream_t cystream
     if stream is None:
@@ -29929,12 +29973,12 @@ def cudaMemset3DAsync(pitchedDevPtr not None : cudaPitchedPtr, int value, extent
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`,
 
     See Also
     --------
-    :py:obj:`~.cudaMemset`, :py:obj:`~.cudaMemset2D`, :py:obj:`~.cudaMemset3D`, :py:obj:`~.cudaMemsetAsync`, :py:obj:`~.cudaMemset2DAsync`, :py:obj:`~.cudaMalloc3D`, :py:obj:`~.make_cudaPitchedPtr`, :py:obj:`~.make_cudaExtent`
+    :py:obj:`~.cudaMemset`, :py:obj:`~.cudaMemset2D`, :py:obj:`~.cudaMemset3D`, :py:obj:`~.cudaMemsetAsync`, :py:obj:`~.cudaMemset2DAsync`, :py:obj:`~.cudaMalloc3D`, :func:`~.make_cudaPitchedPtr`, :func:`~.make_cudaExtent`
     """
     cdef cyruntime.cudaStream_t cystream
     if stream is None:
@@ -29963,19 +30007,18 @@ def cudaMemPrefetchAsync(devPtr, size_t count, location not None : cudaMemLocati
     :py:obj:`~.cudaMallocManaged` or declared via managed variables, or it
     may also refer to memory allocated from a managed memory pool, or it
     may also refer to system-allocated memory on systems with non-zero
-    cudaDevAttrPageableMemoryAccess.
+    :py:obj:`~.cudaDevAttrPageableMemoryAccess`.
 
     Specifying :py:obj:`~.cudaMemLocationTypeDevice` for
     :py:obj:`~.cudaMemLocation.type` will prefetch memory to GPU specified
     by device ordinal :py:obj:`~.cudaMemLocation.id` which must have non-
-    zero value for the device attribute
-    :py:obj:`~.concurrentManagedAccess`. Additionally, ``stream`` must be
-    associated with a device that has a non-zero value for the device
-    attribute :py:obj:`~.concurrentManagedAccess`. Specifying
-    :py:obj:`~.cudaMemLocationTypeHost` as :py:obj:`~.cudaMemLocation.type`
-    will prefetch data to host memory. Applications can request prefetching
-    memory to a specific host NUMA node by specifying
-    :py:obj:`~.cudaMemLocationTypeHostNuma` for
+    zero value for the device attribute ``concurrentManagedAccess``.
+    Additionally, ``stream`` must be associated with a device that has a
+    non-zero value for the device attribute ``concurrentManagedAccess``.
+    Specifying :py:obj:`~.cudaMemLocationTypeHost` as
+    :py:obj:`~.cudaMemLocation.type` will prefetch data to host memory.
+    Applications can request prefetching memory to a specific host NUMA
+    node by specifying :py:obj:`~.cudaMemLocationTypeHostNuma` for
     :py:obj:`~.cudaMemLocation.type` and a valid host NUMA node id in
     :py:obj:`~.cudaMemLocation.id` Users can also request prefetching
     memory to the host NUMA node closest to the current thread's CPU by
@@ -30001,7 +30044,7 @@ def cudaMemPrefetchAsync(devPtr, size_t count, location not None : cudaMemLocati
     By default, any mappings to the previous location of the migrated pages
     are removed and mappings for the new location are only setup on the
     destination location. The exact behavior however also depends on the
-    settings applied to this memory range via :py:obj:`~.cuMemAdvise` as
+    settings applied to this memory range via :func:`~.cuMemAdvise` as
     described below:
 
     If :py:obj:`~.cudaMemAdviseSetReadMostly` was set on any subset of this
@@ -30045,12 +30088,12 @@ def cudaMemPrefetchAsync(devPtr, size_t count, location not None : cudaMemLocati
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidDevice`
 
     See Also
     --------
-    :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpyPeer`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy3DPeerAsync`, :py:obj:`~.cudaMemAdvise`, :py:obj:`~.cuMemPrefetchAsync`
+    :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpyPeer`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy3DPeerAsync`, :py:obj:`~.cudaMemAdvise`, :func:`~.cuMemPrefetchAsync`
     """
     cdef cyruntime.cudaStream_t cystream
     if stream is None:
@@ -30134,7 +30177,7 @@ def cudaMemPrefetchBatchAsync(dptrs : Optional[tuple[Any] | list[Any]], sizes :
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
     """
     cdef cyruntime.cudaStream_t cystream
     if stream is None:
@@ -30226,7 +30269,7 @@ def cudaMemDiscardBatchAsync(dptrs : Optional[tuple[Any] | list[Any]], sizes : t
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
     """
     cdef cyruntime.cudaStream_t cystream
     if stream is None:
@@ -30323,7 +30366,7 @@ def cudaMemDiscardAndPrefetchBatchAsync(dptrs : Optional[tuple[Any] | list[Any]]
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
     """
     cdef cyruntime.cudaStream_t cystream
     if stream is None:
@@ -30417,7 +30460,7 @@ def cudaMemAdvise(devPtr, size_t count, advice not None : cudaMemoryAdvise, loca
       setting this advice will not create a read-only copy when that device
       accesses this memory region.
 
-    - :py:obj:`~.cudaMemAdviceUnsetReadMostly`: Undoes the effect of
+    - ``cudaMemAdviceUnsetReadMostly``: Undoes the effect of
       :py:obj:`~.cudaMemAdviseSetReadMostly` and also prevents the Unified
       Memory driver from attempting heuristic read-duplication on the
       memory range. Any read-duplicated copies of the data will be
@@ -30548,12 +30591,12 @@ def cudaMemAdvise(devPtr, size_t count, advice not None : cudaMemoryAdvise, loca
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidDevice`
 
     See Also
     --------
-    :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpyPeer`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy3DPeerAsync`, :py:obj:`~.cudaMemPrefetchAsync`, :py:obj:`~.cuMemAdvise`
+    :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpyPeer`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy3DPeerAsync`, :py:obj:`~.cudaMemPrefetchAsync`, :func:`~.cuMemAdvise`
     """
     cdef _HelperInputVoidPtrStruct cydevPtrHelper
     cdef void* cydevPtr = _helper_input_void_ptr(devPtr, &cydevPtrHelper)
@@ -30586,51 +30629,52 @@ def cudaMemRangeGetAttribute(size_t dataSize, attribute not None : cudaMemRangeA
       is specified, ``data`` will be interpreted as a 32-bit integer, and
       ``dataSize`` must be 4. The result returned will be a GPU device id
       if all pages in the memory range have that GPU as their preferred
-      location, or it will be cudaCpuDeviceId if all pages in the memory
-      range have the CPU as their preferred location, or it will be
-      cudaInvalidDeviceId if either all the pages don't have the same
-      preferred location or some of the pages don't have a preferred
-      location at all. Note that the actual location of the pages in the
-      memory range at the time of the query may be different from the
-      preferred location.
+      location, or it will be :py:obj:`~.cudaCpuDeviceId` if all pages in
+      the memory range have the CPU as their preferred location, or it will
+      be :py:obj:`~.cudaInvalidDeviceId` if either all the pages don't have
+      the same preferred location or some of the pages don't have a
+      preferred location at all. Note that the actual location of the pages
+      in the memory range at the time of the query may be different from
+      the preferred location.
 
     - :py:obj:`~.cudaMemRangeAttributeAccessedBy`: If this attribute is
       specified, ``data`` will be interpreted as an array of 32-bit
       integers, and ``dataSize`` must be a non-zero multiple of 4. The
       result returned will be a list of device ids that had
-      :py:obj:`~.cudaMemAdviceSetAccessedBy` set for that entire memory
-      range. If any device does not have that advice set for the entire
-      memory range, that device will not be included. If ``data`` is larger
-      than the number of devices that have that advice set for that memory
-      range, cudaInvalidDeviceId will be returned in all the extra space
-      provided. For ex., if ``dataSize`` is 12 (i.e. ``data`` has 3
+      ``cudaMemAdviceSetAccessedBy`` set for that entire memory range. If
+      any device does not have that advice set for the entire memory range,
+      that device will not be included. If ``data`` is larger than the
+      number of devices that have that advice set for that memory range,
+      :py:obj:`~.cudaInvalidDeviceId` will be returned in all the extra
+      space provided. For ex., if ``dataSize`` is 12 (i.e. ``data`` has 3
       elements) and only device 0 has the advice set, then the result
-      returned will be { 0, cudaInvalidDeviceId, cudaInvalidDeviceId }. If
-      ``data`` is smaller than the number of devices that have that advice
-      set, then only as many devices will be returned as can fit in the
-      array. There is no guarantee on which specific devices will be
-      returned, however.
+      returned will be { 0, :py:obj:`~.cudaInvalidDeviceId`,
+      :py:obj:`~.cudaInvalidDeviceId` }. If ``data`` is smaller than the
+      number of devices that have that advice set, then only as many
+      devices will be returned as can fit in the array. There is no
+      guarantee on which specific devices will be returned, however.
 
     - :py:obj:`~.cudaMemRangeAttributeLastPrefetchLocation`: If this
       attribute is specified, ``data`` will be interpreted as a 32-bit
       integer, and ``dataSize`` must be 4. The result returned will be the
       last location to which all pages in the memory range were prefetched
       explicitly via :py:obj:`~.cudaMemPrefetchAsync`. This will either be
-      a GPU id or cudaCpuDeviceId depending on whether the last location
-      for prefetch was a GPU or the CPU respectively. If any page in the
-      memory range was never explicitly prefetched or if all pages were not
-      prefetched to the same location, cudaInvalidDeviceId will be
-      returned. Note that this simply returns the last location that the
-      applicaton requested to prefetch the memory range to. It gives no
-      indication as to whether the prefetch operation to that location has
-      completed or even begun.
+      a GPU id or :py:obj:`~.cudaCpuDeviceId` depending on whether the last
+      location for prefetch was a GPU or the CPU respectively. If any page
+      in the memory range was never explicitly prefetched or if all pages
+      were not prefetched to the same location,
+      :py:obj:`~.cudaInvalidDeviceId` will be returned. Note that this
+      simply returns the last location that the application requested to
+      prefetch the memory range to. It gives no indication as to whether
+      the prefetch operation to that location has completed or even begun.
 
     - :py:obj:`~.cudaMemRangeAttributePreferredLocationType`: If this
       attribute is specified, ``data`` will be interpreted as a
       :py:obj:`~.cudaMemLocationType`, and ``dataSize`` must be
-      sizeof(cudaMemLocationType). The :py:obj:`~.cudaMemLocationType`
-      returned will be :py:obj:`~.cudaMemLocationTypeDevice` if all pages
-      in the memory range have the same GPU as their preferred location, or
+      sizeof(:py:obj:`~.cudaMemLocationType`). The
+      :py:obj:`~.cudaMemLocationType` returned will be
+      :py:obj:`~.cudaMemLocationTypeDevice` if all pages in the memory
+      range have the same GPU as their preferred location, or
       :py:obj:`~.cudaMemLocationType` will be
       :py:obj:`~.cudaMemLocationTypeHost` if all pages in the memory range
       have the CPU as their preferred location, or or it will be
@@ -30655,9 +30699,9 @@ def cudaMemRangeGetAttribute(size_t dataSize, attribute not None : cudaMemRangeA
     - :py:obj:`~.cudaMemRangeAttributeLastPrefetchLocationType`: If this
       attribute is specified, ``data`` will be interpreted as a
       :py:obj:`~.cudaMemLocationType`, and ``dataSize`` must be
-      sizeof(cudaMemLocationType). The result returned will be the last
-      location type to which all pages in the memory range were prefetched
-      explicitly via :py:obj:`~.cuMemPrefetchAsync`. The
+      sizeof(:py:obj:`~.cudaMemLocationType`). The result returned will be
+      the last location type to which all pages in the memory range were
+      prefetched explicitly via :func:`~.cuMemPrefetchAsync`. The
       :py:obj:`~.cudaMemLocationType` returned will be
       :py:obj:`~.cudaMemLocationTypeDevice` if the last prefetch location
       was the GPU or :py:obj:`~.cudaMemLocationTypeHost` if it was the CPU
@@ -30693,7 +30737,7 @@ def cudaMemRangeGetAttribute(size_t dataSize, attribute not None : cudaMemRangeA
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
     data : Any
         A pointers to a memory location where the result of each attribute
@@ -30701,7 +30745,7 @@ def cudaMemRangeGetAttribute(size_t dataSize, attribute not None : cudaMemRangeA
 
     See Also
     --------
-    :py:obj:`~.cudaMemRangeGetAttributes`, :py:obj:`~.cudaMemPrefetchAsync`, :py:obj:`~.cudaMemAdvise`, :py:obj:`~.cuMemRangeGetAttribute`
+    :py:obj:`~.cudaMemRangeGetAttributes`, :py:obj:`~.cudaMemPrefetchAsync`, :py:obj:`~.cudaMemAdvise`, :func:`~.cuMemRangeGetAttribute`
     """
     cdef _HelperCUmem_range_attribute cydata = _HelperCUmem_range_attribute(attribute, dataSize)
     cdef void* cydata_ptr = <void*><void_ptr>cydata.cptr
@@ -30742,13 +30786,13 @@ def cudaMemRangeGetAttributes(dataSizes : tuple[int] | list[int], attributes : O
 
     - :py:obj:`~.cudaMemRangeAttributeLastPrefetchLocation`
 
-    - :: cudaMemRangeAttributePreferredLocationType
+    - :py:obj:`~.cudaMemRangeAttributePreferredLocationType`
 
-    - :: cudaMemRangeAttributePreferredLocationId
+    - :py:obj:`~.cudaMemRangeAttributePreferredLocationId`
 
-    - :: cudaMemRangeAttributeLastPrefetchLocationType
+    - :py:obj:`~.cudaMemRangeAttributeLastPrefetchLocationType`
 
-    - :: cudaMemRangeAttributeLastPrefetchLocationId
+    - :py:obj:`~.cudaMemRangeAttributeLastPrefetchLocationId`
 
     Parameters
     ----------
@@ -30766,7 +30810,7 @@ def cudaMemRangeGetAttributes(dataSizes : tuple[int] | list[int], attributes : O
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
     data : list[Any]
         A two-dimensional array containing pointers to memory locations
@@ -30774,7 +30818,7 @@ def cudaMemRangeGetAttributes(dataSizes : tuple[int] | list[int], attributes : O
 
     See Also
     --------
-    :py:obj:`~.cudaMemRangeGetAttribute`, :py:obj:`~.cudaMemAdvise`, :py:obj:`~.cudaMemPrefetchAsync`, :py:obj:`~.cuMemRangeGetAttributes`
+    :py:obj:`~.cudaMemRangeGetAttribute`, :py:obj:`~.cudaMemAdvise`, :py:obj:`~.cudaMemPrefetchAsync`, :func:`~.cuMemRangeGetAttributes`
     """
     attributes = [] if attributes is None else attributes
     if not all(isinstance(_x, (cudaMemRangeAttribute)) for _x in attributes):
@@ -30835,12 +30879,12 @@ def cudaMemcpyToArray(dst, size_t wOffset, size_t hOffset, src, size_t count, ki
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidMemcpyDirection`
 
     See Also
     --------
-    :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpyFromArray`, :py:obj:`~.cudaMemcpy2DFromArray`, :py:obj:`~.cudaMemcpyArrayToArray`, :py:obj:`~.cudaMemcpy2DArrayToArray`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy2DAsync`, :py:obj:`~.cudaMemcpyToArrayAsync`, :py:obj:`~.cudaMemcpy2DToArrayAsync`, :py:obj:`~.cudaMemcpyFromArrayAsync`, :py:obj:`~.cudaMemcpy2DFromArrayAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyDtoA`
+    :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpyFromArray`, :py:obj:`~.cudaMemcpy2DFromArray`, :py:obj:`~.cudaMemcpyArrayToArray`, :py:obj:`~.cudaMemcpy2DArrayToArray`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy2DAsync`, :py:obj:`~.cudaMemcpyToArrayAsync`, :py:obj:`~.cudaMemcpy2DToArrayAsync`, :py:obj:`~.cudaMemcpyFromArrayAsync`, :py:obj:`~.cudaMemcpy2DFromArrayAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`, :func:`~.cuMemcpyHtoA`, :func:`~.cuMemcpyDtoA`
     """
     cdef cyruntime.cudaArray_t cydst
     if dst is None:
@@ -30896,12 +30940,12 @@ def cudaMemcpyFromArray(dst, src, size_t wOffset, size_t hOffset, size_t count,
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidMemcpyDirection`
 
     See Also
     --------
-    :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpyToArray`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpy2DFromArray`, :py:obj:`~.cudaMemcpyArrayToArray`, :py:obj:`~.cudaMemcpy2DArrayToArray`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy2DAsync`, :py:obj:`~.cudaMemcpyToArrayAsync`, :py:obj:`~.cudaMemcpy2DToArrayAsync`, :py:obj:`~.cudaMemcpyFromArrayAsync`, :py:obj:`~.cudaMemcpy2DFromArrayAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoD`
+    :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpyToArray`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpy2DFromArray`, :py:obj:`~.cudaMemcpyArrayToArray`, :py:obj:`~.cudaMemcpy2DArrayToArray`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy2DAsync`, :py:obj:`~.cudaMemcpyToArrayAsync`, :py:obj:`~.cudaMemcpy2DToArrayAsync`, :py:obj:`~.cudaMemcpyFromArrayAsync`, :py:obj:`~.cudaMemcpy2DFromArrayAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`, :func:`~.cuMemcpyAtoH`, :func:`~.cuMemcpyAtoD`
     """
     cdef cyruntime.cudaArray_const_t cysrc
     if src is None:
@@ -30962,12 +31006,12 @@ def cudaMemcpyArrayToArray(dst, size_t wOffsetDst, size_t hOffsetDst, src, size_
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidMemcpyDirection`
 
     See Also
     --------
-    :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpyToArray`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpyFromArray`, :py:obj:`~.cudaMemcpy2DFromArray`, :py:obj:`~.cudaMemcpy2DArrayToArray`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy2DAsync`, :py:obj:`~.cudaMemcpyToArrayAsync`, :py:obj:`~.cudaMemcpy2DToArrayAsync`, :py:obj:`~.cudaMemcpyFromArrayAsync`, :py:obj:`~.cudaMemcpy2DFromArrayAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`, :py:obj:`~.cuMemcpyAtoA`
+    :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpyToArray`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpyFromArray`, :py:obj:`~.cudaMemcpy2DFromArray`, :py:obj:`~.cudaMemcpy2DArrayToArray`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy2DAsync`, :py:obj:`~.cudaMemcpyToArrayAsync`, :py:obj:`~.cudaMemcpy2DToArrayAsync`, :py:obj:`~.cudaMemcpyFromArrayAsync`, :py:obj:`~.cudaMemcpy2DFromArrayAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`, :func:`~.cuMemcpyAtoA`
     """
     cdef cyruntime.cudaArray_const_t cysrc
     if src is None:
@@ -31037,12 +31081,12 @@ def cudaMemcpyToArrayAsync(dst, size_t wOffset, size_t hOffset, src, size_t coun
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidMemcpyDirection`
 
     See Also
     --------
-    :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpyToArray`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpyFromArray`, :py:obj:`~.cudaMemcpy2DFromArray`, :py:obj:`~.cudaMemcpyArrayToArray`, :py:obj:`~.cudaMemcpy2DArrayToArray`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy2DAsync`, :py:obj:`~.cudaMemcpy2DToArrayAsync`, :py:obj:`~.cudaMemcpyFromArrayAsync`, :py:obj:`~.cudaMemcpy2DFromArrayAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpy2DAsync`
+    :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpyToArray`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpyFromArray`, :py:obj:`~.cudaMemcpy2DFromArray`, :py:obj:`~.cudaMemcpyArrayToArray`, :py:obj:`~.cudaMemcpy2DArrayToArray`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy2DAsync`, :py:obj:`~.cudaMemcpy2DToArrayAsync`, :py:obj:`~.cudaMemcpyFromArrayAsync`, :py:obj:`~.cudaMemcpy2DFromArrayAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`, :func:`~.cuMemcpyHtoAAsync`, :func:`~.cuMemcpy2DAsync`
     """
     cdef cyruntime.cudaStream_t cystream
     if stream is None:
@@ -31115,12 +31159,12 @@ def cudaMemcpyFromArrayAsync(dst, src, size_t wOffset, size_t hOffset, size_t co
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidMemcpyDirection`
 
     See Also
     --------
-    :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpyToArray`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpyFromArray`, :py:obj:`~.cudaMemcpy2DFromArray`, :py:obj:`~.cudaMemcpyArrayToArray`, :py:obj:`~.cudaMemcpy2DArrayToArray`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy2DAsync`, :py:obj:`~.cudaMemcpyToArrayAsync`, :py:obj:`~.cudaMemcpy2DToArrayAsync`, :py:obj:`~.cudaMemcpy2DFromArrayAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpy2DAsync`
+    :py:obj:`~.cudaMemcpy`, :py:obj:`~.cudaMemcpy2D`, :py:obj:`~.cudaMemcpyToArray`, :py:obj:`~.cudaMemcpy2DToArray`, :py:obj:`~.cudaMemcpyFromArray`, :py:obj:`~.cudaMemcpy2DFromArray`, :py:obj:`~.cudaMemcpyArrayToArray`, :py:obj:`~.cudaMemcpy2DArrayToArray`, :py:obj:`~.cudaMemcpyToSymbol`, :py:obj:`~.cudaMemcpyFromSymbol`, :py:obj:`~.cudaMemcpyAsync`, :py:obj:`~.cudaMemcpy2DAsync`, :py:obj:`~.cudaMemcpyToArrayAsync`, :py:obj:`~.cudaMemcpy2DToArrayAsync`, :py:obj:`~.cudaMemcpy2DFromArrayAsync`, :py:obj:`~.cudaMemcpyToSymbolAsync`, :py:obj:`~.cudaMemcpyFromSymbolAsync`, :func:`~.cuMemcpyAtoHAsync`, :func:`~.cuMemcpy2DAsync`
     """
     cdef cyruntime.cudaStream_t cystream
     if stream is None:
@@ -31169,14 +31213,14 @@ def cudaMallocAsync(size_t size, hStream):
 
     Returns
     -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorNotSupported`, :py:obj:`~.cudaErrorOutOfMemory`,
+    :py:obj:`~.cudaError_t`
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorNotSupported`, ``cudaErrorOutOfMemory``
     devPtr : Any
         Returned device pointer
 
     See Also
     --------
-    :py:obj:`~.cuMemAllocAsync`, cudaMallocAsync (C++ API), :py:obj:`~.cudaMallocFromPoolAsync`, :py:obj:`~.cudaFreeAsync`, :py:obj:`~.cudaDeviceSetMemPool`, :py:obj:`~.cudaDeviceGetDefaultMemPool`, :py:obj:`~.cudaDeviceGetMemPool`, :py:obj:`~.cudaMemPoolSetAccess`, :py:obj:`~.cudaMemPoolSetAttribute`, :py:obj:`~.cudaMemPoolGetAttribute`
+    :func:`~.cuMemAllocAsync`, :func:`~.cudaMallocAsync` (C++ API), :py:obj:`~.cudaMallocFromPoolAsync`, :py:obj:`~.cudaFreeAsync`, :py:obj:`~.cudaDeviceSetMemPool`, :py:obj:`~.cudaDeviceGetDefaultMemPool`, :py:obj:`~.cudaDeviceGetMemPool`, :py:obj:`~.cudaMemPoolSetAccess`, :py:obj:`~.cudaMemPoolSetAttribute`, :py:obj:`~.cudaMemPoolGetAttribute`
 
     Notes
     -----
@@ -31222,12 +31266,12 @@ def cudaFreeAsync(devPtr, hStream):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorNotSupported`
 
     See Also
     --------
-    :py:obj:`~.cuMemFreeAsync`, :py:obj:`~.cudaMallocAsync`
+    :func:`~.cuMemFreeAsync`, :py:obj:`~.cudaMallocAsync`
 
     Notes
     -----
@@ -31273,12 +31317,12 @@ def cudaMemPoolTrimTo(memPool, size_t minBytesToKeep):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
 
     See Also
     --------
-    :py:obj:`~.cuMemPoolTrimTo`, :py:obj:`~.cudaMallocAsync`, :py:obj:`~.cudaFreeAsync`, :py:obj:`~.cudaDeviceGetDefaultMemPool`, :py:obj:`~.cudaDeviceGetMemPool`, :py:obj:`~.cudaMemPoolCreate`
+    :func:`~.cuMemPoolTrimTo`, :py:obj:`~.cudaMallocAsync`, :py:obj:`~.cudaFreeAsync`, :py:obj:`~.cudaDeviceGetDefaultMemPool`, :py:obj:`~.cudaDeviceGetMemPool`, :py:obj:`~.cudaMemPoolCreate`
 
     Notes
     -----
@@ -31308,11 +31352,11 @@ def cudaMemPoolSetAttribute(memPool, attr not None : cudaMemPoolAttr, value):
     Supported attributes are:
 
     - :py:obj:`~.cudaMemPoolAttrReleaseThreshold`: (value type =
-      cuuint64_t) Amount of reserved memory in bytes to hold onto before
-      trying to release memory back to the OS. When more than the release
-      threshold bytes of memory are held by the memory pool, the allocator
-      will try to release memory back to the OS on the next call to stream,
-      event or context synchronize. (default 0)
+      :py:obj:`~.cuuint64_t`) Amount of reserved memory in bytes to hold
+      onto before trying to release memory back to the OS. When more than
+      the release threshold bytes of memory are held by the memory pool,
+      the allocator will try to release memory back to the OS on the next
+      call to stream, event or context synchronize. (default 0)
 
     - :py:obj:`~.cudaMemPoolReuseFollowEventDependencies`: (value type =
       int) Allow :py:obj:`~.cudaMallocAsync` to use memory asynchronously
@@ -31331,15 +31375,15 @@ def cudaMemPoolSetAttribute(memPool, attr not None : cudaMemPoolAttr, value):
       reuse a piece of memory released by :py:obj:`~.cudaFreeAsync`
       (default enabled).
 
-    - :py:obj:`~.cudaMemPoolAttrReservedMemHigh`: (value type = cuuint64_t)
-      Reset the high watermark that tracks the amount of backing memory
-      that was allocated for the memory pool. It is illegal to set this
-      attribute to a non-zero value.
+    - :py:obj:`~.cudaMemPoolAttrReservedMemHigh`: (value type =
+      :py:obj:`~.cuuint64_t`) Reset the high watermark that tracks the
+      amount of backing memory that was allocated for the memory pool. It
+      is illegal to set this attribute to a non-zero value.
 
-    - :py:obj:`~.cudaMemPoolAttrUsedMemHigh`: (value type = cuuint64_t)
-      Reset the high watermark that tracks the amount of used memory that
-      was allocated for the memory pool. It is illegal to set this
-      attribute to a non-zero value.
+    - :py:obj:`~.cudaMemPoolAttrUsedMemHigh`: (value type =
+      :py:obj:`~.cuuint64_t`) Reset the high watermark that tracks the
+      amount of used memory that was allocated for the memory pool. It is
+      illegal to set this attribute to a non-zero value.
 
     Parameters
     ----------
@@ -31352,12 +31396,12 @@ def cudaMemPoolSetAttribute(memPool, attr not None : cudaMemPoolAttr, value):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
 
     See Also
     --------
-    :py:obj:`~.cuMemPoolSetAttribute`, :py:obj:`~.cudaMallocAsync`, :py:obj:`~.cudaFreeAsync`, :py:obj:`~.cudaDeviceGetDefaultMemPool`, :py:obj:`~.cudaDeviceGetMemPool`, :py:obj:`~.cudaMemPoolCreate`
+    :func:`~.cuMemPoolSetAttribute`, :py:obj:`~.cudaMallocAsync`, :py:obj:`~.cudaFreeAsync`, :py:obj:`~.cudaDeviceGetDefaultMemPool`, :py:obj:`~.cudaDeviceGetMemPool`, :py:obj:`~.cudaMemPoolCreate`
     """
     cdef cyruntime.cudaMemPool_t cymemPool
     if memPool is None:
@@ -31384,11 +31428,11 @@ def cudaMemPoolGetAttribute(memPool, attr not None : cudaMemPoolAttr):
     Supported attributes are:
 
     - :py:obj:`~.cudaMemPoolAttrReleaseThreshold`: (value type =
-      cuuint64_t) Amount of reserved memory in bytes to hold onto before
-      trying to release memory back to the OS. When more than the release
-      threshold bytes of memory are held by the memory pool, the allocator
-      will try to release memory back to the OS on the next call to stream,
-      event or context synchronize. (default 0)
+      :py:obj:`~.cuuint64_t`) Amount of reserved memory in bytes to hold
+      onto before trying to release memory back to the OS. When more than
+      the release threshold bytes of memory are held by the memory pool,
+      the allocator will try to release memory back to the OS on the next
+      call to stream, event or context synchronize. (default 0)
 
     - :py:obj:`~.cudaMemPoolReuseFollowEventDependencies`: (value type =
       int) Allow :py:obj:`~.cudaMallocAsync` to use memory asynchronously
@@ -31408,48 +31452,52 @@ def cudaMemPoolGetAttribute(memPool, attr not None : cudaMemPoolAttr):
       (default enabled).
 
     - :py:obj:`~.cudaMemPoolAttrReservedMemCurrent`: (value type =
-      cuuint64_t) Amount of backing memory currently allocated for the
-      mempool.
+      :py:obj:`~.cuuint64_t`) Amount of backing memory currently allocated
+      for the mempool.
 
-    - :py:obj:`~.cudaMemPoolAttrReservedMemHigh`: (value type = cuuint64_t)
-      High watermark of backing memory allocated for the mempool since the
-      last time it was reset.
+    - :py:obj:`~.cudaMemPoolAttrReservedMemHigh`: (value type =
+      :py:obj:`~.cuuint64_t`) High watermark of backing memory allocated
+      for the mempool since the last time it was reset.
 
-    - :py:obj:`~.cudaMemPoolAttrUsedMemCurrent`: (value type = cuuint64_t)
-      Amount of memory from the pool that is currently in use by the
-      application.
+    - :py:obj:`~.cudaMemPoolAttrUsedMemCurrent`: (value type =
+      :py:obj:`~.cuuint64_t`) Amount of memory from the pool that is
+      currently in use by the application.
 
-    - :py:obj:`~.cudaMemPoolAttrUsedMemHigh`: (value type = cuuint64_t)
-      High watermark of the amount of memory from the pool that was in use
-      by the application since the last time it was reset.
+    - :py:obj:`~.cudaMemPoolAttrUsedMemHigh`: (value type =
+      :py:obj:`~.cuuint64_t`) High watermark of the amount of memory from
+      the pool that was in use by the application since the last time it
+      was reset.
 
     The following properties can be also be queried on imported and default
     pools:
 
     - :py:obj:`~.cudaMemPoolAttrAllocationType`: (value type =
-      cudaMemAllocationType) The allocation type of the mempool
+      :py:obj:`~.cudaMemAllocationType`) The allocation type of the mempool
 
     - :py:obj:`~.cudaMemPoolAttrExportHandleTypes`: (value type =
-      cudaMemAllocationHandleType) Available export handle types for the
-      mempool. For imported pools this value is always
-      cudaMemHandleTypeNone as an imported pool cannot be re-exported
+      :py:obj:`~.cudaMemAllocationHandleType`) Available export handle
+      types for the mempool. For imported pools this value is always
+      :py:obj:`~.cudaMemHandleTypeNone` as an imported pool cannot be
+      re-exported
 
     - :py:obj:`~.cudaMemPoolAttrLocationId`: (value type = int) The
       location id for the mempool. If the location type for this pool is
-      cudaMemLocationTypeInvisible then ID will be cudaInvalidDeviceId.
+      :py:obj:`~.cudaMemLocationTypeInvisible` then ID will be
+      :py:obj:`~.cudaInvalidDeviceId`.
 
     - :py:obj:`~.cudaMemPoolAttrLocationType`: (value type =
-      cudaMemLocationType) The location type for the mempool. For imported
-      memory pools where the device is not directly visible to the
-      importing process or pools imported via fabric handles across nodes
-      this will be cudaMemlocataionTypeInvisible.
-
-    - :py:obj:`~.cudaMemPoolAttrMaxPoolSize`: (value type = cuuint64_t)
-      Maximum size of the pool in bytes, this value may be higher than what
-      was initially passed to cuMemPoolCreate due to alignment
-      requirements. A value of 0 indicates no maximum size. For
-      cudaMemAllocationTypeManaged and IPC imported pools this value will
-      be system dependent.
+      :py:obj:`~.cudaMemLocationType`) The location type for the mempool.
+      For imported memory pools where the device is not directly visible to
+      the importing process or pools imported via fabric handles across
+      nodes this will be :py:obj:`~.cudaMemLocationTypeInvisible`.
+
+    - :py:obj:`~.cudaMemPoolAttrMaxPoolSize`: (value type =
+      :py:obj:`~.cuuint64_t`) Maximum size of the pool in bytes, this value
+      may be higher than what was initially passed to
+      :func:`~.cuMemPoolCreate` due to alignment requirements. A value of 0
+      indicates no maximum size. For
+      :py:obj:`~.cudaMemAllocationTypeManaged` and IPC imported pools this
+      value will be system dependent.
 
     - :py:obj:`~.cudaMemPoolAttrHwDecompressEnabled`: (value type = int)
       Indicates whether the pool has hardware compresssion enabled
@@ -31463,14 +31511,14 @@ def cudaMemPoolGetAttribute(memPool, attr not None : cudaMemPoolAttr):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
     value : Any
         Retrieved value
 
     See Also
     --------
-    :py:obj:`~.cuMemPoolGetAttribute`, :py:obj:`~.cudaMallocAsync`, :py:obj:`~.cudaFreeAsync`, :py:obj:`~.cudaDeviceGetDefaultMemPool`, :py:obj:`~.cudaDeviceGetMemPool`, :py:obj:`~.cudaMemPoolCreate`
+    :func:`~.cuMemPoolGetAttribute`, :py:obj:`~.cudaMallocAsync`, :py:obj:`~.cudaFreeAsync`, :py:obj:`~.cudaDeviceGetDefaultMemPool`, :py:obj:`~.cudaDeviceGetMemPool`, :py:obj:`~.cudaMemPoolCreate`
     """
     cdef cyruntime.cudaMemPool_t cymemPool
     if memPool is None:
@@ -31508,12 +31556,12 @@ def cudaMemPoolSetAccess(memPool, descList : Optional[tuple[cudaMemAccessDesc] |
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
 
     See Also
     --------
-    :py:obj:`~.cuMemPoolSetAccess`, :py:obj:`~.cudaMemPoolGetAccess`, :py:obj:`~.cudaMallocAsync`, :py:obj:`~.cudaFreeAsync`
+    :func:`~.cuMemPoolSetAccess`, :py:obj:`~.cudaMemPoolGetAccess`, :py:obj:`~.cudaMallocAsync`, :py:obj:`~.cudaFreeAsync`
     """
     descList = [] if descList is None else descList
     if not all(isinstance(_x, (cudaMemAccessDesc,)) for _x in descList):
@@ -31561,14 +31609,14 @@ def cudaMemPoolGetAccess(memPool, location : Optional[cudaMemLocation]):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
 
     flags : :py:obj:`~.cudaMemAccessFlags`
         the accessibility of the pool from the specified location
 
     See Also
     --------
-    :py:obj:`~.cuMemPoolGetAccess`, :py:obj:`~.cudaMemPoolSetAccess`
+    :func:`~.cuMemPoolGetAccess`, :py:obj:`~.cudaMemPoolSetAccess`
     """
     cdef cyruntime.cudaMemPool_t cymemPool
     if memPool is None:
@@ -31599,23 +31647,23 @@ def cudaMemPoolCreate(poolProps : Optional[cudaMemPoolProps]):
 
     To create a memory pool for host memory not targeting a specific NUMA
     node, applications must set set
-    :py:obj:`~.cudaMemPoolProps.cudaMemLocation.type` to
+    :py:obj:`~.cudaMemPoolProps.cudaMemLocation.type` to
     :py:obj:`~.cudaMemLocationTypeHost`.
-    :py:obj:`~.cudaMemPoolProps.cudaMemLocation.id` is ignored for such
-    pools. Pools created with the type :py:obj:`~.cudaMemLocationTypeHost`
-    are not IPC capable and :py:obj:`~.cudaMemPoolProps.handleTypes` must
-    be 0, any other values will result in
-    :py:obj:`~.cudaErrorInvalidValue`. To create a memory pool targeting a
-    specific host NUMA node, applications must set
-    :py:obj:`~.cudaMemPoolProps.cudaMemLocation.type` to
+    :py:obj:`~.cudaMemPoolProps.cudaMemLocation.id` is ignored
+    for such pools. Pools created with the type
+    :py:obj:`~.cudaMemLocationTypeHost` are not IPC capable and
+    :py:obj:`~.cudaMemPoolProps.handleTypes` must be 0, any other values
+    will result in :py:obj:`~.cudaErrorInvalidValue`. To create a memory
+    pool targeting a specific host NUMA node, applications must set
+    :py:obj:`~.cudaMemPoolProps.cudaMemLocation.type` to
     :py:obj:`~.cudaMemLocationTypeHostNuma` and
-    :py:obj:`~.cudaMemPoolProps.cudaMemLocation.id` must specify the NUMA
-    ID of the host memory node. Specifying
+    :py:obj:`~.cudaMemPoolProps.cudaMemLocation.id` must specify
+    the NUMA ID of the host memory node. Specifying
     :py:obj:`~.cudaMemLocationTypeHostNumaCurrent` as the
-    :py:obj:`~.cudaMemPoolProps.cudaMemLocation.type` will result in
-    :py:obj:`~.cudaErrorInvalidValue`. By default, the pool's memory will
-    be accessible from the device it is allocated on. In the case of pools
-    created with :py:obj:`~.cudaMemLocationTypeHostNuma` or
+    :py:obj:`~.cudaMemPoolProps.cudaMemLocation.type` will
+    result in :py:obj:`~.cudaErrorInvalidValue`. By default, the pool's
+    memory will be accessible from the device it is allocated on. In the
+    case of pools created with :py:obj:`~.cudaMemLocationTypeHostNuma` or
     :py:obj:`~.cudaMemLocationTypeHost`, their default accessibility will
     be from the host CPU. Applications can control the maximum size of the
     pool by specifying a non-zero value for
@@ -31643,20 +31691,20 @@ def cudaMemPoolCreate(poolProps : Optional[cudaMemPoolProps]):
     /dev/nvidia-caps-imex-channels/channel0 c <major number> 0``
 
     To create a managed memory pool, applications must set
-    :py:obj:`~.cudaMemPoolProps`:cudaMemAllocationType to
+    :py:obj:`~.cudaMemPoolProps.cudaMemAllocationType` to
     :py:obj:`~.cudaMemAllocationTypeManaged`.
-    :py:obj:`~.cudaMemPoolProps.cudaMemAllocationHandleType` must also be
-    set to :py:obj:`~.cudaMemHandleTypeNone` since IPC is not supported.
-    For managed memory pools, :py:obj:`~.cudaMemPoolProps.cudaMemLocation`
-    will be treated as the preferred location for all allocations created
-    from the pool. An application can also set
-    :py:obj:`~.cudaMemLocationTypeNone` to indicate no preferred location.
-    :py:obj:`~.cudaMemPoolProps.maxSize` must be set to zero for managed
-    memory pools. :py:obj:`~.cudaMemPoolProps.usage` should be zero as
-    decompress for managed memory is not supported. For managed memory
-    pools, all devices on the system must have non-zero
-    :py:obj:`~.concurrentManagedAccess`. If not, this call returns
-    :py:obj:`~.cudaErrorNotSupported`
+    :py:obj:`~.cudaMemPoolProps.cudaMemAllocationHandleType`
+    must also be set to :py:obj:`~.cudaMemHandleTypeNone` since IPC is not
+    supported. For managed memory pools,
+    :py:obj:`~.cudaMemPoolProps.cudaMemLocation` will be
+    treated as the preferred location for all allocations created from the
+    pool. An application can also set :py:obj:`~.cudaMemLocationTypeNone`
+    to indicate no preferred location. :py:obj:`~.cudaMemPoolProps.maxSize`
+    must be set to zero for managed memory pools.
+    :py:obj:`~.cudaMemPoolProps.usage` should be zero as decompress for
+    managed memory is not supported. For managed memory pools, all devices
+    on the system must have non-zero ``concurrentManagedAccess``. If not,
+    this call returns :py:obj:`~.cudaErrorNotSupported`
 
     Parameters
     ----------
@@ -31665,14 +31713,14 @@ def cudaMemPoolCreate(poolProps : Optional[cudaMemPoolProps]):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorNotSupported`
     memPool : :py:obj:`~.cudaMemPool_t`
         None
 
     See Also
     --------
-    :py:obj:`~.cuMemPoolCreate`, :py:obj:`~.cudaDeviceSetMemPool`, :py:obj:`~.cudaMallocFromPoolAsync`, :py:obj:`~.cudaMemPoolExportToShareableHandle`, :py:obj:`~.cudaDeviceGetDefaultMemPool`, :py:obj:`~.cudaDeviceGetMemPool`
+    :func:`~.cuMemPoolCreate`, :py:obj:`~.cudaDeviceSetMemPool`, :py:obj:`~.cudaMallocFromPoolAsync`, :py:obj:`~.cudaMemPoolExportToShareableHandle`, :py:obj:`~.cudaDeviceGetDefaultMemPool`, :py:obj:`~.cudaDeviceGetMemPool`
 
     Notes
     -----
@@ -31709,12 +31757,12 @@ def cudaMemPoolDestroy(memPool):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
 
     See Also
     --------
-    cuMemPoolDestroy, :py:obj:`~.cudaFreeAsync`, :py:obj:`~.cudaDeviceSetMemPool`, :py:obj:`~.cudaDeviceGetDefaultMemPool`, :py:obj:`~.cudaDeviceGetMemPool`, :py:obj:`~.cudaMemPoolCreate`
+    :func:`~.cuMemPoolDestroy`, :py:obj:`~.cudaFreeAsync`, :py:obj:`~.cudaDeviceSetMemPool`, :py:obj:`~.cudaDeviceGetDefaultMemPool`, :py:obj:`~.cudaDeviceGetMemPool`, :py:obj:`~.cudaMemPoolCreate`
 
     Notes
     -----
@@ -31759,14 +31807,14 @@ def cudaMemGetDefaultMemPool(location : Optional[cudaMemLocation], typename not
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorNotSupported`,
     memPool : :py:obj:`~.cudaMemPool_t`
         None
 
     See Also
     --------
-    :py:obj:`~.cuMemAllocAsync`, :py:obj:`~.cuMemPoolTrimTo`, :py:obj:`~.cuMemPoolGetAttribute`, :py:obj:`~.cuMemPoolSetAttribute`, cuMemPoolSetAccess, :py:obj:`~.cuMemGetMemPool`, :py:obj:`~.cuMemPoolCreate`
+    :func:`~.cuMemAllocAsync`, :func:`~.cuMemPoolTrimTo`, :func:`~.cuMemPoolGetAttribute`, :func:`~.cuMemPoolSetAttribute`, :func:`~.cuMemPoolSetAccess`, :func:`~.cuMemGetMemPool`, :func:`~.cuMemPoolCreate`
     """
     cdef cudaMemPool_t memPool = cudaMemPool_t()
     cdef cyruntime.cudaMemLocation* cylocation_ptr = <cyruntime.cudaMemLocation*>location._pvt_ptr if location is not None else NULL
@@ -31801,8 +31849,8 @@ def cudaMemGetMemPool(location : Optional[cudaMemLocation], typename not None :
     or :py:obj:`~.cudaDeviceSetMemPool` for that allocType and location has
     never been called. By default the current mempool of a location is the
     default mempool for a device that can be obtained via
-    cudaMemGetDefaultMemPool Otherwise the returned pool must have been set
-    with :py:obj:`~.cudaDeviceSetMemPool`.
+    :func:`~.cudaMemGetDefaultMemPool`. Otherwise the returned pool must
+    have been set with :py:obj:`~.cudaDeviceSetMemPool`.
 
     Parameters
     ----------
@@ -31813,14 +31861,14 @@ def cudaMemGetMemPool(location : Optional[cudaMemLocation], typename not None :
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
     memPool : :py:obj:`~.cudaMemPool_t`
         None
 
     See Also
     --------
-    :py:obj:`~.cuDeviceGetDefaultMemPool`, :py:obj:`~.cuMemPoolCreate`, :py:obj:`~.cuDeviceSetMemPool`, :py:obj:`~.cuMemSetMemPool`
+    :func:`~.cuDeviceGetDefaultMemPool`, :func:`~.cuMemPoolCreate`, :func:`~.cuDeviceSetMemPool`, :func:`~.cuMemSetMemPool`
     """
     cdef cudaMemPool_t memPool = cudaMemPool_t()
     cdef cyruntime.cudaMemLocation* cylocation_ptr = <cyruntime.cudaMemLocation*>location._pvt_ptr if location is not None else NULL
@@ -31873,12 +31921,12 @@ def cudaMemSetMemPool(location : Optional[cudaMemLocation], typename not None :
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
 
     See Also
     --------
-    :py:obj:`~.cuDeviceGetDefaultMemPool`, :py:obj:`~.cuDeviceGetMemPool`, :py:obj:`~.cuMemGetMemPool`, :py:obj:`~.cuMemPoolCreate`, :py:obj:`~.cuMemPoolDestroy`, :py:obj:`~.cuMemAllocFromPoolAsync`
+    :func:`~.cuDeviceGetDefaultMemPool`, :func:`~.cuDeviceGetMemPool`, :func:`~.cuMemGetMemPool`, :func:`~.cuMemPoolCreate`, :func:`~.cuMemPoolDestroy`, :func:`~.cuMemAllocFromPoolAsync`
 
     Notes
     -----
@@ -31921,14 +31969,14 @@ def cudaMallocFromPoolAsync(size_t size, memPool, stream):
 
     Returns
     -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorNotSupported`, :py:obj:`~.cudaErrorOutOfMemory`
+    :py:obj:`~.cudaError_t`
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorNotSupported`, ``cudaErrorOutOfMemory``
     ptr : Any
         Returned device pointer
 
     See Also
     --------
-    :py:obj:`~.cuMemAllocFromPoolAsync`, cudaMallocAsync (C++ API), :py:obj:`~.cudaMallocAsync`, :py:obj:`~.cudaFreeAsync`, :py:obj:`~.cudaDeviceGetDefaultMemPool`, :py:obj:`~.cudaMemPoolCreate`, :py:obj:`~.cudaMemPoolSetAccess`, :py:obj:`~.cudaMemPoolSetAttribute`
+    :func:`~.cuMemAllocFromPoolAsync`, :func:`~.cudaMallocAsync` (C++ API), :py:obj:`~.cudaMallocAsync`, :py:obj:`~.cudaFreeAsync`, :py:obj:`~.cudaDeviceGetDefaultMemPool`, :py:obj:`~.cudaMemPoolCreate`, :py:obj:`~.cudaMemPoolSetAccess`, :py:obj:`~.cudaMemPoolSetAttribute`
 
     Notes
     -----
@@ -31984,18 +32032,18 @@ def cudaMemPoolExportToShareableHandle(memPool, handleType not None : cudaMemAll
 
     Returns
     -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorOutOfMemory`
+    :py:obj:`~.cudaError_t`
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, ``cudaErrorOutOfMemory``
     handle_out : Any
         pointer to the location in which to store the requested handle
 
     See Also
     --------
-    :py:obj:`~.cuMemPoolExportToShareableHandle`, :py:obj:`~.cudaMemPoolImportFromShareableHandle`, :py:obj:`~.cudaMemPoolExportPointer`, :py:obj:`~.cudaMemPoolImportPointer`
+    :func:`~.cuMemPoolExportToShareableHandle`, :py:obj:`~.cudaMemPoolImportFromShareableHandle`, :py:obj:`~.cudaMemPoolExportPointer`, :py:obj:`~.cudaMemPoolImportPointer`
 
     Notes
     -----
-    : To create an IPC capable mempool, create a mempool with a CUmemAllocationHandleType other than cudaMemHandleTypeNone.
+    To create an IPC capable mempool, create a mempool with a :py:obj:`~.CUmemAllocationHandleType` other than :py:obj:`~.cudaMemHandleTypeNone`.
     """
     cdef cyruntime.cudaMemPool_t cymemPool
     if memPool is None:
@@ -32035,14 +32083,14 @@ def cudaMemPoolImportFromShareableHandle(shareableHandle, handleType not None :
 
     Returns
     -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorOutOfMemory`
+    :py:obj:`~.cudaError_t`
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, ``cudaErrorOutOfMemory``
     pool_out : :py:obj:`~.cudaMemPool_t`
         Returned memory pool
 
     See Also
     --------
-    :py:obj:`~.cuMemPoolImportFromShareableHandle`, :py:obj:`~.cudaMemPoolExportToShareableHandle`, :py:obj:`~.cudaMemPoolExportPointer`, :py:obj:`~.cudaMemPoolImportPointer`
+    :func:`~.cuMemPoolImportFromShareableHandle`, :py:obj:`~.cudaMemPoolExportToShareableHandle`, :py:obj:`~.cudaMemPoolExportPointer`, :py:obj:`~.cudaMemPoolImportPointer`
 
     Notes
     -----
@@ -32078,14 +32126,14 @@ def cudaMemPoolExportPointer(ptr):
 
     Returns
     -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorOutOfMemory`
+    :py:obj:`~.cudaError_t`
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, ``cudaErrorOutOfMemory``
     shareData_out : :py:obj:`~.cudaMemPoolPtrExportData`
         Returned export data
 
     See Also
     --------
-    :py:obj:`~.cuMemPoolExportPointer`, :py:obj:`~.cudaMemPoolExportToShareableHandle`, :py:obj:`~.cudaMemPoolImportFromShareableHandle`, :py:obj:`~.cudaMemPoolImportPointer`
+    :func:`~.cuMemPoolExportPointer`, :py:obj:`~.cudaMemPoolExportToShareableHandle`, :py:obj:`~.cudaMemPoolImportFromShareableHandle`, :py:obj:`~.cudaMemPoolImportPointer`
     """
     cdef cudaMemPoolPtrExportData exportData = cudaMemPoolPtrExportData()
     cdef _HelperInputVoidPtrStruct cyptrHelper
@@ -32108,9 +32156,10 @@ def cudaMemPoolImportPointer(memPool, exportData : Optional[cudaMemPoolPtrExport
     memory must not be accessed before the allocation operation completes
     in the exporting process. The imported memory must be freed from all
     importing processes before being freed in the exporting process. The
-    pointer may be freed with cudaFree or cudaFreeAsync. If
-    :py:obj:`~.cudaFreeAsync` is used, the free must be completed on the
-    importing process before the free operation on the exporting process.
+    pointer may be freed with :func:`~.cudaFree` or
+    :func:`~.cudaFreeAsync`. If :py:obj:`~.cudaFreeAsync` is used, the free
+    must be completed on the importing process before the free operation on
+    the exporting process.
 
     Parameters
     ----------
@@ -32121,14 +32170,14 @@ def cudaMemPoolImportPointer(memPool, exportData : Optional[cudaMemPoolPtrExport
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_INITIALIZED`, :py:obj:`~.CUDA_ERROR_OUT_OF_MEMORY`
     ptr_out : Any
         pointer to imported memory
 
     See Also
     --------
-    :py:obj:`~.cuMemPoolImportPointer`, :py:obj:`~.cudaMemPoolExportToShareableHandle`, :py:obj:`~.cudaMemPoolImportFromShareableHandle`, :py:obj:`~.cudaMemPoolExportPointer`
+    :func:`~.cuMemPoolImportPointer`, :py:obj:`~.cudaMemPoolExportToShareableHandle`, :py:obj:`~.cudaMemPoolImportFromShareableHandle`, :py:obj:`~.cudaMemPoolExportPointer`
 
     Notes
     -----
@@ -32200,14 +32249,14 @@ def cudaPointerGetAttributes(ptr):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidDevice`, :py:obj:`~.cudaErrorInvalidValue`
     attributes : :py:obj:`~.cudaPointerAttributes`
         Attributes for the specified pointer
 
     See Also
     --------
-    :py:obj:`~.cudaGetDeviceCount`, :py:obj:`~.cudaGetDevice`, :py:obj:`~.cudaSetDevice`, :py:obj:`~.cudaChooseDevice`, :py:obj:`~.cudaInitDevice`, :py:obj:`~.cuPointerGetAttributes`
+    :py:obj:`~.cudaGetDeviceCount`, :py:obj:`~.cudaGetDevice`, :py:obj:`~.cudaSetDevice`, :py:obj:`~.cudaChooseDevice`, :py:obj:`~.cudaInitDevice`, :func:`~.cuPointerGetAttributes`
 
     Notes
     -----
@@ -32247,14 +32296,14 @@ def cudaDeviceCanAccessPeer(int device, int peerDevice):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidDevice`
     canAccessPeer : int
         Returned access capability
 
     See Also
     --------
-    :py:obj:`~.cudaDeviceEnablePeerAccess`, :py:obj:`~.cudaDeviceDisablePeerAccess`, :py:obj:`~.cuDeviceCanAccessPeer`
+    :py:obj:`~.cudaDeviceEnablePeerAccess`, :py:obj:`~.cudaDeviceDisablePeerAccess`, :func:`~.cuDeviceCanAccessPeer`
     """
     cdef int canAccessPeer = 0
     with nogil:
@@ -32303,12 +32352,12 @@ def cudaDeviceEnablePeerAccess(int peerDevice, unsigned int flags):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidDevice`, :py:obj:`~.cudaErrorPeerAccessAlreadyEnabled`, :py:obj:`~.cudaErrorInvalidValue`
 
     See Also
     --------
-    :py:obj:`~.cudaDeviceCanAccessPeer`, :py:obj:`~.cudaDeviceDisablePeerAccess`, :py:obj:`~.cuCtxEnablePeerAccess`
+    :py:obj:`~.cudaDeviceCanAccessPeer`, :py:obj:`~.cudaDeviceDisablePeerAccess`, :func:`~.cuCtxEnablePeerAccess`
     """
     with nogil:
         err = cyruntime.cudaDeviceEnablePeerAccess(peerDevice, flags)
@@ -32332,12 +32381,12 @@ def cudaDeviceDisablePeerAccess(int peerDevice):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorPeerAccessNotEnabled`, :py:obj:`~.cudaErrorInvalidDevice`
 
     See Also
     --------
-    :py:obj:`~.cudaDeviceCanAccessPeer`, :py:obj:`~.cudaDeviceEnablePeerAccess`, :py:obj:`~.cuCtxDisablePeerAccess`
+    :py:obj:`~.cudaDeviceCanAccessPeer`, :py:obj:`~.cudaDeviceEnablePeerAccess`, :func:`~.cuCtxDisablePeerAccess`
     """
     with nogil:
         err = cyruntime.cudaDeviceDisablePeerAccess(peerDevice)
@@ -32363,12 +32412,12 @@ def cudaGraphicsUnregisterResource(resource):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidResourceHandle`, :py:obj:`~.cudaErrorUnknown`
 
     See Also
     --------
-    :py:obj:`~.cudaGraphicsD3D9RegisterResource`, :py:obj:`~.cudaGraphicsD3D10RegisterResource`, :py:obj:`~.cudaGraphicsD3D11RegisterResource`, :py:obj:`~.cudaGraphicsGLRegisterBuffer`, :py:obj:`~.cudaGraphicsGLRegisterImage`, :py:obj:`~.cuGraphicsUnregisterResource`
+    ``cudaGraphicsD3D9RegisterResource``, ``cudaGraphicsD3D10RegisterResource``, ``cudaGraphicsD3D11RegisterResource``, :py:obj:`~.cudaGraphicsGLRegisterBuffer`, :py:obj:`~.cudaGraphicsGLRegisterImage`, :func:`~.cuGraphicsUnregisterResource`
     """
     cdef cyruntime.cudaGraphicsResource_t cyresource
     if resource is None:
@@ -32419,12 +32468,12 @@ def cudaGraphicsResourceSetMapFlags(resource, unsigned int flags):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidResourceHandle`, :py:obj:`~.cudaErrorUnknown`,
 
     See Also
     --------
-    :py:obj:`~.cudaGraphicsMapResources`, :py:obj:`~.cuGraphicsResourceSetMapFlags`
+    :py:obj:`~.cudaGraphicsMapResources`, :func:`~.cuGraphicsResourceSetMapFlags`
     """
     cdef cyruntime.cudaGraphicsResource_t cyresource
     if resource is None:
@@ -32473,12 +32522,12 @@ def cudaGraphicsMapResources(int count, resources, stream):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidResourceHandle`, :py:obj:`~.cudaErrorUnknown`
 
     See Also
     --------
-    :py:obj:`~.cudaGraphicsResourceGetMappedPointer`, :py:obj:`~.cudaGraphicsSubResourceGetMappedArray`, :py:obj:`~.cudaGraphicsUnmapResources`, :py:obj:`~.cuGraphicsMapResources`
+    :py:obj:`~.cudaGraphicsResourceGetMappedPointer`, :py:obj:`~.cudaGraphicsSubResourceGetMappedArray`, :py:obj:`~.cudaGraphicsUnmapResources`, :func:`~.cuGraphicsMapResources`
     """
     cdef cyruntime.cudaStream_t cystream
     if stream is None:
@@ -32534,12 +32583,12 @@ def cudaGraphicsUnmapResources(int count, resources, stream):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidResourceHandle`, :py:obj:`~.cudaErrorUnknown`
 
     See Also
     --------
-    :py:obj:`~.cudaGraphicsMapResources`, :py:obj:`~.cuGraphicsUnmapResources`
+    :py:obj:`~.cudaGraphicsMapResources`, :func:`~.cuGraphicsUnmapResources`
     """
     cdef cyruntime.cudaStream_t cystream
     if stream is None:
@@ -32586,7 +32635,7 @@ def cudaGraphicsResourceGetMappedPointer(resource):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
 
     devPtr : Any
         None
@@ -32642,7 +32691,7 @@ def cudaGraphicsSubResourceGetMappedArray(resource, unsigned int arrayIndex, uns
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidResourceHandle`, :py:obj:`~.cudaErrorUnknown`
     array : :py:obj:`~.cudaArray_t`
         Returned array through which a subresource of ``resource`` may be
@@ -32650,7 +32699,7 @@ def cudaGraphicsSubResourceGetMappedArray(resource, unsigned int arrayIndex, uns
 
     See Also
     --------
-    :py:obj:`~.cudaGraphicsResourceGetMappedPointer`, :py:obj:`~.cuGraphicsSubResourceGetMappedArray`
+    :py:obj:`~.cudaGraphicsResourceGetMappedPointer`, :func:`~.cuGraphicsSubResourceGetMappedArray`
     """
     cdef cyruntime.cudaGraphicsResource_t cyresource
     if resource is None:
@@ -32689,14 +32738,14 @@ def cudaGraphicsResourceGetMappedMipmappedArray(resource):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidResourceHandle`, :py:obj:`~.cudaErrorUnknown`
     mipmappedArray : :py:obj:`~.cudaMipmappedArray_t`
         Returned mipmapped array through which ``resource`` may be accessed
 
     See Also
     --------
-    :py:obj:`~.cudaGraphicsResourceGetMappedPointer`, :py:obj:`~.cuGraphicsResourceGetMappedMipmappedArray`
+    :py:obj:`~.cudaGraphicsResourceGetMappedPointer`, :func:`~.cuGraphicsResourceGetMappedMipmappedArray`
     """
     cdef cyruntime.cudaGraphicsResource_t cyresource
     if resource is None:
@@ -32730,7 +32779,7 @@ def cudaGetChannelDesc(array):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
     desc : :py:obj:`~.cudaChannelFormatDesc`
         Channel format
@@ -32794,7 +32843,7 @@ def cudaCreateChannelDesc(int x, int y, int z, int w, f not None : cudaChannelFo
 
     See Also
     --------
-    cudaCreateChannelDesc (C++ API), :py:obj:`~.cudaGetChannelDesc`, :py:obj:`~.cudaCreateTextureObject`, :py:obj:`~.cudaCreateSurfaceObject`
+    :func:`~.cudaCreateChannelDesc` (C++ API), :py:obj:`~.cudaGetChannelDesc`, :py:obj:`~.cudaCreateTextureObject`, :py:obj:`~.cudaCreateSurfaceObject`
     """
     cdef cyruntime.cudaChannelFormatKind cyf = int(f)
     with nogil:
@@ -32875,7 +32924,7 @@ def cudaCreateTextureObject(pResDesc : Optional[cudaResourceDesc], pTexDesc : Op
     :py:obj:`~.cudaDeviceProp.texturePitchAlignment`. Pitch cannot exceed
     :py:obj:`~.cudaDeviceProp.maxTexture2DLinear` ``[2]``.
 
-    The :py:obj:`~.cudaTextureDesc` struct is defined as
+    The :py:obj:`~.cudaTextureDesc` struct is defined as
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
@@ -32926,7 +32975,8 @@ def cudaCreateTextureObject(pResDesc : Optional[cudaResourceDesc], pTexDesc : Op
       of 'A' Note that application using integer border color values will
       need to <reinterpret_cast> these values to float. The values are set
       only when the addressing mode specified by
-      :py:obj:`~.cudaTextureDesc.addressMode` is cudaAddressModeBorder.
+      :py:obj:`~.cudaTextureDesc.addressMode` is
+      :py:obj:`~.cudaAddressModeBorder`.
 
     - :py:obj:`~.cudaTextureDesc.normalizedCoords` specifies whether the
       texture coordinates will be normalized or not.
@@ -32965,7 +33015,7 @@ def cudaCreateTextureObject(pResDesc : Optional[cudaResourceDesc], pTexDesc : Op
       :py:obj:`~.cudaFilterModeLinear` seamless cube map filtering will be
       performed when sampling along the cube face borders.
 
-    The :py:obj:`~.cudaResourceViewDesc` struct is defined as
+    The :py:obj:`~.cudaResourceViewDesc` struct is defined as
 
     **View CUDA Toolkit Documentation for a C++ code example**
 
@@ -33030,14 +33080,14 @@ def cudaCreateTextureObject(pResDesc : Optional[cudaResourceDesc], pTexDesc : Op
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
     pTexObject : :py:obj:`~.cudaTextureObject_t`
         Texture object to create
 
     See Also
     --------
-    :py:obj:`~.cudaDestroyTextureObject`, :py:obj:`~.cuTexObjectCreate`
+    :py:obj:`~.cudaDestroyTextureObject`, :func:`~.cuTexObjectCreate`
     """
     cdef cudaTextureObject_t pTexObject = cudaTextureObject_t()
     cdef cyruntime.cudaResourceDesc* cypResDesc_ptr = <cyruntime.cudaResourceDesc*>pResDesc._pvt_ptr if pResDesc is not None else NULL
@@ -33065,12 +33115,12 @@ def cudaDestroyTextureObject(texObject):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
 
     See Also
     --------
-    :py:obj:`~.cudaCreateTextureObject`, :py:obj:`~.cuTexObjectDestroy`
+    :py:obj:`~.cudaCreateTextureObject`, :func:`~.cuTexObjectDestroy`
     """
     cdef cyruntime.cudaTextureObject_t cytexObject
     if texObject is None:
@@ -33101,14 +33151,14 @@ def cudaGetTextureObjectResourceDesc(texObject):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
     pResDesc : :py:obj:`~.cudaResourceDesc`
         Resource descriptor
 
     See Also
     --------
-    :py:obj:`~.cudaCreateTextureObject`, :py:obj:`~.cuTexObjectGetResourceDesc`
+    :py:obj:`~.cudaCreateTextureObject`, :func:`~.cuTexObjectGetResourceDesc`
     """
     cdef cyruntime.cudaTextureObject_t cytexObject
     if texObject is None:
@@ -33142,14 +33192,14 @@ def cudaGetTextureObjectTextureDesc(texObject):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
     pTexDesc : :py:obj:`~.cudaTextureDesc`
         Texture descriptor
 
     See Also
     --------
-    :py:obj:`~.cudaCreateTextureObject`, :py:obj:`~.cuTexObjectGetTextureDesc`
+    :py:obj:`~.cudaCreateTextureObject`, :func:`~.cuTexObjectGetTextureDesc`
     """
     cdef cyruntime.cudaTextureObject_t cytexObject
     if texObject is None:
@@ -33184,14 +33234,14 @@ def cudaGetTextureObjectResourceViewDesc(texObject):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
     pResViewDesc : :py:obj:`~.cudaResourceViewDesc`
         Resource view descriptor
 
     See Also
     --------
-    :py:obj:`~.cudaCreateTextureObject`, :py:obj:`~.cuTexObjectGetResourceViewDesc`
+    :py:obj:`~.cudaCreateTextureObject`, :func:`~.cuTexObjectGetResourceViewDesc`
     """
     cdef cyruntime.cudaTextureObject_t cytexObject
     if texObject is None:
@@ -33233,14 +33283,14 @@ def cudaCreateSurfaceObject(pResDesc : Optional[cudaResourceDesc]):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidChannelDescriptor`, :py:obj:`~.cudaErrorInvalidResourceHandle`
     pSurfObject : :py:obj:`~.cudaSurfaceObject_t`
         Surface object to create
 
     See Also
     --------
-    :py:obj:`~.cudaDestroySurfaceObject`, :py:obj:`~.cuSurfObjectCreate`
+    :py:obj:`~.cudaDestroySurfaceObject`, :func:`~.cuSurfObjectCreate`
     """
     cdef cudaSurfaceObject_t pSurfObject = cudaSurfaceObject_t()
     cdef cyruntime.cudaResourceDesc* cypResDesc_ptr = <cyruntime.cudaResourceDesc*>pResDesc._pvt_ptr if pResDesc is not None else NULL
@@ -33266,12 +33316,12 @@ def cudaDestroySurfaceObject(surfObject):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
 
     See Also
     --------
-    :py:obj:`~.cudaCreateSurfaceObject`, :py:obj:`~.cuSurfObjectDestroy`
+    :py:obj:`~.cudaCreateSurfaceObject`, :func:`~.cuSurfObjectDestroy`
     """
     cdef cyruntime.cudaSurfaceObject_t cysurfObject
     if surfObject is None:
@@ -33299,14 +33349,14 @@ def cudaGetSurfaceObjectResourceDesc(surfObject):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
     pResDesc : :py:obj:`~.cudaResourceDesc`
         Resource descriptor
 
     See Also
     --------
-    :py:obj:`~.cudaCreateSurfaceObject`, :py:obj:`~.cuSurfObjectGetResourceDesc`
+    :py:obj:`~.cudaCreateSurfaceObject`, :func:`~.cuSurfObjectGetResourceDesc`
     """
     cdef cyruntime.cudaSurfaceObject_t cysurfObject
     if surfObject is None:
@@ -33340,14 +33390,14 @@ def cudaDriverGetVersion():
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
     driverVersion : int
         Returns the CUDA driver version.
 
     See Also
     --------
-    :py:obj:`~.cudaRuntimeGetVersion`, :py:obj:`~.cuDriverGetVersion`
+    :py:obj:`~.cudaRuntimeGetVersion`, :func:`~.cuDriverGetVersion`
     """
     cdef int driverVersion = 0
     with nogil:
@@ -33376,14 +33426,14 @@ def cudaRuntimeGetVersion():
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
     runtimeVersion : int
         Returns the CUDA Runtime version.
 
     See Also
     --------
-    :py:obj:`~.cudaDriverGetVersion`, :py:obj:`~.cuDriverGetVersion`
+    :py:obj:`~.cudaDriverGetVersion`, :func:`~.cuDriverGetVersion`
     """
     cdef int runtimeVersion = 0
     with nogil:
@@ -33409,7 +33459,7 @@ def cudaLogsRegisterCallback(callbackFunc, userData):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`,
     callback_out : :py:obj:`~.cudaLogsCallbackHandle`
         Optional location to store the callback handle after it is
@@ -33447,7 +33497,7 @@ def cudaLogsUnregisterCallback(callback):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`,
     """
     cdef cyruntime.cudaLogsCallbackHandle cycallback
@@ -33476,7 +33526,7 @@ def cudaLogsCurrent(unsigned int flags):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`,
     iterator_out : :py:obj:`~.cudaLogIterator`
         Location to store an iterator to the current tail of the logs
@@ -33511,7 +33561,7 @@ def cudaLogsDumpToFile(iterator : Optional[cudaLogIterator], char* pathToFile, u
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`,
     iterator : :py:obj:`~.cudaLogIterator`
         Optional auto-advancing iterator specifying the starting log to
@@ -33565,7 +33615,7 @@ def cudaLogsDumpToMemory(iterator : Optional[cudaLogIterator], char* buffer, siz
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`,
     iterator : :py:obj:`~.cudaLogIterator`
         Optional auto-advancing iterator specifying the starting log to
@@ -33606,7 +33656,7 @@ def cudaGraphCreate(unsigned int flags):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorMemoryAllocation`
     pGraph : :py:obj:`~.cudaGraph_t`
         Returns newly created graph
@@ -33666,7 +33716,7 @@ def cudaGraphAddKernelNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t
     ``extra`` specifies a list of names of extra settings and their
     corresponding values. Each extra setting name is immediately followed
     by the corresponding value. The list must be terminated with either
-    NULL or CU_LAUNCH_PARAM_END.
+    NULL or :py:obj:`~.CU_LAUNCH_PARAM_END`.
 
     - :py:obj:`~.CU_LAUNCH_PARAM_END`, which indicates the end of the
       ``extra`` array;
@@ -33700,7 +33750,7 @@ def cudaGraphAddKernelNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidDeviceFunction`
     pGraphNode : :py:obj:`~.cudaGraphNode_t`
         Returns newly created node
@@ -33770,7 +33820,7 @@ def cudaGraphKernelNodeGetParams(node):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidDeviceFunction`
     pNodeParams : :py:obj:`~.cudaKernelNodeParams`
         Pointer to return the parameters
@@ -33812,7 +33862,7 @@ def cudaGraphKernelNodeSetParams(node, pNodeParams : Optional[cudaKernelNodePara
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidResourceHandle`, :py:obj:`~.cudaErrorMemoryAllocation`
 
     See Also
@@ -33852,8 +33902,8 @@ def cudaGraphKernelNodeCopyAttributes(hDst, hSrc):
 
     Returns
     -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidContext`
+    :py:obj:`~.cudaError_t`
+        :py:obj:`~.cudaSuccess`, ``cudaErrorInvalidContext``
 
     See Also
     --------
@@ -33898,7 +33948,7 @@ def cudaGraphKernelNodeGetAttribute(hNode, attr not None : cudaKernelNodeAttrID)
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidResourceHandle`
     value_out : :py:obj:`~.cudaKernelNodeAttrValue`
 
@@ -33944,7 +33994,7 @@ def cudaGraphKernelNodeSetAttribute(hNode, attr not None : cudaKernelNodeAttrID,
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidResourceHandle`
 
     See Also
@@ -34001,7 +34051,7 @@ def cudaGraphAddMemcpyNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
     pGraphNode : :py:obj:`~.cudaGraphNode_t`
         Returns newly created node
@@ -34093,7 +34143,7 @@ def cudaGraphAddMemcpyNode1D(graph, pDependencies : Optional[tuple[cudaGraphNode
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
     pGraphNode : :py:obj:`~.cudaGraphNode_t`
         Returns newly created node
@@ -34155,7 +34205,7 @@ def cudaGraphMemcpyNodeGetParams(node):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
     pNodeParams : :py:obj:`~.cudaMemcpy3DParms`
         Pointer to return the parameters
@@ -34197,7 +34247,7 @@ def cudaGraphMemcpyNodeSetParams(node, pNodeParams : Optional[cudaMemcpy3DParms]
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`,
 
     See Also
@@ -34255,7 +34305,7 @@ def cudaGraphMemcpyNodeSetParams1D(node, dst, src, size_t count, kind not None :
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
 
     See Also
@@ -34311,7 +34361,7 @@ def cudaGraphAddMemsetNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidDevice`
     pGraphNode : :py:obj:`~.cudaGraphNode_t`
         Returns newly created node
@@ -34368,7 +34418,7 @@ def cudaGraphMemsetNodeGetParams(node):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
     pNodeParams : :py:obj:`~.cudaMemsetParams`
         Pointer to return the parameters
@@ -34410,7 +34460,7 @@ def cudaGraphMemsetNodeSetParams(node, pNodeParams : Optional[cudaMemsetParams])
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
 
     See Also
@@ -34460,7 +34510,7 @@ def cudaGraphAddHostNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t]
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorNotSupported`, :py:obj:`~.cudaErrorInvalidValue`
     pGraphNode : :py:obj:`~.cudaGraphNode_t`
         Returns newly created node
@@ -34517,7 +34567,7 @@ def cudaGraphHostNodeGetParams(node):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
     pNodeParams : :py:obj:`~.cudaHostNodeParams`
         Pointer to return the parameters
@@ -34559,7 +34609,7 @@ def cudaGraphHostNodeSetParams(node, pNodeParams : Optional[cudaHostNodeParams])
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
 
     See Also
@@ -34612,7 +34662,7 @@ def cudaGraphAddChildGraphNode(graph, pDependencies : Optional[tuple[cudaGraphNo
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
     pGraphNode : :py:obj:`~.cudaGraphNode_t`
         Returns newly created node
@@ -34681,7 +34731,7 @@ def cudaGraphChildGraphNodeGetGraph(node):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
     pGraph : :py:obj:`~.cudaGraph_t`
         Location to store a handle to the graph
@@ -34736,7 +34786,7 @@ def cudaGraphAddEmptyNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t]
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
     pGraphNode : :py:obj:`~.cudaGraphNode_t`
         Returns newly created node
@@ -34808,7 +34858,7 @@ def cudaGraphAddEventRecordNode(graph, pDependencies : Optional[tuple[cudaGraphN
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
     phGraphNode : :py:obj:`~.cudaGraphNode_t`
         Returns newly created node
@@ -34872,7 +34922,7 @@ def cudaGraphEventRecordNodeGetEvent(node):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
     event_out : :py:obj:`~.cudaEvent_t`
         Pointer to return the event
@@ -34914,7 +34964,7 @@ def cudaGraphEventRecordNodeSetEvent(node, event):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
 
     See Also
@@ -34956,10 +35006,10 @@ def cudaGraphAddEventWaitNode(graph, pDependencies : Optional[tuple[cudaGraphNod
     new node will be returned in ``phGraphNode``.
 
     The graph node will wait for all work captured in ``event``. See
-    :py:obj:`~.cuEventRecord()` for details on what is captured by an
-    event. The synchronization will be performed efficiently on the device
-    when applicable. ``event`` may be from a different context or device
-    than the launch stream.
+    :func:`~.cuEventRecord` for details on what is captured by an event.
+    The synchronization will be performed efficiently on the device when
+    applicable. ``event`` may be from a different context or device than
+    the launch stream.
 
     These nodes may not be used in loops or conditionals.
 
@@ -34976,7 +35026,7 @@ def cudaGraphAddEventWaitNode(graph, pDependencies : Optional[tuple[cudaGraphNod
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
     phGraphNode : :py:obj:`~.cudaGraphNode_t`
         Returns newly created node
@@ -35040,7 +35090,7 @@ def cudaGraphEventWaitNodeGetEvent(node):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
     event_out : :py:obj:`~.cudaEvent_t`
         Pointer to return the event
@@ -35082,7 +35132,7 @@ def cudaGraphEventWaitNodeSetEvent(node, event):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
 
     See Also
@@ -35140,7 +35190,7 @@ def cudaGraphAddExternalSemaphoresSignalNode(graph, pDependencies : Optional[tup
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
     pGraphNode : :py:obj:`~.cudaGraphNode_t`
         Returns newly created node
@@ -35203,7 +35253,7 @@ def cudaGraphExternalSemaphoresSignalNodeGetParams(hNode):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
     params_out : :py:obj:`~.cudaExternalSemaphoreSignalNodeParams`
         Pointer to return the parameters
@@ -35246,7 +35296,7 @@ def cudaGraphExternalSemaphoresSignalNodeSetParams(hNode, nodeParams : Optional[
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
 
     See Also
@@ -35297,7 +35347,7 @@ def cudaGraphAddExternalSemaphoresWaitNode(graph, pDependencies : Optional[tuple
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
     pGraphNode : :py:obj:`~.cudaGraphNode_t`
         Returns newly created node
@@ -35360,7 +35410,7 @@ def cudaGraphExternalSemaphoresWaitNodeGetParams(hNode):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
     params_out : :py:obj:`~.cudaExternalSemaphoreWaitNodeParams`
         Pointer to return the parameters
@@ -35403,7 +35453,7 @@ def cudaGraphExternalSemaphoresWaitNodeSetParams(hNode, nodeParams : Optional[cu
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
 
     See Also
@@ -35454,14 +35504,13 @@ def cudaGraphAddMemAllocNode(graph, pDependencies : Optional[tuple[cudaGraphNode
 
     Allocations which are not freed in the same graph can be freed by:
 
-    - passing the allocation to :py:obj:`~.cudaMemFreeAsync` or
-      :py:obj:`~.cudaMemFree`;
+    - passing the allocation to ``cudaMemFreeAsync`` or ``cudaMemFree``;
 
     - launching a graph with a free node for that allocation; or
 
     - specifying :py:obj:`~.cudaGraphInstantiateFlagAutoFreeOnLaunch`
       during instantiation, which makes each launch behave as though it
-      called :py:obj:`~.cudaMemFreeAsync` for every unfreed allocation.
+      called ``cudaMemFreeAsync`` for every unfreed allocation.
 
     It is not possible to free an allocation in both the owning graph and
     another graph. If the allocation is freed in the same graph, a free
@@ -35493,8 +35542,8 @@ def cudaGraphAddMemAllocNode(graph, pDependencies : Optional[tuple[cudaGraphNode
 
     Returns
     -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorCudartUnloading`, :py:obj:`~.cudaErrorInitializationError`, :py:obj:`~.cudaErrorNotSupported`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorOutOfMemory`
+    :py:obj:`~.cudaError_t`
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorCudartUnloading`, :py:obj:`~.cudaErrorInitializationError`, :py:obj:`~.cudaErrorNotSupported`, :py:obj:`~.cudaErrorInvalidValue`, ``cudaErrorOutOfMemory``
     pGraphNode : :py:obj:`~.cudaGraphNode_t`
         Returns newly created node
 
@@ -35553,7 +35602,7 @@ def cudaGraphMemAllocNodeGetParams(node):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
     params_out : :py:obj:`~.cudaMemAllocNodeParams`
         Pointer to return the parameters
@@ -35625,8 +35674,8 @@ def cudaGraphAddMemFreeNode(graph, pDependencies : Optional[tuple[cudaGraphNode_
 
     Returns
     -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorCudartUnloading`, :py:obj:`~.cudaErrorInitializationError`, :py:obj:`~.cudaErrorNotSupported`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorOutOfMemory`
+    :py:obj:`~.cudaError_t`
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorCudartUnloading`, :py:obj:`~.cudaErrorInitializationError`, :py:obj:`~.cudaErrorNotSupported`, :py:obj:`~.cudaErrorInvalidValue`, ``cudaErrorOutOfMemory``
     pGraphNode : :py:obj:`~.cudaGraphNode_t`
         Returns newly created node
 
@@ -35684,7 +35733,7 @@ def cudaGraphMemFreeNodeGetParams(node):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
     dptr_out : Any
         Pointer to return the device address
@@ -35727,7 +35776,7 @@ def cudaDeviceGraphMemTrim(int device):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
 
     See Also
@@ -35771,7 +35820,7 @@ def cudaDeviceGetGraphMemAttribute(int device, attr not None : cudaGraphMemAttri
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidDevice`
     value : Any
         retrieved value
@@ -35817,7 +35866,7 @@ def cudaDeviceSetGraphMemAttribute(int device, attr not None : cudaGraphMemAttri
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidDevice`
 
     See Also
@@ -35853,7 +35902,7 @@ def cudaGraphClone(originalGraph):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorMemoryAllocation`
     pGraphClone : :py:obj:`~.cudaGraph_t`
         Returns newly created cloned graph
@@ -35907,7 +35956,7 @@ def cudaGraphNodeFindInClone(originalNode, clonedGraph):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
     pNode : :py:obj:`~.cudaGraphNode_t`
         Returns handle to the cloned node
@@ -35955,7 +36004,7 @@ def cudaGraphNodeGetType(node):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
     pType : :py:obj:`~.cudaGraphNodeType`
         Pointer to return the node type
@@ -35996,7 +36045,7 @@ def cudaGraphNodeGetContainingGraph(hNode):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess` :py:obj:`~.cudaErrorInvalidValue`
     phGraph : :py:obj:`~.cudaGraph_t`
         Pointer to return the containing graph
@@ -36038,7 +36087,7 @@ def cudaGraphNodeGetLocalId(hNode):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess` :py:obj:`~.cudaErrorInvalidValue`
     nodeId : unsigned int
         Pointer to return the nodeId
@@ -36076,7 +36125,7 @@ def cudaGraphNodeGetToolsId(hNode):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.CUDA_SUCCESS` :py:obj:`~.cudaErrorInvalidValue`
     \\*toolsNodeId : unsigned long long
         Pointer to return the id used by tools
@@ -36117,7 +36166,7 @@ def cudaGraphGetId(hGraph):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
     graphId : unsigned int
         Pointer to return the graphId
@@ -36159,7 +36208,7 @@ def cudaGraphExecGetId(hGraphExec):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
     graphId : unsigned int
         Pointer to return the graphId
@@ -36206,7 +36255,7 @@ def cudaGraphGetNodes(graph, size_t numNodes = 0):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
     nodes : list[:py:obj:`~.cudaGraphNode_t`]
         Pointer to return the nodes
@@ -36268,7 +36317,7 @@ def cudaGraphGetRootNodes(graph, size_t pNumRootNodes = 0):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
     pRootNodes : list[:py:obj:`~.cudaGraphNode_t`]
         Pointer to return the root nodes
@@ -36336,7 +36385,7 @@ def cudaGraphGetEdges(graph, size_t numEdges = 0):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorLossyQuery`, :py:obj:`~.cudaErrorInvalidValue`
     from : list[:py:obj:`~.cudaGraphNode_t`]
         Location to return edge endpoints
@@ -36431,7 +36480,7 @@ def cudaGraphNodeGetDependencies(node, size_t pNumDependencies = 0):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorLossyQuery`, :py:obj:`~.cudaErrorInvalidValue`
     pDependencies : list[:py:obj:`~.cudaGraphNode_t`]
         Pointer to return the dependencies
@@ -36513,7 +36562,7 @@ def cudaGraphNodeGetDependentNodes(node, size_t pNumDependentNodes = 0):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorLossyQuery`, :py:obj:`~.cudaErrorInvalidValue`
     pDependentNodes : list[:py:obj:`~.cudaGraphNode_t`]
         Pointer to return the dependent nodes
@@ -36596,7 +36645,7 @@ def cudaGraphAddDependencies(graph, from_ : Optional[tuple[cudaGraphNode_t] | li
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
 
     See Also
@@ -36692,7 +36741,7 @@ def cudaGraphRemoveDependencies(graph, from_ : Optional[tuple[cudaGraphNode_t] |
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
 
     See Also
@@ -36775,7 +36824,7 @@ def cudaGraphDestroyNode(node):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
 
     See Also
@@ -36875,7 +36924,7 @@ def cudaGraphInstantiate(graph, unsigned long long flags):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
     pGraphExec : :py:obj:`~.cudaGraphExec_t`
         Returns instantiated graph
@@ -36982,7 +37031,7 @@ def cudaGraphInstantiateWithFlags(graph, unsigned long long flags):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
     pGraphExec : :py:obj:`~.cudaGraphExec_t`
         Returns instantiated graph
@@ -37129,7 +37178,7 @@ def cudaGraphInstantiateWithParams(graph, instantiateParams : Optional[cudaGraph
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
     pGraphExec : :py:obj:`~.cudaGraphExec_t`
         Returns instantiated graph
@@ -37173,7 +37222,7 @@ def cudaGraphExecGetFlags(graphExec):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
     flags : unsigned long long
         Returns the instantiation flags
@@ -37251,7 +37300,7 @@ def cudaGraphExecKernelNodeSetParams(hGraphExec, node, pNodeParams : Optional[cu
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`,
 
     See Also
@@ -37316,7 +37365,7 @@ def cudaGraphExecMemcpyNodeSetParams(hGraphExec, node, pNodeParams : Optional[cu
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`,
 
     See Also
@@ -37385,7 +37434,7 @@ def cudaGraphExecMemcpyNodeSetParams1D(hGraphExec, node, dst, src, size_t count,
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
 
     See Also
@@ -37461,7 +37510,7 @@ def cudaGraphExecMemsetNodeSetParams(hGraphExec, node, pNodeParams : Optional[cu
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`,
 
     See Also
@@ -37516,7 +37565,7 @@ def cudaGraphExecHostNodeSetParams(hGraphExec, node, pNodeParams : Optional[cuda
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`,
 
     See Also
@@ -37579,7 +37628,7 @@ def cudaGraphExecChildGraphNodeSetParams(hGraphExec, node, childGraph):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`,
 
     See Also
@@ -37642,7 +37691,7 @@ def cudaGraphExecEventRecordNodeSetEvent(hGraphExec, hNode, event):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`,
 
     See Also
@@ -37705,7 +37754,7 @@ def cudaGraphExecEventWaitNodeSetEvent(hGraphExec, hNode, event):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`,
 
     See Also
@@ -37772,7 +37821,7 @@ def cudaGraphExecExternalSemaphoresSignalNodeSetParams(hGraphExec, hNode, nodePa
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`,
 
     See Also
@@ -37832,7 +37881,7 @@ def cudaGraphExecExternalSemaphoresWaitNodeSetParams(hGraphExec, hNode, nodePara
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`,
 
     See Also
@@ -37892,7 +37941,7 @@ def cudaGraphNodeSetEnabled(hGraphExec, hNode, unsigned int isEnabled):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`,
 
     See Also
@@ -37947,7 +37996,7 @@ def cudaGraphNodeGetEnabled(hGraphExec, hNode):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`,
     isEnabled : unsigned int
         Location to return the enabled status of the node
@@ -38011,10 +38060,10 @@ def cudaGraphExecUpdate(hGraphExec, hGraph):
         vice-versa.
 
       - If the graph was instantiated with
-        cudaGraphInstantiateFlagUseNodePriority, the priority attribute
-        cannot change. Equality is checked on the originally requested
-        priority values, before they are clamped to the device's supported
-        range.
+        :py:obj:`~.cudaGraphInstantiateFlagUseNodePriority`, the priority
+        attribute cannot change. Equality is checked on the originally
+        requested priority values, before they are clamped to the device's
+        supported range.
 
       - If ``hGraphExec`` was not instantiated for device launch, a node
         whose function originally did not use device-side
@@ -38044,7 +38093,8 @@ def cudaGraphExecUpdate(hGraphExec, hGraph):
     - Additional memcpy node restrictions:
 
       - Changing either the source or destination memory type(i.e.
-        CU_MEMORYTYPE_DEVICE, CU_MEMORYTYPE_ARRAY, etc.) is not supported.
+        :py:obj:`~.CU_MEMORYTYPE_DEVICE`, :py:obj:`~.CU_MEMORYTYPE_ARRAY`,
+        etc.) is not supported.
 
     - Conditional nodes:
 
@@ -38059,8 +38109,9 @@ def cudaGraphExecUpdate(hGraphExec, hGraph):
     Note: The API may add further restrictions in future releases. The
     return code should always be checked.
 
-    cudaGraphExecUpdate sets the result member of ``resultInfo`` to
-    cudaGraphExecUpdateErrorTopologyChanged under the following conditions:
+    :func:`~.cudaGraphExecUpdate` sets the result member of ``resultInfo``
+    to :py:obj:`~.cudaGraphExecUpdateErrorTopologyChanged` under the
+    following conditions:
 
     - The count of nodes directly in ``hGraphExec`` and ``hGraph`` differ,
       in which case resultInfo->errorNode is set to NULL.
@@ -38080,43 +38131,47 @@ def cudaGraphExecUpdate(hGraphExec, hGraph):
       not match when the nodes are already paired based on other edges
       examined in the graph.
 
-    cudaGraphExecUpdate sets ``the`` result member of ``resultInfo`` to:
+    :func:`~.cudaGraphExecUpdate` sets the result member of
+    ``resultInfo`` to:
 
-    - cudaGraphExecUpdateError if passed an invalid value.
+    - :py:obj:`~.cudaGraphExecUpdateError` if passed an invalid value.
 
-    - cudaGraphExecUpdateErrorTopologyChanged if the graph topology changed
+    - :py:obj:`~.cudaGraphExecUpdateErrorTopologyChanged` if the graph
+      topology changed
 
-    - cudaGraphExecUpdateErrorNodeTypeChanged if the type of a node
-      changed, in which case ``hErrorNode_out`` is set to the node from
-      ``hGraph``.
+    - :py:obj:`~.cudaGraphExecUpdateErrorNodeTypeChanged` if the type of a
+      node changed, in which case ``hErrorNode_out`` is set to the node
+      from ``hGraph``.
 
-    - cudaGraphExecUpdateErrorFunctionChanged if the function of a kernel
-      node changed (CUDA driver < 11.2)
+    - :py:obj:`~.cudaGraphExecUpdateErrorFunctionChanged` if the function
+      of a kernel node changed (CUDA driver < 11.2)
 
-    - cudaGraphExecUpdateErrorUnsupportedFunctionChange if the func field
-      of a kernel changed in an unsupported way(see note above), in which
-      case ``hErrorNode_out`` is set to the node from ``hGraph``
+    - :py:obj:`~.cudaGraphExecUpdateErrorUnsupportedFunctionChange` if the
+      func field of a kernel changed in an unsupported way(see note above),
+      in which case ``hErrorNode_out`` is set to the node from ``hGraph``
 
-    - cudaGraphExecUpdateErrorParametersChanged if any parameters to a node
-      changed in a way that is not supported, in which case
-      ``hErrorNode_out`` is set to the node from ``hGraph``
+    - :py:obj:`~.cudaGraphExecUpdateErrorParametersChanged` if any
+      parameters to a node changed in a way that is not supported, in which
+      case ``hErrorNode_out`` is set to the node from ``hGraph``
 
-    - cudaGraphExecUpdateErrorAttributesChanged if any attributes of a node
-      changed in a way that is not supported, in which case
-      ``hErrorNode_out`` is set to the node from ``hGraph``
+    - :py:obj:`~.cudaGraphExecUpdateErrorAttributesChanged` if any
+      attributes of a node changed in a way that is not supported, in which
+      case ``hErrorNode_out`` is set to the node from ``hGraph``
 
-    - cudaGraphExecUpdateErrorNotSupported if something about a node is
-      unsupported, like the node's type or configuration, in which case
-      ``hErrorNode_out`` is set to the node from ``hGraph``
+    - :py:obj:`~.cudaGraphExecUpdateErrorNotSupported` if something about a
+      node is unsupported, like the node's type or configuration, in which
+      case ``hErrorNode_out`` is set to the node from ``hGraph``
 
     If the update fails for a reason not listed above, the result member of
-    ``resultInfo`` will be set to cudaGraphExecUpdateError. If the update
-    succeeds, the result member will be set to cudaGraphExecUpdateSuccess.
+    ``resultInfo`` will be set to :py:obj:`~.cudaGraphExecUpdateError`. If
+    the update succeeds, the result member will be set to
+    :py:obj:`~.cudaGraphExecUpdateSuccess`.
 
-    cudaGraphExecUpdate returns cudaSuccess when the updated was performed
-    successfully. It returns cudaErrorGraphExecUpdateFailure if the graph
-    update was not performed because it included changes which violated
-    constraints specific to instantiated graph update.
+    :func:`~.cudaGraphExecUpdate` returns :py:obj:`~.cudaSuccess` when the
+    update was performed successfully. It returns
+    :py:obj:`~.cudaErrorGraphExecUpdateFailure` if the graph update was not
+    performed because it included changes which violated constraints
+    specific to instantiated graph update.
 
     Parameters
     ----------
@@ -38127,7 +38182,7 @@ def cudaGraphExecUpdate(hGraphExec, hGraph):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorGraphExecUpdateFailure`,
     resultInfo : :py:obj:`~.cudaGraphExecUpdateResultInfo`
         the error info structure
@@ -38181,7 +38236,7 @@ def cudaGraphUpload(graphExec, stream):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`,
 
     See Also
@@ -38235,7 +38290,7 @@ def cudaGraphLaunch(graphExec, stream):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
 
     See Also
@@ -38278,7 +38333,7 @@ def cudaGraphExecDestroy(graphExec):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
 
     See Also
@@ -38313,7 +38368,7 @@ def cudaGraphDestroy(graph):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
 
     See Also
@@ -38352,12 +38407,12 @@ def cudaGraphDebugDotPrint(graph, char* path, unsigned int flags):
     path : bytes
         The path to write the DOT file to
     flags : unsigned int
-        Flags from cudaGraphDebugDotFlags for specifying which additional
-        node information to write
+        Flags from :py:obj:`~.cudaGraphDebugDotFlags` for specifying which
+        additional node information to write
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorOperatingSystem`
     """
     cdef cyruntime.cudaGraph_t cygraph
@@ -38408,7 +38463,7 @@ def cudaUserObjectCreate(ptr, destroy, unsigned int initialRefcount, unsigned in
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
     object_out : :py:obj:`~.cudaUserObject_t`
         Location to return the user object handle
@@ -38458,7 +38513,7 @@ def cudaUserObjectRetain(object, unsigned int count):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
 
     See Also
@@ -38503,7 +38558,7 @@ def cudaUserObjectRelease(object, unsigned int count):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
 
     See Also
@@ -38551,7 +38606,7 @@ def cudaGraphRetainUserObject(graph, object, unsigned int count, unsigned int fl
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
 
     See Also
@@ -38602,7 +38657,7 @@ def cudaGraphReleaseUserObject(graph, object, unsigned int count):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`
 
     See Also
@@ -38671,7 +38726,7 @@ def cudaGraphAddNode(graph, pDependencies : Optional[tuple[cudaGraphNode_t] | li
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidDeviceFunction`, :py:obj:`~.cudaErrorNotSupported`
     pGraphNode : :py:obj:`~.cudaGraphNode_t`
         Returns newly created node
@@ -38738,7 +38793,8 @@ def cudaGraphNodeSetParams(node, nodeParams : Optional[cudaGraphNodeParams]):
     (reserved, padding) zeroed.
 
     Modifying parameters is not supported for node types
-    cudaGraphNodeTypeMemAlloc and cudaGraphNodeTypeMemFree.
+    :py:obj:`~.cudaGraphNodeTypeMemAlloc` and
+    :py:obj:`~.cudaGraphNodeTypeMemFree`.
 
     Parameters
     ----------
@@ -38749,7 +38805,7 @@ def cudaGraphNodeSetParams(node, nodeParams : Optional[cudaGraphNodeParams]):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidDeviceFunction`, :py:obj:`~.cudaErrorNotSupported`
 
     See Also
@@ -38784,12 +38840,12 @@ def cudaGraphNodeGetParams(node):
     modified.
 
     The returned parameters are a description of the node, but may not be
-    identical to the struct provided at creation and may not be suitable
-    for direct creation of identical nodes. This is because parameters may
-    be partially unspecified and filled in by the driver at creation, may
-    reference non-copyable handles, or may describe ownership semantics or
-    other parameters that govern behavior of node creation but are not part
-    of the final functional descriptor.
+    identical to the struct provided at creation and may not be
+    suitable for direct creation of identical nodes. This is because
+    parameters may be partially unspecified and filled in by the driver at
+    creation, may reference non-copyable handles, or may describe ownership
+    semantics or other parameters that govern behavior of node creation but
+    are not part of the final functional descriptor.
 
     Parameters
     ----------
@@ -38798,7 +38854,7 @@ def cudaGraphNodeGetParams(node):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorNotSupported`
     nodeParams : :py:obj:`~.cudaGraphNodeParams`
         Pointer to return the parameters
@@ -38854,7 +38910,7 @@ def cudaGraphExecNodeSetParams(graphExec, node, nodeParams : Optional[cudaGraphN
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidDeviceFunction`, :py:obj:`~.cudaErrorNotSupported`
 
     See Also
@@ -38903,21 +38959,21 @@ def cudaGraphConditionalHandleCreate(graph, unsigned int defaultLaunchValue, uns
         Graph which will contain the conditional node using this handle.
     defaultLaunchValue : unsigned int
         Optional initial value for the conditional variable. Applied at the
-        beginning of each graph execution if cudaGraphCondAssignDefault is
-        set in ``flags``.
+        beginning of each graph execution if
+        :py:obj:`~.cudaGraphCondAssignDefault` is set in ``flags``.
     flags : unsigned int
-        Currently must be cudaGraphCondAssignDefault or 0.
+        Currently must be :py:obj:`~.cudaGraphCondAssignDefault` or 0.
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
     pHandle_out : :py:obj:`~.cudaGraphConditionalHandle`
         Pointer used to return the handle to the caller.
 
     See Also
     --------
-    :py:obj:`~.cuGraphAddNode`,
+    :func:`~.cuGraphAddNode`,
     """
     cdef cyruntime.cudaGraph_t cygraph
     if graph is None:
@@ -38958,21 +39014,21 @@ def cudaGraphConditionalHandleCreate_v2(graph, ctx, unsigned int defaultLaunchVa
         If NULL, current context will be used.
     defaultLaunchValue : unsigned int
         Optional initial value for the conditional variable. Applied at the
-        beginning of each graph execution if cudaGraphCondAssignDefault is
-        set in ``flags``.
+        beginning of each graph execution if
+        :py:obj:`~.cudaGraphCondAssignDefault` is set in ``flags``.
     flags : unsigned int
-        Currently must be cudaGraphCondAssignDefault or 0.
+        Currently must be :py:obj:`~.cudaGraphCondAssignDefault` or 0.
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.CUDA_SUCCESS`, :py:obj:`~.CUDA_ERROR_INVALID_VALUE`, :py:obj:`~.CUDA_ERROR_NOT_SUPPORTED`
     pHandle_out : :py:obj:`~.cudaGraphConditionalHandle`
         Pointer used to return the handle to the caller.
 
     See Also
     --------
-    :py:obj:`~.cuGraphAddNode`,
+    :func:`~.cuGraphAddNode`,
     """
     cdef cyruntime.cudaExecutionContext_t cyctx
     if ctx is None:
@@ -39042,7 +39098,7 @@ def cudaGetDriverEntryPoint(char* symbol, unsigned long long flags):
 
     - :py:obj:`~.cudaDriverEntryPointVersionNotSufficent` - The requested
       symbol was found but is not supported by the current runtime version
-      (CUDART_VERSION)
+      (:py:obj:`~.CUDART_VERSION`)
 
     The requested flags can be:
 
@@ -39066,16 +39122,17 @@ def cudaGetDriverEntryPoint(char* symbol, unsigned long long flags):
     ----------
     symbol : bytes
         The base name of the driver API function to look for. As an
-        example, for the driver API :py:obj:`~.cuMemAlloc_v2`, ``symbol``
-        would be cuMemAlloc. Note that the API will use the CUDA runtime
-        version to return the address to the most recent ABI compatible
-        driver symbol, :py:obj:`~.cuMemAlloc` or :py:obj:`~.cuMemAlloc_v2`.
+        example, for the driver API ``cuMemAlloc_v2``, ``symbol`` would
+        be ``cuMemAlloc``. Note that the API will use the CUDA
+        runtime version to return the address to the most recent ABI
+        compatible driver symbol, :func:`~.cuMemAlloc` or
+        ``cuMemAlloc_v2``.
     flags : unsigned long long
         Flags to specify search options.
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorNotSupported`
     funcPtr : Any
         Location to return the function pointer to the requested driver
@@ -39087,11 +39144,11 @@ def cudaGetDriverEntryPoint(char* symbol, unsigned long long flags):
 
     See Also
     --------
-    :py:obj:`~.cuGetProcAddress`
+    :func:`~.cuGetProcAddress`
 
     Notes
     -----
-    This API is deprecated and :py:obj:`~.cudaGetDriverEntryPointByVersion` (with a hardcoded :py:obj:`~.cudaVersion`) should be used instead.
+    This API is deprecated and :py:obj:`~.cudaGetDriverEntryPointByVersion` (with a hardcoded ``cudaVersion``) should be used instead.
     """
     cdef void_ptr funcPtr = 0
     cdef cyruntime.cudaDriverEntryPointQueryResult driverStatus
@@ -39178,8 +39235,8 @@ def cudaGetDriverEntryPointByVersion(char* symbol, unsigned int cudaVersion, uns
     ----------
     symbol : bytes
         The base name of the driver API function to look for. As an
-        example, for the driver API :py:obj:`~.cuMemAlloc_v2`, ``symbol``
-        would be cuMemAlloc.
+        example, for the driver API ``cuMemAlloc_v2``, ``symbol`` would
+        be ``cuMemAlloc``.
     cudaVersion : unsigned int
         The CUDA version to look for the requested driver symbol
     flags : unsigned long long
@@ -39187,7 +39244,7 @@ def cudaGetDriverEntryPointByVersion(char* symbol, unsigned int cudaVersion, uns
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorNotSupported`
     funcPtr : Any
         Location to return the function pointer to the requested driver
@@ -39199,7 +39256,7 @@ def cudaGetDriverEntryPointByVersion(char* symbol, unsigned int cudaVersion, uns
 
     See Also
     --------
-    :py:obj:`~.cuGetProcAddress`
+    :func:`~.cuGetProcAddress`
     """
     cdef void_ptr funcPtr = 0
     cdef cyruntime.cudaDriverEntryPointQueryResult driverStatus
@@ -39270,14 +39327,14 @@ def cudaLibraryLoadData(code, jitOptions : Optional[tuple[cudaJitOption] | list[
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorMemoryAllocation`, :py:obj:`~.cudaErrorInitializationError`, :py:obj:`~.cudaErrorCudartUnloading`, :py:obj:`~.cudaErrorInvalidPtx`, :py:obj:`~.cudaErrorUnsupportedPtxVersion`, :py:obj:`~.cudaErrorNoKernelImageForDevice`, :py:obj:`~.cudaErrorSharedObjectSymbolNotFound`, :py:obj:`~.cudaErrorSharedObjectInitFailed`, :py:obj:`~.cudaErrorJitCompilerNotFound`
     library : :py:obj:`~.cudaLibrary_t`
         Returned library
 
     See Also
     --------
-    :py:obj:`~.cudaLibraryLoadFromFile`, :py:obj:`~.cudaLibraryUnload`, :py:obj:`~.cuLibraryLoadData`
+    :py:obj:`~.cudaLibraryLoadFromFile`, :py:obj:`~.cudaLibraryUnload`, :func:`~.cuLibraryLoadData`
     """
     libraryOptionValues = [] if libraryOptionValues is None else libraryOptionValues
     libraryOptions = [] if libraryOptions is None else libraryOptions
@@ -39371,14 +39428,14 @@ def cudaLibraryLoadFromFile(char* fileName, jitOptions : Optional[tuple[cudaJitO
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorMemoryAllocation`, :py:obj:`~.cudaErrorInitializationError`, :py:obj:`~.cudaErrorCudartUnloading`, :py:obj:`~.cudaErrorInvalidPtx`, :py:obj:`~.cudaErrorUnsupportedPtxVersion`, :py:obj:`~.cudaErrorNoKernelImageForDevice`, :py:obj:`~.cudaErrorSharedObjectSymbolNotFound`, :py:obj:`~.cudaErrorSharedObjectInitFailed`, :py:obj:`~.cudaErrorJitCompilerNotFound`
     library : :py:obj:`~.cudaLibrary_t`
         Returned library
 
     See Also
     --------
-    :py:obj:`~.cudaLibraryLoadData`, :py:obj:`~.cudaLibraryUnload`, :py:obj:`~.cuLibraryLoadFromFile`
+    :py:obj:`~.cudaLibraryLoadData`, :py:obj:`~.cudaLibraryUnload`, :func:`~.cuLibraryLoadFromFile`
     """
     libraryOptionValues = [] if libraryOptionValues is None else libraryOptionValues
     libraryOptions = [] if libraryOptions is None else libraryOptions
@@ -39423,12 +39480,12 @@ def cudaLibraryUnload(library):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorCudartUnloading`, :py:obj:`~.cudaErrorInitializationError`, :py:obj:`~.cudaErrorInvalidValue`
 
     See Also
     --------
-    :py:obj:`~.cudaLibraryLoadData`, :py:obj:`~.cudaLibraryLoadFromFile`, :py:obj:`~.cuLibraryUnload`
+    :py:obj:`~.cudaLibraryLoadData`, :py:obj:`~.cudaLibraryLoadFromFile`, :func:`~.cuLibraryUnload`
     """
     cdef cyruntime.cudaLibrary_t cylibrary
     if library is None:
@@ -39462,14 +39519,14 @@ def cudaLibraryGetKernel(library, char* name):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorCudartUnloading`, :py:obj:`~.cudaErrorInitializationError`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidResourceHandle`, :py:obj:`~.cudaErrorSymbolNotFound`
     pKernel : :py:obj:`~.cudaKernel_t`
         Returned kernel handle
 
     See Also
     --------
-    :py:obj:`~.cudaLibraryLoadData`, :py:obj:`~.cudaLibraryLoadFromFile`, :py:obj:`~.cudaLibraryUnload`, :py:obj:`~.cuLibraryGetKernel`
+    :py:obj:`~.cudaLibraryLoadData`, :py:obj:`~.cudaLibraryLoadFromFile`, :py:obj:`~.cudaLibraryUnload`, :func:`~.cuLibraryGetKernel`
     """
     cdef cyruntime.cudaLibrary_t cylibrary
     if library is None:
@@ -39512,7 +39569,7 @@ def cudaLibraryGetGlobal(library, char* name):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorCudartUnloading`, :py:obj:`~.cudaErrorInitializationError`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidResourceHandle`, :py:obj:`~.cudaErrorSymbolNotFound` :py:obj:`~.cudaErrorDeviceUninitialized`, :py:obj:`~.cudaErrorContextIsDestroyed`
     dptr : Any
         Returned global device pointer for the requested library
@@ -39521,7 +39578,7 @@ def cudaLibraryGetGlobal(library, char* name):
 
     See Also
     --------
-    :py:obj:`~.cudaLibraryLoadData`, :py:obj:`~.cudaLibraryLoadFromFile`, :py:obj:`~.cudaLibraryUnload`, :py:obj:`~.cudaLibraryGetManaged`, :py:obj:`~.cuLibraryGetGlobal`
+    :py:obj:`~.cudaLibraryLoadData`, :py:obj:`~.cudaLibraryLoadFromFile`, :py:obj:`~.cudaLibraryUnload`, :py:obj:`~.cudaLibraryGetManaged`, :func:`~.cuLibraryGetGlobal`
     """
     cdef cyruntime.cudaLibrary_t cylibrary
     if library is None:
@@ -39566,7 +39623,7 @@ def cudaLibraryGetManaged(library, char* name):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorCudartUnloading`, :py:obj:`~.cudaErrorInitializationError`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidResourceHandle`, :py:obj:`~.cudaErrorSymbolNotFound`
     dptr : Any
         Returned pointer to the managed memory
@@ -39575,7 +39632,7 @@ def cudaLibraryGetManaged(library, char* name):
 
     See Also
     --------
-    :py:obj:`~.cudaLibraryLoadData`, :py:obj:`~.cudaLibraryLoadFromFile`, :py:obj:`~.cudaLibraryUnload`, :py:obj:`~.cudaLibraryGetGlobal`, :py:obj:`~.cuLibraryGetManaged`
+    :py:obj:`~.cudaLibraryLoadData`, :py:obj:`~.cudaLibraryLoadFromFile`, :py:obj:`~.cudaLibraryUnload`, :py:obj:`~.cudaLibraryGetGlobal`, :func:`~.cuLibraryGetManaged`
     """
     cdef cyruntime.cudaLibrary_t cylibrary
     if library is None:
@@ -39616,14 +39673,14 @@ def cudaLibraryGetUnifiedFunction(library, char* symbol):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorCudartUnloading`, :py:obj:`~.cudaErrorInitializationError`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidResourceHandle`, :py:obj:`~.cudaErrorSymbolNotFound`
     fptr : Any
         Returned pointer to a unified function
 
     See Also
     --------
-    :py:obj:`~.cudaLibraryLoadData`, :py:obj:`~.cudaLibraryLoadFromFile`, :py:obj:`~.cudaLibraryUnload`, :py:obj:`~.cuLibraryGetUnifiedFunction`
+    :py:obj:`~.cudaLibraryLoadData`, :py:obj:`~.cudaLibraryLoadFromFile`, :py:obj:`~.cudaLibraryUnload`, :func:`~.cuLibraryGetUnifiedFunction`
     """
     cdef cyruntime.cudaLibrary_t cylibrary
     if library is None:
@@ -39656,14 +39713,14 @@ def cudaLibraryGetKernelCount(lib):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorCudartUnloading`, :py:obj:`~.cudaErrorInitializationError`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidResourceHandle`
     count : unsigned int
         Number of kernels found within the library
 
     See Also
     --------
-    :py:obj:`~.cudaLibraryEnumerateKernels`, :py:obj:`~.cudaLibraryLoadFromFile`, :py:obj:`~.cudaLibraryLoadData`, :py:obj:`~.cuLibraryGetKernelCount`
+    :py:obj:`~.cudaLibraryEnumerateKernels`, :py:obj:`~.cudaLibraryLoadFromFile`, :py:obj:`~.cudaLibraryLoadData`, :func:`~.cuLibraryGetKernelCount`
     """
     cdef cyruntime.cudaLibrary_t cylib
     if lib is None:
@@ -39700,14 +39757,14 @@ def cudaLibraryEnumerateKernels(unsigned int numKernels, lib):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorCudartUnloading`, :py:obj:`~.cudaErrorInitializationError`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidResourceHandle`
     kernels : list[:py:obj:`~.cudaKernel_t`]
         Buffer where the kernel handles are returned to
 
     See Also
     --------
-    :py:obj:`~.cudaLibraryGetKernelCount`, :py:obj:`~.cuLibraryEnumerateKernels`
+    :py:obj:`~.cudaLibraryGetKernelCount`, :func:`~.cuLibraryEnumerateKernels`
     """
     cdef cyruntime.cudaLibrary_t cylib
     if lib is None:
@@ -39762,7 +39819,7 @@ def cudaKernelSetAttributeForDevice(kernel, attr not None : cudaFuncAttribute, i
     - :py:obj:`~.cudaFuncAttributeMaxDynamicSharedMemorySize` - The
       requested maximum size in bytes of dynamically-allocated shared
       memory. The sum of this value and the function attribute
-      :py:obj:`~.sharedSizeBytes` cannot exceed the device attribute
+      ``sharedSizeBytes`` cannot exceed the device attribute
       :py:obj:`~.cudaDevAttrMaxSharedMemoryPerBlockOptin`. The maximal size
       of requestable dynamic shared memory may differ by GPU architecture.
 
@@ -39779,21 +39836,21 @@ def cudaKernelSetAttributeForDevice(kernel, attr not None : cudaFuncAttribute, i
       either all be 0 or all be positive. The validity of the cluster
       dimensions is checked at launch time. If the value is set during
       compile time, it cannot be set at runtime. Setting it at runtime will
-      return cudaErrorNotPermitted.
+      return :py:obj:`~.cudaErrorNotPermitted`.
 
     - :py:obj:`~.cudaFuncAttributeRequiredClusterHeight`: The required
       cluster height in blocks. The width, height, and depth values must
       either all be 0 or all be positive. The validity of the cluster
       dimensions is checked at launch time. If the value is set during
       compile time, it cannot be set at runtime. Setting it at runtime will
-      return cudaErrorNotPermitted.
+      return :py:obj:`~.cudaErrorNotPermitted`.
 
     - :py:obj:`~.cudaFuncAttributeRequiredClusterDepth`: The required
       cluster depth in blocks. The width, height, and depth values must
       either all be 0 or all be positive. The validity of the cluster
       dimensions is checked at launch time. If the value is set during
       compile time, it cannot be set at runtime. Setting it at runtime will
-      return cudaErrorNotPermitted.
+      return :py:obj:`~.cudaErrorNotPermitted`.
 
     - :py:obj:`~.cudaFuncAttributeNonPortableClusterSizeAllowed`: Indicates
       whether the function can be launched with non-portable cluster size.
@@ -39801,7 +39858,7 @@ def cudaKernelSetAttributeForDevice(kernel, attr not None : cudaFuncAttribute, i
 
     - :py:obj:`~.cudaFuncAttributeClusterSchedulingPolicyPreference`: The
       block scheduling policy of a function. The value type is
-      cudaClusterSchedulingPolicy.
+      :py:obj:`~.cudaClusterSchedulingPolicy`.
 
     Parameters
     ----------
@@ -39816,12 +39873,12 @@ def cudaKernelSetAttributeForDevice(kernel, attr not None : cudaFuncAttribute, i
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidDeviceFunction`, :py:obj:`~.cudaErrorInvalidValue`
 
     See Also
     --------
-    :py:obj:`~.cudaLibraryLoadData`, :py:obj:`~.cudaLibraryLoadFromFile`, :py:obj:`~.cudaLibraryUnload`, :py:obj:`~.cudaLibraryGetKernel`, :py:obj:`~.cudaLaunchKernel`, :py:obj:`~.cudaFuncSetAttribute`, :py:obj:`~.cuKernelSetAttribute`
+    :py:obj:`~.cudaLibraryLoadData`, :py:obj:`~.cudaLibraryLoadFromFile`, :py:obj:`~.cudaLibraryUnload`, :py:obj:`~.cudaLibraryGetKernel`, :py:obj:`~.cudaLaunchKernel`, :py:obj:`~.cudaFuncSetAttribute`, :func:`~.cuKernelSetAttribute`
 
     Notes
     -----
@@ -39862,14 +39919,14 @@ def cudaDeviceGetDevResource(int device, typename not None : cudaDevResourceType
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorNotPermitted`, :py:obj:`~.cudaErrorInvalidDevice`, :py:obj:`~.cudaErrorInvalidResourceType`, :py:obj:`~.cudaErrorNotSupported`, :py:obj:`~.cudaErrorCudartUnloading`, :py:obj:`~.cudaErrorInitializationError`
     resource : :py:obj:`~.cudaDevResource`
-        Output pointer to a cudaDevResource structure
+        Output pointer to a :py:obj:`~.cudaDevResource` structure
 
     See Also
     --------
-    :py:obj:`~.cuDeviceGetDevResource`, :py:obj:`~.cudaExecutionCtxGetDevResource`, :py:obj:`~.cudaDevSmResourceSplit`, :py:obj:`~.cudaDevResourceGenerateDesc`
+    :func:`~.cuDeviceGetDevResource`, :py:obj:`~.cudaExecutionCtxGetDevResource`, :py:obj:`~.cudaDevSmResourceSplit`, :py:obj:`~.cudaDevResourceGenerateDesc`
     """
     cdef cudaDevResource resource = cudaDevResource()
     cdef cyruntime.cudaDevResourceType cytypename = int(typename)
@@ -39884,17 +39941,17 @@ def cudaDeviceGetDevResource(int device, typename not None : cudaDevResourceType
 
 @cython.embedsignature(True)
 def cudaDevSmResourceSplitByCount(unsigned int nbGroups, input_ : Optional[cudaDevResource], unsigned int flags, unsigned int minCount):
-    """ Splits ``cudaDevResourceTypeSm`` resources.
-
-    Splits ``cudaDevResourceTypeSm`` resources into ``nbGroups``, adhering
-    to the minimum SM count specified in ``minCount`` and the usage flags
-    in ``flags``. If ``result`` is NULL, the API simulates a split and
-    provides the amount of groups that would be created in ``nbGroups``.
-    Otherwise, ``nbGroups`` must point to the amount of elements in
-    ``result`` and on return, the API will overwrite ``nbGroups`` with the
-    amount actually created. The groups are written to the array in
-    ``result``. ``nbGroups`` can be less than the total amount if a smaller
-    number of groups is needed.
+    """ Splits :py:obj:`~.cudaDevResourceTypeSm` resources.
+
+    Splits :py:obj:`~.cudaDevResourceTypeSm` resources into ``nbGroups``,
+    adhering to the minimum SM count specified in ``minCount`` and the
+    usage flags in ``flags``. If ``result`` is NULL, the API simulates a
+    split and provides the amount of groups that would be created in
+    ``nbGroups``. Otherwise, ``nbGroups`` must point to the amount of
+    elements in ``result`` and on return, the API will overwrite
+    ``nbGroups`` with the amount actually created. The groups are written
+    to the array in ``result``. ``nbGroups`` can be less than the total
+    amount if a smaller number of groups is needed.
 
     This API is used to spatially partition the input resource. The input
     resource needs to come from one of
@@ -39921,12 +39978,13 @@ def cudaDevSmResourceSplitByCount(unsigned int nbGroups, input_ : Optional[cudaD
 
     The following flags are supported:
 
-    - ``cudaDevSmResourceSplitIgnoreSmCoscheduling`` : Lower the minimum SM
-      count and alignment, and treat each SM independent of its hierarchy.
-      This allows more fine grained partitions but at the cost of advanced
-      features (such as large clusters on compute capability 9.0+).
+    - :py:obj:`~.cudaDevSmResourceSplitIgnoreSmCoscheduling` : Lower the
+      minimum SM count and alignment, and treat each SM independent of its
+      hierarchy. This allows more fine grained partitions but at the cost
+      of advanced features (such as large clusters on compute capability
+      9.0+).
 
-    - ``cudaDevSmResourceSplitMaxPotentialClusterSize`` : Compute
+    - :py:obj:`~.cudaDevSmResourceSplitMaxPotentialClusterSize` : Compute
       Capability 9.0+ only. Attempt to create groups that may allow for
       maximally sized thread clusters. This can be queried post green
       context creation using
@@ -39935,13 +39993,13 @@ def cudaDevSmResourceSplitByCount(unsigned int nbGroups, input_ : Optional[cudaD
     A successful API call must either have:
 
     - A valid array of ``result`` pointers of size passed in ``nbGroups``,
-      with ``input`` of type ``cudaDevResourceTypeSm``. Value of
+      with ``input`` of type :py:obj:`~.cudaDevResourceTypeSm`. Value of
       ``minCount`` must be between 0 and the SM count specified in
       ``input``. ``remaining`` may be NULL.
 
     - NULL passed in for ``result``, with a valid integer pointer in
-      ``nbGroups`` and ``input`` of type ``cudaDevResourceTypeSm``. Value
-      of ``minCount`` must be between 0 and the SM count specified in
+      ``nbGroups`` and ``input`` of type :py:obj:`~.cudaDevResourceTypeSm`.
+      Value of ``minCount`` must be between 0 and the SM count specified in
       ``input``. ``remaining`` may be NULL. This queries the number of
       groups that would be created by the API.
 
@@ -39954,7 +40012,7 @@ def cudaDevSmResourceSplitByCount(unsigned int nbGroups, input_ : Optional[cudaD
         should be created as described below.
     input : :py:obj:`~.cudaDevResource`
         Input SM resource to be split. Must be a valid
-        ``cudaDevSmResource`` resource.
+        :py:obj:`~.cudaDevSmResource` resource.
     flags : unsigned int
         Flags specifying how these partitions are used or which constraints
         to abide by when splitting the input. Zero is valid for default
@@ -39964,11 +40022,11 @@ def cudaDevSmResourceSplitByCount(unsigned int nbGroups, input_ : Optional[cudaD
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorNotPermitted`, :py:obj:`~.cudaErrorInvalidResourceType`, :py:obj:`~.cudaErrorInvalidResourceConfiguration`, :py:obj:`~.cudaErrorNotSupported`, :py:obj:`~.cudaErrorCudartUnloading`, :py:obj:`~.cudaErrorInitializationError`
     result : list[:py:obj:`~.cudaDevResource`]
-        Output array of ``cudaDevResource`` resources. Can be NULL to query
-        the number of groups.
+        Output array of :py:obj:`~.cudaDevResource` resources. Can be NULL
+        to query the number of groups.
     nbGroups : unsigned int
         This is a pointer, specifying the number of groups that would be or
         should be created as described below.
@@ -39979,7 +40037,7 @@ def cudaDevSmResourceSplitByCount(unsigned int nbGroups, input_ : Optional[cudaD
 
     See Also
     --------
-    :py:obj:`~.cuDevSmResourceSplitByCount`, :py:obj:`~.cudaDeviceGetDevResource`, :py:obj:`~.cudaExecutionCtxGetDevResource`, :py:obj:`~.cudaDevResourceGenerateDesc`
+    :func:`~.cuDevSmResourceSplitByCount`, :py:obj:`~.cudaDeviceGetDevResource`, :py:obj:`~.cudaExecutionCtxGetDevResource`, :py:obj:`~.cudaDevResourceGenerateDesc`
     """
     cdef cyruntime.cudaDevResource* cyresult = NULL
     pyresult = [cudaDevResource() for idx in range(nbGroups)]
@@ -40006,7 +40064,7 @@ def cudaDevSmResourceSplitByCount(unsigned int nbGroups, input_ : Optional[cudaD
 
 @cython.embedsignature(True)
 def cudaDevSmResourceSplit(unsigned int nbGroups, input_ : Optional[cudaDevResource], unsigned int flags, groupParams : Optional[tuple[cudaDevSmResourceGroupParams] | list[cudaDevSmResourceGroupParams]]):
-    """ Splits a ``cudaDevResourceTypeSm`` resource into structured groups.
+    """ Splits a :py:obj:`~.cudaDevResourceTypeSm` resource into structured groups.
 
     This API will split a resource of :py:obj:`~.cudaDevResourceTypeSm`
     into ``nbGroups`` structured device resource groups (the ``result``
@@ -40040,9 +40098,9 @@ def cudaDevSmResourceSplit(unsigned int nbGroups, input_ : Optional[cudaDevResou
 
     For a valid call:
 
-    - ``result`` should point to a ``cudaDevResource`` array of size
-      ``nbGroups``, or alternatively, may be NULL, if the developer wishes
-      for only the groupParams entries to be updated
+    - ``result`` should point to a :py:obj:`~.cudaDevResource` array of
+      size ``nbGroups``, or alternatively, may be NULL, if the developer
+      wishes for only the groupParams entries to be updated
 
     - ``input`` should be a valid :py:obj:`~.cudaDevResourceTypeSm`
       resource that originates from querying the execution context, or
@@ -40079,8 +40137,8 @@ def cudaDevSmResourceSplit(unsigned int nbGroups, input_ : Optional[cudaDevResou
 
       - ``flags:``
 
-        - ``cudaDevSmResourceGroupBackfill:`` lets ``smCount`` be a non-
-          multiple of ``coscheduledSmCount``, filling the difference
+        - :py:obj:`~.cudaDevSmResourceGroupBackfill`: lets ``smCount`` be a
+          non-multiple of ``coscheduledSmCount``, filling the difference
           between SM count and already assigned co-scheduled groupings with
           other SMs. This lets any resulting group behave similar to the
           ``remainder`` group for example.
@@ -40107,7 +40165,7 @@ def cudaDevSmResourceSplit(unsigned int nbGroups, input_ : Optional[cudaDevResou
       always need to adhere to a structure of coscheduledSmCount (even if
       its just 2), and therefore must always have enough coscheduled SMs to
       cover that requirement (even with the
-      ``cudaDevSmResourceGroupBackfill`` flag enabled).
+      :py:obj:`~.cudaDevSmResourceGroupBackfill` flag enabled).
 
     Splitting an input into N groups, can be accomplished by repeatedly
     splitting off 1 group and re-splitting the remainder (a bisect
@@ -40120,7 +40178,7 @@ def cudaDevSmResourceSplit(unsigned int nbGroups, input_ : Optional[cudaDevResou
         Specifies the number of groups in ``result`` and ``groupParams``
     input : :py:obj:`~.cudaDevResource`
         Input SM resource to be split. Must be a valid
-        ``cudaDevResourceTypeSm`` resource.
+        :py:obj:`~.cudaDevResourceTypeSm` resource.
     flags : unsigned int
         Flags specifying how the API should behave. The value should be 0
         for now.
@@ -40130,10 +40188,10 @@ def cudaDevSmResourceSplit(unsigned int nbGroups, input_ : Optional[cudaDevResou
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorNotPermitted`, :py:obj:`~.cudaErrorInvalidResourceType`, :py:obj:`~.cudaErrorInvalidResourceConfiguration`, :py:obj:`~.cudaErrorNotSupported`, :py:obj:`~.cudaErrorCudartUnloading`, :py:obj:`~.cudaErrorInitializationError`
     result : list[:py:obj:`~.cudaDevResource`]
-        Output array of ``cudaDevResource`` resources. Can be NULL,
+        Output array of :py:obj:`~.cudaDevResource` resources. Can be NULL,
         alongside an smCount of 0, for discovery purpose.
     remainder : :py:obj:`~.cudaDevResource`
         If splitting the input resource leaves any SMs, the remainder is
@@ -40141,7 +40199,7 @@ def cudaDevSmResourceSplit(unsigned int nbGroups, input_ : Optional[cudaDevResou
 
     See Also
     --------
-    :py:obj:`~.cuDevSmResourceSplit`, :py:obj:`~.cudaDeviceGetDevResource`, :py:obj:`~.cudaExecutionCtxGetDevResource`, :py:obj:`~.cudaDevResourceGenerateDesc`
+    :func:`~.cuDevSmResourceSplit`, :py:obj:`~.cudaDeviceGetDevResource`, :py:obj:`~.cudaExecutionCtxGetDevResource`, :py:obj:`~.cudaDevResourceGenerateDesc`
     """
     groupParams = [] if groupParams is None else groupParams
     if not all(isinstance(_x, (cudaDevSmResourceGroupParams,)) for _x in groupParams):
@@ -40213,14 +40271,14 @@ def cudaDevResourceGenerateDesc(resources : Optional[tuple[cudaDevResource] | li
 
     Returns
     -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorNotPermitted`, :py:obj:`~.cudaErrorInvalidResourceType`, :py:obj:`~.cudaErrorInvalidResourceConfiguration`, :py:obj:`~.cudaErrorNotSupported`, :py:obj:`~.cudaErrorOutOfMemory`, :py:obj:`~.cudaErrorCudartUnloading`, :py:obj:`~.cudaErrorInitializationError`
+    :py:obj:`~.cudaError_t`
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorNotPermitted`, :py:obj:`~.cudaErrorInvalidResourceType`, :py:obj:`~.cudaErrorInvalidResourceConfiguration`, :py:obj:`~.cudaErrorNotSupported`, ``cudaErrorOutOfMemory``, :py:obj:`~.cudaErrorCudartUnloading`, :py:obj:`~.cudaErrorInitializationError`
     phDesc : :py:obj:`~.cudaDevResourceDesc_t`
         Output descriptor
 
     See Also
     --------
-    :py:obj:`~.cuDevResourceGenerateDesc`, :py:obj:`~.cudaDeviceGetDevResource`, :py:obj:`~.cudaExecutionCtxGetDevResource`, :py:obj:`~.cudaDevSmResourceSplit`, :py:obj:`~.cudaGreenCtxCreate`
+    :func:`~.cuDevResourceGenerateDesc`, :py:obj:`~.cudaDeviceGetDevResource`, :py:obj:`~.cudaExecutionCtxGetDevResource`, :py:obj:`~.cudaDevSmResourceSplit`, :py:obj:`~.cudaGreenCtxCreate`
     """
     resources = [] if resources is None else resources
     if not all(isinstance(_x, (cudaDevResource,)) for _x in resources):
@@ -40283,8 +40341,8 @@ def cudaGreenCtxCreate(desc, int device, unsigned int flags):
 
     Returns
     -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidDevice`, :py:obj:`~.cudaErrorNotPermitted`, :py:obj:`~.cudaErrorNotSupported`, :py:obj:`~.cudaErrorOutOfMemory`, :py:obj:`~.cudaErrorCudartUnloading`, :py:obj:`~.cudaErrorInitializationError`
+    :py:obj:`~.cudaError_t`
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidDevice`, :py:obj:`~.cudaErrorNotPermitted`, :py:obj:`~.cudaErrorNotSupported`, ``cudaErrorOutOfMemory``, :py:obj:`~.cudaErrorCudartUnloading`, :py:obj:`~.cudaErrorInitializationError`
     phCtx : :py:obj:`~.cudaExecutionContext_t`
         Pointer for the output handle to the green context
 
@@ -40347,7 +40405,7 @@ def cudaExecutionCtxDestroy(ctx):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorNotPermitted`, :py:obj:`~.cudaErrorCudartUnloading`, :py:obj:`~.cudaErrorInitializationError`
 
     See Also
@@ -40388,10 +40446,10 @@ def cudaExecutionCtxGetDevResource(ctx, typename not None : cudaDevResourceType)
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorNotSupported`, :py:obj:`~.cudaErrorNotPermitted`, :py:obj:`~.cudaErrorCudartUnloading`, :py:obj:`~.cudaErrorInitializationError`
     resource : :py:obj:`~.cudaDevResource`
-        Output pointer to a cudaDevResource structure
+        Output pointer to a :py:obj:`~.cudaDevResource` structure
 
     See Also
     --------
@@ -40431,14 +40489,14 @@ def cudaExecutionCtxGetDevice(ctx):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorCudartUnloading`, :py:obj:`~.cudaErrorInitializationError`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorNotPermitted`
     device : int
         Returned device handle for the specified execution context
 
     See Also
     --------
-    :py:obj:`~.cudaGreenCtxCreate`, :py:obj:`~.cudaExecutionCtxDestroy`, :py:obj:`~.cuCtxGetDevice`
+    :py:obj:`~.cudaGreenCtxCreate`, :py:obj:`~.cudaExecutionCtxDestroy`, :func:`~.cuCtxGetDevice`
     """
     cdef cyruntime.cudaExecutionContext_t cyctx
     if ctx is None:
@@ -40474,14 +40532,14 @@ def cudaExecutionCtxGetId(ctx):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorCudartUnloading`, :py:obj:`~.cudaErrorInitializationError`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorNotPermitted`
     ctxId : unsigned long long
         Pointer to store the Id of the context
 
     See Also
     --------
-    :py:obj:`~.cudaGreenCtxCreate`, :py:obj:`~.cudaExecutionCtxDestroy`, :py:obj:`~.cudaExecutionCtxGetDevice`, :py:obj:`~.cuCtxGetId`
+    :py:obj:`~.cudaGreenCtxCreate`, :py:obj:`~.cudaExecutionCtxDestroy`, :py:obj:`~.cudaExecutionCtxGetDevice`, :func:`~.cuCtxGetId`
     """
     cdef cyruntime.cudaExecutionContext_t cyctx
     if ctx is None:
@@ -40545,8 +40603,8 @@ def cudaExecutionCtxStreamCreate(ctx, unsigned int flags, int priority):
 
     Returns
     -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorNotPermitted`, :py:obj:`~.cudaErrorOutOfMemory`, :py:obj:`~.cudaErrorCudartUnloading`, :py:obj:`~.cudaErrorInitializationError`
+    :py:obj:`~.cudaError_t`
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorNotPermitted`, ``cudaErrorOutOfMemory``, :py:obj:`~.cudaErrorCudartUnloading`, :py:obj:`~.cudaErrorInitializationError`
     phStream : :py:obj:`~.cudaStream_t`
         Returned stream handle
 
@@ -40596,12 +40654,12 @@ def cudaExecutionCtxSynchronize(ctx):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorCudartUnloading`, :py:obj:`~.cudaErrorDeviceUninitialized`, :py:obj:`~.cudaErrorInvalidValue`
 
     See Also
     --------
-    :py:obj:`~.cudaGreenCtxCreate`, :py:obj:`~.cudaExecutionCtxDestroy`, :py:obj:`~.cudaDeviceSynchronize`, :py:obj:`~.cuCtxSynchronize_v2`
+    :py:obj:`~.cudaGreenCtxCreate`, :py:obj:`~.cudaExecutionCtxDestroy`, :py:obj:`~.cudaDeviceSynchronize`, :func:`~.cuCtxSynchronize_v2`
     """
     cdef cyruntime.cudaExecutionContext_t cyctx
     if ctx is None:
@@ -40626,8 +40684,8 @@ def cudaStreamGetDevResource(hStream, typename not None : cudaDevResourceType):
     them in ``resource``.
 
     Note: The API will return :py:obj:`~.cudaErrorInvalidResourceType` is
-    ``typename`` is ``cudaDevResourceTypeWorkqueueConfig`` or
-    ``cudaDevResourceTypeWorkqueue``.
+    ``typename`` is :py:obj:`~.cudaDevResourceTypeWorkqueueConfig` or
+    :py:obj:`~.cudaDevResourceTypeWorkqueue`.
 
     Parameters
     ----------
@@ -40638,14 +40696,14 @@ def cudaStreamGetDevResource(hStream, typename not None : cudaDevResourceType):
 
     Returns
     -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorCudartUnloading`, :py:obj:`~.cudaErrorInitializationError`, :py:obj:`~.cudaErrorDeviceUninitialized`, :py:obj:`~.cudaErrorInvalidResourceType`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidHandle`, :py:obj:`~.cudaErrorNotPermitted`, :py:obj:`~.cudaErrorCallRequiresNewerDriver`,
+    :py:obj:`~.cudaError_t`
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorCudartUnloading`, :py:obj:`~.cudaErrorInitializationError`, :py:obj:`~.cudaErrorDeviceUninitialized`, :py:obj:`~.cudaErrorInvalidResourceType`, :py:obj:`~.cudaErrorInvalidValue`, ``cudaErrorInvalidHandle``, :py:obj:`~.cudaErrorNotPermitted`, :py:obj:`~.cudaErrorCallRequiresNewerDriver`
     resource : :py:obj:`~.cudaDevResource`
-        Output pointer to a cudaDevResource structure
+        Output pointer to a :py:obj:`~.cudaDevResource` structure
 
     See Also
     --------
-    :py:obj:`~.cudaGreenCtxCreate`, :py:obj:`~.cudaExecutionCtxStreamCreate`, :py:obj:`~.cudaStreamCreate`, :py:obj:`~.cudaDevSmResourceSplit`, :py:obj:`~.cudaDevResourceGenerateDesc`, :py:obj:`~.cuStreamGetDevResource`
+    :py:obj:`~.cudaGreenCtxCreate`, :py:obj:`~.cudaExecutionCtxStreamCreate`, :py:obj:`~.cudaStreamCreate`, :py:obj:`~.cudaDevSmResourceSplit`, :py:obj:`~.cudaDevResourceGenerateDesc`, :func:`~.cuStreamGetDevResource`
     """
     cdef cyruntime.cudaStream_t cyhStream
     if hStream is None:
@@ -40672,8 +40730,8 @@ def cudaExecutionCtxRecordEvent(ctx, event):
 
     Captures in ``event`` all the activities of the execution context
     ``ctx`` at the time of this call. ``event`` and ``ctx`` must be from
-    the same CUDA device, otherwise :py:obj:`~.cudaErrorInvalidHandle` will
-    be returned. Calls such as :py:obj:`~.cudaEventQuery()` or
+    the same CUDA device, otherwise ``cudaErrorInvalidHandle`` will be
+    returned. Calls such as :py:obj:`~.cudaEventQuery()` or
     :py:obj:`~.cudaExecutionCtxWaitEvent()` will then examine or wait for
     completion of the work that was captured. Uses of ``ctx`` after this
     call do not modify ``event``. If the execution context passed to
@@ -40691,12 +40749,12 @@ def cudaExecutionCtxRecordEvent(ctx, event):
 
     Returns
     -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorCudartUnloading`, :py:obj:`~.cudaErrorInitializationError`, :py:obj:`~.cudaErrorInvalidHandle`, :py:obj:`~.cudaErrorStreamCaptureUnsupported`
+    :py:obj:`~.cudaError_t`
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorCudartUnloading`, :py:obj:`~.cudaErrorInitializationError`, ``cudaErrorInvalidHandle``, :py:obj:`~.cudaErrorStreamCaptureUnsupported`
 
     See Also
     --------
-    :py:obj:`~.cudaEventRecord`, :py:obj:`~.cudaExecutionCtxWaitEvent`, :py:obj:`~.cuCtxRecordEvent`, :py:obj:`~.cuGreenCtxRecordEvent`
+    :py:obj:`~.cudaEventRecord`, :py:obj:`~.cudaExecutionCtxWaitEvent`, :func:`~.cuCtxRecordEvent`, :func:`~.cuGreenCtxRecordEvent`
 
     Notes
     -----
@@ -40747,12 +40805,12 @@ def cudaExecutionCtxWaitEvent(ctx, event):
 
     Returns
     -------
-    cudaError_t
-        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorCudartUnloading`, :py:obj:`~.cudaErrorInitializationError`, :py:obj:`~.cudaErrorInvalidHandle`, :py:obj:`~.cudaErrorStreamCaptureUnsupported`
+    :py:obj:`~.cudaError_t`
+        :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorCudartUnloading`, :py:obj:`~.cudaErrorInitializationError`, ``cudaErrorInvalidHandle``, :py:obj:`~.cudaErrorStreamCaptureUnsupported`
 
     See Also
     --------
-    :py:obj:`~.cudaExecutionCtxRecordEvent`, :py:obj:`~.cudaStreamWaitEvent`, :py:obj:`~.cuCtxWaitEvent`, :py:obj:`~.cuGreenCtxWaitEvent`
+    :py:obj:`~.cudaExecutionCtxRecordEvent`, :py:obj:`~.cudaStreamWaitEvent`, :func:`~.cuCtxWaitEvent`, :func:`~.cuGreenCtxWaitEvent`
 
     Notes
     -----
@@ -40789,8 +40847,9 @@ def cudaDeviceGetExecutionCtx(int device):
 
     Returns in ``ctx`` the execution context for the specified device. This
     is the device's primary context. The returned context can then be
-    passed to APIs that take in a cudaExecutionContext_t enabling explicit
-    context-based programming without relying on thread-local state.
+    passed to APIs that take in a :py:obj:`~.cudaExecutionContext_t`
+    enabling explicit context-based programming without relying on
+    thread-local state.
 
     Passing the returned execution context to
     :py:obj:`~.cudaExecutionCtxDestroy()` is not allowed and will result in
@@ -40803,7 +40862,7 @@ def cudaDeviceGetExecutionCtx(int device):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidDevice`
     ctx : :py:obj:`~.cudaExecutionContext_t`
         Returns the device execution context
@@ -40859,14 +40918,14 @@ def cudaGetKernel(entryFuncAddr):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`
     kernelPtr : :py:obj:`~.cudaKernel_t`
         Returns the device kernel
 
     See Also
     --------
-    cudaGetKernel (C++ API)
+    :func:`~.cudaGetKernel` (C++ API)
     """
     cdef cudaKernel_t kernelPtr = cudaKernel_t()
     cdef _HelperInputVoidPtrStruct cyentryFuncAddrHelper
@@ -40908,7 +40967,7 @@ def make_cudaPitchedPtr(d, size_t p, size_t xsz, size_t ysz):
 
     See Also
     --------
-    make_cudaExtent, make_cudaPos
+    :func:`~.make_cudaExtent`, :func:`~.make_cudaPos`
     """
     cdef _HelperInputVoidPtrStruct cydHelper
     cdef void* cyd = _helper_input_void_ptr(d, &cydHelper)
@@ -40947,7 +41006,7 @@ def make_cudaPos(size_t x, size_t y, size_t z):
 
     See Also
     --------
-    make_cudaExtent, make_cudaPitchedPtr
+    :func:`~.make_cudaExtent`, :func:`~.make_cudaPitchedPtr`
     """
     with nogil:
         err = cyruntime.make_cudaPos(x, y, z)
@@ -40984,7 +41043,7 @@ def make_cudaExtent(size_t w, size_t h, size_t d):
 
     See Also
     --------
-    make_cudaPitchedPtr, make_cudaPos
+    :func:`~.make_cudaPitchedPtr`, :func:`~.make_cudaPos`
     """
     with nogil:
         err = cyruntime.make_cudaExtent(w, h, d)
@@ -40999,11 +41058,12 @@ def make_cudaExtent(size_t w, size_t h, size_t d):
 def cudaGraphicsEGLRegisterImage(image, unsigned int flags):
     """ Registers an EGL image.
 
-    Registers the EGLImageKHR specified by ``image`` for access by CUDA. A
-    handle to the registered object is returned as ``pCudaResource``.
-    Additional Mapping/Unmapping is not required for the registered
-    resource and :py:obj:`~.cudaGraphicsResourceGetMappedEglFrame` can be
-    directly called on the ``pCudaResource``.
+    Registers the :py:obj:`~.EGLImageKHR` specified by ``image`` for access
+    by CUDA. A handle to the registered object is returned as
+    ``pCudaResource``. Additional Mapping/Unmapping is not required for the
+    registered resource and
+    :py:obj:`~.cudaGraphicsResourceGetMappedEglFrame` can be directly
+    called on the ``pCudaResource``.
 
     The application will be responsible for synchronizing access to shared
     objects. The application must ensure that any pending operation which
@@ -41013,8 +41073,8 @@ def cudaGraphicsEGLRegisterImage(image, unsigned int flags):
     will be also responsible for ensuring that any pending operation on the
     registered CUDA resource has completed prior to executing subsequent
     commands in other APIs accesing the same memory objects. This can be
-    accomplished by calling cuCtxSynchronize or cuEventSynchronize
-    (preferably).
+    accomplished by calling :func:`~.cuCtxSynchronize` or
+    :func:`~.cuEventSynchronize` (preferably).
 
     The surface's intended usage is specified using ``flags``, as follows:
 
@@ -41031,27 +41091,28 @@ def cudaGraphicsEGLRegisterImage(image, unsigned int flags):
       contents of the resource, so none of the data previously stored in
       the resource will be preserved.
 
-    The EGLImageKHR is an object which can be used to create EGLImage
-    target resource. It is defined as a void pointer. typedef void*
-    EGLImageKHR
+    The :py:obj:`~.EGLImageKHR` is an object which can be used to create
+    an EGLImage target resource. It is defined as a void pointer:
+    ``typedef void* EGLImageKHR``.
 
     Parameters
     ----------
     image : :py:obj:`~.EGLImageKHR`
-        An EGLImageKHR image which can be used to create target resource.
+        An :py:obj:`~.EGLImageKHR` image which can be used to create target
+        resource.
     flags : unsigned int
         Map flags
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidResourceHandle`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorUnknown`
     pCudaResource : :py:obj:`~.cudaGraphicsResource`
         Pointer to the returned object handle
 
     See Also
     --------
-    :py:obj:`~.cudaGraphicsUnregisterResource`, :py:obj:`~.cudaGraphicsResourceGetMappedEglFrame`, :py:obj:`~.cuGraphicsEGLRegisterImage`
+    :py:obj:`~.cudaGraphicsUnregisterResource`, :py:obj:`~.cudaGraphicsResourceGetMappedEglFrame`, :func:`~.cuGraphicsEGLRegisterImage`
     """
     cdef cyruntime.EGLImageKHR cyimage
     if image is None:
@@ -41075,26 +41136,27 @@ def cudaGraphicsEGLRegisterImage(image, unsigned int flags):
 def cudaEGLStreamConsumerConnect(eglStream):
     """ Connect CUDA to EGLStream as a consumer.
 
-    Connect CUDA as a consumer to EGLStreamKHR specified by ``eglStream``.
+    Connect CUDA as a consumer to :py:obj:`~.EGLStreamKHR` specified by
+    ``eglStream``.
 
-    The EGLStreamKHR is an EGL object that transfers a sequence of image
-    frames from one API to another.
+    The :py:obj:`~.EGLStreamKHR` is an EGL object that transfers a sequence
+    of image frames from one API to another.
 
     Parameters
     ----------
     eglStream : :py:obj:`~.EGLStreamKHR`
-        EGLStreamKHR handle
+        :py:obj:`~.EGLStreamKHR` handle
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorUnknown`
     conn : :py:obj:`~.cudaEglStreamConnection`
         Pointer to the returned connection handle
 
     See Also
     --------
-    :py:obj:`~.cudaEGLStreamConsumerDisconnect`, :py:obj:`~.cudaEGLStreamConsumerAcquireFrame`, :py:obj:`~.cudaEGLStreamConsumerReleaseFrame`, :py:obj:`~.cuEGLStreamConsumerConnect`
+    :py:obj:`~.cudaEGLStreamConsumerDisconnect`, :py:obj:`~.cudaEGLStreamConsumerAcquireFrame`, :py:obj:`~.cudaEGLStreamConsumerReleaseFrame`, :func:`~.cuEGLStreamConsumerConnect`
     """
     cdef cyruntime.EGLStreamKHR cyeglStream
     if eglStream is None:
@@ -41118,8 +41180,8 @@ def cudaEGLStreamConsumerConnect(eglStream):
 def cudaEGLStreamConsumerConnectWithFlags(eglStream, unsigned int flags):
     """ Connect CUDA to EGLStream as a consumer with given flags.
 
-    Connect CUDA as a consumer to EGLStreamKHR specified by ``stream`` with
-    specified ``flags`` defined by
+    Connect CUDA as a consumer to :py:obj:`~.EGLStreamKHR` specified by
+    ``stream`` with specified ``flags`` defined by
     :py:obj:`~.cudaEglResourceLocationFlags`.
 
     The flags specify whether the consumer wants to access frames from
@@ -41129,20 +41191,20 @@ def cudaEGLStreamConsumerConnectWithFlags(eglStream, unsigned int flags):
     Parameters
     ----------
     eglStream : :py:obj:`~.EGLStreamKHR`
-        EGLStreamKHR handle
+        :py:obj:`~.EGLStreamKHR` handle
     flags : unsigned int
         Flags denote intended location - system or video.
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorUnknown`
     conn : :py:obj:`~.cudaEglStreamConnection`
         Pointer to the returned connection handle
 
     See Also
     --------
-    :py:obj:`~.cudaEGLStreamConsumerDisconnect`, :py:obj:`~.cudaEGLStreamConsumerAcquireFrame`, :py:obj:`~.cudaEGLStreamConsumerReleaseFrame`, :py:obj:`~.cuEGLStreamConsumerConnectWithFlags`
+    :py:obj:`~.cudaEGLStreamConsumerDisconnect`, :py:obj:`~.cudaEGLStreamConsumerAcquireFrame`, :py:obj:`~.cudaEGLStreamConsumerReleaseFrame`, :func:`~.cuEGLStreamConsumerConnectWithFlags`
     """
     cdef cyruntime.EGLStreamKHR cyeglStream
     if eglStream is None:
@@ -41166,7 +41228,7 @@ def cudaEGLStreamConsumerConnectWithFlags(eglStream, unsigned int flags):
 def cudaEGLStreamConsumerDisconnect(conn):
     """ Disconnect CUDA as a consumer to EGLStream .
 
-    Disconnect CUDA as a consumer to EGLStreamKHR.
+    Disconnect CUDA as a consumer to :py:obj:`~.EGLStreamKHR`.
 
     Parameters
     ----------
@@ -41175,12 +41237,12 @@ def cudaEGLStreamConsumerDisconnect(conn):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorUnknown`
 
     See Also
     --------
-    :py:obj:`~.cudaEGLStreamConsumerConnect`, :py:obj:`~.cudaEGLStreamConsumerAcquireFrame`, :py:obj:`~.cudaEGLStreamConsumerReleaseFrame`, :py:obj:`~.cuEGLStreamConsumerDisconnect`
+    :py:obj:`~.cudaEGLStreamConsumerConnect`, :py:obj:`~.cudaEGLStreamConsumerAcquireFrame`, :py:obj:`~.cudaEGLStreamConsumerReleaseFrame`, :func:`~.cuEGLStreamConsumerDisconnect`
     """
     cdef cyruntime.cudaEglStreamConnection *cyconn
     if conn is None:
@@ -41203,7 +41265,7 @@ def cudaEGLStreamConsumerDisconnect(conn):
 def cudaEGLStreamConsumerAcquireFrame(conn, pCudaResource, pStream, unsigned int timeout):
     """ Acquire an image frame from the EGLStream with CUDA as a consumer.
 
-    Acquire an image frame from EGLStreamKHR.
+    Acquire an image frame from :py:obj:`~.EGLStreamKHR`.
     :py:obj:`~.cudaGraphicsResourceGetMappedEglFrame` can be called on
     ``pCudaResource`` to get :py:obj:`~.cudaEglFrame`.
 
@@ -41221,12 +41283,12 @@ def cudaEGLStreamConsumerAcquireFrame(conn, pCudaResource, pStream, unsigned int
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorUnknown`, :py:obj:`~.cudaErrorLaunchTimeout`
 
     See Also
     --------
-    :py:obj:`~.cudaEGLStreamConsumerConnect`, :py:obj:`~.cudaEGLStreamConsumerDisconnect`, :py:obj:`~.cudaEGLStreamConsumerReleaseFrame`, :py:obj:`~.cuEGLStreamConsumerAcquireFrame`
+    :py:obj:`~.cudaEGLStreamConsumerConnect`, :py:obj:`~.cudaEGLStreamConsumerDisconnect`, :py:obj:`~.cudaEGLStreamConsumerReleaseFrame`, :func:`~.cuEGLStreamConsumerAcquireFrame`
     """
     cdef cyruntime.cudaStream_t *cypStream
     if pStream is None:
@@ -41270,7 +41332,7 @@ def cudaEGLStreamConsumerReleaseFrame(conn, pCudaResource, pStream):
     """ Releases the last frame acquired from the EGLStream.
 
     Release the acquired image frame specified by ``pCudaResource`` to
-    EGLStreamKHR.
+    :py:obj:`~.EGLStreamKHR`.
 
     Parameters
     ----------
@@ -41283,12 +41345,12 @@ def cudaEGLStreamConsumerReleaseFrame(conn, pCudaResource, pStream):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorUnknown`
 
     See Also
     --------
-    :py:obj:`~.cudaEGLStreamConsumerConnect`, :py:obj:`~.cudaEGLStreamConsumerDisconnect`, :py:obj:`~.cudaEGLStreamConsumerAcquireFrame`, :py:obj:`~.cuEGLStreamConsumerReleaseFrame`
+    :py:obj:`~.cudaEGLStreamConsumerConnect`, :py:obj:`~.cudaEGLStreamConsumerDisconnect`, :py:obj:`~.cudaEGLStreamConsumerAcquireFrame`, :func:`~.cuEGLStreamConsumerReleaseFrame`
     """
     cdef cyruntime.cudaStream_t *cypStream
     if pStream is None:
@@ -41329,15 +41391,16 @@ def cudaEGLStreamConsumerReleaseFrame(conn, pCudaResource, pStream):
 def cudaEGLStreamProducerConnect(eglStream, width, height):
     """ Connect CUDA to EGLStream as a producer.
 
-    Connect CUDA as a producer to EGLStreamKHR specified by ``stream``.
+    Connect CUDA as a producer to :py:obj:`~.EGLStreamKHR` specified by
+    ``stream``.
 
-    The EGLStreamKHR is an EGL object that transfers a sequence of image
-    frames from one API to another.
+    The :py:obj:`~.EGLStreamKHR` is an EGL object that transfers a sequence
+    of image frames from one API to another.
 
     Parameters
     ----------
     eglStream : :py:obj:`~.EGLStreamKHR`
-        EGLStreamKHR handle
+        :py:obj:`~.EGLStreamKHR` handle
     width : :py:obj:`~.EGLint`
         width of the image to be submitted to the stream
     height : :py:obj:`~.EGLint`
@@ -41345,14 +41408,14 @@ def cudaEGLStreamProducerConnect(eglStream, width, height):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorUnknown`
     conn : :py:obj:`~.cudaEglStreamConnection`
         Pointer to the returned connection handle
 
     See Also
     --------
-    :py:obj:`~.cudaEGLStreamProducerDisconnect`, :py:obj:`~.cudaEGLStreamProducerPresentFrame`, :py:obj:`~.cudaEGLStreamProducerReturnFrame`, :py:obj:`~.cuEGLStreamProducerConnect`
+    :py:obj:`~.cudaEGLStreamProducerDisconnect`, :py:obj:`~.cudaEGLStreamProducerPresentFrame`, :py:obj:`~.cudaEGLStreamProducerReturnFrame`, :func:`~.cuEGLStreamProducerConnect`
     """
     cdef cyruntime.EGLint cyheight
     if height is None:
@@ -41392,7 +41455,7 @@ def cudaEGLStreamProducerConnect(eglStream, width, height):
 def cudaEGLStreamProducerDisconnect(conn):
     """ Disconnect CUDA as a producer to EGLStream .
 
-    Disconnect CUDA as a producer to EGLStreamKHR.
+    Disconnect CUDA as a producer to :py:obj:`~.EGLStreamKHR`.
 
     Parameters
     ----------
@@ -41401,12 +41464,12 @@ def cudaEGLStreamProducerDisconnect(conn):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorUnknown`
 
     See Also
     --------
-    :py:obj:`~.cudaEGLStreamProducerConnect`, :py:obj:`~.cudaEGLStreamProducerPresentFrame`, :py:obj:`~.cudaEGLStreamProducerReturnFrame`, :py:obj:`~.cuEGLStreamProducerDisconnect`
+    :py:obj:`~.cudaEGLStreamProducerConnect`, :py:obj:`~.cudaEGLStreamProducerPresentFrame`, :py:obj:`~.cudaEGLStreamProducerReturnFrame`, :func:`~.cuEGLStreamProducerDisconnect`
     """
     cdef cyruntime.cudaEglStreamConnection *cyconn
     if conn is None:
@@ -41451,12 +41514,12 @@ def cudaEGLStreamProducerPresentFrame(conn, eglframe not None : cudaEglFrame, pS
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorUnknown`
 
     See Also
     --------
-    :py:obj:`~.cudaEGLStreamProducerConnect`, :py:obj:`~.cudaEGLStreamProducerDisconnect`, :py:obj:`~.cudaEGLStreamProducerReturnFrame`, :py:obj:`~.cuEGLStreamProducerPresentFrame`
+    :py:obj:`~.cudaEGLStreamProducerConnect`, :py:obj:`~.cudaEGLStreamProducerDisconnect`, :py:obj:`~.cudaEGLStreamProducerReturnFrame`, :func:`~.cuEGLStreamProducerPresentFrame`
     """
     cdef cyruntime.cudaStream_t *cypStream
     if pStream is None:
@@ -41489,9 +41552,9 @@ def cudaEGLStreamProducerPresentFrame(conn, eglframe not None : cudaEglFrame, pS
 def cudaEGLStreamProducerReturnFrame(conn, eglframe : Optional[cudaEglFrame], pStream):
     """ Return the CUDA eglFrame to the EGLStream last released by the consumer.
 
-    This API can potentially return cudaErrorLaunchTimeout if the consumer
-    has not returned a frame to EGL stream. If timeout is returned the
-    application can retry.
+    This API can potentially return :py:obj:`~.cudaErrorLaunchTimeout` if
+    the consumer has not returned a frame to EGL stream. If timeout is
+    returned the application can retry.
 
     Parameters
     ----------
@@ -41505,12 +41568,12 @@ def cudaEGLStreamProducerReturnFrame(conn, eglframe : Optional[cudaEglFrame], pS
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorLaunchTimeout`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorUnknown`
 
     See Also
     --------
-    :py:obj:`~.cudaEGLStreamProducerConnect`, :py:obj:`~.cudaEGLStreamProducerDisconnect`, :py:obj:`~.cudaEGLStreamProducerPresentFrame`, :py:obj:`~.cuEGLStreamProducerReturnFrame`
+    :py:obj:`~.cudaEGLStreamProducerConnect`, :py:obj:`~.cudaEGLStreamProducerDisconnect`, :py:obj:`~.cudaEGLStreamProducerPresentFrame`, :func:`~.cuEGLStreamProducerReturnFrame`
     """
     cdef cyruntime.cudaStream_t *cypStream
     if pStream is None:
@@ -41563,14 +41626,14 @@ def cudaGraphicsResourceGetMappedEglFrame(resource, unsigned int index, unsigned
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorUnknown`
     eglFrame : :py:obj:`~.cudaEglFrame`
         Returned eglFrame.
 
     See Also
     --------
-    :py:obj:`~.cudaGraphicsSubResourceGetMappedArray`, :py:obj:`~.cudaGraphicsResourceGetMappedPointer`, :py:obj:`~.cuGraphicsResourceGetMappedEglFrame`
+    :py:obj:`~.cudaGraphicsSubResourceGetMappedArray`, :py:obj:`~.cudaGraphicsResourceGetMappedPointer`, :func:`~.cuGraphicsResourceGetMappedEglFrame`
 
     Notes
     -----
@@ -41598,8 +41661,8 @@ def cudaGraphicsResourceGetMappedEglFrame(resource, unsigned int index, unsigned
 def cudaEventCreateFromEGLSync(eglSync, unsigned int flags):
     """ Creates an event from EGLSync object.
 
-    Creates an event \\*phEvent from an EGLSyncKHR eglSync with the flages
-    specified via ``flags``. Valid flags include:
+    Creates an event \\*phEvent from an :py:obj:`~.EGLSyncKHR` eglSync with
+    the flags specified via ``flags``. Valid flags include:
 
     - :py:obj:`~.cudaEventDefault`: Default event creation flag.
 
@@ -41611,8 +41674,8 @@ def cudaEventCreateFromEGLSync(eglSync, unsigned int flags):
     :py:obj:`~.cudaEventRecord` and TimingData are not supported for events
     created from EGLSync.
 
-    The EGLSyncKHR is an opaque handle to an EGL sync object. typedef void*
-    EGLSyncKHR
+    The :py:obj:`~.EGLSyncKHR` is an opaque handle to an EGL sync object,
+    defined as ``typedef void* EGLSyncKHR``.
 
     Parameters
     ----------
@@ -41623,7 +41686,7 @@ def cudaEventCreateFromEGLSync(eglSync, unsigned int flags):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInitializationError`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorLaunchFailure`, :py:obj:`~.cudaErrorMemoryAllocation`
     phEvent : :py:obj:`~.cudaEvent_t`
         Returns newly created event
@@ -41658,18 +41721,18 @@ def cudaProfilerStart():
     context. If profiling is already enabled, then
     :py:obj:`~.cudaProfilerStart()` has no effect.
 
-    cudaProfilerStart and cudaProfilerStop APIs are used to
-    programmatically control the profiling granularity by allowing
+    :func:`~.cudaProfilerStart` and :func:`~.cudaProfilerStop` APIs are
+    used to programmatically control the profiling granularity by allowing
     profiling to be done only on selective pieces of code.
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`
 
     See Also
     --------
-    :py:obj:`~.cudaProfilerStop`, :py:obj:`~.cuProfilerStart`
+    :py:obj:`~.cudaProfilerStop`, :func:`~.cuProfilerStart`
     """
     with nogil:
         err = cyruntime.cudaProfilerStart()
@@ -41686,18 +41749,18 @@ def cudaProfilerStop():
     current context. If profiling is already disabled, then
     :py:obj:`~.cudaProfilerStop()` has no effect.
 
-    cudaProfilerStart and cudaProfilerStop APIs are used to
-    programmatically control the profiling granularity by allowing
+    :func:`~.cudaProfilerStart` and :func:`~.cudaProfilerStop` APIs are
+    used to programmatically control the profiling granularity by allowing
     profiling to be done only on selective pieces of code.
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`
 
     See Also
     --------
-    :py:obj:`~.cudaProfilerStart`, :py:obj:`~.cuProfilerStop`
+    :py:obj:`~.cudaProfilerStart`, :func:`~.cuProfilerStop`
     """
     with nogil:
         err = cyruntime.cudaProfilerStop()
@@ -41783,9 +41846,8 @@ def cudaGraphicsGLRegisterImage(image, target, unsigned int flags):
     ``resource``.
 
     ``target`` must match the type of the object, and must be one of
-    :py:obj:`~.GL_TEXTURE_2D`, :py:obj:`~.GL_TEXTURE_RECTANGLE`,
-    :py:obj:`~.GL_TEXTURE_CUBE_MAP`, :py:obj:`~.GL_TEXTURE_3D`,
-    :py:obj:`~.GL_TEXTURE_2D_ARRAY`, or :py:obj:`~.GL_RENDERBUFFER`.
+    ``GL_TEXTURE_2D``, ``GL_TEXTURE_RECTANGLE``, ``GL_TEXTURE_CUBE_MAP``,
+    ``GL_TEXTURE_3D``, ``GL_TEXTURE_2D_ARRAY``, or ``GL_RENDERBUFFER``.
 
     The register flags ``flags`` specify the intended usage, as follows:
 
@@ -41839,14 +41901,14 @@ def cudaGraphicsGLRegisterImage(image, target, unsigned int flags):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidDevice`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidResourceHandle`, :py:obj:`~.cudaErrorOperatingSystem`, :py:obj:`~.cudaErrorUnknown`
     resource : :py:obj:`~.cudaGraphicsResource`
         Pointer to the returned object handle
 
     See Also
     --------
-    :py:obj:`~.cudaGraphicsUnregisterResource`, :py:obj:`~.cudaGraphicsMapResources`, :py:obj:`~.cudaGraphicsSubResourceGetMappedArray`, :py:obj:`~.cuGraphicsGLRegisterImage`
+    :py:obj:`~.cudaGraphicsUnregisterResource`, :py:obj:`~.cudaGraphicsMapResources`, :py:obj:`~.cudaGraphicsSubResourceGetMappedArray`, :func:`~.cuGraphicsGLRegisterImage`
     """
     cdef cyruntime.GLenum cytarget
     if target is None:
@@ -41904,14 +41966,14 @@ def cudaGraphicsGLRegisterBuffer(buffer, unsigned int flags):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidDevice`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidResourceHandle`, :py:obj:`~.cudaErrorOperatingSystem`, :py:obj:`~.cudaErrorUnknown`
     resource : :py:obj:`~.cudaGraphicsResource`
         Pointer to the returned object handle
 
     See Also
     --------
-    :py:obj:`~.cudaGraphicsUnregisterResource`, :py:obj:`~.cudaGraphicsMapResources`, :py:obj:`~.cudaGraphicsResourceGetMappedPointer`, :py:obj:`~.cuGraphicsGLRegisterBuffer`
+    :py:obj:`~.cudaGraphicsUnregisterResource`, :py:obj:`~.cudaGraphicsMapResources`, :py:obj:`~.cudaGraphicsResourceGetMappedPointer`, :func:`~.cuGraphicsGLRegisterBuffer`
     """
     cdef cyruntime.GLuint cybuffer
     if buffer is None:
@@ -41933,20 +41995,21 @@ def cudaGraphicsGLRegisterBuffer(buffer, unsigned int flags):
 
 @cython.embedsignature(True)
 def cudaVDPAUGetDevice(vdpDevice, vdpGetProcAddress):
-    """ Gets the CUDA device associated with a VdpDevice.
+    """ Gets the CUDA device associated with a :py:obj:`~.VdpDevice`.
 
-    Returns the CUDA device associated with a VdpDevice, if applicable.
+    Returns the CUDA device associated with a :py:obj:`~.VdpDevice`, if
+    applicable.
 
     Parameters
     ----------
     vdpDevice : :py:obj:`~.VdpDevice`
-        A VdpDevice handle
+        A :py:obj:`~.VdpDevice` handle
     vdpGetProcAddress : :py:obj:`~.VdpGetProcAddress`
-        VDPAU's VdpGetProcAddress function pointer
+        VDPAU's :py:obj:`~.VdpGetProcAddress` function pointer
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`
     device : int
         Returns the device associated with vdpDevice, or -1 if the device
@@ -41954,7 +42017,7 @@ def cudaVDPAUGetDevice(vdpDevice, vdpGetProcAddress):
 
     See Also
     --------
-    :py:obj:`~.cudaVDPAUSetVDPAUDevice`, :py:obj:`~.cuVDPAUGetDevice`
+    :py:obj:`~.cudaVDPAUSetVDPAUDevice`, :func:`~.cuVDPAUGetDevice`
     """
     cdef cyruntime.VdpGetProcAddress *cyvdpGetProcAddress
     if vdpGetProcAddress is None:
@@ -41988,9 +42051,9 @@ def cudaVDPAUGetDevice(vdpDevice, vdpGetProcAddress):
 def cudaVDPAUSetVDPAUDevice(int device, vdpDevice, vdpGetProcAddress):
     """ Sets a CUDA device to use VDPAU interoperability.
 
-    Records ``vdpDevice`` as the VdpDevice for VDPAU interoperability with
-    the CUDA device ``device`` and sets ``device`` as the current device
-    for the calling host thread.
+    Records ``vdpDevice`` as the :py:obj:`~.VdpDevice` for VDPAU
+    interoperability with the CUDA device ``device`` and sets ``device`` as
+    the current device for the calling host thread.
 
     This function will immediately initialize the primary context on
     ``device`` if needed.
@@ -42005,13 +42068,13 @@ def cudaVDPAUSetVDPAUDevice(int device, vdpDevice, vdpGetProcAddress):
     device : int
         Device to use for VDPAU interoperability
     vdpDevice : :py:obj:`~.VdpDevice`
-        The VdpDevice to interoperate with
+        The :py:obj:`~.VdpDevice` to interoperate with
     vdpGetProcAddress : :py:obj:`~.VdpGetProcAddress`
-        VDPAU's VdpGetProcAddress function pointer
+        VDPAU's :py:obj:`~.VdpGetProcAddress` function pointer
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidDevice`, :py:obj:`~.cudaErrorSetOnActiveProcess`
 
     See Also
@@ -42045,11 +42108,12 @@ def cudaVDPAUSetVDPAUDevice(int device, vdpDevice, vdpGetProcAddress):
 
 @cython.embedsignature(True)
 def cudaGraphicsVDPAURegisterVideoSurface(vdpSurface, unsigned int flags):
-    """ Register a VdpVideoSurface object.
+    """ Register a :py:obj:`~.VdpVideoSurface` object.
 
-    Registers the VdpVideoSurface specified by ``vdpSurface`` for access by
-    CUDA. A handle to the registered object is returned as ``resource``.
-    The surface's intended usage is specified using ``flags``, as follows:
+    Registers the :py:obj:`~.VdpVideoSurface` specified by ``vdpSurface``
+    for access by CUDA. A handle to the registered object is returned as
+    ``resource``. The surface's intended usage is specified using
+    ``flags``, as follows:
 
     - :py:obj:`~.cudaGraphicsMapFlagsNone`: Specifies no hints about how
       this resource will be used. It is therefore assumed that this
@@ -42073,14 +42137,14 @@ def cudaGraphicsVDPAURegisterVideoSurface(vdpSurface, unsigned int flags):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidDevice`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidResourceHandle`, :py:obj:`~.cudaErrorUnknown`
     resource : :py:obj:`~.cudaGraphicsResource`
         Pointer to the returned object handle
 
     See Also
     --------
-    :py:obj:`~.cudaVDPAUSetVDPAUDevice`, :py:obj:`~.cudaGraphicsUnregisterResource`, :py:obj:`~.cudaGraphicsSubResourceGetMappedArray`, :py:obj:`~.cuGraphicsVDPAURegisterVideoSurface`
+    :py:obj:`~.cudaVDPAUSetVDPAUDevice`, :py:obj:`~.cudaGraphicsUnregisterResource`, :py:obj:`~.cudaGraphicsSubResourceGetMappedArray`, :func:`~.cuGraphicsVDPAURegisterVideoSurface`
     """
     cdef cyruntime.VdpVideoSurface cyvdpSurface
     if vdpSurface is None:
@@ -42102,11 +42166,12 @@ def cudaGraphicsVDPAURegisterVideoSurface(vdpSurface, unsigned int flags):
 
 @cython.embedsignature(True)
 def cudaGraphicsVDPAURegisterOutputSurface(vdpSurface, unsigned int flags):
-    """ Register a VdpOutputSurface object.
+    """ Register a :py:obj:`~.VdpOutputSurface` object.
 
-    Registers the VdpOutputSurface specified by ``vdpSurface`` for access
-    by CUDA. A handle to the registered object is returned as ``resource``.
-    The surface's intended usage is specified using ``flags``, as follows:
+    Registers the :py:obj:`~.VdpOutputSurface` specified by ``vdpSurface``
+    for access by CUDA. A handle to the registered object is returned as
+    ``resource``. The surface's intended usage is specified using
+    ``flags``, as follows:
 
     - :py:obj:`~.cudaGraphicsMapFlagsNone`: Specifies no hints about how
       this resource will be used. It is therefore assumed that this
@@ -42130,14 +42195,14 @@ def cudaGraphicsVDPAURegisterOutputSurface(vdpSurface, unsigned int flags):
 
     Returns
     -------
-    cudaError_t
+    :py:obj:`~.cudaError_t`
         :py:obj:`~.cudaSuccess`, :py:obj:`~.cudaErrorInvalidDevice`, :py:obj:`~.cudaErrorInvalidValue`, :py:obj:`~.cudaErrorInvalidResourceHandle`, :py:obj:`~.cudaErrorUnknown`
     resource : :py:obj:`~.cudaGraphicsResource`
         Pointer to the returned object handle
 
     See Also
     --------
-    :py:obj:`~.cudaVDPAUSetVDPAUDevice`, :py:obj:`~.cudaGraphicsUnregisterResource`, :py:obj:`~.cudaGraphicsSubResourceGetMappedArray`, :py:obj:`~.cuGraphicsVDPAURegisterOutputSurface`
+    :py:obj:`~.cudaVDPAUSetVDPAUDevice`, :py:obj:`~.cudaGraphicsUnregisterResource`, :py:obj:`~.cudaGraphicsSubResourceGetMappedArray`, :func:`~.cuGraphicsVDPAURegisterOutputSurface`
     """
     cdef cyruntime.VdpOutputSurface cyvdpSurface
     if vdpSurface is None:
diff --git a/cuda_bindings/docs/source/module/driver.rst b/cuda_bindings/docs/source/module/driver.rst
index df896172a3..1c4eb31f25 100644
--- a/cuda_bindings/docs/source/module/driver.rst
+++ b/cuda_bindings/docs/source/module/driver.rst
@@ -5,6 +5,9 @@
 driver
 ------
 
+.. _cuda-bindings-driver-cuda_types:
+.. _cuda-bindings-driver-group__cuda__types:
+
 Data types used by CUDA driver
 ------------------------------
 
@@ -308,7 +311,7 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUevent_flags.CU_EVENT_INTERPROCESS
 
 
-        Event is suitable for interprocess use. CU_EVENT_DISABLE_TIMING must be set
+        Event is suitable for interprocess use. :py:obj:`~.CU_EVENT_DISABLE_TIMING` must be set
 
 .. autoclass:: cuda.bindings.driver.CUevent_record_flags
 
@@ -1045,7 +1048,7 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK
 
 
-        Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK
+        Deprecated, use :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK`
 
 
     .. autoattribute:: cuda.bindings.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY
@@ -1075,7 +1078,7 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK
 
 
-        Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK
+        Deprecated, use :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK`
 
 
     .. autoattribute:: cuda.bindings.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_CLOCK_RATE
@@ -1093,7 +1096,7 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_GPU_OVERLAP
 
 
-        Device can possibly copy memory and execute a kernel concurrently. Deprecated. Use instead CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT.
+        Device can possibly copy memory and execute a kernel concurrently. Deprecated. Use instead :py:obj:`~.CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT`.
 
 
     .. autoattribute:: cuda.bindings.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT
@@ -1183,19 +1186,19 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH
 
 
-        Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH
+        Deprecated, use :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH`
 
 
     .. autoattribute:: cuda.bindings.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT
 
 
-        Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT
+        Deprecated, use :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT`
 
 
     .. autoattribute:: cuda.bindings.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES
 
 
-        Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS
+        Deprecated, use :py:obj:`~.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS`
 
 
     .. autoattribute:: cuda.bindings.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT
@@ -1291,13 +1294,13 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH
 
 
-        Maximum 2D texture width if CUDA_ARRAY3D_TEXTURE_GATHER is set
+        Maximum 2D texture width if :py:obj:`~.CUDA_ARRAY3D_TEXTURE_GATHER` is set
 
 
     .. autoattribute:: cuda.bindings.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT
 
 
-        Maximum 2D texture height if CUDA_ARRAY3D_TEXTURE_GATHER is set
+        Maximum 2D texture height if :py:obj:`~.CUDA_ARRAY3D_TEXTURE_GATHER` is set
 
 
     .. autoattribute:: cuda.bindings.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE
@@ -1435,7 +1438,7 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH
 
 
-        Deprecated, do not use. Use cudaDeviceGetTexture1DLinearMaxWidth() or :py:obj:`~.cuDeviceGetTexture1DLinearMaxWidth()` instead.
+        Deprecated, do not use. Use :func:`~.cudaDeviceGetTexture1DLinearMaxWidth` or :py:obj:`~.cuDeviceGetTexture1DLinearMaxWidth()` instead.
 
 
     .. autoattribute:: cuda.bindings.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH
@@ -1549,7 +1552,7 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS
 
 
-        Device supports coherently accessing pageable memory without calling cudaHostRegister on it
+        Device supports coherently accessing pageable memory without calling :func:`~.cudaHostRegister` on it
 
 
     .. autoattribute:: cuda.bindings.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS
@@ -1609,13 +1612,13 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES
 
 
-        The :py:obj:`~.CU_STREAM_WAIT_VALUE_FLUSH` flag and the :py:obj:`~.CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES` MemOp are supported on the device. See Stream Memory Operations for additional details.
+        The :py:obj:`~.CU_STREAM_WAIT_VALUE_FLUSH` flag and the :py:obj:`~.CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES` MemOp are supported on the device. See :ref:`Stream Memory Operations <cuda-bindings-driver-group__cuda__memop>` for additional details.
 
 
     .. autoattribute:: cuda.bindings.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_HOST_REGISTER_SUPPORTED
 
 
-        Device supports host memory registration via :py:obj:`~.cudaHostRegister`.
+        Device supports host memory registration via :func:`~.cudaHostRegister`.
 
 
     .. autoattribute:: cuda.bindings.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES
@@ -1633,7 +1636,7 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED
 
 
-        Deprecated, Use CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED
+        Deprecated, Use :py:obj:`~.CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED`
 
 
     .. autoattribute:: cuda.bindings.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED
@@ -1705,7 +1708,7 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED
 
 
-        Device supports using the :py:obj:`~.cuMemHostRegister` flag :py:obj:`~.CU_MEMHOSTERGISTER_READ_ONLY` to register memory that must be mapped as read-only to the GPU
+        Device supports using the :py:obj:`~.cuMemHostRegister` flag ``CU_MEMHOSTREGISTER_READ_ONLY`` to register memory that must be mapped as read-only to the GPU
 
 
     .. autoattribute:: cuda.bindings.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_TIMELINE_SEMAPHORE_INTEROP_SUPPORTED
@@ -1717,7 +1720,7 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED
 
 
-        Device supports using the :py:obj:`~.cuMemAllocAsync` and :py:obj:`~.cuMemPool` family of APIs
+        Device supports using the :py:obj:`~.cuMemAllocAsync` and ``cuMemPool`` family of APIs
 
 
     .. autoattribute:: cuda.bindings.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED
@@ -1879,7 +1882,7 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_HOST_NUMA_MEMORY_POOLS_SUPPORTED
 
 
-        Device supports HOST_NUMA location with the :py:obj:`~.cuMemAllocAsync` and :py:obj:`~.cuMemPool` family of APIs
+        Device supports HOST_NUMA location with the :py:obj:`~.cuMemAllocAsync` and ``cuMemPool`` family of APIs
 
 
     .. autoattribute:: cuda.bindings.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_HOST_NUMA_MULTINODE_IPC_SUPPORTED
@@ -1891,7 +1894,7 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_HOST_MEMORY_POOLS_SUPPORTED
 
 
-        Device suports HOST location with the :py:obj:`~.cuMemAllocAsync` and :py:obj:`~.cuMemPool` family of APIs
+        Device supports HOST location with the :py:obj:`~.cuMemAllocAsync` and ``cuMemPool`` family of APIs
 
 
     .. autoattribute:: cuda.bindings.driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_HOST_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED
@@ -2015,7 +2018,7 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUpointer_attribute.CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE
 
 
-        1 if this pointer maps to an allocation that is suitable for :py:obj:`~.cudaIpcGetMemHandle`, 0 otherwise
+        1 if this pointer maps to an allocation that is suitable for :func:`~.cudaIpcGetMemHandle`, 0 otherwise
 
 
     .. autoattribute:: cuda.bindings.driver.CUpointer_attribute.CU_POINTER_ATTRIBUTE_RANGE_START_ADDR
@@ -2158,7 +2161,7 @@ Data types used by CUDA driver
 
 
 
-        If the value is set during compile time, it cannot be set at runtime. Setting it at runtime will return CUDA_ERROR_NOT_PERMITTED. See :py:obj:`~.cuFuncSetAttribute`, :py:obj:`~.cuKernelSetAttribute`
+        If the value is set during compile time, it cannot be set at runtime. Setting it at runtime will return :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`. See :py:obj:`~.cuFuncSetAttribute`, :py:obj:`~.cuKernelSetAttribute`
 
 
     .. autoattribute:: cuda.bindings.driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT
@@ -2168,7 +2171,7 @@ Data types used by CUDA driver
 
 
 
-        If the value is set during compile time, it cannot be set at runtime. Setting it at runtime should return CUDA_ERROR_NOT_PERMITTED. See :py:obj:`~.cuFuncSetAttribute`, :py:obj:`~.cuKernelSetAttribute`
+        If the value is set during compile time, it cannot be set at runtime. Setting it at runtime should return :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`. See :py:obj:`~.cuFuncSetAttribute`, :py:obj:`~.cuKernelSetAttribute`
 
 
     .. autoattribute:: cuda.bindings.driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH
@@ -2178,7 +2181,7 @@ Data types used by CUDA driver
 
 
 
-        If the value is set during compile time, it cannot be set at runtime. Setting it at runtime should return CUDA_ERROR_NOT_PERMITTED. See :py:obj:`~.cuFuncSetAttribute`, :py:obj:`~.cuKernelSetAttribute`
+        If the value is set during compile time, it cannot be set at runtime. Setting it at runtime should return :py:obj:`~.CUDA_ERROR_NOT_PERMITTED`. See :py:obj:`~.cuFuncSetAttribute`, :py:obj:`~.cuKernelSetAttribute`
 
 
     .. autoattribute:: cuda.bindings.driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED
@@ -2188,7 +2191,7 @@ Data types used by CUDA driver
 
 
 
-        CUDA API provides cudaOccupancyMaxActiveClusters to assist with checking whether the desired size can be launched on the current device.
+        CUDA API provides :func:`~.cudaOccupancyMaxActiveClusters` to assist with checking whether the desired size can be launched on the current device.
 
 
 
@@ -2206,7 +2209,7 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE
 
 
-        The block scheduling policy of a function. The value type is CUclusterSchedulingPolicy / cudaClusterSchedulingPolicy. See :py:obj:`~.cuFuncSetAttribute`, :py:obj:`~.cuKernelSetAttribute`
+        The block scheduling policy of a function. The value type is :py:obj:`~.CUclusterSchedulingPolicy` / :py:obj:`~.cudaClusterSchedulingPolicy`. See :py:obj:`~.cuFuncSetAttribute`, :py:obj:`~.cuKernelSetAttribute`
 
 
     .. autoattribute:: cuda.bindings.driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_DEVICE_NODE_UPDATE_SUPPORTED
@@ -3349,7 +3352,7 @@ Data types used by CUDA driver
 
                                                 Handles must be created in advance of creating the node
 
-                                                using :py:obj:`~.cuGraphConditionalHandleCreate`.
+                                                using :func:`~.cuGraphConditionalHandleCreate`.
 
 
 
@@ -3365,7 +3368,7 @@ Data types used by CUDA driver
 
                                                 To set the control value, supply a default value when creating the handle and/or
 
-                                                call :py:obj:`~.cudaGraphSetConditional` from device code.
+                                                call ``cudaGraphSetConditional`` from device code.
 
 
     .. autoattribute:: cuda.bindings.driver.CUgraphNodeType.CU_GRAPH_NODE_TYPE_RESERVED_16
@@ -3603,7 +3606,7 @@ Data types used by CUDA driver
 
         Valid for graph nodes, launches. This attribute is graphs-only, and passing it to a launch in a non-capturing stream will result in an error.
 
-         :py:obj:`~.CUlaunchAttributeValue.deviceUpdatableKernelNode.deviceUpdatable` can only be set to 0 or 1. Setting the field to 1 indicates that the corresponding kernel node should be device-updatable. On success, a handle will be returned via :py:obj:`~.CUlaunchAttributeValue.deviceUpdatableKernelNode.devNode` which can be passed to the various device-side update functions to update the node's kernel parameters from within another kernel. For more information on the types of device updates that can be made, as well as the relevant limitations thereof, see :py:obj:`~.cudaGraphKernelNodeUpdatesApply`.
+         :py:obj:`~.CUlaunchAttributeValue.deviceUpdatableKernelNode.deviceUpdatable` can only be set to 0 or 1. Setting the field to 1 indicates that the corresponding kernel node should be device-updatable. On success, a handle will be returned via :py:obj:`~.CUlaunchAttributeValue.deviceUpdatableKernelNode.devNode` which can be passed to the various device-side update functions to update the node's kernel parameters from within another kernel. For more information on the types of device updates that can be made, as well as the relevant limitations thereof, see ``cudaGraphKernelNodeUpdatesApply``.
 
          Nodes which are device-updatable have additional restrictions compared to regular kernel nodes. Firstly, device-updatable nodes cannot be removed from their graph via :py:obj:`~.cuGraphDestroyNode`. Additionally, once opted-in to this functionality, a node cannot opt out, and any attempt to set the deviceUpdatable attribute to 0 will result in an error. Device-updatable kernel nodes also cannot have their attributes copied to/from another kernel node via :py:obj:`~.cuGraphKernelNodeCopyAttributes`. Graphs containing one or more device-updatable nodes also do not allow multiple instantiation, and neither the graph nor its instantiated version can be passed to :py:obj:`~.cuGraphExecUpdate`.
 
@@ -3984,7 +3987,7 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUresult.CUDA_ERROR_UNSUPPORTED_DEVSIDE_SYNC
 
 
-        This indicates that the code to be compiled by the PTX JIT contains unsupported call to cudaDeviceSynchronize.
+        This indicates that the code to be compiled by the PTX JIT contains unsupported call to :func:`~.cudaDeviceSynchronize`.
 
 
     .. autoattribute:: cuda.bindings.driver.CUresult.CUDA_ERROR_CONTAINED
@@ -4290,7 +4293,7 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUresult.CUDA_ERROR_STREAM_CAPTURE_IMPLICIT
 
 
-        This error indicates a disallowed implicit dependency on a current capture sequence from cudaStreamLegacy.
+        This error indicates a disallowed implicit dependency on a current capture sequence from :py:obj:`~.cudaStreamLegacy`.
 
 
     .. autoattribute:: cuda.bindings.driver.CUresult.CUDA_ERROR_CAPTURED_EVENT
@@ -4890,7 +4893,7 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_FABRIC
 
 
-        Allows a fabric handle to be used for exporting. (CUmemFabricHandle)
+        Allows a fabric handle to be used for exporting. (:py:obj:`~.CUmemFabricHandle`)
 
 
     .. autoattribute:: cuda.bindings.driver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_MAX
@@ -4955,7 +4958,7 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_INVISIBLE
 
 
-        Location is not visible but device is accessible, id is always CU_DEVICE_INVALID
+        Location is not visible but device is accessible, id is always :py:obj:`~.CU_DEVICE_INVALID`
 
 
     .. autoattribute:: cuda.bindings.driver.CUmemLocationType.CU_MEM_LOCATION_TYPE_MAX
@@ -5110,7 +5113,7 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES
 
 
-        (value type = int) Allow cuMemAllocAsync to use memory asynchronously freed in another streams as long as a stream ordering dependency of the allocating stream on the free action exists. Cuda events and null stream interactions can create the required stream ordered dependencies. (default enabled)
+        (value type = int) Allow :func:`~.cuMemAllocAsync` to use memory asynchronously freed in other streams as long as a stream ordering dependency of the allocating stream on the free action exists. CUDA events and null stream interactions can create the required stream ordered dependencies. (default enabled)
 
 
     .. autoattribute:: cuda.bindings.driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC
@@ -5122,67 +5125,67 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES
 
 
-        (value type = int) Allow cuMemAllocAsync to insert new stream dependencies in order to establish the stream ordering required to reuse a piece of memory released by cuMemFreeAsync (default enabled).
+        (value type = int) Allow :func:`~.cuMemAllocAsync` to insert new stream dependencies in order to establish the stream ordering required to reuse a piece of memory released by :func:`~.cuMemFreeAsync` (default enabled).
 
 
     .. autoattribute:: cuda.bindings.driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD
 
 
-        (value type = cuuint64_t) Amount of reserved memory in bytes to hold onto before trying to release memory back to the OS. When more than the release threshold bytes of memory are held by the memory pool, the allocator will try to release memory back to the OS on the next call to stream, event or context synchronize. (default 0)
+        (value type = :py:obj:`~.cuuint64_t`) Amount of reserved memory in bytes to hold onto before trying to release memory back to the OS. When more than the release threshold bytes of memory are held by the memory pool, the allocator will try to release memory back to the OS on the next call to stream, event or context synchronize. (default 0)
 
 
     .. autoattribute:: cuda.bindings.driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT
 
 
-        (value type = cuuint64_t) Amount of backing memory currently allocated for the mempool.
+        (value type = :py:obj:`~.cuuint64_t`) Amount of backing memory currently allocated for the mempool.
 
 
     .. autoattribute:: cuda.bindings.driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH
 
 
-        (value type = cuuint64_t) High watermark of backing memory allocated for the mempool since the last time it was reset. High watermark can only be reset to zero.
+        (value type = :py:obj:`~.cuuint64_t`) High watermark of backing memory allocated for the mempool since the last time it was reset. High watermark can only be reset to zero.
 
 
     .. autoattribute:: cuda.bindings.driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_USED_MEM_CURRENT
 
 
-        (value type = cuuint64_t) Amount of memory from the pool that is currently in use by the application.
+        (value type = :py:obj:`~.cuuint64_t`) Amount of memory from the pool that is currently in use by the application.
 
 
     .. autoattribute:: cuda.bindings.driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_USED_MEM_HIGH
 
 
-        (value type = cuuint64_t) High watermark of the amount of memory from the pool that was in use by the application since the last time it was reset. High watermark can only be reset to zero.
+        (value type = :py:obj:`~.cuuint64_t`) High watermark of the amount of memory from the pool that was in use by the application since the last time it was reset. High watermark can only be reset to zero.
 
 
     .. autoattribute:: cuda.bindings.driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_ALLOCATION_TYPE
 
 
-        (value type = CUmemAllocationType) The allocation type of the mempool
+        (value type = :py:obj:`~.CUmemAllocationType`) The allocation type of the mempool
 
 
     .. autoattribute:: cuda.bindings.driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_EXPORT_HANDLE_TYPES
 
 
-        (value type = CUmemAllocationHandleType) Available export handle types for the mempool. For imported pools this value is always CU_MEM_HANDLE_TYPE_NONE as an imported pool cannot be re-exported
+        (value type = :py:obj:`~.CUmemAllocationHandleType`) Available export handle types for the mempool. For imported pools this value is always :py:obj:`~.CU_MEM_HANDLE_TYPE_NONE` as an imported pool cannot be re-exported
 
 
     .. autoattribute:: cuda.bindings.driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_LOCATION_ID
 
 
-        (value type = int) The location id for the mempool. If the location type for this pool is CU_MEM_LOCATION_TYPE_INVISIBLE then ID will be CU_DEVICE_INVALID.
+        (value type = int) The location id for the mempool. If the location type for this pool is :py:obj:`~.CU_MEM_LOCATION_TYPE_INVISIBLE` then ID will be :py:obj:`~.CU_DEVICE_INVALID`.
 
 
     .. autoattribute:: cuda.bindings.driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_LOCATION_TYPE
 
 
-        (value type = CUmemLocationType) The location type for the mempool. For imported memory pools where the device is not directly visible to the importing process or pools imported via fabric handles across nodes this will be CU_MEM_LOCATION_TYPE_INVISIBLE.
+        (value type = :py:obj:`~.CUmemLocationType`) The location type for the mempool. For imported memory pools where the device is not directly visible to the importing process or pools imported via fabric handles across nodes this will be :py:obj:`~.CU_MEM_LOCATION_TYPE_INVISIBLE`.
 
 
     .. autoattribute:: cuda.bindings.driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_MAX_POOL_SIZE
 
 
-        (value type = cuuint64_t) Maximum size of the pool in bytes, this value may be higher than what was initially passed to cuMemPoolCreate due to alignment requirements. A value of 0 indicates no maximum size. For CU_MEM_ALLOCATION_TYPE_MANAGED and IPC imported pools this value will be system dependent.
+        (value type = :py:obj:`~.cuuint64_t`) Maximum size of the pool in bytes; this value may be higher than what was initially passed to :func:`~.cuMemPoolCreate` due to alignment requirements. A value of 0 indicates no maximum size. For :py:obj:`~.CU_MEM_ALLOCATION_TYPE_MANAGED` and IPC imported pools this value will be system dependent.
 
 
     .. autoattribute:: cuda.bindings.driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_HW_DECOMPRESS_ENABLED
@@ -5239,7 +5242,7 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUmemcpy3DOperandType.CU_MEMCPY_OPERAND_TYPE_ARRAY
 
 
-        Memcpy operand is a CUarray.
+        Memcpy operand is a :py:obj:`~.CUarray`.
 
 
     .. autoattribute:: cuda.bindings.driver.CUmemcpy3DOperandType.CU_MEMCPY_OPERAND_TYPE_MAX
@@ -5249,25 +5252,25 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUgraphMem_attribute.CU_GRAPH_MEM_ATTR_USED_MEM_CURRENT
 
 
-        (value type = cuuint64_t) Amount of memory, in bytes, currently associated with graphs
+        (value type = :py:obj:`~.cuuint64_t`) Amount of memory, in bytes, currently associated with graphs
 
 
     .. autoattribute:: cuda.bindings.driver.CUgraphMem_attribute.CU_GRAPH_MEM_ATTR_USED_MEM_HIGH
 
 
-        (value type = cuuint64_t) High watermark of memory, in bytes, associated with graphs since the last time it was reset. High watermark can only be reset to zero.
+        (value type = :py:obj:`~.cuuint64_t`) High watermark of memory, in bytes, associated with graphs since the last time it was reset. High watermark can only be reset to zero.
 
 
     .. autoattribute:: cuda.bindings.driver.CUgraphMem_attribute.CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT
 
 
-        (value type = cuuint64_t) Amount of memory, in bytes, currently allocated for use by the CUDA graphs asynchronous allocator.
+        (value type = :py:obj:`~.cuuint64_t`) Amount of memory, in bytes, currently allocated for use by the CUDA graphs asynchronous allocator.
 
 
     .. autoattribute:: cuda.bindings.driver.CUgraphMem_attribute.CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH
 
 
-        (value type = cuuint64_t) High watermark of memory, in bytes, currently allocated for use by the CUDA graphs asynchronous allocator.
+        (value type = :py:obj:`~.cuuint64_t`) High watermark of memory, in bytes, currently allocated for use by the CUDA graphs asynchronous allocator.
 
 .. autoclass:: cuda.bindings.driver.CUgraphChildGraphNodeOwnership
 
@@ -5284,7 +5287,7 @@ Data types used by CUDA driver
 
 
 
-        The following restrictions apply to child graphs after they have been moved: Cannot be independently instantiated or destroyed; Cannot be added as a child graph of a separate parent graph; Cannot be used as an argument to cuGraphExecUpdate; Cannot have additional memory allocation or free nodes added.
+        The following restrictions apply to child graphs after they have been moved: Cannot be independently instantiated or destroyed; Cannot be added as a child graph of a separate parent graph; Cannot be used as an argument to :py:obj:`~.cuGraphExecUpdate`; Cannot have additional memory allocation or free nodes added.
 
 
     .. autoattribute:: cuda.bindings.driver.CUgraphChildGraphNodeOwnership.CU_GRAPH_CHILD_GRAPH_OWNERSHIP_INVALID
@@ -5361,49 +5364,49 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_KERNEL_NODE_PARAMS
 
 
-        Adds CUDA_KERNEL_NODE_PARAMS values to output
+        Adds :py:obj:`~.CUDA_KERNEL_NODE_PARAMS` values to output
 
 
     .. autoattribute:: cuda.bindings.driver.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_MEMCPY_NODE_PARAMS
 
 
-        Adds CUDA_MEMCPY3D values to output
+        Adds :py:obj:`~.CUDA_MEMCPY3D` values to output
 
 
     .. autoattribute:: cuda.bindings.driver.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_MEMSET_NODE_PARAMS
 
 
-        Adds CUDA_MEMSET_NODE_PARAMS values to output
+        Adds :py:obj:`~.CUDA_MEMSET_NODE_PARAMS` values to output
 
 
     .. autoattribute:: cuda.bindings.driver.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_HOST_NODE_PARAMS
 
 
-        Adds CUDA_HOST_NODE_PARAMS values to output
+        Adds :py:obj:`~.CUDA_HOST_NODE_PARAMS` values to output
 
 
     .. autoattribute:: cuda.bindings.driver.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_EVENT_NODE_PARAMS
 
 
-        Adds CUevent handle from record and wait nodes to output
+        Adds :py:obj:`~.CUevent` handle from record and wait nodes to output
 
 
     .. autoattribute:: cuda.bindings.driver.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_SIGNAL_NODE_PARAMS
 
 
-        Adds CUDA_EXT_SEM_SIGNAL_NODE_PARAMS values to output
+        Adds :py:obj:`~.CUDA_EXT_SEM_SIGNAL_NODE_PARAMS` values to output
 
 
     .. autoattribute:: cuda.bindings.driver.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_EXT_SEMAS_WAIT_NODE_PARAMS
 
 
-        Adds CUDA_EXT_SEM_WAIT_NODE_PARAMS values to output
+        Adds :py:obj:`~.CUDA_EXT_SEM_WAIT_NODE_PARAMS` values to output
 
 
     .. autoattribute:: cuda.bindings.driver.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_KERNEL_NODE_ATTRIBUTES
 
 
-        Adds CUkernelNodeAttrValue values to output
+        Adds :py:obj:`~.CUkernelNodeAttrValue` values to output
 
 
     .. autoattribute:: cuda.bindings.driver.CUgraphDebugDot_flags.CU_GRAPH_DEBUG_DOT_FLAGS_HANDLES
@@ -5472,7 +5475,7 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUgraphInstantiate_flags.CUDA_GRAPH_INSTANTIATE_FLAG_DEVICE_LAUNCH
 
 
-        Instantiate the graph to be launchable from the device. This flag can only be used on platforms which support unified addressing. This flag cannot be used in conjunction with CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH.
+        Instantiate the graph to be launchable from the device. This flag can only be used on platforms which support unified addressing. This flag cannot be used in conjunction with :py:obj:`~.CUDA_GRAPH_INSTANTIATE_FLAG_AUTO_FREE_ON_LAUNCH`.
 
 
     .. autoattribute:: cuda.bindings.driver.CUgraphInstantiate_flags.CUDA_GRAPH_INSTANTIATE_FLAG_USE_NODE_PRIORITY
@@ -5491,7 +5494,7 @@ Data types used by CUDA driver
     .. autoattribute:: cuda.bindings.driver.CUdeviceNumaConfig.CU_DEVICE_NUMA_CONFIG_NUMA_NODE
 
 
-        The GPU is a NUMA node, CU_DEVICE_ATTRIBUTE_NUMA_ID contains its NUMA ID
+        The GPU is a NUMA node, :py:obj:`~.CU_DEVICE_ATTRIBUTE_NUMA_ID` contains its NUMA ID
 
 .. autoclass:: cuda.bindings.driver.CUprocessState
 
@@ -6441,7 +6444,7 @@ Data types used by CUDA driver
 
 
 
-    Stream handle that can be passed as a CUstream to use an implicit stream with legacy synchronization behavior.
+    Stream handle that can be passed as a :py:obj:`~.CUstream` to use an implicit stream with legacy synchronization behavior.
 
 
 
@@ -6453,7 +6456,7 @@ Data types used by CUDA driver
 
 
 
-    Stream handle that can be passed as a CUstream to use an implicit stream with per-thread synchronization behavior.
+    Stream handle that can be passed as a :py:obj:`~.CUstream` to use an implicit stream with per-thread synchronization behavior.
 
 
 
@@ -6571,11 +6574,11 @@ Data types used by CUDA driver
 
 .. autoattribute:: cuda.bindings.driver.CUDA_ARRAY3D_LAYERED
 
-    If set, the CUDA array is a collection of layers, where each layer is either a 1D or a 2D array and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies the number of layers, not the depth of a 3D array.
+    If set, the CUDA array is a collection of layers, where each layer is either a 1D or a 2D array and the Depth member of :py:obj:`~.CUDA_ARRAY3D_DESCRIPTOR` specifies the number of layers, not the depth of a 3D array.
 
 .. autoattribute:: cuda.bindings.driver.CUDA_ARRAY3D_2DARRAY
 
-    Deprecated, use CUDA_ARRAY3D_LAYERED
+    Deprecated, use :py:obj:`~.CUDA_ARRAY3D_LAYERED`
 
 .. autoattribute:: cuda.bindings.driver.CUDA_ARRAY3D_SURFACE_LDST
 
@@ -6639,7 +6642,7 @@ Data types used by CUDA driver
 
 .. autoattribute:: cuda.bindings.driver.CU_LAUNCH_PARAM_END_AS_INT
 
-    C++ compile time constant for CU_LAUNCH_PARAM_END
+    C++ compile time constant for :py:obj:`~.CU_LAUNCH_PARAM_END`
 
 .. autoattribute:: cuda.bindings.driver.CU_LAUNCH_PARAM_END
 
@@ -6647,7 +6650,7 @@ Data types used by CUDA driver
 
 .. autoattribute:: cuda.bindings.driver.CU_LAUNCH_PARAM_BUFFER_POINTER_AS_INT
 
-    C++ compile time constant for CU_LAUNCH_PARAM_BUFFER_POINTER
+    C++ compile time constant for :py:obj:`~.CU_LAUNCH_PARAM_BUFFER_POINTER`
 
 .. autoattribute:: cuda.bindings.driver.CU_LAUNCH_PARAM_BUFFER_POINTER
 
@@ -6655,7 +6658,7 @@ Data types used by CUDA driver
 
 .. autoattribute:: cuda.bindings.driver.CU_LAUNCH_PARAM_BUFFER_SIZE_AS_INT
 
-    C++ compile time constant for CU_LAUNCH_PARAM_BUFFER_SIZE
+    C++ compile time constant for :py:obj:`~.CU_LAUNCH_PARAM_BUFFER_SIZE`
 
 .. autoattribute:: cuda.bindings.driver.CU_LAUNCH_PARAM_BUFFER_SIZE
 
@@ -6682,6 +6685,9 @@ Data types used by CUDA driver
     Indicates that timeout for :py:obj:`~.cuEGLStreamConsumerAcquireFrame` is infinite.
 
 
+.. _cuda-bindings-driver-cuda_error:
+.. _cuda-bindings-driver-group__cuda__error:
+
 Error Handling
 --------------
 
@@ -6690,6 +6696,9 @@ This section describes the error handling functions of the low-level CUDA driver
 .. autofunction:: cuda.bindings.driver.cuGetErrorString
 .. autofunction:: cuda.bindings.driver.cuGetErrorName
 
+.. _cuda-bindings-driver-cuda_initialize:
+.. _cuda-bindings-driver-group__cuda__initialize:
+
 Initialization
 --------------
 
@@ -6697,6 +6706,9 @@ This section describes the initialization functions of the low-level CUDA driver
 
 .. autofunction:: cuda.bindings.driver.cuInit
 
+.. _cuda-bindings-driver-cuda_version:
+.. _cuda-bindings-driver-group__cuda__version:
+
 Version Management
 ------------------
 
@@ -6704,6 +6716,9 @@ This section describes the version management functions of the low-level CUDA dr
 
 .. autofunction:: cuda.bindings.driver.cuDriverGetVersion
 
+.. _cuda-bindings-driver-cuda_device:
+.. _cuda-bindings-driver-group__cuda__device:
+
 Device Management
 -----------------
 
@@ -6725,6 +6740,9 @@ This section describes the device management functions of the low-level CUDA dri
 .. autofunction:: cuda.bindings.driver.cuDeviceGetExecAffinitySupport
 .. autofunction:: cuda.bindings.driver.cuFlushGPUDirectRDMAWrites
 
+.. _cuda-bindings-driver-cuda_primary_ctx:
+.. _cuda-bindings-driver-group__cuda__primary__ctx:
+
 Primary Context Management
 --------------------------
 
@@ -6740,6 +6758,9 @@ The primary context is unique per device and shared with the CUDA runtime API. T
 .. autofunction:: cuda.bindings.driver.cuDevicePrimaryCtxGetState
 .. autofunction:: cuda.bindings.driver.cuDevicePrimaryCtxReset
 
+.. _cuda-bindings-driver-cuda_ctx:
+.. _cuda-bindings-driver-group__cuda__ctx:
+
 Context Management
 ------------------
 
@@ -6773,6 +6794,9 @@ Please note that some functions are described in Primary Context Management sect
 .. autofunction:: cuda.bindings.driver.cuCtxRecordEvent
 .. autofunction:: cuda.bindings.driver.cuCtxWaitEvent
 
+.. _cuda-bindings-driver-cuda_module:
+.. _cuda-bindings-driver-group__cuda__module:
+
 Module Management
 -----------------
 
@@ -6807,6 +6831,9 @@ This section describes the module management functions of the low-level CUDA dri
 .. autofunction:: cuda.bindings.driver.cuLinkComplete
 .. autofunction:: cuda.bindings.driver.cuLinkDestroy
 
+.. _cuda-bindings-driver-cuda_library:
+.. _cuda-bindings-driver-group__cuda__library:
+
 Library Management
 ------------------
 
@@ -6831,6 +6858,9 @@ This section describes the library management functions of the low-level CUDA dr
 .. autofunction:: cuda.bindings.driver.cuKernelGetParamInfo
 .. autofunction:: cuda.bindings.driver.cuKernelGetParamCount
 
+.. _cuda-bindings-driver-cuda_mem:
+.. _cuda-bindings-driver-group__cuda__mem:
+
 Memory Management
 -----------------
 
@@ -6941,6 +6971,9 @@ This section describes the memory management functions of the low-level CUDA dri
 .. autofunction:: cuda.bindings.driver.cuMemGetHandleForAddressRange
 .. autofunction:: cuda.bindings.driver.cuMemBatchDecompressAsync
 
+.. _cuda-bindings-driver-cuda_va:
+.. _cuda-bindings-driver-group__cuda__va:
+
 Virtual Memory Management
 -------------------------
 
@@ -6961,6 +6994,9 @@ This section describes the virtual memory management functions of the low-level
 .. autofunction:: cuda.bindings.driver.cuMemGetAllocationPropertiesFromHandle
 .. autofunction:: cuda.bindings.driver.cuMemRetainAllocationHandle
 
+.. _cuda-bindings-driver-cuda_malloc_async:
+.. _cuda-bindings-driver-group__cuda__malloc__async:
+
 Stream Ordered Memory Allocator
 -------------------------------
 
@@ -6986,7 +7022,7 @@ The allocator is free to reallocate the memory as long as it can guarantee that
 
 
 
-Whether or not a device supports the integrated stream ordered memory allocator may be queried by calling cuDeviceGetAttribute() with the device attribute CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED
+Whether or not a device supports the integrated stream ordered memory allocator may be queried by calling :func:`~.cuDeviceGetAttribute` with the device attribute :py:obj:`~.CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED`.
 
 .. autofunction:: cuda.bindings.driver.cuMemFreeAsync
 .. autofunction:: cuda.bindings.driver.cuMemAllocAsync
@@ -7006,6 +7042,9 @@ Whether or not a device supports the integrated stream ordered memory allocator
 .. autofunction:: cuda.bindings.driver.cuMemPoolExportPointer
 .. autofunction:: cuda.bindings.driver.cuMemPoolImportPointer
 
+.. _cuda-bindings-driver-cuda_multicast:
+.. _cuda-bindings-driver-group__cuda__multicast:
+
 Multicast Object Management
 ---------------------------
 
@@ -7019,7 +7058,7 @@ This section describes the CUDA multicast object operations exposed by the low-l
 
 
 
-A multicast object created via cuMulticastCreate enables certain memory operations to be broadcast to a team of devices. Devices can be added to a multicast object via cuMulticastAddDevice. Memory can be bound on each participating device via cuMulticastBindMem, cuMulticastBindMem_v2, cuMulticastBindAddr, or cuMulticastBindAddr_v2. Multicast objects can be mapped into a device's virtual address space using the virtual memmory management APIs (see cuMemMap and cuMemSetAccess).
+A multicast object created via :func:`~.cuMulticastCreate` enables certain memory operations to be broadcast to a team of devices. Devices can be added to a multicast object via :func:`~.cuMulticastAddDevice`. Memory can be bound on each participating device via :func:`~.cuMulticastBindMem`, :func:`~.cuMulticastBindMem_v2`, :func:`~.cuMulticastBindAddr`, or :func:`~.cuMulticastBindAddr_v2`. Multicast objects can be mapped into a device's virtual address space using the virtual memory management APIs (see :func:`~.cuMemMap` and :func:`~.cuMemSetAccess`).
 
 
 
@@ -7029,7 +7068,7 @@ A multicast object created via cuMulticastCreate enables certain memory operatio
 
 
 
-Support for multicast on a specific device can be queried using the device attribute CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED
+Support for multicast on a specific device can be queried using the device attribute :py:obj:`~.CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED`.
 
 .. autofunction:: cuda.bindings.driver.cuMulticastCreate
 .. autofunction:: cuda.bindings.driver.cuMulticastAddDevice
@@ -7040,6 +7079,9 @@ Support for multicast on a specific device can be queried using the device attri
 .. autofunction:: cuda.bindings.driver.cuMulticastUnbind
 .. autofunction:: cuda.bindings.driver.cuMulticastGetGranularity
 
+.. _cuda-bindings-driver-cuda_logical_endpoint:
+.. _cuda-bindings-driver-group__cuda__logical__endpoint:
+
 Logical Endpoint
 ----------------
 
@@ -7093,6 +7135,9 @@ This section describes the logical endpoint functions of the low-level CUDA driv
 .. autofunction:: cuda.bindings.driver.cuLogicalEndpointGetLimits
 .. autofunction:: cuda.bindings.driver.cuLogicalEndpointQuery
 
+.. _cuda-bindings-driver-cuda_unified:
+.. _cuda-bindings-driver-group__cuda__unified:
+
 Unified Addressing
 ------------------
 
@@ -7116,7 +7161,7 @@ CUDA devices can share a unified address space with the host. For these devices
 
 
 
-Whether or not a device supports unified addressing may be queried by calling cuDeviceGetAttribute() with the device attribute CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING.
+Whether or not a device supports unified addressing may be queried by calling :func:`~.cuDeviceGetAttribute` with the device attribute :py:obj:`~.CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING`.
 
 Unified addressing is automatically enabled in 64-bit processes
 
@@ -7128,9 +7173,9 @@ Unified addressing is automatically enabled in 64-bit processes
 
 
 
-It is possible to look up information about the memory which backs a pointer value. For instance, one may want to know if a pointer points to host or device memory. As another example, in the case of device memory, one may want to know on which CUDA device the memory resides. These properties may be queried using the function cuPointerGetAttribute()
+It is possible to look up information about the memory which backs a pointer value. For instance, one may want to know if a pointer points to host or device memory. As another example, in the case of device memory, one may want to know on which CUDA device the memory resides. These properties may be queried using the function :func:`~.cuPointerGetAttribute`.
 
-Since pointers are unique, it is not necessary to specify information about the pointers specified to the various copy functions in the CUDA API. The function cuMemcpy() may be used to perform a copy between two pointers, ignoring whether they point to host or device memory (making cuMemcpyHtoD(), cuMemcpyDtoD(), and cuMemcpyDtoH() unnecessary for devices supporting unified addressing). For multidimensional copies, the memory type CU_MEMORYTYPE_UNIFIED may be used to specify that the CUDA driver should infer the location of the pointer from its value.
+Since pointers are unique, it is not necessary to specify information about the pointers specified to the various copy functions in the CUDA API. The function :func:`~.cuMemcpy` may be used to perform a copy between two pointers, ignoring whether they point to host or device memory (making :func:`~.cuMemcpyHtoD`, :func:`~.cuMemcpyDtoD`, and :func:`~.cuMemcpyDtoH` unnecessary for devices supporting unified addressing). For multidimensional copies, the memory type :py:obj:`~.CU_MEMORYTYPE_UNIFIED` may be used to specify that the CUDA driver should infer the location of the pointer from its value.
 
 
 
@@ -7140,11 +7185,11 @@ Since pointers are unique, it is not necessary to specify information about the
 
 
 
-All host memory allocated in all contexts using cuMemAllocHost() and cuMemHostAlloc() is always directly accessible from all contexts on all devices that support unified addressing. This is the case regardless of whether or not the flags CU_MEMHOSTALLOC_PORTABLE and CU_MEMHOSTALLOC_DEVICEMAP are specified.
+All host memory allocated in all contexts using :func:`~.cuMemAllocHost` and :func:`~.cuMemHostAlloc` is always directly accessible from all contexts on all devices that support unified addressing. This is the case regardless of whether or not the flags :py:obj:`~.CU_MEMHOSTALLOC_PORTABLE` and :py:obj:`~.CU_MEMHOSTALLOC_DEVICEMAP` are specified.
 
-The pointer value through which allocated host memory may be accessed in kernels on all devices that support unified addressing is the same as the pointer value through which that memory is accessed on the host, so it is not necessary to call cuMemHostGetDevicePointer() to get the device pointer for these allocations.
+The pointer value through which allocated host memory may be accessed in kernels on all devices that support unified addressing is the same as the pointer value through which that memory is accessed on the host, so it is not necessary to call :func:`~.cuMemHostGetDevicePointer` to get the device pointer for these allocations.
 
-Note that this is not the case for memory allocated using the flag CU_MEMHOSTALLOC_WRITECOMBINED, as discussed below.
+Note that this is not the case for memory allocated using the flag :py:obj:`~.CU_MEMHOSTALLOC_WRITECOMBINED`, as discussed below.
 
 
 
@@ -7154,7 +7199,7 @@ Note that this is not the case for memory allocated using the flag CU_MEMHOSTALL
 
 
 
-Upon enabling direct access from a context that supports unified addressing to another peer context that supports unified addressing using cuCtxEnablePeerAccess() all memory allocated in the peer context using cuMemAlloc() and cuMemAllocPitch() will immediately be accessible by the current context. The device pointer value through which any peer memory may be accessed in the current context is the same pointer value through which that memory may be accessed in the peer context.
+Upon enabling direct access from a context that supports unified addressing to another peer context that supports unified addressing using :func:`~.cuCtxEnablePeerAccess` all memory allocated in the peer context using :func:`~.cuMemAlloc` and :func:`~.cuMemAllocPitch` will immediately be accessible by the current context. The device pointer value through which any peer memory may be accessed in the current context is the same pointer value through which that memory may be accessed in the peer context.
 
 
 
@@ -7164,9 +7209,9 @@ Upon enabling direct access from a context that supports unified addressing to a
 
 
 
-Not all memory may be accessed on devices through the same pointer value through which they are accessed on the host. These exceptions are host memory registered using cuMemHostRegister() and host memory allocated using the flag CU_MEMHOSTALLOC_WRITECOMBINED. For these exceptions, there exists a distinct host and device address for the memory. The device address is guaranteed to not overlap any valid host pointer range and is guaranteed to have the same value across all contexts that support unified addressing.
+Not all memory may be accessed on devices through the same pointer value through which they are accessed on the host. These exceptions are host memory registered using :func:`~.cuMemHostRegister` and host memory allocated using the flag :py:obj:`~.CU_MEMHOSTALLOC_WRITECOMBINED`. For these exceptions, there exists a distinct host and device address for the memory. The device address is guaranteed to not overlap any valid host pointer range and is guaranteed to have the same value across all contexts that support unified addressing.
 
-This device address may be queried using cuMemHostGetDevicePointer() when a context using unified addressing is current. Either the host or the unified device pointer value may be used to refer to this memory through cuMemcpy() and similar functions using the CU_MEMORYTYPE_UNIFIED memory type.
+This device address may be queried using :func:`~.cuMemHostGetDevicePointer` when a context using unified addressing is current. Either the host or the unified device pointer value may be used to refer to this memory through :func:`~.cuMemcpy` and similar functions using the :py:obj:`~.CU_MEMORYTYPE_UNIFIED` memory type.
 
 .. autofunction:: cuda.bindings.driver.cuPointerGetAttribute
 .. autofunction:: cuda.bindings.driver.cuMemPrefetchAsync
@@ -7179,6 +7224,9 @@ This device address may be queried using cuMemHostGetDevicePointer() when a cont
 .. autofunction:: cuda.bindings.driver.cuPointerSetAttribute
 .. autofunction:: cuda.bindings.driver.cuPointerGetAttributes
 
+.. _cuda-bindings-driver-cuda_stream:
+.. _cuda-bindings-driver-group__cuda__stream:
+
 Stream Management
 -----------------
 
@@ -7232,6 +7280,9 @@ This section describes the stream management functions of the low-level CUDA dri
 .. autofunction:: cuda.bindings.driver.cuStreamGetAttribute
 .. autofunction:: cuda.bindings.driver.cuStreamSetAttribute
 
+.. _cuda-bindings-driver-cuda_event:
+.. _cuda-bindings-driver-group__cuda__event:
+
 Event Management
 ----------------
 
@@ -7245,6 +7296,9 @@ This section describes the event management functions of the low-level CUDA driv
 .. autofunction:: cuda.bindings.driver.cuEventDestroy
 .. autofunction:: cuda.bindings.driver.cuEventElapsedTime
 
+.. _cuda-bindings-driver-cuda_extres_interop:
+.. _cuda-bindings-driver-group__cuda__extres__interop:
+
 External Resource Interoperability
 ----------------------------------
 
@@ -7259,6 +7313,9 @@ This section describes the external resource interoperability functions of the l
 .. autofunction:: cuda.bindings.driver.cuWaitExternalSemaphoresAsync
 .. autofunction:: cuda.bindings.driver.cuDestroyExternalSemaphore
 
+.. _cuda-bindings-driver-cuda_memop:
+.. _cuda-bindings-driver-group__cuda__memop:
+
 Stream Memory Operations
 ------------------------
 
@@ -7266,23 +7323,23 @@ This section describes the stream memory operations of the low-level CUDA driver
 
 
 
-Support for the CU_STREAM_WAIT_VALUE_NOR flag can be queried with ``CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR_V2``.
+Support for the :py:obj:`~.CU_STREAM_WAIT_VALUE_NOR` flag can be queried with ``CU_DEVICE_ATTRIBUTE_CAN_USE_STREAM_WAIT_VALUE_NOR_V2``.
 
 
 
-Support for the cuStreamWriteValue64() and cuStreamWaitValue64() functions, as well as for the CU_STREAM_MEM_OP_WAIT_VALUE_64 and CU_STREAM_MEM_OP_WRITE_VALUE_64 flags, can be queried with CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS.
+Support for the :func:`~.cuStreamWriteValue64` and :func:`~.cuStreamWaitValue64` functions, as well as for the :py:obj:`~.CU_STREAM_MEM_OP_WAIT_VALUE_64` and :py:obj:`~.CU_STREAM_MEM_OP_WRITE_VALUE_64` flags, can be queried with :py:obj:`~.CU_DEVICE_ATTRIBUTE_CAN_USE_64_BIT_STREAM_MEM_OPS`.
 
 
 
-Support for both CU_STREAM_WAIT_VALUE_FLUSH and CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES requires dedicated platform hardware features and can be queried with cuDeviceGetAttribute() and CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES.
+Support for both :py:obj:`~.CU_STREAM_WAIT_VALUE_FLUSH` and :py:obj:`~.CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES` requires dedicated platform hardware features and can be queried with :func:`~.cuDeviceGetAttribute` and :py:obj:`~.CU_DEVICE_ATTRIBUTE_CAN_FLUSH_REMOTE_WRITES`.
 
 
 
-Note that all memory pointers passed as parameters to these operations are device pointers. Where necessary a device pointer should be obtained, for example with cuMemHostGetDevicePointer().
+Note that all memory pointers passed as parameters to these operations are device pointers. Where necessary a device pointer should be obtained, for example with :func:`~.cuMemHostGetDevicePointer`.
 
 
 
-None of the operations accepts pointers to managed memory buffers (cuMemAllocManaged).
+None of the operations accepts pointers to managed memory buffers (:func:`~.cuMemAllocManaged`).
 
 
 
@@ -7294,6 +7351,9 @@ Warning: Improper use of these APIs may deadlock the application. Synchronizatio
 .. autofunction:: cuda.bindings.driver.cuStreamWriteValue64
 .. autofunction:: cuda.bindings.driver.cuStreamBatchMemOp
 
+.. _cuda-bindings-driver-cuda_exec:
+.. _cuda-bindings-driver-group__cuda__exec:
+
 Execution Control
 -----------------
 
@@ -7324,6 +7384,9 @@ This section describes the execution control functions of the low-level CUDA dri
 .. autofunction:: cuda.bindings.driver.cuLaunchHostFunc
 .. autofunction:: cuda.bindings.driver.cuLaunchHostFunc_v2
 
+.. _cuda-bindings-driver-cuda_graph:
+.. _cuda-bindings-driver-group__cuda__graph:
+
 Graph Management
 ----------------
 
@@ -7418,6 +7481,9 @@ This section describes the graph management functions of the low-level CUDA driv
 .. autofunction:: cuda.bindings.driver.cuGraphExecNodeSetParams
 .. autofunction:: cuda.bindings.driver.cuGraphConditionalHandleCreate
 
+.. _cuda-bindings-driver-cuda_occupancy:
+.. _cuda-bindings-driver-group__cuda__occupancy:
+
 Occupancy
 ---------
 
@@ -7431,6 +7497,9 @@ This section describes the occupancy calculation functions of the low-level CUDA
 .. autofunction:: cuda.bindings.driver.cuOccupancyMaxPotentialClusterSize
 .. autofunction:: cuda.bindings.driver.cuOccupancyMaxActiveClusters
 
+.. _cuda-bindings-driver-cuda_texobject:
+.. _cuda-bindings-driver-group__cuda__texobject:
+
 Texture Object Management
 -------------------------
 
@@ -7442,6 +7511,9 @@ This section describes the texture object management functions of the low-level
 .. autofunction:: cuda.bindings.driver.cuTexObjectGetTextureDesc
 .. autofunction:: cuda.bindings.driver.cuTexObjectGetResourceViewDesc
 
+.. _cuda-bindings-driver-cuda_surfobject:
+.. _cuda-bindings-driver-group__cuda__surfobject:
+
 Surface Object Management
 -------------------------
 
@@ -7451,6 +7523,9 @@ This section describes the surface object management functions of the low-level
 .. autofunction:: cuda.bindings.driver.cuSurfObjectDestroy
 .. autofunction:: cuda.bindings.driver.cuSurfObjectGetResourceDesc
 
+.. _cuda-bindings-driver-cuda_tensor_memory:
+.. _cuda-bindings-driver-group__cuda__tensor__memory:
+
 Tensor Map Object Managment
 ---------------------------
 
@@ -7461,6 +7536,9 @@ This section describes the tensor map object management functions of the low-lev
 .. autofunction:: cuda.bindings.driver.cuTensorMapEncodeIm2colWide
 .. autofunction:: cuda.bindings.driver.cuTensorMapReplaceAddress
 
+.. _cuda-bindings-driver-cuda_peer_access:
+.. _cuda-bindings-driver-group__cuda__peer__access:
+
 Peer Context Memory Access
 --------------------------
 
@@ -7472,6 +7550,9 @@ This section describes the direct peer context memory access functions of the lo
 .. autofunction:: cuda.bindings.driver.cuDeviceGetP2PAttribute
 .. autofunction:: cuda.bindings.driver.cuDeviceGetP2PAtomicCapabilities
 
+.. _cuda-bindings-driver-cuda_graphics:
+.. _cuda-bindings-driver-group__cuda__graphics:
+
 Graphics Interoperability
 -------------------------
 
@@ -7485,6 +7566,9 @@ This section describes the graphics interoperability functions of the low-level
 .. autofunction:: cuda.bindings.driver.cuGraphicsMapResources
 .. autofunction:: cuda.bindings.driver.cuGraphicsUnmapResources
 
+.. _cuda-bindings-driver-cuda_driver_entry_point:
+.. _cuda-bindings-driver-group__cuda__driver__entry__point:
+
 Driver Entry Point Access
 -------------------------
 
@@ -7492,6 +7576,9 @@ This section describes the driver entry point access functions of the low-level
 
 .. autofunction:: cuda.bindings.driver.cuGetProcAddress
 
+.. _cuda-bindings-driver-cuda_coredump:
+.. _cuda-bindings-driver-group__cuda__coredump:
+
 Coredump Attributes Control API
 -------------------------------
 
@@ -7570,6 +7657,9 @@ This section describes the coredump attribute control functions of the low-level
 .. autofunction:: cuda.bindings.driver.cuCoredumpDeregisterStartCallback
 .. autofunction:: cuda.bindings.driver.cuCoredumpDeregisterCompleteCallback
 
+.. _cuda-bindings-driver-cuda_green_contexts:
+.. _cuda-bindings-driver-group__cuda__green__contexts:
+
 Green Contexts
 --------------
 
@@ -7579,7 +7669,7 @@ This section describes the APIs for creation and manipulation of green contexts
 
 Here are the broad initial steps to follow to get started:
 
-- (1) Start with an initial set of resources. For SM resources, they can be fetched via cuDeviceGetDevResource. In case of workqueues, a new configuration can be used or an existing one queried via the cuDeviceGetDevResource API.
+- (1) Start with an initial set of resources. For SM resources, they can be fetched via :func:`~.cuDeviceGetDevResource`. In case of workqueues, a new configuration can be used or an existing one queried via the :func:`~.cuDeviceGetDevResource` API.
 
 
 
@@ -7587,7 +7677,7 @@ Here are the broad initial steps to follow to get started:
 
 
 
-- (2) Modify these resources by either partitioning them (in case of SMs) or changing the configuration (in case of workqueues). To partition SMs, we recommend cuDevSmResourceSplit. Changing the workqueue configuration can be done directly in place.
+- (2) Modify these resources by either partitioning them (in case of SMs) or changing the configuration (in case of workqueues). To partition SMs, we recommend :func:`~.cuDevSmResourceSplit`. Changing the workqueue configuration can be done directly in place.
 
 
 
@@ -7595,7 +7685,7 @@ Here are the broad initial steps to follow to get started:
 
 
 
-- (3) Finalize the specification of resources by creating a descriptor via cuDevResourceGenerateDesc.
+- (3) Finalize the specification of resources by creating a descriptor via :func:`~.cuDevResourceGenerateDesc`.
 
 
 
@@ -7603,7 +7693,7 @@ Here are the broad initial steps to follow to get started:
 
 
 
-- (4) Create a green context via cuGreenCtxCreate. This provisions the resource, such as workqueues (until this step it was only a configuration specification).
+- (4) Create a green context via :func:`~.cuGreenCtxCreate`. This provisions the resource, such as workqueues (until this step it was only a configuration specification).
 
 
 
@@ -7611,7 +7701,7 @@ Here are the broad initial steps to follow to get started:
 
 
 
-- (5) Create a stream via cuGreenCtxStreamCreate, and use it throughout your application.
+- (5) Create a stream via :func:`~.cuGreenCtxStreamCreate`, and use it throughout your application.
 
 
 
@@ -7631,7 +7721,7 @@ SMs
 
 
 
-There are two possible partition operations - with cuDevSmResourceSplitByCount the partitions created have to follow default SM count granularity requirements, so it will often be rounded up and aligned to a default value. On the other hand, cuDevSmResourceSplit is explicit and allows for creation of non-equal groups. It will not round up automatically - instead it is the developer’s responsibility to query and set the correct values. These requirements can be queried with cuDeviceGetDevResource to determine the alignment granularity (sm.smCoscheduledAlignment). A general guideline on the default values for each compute architecture:
+There are two possible partition operations - with :func:`~.cuDevSmResourceSplitByCount` the partitions created have to follow default SM count granularity requirements, so it will often be rounded up and aligned to a default value. On the other hand, :func:`~.cuDevSmResourceSplit` is explicit and allows for creation of non-equal groups. It will not round up automatically - instead it is the developer’s responsibility to query and set the correct values. These requirements can be queried with :func:`~.cuDeviceGetDevResource` to determine the alignment granularity (sm.smCoscheduledAlignment). A general guideline on the default values for each compute architecture:
 
 - On Compute Architecture 7.X, 8.X, and all Tegra SoC:
 
@@ -7693,9 +7783,9 @@ Workqueues
 
 
 
-For ``CU_DEV_RESOURCE_TYPE_WORKQUEUE_CONFIG``\ , the resource specifies the expected maximum number of concurrent stream-ordered workloads via the ``wqConcurrencyLimit``\  field. The ``sharingScope``\  field determines how workqueue resources are shared:
+For :py:obj:`~.CU_DEV_RESOURCE_TYPE_WORKQUEUE_CONFIG`\ , the resource specifies the expected maximum number of concurrent stream-ordered workloads via the ``wqConcurrencyLimit``\  field. The ``sharingScope``\  field determines how workqueue resources are shared:
 
-- ``CU_WORKQUEUE_SCOPE_DEVICE_CTX:``\  Use all shared workqueue resources across all contexts (default driver behavior).
+- :py:obj:`~.CU_WORKQUEUE_SCOPE_DEVICE_CTX`:\  Use all shared workqueue resources across all contexts (default driver behavior).
 
 
 
@@ -7703,7 +7793,7 @@ For ``CU_DEV_RESOURCE_TYPE_WORKQUEUE_CONFIG``\ , the resource specifies the expe
 
 
 
-- ``CU_WORKQUEUE_SCOPE_GREEN_CTX_BALANCED:``\  When possible, use non-overlapping workqueue resources with other balanced green contexts.
+- :py:obj:`~.CU_WORKQUEUE_SCOPE_GREEN_CTX_BALANCED`:\  When possible, use non-overlapping workqueue resources with other balanced green contexts.
 
 
 
@@ -7715,11 +7805,11 @@ For ``CU_DEV_RESOURCE_TYPE_WORKQUEUE_CONFIG``\ , the resource specifies the expe
 
 
 
-The maximum concurrency limit depends on ``CUDA_DEVICE_MAX_CONNECTIONS`` and can be queried from the primary context via cuCtxGetDevResource. Configurations may exceed this concurrency limit, but the driver will not guarantee that work submission remains non-overlapping.
+The maximum concurrency limit depends on ``CUDA_DEVICE_MAX_CONNECTIONS`` and can be queried from the primary context via :func:`~.cuCtxGetDevResource`. Configurations may exceed this concurrency limit, but the driver will not guarantee that work submission remains non-overlapping.
 
 
 
-For ``CU_DEV_RESOURCE_TYPE_WORKQUEUE``\ , the resource represents a pre-existing workqueue that can be retrieved from existing contexts or green contexts. This allows reusing workqueue resources across different green contexts.
+For :py:obj:`~.CU_DEV_RESOURCE_TYPE_WORKQUEUE`\ , the resource represents a pre-existing workqueue that can be retrieved from existing contexts or green contexts. This allows reusing workqueue resources across different green contexts.
 
 
 
@@ -7731,7 +7821,7 @@ On Concurrency
 
 
 
-Even if the green contexts have disjoint SM partitions, it is not guaranteed that the kernels launched in them will run concurrently or have forward progress guarantees. This is due to other resources that could cause a dependency. Using a combination of disjoint SMs and CU_WORKQUEUE_SCOPE_GREEN_CTX_BALANCED workqueue configurations can provide the best chance of avoiding interference. More resources will be added in the future to provide stronger guarantees.
+Even if the green contexts have disjoint SM partitions, it is not guaranteed that the kernels launched in them will run concurrently or have forward progress guarantees. This is due to other resources that could cause a dependency. Using a combination of disjoint SMs and :py:obj:`~.CU_WORKQUEUE_SCOPE_GREEN_CTX_BALANCED` workqueue configurations can provide the best chance of avoiding interference. More resources will be added in the future to provide stronger guarantees.
 
 
 
@@ -7842,6 +7932,9 @@ Additionally, there are two known scenarios, where its possible for the workload
 .. autoattribute:: cuda.bindings.driver._CONCAT_INNER
 .. autoattribute:: cuda.bindings.driver._CONCAT_OUTER
 
+.. _cuda-bindings-driver-cuda_logs:
+.. _cuda-bindings-driver-group__cuda__logs:
+
 Error Log Management Functions
 ------------------------------
 
@@ -7863,6 +7956,9 @@ This section describes the error log management functions of the low-level CUDA
 .. autofunction:: cuda.bindings.driver.cuLogsDumpToFile
 .. autofunction:: cuda.bindings.driver.cuLogsDumpToMemory
 
+.. _cuda-bindings-driver-cuda_checkpoint:
+.. _cuda-bindings-driver-group__cuda__checkpoint:
+
 CUDA Checkpointing
 ------------------
 
@@ -7891,6 +7987,9 @@ Checkpoint and restore capabilities are currently restricted to Linux.
 .. autofunction:: cuda.bindings.driver.cuCheckpointProcessRestore
 .. autofunction:: cuda.bindings.driver.cuCheckpointProcessUnlock
 
+.. _cuda-bindings-driver-cuda_profiler:
+.. _cuda-bindings-driver-group__cuda__profiler:
+
 Profiler Control
 ----------------
 
@@ -7899,6 +7998,9 @@ This section describes the profiler control functions of the low-level CUDA driv
 .. autofunction:: cuda.bindings.driver.cuProfilerStart
 .. autofunction:: cuda.bindings.driver.cuProfilerStop
 
+.. _cuda-bindings-driver-cuda_egl:
+.. _cuda-bindings-driver-group__cuda__egl:
+
 EGL Interoperability
 --------------------
 
@@ -7917,6 +8019,9 @@ This section describes the EGL interoperability functions of the low-level CUDA
 .. autofunction:: cuda.bindings.driver.cuGraphicsResourceGetMappedEglFrame
 .. autofunction:: cuda.bindings.driver.cuEventCreateFromEGLSync
 
+.. _cuda-bindings-driver-cuda_gl:
+.. _cuda-bindings-driver-group__cuda__gl:
+
 OpenGL Interoperability
 -----------------------
 
@@ -7945,6 +8050,9 @@ This section describes the OpenGL interoperability functions of the low-level CU
 .. autofunction:: cuda.bindings.driver.cuGraphicsGLRegisterImage
 .. autofunction:: cuda.bindings.driver.cuGLGetDevices
 
+.. _cuda-bindings-driver-cuda_vdpau:
+.. _cuda-bindings-driver-group__cuda__vdpau:
+
 VDPAU Interoperability
 ----------------------
 
diff --git a/cuda_bindings/docs/source/module/nvrtc.rst b/cuda_bindings/docs/source/module/nvrtc.rst
index 74f041d3b1..4c9977817f 100644
--- a/cuda_bindings/docs/source/module/nvrtc.rst
+++ b/cuda_bindings/docs/source/module/nvrtc.rst
@@ -5,6 +5,9 @@
 nvrtc
 -----
 
+.. _cuda-bindings-nvrtc-error:
+.. _cuda-bindings-nvrtc-group__error:
+
 Error Handling
 --------------
 
@@ -70,6 +73,9 @@ NVRTC defines the following enumeration type and function for API call error han
 
 .. autofunction:: cuda.bindings.nvrtc.nvrtcGetErrorString
 
+.. _cuda-bindings-nvrtc-query:
+.. _cuda-bindings-nvrtc-group__query:
+
 General Information Query
 -------------------------
 
@@ -79,6 +85,9 @@ NVRTC defines the following function for general information query.
 .. autofunction:: cuda.bindings.nvrtc.nvrtcGetNumSupportedArchs
 .. autofunction:: cuda.bindings.nvrtc.nvrtcGetSupportedArchs
 
+.. _cuda-bindings-nvrtc-compilation:
+.. _cuda-bindings-nvrtc-group__compilation:
+
 Compilation
 -----------
 
@@ -104,16 +113,22 @@ NVRTC defines the following type and functions for actual compilation.
 .. autofunction:: cuda.bindings.nvrtc.nvrtcGetTileIRSize
 .. autofunction:: cuda.bindings.nvrtc.nvrtcGetTileIR
 
+.. _cuda-bindings-nvrtc-precompiled_header:
+.. _cuda-bindings-nvrtc-group__precompiled__header:
+
 Precompiled header (PCH) (CUDA 12.8+)
 -------------------------------------
 
-NVRTC defines the following function related to PCH. Also see PCH related flags passed to nvrtcCompileProgram.
+NVRTC defines the following function related to PCH. Also see PCH related flags passed to :func:`~.nvrtcCompileProgram`.
 
 .. autofunction:: cuda.bindings.nvrtc.nvrtcGetPCHHeapSize
 .. autofunction:: cuda.bindings.nvrtc.nvrtcSetPCHHeapSize
 .. autofunction:: cuda.bindings.nvrtc.nvrtcGetPCHCreateStatus
 .. autofunction:: cuda.bindings.nvrtc.nvrtcGetPCHHeapSizeRequired
 
+.. _cuda-bindings-nvrtc-bundled_headers:
+.. _cuda-bindings-nvrtc-group__bundled__headers:
+
 Bundled Headers Installation
 ----------------------------
 
@@ -125,7 +140,7 @@ NVRTC defines the following types and functions for bundled headers installation
 .. autofunction:: cuda.bindings.nvrtc.nvrtcRemoveBundledHeaders
 .. autoattribute:: cuda.bindings.nvrtc.NVRTC_INSTALL_HEADERS_SKIP_IF_EXISTS
 
-    Flags for nvrtcInstallBundledHeaders.Skip installation if version marker exists and version matches. This is the default behavior when flags=0.
+    Flags for :func:`~.nvrtcInstallBundledHeaders`. Skip installation if version marker exists and version matches. This is the default behavior when flags=0.
 
 .. autoattribute:: cuda.bindings.nvrtc.NVRTC_INSTALL_HEADERS_FORCE_OVERWRITE
 
@@ -133,8 +148,11 @@ NVRTC defines the following types and functions for bundled headers installation
 
 .. autoattribute:: cuda.bindings.nvrtc.NVRTC_INSTALL_HEADERS_NO_WAIT
 
-    Return NVRTC_ERROR_BUSY immediately if installation is in progress by another process, instead of waiting for the lock. Can be combined with FORCE_OVERWRITE using bitwise OR.
+    Return :py:obj:`~.NVRTC_ERROR_BUSY` immediately if installation is in progress by another process, instead of waiting for the lock. Can be combined with FORCE_OVERWRITE using bitwise OR.
+
 
+.. _cuda-bindings-nvrtc-options:
+.. _cuda-bindings-nvrtc-group__options:
 
 Supported Compile Options
 -------------------------
@@ -447,7 +465,7 @@ On Linux, during compilation, use ``setrlimit()``\  to increase stack size to ma
 
   - ``--dlink-time-opt``\  (``-dlto``\ )
 
-Generate intermediate code for later link-time optimization. It implies ``-rdc=true``\ . Note: when this option is used the ``nvrtcGetLTOIR``\  API should be used, as PTX or Cubin will not be generated.
+Generate intermediate code for later link-time optimization. It implies ``-rdc=true``\ . Note: when this option is used the :func:`~.nvrtcGetLTOIR`\  API should be used, as PTX or Cubin will not be generated.
 
 
 
@@ -547,7 +565,7 @@ Cancel any previous definition of ``<def>``\ .
 
   - ``--include-path=<dir>``\  (``-I``\ )
 
-Add the directory ``<dir>``\  to the list of directories to be searched for headers. These paths are searched after the list of headers given to nvrtcCreateProgram.
+Add the directory ``<dir>``\  to the list of directories to be searched for headers. These paths are searched after the list of headers given to :func:`~.nvrtcCreateProgram`.
 
 
 
@@ -557,7 +575,7 @@ Add the directory ``<dir>``\  to the list of directories to be searched for head
 
   - ``--use-bundled-headers=<dir>``\
 
-Install bundled CUDA headers to ``<dir>``\  and add include paths. This is a convenience flag that combines calling nvrtcInstallBundledHeaders and adding ``-I<dir>``\  and ``-I<dir>/cccl``\  to the include search path. Headers are installed only if they don't already exist at the specified location.
+Install bundled CUDA headers to ``<dir>``\  and add include paths. This is a convenience flag that combines calling :func:`~.nvrtcInstallBundledHeaders` and adding ``-I<dir>``\  and ``-I<dir>/cccl``\  to the include search path. Headers are installed only if they don't already exist at the specified location.
 
 
 
diff --git a/cuda_bindings/docs/source/module/runtime.rst b/cuda_bindings/docs/source/module/runtime.rst
index 617c318914..bc66b21701 100644
--- a/cuda_bindings/docs/source/module/runtime.rst
+++ b/cuda_bindings/docs/source/module/runtime.rst
@@ -5,6 +5,9 @@
 runtime
 -------
 
+.. _cuda-bindings-runtime-cudart_types:
+.. _cuda-bindings-runtime-group__cudart__types:
+
 Data types used by CUDA Runtime
 -------------------------------
 
@@ -179,7 +182,7 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidTextureBinding
 
 
-        This indicates that the texture binding is not valid. This occurs if you call :py:obj:`~.cudaGetTextureAlignmentOffset()` with an unbound texture.
+        This indicates that the texture binding is not valid. This occurs if you call ``cudaGetTextureAlignmentOffset()`` with an unbound texture.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorInvalidChannelDescriptor
@@ -299,13 +302,13 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorIncompatibleDriverContext
 
 
-        This indicates that the current context is not compatible with this the CUDA Runtime. This can only occur if you are using CUDA Runtime/Driver interoperability and have created an existing Driver context using the driver API. The Driver context may be incompatible either because the Driver context was created using an older version of the API, because the Runtime API call expects a primary driver context and the Driver context is not primary, or because the Driver context has been destroyed. Please see Interactions with the CUDA Driver API for more information.
+        This indicates that the current context is not compatible with the CUDA Runtime. This can only occur if you are using CUDA Runtime/Driver interoperability and have created an existing Driver context using the driver API. The Driver context may be incompatible either because the Driver context was created using an older version of the API, because the Runtime API call expects a primary driver context and the Driver context is not primary, or because the Driver context has been destroyed. Please see :ref:`Interactions with the CUDA Driver API <cuda-bindings-runtime-group__cudart__driver>` for more information.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorMissingConfiguration
 
 
-        The device function being invoked (usually via :py:obj:`~.cudaLaunchKernel()`) was not previously configured via the :py:obj:`~.cudaConfigureCall()` function.
+        The device function being invoked (usually via :py:obj:`~.cudaLaunchKernel()`) was not previously configured via the ``cudaConfigureCall()`` function.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorPriorLaunchFailure
@@ -389,7 +392,7 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorDeviceUninitialized
 
 
-        This most frequently indicates that there is no context bound to the current thread. This can also be returned if the context passed to an API call is not a valid handle (such as a context that has had :py:obj:`~.cuCtxDestroy()` invoked on it). This can also be returned if a user mixes different API versions (i.e. 3010 context with 3020 API calls). See :py:obj:`~.cuCtxGetApiVersion()` for more details.
+        This most frequently indicates that there is no context bound to the current thread. This can also be returned if the context passed to an API call is not a valid handle (such as a context that has had :func:`~.cuCtxDestroy` invoked on it). This can also be returned if a user mixes different API versions (i.e. 3010 context with 3020 API calls). See :func:`~.cuCtxGetApiVersion` for more details.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorMapBufferObjectFailed
@@ -515,7 +518,7 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorUnsupportedDevSideSync
 
 
-        This indicates that the code to be compiled by the PTX JIT contains unsupported call to cudaDeviceSynchronize.
+        This indicates that the code to be compiled by the PTX JIT contains unsupported call to :func:`~.cudaDeviceSynchronize`.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorContained
@@ -623,13 +626,13 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorSetOnActiveProcess
 
 
-        This indicates that the user has called :py:obj:`~.cudaSetValidDevices()`, :py:obj:`~.cudaSetDeviceFlags()`, :py:obj:`~.cudaD3D9SetDirect3DDevice()`, :py:obj:`~.cudaD3D10SetDirect3DDevice`, :py:obj:`~.cudaD3D11SetDirect3DDevice()`, or :py:obj:`~.cudaVDPAUSetVDPAUDevice()` after initializing the CUDA runtime by calling non-device management operations (allocating memory and launching kernels are examples of non-device management operations). This error can also be returned if using runtime/driver interoperability and there is an existing :py:obj:`~.CUcontext` active on the host thread.
+        This indicates that the user has called :py:obj:`~.cudaSetValidDevices()`, :py:obj:`~.cudaSetDeviceFlags()`, ``cudaD3D9SetDirect3DDevice()``, ``cudaD3D10SetDirect3DDevice``, ``cudaD3D11SetDirect3DDevice()``, or :py:obj:`~.cudaVDPAUSetVDPAUDevice()` after initializing the CUDA runtime by calling non-device management operations (allocating memory and launching kernels are examples of non-device management operations). This error can also be returned if using runtime/driver interoperability and there is an existing :py:obj:`~.CUcontext` active on the host thread.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorContextIsDestroyed
 
 
-        This error indicates that the context current to the calling thread has been destroyed using :py:obj:`~.cuCtxDestroy`, or is a primary context which has not yet been initialized.
+        This error indicates that the context current to the calling thread has been destroyed using :func:`~.cuCtxDestroy`, or is a primary context which has not yet been initialized.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorAssert
@@ -641,7 +644,7 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorTooManyPeers
 
 
-        This error indicates that the hardware resources required to enable peer access have been exhausted for one or more of the devices passed to :py:obj:`~.cudaEnablePeerAccess()`.
+        This error indicates that the hardware resources required to enable peer access have been exhausted for one or more of the devices passed to ``cudaEnablePeerAccess()``.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorHostMemoryAlreadyRegistered
@@ -821,7 +824,7 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorStreamCaptureImplicit
 
 
-        The operation would have resulted in a disallowed implicit dependency on a current capture sequence from cudaStreamLegacy.
+        The operation would have resulted in a disallowed implicit dependency on a current capture sequence from :py:obj:`~.cudaStreamLegacy`.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaError_t.cudaErrorCapturedEvent
@@ -1984,7 +1987,7 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaGPUDirectRDMAWritesOrdering.cudaGPUDirectRDMAWritesOrderingNone
 
 
-        The device does not natively support ordering of GPUDirect RDMA writes. :py:obj:`~.cudaFlushGPUDirectRDMAWrites()` can be leveraged if supported.
+        The device does not natively support ordering of GPUDirect RDMA writes. ``cudaFlushGPUDirectRDMAWrites()`` can be leveraged if supported.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaGPUDirectRDMAWritesOrdering.cudaGPUDirectRDMAWritesOrderingOwner
@@ -2281,13 +2284,13 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTexture2DGatherWidth
 
 
-        Maximum 2D texture width if cudaArrayTextureGather is set
+        Maximum 2D texture width if :py:obj:`~.cudaArrayTextureGather` is set
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTexture2DGatherHeight
 
 
-        Maximum 2D texture height if cudaArrayTextureGather is set
+        Maximum 2D texture height if :py:obj:`~.cudaArrayTextureGather` is set
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMaxTexture3DWidthAlt
@@ -2539,7 +2542,7 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrPageableMemoryAccess
 
 
-        Device supports coherently accessing pageable memory without calling cudaHostRegister on it
+        Device supports coherently accessing pageable memory without calling :func:`~.cudaHostRegister` on it
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrConcurrentManagedAccess
@@ -2641,7 +2644,7 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrHostRegisterReadOnlySupported
 
 
-        Device supports using the :py:obj:`~.cudaHostRegister` flag cudaHostRegisterReadOnly to register memory that must be mapped as read-only to the GPU
+        Device supports using the :py:obj:`~.cudaHostRegister` flag :py:obj:`~.cudaHostRegisterReadOnly` to register memory that must be mapped as read-only to the GPU
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrTimelineSemaphoreInteropSupported
@@ -2653,7 +2656,7 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrMemoryPoolsSupported
 
 
-        Device supports using the :py:obj:`~.cudaMallocAsync` and :py:obj:`~.cudaMemPool` family of APIs
+        Device supports using the :py:obj:`~.cudaMallocAsync` and ``cudaMemPool`` family of APIs
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrGPUDirectRDMASupported
@@ -2779,7 +2782,7 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrHostNumaMemoryPoolsSupported
 
 
-        Device supports HOST_NUMA location with the :py:obj:`~.cudaMallocAsync` and :py:obj:`~.cudaMemPool` family of APIs
+        Device supports HOST_NUMA location with the :py:obj:`~.cudaMallocAsync` and ``cudaMemPool`` family of APIs
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrHostNumaMultinodeIpcSupported
@@ -2791,7 +2794,7 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrHostMemoryPoolsSupported
 
 
-        Device suports HOST location with the :py:obj:`~.cuMemAllocAsync` and :py:obj:`~.cuMemPool` family of APIs
+        Device supports HOST location with the :func:`~.cuMemAllocAsync` and ``cuMemPool`` family of APIs
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaDeviceAttr.cudaDevAttrReserved145
@@ -2822,7 +2825,7 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaMemPoolAttr.cudaMemPoolReuseFollowEventDependencies
 
 
-        (value type = int) Allow cuMemAllocAsync to use memory asynchronously freed in another streams as long as a stream ordering dependency of the allocating stream on the free action exists. Cuda events and null stream interactions can create the required stream ordered dependencies. (default enabled)
+        (value type = int) Allow :func:`~.cuMemAllocAsync` to use memory asynchronously freed in another stream as long as a stream ordering dependency of the allocating stream on the free action exists. CUDA events and null stream interactions can create the required stream ordered dependencies. (default enabled)
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaMemPoolAttr.cudaMemPoolReuseAllowOpportunistic
@@ -2834,67 +2837,67 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaMemPoolAttr.cudaMemPoolReuseAllowInternalDependencies
 
 
-        (value type = int) Allow cuMemAllocAsync to insert new stream dependencies in order to establish the stream ordering required to reuse a piece of memory released by cuFreeAsync (default enabled).
+        (value type = int) Allow :func:`~.cuMemAllocAsync` to insert new stream dependencies in order to establish the stream ordering required to reuse a piece of memory released by cuMemFreeAsync (default enabled).
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaMemPoolAttr.cudaMemPoolAttrReleaseThreshold
 
 
-        (value type = cuuint64_t) Amount of reserved memory in bytes to hold onto before trying to release memory back to the OS. When more than the release threshold bytes of memory are held by the memory pool, the allocator will try to release memory back to the OS on the next call to stream, event or context synchronize. (default 0)
+        (value type = :py:obj:`~.cuuint64_t`) Amount of reserved memory in bytes to hold onto before trying to release memory back to the OS. When more than the release threshold bytes of memory are held by the memory pool, the allocator will try to release memory back to the OS on the next call to stream, event or context synchronize. (default 0)
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaMemPoolAttr.cudaMemPoolAttrReservedMemCurrent
 
 
-        (value type = cuuint64_t) Amount of backing memory currently allocated for the mempool.
+        (value type = :py:obj:`~.cuuint64_t`) Amount of backing memory currently allocated for the mempool.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaMemPoolAttr.cudaMemPoolAttrReservedMemHigh
 
 
-        (value type = cuuint64_t) High watermark of backing memory allocated for the mempool since the last time it was reset. High watermark can only be reset to zero.
+        (value type = :py:obj:`~.cuuint64_t`) High watermark of backing memory allocated for the mempool since the last time it was reset. High watermark can only be reset to zero.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaMemPoolAttr.cudaMemPoolAttrUsedMemCurrent
 
 
-        (value type = cuuint64_t) Amount of memory from the pool that is currently in use by the application.
+        (value type = :py:obj:`~.cuuint64_t`) Amount of memory from the pool that is currently in use by the application.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaMemPoolAttr.cudaMemPoolAttrUsedMemHigh
 
 
-        (value type = cuuint64_t) High watermark of the amount of memory from the pool that was in use by the application since the last time it was reset. High watermark can only be reset to zero.
+        (value type = :py:obj:`~.cuuint64_t`) High watermark of the amount of memory from the pool that was in use by the application since the last time it was reset. High watermark can only be reset to zero.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaMemPoolAttr.cudaMemPoolAttrAllocationType
 
 
-        (value type = cudaMemAllocationType) The allocation type of the mempool
+        (value type = :py:obj:`~.cudaMemAllocationType`) The allocation type of the mempool
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaMemPoolAttr.cudaMemPoolAttrExportHandleTypes
 
 
-        (value type = cudaMemAllocationHandleType) Available export handle types for the mempool. For imported pools this value is always cudaMemHandleTypeNone as an imported pool cannot be re-exported
+        (value type = :py:obj:`~.cudaMemAllocationHandleType`) Available export handle types for the mempool. For imported pools this value is always :py:obj:`~.cudaMemHandleTypeNone` as an imported pool cannot be re-exported
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaMemPoolAttr.cudaMemPoolAttrLocationId
 
 
-        (value type = int) The location id for the mempool. If the location type for this pool is cudaMemLocationTypeInvisible then ID will be cudaInvalidDeviceId
+        (value type = int) The location id for the mempool. If the location type for this pool is :py:obj:`~.cudaMemLocationTypeInvisible` then ID will be :py:obj:`~.cudaInvalidDeviceId`
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaMemPoolAttr.cudaMemPoolAttrLocationType
 
 
-        (value type = cudaMemLocationType) The location type for the mempool. For imported memory pools where the device is not directly visible to the importing process or pools imported via fabric handles across nodes this will be cudaMemLocationTypeInvisible
+        (value type = :py:obj:`~.cudaMemLocationType`) The location type for the mempool. For imported memory pools where the device is not directly visible to the importing process or pools imported via fabric handles across nodes this will be :py:obj:`~.cudaMemLocationTypeInvisible`
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaMemPoolAttr.cudaMemPoolAttrMaxPoolSize
 
 
-        (value type = cuuint64_t) Maximum size of the pool in bytes, this value may be higher than what was initially passed to cudaMemPoolCreate due to alignment requirements. A value of 0 indicates no maximum size. For cudaMemAllocationTypeManaged and IPC imported pools this value will be system dependent.
+        (value type = :py:obj:`~.cuuint64_t`) Maximum size of the pool in bytes, this value may be higher than what was initially passed to :func:`~.cudaMemPoolCreate` due to alignment requirements. A value of 0 indicates no maximum size. For :py:obj:`~.cudaMemAllocationTypeManaged` and IPC imported pools this value will be system dependent.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaMemPoolAttr.cudaMemPoolAttrHwDecompressEnabled
@@ -2940,7 +2943,7 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaMemLocationType.cudaMemLocationTypeInvisible
 
 
-        Location is not visible but device is accessible, id is always cudaInvalidDeviceId
+        Location is not visible but device is accessible, id is always :py:obj:`~.cudaInvalidDeviceId`
 
 .. autoclass:: cuda.bindings.runtime.cudaMemAccessFlags
 
@@ -3009,32 +3012,32 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaMemAllocationHandleType.cudaMemHandleTypeFabric
 
 
-        Allows a fabric handle to be used for exporting. (cudaMemFabricHandle_t)
+        Allows a fabric handle to be used for exporting. (:py:obj:`~.cudaMemFabricHandle_t`)
 
 .. autoclass:: cuda.bindings.runtime.cudaGraphMemAttributeType
 
     .. autoattribute:: cuda.bindings.runtime.cudaGraphMemAttributeType.cudaGraphMemAttrUsedMemCurrent
 
 
-        (value type = cuuint64_t) Amount of memory, in bytes, currently associated with graphs.
+        (value type = :py:obj:`~.cuuint64_t`) Amount of memory, in bytes, currently associated with graphs.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaGraphMemAttributeType.cudaGraphMemAttrUsedMemHigh
 
 
-        (value type = cuuint64_t) High watermark of memory, in bytes, associated with graphs since the last time it was reset. High watermark can only be reset to zero.
+        (value type = :py:obj:`~.cuuint64_t`) High watermark of memory, in bytes, associated with graphs since the last time it was reset. High watermark can only be reset to zero.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaGraphMemAttributeType.cudaGraphMemAttrReservedMemCurrent
 
 
-        (value type = cuuint64_t) Amount of memory, in bytes, currently allocated for use by the CUDA graphs asynchronous allocator.
+        (value type = :py:obj:`~.cuuint64_t`) Amount of memory, in bytes, currently allocated for use by the CUDA graphs asynchronous allocator.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaGraphMemAttributeType.cudaGraphMemAttrReservedMemHigh
 
 
-        (value type = cuuint64_t) High watermark of memory, in bytes, currently allocated for use by the CUDA graphs asynchronous allocator.
+        (value type = :py:obj:`~.cuuint64_t`) High watermark of memory, in bytes, currently allocated for use by the CUDA graphs asynchronous allocator.
 
 .. autoclass:: cuda.bindings.runtime.cudaMemcpyFlags
 
@@ -3085,7 +3088,7 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaMemcpy3DOperandType.cudaMemcpyOperandTypeArray
 
 
-        Memcpy operand is a CUarray.
+        Memcpy operand is a :py:obj:`~.CUarray`.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaMemcpy3DOperandType.cudaMemcpyOperandTypeMax
@@ -3594,13 +3597,13 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaKernelFunctionType.cudaKernelFunctionTypeKernel
 
 
-        Function handle is a cudaKernel_t
+        Function handle is a :py:obj:`~.cudaKernel_t`
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaKernelFunctionType.cudaKernelFunctionTypeFunction
 
 
-        Function handle is a cudaFunction_t
+        Function handle is a :py:obj:`~.cudaFunction_t`
 
 .. autoclass:: cuda.bindings.runtime.cudaGraphConditionalHandleFlags
 
@@ -3717,7 +3720,7 @@ Data types used by CUDA Runtime
 
                                            Handles must be created in advance of creating the node
 
-                                           using :py:obj:`~.cudaGraphConditionalHandleCreate`.
+                                           using :func:`~.cudaGraphConditionalHandleCreate`.
 
 
 
@@ -3733,7 +3736,7 @@ Data types used by CUDA Runtime
 
                                            To set the control value, supply a default value when creating the handle and/or
 
-                                           call :py:obj:`~.cudaGraphSetConditional` from device code.
+                                           call ``cudaGraphSetConditional`` from device code.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaGraphNodeType.cudaGraphNodeTypeReserved16
@@ -3759,7 +3762,7 @@ Data types used by CUDA Runtime
 
 
 
-        The following restrictions apply to child graphs after they have been moved: Cannot be independently instantiated or destroyed; Cannot be added as a child graph of a separate parent graph; Cannot be used as an argument to cudaGraphExecUpdate; Cannot have additional memory allocation or free nodes added.
+        The following restrictions apply to child graphs after they have been moved: Cannot be independently instantiated or destroyed; Cannot be added as a child graph of a separate parent graph; Cannot be used as an argument to :func:`~.cudaGraphExecUpdate`; Cannot have additional memory allocation or free nodes added.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaGraphChildGraphNodeOwnership.cudaGraphChildGraphOwnershipInvalid
@@ -3970,7 +3973,7 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsEventNodeParams
 
 
-        Adds cudaEvent_t handle from record and wait nodes to output
+        Adds :py:obj:`~.cudaEvent_t` handle from record and wait nodes to output
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsExtSemasSignalNodeParams
@@ -3988,7 +3991,7 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsKernelNodeAttributes
 
 
-        Adds cudaKernelNodeAttrID values to output
+        Adds :py:obj:`~.cudaKernelNodeAttrID` values to output
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaGraphDebugDotFlags.cudaGraphDebugDotFlagsHandles
@@ -4027,7 +4030,7 @@ Data types used by CUDA Runtime
 
          be used on platforms which support unified addressing. This flag cannot be
 
-         used in conjunction with cudaGraphInstantiateFlagAutoFreeOnLaunch.
+         used in conjunction with :py:obj:`~.cudaGraphInstantiateFlagAutoFreeOnLaunch`.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaGraphInstantiateFlags.cudaGraphInstantiateFlagUseNodePriority
@@ -4164,11 +4167,11 @@ Data types used by CUDA Runtime
 
         Valid for graph nodes, launches. This attribute is graphs-only, and passing it to a launch in a non-capturing stream will result in an error.
 
-         :cudaLaunchAttributeValue::deviceUpdatableKernelNode::deviceUpdatable can only be set to 0 or 1. Setting the field to 1 indicates that the corresponding kernel node should be device-updatable. On success, a handle will be returned via :py:obj:`~.cudaLaunchAttributeValue.deviceUpdatableKernelNode.devNode` which can be passed to the various device-side update functions to update the node's kernel parameters from within another kernel. For more information on the types of device updates that can be made, as well as the relevant limitations thereof, see :py:obj:`~.cudaGraphKernelNodeUpdatesApply`.
+         :py:obj:`~.cudaLaunchAttributeValue.deviceUpdatableKernelNode.deviceUpdatable` can only be set to 0 or 1. Setting the field to 1 indicates that the corresponding kernel node should be device-updatable. On success, a handle will be returned via :py:obj:`~.cudaLaunchAttributeValue.deviceUpdatableKernelNode.devNode` which can be passed to the various device-side update functions to update the node's kernel parameters from within another kernel. For more information on the types of device updates that can be made, as well as the relevant limitations thereof, see ``cudaGraphKernelNodeUpdatesApply``.
 
          Nodes which are device-updatable have additional restrictions compared to regular kernel nodes. Firstly, device-updatable nodes cannot be removed from their graph via :py:obj:`~.cudaGraphDestroyNode`. Additionally, once opted-in to this functionality, a node cannot opt out, and any attempt to set the deviceUpdatable attribute to 0 will result in an error. Device-updatable kernel nodes also cannot have their attributes copied to/from another kernel node via :py:obj:`~.cudaGraphKernelNodeCopyAttributes`. Graphs containing one or more device-updatable nodes also do not allow multiple instantiation, and neither the graph nor its instantiated version can be passed to :py:obj:`~.cudaGraphExecUpdate`.
 
-         If a graph contains device-updatable nodes and updates those nodes from the device from within the graph, the graph must be uploaded with :py:obj:`~.cuGraphUpload` before it is launched. For such a graph, if host-side executable graph updates are made to the device-updatable nodes, the graph must be uploaded before it is launched again.
+         If a graph contains device-updatable nodes and updates those nodes from the device from within the graph, the graph must be uploaded with :func:`~.cuGraphUpload` before it is launched. For such a graph, if host-side executable graph updates are made to the device-updatable nodes, the graph must be uploaded before it is launched again.
 
 
     .. autoattribute:: cuda.bindings.runtime.cudaLaunchAttributeID.cudaLaunchAttributePreferredSharedMemoryCarveout
@@ -4213,7 +4216,7 @@ Data types used by CUDA Runtime
     .. autoattribute:: cuda.bindings.runtime.cudaDeviceNumaConfig.cudaDeviceNumaConfigNumaNode
 
 
-        The GPU is a NUMA node, cudaDevAttrNumaId contains its NUMA ID
+        The GPU is a NUMA node, :py:obj:`~.cudaDevAttrNumaId` contains its NUMA ID
 
 .. autoclass:: cuda.bindings.runtime.cudaAsyncNotificationType
 
@@ -5136,7 +5139,7 @@ Data types used by CUDA Runtime
 
 
 
-    Stream handle that can be passed as a cudaStream_t to use an implicit stream with legacy synchronization behavior.
+    Stream handle that can be passed as a :py:obj:`~.cudaStream_t` to use an implicit stream with legacy synchronization behavior.
 
 
 
@@ -5148,7 +5151,7 @@ Data types used by CUDA Runtime
 
 
 
-    Stream handle that can be passed as a cudaStream_t to use an implicit stream with per-thread synchronization behavior.
+    Stream handle that can be passed as a :py:obj:`~.cudaStream_t` to use an implicit stream with per-thread synchronization behavior.
 
 
 
@@ -5168,7 +5171,7 @@ Data types used by CUDA Runtime
 
 .. autoattribute:: cuda.bindings.runtime.cudaEventInterprocess
 
-    Event is suitable for interprocess use. cudaEventDisableTiming must be set
+    Event is suitable for interprocess use. :py:obj:`~.cudaEventDisableTiming` must be set
 
 .. autoattribute:: cuda.bindings.runtime.cudaEventRecordDefault
 
@@ -5232,31 +5235,31 @@ Data types used by CUDA Runtime
 
 .. autoattribute:: cuda.bindings.runtime.cudaArrayLayered
 
-    Must be set in cudaMalloc3DArray to create a layered CUDA array
+    Must be set in :func:`~.cudaMalloc3DArray` to create a layered CUDA array
 
 .. autoattribute:: cuda.bindings.runtime.cudaArraySurfaceLoadStore
 
-    Must be set in cudaMallocArray or cudaMalloc3DArray in order to bind surfaces to the CUDA array
+    Must be set in :func:`~.cudaMallocArray` or :func:`~.cudaMalloc3DArray` in order to bind surfaces to the CUDA array
 
 .. autoattribute:: cuda.bindings.runtime.cudaArrayCubemap
 
-    Must be set in cudaMalloc3DArray to create a cubemap CUDA array
+    Must be set in :func:`~.cudaMalloc3DArray` to create a cubemap CUDA array
 
 .. autoattribute:: cuda.bindings.runtime.cudaArrayTextureGather
 
-    Must be set in cudaMallocArray or cudaMalloc3DArray in order to perform texture gather operations on the CUDA array
+    Must be set in :func:`~.cudaMallocArray` or :func:`~.cudaMalloc3DArray` in order to perform texture gather operations on the CUDA array
 
 .. autoattribute:: cuda.bindings.runtime.cudaArrayColorAttachment
 
-    Must be set in cudaExternalMemoryGetMappedMipmappedArray if the mipmapped array is used as a color target in a graphics API
+    Must be set in :func:`~.cudaExternalMemoryGetMappedMipmappedArray` if the mipmapped array is used as a color target in a graphics API
 
 .. autoattribute:: cuda.bindings.runtime.cudaArraySparse
 
-    Must be set in cudaMallocArray, cudaMalloc3DArray or cudaMallocMipmappedArray in order to create a sparse CUDA array or CUDA mipmapped array
+    Must be set in :func:`~.cudaMallocArray`, :func:`~.cudaMalloc3DArray` or :func:`~.cudaMallocMipmappedArray` in order to create a sparse CUDA array or CUDA mipmapped array
 
 .. autoattribute:: cuda.bindings.runtime.cudaArrayDeferredMapping
 
-    Must be set in cudaMallocArray, cudaMalloc3DArray or cudaMallocMipmappedArray in order to create a deferred mapping CUDA array or CUDA mipmapped array
+    Must be set in :func:`~.cudaMallocArray`, :func:`~.cudaMalloc3DArray` or :func:`~.cudaMallocMipmappedArray` in order to create a deferred mapping CUDA array or CUDA mipmapped array
 
 .. autoattribute:: cuda.bindings.runtime.cudaIpcMemLazyEnablePeerAccess
 
@@ -5292,7 +5295,7 @@ Data types used by CUDA Runtime
 
 .. autoattribute:: cuda.bindings.runtime.cudaInitDeviceFlagsAreValid
 
-    Tell the CUDA runtime that DeviceFlags is being set in cudaInitDevice call
+    Tell the CUDA runtime that DeviceFlags is being set in :func:`~.cudaInitDevice` call
 
 .. autoattribute:: cuda.bindings.runtime.cudaArraySparsePropertiesSingleMipTail
 
@@ -5382,6 +5385,9 @@ Data types used by CUDA Runtime
     Maximum number of planes per frame
 
 
+.. _cuda-bindings-runtime-cudart_device:
+.. _cuda-bindings-runtime-group__cudart__device:
+
 Device Management
 -----------------
 
@@ -5430,6 +5436,9 @@ This section describes the device management functions of the CUDA runtime appli
 .. autofunction:: cuda.bindings.runtime.cudaSetDeviceFlags
 .. autofunction:: cuda.bindings.runtime.cudaGetDeviceFlags
 
+.. _cuda-bindings-runtime-cudart_error:
+.. _cuda-bindings-runtime-group__cudart__error:
+
 Error Handling
 --------------
 
@@ -5440,6 +5449,9 @@ This section describes the error handling functions of the CUDA runtime applicat
 .. autofunction:: cuda.bindings.runtime.cudaGetErrorName
 .. autofunction:: cuda.bindings.runtime.cudaGetErrorString
 
+.. _cuda-bindings-runtime-cudart_stream:
+.. _cuda-bindings-runtime-group__cudart__stream:
+
 Stream Management
 -----------------
 
@@ -5474,6 +5486,9 @@ This section describes the stream management functions of the CUDA runtime appli
 .. autofunction:: cuda.bindings.runtime.cudaStreamGetCaptureInfo
 .. autofunction:: cuda.bindings.runtime.cudaStreamUpdateCaptureDependencies
 
+.. _cuda-bindings-runtime-cudart_event:
+.. _cuda-bindings-runtime-group__cudart__event:
+
 Event Management
 ----------------
 
@@ -5488,6 +5503,9 @@ This section describes the event management functions of the CUDA runtime applic
 .. autofunction:: cuda.bindings.runtime.cudaEventDestroy
 .. autofunction:: cuda.bindings.runtime.cudaEventElapsedTime
 
+.. _cuda-bindings-runtime-cudart_extres_interop:
+.. _cuda-bindings-runtime-group__cudart__extres__interop:
+
 External Resource Interoperability
 ----------------------------------
 
@@ -5502,6 +5520,9 @@ This section describes the external resource interoperability functions of the C
 .. autofunction:: cuda.bindings.runtime.cudaWaitExternalSemaphoresAsync
 .. autofunction:: cuda.bindings.runtime.cudaDestroyExternalSemaphore
 
+.. _cuda-bindings-runtime-cudart_execution:
+.. _cuda-bindings-runtime-group__cudart__execution:
+
 Execution Control
 -----------------
 
@@ -5518,6 +5539,9 @@ Some functions have overloaded C++ API template versions documented separately i
 .. autofunction:: cuda.bindings.runtime.cudaLaunchHostFunc
 .. autofunction:: cuda.bindings.runtime.cudaLaunchHostFunc_v2
 
+.. _cuda-bindings-runtime-cudart_occupancy:
+.. _cuda-bindings-runtime-group__cudart__occupancy:
+
 Occupancy
 ---------
 
@@ -5525,16 +5549,19 @@ This section describes the occupancy calculation functions of the CUDA runtime a
 
 
 
-Besides the occupancy calculator functions (cudaOccupancyMaxActiveBlocksPerMultiprocessor and cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags), there are also C++ only occupancy-based launch configuration functions documented in C++ API Routines module.
+Besides the occupancy calculator functions (:func:`~.cudaOccupancyMaxActiveBlocksPerMultiprocessor` and :func:`~.cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags`), there are also C++ only occupancy-based launch configuration functions documented in C++ API Routines module.
 
 
 
-See cudaOccupancyMaxPotentialBlockSize (C++ API), cudaOccupancyMaxPotentialBlockSize (C++ API), cudaOccupancyMaxPotentialBlockSizeVariableSMem (C++ API), cudaOccupancyMaxPotentialBlockSizeVariableSMem (C++ API) cudaOccupancyAvailableDynamicSMemPerBlock (C++ API),
+See cudaOccupancyMaxPotentialBlockSize (C++ API), cudaOccupancyMaxPotentialBlockSizeVariableSMem (C++ API), and :func:`~.cudaOccupancyAvailableDynamicSMemPerBlock` (C++ API).
 
 .. autofunction:: cuda.bindings.runtime.cudaOccupancyMaxActiveBlocksPerMultiprocessor
 .. autofunction:: cuda.bindings.runtime.cudaOccupancyAvailableDynamicSMemPerBlock
 .. autofunction:: cuda.bindings.runtime.cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
 
+.. _cuda-bindings-runtime-cudart_memory:
+.. _cuda-bindings-runtime-group__cudart__memory:
+
 Memory Management
 -----------------
 
@@ -5605,6 +5632,9 @@ Some functions have overloaded C++ API template versions documented separately i
 .. autofunction:: cuda.bindings.runtime.make_cudaPos
 .. autofunction:: cuda.bindings.runtime.make_cudaExtent
 
+.. _cuda-bindings-runtime-cudart_memory_pools:
+.. _cuda-bindings-runtime-group__cudart__memory__pools:
+
 Stream Ordered Memory Allocator
 -------------------------------
 
@@ -5624,7 +5654,7 @@ The allocator is free to reallocate the memory as long as it can guarantee that
 
 
 
-Whether or not a device supports the integrated stream ordered memory allocator may be queried by calling cudaDeviceGetAttribute() with the device attribute cudaDevAttrMemoryPoolsSupported.
+Whether or not a device supports the integrated stream ordered memory allocator may be queried by calling :func:`~.cudaDeviceGetAttribute` with the device attribute :py:obj:`~.cudaDevAttrMemoryPoolsSupported`.
 
 .. autofunction:: cuda.bindings.runtime.cudaMallocAsync
 .. autofunction:: cuda.bindings.runtime.cudaFreeAsync
@@ -5644,6 +5674,9 @@ Whether or not a device supports the integrated stream ordered memory allocator
 .. autofunction:: cuda.bindings.runtime.cudaMemPoolExportPointer
 .. autofunction:: cuda.bindings.runtime.cudaMemPoolImportPointer
 
+.. _cuda-bindings-runtime-cudart_unified:
+.. _cuda-bindings-runtime-group__cudart__unified:
+
 Unified Addressing
 ------------------
 
@@ -5669,7 +5702,7 @@ CUDA devices can share a unified address space with the host.
 
 
 
-Whether or not a device supports unified addressing may be queried by calling cudaGetDeviceProperties() with the device property cudaDeviceProp::unifiedAddressing.
+Whether or not a device supports unified addressing may be queried by calling :func:`~.cudaGetDeviceProperties` with the device property :py:obj:`~.cudaDeviceProp.unifiedAddressing`.
 
 Unified addressing is automatically enabled in 64-bit processes .
 
@@ -5681,11 +5714,11 @@ Unified addressing is automatically enabled in 64-bit processes .
 
 
 
-It is possible to look up information about the memory which backs a pointer value. For instance, one may want to know if a pointer points to host or device memory. As another example, in the case of device memory, one may want to know on which CUDA device the memory resides. These properties may be queried using the function cudaPointerGetAttributes()
+It is possible to look up information about the memory which backs a pointer value. For instance, one may want to know if a pointer points to host or device memory. As another example, in the case of device memory, one may want to know on which CUDA device the memory resides. These properties may be queried using the function :func:`~.cudaPointerGetAttributes`.
 
-Since pointers are unique, it is not necessary to specify information about the pointers specified to cudaMemcpy() and other copy functions.
+Since pointers are unique, it is not necessary to specify information about the pointers specified to :func:`~.cudaMemcpy` and other copy functions.
 
- The copy direction cudaMemcpyDefault may be used to specify that the CUDA runtime should infer the location of the pointer from its value.
+ The copy direction :py:obj:`~.cudaMemcpyDefault` may be used to specify that the CUDA runtime should infer the location of the pointer from its value.
 
 
 
@@ -5695,13 +5728,13 @@ Since pointers are unique, it is not necessary to specify information about the
 
 
 
-All host memory allocated through all devices using cudaMallocHost() and cudaHostAlloc() is always directly accessible from all devices that support unified addressing. This is the case regardless of whether or not the flags cudaHostAllocPortable and cudaHostAllocMapped are specified.
+All host memory allocated through all devices using :func:`~.cudaMallocHost` and :func:`~.cudaHostAlloc` is always directly accessible from all devices that support unified addressing. This is the case regardless of whether or not the flags :py:obj:`~.cudaHostAllocPortable` and :py:obj:`~.cudaHostAllocMapped` are specified.
 
-The pointer value through which allocated host memory may be accessed in kernels on all devices that support unified addressing is the same as the pointer value through which that memory is accessed on the host. It is not necessary to call cudaHostGetDevicePointer() to get the device pointer for these allocations.
+The pointer value through which allocated host memory may be accessed in kernels on all devices that support unified addressing is the same as the pointer value through which that memory is accessed on the host. It is not necessary to call :func:`~.cudaHostGetDevicePointer` to get the device pointer for these allocations.
 
 
 
-Note that this is not the case for memory allocated using the flag cudaHostAllocWriteCombined, as discussed below.
+Note that this is not the case for memory allocated using the flag :py:obj:`~.cudaHostAllocWriteCombined`, as discussed below.
 
 
 
@@ -5711,7 +5744,7 @@ Note that this is not the case for memory allocated using the flag cudaHostAlloc
 
 
 
-Upon enabling direct access from a device that supports unified addressing to another peer device that supports unified addressing using cudaDeviceEnablePeerAccess() all memory allocated in the peer device using cudaMalloc() and cudaMallocPitch() will immediately be accessible by the current device. The device pointer value through which any peer's memory may be accessed in the current device is the same pointer value through which that memory may be accessed from the peer device.
+Upon enabling direct access from a device that supports unified addressing to another peer device that supports unified addressing using :func:`~.cudaDeviceEnablePeerAccess`, all memory allocated in the peer device using :func:`~.cudaMalloc` and :func:`~.cudaMallocPitch` will immediately be accessible by the current device. The device pointer value through which any peer's memory may be accessed in the current device is the same pointer value through which that memory may be accessed from the peer device.
 
 
 
@@ -5721,14 +5754,17 @@ Upon enabling direct access from a device that supports unified addressing to an
 
 
 
-Not all memory may be accessed on devices through the same pointer value through which they are accessed on the host. These exceptions are host memory registered using cudaHostRegister() and host memory allocated using the flag cudaHostAllocWriteCombined. For these exceptions, there exists a distinct host and device address for the memory. The device address is guaranteed to not overlap any valid host pointer range and is guaranteed to have the same value across all devices that support unified addressing.
+Not all memory may be accessed on devices through the same pointer value through which they are accessed on the host. These exceptions are host memory registered using :func:`~.cudaHostRegister` and host memory allocated using the flag :py:obj:`~.cudaHostAllocWriteCombined`. For these exceptions, there exists a distinct host and device address for the memory. The device address is guaranteed to not overlap any valid host pointer range and is guaranteed to have the same value across all devices that support unified addressing.
 
 
 
-This device address may be queried using cudaHostGetDevicePointer() when a device using unified addressing is current. Either the host or the unified device pointer value may be used to refer to this memory in cudaMemcpy() and similar functions using the cudaMemcpyDefault memory direction.
+This device address may be queried using :func:`~.cudaHostGetDevicePointer` when a device using unified addressing is current. Either the host or the unified device pointer value may be used to refer to this memory in :func:`~.cudaMemcpy` and similar functions using the :py:obj:`~.cudaMemcpyDefault` memory direction.
 
 .. autofunction:: cuda.bindings.runtime.cudaPointerGetAttributes
 
+.. _cuda-bindings-runtime-cudart_peer:
+.. _cuda-bindings-runtime-group__cudart__peer:
+
 Peer Device Memory Access
 -------------------------
 
@@ -5738,6 +5774,9 @@ This section describes the peer device memory access functions of the CUDA runti
 .. autofunction:: cuda.bindings.runtime.cudaDeviceEnablePeerAccess
 .. autofunction:: cuda.bindings.runtime.cudaDeviceDisablePeerAccess
 
+.. _cuda-bindings-runtime-cudart_opengl:
+.. _cuda-bindings-runtime-group__cudart__opengl:
+
 OpenGL Interoperability
 -----------------------
 
@@ -5770,24 +5809,36 @@ This section describes the OpenGL interoperability functions of the CUDA runtime
 .. autofunction:: cuda.bindings.runtime.cudaGraphicsGLRegisterImage
 .. autofunction:: cuda.bindings.runtime.cudaGraphicsGLRegisterBuffer
 
+.. _cuda-bindings-runtime-cudart_d3d9:
+.. _cuda-bindings-runtime-group__cudart__d3d9:
+
 Direct3D 9 Interoperability
 ---------------------------
 
 
 
 
+.. _cuda-bindings-runtime-cudart_d3d10:
+.. _cuda-bindings-runtime-group__cudart__d3d10:
+
 Direct3D 10 Interoperability
 ----------------------------
 
 
 
 
+.. _cuda-bindings-runtime-cudart_d3d11:
+.. _cuda-bindings-runtime-group__cudart__d3d11:
+
 Direct3D 11 Interoperability
 ----------------------------
 
 
 
 
+.. _cuda-bindings-runtime-cudart_vdpau:
+.. _cuda-bindings-runtime-group__cudart__vdpau:
+
 VDPAU Interoperability
 ----------------------
 
@@ -5798,6 +5849,9 @@ This section describes the VDPAU interoperability functions of the CUDA runtime
 .. autofunction:: cuda.bindings.runtime.cudaGraphicsVDPAURegisterVideoSurface
 .. autofunction:: cuda.bindings.runtime.cudaGraphicsVDPAURegisterOutputSurface
 
+.. _cuda-bindings-runtime-cudart_egl:
+.. _cuda-bindings-runtime-group__cudart__egl:
+
 EGL Interoperability
 --------------------
 
@@ -5816,6 +5870,9 @@ This section describes the EGL interoperability functions of the CUDA runtime ap
 .. autofunction:: cuda.bindings.runtime.cudaGraphicsResourceGetMappedEglFrame
 .. autofunction:: cuda.bindings.runtime.cudaEventCreateFromEGLSync
 
+.. _cuda-bindings-runtime-cudart_interop:
+.. _cuda-bindings-runtime-group__cudart__interop:
+
 Graphics Interoperability
 -------------------------
 
@@ -5829,6 +5886,9 @@ This section describes the graphics interoperability functions of the CUDA runti
 .. autofunction:: cuda.bindings.runtime.cudaGraphicsSubResourceGetMappedArray
 .. autofunction:: cuda.bindings.runtime.cudaGraphicsResourceGetMappedMipmappedArray
 
+.. _cuda-bindings-runtime-cudart_texture_object:
+.. _cuda-bindings-runtime-group__cudart__texture__object:
+
 Texture Object Management
 -------------------------
 
@@ -5842,6 +5902,9 @@ This section describes the low level texture object management functions of the
 .. autofunction:: cuda.bindings.runtime.cudaGetTextureObjectTextureDesc
 .. autofunction:: cuda.bindings.runtime.cudaGetTextureObjectResourceViewDesc
 
+.. _cuda-bindings-runtime-cudart_surface_object:
+.. _cuda-bindings-runtime-group__cudart__surface__object:
+
 Surface Object Management
 -------------------------
 
@@ -5851,6 +5914,9 @@ This section describes the low level texture object management functions of the
 .. autofunction:: cuda.bindings.runtime.cudaDestroySurfaceObject
 .. autofunction:: cuda.bindings.runtime.cudaGetSurfaceObjectResourceDesc
 
+.. _cuda-bindings-runtime-cudart__version:
+.. _cuda-bindings-runtime-group__cudart____version:
+
 Version Management
 ------------------
 
@@ -5860,6 +5926,9 @@ Version Management
 .. autofunction:: cuda.bindings.runtime.cudaRuntimeGetVersion
 .. autofunction:: cuda.bindings.runtime.getLocalRuntimeVersion
 
+.. _cuda-bindings-runtime-cudart_logs:
+.. _cuda-bindings-runtime-group__cudart__logs:
+
 Error Log Management Functions
 ------------------------------
 
@@ -5872,6 +5941,9 @@ This section describes the error log management functions of the CUDA runtime ap
 .. autofunction:: cuda.bindings.runtime.cudaLogsDumpToFile
 .. autofunction:: cuda.bindings.runtime.cudaLogsDumpToMemory
 
+.. _cuda-bindings-runtime-cudart_graph:
+.. _cuda-bindings-runtime-group__cudart__graph:
+
 Graph Management
 ----------------
 
@@ -5967,6 +6039,9 @@ This section describes the graph management functions of CUDA runtime applicatio
 .. autofunction:: cuda.bindings.runtime.cudaGraphConditionalHandleCreate
 .. autofunction:: cuda.bindings.runtime.cudaGraphConditionalHandleCreate_v2
 
+.. _cuda-bindings-runtime-cudart_driver_entry_point:
+.. _cuda-bindings-runtime-group__cudart__driver__entry__point:
+
 Driver Entry Point Access
 -------------------------
 
@@ -5975,6 +6050,9 @@ This section describes the driver entry point access functions of CUDA runtime a
 .. autofunction:: cuda.bindings.runtime.cudaGetDriverEntryPoint
 .. autofunction:: cuda.bindings.runtime.cudaGetDriverEntryPointByVersion
 
+.. _cuda-bindings-runtime-cudart_library:
+.. _cuda-bindings-runtime-group__cudart__library:
+
 Library Management
 ------------------
 
@@ -5991,6 +6069,9 @@ This section describes the library management functions of the CUDA runtime appl
 .. autofunction:: cuda.bindings.runtime.cudaLibraryEnumerateKernels
 .. autofunction:: cuda.bindings.runtime.cudaKernelSetAttributeForDevice
 
+.. _cuda-bindings-runtime-cudart_execution_context:
+.. _cuda-bindings-runtime-group__cudart__execution__context:
+
 Execution Context Management
 ----------------------------
 
@@ -6004,11 +6085,11 @@ This section describes the execution context management functions of the CUDA ru
 
 
 
-A CUDA execution context cudaExecutionContext_t serves as an abstraction for the contexts exposed by the CUDA Runtime, specifically green contexts and the primary context, and provides a unified programming model and API interface for contexts in the Runtime.
+A CUDA execution context :py:obj:`~.cudaExecutionContext_t` serves as an abstraction for the contexts exposed by the CUDA Runtime, specifically green contexts and the primary context, and provides a unified programming model and API interface for contexts in the Runtime.
 
 There are two primary ways today to obtain an execution context:
 
-- cudaDeviceGetExecutionCtx: Returns the execution context that corresponds to the primary context of the specified device.
+- :func:`~.cudaDeviceGetExecutionCtx`: Returns the execution context that corresponds to the primary context of the specified device.
 
 
 
@@ -6016,7 +6097,7 @@ There are two primary ways today to obtain an execution context:
 
 
 
-- cudaGreenCtxCreate: Creates a green context with the specified resources and returns an execution context.
+- :func:`~.cudaGreenCtxCreate`: Creates a green context with the specified resources and returns an execution context.
 
 
 
@@ -6028,7 +6109,7 @@ There are two primary ways today to obtain an execution context:
 
 Once you have an execution context at hand, you can perform context-level operations via the CUDA Runtime APIs. This includes:
 
-- Submitting work via streams created with cudaExecutionCtxStreamCreate.
+- Submitting work via streams created with :func:`~.cudaExecutionCtxStreamCreate`.
 
 
 
@@ -6036,7 +6117,7 @@ Once you have an execution context at hand, you can perform context-level operat
 
 
 
-- Querying context via cudaExecutionCtxGetDevResource, cudaExecutionCtxGetDevice, etc.
+- Querying context via :func:`~.cudaExecutionCtxGetDevResource`, :func:`~.cudaExecutionCtxGetDevice`, etc.
 
 
 
@@ -6044,7 +6125,7 @@ Once you have an execution context at hand, you can perform context-level operat
 
 
 
-- Synchronizing and tracking context-level operations via cudaExecutionCtxSynchronize, cudaExecutionCtxRecordEvent, cudaExecutionCtxWaitEvent.
+- Synchronizing and tracking context-level operations via :func:`~.cudaExecutionCtxSynchronize`, :func:`~.cudaExecutionCtxRecordEvent`, :func:`~.cudaExecutionCtxWaitEvent`.
 
 
 
@@ -6052,7 +6133,7 @@ Once you have an execution context at hand, you can perform context-level operat
 
 
 
-- Performing context-level graph node operations via cudaGraphAddNode by specifying the context in ``nodeParams``\ . Note that individual node creation APIs, such as cudaGraphAddKernelNode, do not support specifying an execution context.
+- Performing context-level graph node operations via :func:`~.cudaGraphAddNode` by specifying the context in ``nodeParams``\ . Note that individual node creation APIs, such as :func:`~.cudaGraphAddKernelNode`, do not support specifying an execution context.
 
 
 
@@ -6062,9 +6143,9 @@ Once you have an execution context at hand, you can perform context-level operat
 
 
 
-Note: The above APIs take in an explicit cudaExecutionContext_t handle and ignores the context that is current to the calling thread. This enables explicit context-based programming without relying on thread-local state. If no context is specified, the APIs return cudaErrorInvalidValue.
+Note: The above APIs take in an explicit :py:obj:`~.cudaExecutionContext_t` handle and ignore the context that is current to the calling thread. This enables explicit context-based programming without relying on thread-local state. If no context is specified, the APIs return :py:obj:`~.cudaErrorInvalidValue`.
 
-Note: Developers should treat cudaExecutionContext_t as an opaque handle and avoid assumptions about its underlying representation. The CUDA Runtime does not provide a way to convert this handle into driver-level contexts, such as ``CUcontext`` or ``CUgreenCtx``.
+Note: Developers should treat :py:obj:`~.cudaExecutionContext_t` as an opaque handle and avoid assumptions about its underlying representation. The CUDA Runtime does not provide a way to convert this handle into driver-level contexts, such as :py:obj:`~.CUcontext` or :py:obj:`~.CUgreenCtx`.
 
 
 
@@ -6074,7 +6155,7 @@ Note: Developers should treat cudaExecutionContext_t as an opaque handle and avo
 
 
 
-The lifetime of CUDA resources (memory, streams, events, modules, etc) is not tied to the lifetime of the execution context. Their lifetime is tied to the device against which they were created. As such, usage of cudaDeviceReset() should be avoided to persist the lifetime of these resources.
+The lifetime of CUDA resources (memory, streams, events, modules, etc) is not tied to the lifetime of the execution context. Their lifetime is tied to the device against which they were created. As such, usage of :func:`~.cudaDeviceReset` should be avoided to persist the lifetime of these resources.
 
 
 
@@ -6092,7 +6173,7 @@ The CUDA runtime does not provide a way to set an execution context as current.
 
 
 
-To work with these APIs (for example, cudaMalloc, cudaEventCreate, etc), developers are expected to call cudaSetDevice() prior to invoking them. Doing so does not impact functional correctness as these APIs operate on resources that are device-wide. If users have a context handle at hand, they can get the device handle from the context handle using cudaExecutionCtxGetDevice().
+To work with these APIs (for example, :func:`~.cudaMalloc`, :func:`~.cudaEventCreate`, etc), developers are expected to call :func:`~.cudaSetDevice` prior to invoking them. Doing so does not impact functional correctness as these APIs operate on resources that are device-wide. If users have a context handle at hand, they can get the device handle from the context handle using :func:`~.cudaExecutionCtxGetDevice`.
 
 
 
@@ -6102,7 +6183,7 @@ To work with these APIs (for example, cudaMalloc, cudaEventCreate, etc), develop
 
 
 
-These APIs (for example, cudaLaunchKernel, cudaMemcpyAsync, cudaMemsetAsync, etc) take in a stream and resources are inferred from the context bound to the stream at creation. See cudaExecutionCtxStreamCreate for more details. Developers are expected to use the stream-based APIs for context awareness and always pass an explicit stream handle to ensure context-awareness, and avoid reliance on the default NULL stream, which implicitly binds to the current context.
+These APIs (for example, :func:`~.cudaLaunchKernel`, :func:`~.cudaMemcpyAsync`, :func:`~.cudaMemsetAsync`, etc) take in a stream and resources are inferred from the context bound to the stream at creation. See :func:`~.cudaExecutionCtxStreamCreate` for more details. Developers are expected to use the stream-based APIs for context awareness and always pass an explicit stream handle to ensure context-awareness, and avoid reliance on the default NULL stream, which implicitly binds to the current context.
 
 
 
@@ -6118,7 +6199,7 @@ Green contexts are a lightweight alternative to traditional contexts, that can b
 
 Here are the broad initial steps to follow to get started:
 
-- (1) Start with an initial set of resources. For SM resources, they can be fetched via cudaDeviceGetDevResource. In case of workqueues, a new configuration can be used or an existing one queried via the cudaDeviceGetDevResource API.
+- (1) Start with an initial set of resources. For SM resources, they can be fetched via :func:`~.cudaDeviceGetDevResource`. In case of workqueues, a new configuration can be used or an existing one queried via the :func:`~.cudaDeviceGetDevResource` API.
 
 
 
@@ -6126,7 +6207,7 @@ Here are the broad initial steps to follow to get started:
 
 
 
-- (2) Modify these resources by either partitioning them (in case of SMs) or changing the configuration (in case of workqueues). To partition SMs, we recommend cudaDevSmResourceSplit. Changing the workqueue configuration can be done directly in place.
+- (2) Modify these resources by either partitioning them (in case of SMs) or changing the configuration (in case of workqueues). To partition SMs, we recommend :func:`~.cudaDevSmResourceSplit`. Changing the workqueue configuration can be done directly in place.
 
 
 
@@ -6134,7 +6215,7 @@ Here are the broad initial steps to follow to get started:
 
 
 
-- (3) Finalize the specification of resources by creating a descriptor via cudaDevResourceGenerateDesc.
+- (3) Finalize the specification of resources by creating a descriptor via :func:`~.cudaDevResourceGenerateDesc`.
 
 
 
@@ -6142,7 +6223,7 @@ Here are the broad initial steps to follow to get started:
 
 
 
-- (4) Create a green context via cudaGreenCtxCreate. This provisions the resource, such as workqueues (until this step it was only a configuration specification).
+- (4) Create a green context via :func:`~.cudaGreenCtxCreate`. This provisions the resource, such as workqueues (until this step it was only a configuration specification).
 
 
 
@@ -6150,7 +6231,7 @@ Here are the broad initial steps to follow to get started:
 
 
 
-- (5) Create a stream via cudaExecutionCtxStreamCreate, and use it throughout your application.
+- (5) Create a stream via :func:`~.cudaExecutionCtxStreamCreate`, and use it throughout your application.
 
 
 
@@ -6162,7 +6243,7 @@ Here are the broad initial steps to follow to get started:
 
 SMs
 
-There are two possible partition operations - with cudaDevSmResourceSplitByCount the partitions created have to follow default SM count granularity requirements, so it will often be rounded up and aligned to a default value. On the other hand, cudaDevSmResourceSplit is explicit and allows for creation of non-equal groups. It will not round up automatically - instead it is the developer’s responsibility to query and set the correct values. These requirements can be queried with cudaDeviceGetDevResource to determine the alignment granularity (sm.smCoscheduledAlignment). A general guideline on the default values for each compute architecture:
+There are two possible partition operations - with :func:`~.cudaDevSmResourceSplitByCount` the partitions created have to follow default SM count granularity requirements, so it will often be rounded up and aligned to a default value. On the other hand, :func:`~.cudaDevSmResourceSplit` is explicit and allows for creation of non-equal groups. It will not round up automatically - instead it is the developer’s responsibility to query and set the correct values. These requirements can be queried with :func:`~.cudaDeviceGetDevResource` to determine the alignment granularity (sm.smCoscheduledAlignment). A general guideline on the default values for each compute architecture:
 
 - On Compute Architecture 7.X, 8.X, and all Tegra SoC:
 
@@ -6216,9 +6297,9 @@ There are two possible partition operations - with cudaDevSmResourceSplitByCount
 
 Workqueues
 
-For ``cudaDevResourceTypeWorkqueueConfig``\ , the resource specifies the expected maximum number of concurrent stream-ordered workloads via the ``wqConcurrencyLimit``\  field. The ``sharingScope``\  field determines how workqueue resources are shared:
+For :py:obj:`~.cudaDevResourceTypeWorkqueueConfig`\ , the resource specifies the expected maximum number of concurrent stream-ordered workloads via the ``wqConcurrencyLimit``\  field. The ``sharingScope``\  field determines how workqueue resources are shared:
 
-- ``cudaDevWorkqueueConfigScopeDeviceCtx:``\  Use all shared workqueue resources across all contexts (default driver behavior).
+- :py:obj:`~.cudaDevWorkqueueConfigScopeDeviceCtx`:\  Use all shared workqueue resources across all contexts (default driver behavior).
 
 
 
@@ -6226,7 +6307,7 @@ For ``cudaDevResourceTypeWorkqueueConfig``\ , the resource specifies the expecte
 
 
 
-- ``cudaDevWorkqueueConfigScopeGreenCtxBalanced:``\  When possible, use non-overlapping workqueue resources with other balanced green contexts.
+- :py:obj:`~.cudaDevWorkqueueConfigScopeGreenCtxBalanced`:\  When possible, use non-overlapping workqueue resources with other balanced green contexts.
 
 
 
@@ -6236,13 +6317,13 @@ For ``cudaDevResourceTypeWorkqueueConfig``\ , the resource specifies the expecte
 
 
 
-The maximum concurrency limit depends on ``CUDA_DEVICE_MAX_CONNECTIONS`` and can be queried from the device via cudaDeviceGetDevResource. Configurations may exceed this concurrency limit, but the driver will not guarantee that work submission remains non-overlapping.
+The maximum concurrency limit depends on ``CUDA_DEVICE_MAX_CONNECTIONS`` and can be queried from the device via :func:`~.cudaDeviceGetDevResource`. Configurations may exceed this concurrency limit, but the driver will not guarantee that work submission remains non-overlapping.
 
-For ``cudaDevResourceTypeWorkqueue``\ , the resource represents a pre-existing workqueue that can be retrieved from existing execution contexts. This allows reusing workqueue resources across different execution contexts.
+For :py:obj:`~.cudaDevResourceTypeWorkqueue`\ , the resource represents a pre-existing workqueue that can be retrieved from existing execution contexts. This allows reusing workqueue resources across different execution contexts.
 
 On Concurrency
 
-Even if the green contexts have disjoint SM partitions, it is not guaranteed that the kernels launched in them will run concurrently or have forward progress guarantees. This is due to other resources that could cause a dependency. Using a combination of disjoint SMs and ``cudaDevWorkqueueConfigScopeGreenCtxBalanced``\  workqueue configurations can provide the best chance of avoiding interference. More resources will be added in the future to provide stronger guarantees.
+Even if the green contexts have disjoint SM partitions, it is not guaranteed that the kernels launched in them will run concurrently or have forward progress guarantees. This is due to other resources that could cause a dependency. Using a combination of disjoint SMs and :py:obj:`~.cudaDevWorkqueueConfigScopeGreenCtxBalanced`\  workqueue configurations can provide the best chance of avoiding interference. More resources will be added in the future to provide stronger guarantees.
 
 Additionally, there are two known scenarios, where its possible for the workload to run on more SMs than was provisioned (but never less).
 
@@ -6274,6 +6355,9 @@ Additionally, there are two known scenarios, where its possible for the workload
 .. autofunction:: cuda.bindings.runtime.cudaExecutionCtxWaitEvent
 .. autofunction:: cuda.bindings.runtime.cudaDeviceGetExecutionCtx
 
+.. _cuda-bindings-runtime-cudart_highlevel:
+.. _cuda-bindings-runtime-group__cudart__highlevel:
+
 C++ API Routines
 ----------------
 C++-style interface built on top of CUDA runtime API.
@@ -6288,6 +6372,9 @@ impl_private
 This section describes the C++ high level API functions of the CUDA runtime application programming interface. To use these functions, your application needs to be compiled with the ``nvcc``\  compiler.
 
 
+.. _cuda-bindings-runtime-cudart_driver:
+.. _cuda-bindings-runtime-group__cudart__driver:
+
 Interactions with the CUDA Driver API
 -------------------------------------
 
@@ -6301,11 +6388,11 @@ This section describes the interactions between the CUDA Driver API and the CUDA
 
 
 
-The CUDA Runtime provides cudaExecutionContext_t as an abstraction over driver-level contexts—specifically, green contexts and the primary context.
+The CUDA Runtime provides :py:obj:`~.cudaExecutionContext_t` as an abstraction over driver-level contexts—specifically, green contexts and the primary context.
 
 There are two primary ways to obtain an execution context:
 
-- cudaDeviceGetExecutionCtx: Returns the execution context that corresponds to the primary context of the specified device.
+- :func:`~.cudaDeviceGetExecutionCtx`: Returns the execution context that corresponds to the primary context of the specified device.
 
 
 
@@ -6313,7 +6400,7 @@ There are two primary ways to obtain an execution context:
 
 
 
-- cudaGreenCtxCreate: Creates a green context with the specified resources and returns an execution context.
+- :func:`~.cudaGreenCtxCreate`: Creates a green context with the specified resources and returns an execution context.
 
 
 
@@ -6323,7 +6410,7 @@ There are two primary ways to obtain an execution context:
 
 
 
-Note: Developers should treat cudaExecutionContext_t as an opaque handle and avoid assumptions about its underlying representation. The CUDA Runtime does not provide a way to convert this handle into a ``CUcontext`` or ``CUgreenCtx``.
+Note: Developers should treat :py:obj:`~.cudaExecutionContext_t` as an opaque handle and avoid assumptions about its underlying representation. The CUDA Runtime does not provide a way to convert this handle into a :py:obj:`~.CUcontext` or :py:obj:`~.CUgreenCtx`.
 
 
 
@@ -6333,11 +6420,11 @@ Note: Developers should treat cudaExecutionContext_t as an opaque handle and avo
 
 
 
-The primary context is the default execution context associated with a device in the Runtime. It can be obtained via a call to cudaDeviceGetExecutionCtx(). There is a one-to-one mapping between CUDA devices in the runtime and their primary contexts within a process.
+The primary context is the default execution context associated with a device in the Runtime. It can be obtained via a call to :func:`~.cudaDeviceGetExecutionCtx`. There is a one-to-one mapping between CUDA devices in the runtime and their primary contexts within a process.
 
 From the CUDA Runtime’s perspective, a device and its primary context are functionally synonymous.
 
-Unless explicitly overridden, either by making a different context current via the Driver API (e.g., ``cuCtxSetCurrent()``) or by using an explicit execution context handle, the Runtime will implicitly initialize and use the primary context for API calls as needed.
+Unless explicitly overridden, either by making a different context current via the Driver API (e.g., :func:`~.cuCtxSetCurrent`) or by using an explicit execution context handle, the Runtime will implicitly initialize and use the primary context for API calls as needed.
 
 
 
@@ -6347,13 +6434,13 @@ Unless explicitly overridden, either by making a different context current via t
 
 
 
-Unless an explicit execution context is specified (see “Execution Context Management” for APIs), CUDA Runtime API calls operate on the CUDA Driver ``CUcontext`` which is current to the calling host thread. If no ``CUcontext`` is current to the calling thread when a CUDA Runtime API call which requires an active context is made, then the primary context (device execution context) for a device will be selected, made current to the calling thread, and initialized. The context will be initialized using the parameters specified by the CUDA Runtime API functions cudaSetDeviceFlags(), ``cudaD3D9SetDirect3DDevice()``, ``cudaD3D10SetDirect3DDevice()``, ``cudaD3D11SetDirect3DDevice()``, cudaGLSetGLDevice(), and cudaVDPAUSetVDPAUDevice(). Note that these functions will fail with cudaErrorSetOnActiveProcess if they are called when the primary context for the specified device has already been initialized, except for cudaSetDeviceFlags() which will simply overwrite the previous settings.
+Unless an explicit execution context is specified (see “Execution Context Management” for APIs), CUDA Runtime API calls operate on the CUDA Driver :py:obj:`~.CUcontext` which is current to the calling host thread. If no :py:obj:`~.CUcontext` is current to the calling thread when a CUDA Runtime API call which requires an active context is made, then the primary context (device execution context) for a device will be selected, made current to the calling thread, and initialized. The context will be initialized using the parameters specified by the CUDA Runtime API functions :func:`~.cudaSetDeviceFlags`, ``cudaD3D9SetDirect3DDevice()``, ``cudaD3D10SetDirect3DDevice()``, ``cudaD3D11SetDirect3DDevice()``, :func:`~.cudaGLSetGLDevice`, and :func:`~.cudaVDPAUSetVDPAUDevice`. Note that these functions will fail with :py:obj:`~.cudaErrorSetOnActiveProcess` if they are called when the primary context for the specified device has already been initialized, except for :func:`~.cudaSetDeviceFlags` which will simply overwrite the previous settings.
 
-The function cudaInitDevice() ensures that the primary context is initialized for the requested device but does not make it current to the calling thread.
+The function :func:`~.cudaInitDevice` ensures that the primary context is initialized for the requested device but does not make it current to the calling thread.
 
-The function cudaSetDevice() initializes the primary context for the specified device and makes it current to the calling thread by calling ``cuCtxSetCurrent()``.
+The function :func:`~.cudaSetDevice` initializes the primary context for the specified device and makes it current to the calling thread by calling :func:`~.cuCtxSetCurrent`.
 
-Primary contexts will remain active until they are explicitly deinitialized using cudaDeviceReset(). The function cudaDeviceReset() will deinitialize the primary context for the calling thread's current device immediately. The context will remain current to all of the threads that it was current to. The next CUDA Runtime API call on any thread which requires an active context will trigger the reinitialization of that device's primary context.
+Primary contexts will remain active until they are explicitly deinitialized using :func:`~.cudaDeviceReset`. The function :func:`~.cudaDeviceReset` will deinitialize the primary context for the calling thread's current device immediately. The context will remain current to all of the threads that it was current to. The next CUDA Runtime API call on any thread which requires an active context will trigger the reinitialization of that device's primary context.
 
 Note that primary contexts are shared resources. It is recommended that the primary context not be reset except just before exit or to recover from an unspecified launch failure.
 
@@ -6361,122 +6448,125 @@ Note that primary contexts are shared resources. It is recommended that the prim
 
 
 
-**CUcontext Interoperability**
+:py:obj:`~.CUcontext` **Interoperability**
 
 
 
-Note that the use of multiple ``CUcontext`` s per device within a single process will substantially degrade performance and is strongly discouraged. Instead, it is highly recommended to either use execution contexts cudaExecutionContext_t or the implicit one-to-one device-to-primary context mapping for the process provided by the CUDA Runtime API.
+Note that the use of multiple :py:obj:`~.CUcontext`\ s per device within a single process will substantially degrade performance and is strongly discouraged. Instead, it is highly recommended to either use execution contexts :py:obj:`~.cudaExecutionContext_t` or the implicit one-to-one device-to-primary context mapping for the process provided by the CUDA Runtime API.
 
-If a non-primary ``CUcontext`` created by the CUDA Driver API is current to a thread then the CUDA Runtime API calls to that thread will operate on that ``CUcontext``, with some exceptions listed below. Interoperability between data types is discussed in the following sections.
+If a non-primary :py:obj:`~.CUcontext` created by the CUDA Driver API is current to a thread then the CUDA Runtime API calls to that thread will operate on that :py:obj:`~.CUcontext`, with some exceptions listed below. Interoperability between data types is discussed in the following sections.
 
-The function cudaDeviceEnablePeerAccess() and the rest of the peer access API may not be called when a non-primary CUcontext is current. To use the peer access APIs with a context created using the CUDA Driver API, it is necessary that the CUDA Driver API be used to access these features.
+The function :func:`~.cudaDeviceEnablePeerAccess` and the rest of the peer access API may not be called when a non-primary :py:obj:`~.CUcontext` is current. To use the peer access APIs with a context created using the CUDA Driver API, it is necessary that the CUDA Driver API be used to access these features.
 
-All CUDA Runtime API state (e.g, global variables' addresses and values) travels with its underlying ``CUcontext``. In particular, if a ``CUcontext`` is moved from one thread to another then all CUDA Runtime API state will move to that thread as well.
+All CUDA Runtime API state (e.g., global variables' addresses and values) travels with its underlying :py:obj:`~.CUcontext`. In particular, if a :py:obj:`~.CUcontext` is moved from one thread to another then all CUDA Runtime API state will move to that thread as well.
 
-Please note that attaching to legacy CUcontext (those with a version of 3010 as returned by ``cuCtxGetApiVersion()``) is not possible. The CUDA Runtime will return cudaErrorIncompatibleDriverContext in such cases.
+Please note that attaching to legacy :py:obj:`~.CUcontext` (those with a version of 3010 as returned by :func:`~.cuCtxGetApiVersion`) is not possible. The CUDA Runtime will return :py:obj:`~.cudaErrorIncompatibleDriverContext` in such cases.
 
 
 
 
 
-**Interactions between CUstream and cudaStream_t**
+**Interactions between** :py:obj:`~.CUstream` **and** :py:obj:`~.cudaStream_t`
 
 
 
-The types ``CUstream`` and cudaStream_t are identical and may be used interchangeably.
+The types :py:obj:`~.CUstream` and :py:obj:`~.cudaStream_t` are identical and may be used interchangeably.
 
 
 
 
 
-**Interactions between CUevent and cudaEvent_t**
+**Interactions between** :py:obj:`~.CUevent` **and** :py:obj:`~.cudaEvent_t`
 
 
 
-The types ``CUevent`` and cudaEvent_t are identical and may be used interchangeably.
+The types :py:obj:`~.CUevent` and :py:obj:`~.cudaEvent_t` are identical and may be used interchangeably.
 
 
 
 
 
-**Interactions between CUarray and cudaArray_t**
+**Interactions between** :py:obj:`~.CUarray` **and** :py:obj:`~.cudaArray_t`
 
 
 
-The types ::CUarray and struct ::cudaArray * represent the same data type and may be used interchangeably by casting the two types between each other.
+The types :py:obj:`~.CUarray` and :py:obj:`~.cudaArray_t` represent the same data type and may be used interchangeably by casting the two types between each other.
 
-In order to use a ::CUarray in a CUDA Runtime API function which takes a struct ::cudaArray *, it is necessary to explicitly cast the ::CUarray to a struct ::cudaArray *.
+In order to use a :py:obj:`~.CUarray` in a CUDA Runtime API function which takes a :py:obj:`~.cudaArray_t`, it is necessary to explicitly cast the :py:obj:`~.CUarray` to a :py:obj:`~.cudaArray_t`.
 
-In order to use a ``struct cudaArray *`` in a CUDA Driver API function which takes a ``CUarray``, it is necessary to explicitly cast the ``struct cudaArray *`` to a ``CUarray`` .
+In order to use a :py:obj:`~.cudaArray_t` in a CUDA Driver API function which takes a :py:obj:`~.CUarray`, it is necessary to explicitly cast the :py:obj:`~.cudaArray_t` to a :py:obj:`~.CUarray`.
 
 
 
 
 
-**Interactions between CUgraphicsResource and cudaGraphicsResource_t**
+**Interactions between** :py:obj:`~.CUgraphicsResource` **and** :py:obj:`~.cudaGraphicsResource_t`
 
 
 
-The types ``CUgraphicsResource`` and cudaGraphicsResource_t represent the same data type and may be used interchangeably by casting the two types between each other.
+The types :py:obj:`~.CUgraphicsResource` and :py:obj:`~.cudaGraphicsResource_t` represent the same data type and may be used interchangeably by casting the two types between each other.
 
-In order to use a ``CUgraphicsResource`` in a CUDA Runtime API function which takes a cudaGraphicsResource_t, it is necessary to explicitly cast the ``CUgraphicsResource`` to a cudaGraphicsResource_t.
+In order to use a :py:obj:`~.CUgraphicsResource` in a CUDA Runtime API function which takes a :py:obj:`~.cudaGraphicsResource_t`, it is necessary to explicitly cast the :py:obj:`~.CUgraphicsResource` to a :py:obj:`~.cudaGraphicsResource_t`.
 
-In order to use a cudaGraphicsResource_t in a CUDA Driver API function which takes a ``CUgraphicsResource``, it is necessary to explicitly cast the cudaGraphicsResource_t to a ``CUgraphicsResource``.
+In order to use a :py:obj:`~.cudaGraphicsResource_t` in a CUDA Driver API function which takes a :py:obj:`~.CUgraphicsResource`, it is necessary to explicitly cast the :py:obj:`~.cudaGraphicsResource_t` to a :py:obj:`~.CUgraphicsResource`.
 
 
 
 
 
-**Interactions between CUtexObject and cudaTextureObject_t**
+**Interactions between** :py:obj:`~.CUtexObject` **and** :py:obj:`~.cudaTextureObject_t`
 
 
 
-The types ``CUtexObject`` and cudaTextureObject_t represent the same data type and may be used interchangeably by casting the two types between each other.
+The types :py:obj:`~.CUtexObject` and :py:obj:`~.cudaTextureObject_t` represent the same data type and may be used interchangeably by casting the two types between each other.
 
-In order to use a ``CUtexObject`` in a CUDA Runtime API function which takes a cudaTextureObject_t, it is necessary to explicitly cast the ``CUtexObject`` to a cudaTextureObject_t.
+In order to use a :py:obj:`~.CUtexObject` in a CUDA Runtime API function which takes a :py:obj:`~.cudaTextureObject_t`, it is necessary to explicitly cast the :py:obj:`~.CUtexObject` to a :py:obj:`~.cudaTextureObject_t`.
 
-In order to use a cudaTextureObject_t in a CUDA Driver API function which takes a ``CUtexObject``, it is necessary to explicitly cast the cudaTextureObject_t to a ``CUtexObject``.
+In order to use a :py:obj:`~.cudaTextureObject_t` in a CUDA Driver API function which takes a :py:obj:`~.CUtexObject`, it is necessary to explicitly cast the :py:obj:`~.cudaTextureObject_t` to a :py:obj:`~.CUtexObject`.
 
 
 
 
 
-**Interactions between CUsurfObject and cudaSurfaceObject_t**
+**Interactions between** :py:obj:`~.CUsurfObject` **and** :py:obj:`~.cudaSurfaceObject_t`
 
 
 
-The types ``CUsurfObject`` and cudaSurfaceObject_t represent the same data type and may be used interchangeably by casting the two types between each other.
+The types :py:obj:`~.CUsurfObject` and :py:obj:`~.cudaSurfaceObject_t` represent the same data type and may be used interchangeably by casting the two types between each other.
 
-In order to use a ``CUsurfObject`` in a CUDA Runtime API function which takes a cudaSurfaceObject_t, it is necessary to explicitly cast the ``CUsurfObject`` to a cudaSurfaceObject_t.
+In order to use a :py:obj:`~.CUsurfObject` in a CUDA Runtime API function which takes a :py:obj:`~.cudaSurfaceObject_t`, it is necessary to explicitly cast the :py:obj:`~.CUsurfObject` to a :py:obj:`~.cudaSurfaceObject_t`.
 
-In order to use a cudaSurfaceObject_t in a CUDA Driver API function which takes a ``CUsurfObject``, it is necessary to explicitly cast the cudaSurfaceObject_t to a ``CUsurfObject``.
+In order to use a :py:obj:`~.cudaSurfaceObject_t` in a CUDA Driver API function which takes a :py:obj:`~.CUsurfObject`, it is necessary to explicitly cast the :py:obj:`~.cudaSurfaceObject_t` to a :py:obj:`~.CUsurfObject`.
 
 
 
 
 
-**Interactions between CUfunction and cudaFunction_t**
+**Interactions between** :py:obj:`~.CUfunction` **and** :py:obj:`~.cudaFunction_t`
 
 
 
-The types ``CUfunction`` and cudaFunction_t represent the same data type and may be used interchangeably by casting the two types between each other.
+The types :py:obj:`~.CUfunction` and :py:obj:`~.cudaFunction_t` represent the same data type and may be used interchangeably by casting the two types between each other.
 
-In order to use a cudaFunction_t in a CUDA Driver API function which takes a ``CUfunction``, it is necessary to explicitly cast the cudaFunction_t to a ``CUfunction``.
+In order to use a :py:obj:`~.cudaFunction_t` in a CUDA Driver API function which takes a :py:obj:`~.CUfunction`, it is necessary to explicitly cast the :py:obj:`~.cudaFunction_t` to a :py:obj:`~.CUfunction`.
 
 
 
 
 
-**Interactions between CUkernel and cudaKernel_t**
+**Interactions between** :py:obj:`~.CUkernel` **and** :py:obj:`~.cudaKernel_t`
 
 
 
-The types ``CUkernel`` and cudaKernel_t represent the same data type and may be used interchangeably by casting the two types between each other.
+The types :py:obj:`~.CUkernel` and :py:obj:`~.cudaKernel_t` represent the same data type and may be used interchangeably by casting the two types between each other.
 
-In order to use a cudaKernel_t in a CUDA Driver API function which takes a ``CUkernel``, it is necessary to explicitly cast the cudaKernel_t to a ``CUkernel``.
+In order to use a :py:obj:`~.cudaKernel_t` in a CUDA Driver API function which takes a :py:obj:`~.CUkernel`, it is necessary to explicitly cast the :py:obj:`~.cudaKernel_t` to a :py:obj:`~.CUkernel`.
 
 .. autofunction:: cuda.bindings.runtime.cudaGetKernel
 
+.. _cuda-bindings-runtime-cudart_profiler:
+.. _cuda-bindings-runtime-group__cudart__profiler:
+
 Profiler Control
 ----------------