diff --git a/cuda_bindings/docs/source/conf.py b/cuda_bindings/docs/source/conf.py index b7503c3d77..898c824906 100644 --- a/cuda_bindings/docs/source/conf.py +++ b/cuda_bindings/docs/source/conf.py @@ -44,6 +44,7 @@ def _github_examples_ref(): extensions = [ "sphinx.ext.autodoc", "sphinx.ext.autosummary", + "sphinx.ext.extlinks", "sphinx.ext.napoleon", "sphinx.ext.intersphinx", "myst_nb", @@ -109,9 +110,16 @@ def _github_examples_ref(): # skip cmdline prompts copybutton_exclude = ".linenos, .gp" -rst_epilog = f""" -.. |cuda_bindings_github_ref| replace:: {GITHUB_EXAMPLES_REF} -""" +extlinks = { + "cuda-bindings-example": ( + f"https://github.com/NVIDIA/cuda-python/blob/{GITHUB_EXAMPLES_REF}/cuda_bindings/examples/%s", + "%s", + ), + "cuda-bindings-examples": ( + f"https://github.com/NVIDIA/cuda-python/tree/{GITHUB_EXAMPLES_REF}/cuda_bindings/examples%s", + "%s", + ), +} intersphinx_mapping = { "python": ("https://docs.python.org/3/", None), diff --git a/cuda_bindings/docs/source/examples.rst b/cuda_bindings/docs/source/examples.rst index fb0e12e75d..a9a588a850 100644 --- a/cuda_bindings/docs/source/examples.rst +++ b/cuda_bindings/docs/source/examples.rst @@ -5,61 +5,61 @@ Examples ======== This page links to the ``cuda.bindings`` examples shipped in the -`cuda-python repository `_. +:cuda-bindings-examples:`cuda-python repository `. Use it as a quick index when you want a runnable sample for a specific API area or CUDA feature. Introduction ------------ -- `clock_nvrtc.py `_ +- :cuda-bindings-example:`clock_nvrtc.py <0_Introduction/clock_nvrtc.py>` uses NVRTC-compiled CUDA code and the device clock to time a reduction kernel. -- `simple_cubemap_texture.py `_ +- :cuda-bindings-example:`simple_cubemap_texture.py <0_Introduction/simple_cubemap_texture.py>` demonstrates cubemap texture sampling and transformation. 
-- `simple_p2p.py `_ +- :cuda-bindings-example:`simple_p2p.py <0_Introduction/simple_p2p.py>` shows peer-to-peer memory access and transfers between multiple GPUs. -- `simple_zero_copy.py `_ +- :cuda-bindings-example:`simple_zero_copy.py <0_Introduction/simple_zero_copy.py>` uses zero-copy mapped host memory for vector addition. -- `system_wide_atomics.py `_ +- :cuda-bindings-example:`system_wide_atomics.py <0_Introduction/system_wide_atomics.py>` demonstrates system-wide atomic operations on managed memory. -- `vector_add_drv.py `_ +- :cuda-bindings-example:`vector_add_drv.py <0_Introduction/vector_add_drv.py>` uses the CUDA Driver API and unified virtual addressing for vector addition. -- `vector_add_mmap.py `_ +- :cuda-bindings-example:`vector_add_mmap.py <0_Introduction/vector_add_mmap.py>` uses virtual memory management APIs such as ``cuMemCreate`` and ``cuMemMap`` for vector addition. Concepts and techniques ----------------------- -- `stream_ordered_allocation.py `_ +- :cuda-bindings-example:`stream_ordered_allocation.py <2_Concepts_and_Techniques/stream_ordered_allocation.py>` demonstrates ``cudaMallocAsync`` and ``cudaFreeAsync`` together with memory-pool release thresholds. CUDA features ------------- -- `global_to_shmem_async_copy.py `_ +- :cuda-bindings-example:`global_to_shmem_async_copy.py <3_CUDA_Features/global_to_shmem_async_copy.py>` compares asynchronous global-to-shared-memory copy strategies in matrix multiplication kernels. -- `simple_cuda_graphs.py `_ +- :cuda-bindings-example:`simple_cuda_graphs.py <3_CUDA_Features/simple_cuda_graphs.py>` shows both manual CUDA graph construction and stream-capture-based replay. Libraries and tools ------------------- -- `conjugate_gradient_multi_block_cg.py `_ +- :cuda-bindings-example:`conjugate_gradient_multi_block_cg.py <4_CUDA_Libraries/conjugate_gradient_multi_block_cg.py>` implements a conjugate-gradient solver with cooperative groups and multi-block synchronization. 
-- `nvidia_smi.py `_ +- :cuda-bindings-example:`nvidia_smi.py <4_CUDA_Libraries/nvidia_smi.py>` uses NVML to implement a Python subset of ``nvidia-smi``. Advanced and interoperability ----------------------------- -- `iso_fd_modelling.py `_ +- :cuda-bindings-example:`iso_fd_modelling.py <extra/iso_fd_modelling.py>` runs isotropic finite-difference wave propagation across multiple GPUs with peer-to-peer halo exchange. -- `jit_program.py `_ +- :cuda-bindings-example:`jit_program.py <extra/jit_program.py>` JIT-compiles a SAXPY kernel with NVRTC and launches it through the Driver API. diff --git a/cuda_bindings/docs/source/overview.rst b/cuda_bindings/docs/source/overview.rst index 0ebf99bc23..e0d269cb6b 100644 --- a/cuda_bindings/docs/source/overview.rst +++ b/cuda_bindings/docs/source/overview.rst @@ -522,7 +522,7 @@ CUDA objects Certain CUDA kernels use native CUDA types as their parameters such as ``cudaTextureObject_t``. These types require special handling since they're neither a primitive ctype nor a custom user type. Since ``cuda.bindings`` exposes each of them as Python classes, they each implement ``getPtr()`` and ``__int__()``. These two callables used to support the NumPy and ctypes approach. The difference between each call is further described under `Tips and Tricks `_. For this example, lets use the ``transformKernel`` from -`simple_cubemap_texture.py `_. +:cuda-bindings-example:`simple_cubemap_texture.py <0_Introduction/simple_cubemap_texture.py>`. The :doc:`examples` page links to more samples covering textures, graphs, memory mapping, and multi-GPU workflows. 
diff --git a/cuda_core/docs/source/conf.py b/cuda_core/docs/source/conf.py index 6c0fe6b307..14d9329793 100644 --- a/cuda_core/docs/source/conf.py +++ b/cuda_core/docs/source/conf.py @@ -46,6 +46,7 @@ def _github_examples_ref(): "sphinx.ext.autosummary", "sphinx.ext.napoleon", "sphinx.ext.intersphinx", + "sphinx.ext.extlinks", "myst_nb", "sphinx_copybutton", "sphinx_toolbox.more_autodoc.autoprotocol", @@ -107,9 +108,16 @@ def _github_examples_ref(): # skip cmdline prompts copybutton_exclude = ".linenos, .gp" -rst_epilog = f""" -.. |cuda_core_github_ref| replace:: {GITHUB_EXAMPLES_REF} -""" +extlinks = { + "cuda-core-example": ( + f"https://github.com/NVIDIA/cuda-python/blob/{GITHUB_EXAMPLES_REF}/cuda_core/examples/%s", + "%s", + ), + "cuda-core-examples": ( + f"https://github.com/NVIDIA/cuda-python/tree/{GITHUB_EXAMPLES_REF}/cuda_core/examples%s", + "%s", + ), +} intersphinx_mapping = { "python": ("https://docs.python.org/3/", None), diff --git a/cuda_core/docs/source/examples.rst b/cuda_core/docs/source/examples.rst index 45044a0c90..e3b2ef8f3f 100644 --- a/cuda_core/docs/source/examples.rst +++ b/cuda_core/docs/source/examples.rst @@ -5,55 +5,55 @@ Examples ======== This page links to the ``cuda.core`` examples shipped in the -`cuda-python repository `_. +:cuda-core-examples:`cuda-python repository `. Use it as a quick index when you want a runnable starting point for a specific workflow. Compilation and kernel launch ----------------------------- -- `vector_add.py `_ +- :cuda-core-example:`vector_add.py` compiles and launches a simple vector-add kernel with CuPy arrays. -- `saxpy.py `_ +- :cuda-core-example:`saxpy.py` JIT-compiles a templated SAXPY kernel and launches both float and double instantiations. -- `pytorch_example.py `_ +- :cuda-core-example:`pytorch_example.py` launches a CUDA kernel with PyTorch tensors and a wrapped PyTorch stream. 
Multi-device and advanced launch configuration ---------------------------------------------- -- `simple_multi_gpu_example.py `_ +- :cuda-core-example:`simple_multi_gpu_example.py` compiles and launches kernels across multiple GPUs. -- `thread_block_cluster.py `_ +- :cuda-core-example:`thread_block_cluster.py` demonstrates thread block cluster launch configuration on Hopper-class GPUs. -- `tma_tensor_map.py `_ +- :cuda-core-example:`tma_tensor_map.py` demonstrates Tensor Memory Accelerator descriptors and TMA-based bulk copies. Linking and graphs ------------------ -- `jit_lto_fractal.py `_ +- :cuda-core-example:`jit_lto_fractal.py` uses JIT link-time optimization to link user-provided device code into a fractal workflow at runtime. -- `cuda_graphs.py `_ +- :cuda-core-example:`cuda_graphs.py` captures and replays a multi-kernel CUDA graph to reduce launch overhead. Interoperability and memory access ---------------------------------- -- `memory_ops.py `_ +- :cuda-core-example:`memory_ops.py` covers memory resources, pinned memory, device transfers, and DLPack interop. -- `strided_memory_view_cpu.py `_ +- :cuda-core-example:`strided_memory_view_cpu.py` uses ``StridedMemoryView`` with JIT-compiled CPU code via ``cffi``. -- `strided_memory_view_gpu.py `_ +- :cuda-core-example:`strided_memory_view_gpu.py` uses ``StridedMemoryView`` with JIT-compiled GPU code and foreign GPU buffers. -- `gl_interop_plasma.py `_ +- :cuda-core-example:`gl_interop_plasma.py` renders a CUDA-generated plasma effect through OpenGL interop without CPU copies. System inspection ----------------- -- `show_device_properties.py `_ +- :cuda-core-example:`show_device_properties.py` prints a detailed report of the CUDA devices available on the system. 
diff --git a/cuda_core/docs/source/getting-started.rst b/cuda_core/docs/source/getting-started.rst index ebe97df834..fb2f0b22fc 100644 --- a/cuda_core/docs/source/getting-started.rst +++ b/cuda_core/docs/source/getting-started.rst @@ -32,7 +32,7 @@ Example: Compiling and Launching a CUDA kernel ---------------------------------------------- To get a taste for ``cuda.core``, let's walk through a simple example that compiles and launches a vector addition kernel. -You can find the complete example in `vector_add.py `_ +You can find the complete example in :cuda-core-example:`vector_add.py` and browse the :doc:`examples page <examples>` for the rest of the shipped workflows. @@ -80,7 +80,7 @@ Note the use of the ``name_expressions`` parameter to the :meth:`Program.compile Next, we retrieve the compiled kernel from the CUBIN and prepare the arguments and kernel configuration. We're using `CuPy `_ arrays as inputs for this example, but you can use PyTorch tensors too (see -`pytorch_example.py `_ +:cuda-core-example:`pytorch_example.py` and the :doc:`examples page <examples>`). .. code-block:: python diff --git a/cuda_core/docs/source/interoperability.rst b/cuda_core/docs/source/interoperability.rst index 4aac89d13d..4aa155ce5f 100644 --- a/cuda_core/docs/source/interoperability.rst +++ b/cuda_core/docs/source/interoperability.rst @@ -70,11 +70,11 @@ a few iterations to ensure correctness. for extracting the metadata (such as pointer address, shape, strides, and dtype) from any Python objects supporting either CAI or DLPack and returning a :class:`~utils.StridedMemoryView` object. See the -`strided_memory_view_constructors.py `_ +:cuda-core-example:`strided_memory_view_constructors.py` example for the explicit constructors, or -`strided_memory_view_cpu.py `_ +:cuda-core-example:`strided_memory_view_cpu.py` and -`strided_memory_view_gpu.py `_ +:cuda-core-example:`strided_memory_view_gpu.py` for decorator-based workflows. 
This provides a *concrete implementation* to both protocols that is **array-library-agnostic**, so that all Python projects can just rely on this without either re-implementing (the consumer-side of)