From c7c996af848870d345379fe7b1e02887b9c0429c Mon Sep 17 00:00:00 2001
From: Jonathan Bell
Date: Tue, 14 Apr 2026 14:30:50 +0100
Subject: [PATCH 1/2] nvme-pci: manually allocate Host Memory Buffer segments
 on arm64

The Host Memory Buffer allocation algorithm interacts badly with arm64
platforms with no IOMMU for PCIe devices, such as BCM2711/BCM2712. The
discontiguous VA allocation in nvme_alloc_host_mem_single() always fails,
so nvme_alloc_host_mem_multi() falls back to the DMA coherent allocation
scheme. On arm64, this will come out of CMA by default.

Recent DRAM-less SSDs will request significant amounts of host memory -
up to 128MB. As NVMe devices are set up early in boot, CMA is mostly-free
so it ends up being claimed by a driver using it for opaque
device-exclusive buffers. The divide-and-conquer allocation strategy also
paradoxically results in increased CMA pressure if portions are already
reserved.

PCIe NVMe controllers implement a variably-sized HMB descriptor table,
typically ranging from 32 to 256 entries in size. Therefore, aside from
implementation-specific costs in the controller doing more granular
look-ups, providing smaller orders is acceptable. Failing to provide a
HMB does not prevent the controller from functioning.

Create an alternate implementation for arm64 that creates a scatterlist
and directly assigns contiguous pages from the buddy allocator, retrying
with smaller chunk sizes (and hence smaller page orders) on failure.
This will avoid CMA by default.
Signed-off-by: Jonathan Bell
---
 drivers/nvme/host/pci.c | 128 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 128 insertions(+)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index a64b4b4a18a19..89615cac702da 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -183,9 +183,11 @@ struct nvme_dev {
 	/* host memory buffer support: */
 	u64 host_mem_size;
 	u32 nr_host_mem_descs;
+	u32 nr_sgl_ents;
 	u32 host_mem_descs_size;
 	dma_addr_t host_mem_descs_dma;
 	struct nvme_host_mem_buf_desc *host_mem_descs;
+	struct scatterlist *host_mem_sgl;
 	void **host_mem_desc_bufs;
 	unsigned int nr_allocated_queues;
 	unsigned int nr_write_queues;
@@ -2300,6 +2302,32 @@ static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits)
 	return ret;
 }
 
+#if IS_ENABLED(CONFIG_ARM64)
+static void nvme_free_host_mem_multi(struct nvme_dev *dev)
+{
+	int i;
+
+	/*
+	 * Unmap with the nents originally passed to dma_map_sg(), not the
+	 * mapped count kept in nr_host_mem_descs - the DMA API requires
+	 * the unmap nents to match the map call even if entries coalesced.
+	 */
+	dma_unmap_sg(dev->dev, dev->host_mem_sgl, dev->nr_sgl_ents,
+		     DMA_FROM_DEVICE);
+	/*
+	 * The table was kcalloc()ed and populated with pages taken directly
+	 * from the buddy allocator at various orders, so sgl_free() (which
+	 * pairs with sgl_alloc() and assumes order-0 pages) cannot be used
+	 * here; free each chunk at the order it was allocated with.
+	 */
+	for (i = 0; i < dev->nr_sgl_ents; i++)
+		__free_pages(sg_page(&dev->host_mem_sgl[i]),
+			     get_order(dev->host_mem_sgl[i].length));
+	kfree(dev->host_mem_sgl);
+	dev->host_mem_sgl = NULL;
+	dev->nr_sgl_ents = 0;
+}
+#else
 static void nvme_free_host_mem_multi(struct nvme_dev *dev)
 {
 	int i;
@@ -2316,6 +2344,7 @@ static void nvme_free_host_mem_multi(struct nvme_dev *dev)
 	kfree(dev->host_mem_desc_bufs);
 	dev->host_mem_desc_bufs = NULL;
 }
+#endif
 
 static void nvme_free_host_mem(struct nvme_dev *dev)
 {
@@ -2358,6 +2387,104 @@ static int nvme_alloc_host_mem_single(struct nvme_dev *dev, u64 size)
 	return 0;
 }
 
+#if IS_ENABLED(CONFIG_ARM64)
+/*
+ * Build the host memory buffer from pages taken straight from the buddy
+ * allocator rather than via the DMA coherent allocator, so that the
+ * allocation avoids CMA on arm64 systems without an IOMMU.
+ */
+static int nvme_alloc_host_mem_multi(struct nvme_dev *dev, u64 preferred,
+		u32 chunk_size)
+{
+	struct nvme_host_mem_buf_desc *descs;
+	u32 max_entries, len, descs_size;
+	dma_addr_t descs_dma;
+	struct scatterlist *slist;
+	struct page *page;
+	int i = 0, mapped_nents;
+	u64 size, tmp;
+
+	tmp = (preferred + chunk_size - 1);
+	do_div(tmp, chunk_size);
+	max_entries = tmp;
+
+	if (dev->ctrl.hmmaxd && dev->ctrl.hmmaxd < max_entries)
+		max_entries = dev->ctrl.hmmaxd;
+
+	descs_size = max_entries * sizeof(*descs);
+	/*
+	 * Allocate the descriptor table from coherent memory -
+	 * usually occupies less than/up to a single page.
+	 */
+	descs = dma_alloc_coherent(dev->dev, descs_size, &descs_dma,
+			GFP_KERNEL);
+	if (!descs)
+		goto out;
+
+	slist = kcalloc(max_entries, sizeof(struct scatterlist), GFP_KERNEL);
+	if (!slist)
+		goto out_free_descs;
+
+	sg_init_table(slist, max_entries);
+
+	dev_dbg(dev->dev, "Allocating HMB pref = %llu max_entries = %u\n",
+		preferred, max_entries);
+
+	for (size = 0; size < preferred && i < max_entries; size += len) {
+		int order;
+
+		len = min_t(u64, chunk_size, preferred - size);
+		order = get_order(len);
+		page = alloc_pages(GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN, order);
+		if (!page)
+			break;
+		sg_set_page(&slist[i], page, len, 0);
+		i++;
+	}
+	/*
+	 * Partial allocation: release the pages already taken, not just
+	 * the table, and let the caller retry with a smaller chunk size.
+	 */
+	if (size < preferred)
+		goto out_free_pages;
+
+	/* Record the nents passed to dma_map_sg() for a matching unmap */
+	dev->nr_sgl_ents = i;
+	mapped_nents = dma_map_sg(dev->dev, slist, i, DMA_FROM_DEVICE);
+	if (mapped_nents <= 0)
+		goto out_free_pages;
+
+	/* Flush in case the CPU has cached any parts of the DMA buffers */
+	dma_sync_sg_for_device(dev->dev, slist, i, DMA_FROM_DEVICE);
+
+	i = dev->nr_host_mem_descs = mapped_nents;
+
+	while (--i >= 0) {
+		descs[i].addr = sg_dma_address(&slist[i]);
+		WARN_ON_ONCE(sg_dma_len(&slist[i]) & (NVME_CTRL_PAGE_SIZE - 1));
+		descs[i].size = sg_dma_len(&slist[i]) / NVME_CTRL_PAGE_SIZE;
+	}
+
+	dev->host_mem_size = size;
+	dev->host_mem_descs = descs;
+	dev->host_mem_descs_dma = descs_dma;
+	dev->host_mem_descs_size = descs_size;
+	dev->host_mem_sgl = slist;
+	return 0;
+
+out_free_pages:
+	/* i counts chunks actually allocated; safe whether or not mapped */
+	while (--i >= 0)
+		__free_pages(sg_page(&slist[i]), get_order(slist[i].length));
+	kfree(slist);
+out_free_descs:
+	dma_free_coherent(dev->dev, descs_size, descs, descs_dma);
+out:
+	dev->nr_sgl_ents = 0;
+	dev->host_mem_descs = NULL;
+	return -ENOMEM;
+}
+#else
 static int nvme_alloc_host_mem_multi(struct nvme_dev *dev, u64 preferred,
 		u32 chunk_size)
 {
@@ -2418,6 +2545,7 @@ static int nvme_alloc_host_mem_multi(struct nvme_dev *dev, u64 preferred,
 	dev->host_mem_descs = NULL;
 	return -ENOMEM;
 }
+#endif static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred) { From 9ec944584372805fb46306fe72866f8c4d83922e Mon Sep 17 00:00:00 2001 From: Jonathan Bell Date: Tue, 14 Apr 2026 15:53:53 +0100 Subject: [PATCH 2/2] DTS: set default nvme Host Memory Buffer size to 32MB on BCM2711/2 In https://github.com/raspberrypi/linux/issues/6504 the nvme HMB allocation was found to come from CMA so was restricted to zero on the basis of a superficial amount of testing showing no performance impact. This is not the case for high-density low-capacity DRAM-less drives, where random read can suffer up to 50% degradation. With the addition of commit 6686634750d3 ("nvme-pci: manually allocate Host Memory Buffer segments on arm64") this restriction is no longer necessary. Bump the default to 32MiB, which is a fair compromise for increasing random read performance on smaller drives that typically have less internal parallelism and request smaller HMBs. Signed-off-by: Jonathan Bell --- arch/arm/boot/dts/broadcom/bcm2711-rpi-ds.dtsi | 2 +- arch/arm64/boot/dts/broadcom/bcm2712-rpi.dtsi | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm/boot/dts/broadcom/bcm2711-rpi-ds.dtsi b/arch/arm/boot/dts/broadcom/bcm2711-rpi-ds.dtsi index 652062b5ba378..e498c634fb727 100644 --- a/arch/arm/boot/dts/broadcom/bcm2711-rpi-ds.dtsi +++ b/arch/arm/boot/dts/broadcom/bcm2711-rpi-ds.dtsi @@ -3,7 +3,7 @@ / { chosen { - bootargs = "coherent_pool=1M 8250.nr_uarts=1 snd_bcm2835.enable_headphones=0 cgroup_disable=memory numa_policy=interleave nvme.max_host_mem_size_mb=0"; + bootargs = "coherent_pool=1M 8250.nr_uarts=1 snd_bcm2835.enable_headphones=0 cgroup_disable=memory numa_policy=interleave nvme.max_host_mem_size_mb=32"; }; __overrides__ { diff --git a/arch/arm64/boot/dts/broadcom/bcm2712-rpi.dtsi b/arch/arm64/boot/dts/broadcom/bcm2712-rpi.dtsi index 2fa5eb16ae08a..b25e761dd23ff 100644 --- a/arch/arm64/boot/dts/broadcom/bcm2712-rpi.dtsi +++ 
b/arch/arm64/boot/dts/broadcom/bcm2712-rpi.dtsi @@ -113,7 +113,7 @@ watchdog: &pm {}; / { chosen: chosen { - bootargs = "reboot=w coherent_pool=1M 8250.nr_uarts=1 pci=pcie_bus_safe cgroup_disable=memory numa_policy=interleave nvme.max_host_mem_size_mb=0"; + bootargs = "reboot=w coherent_pool=1M 8250.nr_uarts=1 pci=pcie_bus_safe cgroup_disable=memory numa_policy=interleave nvme.max_host_mem_size_mb=32"; stdout-path = "serial10:115200n8"; };