From 3a98531c414df4a0454a4369c46355849e736233 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Mon, 13 Apr 2026 14:08:30 +0800 Subject: [PATCH 1/8] enable xpu tests Signed-off-by: jiqing-feng --- tests/test_functional.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_functional.py b/tests/test_functional.py index 1098c6087..0cd977831 100644 --- a/tests/test_functional.py +++ b/tests/test_functional.py @@ -101,10 +101,10 @@ class Test8BitBlockwiseQuantizeFunctional: def test_dynamic_blockwise_quantization(self, device, dtype, nested, blocksize, signed): iters = 100 - if device != "cuda": + if device not in ["cuda", "xpu"]: iters = 10 - # This test is slow in our non-CUDA implementations, so avoid atypical use cases. + # This test is slow in our non-cuda/non-xpu implementations, so avoid atypical use cases. if nested: pytest.skip("Not a typical use case.") if blocksize != 256: From 5d695a56aea7b76ee49e31674228990c22911981 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Mon, 13 Apr 2026 14:16:16 +0800 Subject: [PATCH 2/8] enable fsdp tests Signed-off-by: jiqing-feng --- tests/fsdp_state_dict_save.py | 15 ++++++++++++--- tests/test_linear4bit.py | 6 +----- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/tests/fsdp_state_dict_save.py b/tests/fsdp_state_dict_save.py index 2e56c1c03..dd8bd7562 100644 --- a/tests/fsdp_state_dict_save.py +++ b/tests/fsdp_state_dict_save.py @@ -20,6 +20,14 @@ import bitsandbytes as bnb +def _get_device_and_backend(): + """Auto-detect accelerator device and distributed backend.""" + device_type = str(torch.accelerator.current_accelerator()) + backend_map = {"cuda": "nccl", "xpu": "ccl"} + backend = backend_map.get(device_type, "gloo") + return device_type, backend + + class SimpleQLoRAModel(nn.Module): """Minimal model with a frozen 4-bit base layer and a trainable adapter.""" @@ -33,15 +41,16 @@ def forward(self, x): def main(): - dist.init_process_group(backend="nccl") + device_type, backend = _get_device_and_backend() + dist.init_process_group(backend=backend) rank = dist.get_rank() - torch.cuda.set_device(rank) + torch.accelerator.set_device_index(rank) errors = [] for quant_type in ("nf4", "fp4"): model = SimpleQLoRAModel(quant_type=quant_type) - model = model.to("cuda") + model = model.to(device_type) # Freeze quantized base weights (as in real QLoRA) for p in model.base.parameters(): diff --git a/tests/test_linear4bit.py b/tests/test_linear4bit.py index d9a25c90e..930226e19 100644 --- a/tests/test_linear4bit.py +++ b/tests/test_linear4bit.py @@ -569,11 +569,7 @@ def test_params4bit_quant_state_attr_access(device, quant_type, compress_statist assert w.bnb_quantized is True -@pytest.mark.skipif(not torch.cuda.is_available(), reason="FSDP requires CUDA") -@pytest.mark.skipif( - not getattr(torch.distributed, "is_nccl_available", lambda: False)(), - reason="FSDP test requires NCCL backend", -) +@pytest.mark.skipif(not torch.accelerator.is_available(), reason="FSDP requires an accelerator device") def test_fsdp_state_dict_save_4bit(): """Integration test: FSDP get_model_state_dict with cpu_offload on a 4-bit model (#1405). From 7925a2ad93682ad7ee85d78c57ed5d3e22d8cc97 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Mon, 13 Apr 2026 14:18:03 +0800 Subject: [PATCH 3/8] fix backend Signed-off-by: jiqing-feng --- tests/fsdp_state_dict_save.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/fsdp_state_dict_save.py b/tests/fsdp_state_dict_save.py index dd8bd7562..2c6d26052 100644 --- a/tests/fsdp_state_dict_save.py +++ b/tests/fsdp_state_dict_save.py @@ -23,7 +23,7 @@ def _get_device_and_backend(): """Auto-detect accelerator device and distributed backend.""" device_type = str(torch.accelerator.current_accelerator()) - backend_map = {"cuda": "nccl", "xpu": "ccl"} + backend_map = {"cuda": "nccl", "xpu": "xccl"} backend = backend_map.get(device_type, "gloo") return device_type, backend From dac9de0a5f2abd6f901b12a2b03fc1c266c04adb Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Tue, 14 Apr 2026 09:11:45 +0000 Subject: [PATCH 4/8] fix torch version accelerator Signed-off-by: jiqing-feng --- tests/fsdp_state_dict_save.py | 24 ++++++++++++++++++++++-- tests/test_linear4bit.py | 9 ++++++++- 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/tests/fsdp_state_dict_save.py b/tests/fsdp_state_dict_save.py index 2c6d26052..4a2e08c2b 100644 --- a/tests/fsdp_state_dict_save.py +++ b/tests/fsdp_state_dict_save.py @@ -20,9 +20,29 @@ import bitsandbytes as bnb +def _current_accelerator_type(): + if hasattr(torch, "accelerator") and torch.accelerator.is_available(): + return str(torch.accelerator.current_accelerator()) + if hasattr(torch, "xpu") and torch.xpu.is_available(): + return "xpu" + if torch.cuda.is_available(): + return "cuda" + return "cpu" + + +def _set_device_index(index: int, device_type: str): + if hasattr(torch, "accelerator"): + torch.accelerator.set_device_index(index) + return + if device_type == "cuda": + torch.cuda.set_device(index) + elif device_type == "xpu" and hasattr(torch, "xpu") and hasattr(torch.xpu, "set_device"): + torch.xpu.set_device(index) + + def _get_device_and_backend(): """Auto-detect accelerator device and distributed backend.""" - device_type = str(torch.accelerator.current_accelerator()) + device_type = _current_accelerator_type() backend_map = {"cuda": "nccl", "xpu": "xccl"} backend = backend_map.get(device_type, "gloo") return device_type, backend @@ -44,7 +64,7 @@ def main(): device_type, backend = _get_device_and_backend() dist.init_process_group(backend=backend) rank = dist.get_rank() - torch.accelerator.set_device_index(rank) + _set_device_index(rank, device_type) errors = [] diff --git a/tests/test_linear4bit.py b/tests/test_linear4bit.py index 930226e19..d96a0ebe1 100644 --- a/tests/test_linear4bit.py +++ b/tests/test_linear4bit.py @@ -569,7 +569,14 @@ def test_params4bit_quant_state_attr_access(device, quant_type, compress_statist assert w.bnb_quantized is True -@pytest.mark.skipif(not torch.accelerator.is_available(), reason="FSDP requires an accelerator device") +@pytest.mark.skipif( + not ( + (hasattr(torch, "accelerator") and torch.accelerator.is_available()) + or torch.cuda.is_available() + or (hasattr(torch, "xpu") and torch.xpu.is_available()) + ), + reason="FSDP requires an accelerator device", +) def test_fsdp_state_dict_save_4bit(): """Integration test: FSDP get_model_state_dict with cpu_offload on a 4-bit model (#1405). From a47ed349946a324585308504b615a2ad03a62ee7 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Tue, 14 Apr 2026 10:10:12 +0000 Subject: [PATCH 5/8] skip fsdp if cpu Signed-off-by: jiqing-feng --- tests/test_linear4bit.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/tests/test_linear4bit.py b/tests/test_linear4bit.py index d96a0ebe1..73d19798a 100644 --- a/tests/test_linear4bit.py +++ b/tests/test_linear4bit.py @@ -569,14 +569,7 @@ def test_params4bit_quant_state_attr_access(device, quant_type, compress_statist assert w.bnb_quantized is True -@pytest.mark.skipif( - not ( - (hasattr(torch, "accelerator") and torch.accelerator.is_available()) - or torch.cuda.is_available() - or (hasattr(torch, "xpu") and torch.xpu.is_available()) - ), - reason="FSDP requires an accelerator device", -) +@pytest.mark.skipif(not get_available_devices(no_cpu=True), reason="FSDP requires an accelerator device") def test_fsdp_state_dict_save_4bit(): """Integration test: FSDP get_model_state_dict with cpu_offload on a 4-bit model (#1405). From 1c5c63b7307d4af4416777ceae02d0fdb76d13c6 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Wed, 15 Apr 2026 09:17:59 +0000 Subject: [PATCH 6/8] fix windows skip Signed-off-by: jiqing-feng --- tests/test_linear4bit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_linear4bit.py b/tests/test_linear4bit.py index 73d19798a..71b58a561 100644 --- a/tests/test_linear4bit.py +++ b/tests/test_linear4bit.py @@ -451,7 +451,7 @@ def test_linear4bit_torch_compile_activation_checkpointing(device, quant_type, c """ if device == "hpu" and not is_supported_on_hpu(quant_type): pytest.skip("This configuration is not supported on HPU.") - if device == "cuda" and platform.system() == "Windows": + if platform.system() == "Windows": pytest.skip("Triton is not officially supported on Windows") dim = 256 batch_size = 16 From b0597cd68322569acb9041fc952b716e70ee953b Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Thu, 16 Apr 2026 13:51:49 +0000 Subject: [PATCH 7/8] skip fsdp on Windows Signed-off-by: jiqing-feng --- tests/test_linear4bit.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_linear4bit.py b/tests/test_linear4bit.py index 71b58a561..f2335e5ea 100644 --- a/tests/test_linear4bit.py +++ b/tests/test_linear4bit.py @@ -451,7 +451,7 @@ def test_linear4bit_torch_compile_activation_checkpointing(device, quant_type, c """ if device == "hpu" and not is_supported_on_hpu(quant_type): pytest.skip("This configuration is not supported on HPU.") - if platform.system() == "Windows": + if device == "cuda" and platform.system() == "Windows": pytest.skip("Triton is not officially supported on Windows") dim = 256 batch_size = 16 @@ -569,6 +569,7 @@ def test_params4bit_quant_state_attr_access(device, quant_type, compress_statist assert w.bnb_quantized is True +@pytest.mark.skipif(platform.system() == "Windows", reason="FSDP is not supported on Windows") @pytest.mark.skipif(not get_available_devices(no_cpu=True), reason="FSDP requires an accelerator device") def test_fsdp_state_dict_save_4bit(): """Integration test: FSDP get_model_state_dict with cpu_offload on a 4-bit model (#1405). From 7f88321852e72a8880fa5c6bef554cd8416041a5 Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Thu, 16 Apr 2026 16:05:11 +0800 Subject: [PATCH 8/8] enable 8bit and fsdp tests for xpu Signed-off-by: jiqing-feng --- tests/test_linear8bitlt.py | 7 ++++--- tests/test_modules.py | 14 ++++++++------ 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/tests/test_linear8bitlt.py b/tests/test_linear8bitlt.py index 410961e0b..a9d75b92e 100644 --- a/tests/test_linear8bitlt.py +++ b/tests/test_linear8bitlt.py @@ -172,8 +172,9 @@ def test_linear_serialization( assert torch.allclose(x_first.grad, x_third.grad, atol=1e-5) -@pytest.fixture -def linear8bit(requires_cuda): +@pytest.fixture(params=get_available_devices(no_cpu=True)) +def linear8bit(request): + device = request.param linear = torch.nn.Linear(32, 96) linear_custom = Linear8bitLt( linear.in_features, @@ -188,7 +189,7 @@ def linear8bit(requires_cuda): has_fp16_weights=False, ) linear_custom.bias = linear.bias - linear_custom = linear_custom.cuda() + linear_custom = linear_custom.to(device) return linear_custom diff --git a/tests/test_modules.py b/tests/test_modules.py index 8c4d666d3..95f78b6d3 100644 --- a/tests/test_modules.py +++ b/tests/test_modules.py @@ -448,34 +448,36 @@ def test_4bit_embedding_warnings(device, caplog): assert any("inference" in msg for msg in caplog.messages) -def test_4bit_embedding_weight_fsdp_fix(requires_cuda): +@pytest.mark.parametrize("device", get_available_devices(no_cpu=True)) +def test_4bit_embedding_weight_fsdp_fix(device): num_embeddings = 64 embedding_dim = 32 module = bnb.nn.Embedding4bit(num_embeddings=num_embeddings, embedding_dim=embedding_dim) - module.cuda() + module.to(device) module.weight.quant_state = None - input_tokens = torch.randint(low=0, high=num_embeddings, size=(1,), device="cuda") + input_tokens = torch.randint(low=0, high=num_embeddings, size=(1,), device=device) module(input_tokens) assert module.weight.quant_state is not None -def test_4bit_linear_weight_fsdp_fix(requires_cuda): +@pytest.mark.parametrize("device", get_available_devices(no_cpu=True)) +def test_4bit_linear_weight_fsdp_fix(device): inp_size = 64 out_size = 32 module = bnb.nn.Linear4bit(inp_size, out_size) - module.cuda() + module.to(device) module.weight.quant_state = None - input_tensor = torch.randn((1, inp_size), device="cuda") + input_tensor = torch.randn((1, inp_size), device=device) module(input_tensor)