From 3a98531c414df4a0454a4369c46355849e736233 Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Mon, 13 Apr 2026 14:08:30 +0800
Subject: [PATCH 1/8] enable xpu tests

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
---
 tests/test_functional.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test_functional.py b/tests/test_functional.py
index 1098c6087..0cd977831 100644
--- a/tests/test_functional.py
+++ b/tests/test_functional.py
@@ -101,10 +101,10 @@ class Test8BitBlockwiseQuantizeFunctional:
     def test_dynamic_blockwise_quantization(self, device, dtype, nested, blocksize, signed):
         iters = 100
 
-        if device != "cuda":
+        if device not in ["cuda", "xpu"]:
             iters = 10
 
-            # This test is slow in our non-CUDA implementations, so avoid atypical use cases.
+            # This test is slow in our non-CUDA/non-XPU implementations, so avoid atypical use cases.
             if nested:
                 pytest.skip("Not a typical use case.")
             if blocksize != 256:

From 5d695a56aea7b76ee49e31674228990c22911981 Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Mon, 13 Apr 2026 14:16:16 +0800
Subject: [PATCH 2/8] enable fsdp tests

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
---
 tests/fsdp_state_dict_save.py | 15 ++++++++++++---
 tests/test_linear4bit.py      |  6 +-----
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/tests/fsdp_state_dict_save.py b/tests/fsdp_state_dict_save.py
index 2e56c1c03..dd8bd7562 100644
--- a/tests/fsdp_state_dict_save.py
+++ b/tests/fsdp_state_dict_save.py
@@ -20,6 +20,14 @@
 import bitsandbytes as bnb
 
 
+def _get_device_and_backend():
+    """Auto-detect accelerator device and distributed backend."""
+    device_type = str(torch.accelerator.current_accelerator())
+    backend_map = {"cuda": "nccl", "xpu": "ccl"}
+    backend = backend_map.get(device_type, "gloo")
+    return device_type, backend
+
+
 class SimpleQLoRAModel(nn.Module):
     """Minimal model with a frozen 4-bit base layer and a trainable adapter."""
 
@@ -33,15 +41,16 @@ def forward(self, x):
 
 
 def main():
-    dist.init_process_group(backend="nccl")
+    device_type, backend = _get_device_and_backend()
+    dist.init_process_group(backend=backend)
     rank = dist.get_rank()
-    torch.cuda.set_device(rank)
+    torch.accelerator.set_device_index(rank)
 
     errors = []
 
     for quant_type in ("nf4", "fp4"):
         model = SimpleQLoRAModel(quant_type=quant_type)
-        model = model.to("cuda")
+        model = model.to(device_type)
 
         # Freeze quantized base weights (as in real QLoRA)
         for p in model.base.parameters():
diff --git a/tests/test_linear4bit.py b/tests/test_linear4bit.py
index d9a25c90e..930226e19 100644
--- a/tests/test_linear4bit.py
+++ b/tests/test_linear4bit.py
@@ -569,11 +569,7 @@ def test_params4bit_quant_state_attr_access(device, quant_type, compress_statist
     assert w.bnb_quantized is True
 
 
-@pytest.mark.skipif(not torch.cuda.is_available(), reason="FSDP requires CUDA")
-@pytest.mark.skipif(
-    not getattr(torch.distributed, "is_nccl_available", lambda: False)(),
-    reason="FSDP test requires NCCL backend",
-)
+@pytest.mark.skipif(not torch.accelerator.is_available(), reason="FSDP requires an accelerator device")
 def test_fsdp_state_dict_save_4bit():
     """Integration test: FSDP get_model_state_dict with cpu_offload on a 4-bit model (#1405).
 

From 7925a2ad93682ad7ee85d78c57ed5d3e22d8cc97 Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Mon, 13 Apr 2026 14:18:03 +0800
Subject: [PATCH 3/8] fix xpu distributed backend name (ccl -> xccl)

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
---
 tests/fsdp_state_dict_save.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/fsdp_state_dict_save.py b/tests/fsdp_state_dict_save.py
index dd8bd7562..2c6d26052 100644
--- a/tests/fsdp_state_dict_save.py
+++ b/tests/fsdp_state_dict_save.py
@@ -23,7 +23,7 @@
 def _get_device_and_backend():
     """Auto-detect accelerator device and distributed backend."""
     device_type = str(torch.accelerator.current_accelerator())
-    backend_map = {"cuda": "nccl", "xpu": "ccl"}
+    backend_map = {"cuda": "nccl", "xpu": "xccl"}
     backend = backend_map.get(device_type, "gloo")
     return device_type, backend
 

From dac9de0a5f2abd6f901b12a2b03fc1c266c04adb Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Tue, 14 Apr 2026 09:11:45 +0000
Subject: [PATCH 4/8] fall back to torch.xpu/torch.cuda when torch.accelerator is missing

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
---
 tests/fsdp_state_dict_save.py | 24 ++++++++++++++++++++++--
 tests/test_linear4bit.py      |  9 ++++++++-
 2 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/tests/fsdp_state_dict_save.py b/tests/fsdp_state_dict_save.py
index 2c6d26052..4a2e08c2b 100644
--- a/tests/fsdp_state_dict_save.py
+++ b/tests/fsdp_state_dict_save.py
@@ -20,9 +20,29 @@
 import bitsandbytes as bnb
 
 
+def _current_accelerator_type():  # -> device type str: reported accelerator, "xpu", "cuda" or "cpu"
+    if hasattr(torch, "accelerator") and torch.accelerator.is_available():  # attribute may be absent
+        return str(torch.accelerator.current_accelerator())  # string form of the reported accelerator
+    if hasattr(torch, "xpu") and torch.xpu.is_available():  # fallback probes when the branch above is skipped
+        return "xpu"
+    if torch.cuda.is_available():
+        return "cuda"
+    return "cpu"  # no accelerator found; caller's backend_map.get(...) then falls back to "gloo"
+
+
+def _set_device_index(index: int, device_type: str):  # bind this process to device `index`; no-op on cpu
+    if hasattr(torch, "accelerator") and torch.accelerator.is_available():  # set_device_index raises without one
+        torch.accelerator.set_device_index(index)
+        return
+    if device_type == "cuda":  # per-backend fallbacks for torch builds without a usable torch.accelerator
+        torch.cuda.set_device(index)
+    elif device_type == "xpu" and hasattr(torch, "xpu") and hasattr(torch.xpu, "set_device"):
+        torch.xpu.set_device(index)
+
+
 def _get_device_and_backend():
     """Auto-detect accelerator device and distributed backend."""
-    device_type = str(torch.accelerator.current_accelerator())
+    device_type = _current_accelerator_type()
     backend_map = {"cuda": "nccl", "xpu": "xccl"}
     backend = backend_map.get(device_type, "gloo")
     return device_type, backend
@@ -44,7 +64,7 @@ def main():
     device_type, backend = _get_device_and_backend()
     dist.init_process_group(backend=backend)
     rank = dist.get_rank()
-    torch.accelerator.set_device_index(rank)
+    _set_device_index(rank, device_type)
 
     errors = []
 
diff --git a/tests/test_linear4bit.py b/tests/test_linear4bit.py
index 930226e19..d96a0ebe1 100644
--- a/tests/test_linear4bit.py
+++ b/tests/test_linear4bit.py
@@ -569,7 +569,14 @@ def test_params4bit_quant_state_attr_access(device, quant_type, compress_statist
     assert w.bnb_quantized is True
 
 
-@pytest.mark.skipif(not torch.accelerator.is_available(), reason="FSDP requires an accelerator device")
+@pytest.mark.skipif(
+    not (
+        (hasattr(torch, "accelerator") and torch.accelerator.is_available())
+        or torch.cuda.is_available()
+        or (hasattr(torch, "xpu") and torch.xpu.is_available())
+    ),
+    reason="FSDP requires an accelerator device",
+)
 def test_fsdp_state_dict_save_4bit():
     """Integration test: FSDP get_model_state_dict with cpu_offload on a 4-bit model (#1405).
 

From a47ed349946a324585308504b615a2ad03a62ee7 Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Tue, 14 Apr 2026 10:10:12 +0000
Subject: [PATCH 5/8] skip fsdp if cpu

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
---
 tests/test_linear4bit.py | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/tests/test_linear4bit.py b/tests/test_linear4bit.py
index d96a0ebe1..73d19798a 100644
--- a/tests/test_linear4bit.py
+++ b/tests/test_linear4bit.py
@@ -569,14 +569,7 @@ def test_params4bit_quant_state_attr_access(device, quant_type, compress_statist
     assert w.bnb_quantized is True
 
 
-@pytest.mark.skipif(
-    not (
-        (hasattr(torch, "accelerator") and torch.accelerator.is_available())
-        or torch.cuda.is_available()
-        or (hasattr(torch, "xpu") and torch.xpu.is_available())
-    ),
-    reason="FSDP requires an accelerator device",
-)
+@pytest.mark.skipif(not get_available_devices(no_cpu=True), reason="FSDP requires an accelerator device")
 def test_fsdp_state_dict_save_4bit():
     """Integration test: FSDP get_model_state_dict with cpu_offload on a 4-bit model (#1405).
 

From 1c5c63b7307d4af4416777ceae02d0fdb76d13c6 Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Wed, 15 Apr 2026 09:17:59 +0000
Subject: [PATCH 6/8] fix windows skip

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
---
 tests/test_linear4bit.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_linear4bit.py b/tests/test_linear4bit.py
index 73d19798a..71b58a561 100644
--- a/tests/test_linear4bit.py
+++ b/tests/test_linear4bit.py
@@ -451,7 +451,7 @@ def test_linear4bit_torch_compile_activation_checkpointing(device, quant_type, c
     """
     if device == "hpu" and not is_supported_on_hpu(quant_type):
         pytest.skip("This configuration is not supported on HPU.")
-    if device == "cuda" and platform.system() == "Windows":
+    if platform.system() == "Windows":
         pytest.skip("Triton is not officially supported on Windows")
     dim = 256
     batch_size = 16

From b0597cd68322569acb9041fc952b716e70ee953b Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Thu, 16 Apr 2026 13:51:49 +0000
Subject: [PATCH 7/8] skip fsdp on Windows

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
---
 tests/test_linear4bit.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/test_linear4bit.py b/tests/test_linear4bit.py
index 71b58a561..f2335e5ea 100644
--- a/tests/test_linear4bit.py
+++ b/tests/test_linear4bit.py
@@ -451,7 +451,7 @@ def test_linear4bit_torch_compile_activation_checkpointing(device, quant_type, c
     """
     if device == "hpu" and not is_supported_on_hpu(quant_type):
         pytest.skip("This configuration is not supported on HPU.")
-    if platform.system() == "Windows":
+    if device == "cuda" and platform.system() == "Windows":
         pytest.skip("Triton is not officially supported on Windows")
     dim = 256
     batch_size = 16
@@ -569,6 +569,7 @@ def test_params4bit_quant_state_attr_access(device, quant_type, compress_statist
     assert w.bnb_quantized is True
 
 
+@pytest.mark.skipif(platform.system() == "Windows", reason="FSDP is not supported on Windows")
 @pytest.mark.skipif(not get_available_devices(no_cpu=True), reason="FSDP requires an accelerator device")
 def test_fsdp_state_dict_save_4bit():
     """Integration test: FSDP get_model_state_dict with cpu_offload on a 4-bit model (#1405).

From 7f88321852e72a8880fa5c6bef554cd8416041a5 Mon Sep 17 00:00:00 2001
From: jiqing-feng <jiqing.feng@intel.com>
Date: Thu, 16 Apr 2026 16:05:11 +0800
Subject: [PATCH 8/8] enable 8bit and fsdp tests for xpu

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
---
 tests/test_linear8bitlt.py |  7 ++++---
 tests/test_modules.py      | 14 ++++++++------
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/tests/test_linear8bitlt.py b/tests/test_linear8bitlt.py
index 410961e0b..a9d75b92e 100644
--- a/tests/test_linear8bitlt.py
+++ b/tests/test_linear8bitlt.py
@@ -172,8 +172,9 @@ def test_linear_serialization(
     assert torch.allclose(x_first.grad, x_third.grad, atol=1e-5)
 
 
-@pytest.fixture
-def linear8bit(requires_cuda):
+@pytest.fixture(params=get_available_devices(no_cpu=True))
+def linear8bit(request):
+    device = request.param
     linear = torch.nn.Linear(32, 96)
     linear_custom = Linear8bitLt(
         linear.in_features,
@@ -188,7 +189,7 @@ def linear8bit(requires_cuda):
         has_fp16_weights=False,
     )
     linear_custom.bias = linear.bias
-    linear_custom = linear_custom.cuda()
+    linear_custom = linear_custom.to(device)
     return linear_custom
 
 
diff --git a/tests/test_modules.py b/tests/test_modules.py
index 8c4d666d3..95f78b6d3 100644
--- a/tests/test_modules.py
+++ b/tests/test_modules.py
@@ -448,34 +448,36 @@ def test_4bit_embedding_warnings(device, caplog):
     assert any("inference" in msg for msg in caplog.messages)
 
 
-def test_4bit_embedding_weight_fsdp_fix(requires_cuda):
+@pytest.mark.parametrize("device", get_available_devices(no_cpu=True))
+def test_4bit_embedding_weight_fsdp_fix(device):
     num_embeddings = 64
     embedding_dim = 32
 
     module = bnb.nn.Embedding4bit(num_embeddings=num_embeddings, embedding_dim=embedding_dim)
 
-    module.cuda()
+    module.to(device)
 
     module.weight.quant_state = None
 
-    input_tokens = torch.randint(low=0, high=num_embeddings, size=(1,), device="cuda")
+    input_tokens = torch.randint(low=0, high=num_embeddings, size=(1,), device=device)
 
     module(input_tokens)
 
     assert module.weight.quant_state is not None
 
 
-def test_4bit_linear_weight_fsdp_fix(requires_cuda):
+@pytest.mark.parametrize("device", get_available_devices(no_cpu=True))
+def test_4bit_linear_weight_fsdp_fix(device):
     inp_size = 64
     out_size = 32
 
     module = bnb.nn.Linear4bit(inp_size, out_size)
 
-    module.cuda()
+    module.to(device)
 
     module.weight.quant_state = None
 
-    input_tensor = torch.randn((1, inp_size), device="cuda")
+    input_tensor = torch.randn((1, inp_size), device=device)
 
     module(input_tensor)