From 314d9ddabd9fb812b5fed8b956833775901fa478 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Tue, 16 Jun 2026 00:11:52 +0800
Subject: [PATCH 01/10] feat(pt_expt): DPA4 model freeze to .pt2 (model-type
 alias + nloc export fix)

---
 .../dpmodel/descriptor/dpa4_nn/attention.py   |   9 +-
 .../dpmodel/descriptor/dpa4_nn/embedding.py   |  21 ++-
 deepmd/dpmodel/descriptor/dpa4_nn/so2.py      |   8 +-
 deepmd/dpmodel/model/ener_model.py            |   2 +
 deepmd/pt_expt/model/ener_model.py            |   2 +
 .../tests/pt_expt/model/test_dpa4_export.py   | 148 ++++++++++++++++++
 6 files changed, 181 insertions(+), 9 deletions(-)
 create mode 100644 source/tests/pt_expt/model/test_dpa4_export.py

diff --git a/deepmd/dpmodel/descriptor/dpa4_nn/attention.py b/deepmd/dpmodel/descriptor/dpa4_nn/attention.py
index c27b6174b8..4b7eeb24db 100644
--- a/deepmd/dpmodel/descriptor/dpa4_nn/attention.py
+++ b/deepmd/dpmodel/descriptor/dpa4_nn/attention.py
@@ -88,12 +88,17 @@ def segment_envelope_gated_softmax(
     n_edge, n_focus, n_head = logits.shape
     n_channel = n_focus * n_head
     eps_f = float(eps)
-    if n_nodes <= 0 or n_edge % int(n_nodes) != 0:
+    # Keep ``n_nodes`` symbolic (no ``int()``): it is the product ``nf*nloc``,
+    # and casting to a Python int specializes it to the trace-time sample
+    # shape, which breaks torch.export with a dynamic ``nloc`` dim. The
+    # ``Mod`` check below stays statically known (``E == n_nodes*nnei``) and
+    # the ``(n_nodes, nnei, ...)`` reshapes recover the layout symbolically.
+    if n_nodes <= 0 or n_edge % n_nodes != 0:
         raise ValueError(
             "padded-edge layout requires E to be a multiple of n_nodes; "
             f"got E={n_edge}, n_nodes={n_nodes}"
         )
-    nnei = n_edge // int(n_nodes)
+    nnei = n_edge // n_nodes
     device = array_api_compat.device(logits)
 
     # === Step 1. Flatten (F, H) and build the effective per-edge weight ===
diff --git a/deepmd/dpmodel/descriptor/dpa4_nn/embedding.py b/deepmd/dpmodel/descriptor/dpa4_nn/embedding.py
index 73ac77456d..7cfec7f2bd 100644
--- a/deepmd/dpmodel/descriptor/dpa4_nn/embedding.py
+++ b/deepmd/dpmodel/descriptor/dpa4_nn/embedding.py
@@ -290,8 +290,14 @@ def call(
             return xp.zeros(
                 (n_nodes, self.ebed_dim, self.channels), dtype=dtype, device=device
             )
-        n_edge = int(edge_cache.dst.shape[0])
-        nnei = _edge_layout(n_edge, int(n_nodes))
+        # Keep ``n_edge``/``n_nodes`` symbolic (no ``int()``): they are the
+        # products ``nf*nloc*nnei`` / ``nf*nloc``. Casting to a Python int
+        # specializes them to the trace-time sample shape (e.g. nf*nloc==14),
+        # which breaks torch.export with a dynamic ``nloc`` dim. ``_edge_layout``
+        # returns a symbolic ``nnei`` and the masked-sum reshapes below use
+        # ``-1`` for the node axis to recover it symbolically.
+        n_edge = edge_cache.dst.shape[0]
+        nnei = _edge_layout(n_edge, n_nodes)
 
         # === Step 2. Gather all m=0 columns (l >= 1) in one shot ===
         # pt embedding.py:235-241 pairs one packed non-scalar row with the
@@ -345,7 +351,7 @@ def call(
         non_scalar_out = xp.sum(
             xp.reshape(
                 non_scalar_message,
-                (n_nodes, nnei, self.ebed_dim - 1, self.channels),
+                (-1, nnei, self.ebed_dim - 1, self.channels),
             ),
             axis=1,
         )  # (N, D-1, C)
@@ -592,8 +598,11 @@ def call(
         edge_vec = edge_cache.edge_vec  # (E, 3)
         edge_rbf = edge_cache.edge_rbf  # (E, n_radial)
         edge_env = edge_cache.edge_env  # (E, 1)
-        n_edge = int(dst.shape[0])
-        nnei = _edge_layout(n_edge, int(n_nodes))
+        # Keep ``n_edge``/``n_nodes`` symbolic (no ``int()``); see the matching
+        # comment in ``GeometricInitialEmbedding.call`` for why casting to a
+        # Python int breaks torch.export with a dynamic ``nloc`` dim.
+        n_edge = dst.shape[0]
+        nnei = _edge_layout(n_edge, n_nodes)
 
         # === Step 1. Construct r_tilde = [s, s*r_hat] ===
         # s = edge_env * (1/r), r_hat = edge_vec / r (pt embedding.py:489-495)
@@ -641,7 +650,7 @@ def call(
                 xp.reshape(edge_mask, (n_edge, 1)), outer_flat.dtype
             )
         env_agg = xp.sum(
-            xp.reshape(outer_flat, (n_nodes, nnei, 4 * self.embed_dim)),
+            xp.reshape(outer_flat, (-1, nnei, 4 * self.embed_dim)),
             axis=1,
         )  # (N, 4*embed_dim)
         env_agg = xp.reshape(env_agg, (n_nodes, 4, self.embed_dim))
diff --git a/deepmd/dpmodel/descriptor/dpa4_nn/so2.py b/deepmd/dpmodel/descriptor/dpa4_nn/so2.py
index ac4f7d298b..1414956b13 100644
--- a/deepmd/dpmodel/descriptor/dpa4_nn/so2.py
+++ b/deepmd/dpmodel/descriptor/dpa4_nn/so2.py
@@ -1183,7 +1183,13 @@ def call(
         device = array_api_compat.device(x)
         src, dst = edge_cache.src, edge_cache.dst
         n_node = x.shape[0]
-        n_edge = int(src.shape[0])
+        # Keep ``n_edge``/``n_node`` symbolic (no ``int()``): they are the
+        # products ``nf*nloc*nnei`` / ``nf*nloc``. Casting to a Python int
+        # specializes them to the trace-time sample shape (breaking
+        # torch.export with a dynamic ``nloc`` dim); the ``Mod`` check stays
+        # statically known and the ``(n_node, nnei, ...)`` reshape below
+        # recovers the layout symbolically.
+        n_edge = src.shape[0]
         if n_node <= 0 or n_edge % n_node != 0:
             raise ValueError(
                 "padded-edge layout requires E to be a multiple of N; "
diff --git a/deepmd/dpmodel/model/ener_model.py b/deepmd/dpmodel/model/ener_model.py
index a8280dbebf..09b50a6f17 100644
--- a/deepmd/dpmodel/model/ener_model.py
+++ b/deepmd/dpmodel/model/ener_model.py
@@ -36,6 +36,8 @@
 
 
 @BaseModel.register("ener")
+@BaseModel.register("sezm_ener")
+@BaseModel.register("dpa4_ener")
 class EnergyModel(DPModelCommon, DPEnergyModel_):
     r"""Energy model that predicts total energy and derived quantities.
 
diff --git a/deepmd/pt_expt/model/ener_model.py b/deepmd/pt_expt/model/ener_model.py
index 4f868043b6..6347382135 100644
--- a/deepmd/pt_expt/model/ener_model.py
+++ b/deepmd/pt_expt/model/ener_model.py
@@ -33,6 +33,8 @@
 
 
 @BaseModel.register("ener")
+@BaseModel.register("sezm_ener")
+@BaseModel.register("dpa4_ener")
 class EnergyModel(DPModelCommon, DPEnergyModel_):
     def __init__(
         self,
diff --git a/source/tests/pt_expt/model/test_dpa4_export.py b/source/tests/pt_expt/model/test_dpa4_export.py
new file mode 100644
index 0000000000..a82ee9dc4c
--- /dev/null
+++ b/source/tests/pt_expt/model/test_dpa4_export.py
@@ -0,0 +1,148 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Model-level freeze test for the DPA4/SeZM energy model.
+
+Mirrors the DPA3 ``test_export_with_comm`` round-trip: a DPA4 model is a
+GNN (``has_message_passing_across_ranks() == True``), so
+``deserialize_to_file`` produces a .pt2 archive containing TWO compiled
+artifacts:
+  * the regular ``forward_lower`` (no comm), packed at the top of the ZIP;
+  * a ``forward_lower_with_comm`` variant nested at
+    ``model/extra/forward_lower_with_comm.pt2``.
+
+This test verifies:
+  1. The .pt2 archive is produced and both artifacts are present.
+  2. ``metadata.json`` carries the correct ``type_map``/``rcut`` and
+     ``has_message_passing: true`` (DPA4 is a message-passing descriptor).
+  3. The regular artifact loads via ``aoti_load_package``.
+  4. The loaded artifact's ``forward_common_lower`` output matches the
+     eager model (fp64 AOTI parity, rtol/atol 1e-10).
+"""
+
+from __future__ import (
+    annotations,
+)
+
+import json
+import os
+import zipfile
+
+import numpy as np
+import pytest
+
+# Trigger registration of the deepmd_export::border_op opaque wrapper
+# (needed by the with-comm artifact at runtime / load time).
+import deepmd.pt_expt.utils.comm  # noqa: F401  # lgtm[py/unused-import]
+from deepmd.pt_expt.model.get_model import (
+    get_model,
+)
+from deepmd.pt_expt.utils.serialization import (
+    _make_sample_inputs,
+    deserialize_to_file,
+)
+
+# Small fp64 DPA4 config (channels 16, n_radial 8, lmax 2, mmax 1,
+# n_blocks 2) — large enough to exercise the SO(2)/SO(3) + attention +
+# embedding paths that previously specialized ``nloc`` during export, but
+# small enough to keep the AOTInductor compile time bounded.
+_DPA4_CONFIG = {
+    "type": "dpa4",
+    "type_map": ["O", "H"],
+    "descriptor": {
+        "type": "dpa4",
+        "sel": 20,
+        "rcut": 4.0,
+        "channels": 16,
+        "n_radial": 8,
+        "lmax": 2,
+        "mmax": 1,
+        "n_blocks": 2,
+        "precision": "float64",
+        "seed": 1,
+    },
+    "fitting_net": {
+        "type": "dpa4_ener",
+        "neuron": [16],
+        "precision": "float64",
+        "seed": 1,
+    },
+}
+
+
+@pytest.mark.skipif(
+    os.environ.get("CI") == "true",
+    reason="AOTInductor compile is slow (minutes); run locally only by default.",
+)
+def test_dpa4_freeze_to_pt2(tmp_path) -> None:
+    """End-to-end: DPA4 model freezes to a dual-artifact .pt2 and the
+    regular artifact reproduces the eager ``forward_common_lower``.
+    """
+    model = get_model(_DPA4_CONFIG)
+    model.to("cpu")
+    model.eval()
+
+    # 1. Serialize → deserialize_to_file (compiles and packs both artifacts).
+    pt2_path = str(tmp_path / "test_dpa4.pt2")
+    deserialize_to_file(pt2_path, {"model": model.serialize()})
+    assert os.path.exists(pt2_path)
+
+    # 2. ZIP layout + metadata sanity. PyTorch's strict layout puts our
+    #    sidecars under ``model/extra/`` (PT2_EXTRA_PREFIX).
+    with zipfile.ZipFile(pt2_path, "r") as zf:
+        names = set(zf.namelist())
+        meta = json.loads(zf.read("model/extra/metadata.json").decode("utf-8"))
+        assert "model/extra/forward_lower_with_comm.pt2" in names, (
+            f"with-comm artifact missing; names={sorted(names)}"
+        )
+    assert meta["type_map"] == _DPA4_CONFIG["type_map"]
+    assert meta["rcut"] == model.get_rcut()
+    # DPA4 is a message-passing GNN descriptor.
+    assert meta["has_message_passing"] is True
+    assert meta["has_comm_artifact"] is True
+
+    # 3. The regular artifact loads.
+    from torch._inductor import (
+        aoti_load_package,
+    )
+
+    regular = aoti_load_package(pt2_path)
+
+    # 4. Eager reference vs. AOTI artifact parity on forward_common_lower.
+    sample = _make_sample_inputs(model, nframes=1, has_spin=False)
+    ext_coord, ext_atype, nlist_t, mapping_t, fparam, aparam, charge_spin = sample
+
+    eager_out = model.forward_common_lower(
+        ext_coord.detach().requires_grad_(True),
+        ext_atype,
+        nlist_t,
+        mapping_t,
+        fparam=fparam,
+        aparam=aparam,
+        do_atomic_virial=False,
+        charge_spin=charge_spin,
+    )
+
+    artifact_out = regular(
+        ext_coord, ext_atype, nlist_t, mapping_t, fparam, aparam, charge_spin
+    )
+
+    # The AOTI artifact returns the internal forward_common_lower keys; compare
+    # every key it produces against the eager reference (fp64 AOTI tolerance).
+    compared = 0
+    for key, val in artifact_out.items():
+        if key not in eager_out or eager_out[key] is None or val is None:
+            continue
+        np.testing.assert_allclose(
+            val.detach().cpu().numpy(),
+            eager_out[key].detach().cpu().numpy(),
+            rtol=1e-10,
+            atol=1e-10,
+            err_msg=f"artifact vs eager forward_common_lower differs: {key}",
+        )
+        compared += 1
+    # Guard against a vacuous pass (no overlapping keys compared).
+    assert compared > 0, (
+        f"no overlapping output keys compared; artifact keys="
+        f"{sorted(artifact_out)}, eager keys={sorted(eager_out)}"
+    )
+    # The artifact must also expose an energy output key.
+    assert "energy_redu" in artifact_out or "energy" in artifact_out

From c5f606b14cedc24e2850a1c300940076afa68044 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Tue, 16 Jun 2026 00:23:57 +0800
Subject: [PATCH 02/10] test(pt_expt): guard DPA4 model-type alias deserialize

---
 .../pt_expt/model/test_get_model_dpa4.py      | 39 +++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/source/tests/pt_expt/model/test_get_model_dpa4.py b/source/tests/pt_expt/model/test_get_model_dpa4.py
index f550d8fab2..de497876ab 100644
--- a/source/tests/pt_expt/model/test_get_model_dpa4.py
+++ b/source/tests/pt_expt/model/test_get_model_dpa4.py
@@ -99,6 +99,45 @@ def test_get_model_type_aliases(self) -> None:
             model = get_model(model_params)
             self.assertIsInstance(model, EnergyModel, msg=f"alias={alias}")
 
+    def test_serialize_deserialize_alias(self) -> None:
+        """Round-trip locks the sezm_ener/dpa4_ener -> EnergyModel alias.
+
+        Fast (no-AOTI) regression guard: the model-type alias is otherwise
+        only exercised by the CI-skipped AOTI freeze test.
+        """
+        from deepmd.pt_expt.model.model import (
+            BaseModel,
+        )
+
+        model = get_model(_make_raw_model_config()).to(self.device)
+        data = model.serialize()
+        # serialized layout: top-level "standard", fitting "sezm_ener"
+        self.assertEqual(data["type"], "standard")
+        self.assertEqual(data["fitting"]["type"], "sezm_ener")
+        self.assertEqual(
+            model.atomic_model.fitting_net.serialize()["type"], "sezm_ener"
+        )
+        # the alias resolution must not raise and must rebuild an EnergyModel
+        model2 = BaseModel.deserialize(model.serialize())
+        self.assertIsInstance(model2, EnergyModel)
+        model2 = model2.to(self.device)
+        # forward-smoke the deserialized model to prove the round-trip works
+        generator = torch.Generator(device=self.device).manual_seed(1)
+        cell = 5.0 * torch.eye(3, dtype=torch.float64, device=self.device)
+        coord = (
+            torch.rand(
+                [1, 5, 3],
+                dtype=torch.float64,
+                device=self.device,
+                generator=generator,
+            )
+            @ cell
+        ).requires_grad_(True)
+        atype = torch.tensor([[0, 0, 0, 1, 1]], dtype=torch.int64, device=self.device)
+        ret0 = model(coord, atype, cell.reshape(1, 9))
+        ret = model2(coord, atype, cell.reshape(1, 9))
+        self.assertEqual(ret["energy"].shape, ret0["energy"].shape)
+
     def test_descriptor_fitting_type_defaults(self) -> None:
         """Descriptor/fitting type keys default to dpa4/dpa4_ener when absent."""
         raw = _make_raw_model_config()

From aa77ffef26d42ff965d2dd36b7caf924966fe205 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Tue, 16 Jun 2026 01:29:08 +0800
Subject: [PATCH 03/10] fix(pt_expt): DPA4 nopbc export (avoid Ne(nall,nloc)
 guard) + pt checkpoint interop + DeepEval parity test

---
 deepmd/dpmodel/descriptor/dpa4.py             |   7 +-
 deepmd/pt_expt/model/model.py                 |  84 +++++++-
 .../pt_expt/infer/test_dpa4_deep_eval.py      | 198 ++++++++++++++++++
 3 files changed, 287 insertions(+), 2 deletions(-)
 create mode 100644 source/tests/pt_expt/infer/test_dpa4_deep_eval.py

diff --git a/deepmd/dpmodel/descriptor/dpa4.py b/deepmd/dpmodel/descriptor/dpa4.py
index 338ee3353e..1c00561bdf 100644
--- a/deepmd/dpmodel/descriptor/dpa4.py
+++ b/deepmd/dpmodel/descriptor/dpa4.py
@@ -41,6 +41,7 @@
 )
 from deepmd.dpmodel.array_api import (
     xp_asarray_nodetach,
+    xp_take_first_n,
 )
 from deepmd.dpmodel.common import (
     PRECISION_DICT,
@@ -811,7 +812,11 @@ def call(
         pair_keep_mask = self.emask.build_type_exclude_mask(nlist, atype_ext) != 0
 
         # === Step 2. Type embedding (l=0) ===
-        atype_loc = atype_ext[:, :nloc]
+        # Use ``xp_take_first_n`` (torch.index_select) rather than a plain
+        # ``[:, :nloc]`` slice: the slice makes torch.export emit a spurious
+        # ``Ne(nall, nloc)`` contiguity guard that breaks the ``nall == nloc``
+        # (NoPBC, no ghost atoms) case in the compiled .pt2 artifact.
+        atype_loc = xp_take_first_n(atype_ext, 1, nloc)
         type_ebed = xp.reshape(
             self.type_embedding(atype_loc), (n_nodes, self.channels)
         )  # (N, C)
diff --git a/deepmd/pt_expt/model/model.py b/deepmd/pt_expt/model/model.py
index 83842eaabd..3721e38362 100644
--- a/deepmd/pt_expt/model/model.py
+++ b/deepmd/pt_expt/model/model.py
@@ -1,4 +1,8 @@
 # SPDX-License-Identifier: LGPL-3.0-or-later
+from typing import (
+    Any,
+)
+
 from deepmd.dpmodel.model.base_model import (
     make_base_model,
 )
@@ -16,4 +20,82 @@ class BaseModel(make_base_model()):
         Backend-independent BaseModel class.
     """
 
-    pass
+    # The pt backend's ``SeZMModel`` (model_type "SeZM", aliases dpa4/sezm)
+    # serialises with a *model-level wrapper*: ``{type: "SeZM",
+    # atomic_model: <sezm_atomic dict>, bridging_method, bridging_r_*, lora}``,
+    # and its atomic model uses ``type: "sezm_atomic"`` carrying pt-only
+    # extras (``dens_fitting``/``active_mode`` plus a ``dens_force_rmsd``
+    # @variable).  pt_expt builds the equivalent DPA4 model via the generic
+    # ``make_model`` path, whose ``serialize()`` emits the standard atomic
+    # dict directly (``type: "standard"``).  To load a pt-trained checkpoint
+    # into pt_expt (the serialization-compat / checkpoint-interop
+    # requirement), recognise the wrapper, reject the pt-only features pt_expt
+    # does not implement (when they are non-default), strip the rest, and
+    # delegate to the standard path.  The nested descriptor/fitting dicts are
+    # already backend-agnostic dpmodel serializations and pass through intact.
+    _SEZM_MODEL_TYPES = frozenset({"sezm", "dpa4"})
+    _SEZM_ATOMIC_TYPES = frozenset({"sezm_atomic"})
+
+    @classmethod
+    def deserialize(cls, data: dict[str, Any]) -> "BaseModel":
+        model_type = str(data.get("type", "standard"))
+        if model_type.lower() in cls._SEZM_MODEL_TYPES:
+            return cls.deserialize(cls._unwrap_pt_sezm_model(data))
+        if model_type.lower() in cls._SEZM_ATOMIC_TYPES:
+            return cls.deserialize(cls._normalize_pt_sezm_atomic(data))
+        return super().deserialize(data)
+
+    @staticmethod
+    def _unwrap_pt_sezm_model(data: dict[str, Any]) -> dict[str, Any]:
+        """Unwrap pt's ``SeZMModel`` serialization to the inner atomic dict."""
+        data = data.copy()
+        bridging_method = str(data.get("bridging_method", "none")).lower()
+        if bridging_method not in ("none", ""):
+            raise NotImplementedError(
+                "Deserializing a pt SeZM/DPA4 checkpoint with "
+                f"`bridging_method`={data.get('bridging_method')!r} is not "
+                "supported in pt_expt."
+            )
+        if data.get("lora") is not None:
+            raise NotImplementedError(
+                "Deserializing a pt SeZM/DPA4 checkpoint with `lora` is "
+                "not supported in pt_expt."
+            )
+        atomic_model = data.get("atomic_model")
+        if atomic_model is None:
+            raise ValueError(
+                "SeZM/DPA4 model data is missing the 'atomic_model' entry."
+            )
+        return atomic_model
+
+    @staticmethod
+    def _normalize_pt_sezm_atomic(data: dict[str, Any]) -> dict[str, Any]:
+        """Convert a pt ``sezm_atomic`` dict to a standard atomic dict.
+
+        Strips the pt-only ``dens`` head state (``dens_fitting`` /
+        ``active_mode`` / the ``dens_force_rmsd`` @variable) and rewrites the
+        ``type``/``@version`` so the generic dpmodel atomic-model deserialize
+        accepts it.  A non-energy active mode or a populated dens head is
+        rejected because pt_expt only implements the energy path.
+        """
+        data = data.copy()
+        if data.pop("dens_fitting", None) is not None:
+            raise NotImplementedError(
+                "Deserializing a pt SeZM/DPA4 checkpoint with a `dens` "
+                "fitting head is not supported in pt_expt."
+            )
+        active_mode = data.pop("active_mode", None)
+        if active_mode not in (None, "ener"):
+            raise NotImplementedError(
+                f"Deserializing a pt SeZM/DPA4 checkpoint in active_mode "
+                f"{active_mode!r} is not supported in pt_expt (energy only)."
+            )
+        variables = data.get("@variables")
+        if isinstance(variables, dict):
+            data["@variables"] = {
+                k: v for k, v in variables.items() if k in ("out_bias", "out_std")
+            }
+        # The standard dpmodel atomic-model deserialize checks @version == 2.
+        data["@version"] = 2
+        data["type"] = "standard"
+        return data
diff --git a/source/tests/pt_expt/infer/test_dpa4_deep_eval.py b/source/tests/pt_expt/infer/test_dpa4_deep_eval.py
new file mode 100644
index 0000000000..8151c30c17
--- /dev/null
+++ b/source/tests/pt_expt/infer/test_dpa4_deep_eval.py
@@ -0,0 +1,198 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""DPA4/SeZM DeepEval parity: pt (.pt) vs pt_expt (.pt2).
+
+This test doubles as the pt-checkpoint -> pt_expt interop proof.  A single
+DPA4/SeZM model is built with the *pt* backend and random-initialised, then:
+
+1. pt reference: the pt model + its ``model_params`` are written to a ``.pt``
+   checkpoint and evaluated through ``DeepPot(.pt)`` (routes to the pt backend;
+   SeZM disables torch.jit so this is eager pt inference).
+2. pt_expt path: the SAME pt model's ``serialize()`` dict is fed to
+   ``deserialize_to_file`` which calls ``pt_expt.BaseModel.deserialize`` (the
+   checkpoint-interop step), compiles via AOTInductor, and packs a ``.pt2``
+   archive evaluated through ``DeepPot(.pt2)``.
+
+Because the weights are transferred by serialize/deserialize (not retrained),
+the two backends must produce identical conservative quantities.  Energy,
+atomic energy, force and the *global* virial are compared at fp64
+cross-backend tolerance (rtol/atol 1e-10).
+
+Per-atom virial is NOT compared element-wise: pt's SeZM force/virial uses an
+edge-force scatter that distributes the per-atom virial differently from
+pt_expt's generic ``fit_output_to_model_output`` assembly (#5518).  Both are
+correct -- their sum (the global virial) matches at 1e-10 -- but the per-atom
+distribution legitimately differs, so we assert the global virial only and
+additionally check that pt_expt's per-atom virial *sums* to its global virial.
+"""
+
+from __future__ import (
+    annotations,
+)
+
+import copy
+import os
+
+import numpy as np
+import pytest
+import torch
+
+from deepmd.infer import (
+    DeepPot,
+)
+from deepmd.pt.model.model import (
+    get_model as pt_get_model,
+)
+from deepmd.pt.train.wrapper import (
+    ModelWrapper as PtModelWrapper,
+)
+from deepmd.pt_expt.utils.serialization import (
+    deserialize_to_file,
+)
+from deepmd.utils.argcheck import (
+    normalize,
+)
+from deepmd.utils.compat import (
+    update_deepmd_input,
+)
+
+# Small fp64 DPA4 config: channels 16, n_radial 8, lmax 2, mmax 1, n_blocks 2,
+# fitting neuron [16] -- mirrors test_dpa4_export so the AOTI compile time is
+# bounded but still exercises the SO(2)/SO(3) + attention + embedding paths.
+_DPA4_RAW_CONFIG = {
+    "type": "dpa4",
+    "type_map": ["O", "H"],
+    "descriptor": {
+        "type": "dpa4",
+        "sel": 20,
+        "rcut": 4.0,
+        "channels": 16,
+        "n_radial": 8,
+        "lmax": 2,
+        "mmax": 1,
+        "n_blocks": 2,
+        "precision": "float64",
+        "seed": 1,
+    },
+    "fitting_net": {
+        "type": "dpa4_ener",
+        "neuron": [16],
+        "precision": "float64",
+        "seed": 1,
+    },
+}
+
+
+def _normalize_model(model: dict) -> dict:
+    config = {
+        "model": copy.deepcopy(model),
+        "training": {"training_data": {"systems": ["dummy"]}, "numb_steps": 1},
+        "loss": {"type": "ener"},
+        "learning_rate": {"type": "exp", "start_lr": 1e-3},
+    }
+    config = update_deepmd_input(config, warning=False)
+    config = normalize(config)
+    return config["model"]
+
+
+# A small, fixed water-like box: 2 oxygens + 4 hydrogens.  Coordinates are
+# explicit (no RNG) so the test is fully deterministic.
+_NATOMS = 6
+_ATYPES = np.array([0, 0, 1, 1, 1, 1], dtype=np.int32)  # O, O, H, H, H, H
+_COORDS = np.array(
+    [
+        [1.0, 1.0, 1.0],
+        [3.2, 1.4, 1.1],
+        [1.3, 1.8, 1.0],
+        [0.4, 1.2, 1.6],
+        [3.6, 2.0, 1.3],
+        [3.4, 0.7, 1.7],
+    ],
+    dtype=np.float64,
+).reshape(1, _NATOMS, 3)
+_CELL = (np.eye(3, dtype=np.float64) * 6.0).reshape(1, 9)
+
+
+@pytest.fixture(scope="module")
+def dpa4_pt_and_pt2(tmp_path_factory):
+    """Build one pt DPA4 model; emit a pt ``.pt`` and a pt_expt ``.pt2``."""
+    tmp_path = tmp_path_factory.mktemp("dpa4_deep_eval")
+    model_params = _normalize_model(_DPA4_RAW_CONFIG)
+
+    # Build the pt model and random-init it (fp64, eval mode).
+    pt_model = pt_get_model(copy.deepcopy(model_params))
+    pt_model = pt_model.to(torch.float64)
+    pt_model.eval()
+
+    # 1. pt `.pt` checkpoint: state_dict + model_params in _extra_state.
+    pt_path = str(tmp_path / "dpa4.pt")
+    wrapper = PtModelWrapper(pt_model, model_params=copy.deepcopy(model_params))
+    torch.save({"model": wrapper.state_dict()}, pt_path)
+
+    # 2. pt_expt `.pt2`: transfer weights via serialize() -> BaseModel.deserialize
+    #    (the interop step) inside deserialize_to_file, then AOTI-compile/pack.
+    pt2_path = str(tmp_path / "dpa4.pt2")
+    data = {"model": pt_model.serialize()}
+    deserialize_to_file(pt2_path, data, do_atomic_virial=True)
+
+    return {"pt": pt_path, "pt2": pt2_path}
+
+
+@pytest.mark.skipif(
+    os.environ.get("CI") == "true",
+    reason="AOTInductor compile is slow (minutes); run locally only by default.",
+)
+@pytest.mark.parametrize("pbc", [True, False])  # periodic vs open boundary
+def test_dpa4_deep_eval_parity(dpa4_pt_and_pt2, pbc) -> None:
+    """Backends agree: pt (.pt) vs pt_expt (.pt2) at fp64 tolerance."""
+    dp_pt = DeepPot(dpa4_pt_and_pt2["pt"])
+    dp_pt2 = DeepPot(dpa4_pt_and_pt2["pt2"])
+
+    cell = _CELL if pbc else None
+
+    e_pt, f_pt, v_pt, ae_pt, av_pt = dp_pt.eval(_COORDS, cell, _ATYPES, atomic=True)
+    e_x, f_x, v_x, ae_x, av_x = dp_pt2.eval(_COORDS, cell, _ATYPES, atomic=True)
+
+    tag = "pbc" if pbc else "nopbc"
+    np.testing.assert_allclose(
+        e_x, e_pt, rtol=1e-10, atol=1e-10, err_msg=f"{tag}: energy"
+    )
+    np.testing.assert_allclose(
+        f_x, f_pt, rtol=1e-10, atol=1e-10, err_msg=f"{tag}: force"
+    )
+    np.testing.assert_allclose(
+        v_x, v_pt, rtol=1e-10, atol=1e-10, err_msg=f"{tag}: global virial"
+    )
+    np.testing.assert_allclose(
+        ae_x, ae_pt, rtol=1e-10, atol=1e-10, err_msg=f"{tag}: atom energy"
+    )
+
+    # Per-atom virial: pt's edge-force scatter (#5518) distributes the
+    # per-atom virial differently from pt_expt's generic assembly.  The two
+    # are NOT expected to match element-wise; only the global virial (their
+    # sum) is a physical observable.  Verify pt_expt's per-atom virial reduces
+    # to its own global virial so the assembly stays self-consistent.
+    av_x_sum = av_x.reshape(1, _NATOMS, 9).sum(axis=1)
+    np.testing.assert_allclose(
+        av_x_sum,
+        v_x.reshape(1, 9),
+        rtol=1e-10,
+        atol=1e-10,
+        err_msg=f"{tag}: pt_expt atom_virial does not sum to global virial",
+    )
+
+
+@pytest.mark.skipif(
+    os.environ.get("CI") == "true",
+    reason="AOTInductor compile is slow (minutes); run locally only by default.",
+)
+def test_dpa4_deep_eval_metadata(dpa4_pt_and_pt2) -> None:
+    """Both backends expose the same model metadata (rcut/type_map/...)."""
+    dp_pt = DeepPot(dpa4_pt_and_pt2["pt"])
+    dp_pt2 = DeepPot(dpa4_pt_and_pt2["pt2"])
+
+    assert dp_pt2.deep_eval.get_rcut() == dp_pt.deep_eval.get_rcut()
+    assert dp_pt2.deep_eval.get_type_map() == dp_pt.deep_eval.get_type_map()
+    assert dp_pt2.deep_eval.get_ntypes() == dp_pt.deep_eval.get_ntypes()
+    assert dp_pt2.deep_eval.get_dim_fparam() == dp_pt.deep_eval.get_dim_fparam()
+    assert dp_pt2.deep_eval.get_dim_aparam() == dp_pt.deep_eval.get_dim_aparam()
+    assert not dp_pt2.has_spin

From 54732271895de61765391b8dcd8ed6f7adfe156c Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Tue, 16 Jun 2026 01:44:34 +0800
Subject: [PATCH 04/10] fix(pt_expt): validate version in DPA4 pt-checkpoint
 interop + fast unit tests

---
 deepmd/pt_expt/model/model.py                 |  15 ++
 .../tests/pt_expt/model/test_dpa4_interop.py  | 213 ++++++++++++++++++
 2 files changed, 228 insertions(+)
 create mode 100644 source/tests/pt_expt/model/test_dpa4_interop.py

diff --git a/deepmd/pt_expt/model/model.py b/deepmd/pt_expt/model/model.py
index 3721e38362..a4b18d3d41 100644
--- a/deepmd/pt_expt/model/model.py
+++ b/deepmd/pt_expt/model/model.py
@@ -6,6 +6,9 @@
 from deepmd.dpmodel.model.base_model import (
     make_base_model,
 )
+from deepmd.utils.version import (
+    check_version_compatibility,
+)
 
 
 class BaseModel(make_base_model()):
@@ -49,6 +52,11 @@ def deserialize(cls, data: dict[str, Any]) -> "BaseModel":
     def _unwrap_pt_sezm_model(data: dict[str, Any]) -> dict[str, Any]:
         """Unwrap pt's ``SeZMModel`` serialization to the inner atomic dict."""
         data = data.copy()
+        # The pt SeZM model wrapper serialises with ``@version`` 1.  Validate
+        # before discarding it so a future incompatible wrapper schema is not
+        # silently mis-deserialized (the wrapper only carries the guarded
+        # bridging/lora extras below, so the accepted range is narrow).
+        check_version_compatibility(int(data.get("@version", 1)), 1, 1)
         bridging_method = str(data.get("bridging_method", "none")).lower()
         if bridging_method not in ("none", ""):
             raise NotImplementedError(
@@ -79,6 +87,13 @@ def _normalize_pt_sezm_atomic(data: dict[str, Any]) -> dict[str, Any]:
         rejected because pt_expt only implements the energy path.
         """
         data = data.copy()
+        # pt emits ``@version`` 3 for ``sezm_atomic``; the standard dpmodel
+        # atomic-model deserialize requires exactly 2.  The only schema delta
+        # between the two is the stripped ``dens`` state below, so coercion is
+        # safe for the known-compatible range {2, 3}.  Validate the incoming
+        # version BEFORE coercing so a future incompatible pt schema (e.g.
+        # ``@version`` 4) is rejected loudly instead of mis-deserialized.
+        check_version_compatibility(int(data.get("@version", 2)), 3, 2)
         if data.pop("dens_fitting", None) is not None:
             raise NotImplementedError(
                 "Deserializing a pt SeZM/DPA4 checkpoint with a `dens` "
diff --git a/source/tests/pt_expt/model/test_dpa4_interop.py b/source/tests/pt_expt/model/test_dpa4_interop.py
new file mode 100644
index 0000000000..533ebdd862
--- /dev/null
+++ b/source/tests/pt_expt/model/test_dpa4_interop.py
@@ -0,0 +1,213 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Fast (no-AOTI) tests for the pt -> pt_expt DPA4/SeZM checkpoint interop.
+
+``BaseModel.deserialize`` recognises pt's ``SeZMModel`` wrapper (top-level
+``type`` in {SeZM, sezm, dpa4}, ``@version`` 1) and its ``sezm_atomic`` atomic
+dict (``@version`` 3), validates the versions, strips the pt-only ``dens`` head
+state, and rejects pt-only features pt_expt does not implement.  These cases
+are otherwise only exercised by the CI-skipped AOTI parity test
+(``source/tests/pt_expt/infer/test_dpa4_deep_eval.py``); the tests here run in
+CI and need neither ``torch.export`` nor AOTInductor.
+"""
+
+from __future__ import (
+    annotations,
+)
+
+import copy
+
+import pytest
+import torch
+
+from deepmd.pt.model.model import (
+    get_model as pt_get_model,
+)
+from deepmd.pt_expt.model.ener_model import (
+    EnergyModel,
+)
+from deepmd.pt_expt.model.model import (
+    BaseModel,
+)
+from deepmd.pt_expt.utils import (
+    env,
+)
+from deepmd.utils.argcheck import (
+    normalize,
+)
+from deepmd.utils.compat import (
+    update_deepmd_input,
+)
+
+# Small fp64 DPA4 config (channels 8, n_radial 4, lmax 1, mmax 1, n_blocks 1)
+# -- only large enough to serialize a real pt SeZM wrapper + sezm_atomic dict.
+_DPA4_RAW_CONFIG = {
+    "type": "dpa4",
+    "type_map": ["O", "H"],
+    "descriptor": {
+        "type": "dpa4",
+        "sel": 20,
+        "rcut": 4.0,
+        "channels": 8,
+        "n_radial": 4,
+        "lmax": 1,
+        "mmax": 1,
+        "n_blocks": 1,
+        "precision": "float64",
+        "seed": 1,
+    },
+    "fitting_net": {
+        "type": "dpa4_ener",
+        "neuron": [8],
+        "precision": "float64",
+        "seed": 1,
+    },
+}
+
+
+def _normalize_model(model: dict) -> dict:
+    config = {
+        "model": copy.deepcopy(model),
+        "training": {"training_data": {"systems": ["dummy"]}, "numb_steps": 1},
+        "loss": {"type": "ener"},
+        "learning_rate": {"type": "exp", "start_lr": 1e-3},
+    }
+    config = update_deepmd_input(config, warning=False)
+    config = normalize(config)
+    return config["model"]
+
+
+@pytest.fixture(scope="module")
+def pt_dpa4_model():
+    """Build one real pt SeZMModel (fp64, eval); reused across tests.
+
+    Each test calls ``.serialize()`` fresh (it returns new nested dicts), so
+    in-place mutation of the serialized payload is isolated per test.
+    """
+    model_params = _normalize_model(_DPA4_RAW_CONFIG)
+    model = pt_get_model(copy.deepcopy(model_params)).to(torch.float64)
+    model.eval()
+    return model
+
+
+def _forward_smoke(model: EnergyModel) -> dict:
+    """Run a tiny forward pass to prove the deserialized model is functional."""
+    model = model.to(env.DEVICE)
+    generator = torch.Generator(device=env.DEVICE).manual_seed(1)
+    cell = 5.0 * torch.eye(3, dtype=torch.float64, device=env.DEVICE)
+    coord = (
+        torch.rand(
+            [1, 5, 3],
+            dtype=torch.float64,
+            device=env.DEVICE,
+            generator=generator,
+        )
+        @ cell
+    ).requires_grad_(True)
+    atype = torch.tensor([[0, 0, 0, 1, 1]], dtype=torch.int64, device=env.DEVICE)
+    return model(coord, atype, cell.reshape(1, 9))
+
+
+class TestDPA4Interop:
+    def test_serialize_layout(self, pt_dpa4_model) -> None:
+        """The pt serialize layout matches the interop override's expectations."""
+        ser = pt_dpa4_model.serialize()
+        # wrapper: recognised model type + @version 1
+        assert ser["type"].lower() in BaseModel._SEZM_MODEL_TYPES
+        assert ser["@version"] == 1
+        # nested atomic: sezm_atomic @version 3 carrying the pt-only dens state
+        atomic = ser["atomic_model"]
+        assert atomic["type"] in BaseModel._SEZM_ATOMIC_TYPES
+        assert atomic["@version"] == 3
+        assert "dens_force_rmsd" in atomic["@variables"]
+        assert "active_mode" in atomic
+
+    def test_happy_path_deserialize_and_forward(self, pt_dpa4_model) -> None:
+        """A real pt checkpoint deserializes to a working pt_expt EnergyModel."""
+        ser = pt_dpa4_model.serialize()
+        model = BaseModel.deserialize(ser)
+        assert isinstance(model, EnergyModel)
+        ret = _forward_smoke(model)
+        assert ret["energy"].shape == (1, 1)
+        assert ret["force"].shape == (1, 5, 3)
+
+    def test_variables_filtered_to_out_bias_out_std(self, pt_dpa4_model) -> None:
+        """The pt-only ``dens_force_rmsd`` @variable is dropped on normalize."""
+        atomic = pt_dpa4_model.serialize()["atomic_model"]
+        assert set(atomic["@variables"]) >= {"out_bias", "out_std", "dens_force_rmsd"}
+        normalized = BaseModel._normalize_pt_sezm_atomic(atomic)
+        assert set(normalized["@variables"]) == {"out_bias", "out_std"}
+        # version coerced to the standard atomic schema, type rewritten
+        assert normalized["@version"] == 2
+        assert normalized["type"] == "standard"
+
+    # mutator(ser) edits the full pt wrapper serialize in place to trip one
+    # guard; (exc_type, match) is the expected raise.  The wrapper @version
+    # check runs before everything in _unwrap; the atomic @version check runs
+    # first in _normalize -- both reject out-of-range versions loudly.
+    @pytest.mark.parametrize(
+        "mutator, exc_type, match",
+        [
+            # bridging_method != none -> NotImplementedError
+            (
+                lambda s: s.__setitem__("bridging_method", "ZBL"),
+                NotImplementedError,
+                "bridging_method",
+            ),
+            # lora not None -> NotImplementedError
+            (
+                lambda s: s.__setitem__("lora", {"rank": 4}),
+                NotImplementedError,
+                "lora",
+            ),
+            # populated dens fitting head -> NotImplementedError
+            (
+                lambda s: s["atomic_model"].__setitem__("dens_fitting", {"foo": 1}),
+                NotImplementedError,
+                "dens",
+            ),
+            # non-energy active_mode -> NotImplementedError
+            (
+                lambda s: s["atomic_model"].__setitem__("active_mode", "dens"),
+                NotImplementedError,
+                "active_mode",
+            ),
+            # missing atomic_model entry -> ValueError
+            (
+                lambda s: s.pop("atomic_model"),
+                ValueError,
+                "atomic_model",
+            ),
+            # unsupported atomic @version (outside the accepted {2, 3}) -> ValueError
+            (
+                lambda s: s["atomic_model"].__setitem__("@version", 4),
+                ValueError,
+                "not compatible",
+            ),
+            # unsupported wrapper @version (only 1 is accepted) -> ValueError
+            (
+                lambda s: s.__setitem__("@version", 2),
+                ValueError,
+                "not compatible",
+            ),
+        ],
+    )
+    def test_guard_branches_raise(
+        self, pt_dpa4_model, mutator, exc_type, match
+    ) -> None:
+        """Each unsupported/invalid pt feature fails fast with a clear error."""
+        ser = pt_dpa4_model.serialize()
+        mutator(ser)
+        with pytest.raises(exc_type, match=match):
+            BaseModel.deserialize(ser)
+
+    @pytest.mark.parametrize("version", [2, 3])  # known-compatible atomic versions
+    def test_atomic_version_in_range_accepted(self, pt_dpa4_model, version) -> None:
+        """Both in-range atomic @versions {2, 3} normalize without raising."""
+        atomic = pt_dpa4_model.serialize()["atomic_model"]
+        atomic["@version"] = version
+        normalized = BaseModel._normalize_pt_sezm_atomic(atomic)
+        assert normalized["@version"] == 2
+
+
+if __name__ == "__main__":
+    raise SystemExit(pytest.main([__file__, "-v"]))

From 4bd454ee4e6b9fc685ff59ad5493d4a412c5ab34 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Tue, 16 Jun 2026 01:51:09 +0800
Subject: [PATCH 05/10] feat(dpa4): warn on silently-ignored
 use_amp/enable_tf32

---
 deepmd/dpmodel/descriptor/dpa4.py             | 15 +++++++++
 deepmd/pt_expt/model/get_model.py             | 15 +++++++++
 source/tests/pt_expt/descriptor/test_dpa4.py  | 29 +++++++++++++++++
 .../pt_expt/model/test_get_model_dpa4.py      | 32 +++++++++++++++++++
 4 files changed, 91 insertions(+)

diff --git a/deepmd/dpmodel/descriptor/dpa4.py b/deepmd/dpmodel/descriptor/dpa4.py
index 1c00561bdf..617560e409 100644
--- a/deepmd/dpmodel/descriptor/dpa4.py
+++ b/deepmd/dpmodel/descriptor/dpa4.py
@@ -26,6 +26,7 @@
     annotations,
 )
 
+import logging
 import math
 from typing import (
     TYPE_CHECKING,
@@ -36,6 +37,12 @@
 import array_api_compat
 import numpy as np
 
+log = logging.getLogger(__name__)
+
+# Warn at most once per process that ``use_amp`` has no effect on the
+# dpmodel/pt_expt backend (it is a pt-runtime CUDA autocast switch).
+_USE_AMP_WARNED = False
+
 from deepmd.dpmodel import (
     NativeOP,
 )
@@ -324,6 +331,14 @@ def __init__(
         # pt-runtime-only switch (CUDA bfloat16 autocast during training);
         # accepted for config compatibility and ignored by dpmodel.
         self.use_amp = bool(use_amp)
+        if self.use_amp:
+            global _USE_AMP_WARNED
+            if not _USE_AMP_WARNED:
+                log.warning(
+                    "`use_amp` has no effect on the dpmodel/pt_expt backend "
+                    "(it is a pt-runtime CUDA autocast switch); ignoring it."
+                )
+                _USE_AMP_WARNED = True
         self.trainable = bool(trainable)
         self.seed = seed
         self.random_gamma = bool(random_gamma)
diff --git a/deepmd/pt_expt/model/get_model.py b/deepmd/pt_expt/model/get_model.py
index c0ac078024..e5b00c10d9 100644
--- a/deepmd/pt_expt/model/get_model.py
+++ b/deepmd/pt_expt/model/get_model.py
@@ -7,6 +7,7 @@
 """
 
 import copy
+import logging
 from typing import (
     Any,
 )
@@ -44,6 +45,12 @@
     Spin,
 )
 
+log = logging.getLogger(__name__)
+
+# Warn at most once per process that ``enable_tf32`` has no effect on the
+# pt_expt backend (which always runs at "highest" matmul precision).
+_ENABLE_TF32_WARNED = False
+
 
 def _get_standard_model_components(
     data: dict[str, Any],
@@ -128,6 +135,14 @@ def get_sezm_model(data: dict) -> EnergyModel:
     ("highest") matmul precision, which is numerically conservative.
     """
     data = copy.deepcopy(data)
+    if bool(data.get("enable_tf32", True)):
+        global _ENABLE_TF32_WARNED
+        if not _ENABLE_TF32_WARNED:
+            log.warning(
+                "`enable_tf32` has no effect on the pt_expt backend, which "
+                "always runs at full ('highest') matmul precision; ignoring it."
+            )
+            _ENABLE_TF32_WARNED = True
     if "spin" in data:
         raise NotImplementedError(
             "Spin DPA4/SeZM models are not supported in the pt_expt backend."
diff --git a/source/tests/pt_expt/descriptor/test_dpa4.py b/source/tests/pt_expt/descriptor/test_dpa4.py
index ef4a10227b..fc5d265dcf 100644
--- a/source/tests/pt_expt/descriptor/test_dpa4.py
+++ b/source/tests/pt_expt/descriptor/test_dpa4.py
@@ -205,3 +205,32 @@ def test_trainable_false_freezes_all_parameters(self, via_deserialize) -> None:
             f"trainable=False left parameters trainable: "
             f"{sorted(set(params) - set(frozen))}"
         )
+
+
+# `use_amp` is a pt-runtime CUDA autocast switch with no dpmodel/pt_expt effect;
+# constructing the descriptor with it truthy must emit a warn-once message.
+@pytest.mark.parametrize("use_amp", [True, False])  # truthy warns, falsy is silent
+def test_use_amp_warns_once(use_amp, caplog, monkeypatch) -> None:
+    import logging
+
+    import deepmd.dpmodel.descriptor.dpa4 as dpa4_mod
+
+    # reset the warn-once flag so the assertion is deterministic regardless of
+    # test ordering (other constructions in the suite may have already warned)
+    monkeypatch.setattr(dpa4_mod, "_USE_AMP_WARNED", False)
+
+    def _construct() -> None:
+        make_descriptor(2, [10, 10], 4.0, use_amp=use_amp)
+
+    with caplog.at_level(logging.WARNING, logger=dpa4_mod.log.name):
+        _construct()
+    matches = [r for r in caplog.records if "use_amp" in r.getMessage()]
+    if use_amp:
+        assert len(matches) == 1, caplog.text
+        # second construction must NOT warn again (warn-once per process)
+        caplog.clear()
+        with caplog.at_level(logging.WARNING, logger=dpa4_mod.log.name):
+            _construct()
+        assert not [r for r in caplog.records if "use_amp" in r.getMessage()]
+    else:
+        assert not matches, caplog.text
diff --git a/source/tests/pt_expt/model/test_get_model_dpa4.py b/source/tests/pt_expt/model/test_get_model_dpa4.py
index de497876ab..4df8611aab 100644
--- a/source/tests/pt_expt/model/test_get_model_dpa4.py
+++ b/source/tests/pt_expt/model/test_get_model_dpa4.py
@@ -2,8 +2,10 @@
 """Tests for the DPA4/SeZM model-type dispatch in pt_expt ``get_model``."""
 
 import copy
+import logging
 import unittest
 
+import pytest
 import torch
 
 from deepmd.pt_expt.model import (
@@ -218,5 +220,35 @@ def test_default_unsupported_values_pass(self) -> None:
         self.assertIsInstance(model, EnergyModel)
 
 
+# `enable_tf32` toggles TF32 matmul precision in pt but is ignored by pt_expt
+# (always "highest" precision); a truthy value must emit a warn-once message.
+@pytest.mark.parametrize("enable_tf32", [True, False])  # truthy warns, falsy silent
+def test_enable_tf32_warns_once(enable_tf32, caplog, monkeypatch) -> None:
+    import importlib
+
+    # the package __init__ rebinds the name ``get_model`` to the function, so
+    # ``import ...get_model as`` would shadow the submodule; load it explicitly
+    gm_mod = importlib.import_module("deepmd.pt_expt.model.get_model")
+
+    # reset the warn-once flag so the assertion is deterministic regardless of
+    # test ordering (other get_sezm_model calls may have already warned)
+    monkeypatch.setattr(gm_mod, "_ENABLE_TF32_WARNED", False)
+
+    raw = _make_raw_model_config(enable_tf32=enable_tf32)
+
+    with caplog.at_level(logging.WARNING, logger=gm_mod.log.name):
+        gm_mod.get_sezm_model(raw)
+    matches = [r for r in caplog.records if "enable_tf32" in r.getMessage()]
+    if enable_tf32:
+        assert len(matches) == 1, caplog.text
+        # a second call must NOT warn again (warn-once per process)
+        caplog.clear()
+        with caplog.at_level(logging.WARNING, logger=gm_mod.log.name):
+            gm_mod.get_sezm_model(_make_raw_model_config(enable_tf32=enable_tf32))
+        assert not [r for r in caplog.records if "enable_tf32" in r.getMessage()]
+    else:
+        assert not matches, caplog.text
+
+
 if __name__ == "__main__":
     unittest.main()

From 4bbfdc994534d8f96bc4a329b2cf66faabddc655 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Tue, 16 Jun 2026 02:23:16 +0800
Subject: [PATCH 06/10] test(infer): DPA4 .pt2 fixture generator

---
 source/install/test_cc_local.sh |   4 +
 source/tests/infer/gen_dpa4.py  | 241 ++++++++++++++++++++++++++++++++
 2 files changed, 245 insertions(+)
 create mode 100644 source/tests/infer/gen_dpa4.py

diff --git a/source/install/test_cc_local.sh b/source/install/test_cc_local.sh
index 5ddbf0eecc..3a39b4c69e 100755
--- a/source/install/test_cc_local.sh
+++ b/source/install/test_cc_local.sh
@@ -91,6 +91,10 @@ else:
 	wait $PID5
 	wait $PID6
 
+	env ${_GEN_ENV} python ${INFER_SCRIPT_PATH}/gen_dpa4.py &
+	PID9=$!
+	wait $PID9
+
 	env ${_GEN_ENV} python ${INFER_SCRIPT_PATH}/gen_spin.py &
 	PID7=$!
 	env ${_GEN_ENV} python ${INFER_SCRIPT_PATH}/gen_spin_model_devi.py &
diff --git a/source/tests/infer/gen_dpa4.py b/source/tests/infer/gen_dpa4.py
new file mode 100644
index 0000000000..d09f372e65
--- /dev/null
+++ b/source/tests/infer/gen_dpa4.py
@@ -0,0 +1,241 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Generate deeppot_dpa4.pth and deeppot_dpa4.pt2 test models.
+
+Creates a DPA4/SeZM model from a pt_expt config, serializes, and exports
+to both .pt2 (pt_expt / AOTInductor) and .pth (pt) from the same weights.
+Also writes a sidecar reference file (PBC and NoPbc per-atom energy/force/
+virial) consumed by the C++ tests.
+"""
+
+import copy
+import os
+import sys
+
+import numpy as np
+
+# Ensure the source tree is on the path
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..", ".."))
+
+from gen_common import (
+    ensure_inductor_compiler,
+    load_custom_ops,
+    write_expected_ref,
+)
+
+
+def main():
+    import torch
+
+    from deepmd.pt_expt.model.get_model import (
+        get_model,
+    )
+
+    ensure_inductor_compiler()
+
+    # ---- 1. DPA4/SeZM model config (small, fast to compile) ----
+    # Mirrors test_dpa4_export.py: channels 16, n_radial 8, lmax 2, mmax 1,
+    # n_blocks 2 — large enough to exercise the SO(2)/SO(3) + attention +
+    # embedding paths, small enough to keep the AOTInductor compile bounded.
+    config = {
+        "type": "dpa4",
+        "type_map": ["O", "H"],
+        "descriptor": {
+            "type": "dpa4",
+            "sel": 20,
+            "rcut": 4.0,
+            "channels": 16,
+            "n_radial": 8,
+            "lmax": 2,
+            "mmax": 1,
+            "n_blocks": 2,
+            "precision": "float64",
+            "seed": 1,
+        },
+        "fitting_net": {
+            "type": "dpa4_ener",
+            "neuron": [16],
+            "precision": "float64",
+            "seed": 1,
+        },
+    }
+
+    # ---- 2. Build the pt_expt model and serialize ----
+    # dpmodel ``get_model`` has no DPA4 dispatch; the model-type alias lives
+    # in pt_expt ``get_model``.  Build there, then serialize to a backend-
+    # neutral dict that both pt_expt and pt can deserialize.
+    model = get_model(copy.deepcopy(config))
+    model.to("cpu")
+    model.eval()
+
+    # ---- 2b. Activate the zero-initialised residual branches ----
+    # DPA4/SeZM follows the standard residual-network convention of
+    # ZERO-initialising the output projection of every residual branch
+    # (``*.so3_linear_2.weight``, ``post_focus_mix.weight``,
+    # ``env_seed_embedding.output_proj.w``) and the final descriptor output
+    # projection (``output_ffn.so3_linear_2.weight``).  At random init these
+    # branches therefore contribute EXACTLY zero, so a freshly built DPA4
+    # collapses to a type-embedding-only descriptor: the per-atom energy is a
+    # pure per-type constant and every force/virial is identically zero,
+    # regardless of geometry.  Such a fixture exercises none of the
+    # force/virial code paths and would make the C++ inference test vacuous.
+    #
+    # A trained model has non-zero weights in these branches, so to obtain a
+    # representative (geometry-dependent, non-zero-force) reference we fill the
+    # all-zero parameters with small deterministic pseudo-random values.  This
+    # is the minimal change that makes the descriptor coordinate-dependent
+    # while leaving the rest of the random init untouched.  (Unlike DPA3,
+    # whose random init already yields non-zero forces, DPA4 needs this step.)
+    generator = torch.Generator().manual_seed(20240614)
+    with torch.no_grad():
+        for _name, param in model.named_parameters():
+            if float(param.detach().abs().max()) == 0.0:
+                param.copy_(
+                    0.1
+                    * torch.randn(param.shape, dtype=param.dtype, generator=generator)
+                )
+
+    model_dict = model.serialize()
+
+    data = {
+        "model": model_dict,
+        "model_def_script": config,
+        "backend": "dpmodel",
+        "software": "deepmd-kit",
+        "version": "3.0.0",
+    }
+
+    # ---- 3. Export to .pt2 and .pth ----
+    from deepmd.pt.utils.serialization import (
+        deserialize_to_file as pt_deserialize_to_file,
+    )
+    from deepmd.pt_expt.utils.serialization import (
+        deserialize_to_file as pt_expt_deserialize_to_file,
+    )
+
+    # Load custom ops after deepmd.pt import to avoid double registration
+    load_custom_ops()
+
+    base_dir = os.path.dirname(__file__)
+
+    pt2_path = os.path.join(base_dir, "deeppot_dpa4.pt2")
+    print(f"Exporting to {pt2_path} ...")  # noqa: T201
+    pt_expt_deserialize_to_file(pt2_path, copy.deepcopy(data), do_atomic_virial=True)
+
+    pth_path = os.path.join(base_dir, "deeppot_dpa4.pth")
+    print(f"Exporting to {pth_path} ...")  # noqa: T201
+    try:
+        pt_deserialize_to_file(pth_path, copy.deepcopy(data))
+    except RuntimeError as e:
+        # Custom ops may be missing (.pth is optional); never verify a stale .pth.
+        pth_path = ""
+        print(f"WARNING: .pth export failed ({e}), skipping.")  # noqa: T201
+
+    print("Export done.")  # noqa: T201
+
+    # ---- 4. Run inference for PBC test ----
+    from deepmd.infer import (
+        DeepPot,
+    )
+
+    dp = DeepPot(pt2_path)
+
+    coord = np.array(
+        [
+            12.83,
+            2.56,
+            2.18,
+            12.09,
+            2.87,
+            2.74,
+            0.25,
+            3.32,
+            1.68,
+            3.36,
+            3.00,
+            1.81,
+            3.51,
+            2.51,
+            2.60,
+            4.27,
+            3.22,
+            1.56,
+        ],
+        dtype=np.float64,
+    )
+    atype = [0, 1, 1, 0, 1, 1]
+    box = np.array([13.0, 0.0, 0.0, 0.0, 13.0, 0.0, 0.0, 0.0, 13.0], dtype=np.float64)
+
+    e1, f1, v1, ae1, av1 = dp.eval(coord, box, atype, atomic=True)
+    print(f"\n// PBC total energy: {e1[0, 0]:.18e}")  # noqa: T201
+
+    # ---- 5. Run inference for NoPbc test ----
+    e_np, f_np, v_np, ae_np, av_np = dp.eval(coord, None, atype, atomic=True)
+    print(f"\n// NoPbc total energy: {e_np[0, 0]:.18e}")  # noqa: T201
+
+    # ---- 5b. Write sidecar reference file consumed by C++ tests ----
+    ref_path = os.path.join(base_dir, "deeppot_dpa4.expected")
+    write_expected_ref(
+        ref_path,
+        sections={
+            "pbc": {
+                "expected_e": ae1[0, :, 0],
+                "expected_f": f1[0],
+                "expected_v": av1[0],
+            },
+            "nopbc": {
+                "expected_e": ae_np[0, :, 0],
+                "expected_f": f_np[0],
+                "expected_v": av_np[0],
+            },
+        },
+        source_script="source/tests/infer/gen_dpa4.py",
+    )
+    print(f"Wrote {ref_path}")  # noqa: T201
+
+    # ---- 6. Verify .pth gives same results ----
+    if os.path.exists(pth_path):
+        dp_pth = DeepPot(pth_path)
+        e_pth, f_pth, v_pth, ae_pth, av_pth = dp_pth.eval(
+            coord, box, atype, atomic=True
+        )
+        # PBC parity assertions
+        pbc_e_diff = abs(e1[0, 0] - e_pth[0, 0])
+        pbc_f_diff = np.max(np.abs(f1 - f_pth))
+        pbc_v_diff = np.max(np.abs(v1 - v_pth))
+        print(f"\n// .pth PBC total energy: {e_pth[0, 0]:.18e}")  # noqa: T201
+        print(f"// .pth vs .pt2 energy diff: {pbc_e_diff:.2e}")  # noqa: T201
+        print(f"// .pth vs .pt2 force max diff: {pbc_f_diff:.2e}")  # noqa: T201
+        print(f"// .pth vs .pt2 virial max diff: {pbc_v_diff:.2e}")  # noqa: T201
+        tol = 1e-10
+        assert pbc_e_diff < tol, f"PBC energy parity failed: diff={pbc_e_diff:.2e}"
+        assert pbc_f_diff < tol, f"PBC force parity failed: diff={pbc_f_diff:.2e}"
+        # NOTE: ``v1``/``v_pth`` are the *global* virials (3rd return value).
+        # The per-atom virial distribution legitimately differs between pt's
+        # edge-force scatter and pt_expt's generic assembly (#5518); only the
+        # global virial (their sum) is a physical observable, so we assert on
+        # the global virial here.
+        assert pbc_v_diff < tol, f"PBC virial parity failed: diff={pbc_v_diff:.2e}"
+
+        e_pth_np, f_pth_np, v_pth_np, ae_pth_np, av_pth_np = dp_pth.eval(
+            coord, None, atype, atomic=True
+        )
+        # NoPbc parity assertions
+        np_e_diff = abs(e_np[0, 0] - e_pth_np[0, 0])
+        np_f_diff = np.max(np.abs(f_np - f_pth_np))
+        np_v_diff = np.max(np.abs(v_np - v_pth_np))
+        print(f"// .pth NoPbc total energy: {e_pth_np[0, 0]:.18e}")  # noqa: T201
+        print(f"// .pth vs .pt2 NoPbc energy diff: {np_e_diff:.2e}")  # noqa: T201
+        print(f"// .pth vs .pt2 NoPbc force diff: {np_f_diff:.2e}")  # noqa: T201
+        print(f"// .pth vs .pt2 NoPbc virial diff: {np_v_diff:.2e}")  # noqa: T201
+        assert np_e_diff < tol, f"NoPbc energy parity failed: diff={np_e_diff:.2e}"
+        assert np_f_diff < tol, f"NoPbc force parity failed: diff={np_f_diff:.2e}"
+        assert np_v_diff < tol, f"NoPbc virial parity failed: diff={np_v_diff:.2e}"
+    else:
+        print("\n// Skipping .pth verification (file not generated).")  # noqa: T201
+
+    print("\nDone!")  # noqa: T201
+
+
+if __name__ == "__main__":
+    main()

From fbe968ba64fc33dd8a8c57a4b5b7460043ec56e9 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Tue, 16 Jun 2026 02:42:41 +0800
Subject: [PATCH 07/10] test(api_cc): DPA4 .pt2 single-rank C++ inference

---
 .../api_cc/tests/test_deeppot_dpa4_ptexpt.cc  | 537 ++++++++++++++++++
 1 file changed, 537 insertions(+)
 create mode 100644 source/api_cc/tests/test_deeppot_dpa4_ptexpt.cc

diff --git a/source/api_cc/tests/test_deeppot_dpa4_ptexpt.cc b/source/api_cc/tests/test_deeppot_dpa4_ptexpt.cc
new file mode 100644
index 0000000000..77200c9508
--- /dev/null
+++ b/source/api_cc/tests/test_deeppot_dpa4_ptexpt.cc
@@ -0,0 +1,537 @@
+// SPDX-License-Identifier: LGPL-3.0-or-later
+// Test C++ inference for pt_expt (.pt2) backend with DPA4 (mixed-type) model.
+// Reference values generated by source/tests/infer/gen_dpa4.py.
+#include <gtest/gtest.h>
+
+#include <algorithm>
+#include <cmath>
+#include <fstream>
+#include <vector>
+
+#include "DeepPot.h"
+#include "DeepPotPTExpt.h"
+#include "expected_ref.h"
+#include "neighbor_list.h"
+#include "test_utils.h"
+
+// DPA4 models need relaxed epsilon (same as test_deeppot_dpa3_ptexpt.cc)
+#undef EPSILON
+#define EPSILON (std::is_same<VALUETYPE, double>::value ? 1e-10 : 1e-4)
+
+namespace {
+constexpr const char* kRefPath = "../../tests/infer/deeppot_dpa4.expected";
+constexpr const char* kModelPath = "../../tests/infer/deeppot_dpa4.pt2";
+}  // namespace
+
+template <class VALUETYPE>
+class TestInferDeepPotDpa4PtExpt : public ::testing::Test {
+ protected:
+  std::vector<VALUETYPE> coord = {12.83, 2.56, 2.18, 12.09, 2.87, 2.74,
+                                  00.25, 3.32, 1.68, 3.36,  3.00, 1.81,
+                                  3.51,  2.51, 2.60, 4.27,  3.22, 1.56};
+  std::vector<int> atype = {0, 1, 1, 0, 1, 1};
+  std::vector<VALUETYPE> box = {13., 0., 0., 0., 13., 0., 0., 0., 13.};
+  std::vector<VALUETYPE> expected_e;
+  std::vector<VALUETYPE> expected_f;
+  std::vector<VALUETYPE> expected_v;
+  int natoms;
+  double expected_tot_e;
+  std::vector<VALUETYPE> expected_tot_v;
+
+  static deepmd::DeepPot dp;
+
+  static void SetUpTestSuite() {
+#if defined(BUILD_PYTORCH) && BUILD_PT_EXPT
+    dp.init(kModelPath);
+#endif
+  }
+
+  void SetUp() override {
+#if !defined(BUILD_PYTORCH) || !BUILD_PT_EXPT
+    GTEST_SKIP() << "Skip because PyTorch support is not enabled.";
+#endif
+    deepmd_test::ExpectedRef ref;
+    ref.load(kRefPath);
+    expected_e = ref.get<VALUETYPE>("pbc", "expected_e");
+    expected_f = ref.get<VALUETYPE>("pbc", "expected_f");
+    expected_v = ref.get<VALUETYPE>("pbc", "expected_v");
+
+    natoms = expected_e.size();
+    EXPECT_EQ(natoms * 3, expected_f.size());
+    EXPECT_EQ(natoms * 9, expected_v.size());
+    expected_tot_e = 0.;
+    expected_tot_v.assign(9, 0.);
+    for (int ii = 0; ii < natoms; ++ii) {
+      expected_tot_e += expected_e[ii];
+    }
+    for (int ii = 0; ii < natoms; ++ii) {
+      for (int dd = 0; dd < 9; ++dd) {
+        expected_tot_v[dd] += expected_v[ii * 9 + dd];
+      }
+    }
+  };
+
+  void TearDown() override {};
+
+  static void TearDownTestSuite() { dp = deepmd::DeepPot(); }
+};
+
+template <class VALUETYPE>
+deepmd::DeepPot TestInferDeepPotDpa4PtExpt<VALUETYPE>::dp;
+
+TYPED_TEST_SUITE(TestInferDeepPotDpa4PtExpt, ValueTypes);
+
+TYPED_TEST(TestInferDeepPotDpa4PtExpt, cpu_build_nlist) {
+  using VALUETYPE = TypeParam;
+  std::vector<VALUETYPE>& coord = this->coord;
+  std::vector<int>& atype = this->atype;
+  std::vector<VALUETYPE>& box = this->box;
+  std::vector<VALUETYPE>& expected_f = this->expected_f;
+  int& natoms = this->natoms;
+  double& expected_tot_e = this->expected_tot_e;
+  std::vector<VALUETYPE>& expected_tot_v = this->expected_tot_v;
+  deepmd::DeepPot& dp = this->dp;
+  double ener;
+  std::vector<VALUETYPE> force, virial;
+  dp.compute(ener, force, virial, coord, atype, box);
+
+  EXPECT_EQ(force.size(), natoms * 3);
+  EXPECT_EQ(virial.size(), 9);
+
+  EXPECT_LT(fabs(ener - expected_tot_e), EPSILON);
+  for (int ii = 0; ii < natoms * 3; ++ii) {
+    EXPECT_LT(fabs(force[ii] - expected_f[ii]), EPSILON);
+  }
+  for (int ii = 0; ii < 3 * 3; ++ii) {
+    EXPECT_LT(fabs(virial[ii] - expected_tot_v[ii]), EPSILON);
+  }
+}
+
+TYPED_TEST(TestInferDeepPotDpa4PtExpt, cpu_build_nlist_numfv) {
+  using VALUETYPE = TypeParam;
+  std::vector<VALUETYPE>& coord = this->coord;
+  std::vector<int>& atype = this->atype;
+  std::vector<VALUETYPE>& box = this->box;
+  deepmd::DeepPot& dp = this->dp;
+  class MyModel : public EnergyModelTest<VALUETYPE> {
+    deepmd::DeepPot& mydp;
+    const std::vector<int> atype;
+
+   public:
+    MyModel(deepmd::DeepPot& dp_, const std::vector<int>& atype_)
+        : mydp(dp_), atype(atype_) {
+      // DPA4 needs relaxed numfv tolerance (consistent with Python places=5)
+      this->level = std::is_same<VALUETYPE, double>::value ? 1e-3 : 1e-1;
+    };
+    virtual void compute(double& ener,
+                         std::vector<VALUETYPE>& force,
+                         std::vector<VALUETYPE>& virial,
+                         const std::vector<VALUETYPE>& coord,
+                         const std::vector<VALUETYPE>& box) {
+      mydp.compute(ener, force, virial, coord, atype, box);
+    }
+  };
+  MyModel model(dp, atype);
+  model.test_f(coord, box);
+  model.test_v(coord, box);
+  std::vector<VALUETYPE> box_(box);
+  box_[1] -= 0.4;
+  model.test_f(coord, box_);
+  model.test_v(coord, box_);
+  box_[2] += 0.5;
+  model.test_f(coord, box_);
+  model.test_v(coord, box_);
+  box_[4] += 0.2;
+  model.test_f(coord, box_);
+  model.test_v(coord, box_);
+  box_[3] -= 0.3;
+  model.test_f(coord, box_);
+  model.test_v(coord, box_);
+  box_[6] -= 0.7;
+  model.test_f(coord, box_);
+  model.test_v(coord, box_);
+  box_[7] += 0.6;
+  model.test_f(coord, box_);
+  model.test_v(coord, box_);
+}
+
+TYPED_TEST(TestInferDeepPotDpa4PtExpt, cpu_build_nlist_atomic) {
+  using VALUETYPE = TypeParam;
+  std::vector<VALUETYPE>& coord = this->coord;
+  std::vector<int>& atype = this->atype;
+  std::vector<VALUETYPE>& box = this->box;
+  std::vector<VALUETYPE>& expected_e = this->expected_e;
+  std::vector<VALUETYPE>& expected_f = this->expected_f;
+  std::vector<VALUETYPE>& expected_v = this->expected_v;
+  int& natoms = this->natoms;
+  double& expected_tot_e = this->expected_tot_e;
+  std::vector<VALUETYPE>& expected_tot_v = this->expected_tot_v;
+  deepmd::DeepPot& dp = this->dp;
+  double ener;
+  std::vector<VALUETYPE> force, virial, atom_ener, atom_vir;
+  dp.compute(ener, force, virial, atom_ener, atom_vir, coord, atype, box);
+
+  EXPECT_EQ(force.size(), natoms * 3);
+  EXPECT_EQ(virial.size(), 9);
+  EXPECT_EQ(atom_ener.size(), natoms);
+  EXPECT_EQ(atom_vir.size(), natoms * 9);
+
+  EXPECT_LT(fabs(ener - expected_tot_e), EPSILON);
+  for (int ii = 0; ii < natoms * 3; ++ii) {
+    EXPECT_LT(fabs(force[ii] - expected_f[ii]), EPSILON);
+  }
+  for (int ii = 0; ii < 3 * 3; ++ii) {
+    EXPECT_LT(fabs(virial[ii] - expected_tot_v[ii]), EPSILON);
+  }
+  for (int ii = 0; ii < natoms; ++ii) {
+    EXPECT_LT(fabs(atom_ener[ii] - expected_e[ii]), EPSILON);
+  }
+  for (int ii = 0; ii < natoms * 9; ++ii) {
+    EXPECT_LT(fabs(atom_vir[ii] - expected_v[ii]), EPSILON);
+  }
+}
+
+TYPED_TEST(TestInferDeepPotDpa4PtExpt, cpu_lmp_nlist) {
+  using VALUETYPE = TypeParam;
+  std::vector<VALUETYPE>& coord = this->coord;
+  std::vector<int>& atype = this->atype;
+  std::vector<VALUETYPE>& box = this->box;
+  std::vector<VALUETYPE>& expected_f = this->expected_f;
+  int& natoms = this->natoms;
+  double& expected_tot_e = this->expected_tot_e;
+  std::vector<VALUETYPE>& expected_tot_v = this->expected_tot_v;
+  deepmd::DeepPot& dp = this->dp;
+  float rc = dp.cutoff();
+  int nloc = coord.size() / 3;
+  std::vector<VALUETYPE> coord_cpy;
+  std::vector<int> atype_cpy, mapping;
+  std::vector<std::vector<int> > nlist_data;
+  _build_nlist<VALUETYPE>(nlist_data, coord_cpy, atype_cpy, mapping, coord,
+                          atype, box, rc);
+  int nall = coord_cpy.size() / 3;
+  std::vector<int> ilist(nloc), numneigh(nloc);
+  std::vector<int*> firstneigh(nloc);
+  deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
+  convert_nlist(inlist, nlist_data);
+  // DPA4 is a message-passing descriptor; pass mapping so the C++ backend can
+  // correctly map ghost atoms back to local atoms.
+  inlist.mapping = mapping.data();
+
+  double ener;
+  std::vector<VALUETYPE> force_, virial;
+  dp.compute(ener, force_, virial, coord_cpy, atype_cpy, box, nall - nloc,
+             inlist, 0);
+  std::vector<VALUETYPE> force;
+  _fold_back<VALUETYPE>(force, force_, mapping, nloc, nall, 3);
+
+  EXPECT_EQ(force.size(), natoms * 3);
+  EXPECT_EQ(virial.size(), 9);
+
+  EXPECT_LT(fabs(ener - expected_tot_e), EPSILON);
+  for (int ii = 0; ii < natoms * 3; ++ii) {
+    EXPECT_LT(fabs(force[ii] - expected_f[ii]), EPSILON);
+  }
+  for (int ii = 0; ii < 3 * 3; ++ii) {
+    EXPECT_LT(fabs(virial[ii] - expected_tot_v[ii]), EPSILON);
+  }
+
+  ener = 0.;
+  std::fill(force_.begin(), force_.end(), 0.0);
+  std::fill(virial.begin(), virial.end(), 0.0);
+  dp.compute(ener, force_, virial, coord_cpy, atype_cpy, box, nall - nloc,
+             inlist, 1);
+  _fold_back<VALUETYPE>(force, force_, mapping, nloc, nall, 3);
+
+  EXPECT_EQ(force.size(), natoms * 3);
+  EXPECT_EQ(virial.size(), 9);
+
+  EXPECT_LT(fabs(ener - expected_tot_e), EPSILON);
+  for (int ii = 0; ii < natoms * 3; ++ii) {
+    EXPECT_LT(fabs(force[ii] - expected_f[ii]), EPSILON);
+  }
+  for (int ii = 0; ii < 3 * 3; ++ii) {
+    EXPECT_LT(fabs(virial[ii] - expected_tot_v[ii]), EPSILON);
+  }
+}
+
+TYPED_TEST(TestInferDeepPotDpa4PtExpt, cpu_lmp_nlist_atomic) {
+  using VALUETYPE = TypeParam;
+  std::vector<VALUETYPE>& coord = this->coord;
+  std::vector<int>& atype = this->atype;
+  std::vector<VALUETYPE>& box = this->box;
+  std::vector<VALUETYPE>& expected_e = this->expected_e;
+  std::vector<VALUETYPE>& expected_f = this->expected_f;
+  std::vector<VALUETYPE>& expected_v = this->expected_v;
+  int& natoms = this->natoms;
+  double& expected_tot_e = this->expected_tot_e;
+  std::vector<VALUETYPE>& expected_tot_v = this->expected_tot_v;
+  deepmd::DeepPot& dp = this->dp;
+  float rc = dp.cutoff();
+  int nloc = coord.size() / 3;
+  std::vector<VALUETYPE> coord_cpy;
+  std::vector<int> atype_cpy, mapping;
+  std::vector<std::vector<int> > nlist_data;
+  _build_nlist<VALUETYPE>(nlist_data, coord_cpy, atype_cpy, mapping, coord,
+                          atype, box, rc);
+  int nall = coord_cpy.size() / 3;
+  std::vector<int> ilist(nloc), numneigh(nloc);
+  std::vector<int*> firstneigh(nloc);
+  deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
+  convert_nlist(inlist, nlist_data);
+  inlist.mapping = mapping.data();
+  double ener;
+  std::vector<VALUETYPE> force_, atom_ener_, atom_vir_, virial;
+  std::vector<VALUETYPE> force, atom_ener, atom_vir;
+  dp.compute(ener, force_, virial, atom_ener_, atom_vir_, coord_cpy, atype_cpy,
+             box, nall - nloc, inlist, 0);
+  _fold_back<VALUETYPE>(force, force_, mapping, nloc, nall, 3);
+  _fold_back<VALUETYPE>(atom_ener, atom_ener_, mapping, nloc, nall, 1);
+  _fold_back<VALUETYPE>(atom_vir, atom_vir_, mapping, nloc, nall, 9);
+
+  EXPECT_EQ(force.size(), natoms * 3);
+  EXPECT_EQ(virial.size(), 9);
+  EXPECT_EQ(atom_ener.size(), natoms);
+  EXPECT_EQ(atom_vir.size(), natoms * 9);
+
+  EXPECT_LT(fabs(ener - expected_tot_e), EPSILON);
+  for (int ii = 0; ii < natoms * 3; ++ii) {
+    EXPECT_LT(fabs(force[ii] - expected_f[ii]), EPSILON);
+  }
+  for (int ii = 0; ii < 3 * 3; ++ii) {
+    EXPECT_LT(fabs(virial[ii] - expected_tot_v[ii]), EPSILON);
+  }
+  for (int ii = 0; ii < natoms; ++ii) {
+    EXPECT_LT(fabs(atom_ener[ii] - expected_e[ii]), EPSILON);
+  }
+  for (int ii = 0; ii < natoms * 9; ++ii) {
+    EXPECT_LT(fabs(atom_vir[ii] - expected_v[ii]), EPSILON);
+  }
+}
+
+TYPED_TEST(TestInferDeepPotDpa4PtExpt, cpu_lmp_nlist_2rc) {
+  using VALUETYPE = TypeParam;
+  std::vector<VALUETYPE>& coord = this->coord;
+  std::vector<int>& atype = this->atype;
+  std::vector<VALUETYPE>& box = this->box;
+  std::vector<VALUETYPE>& expected_f = this->expected_f;
+  int& natoms = this->natoms;
+  double& expected_tot_e = this->expected_tot_e;
+  std::vector<VALUETYPE>& expected_tot_v = this->expected_tot_v;
+  deepmd::DeepPot& dp = this->dp;
+  float rc = dp.cutoff();
+  int nloc = coord.size() / 3;
+  std::vector<VALUETYPE> coord_cpy;
+  std::vector<int> atype_cpy, mapping;
+  std::vector<std::vector<int> > nlist_data;
+  _build_nlist<VALUETYPE>(nlist_data, coord_cpy, atype_cpy, mapping, coord,
+                          atype, box, rc * 2);
+  int nall = coord_cpy.size() / 3;
+  std::vector<int> ilist(nloc), numneigh(nloc);
+  std::vector<int*> firstneigh(nloc);
+  deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
+  convert_nlist(inlist, nlist_data);
+  inlist.mapping = mapping.data();
+
+  double ener;
+  std::vector<VALUETYPE> force_(nall * 3, 0.0), virial(9, 0.0);
+  dp.compute(ener, force_, virial, coord_cpy, atype_cpy, box, nall - nloc,
+             inlist, 0);
+  std::vector<VALUETYPE> force;
+  _fold_back<VALUETYPE>(force, force_, mapping, nloc, nall, 3);
+
+  EXPECT_EQ(force.size(), natoms * 3);
+  EXPECT_EQ(virial.size(), 9);
+
+  EXPECT_LT(fabs(ener - expected_tot_e), EPSILON);
+  for (int ii = 0; ii < natoms * 3; ++ii) {
+    EXPECT_LT(fabs(force[ii] - expected_f[ii]), EPSILON);
+  }
+  for (int ii = 0; ii < 3 * 3; ++ii) {
+    EXPECT_LT(fabs(virial[ii] - expected_tot_v[ii]), EPSILON);
+  }
+}
+
+TYPED_TEST(TestInferDeepPotDpa4PtExpt, cpu_lmp_nlist_type_sel) {
+  using VALUETYPE = TypeParam;
+  std::vector<VALUETYPE>& coord = this->coord;
+  std::vector<int>& atype = this->atype;
+  std::vector<VALUETYPE>& box = this->box;
+  std::vector<VALUETYPE>& expected_f = this->expected_f;
+  int& natoms = this->natoms;
+  double& expected_tot_e = this->expected_tot_e;
+  std::vector<VALUETYPE>& expected_tot_v = this->expected_tot_v;
+  deepmd::DeepPot& dp = this->dp;
+  float rc = dp.cutoff();
+
+  // prepend nvir extra atoms of type 2; their expected force is zero
+  int nvir = 2;
+  std::vector<VALUETYPE> coord_vir(nvir * 3);
+  std::vector<int> atype_vir(nvir, 2);
+  for (int ii = 0; ii < nvir * 3; ++ii) {
+    coord_vir[ii] = coord[ii];
+  }
+  coord.insert(coord.begin(), coord_vir.begin(), coord_vir.end());
+  atype.insert(atype.begin(), atype_vir.begin(), atype_vir.end());
+  natoms += nvir;
+  std::vector<VALUETYPE> expected_f_vir(nvir * 3, 0.0);
+  expected_f.insert(expected_f.begin(), expected_f_vir.begin(),
+                    expected_f_vir.end());
+
+  // build nlist
+  int nloc = coord.size() / 3;
+  std::vector<VALUETYPE> coord_cpy;
+  std::vector<int> atype_cpy, mapping;
+  std::vector<std::vector<int> > nlist_data;
+  _build_nlist<VALUETYPE>(nlist_data, coord_cpy, atype_cpy, mapping, coord,
+                          atype, box, rc);
+  int nall = coord_cpy.size() / 3;
+  std::vector<int> ilist(nloc), numneigh(nloc);
+  std::vector<int*> firstneigh(nloc);
+  deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]);
+  convert_nlist(inlist, nlist_data);
+  inlist.mapping = mapping.data();
+
+  // dp compute
+  double ener;
+  std::vector<VALUETYPE> force_(nall * 3, 0.0), virial(9, 0.0);
+  dp.compute(ener, force_, virial, coord_cpy, atype_cpy, box, nall - nloc,
+             inlist, 0);
+  // fold back
+  std::vector<VALUETYPE> force;
+  _fold_back<VALUETYPE>(force, force_, mapping, nloc, nall, 3);
+
+  EXPECT_EQ(force.size(), natoms * 3);
+  EXPECT_EQ(virial.size(), 9);
+
+  EXPECT_LT(fabs(ener - expected_tot_e), EPSILON);
+  for (int ii = 0; ii < natoms * 3; ++ii) {
+    EXPECT_LT(fabs(force[ii] - expected_f[ii]), EPSILON);
+  }
+  for (int ii = 0; ii < 3 * 3; ++ii) {
+    EXPECT_LT(fabs(virial[ii] - expected_tot_v[ii]), EPSILON);
+  }
+}
+
+TYPED_TEST(TestInferDeepPotDpa4PtExpt, print_summary) {
+  deepmd::DeepPot& dp = this->dp;
+  dp.print_summary("");
+}
+
+// ---- NoPbc fixture (box = {}) ----
+template <class VALUETYPE>
+class TestInferDeepPotDpa4PtExptNoPbc : public ::testing::Test {
+ protected:
+  std::vector<VALUETYPE> coord = {12.83, 2.56, 2.18, 12.09, 2.87, 2.74,
+                                  00.25, 3.32, 1.68, 3.36,  3.00, 1.81,
+                                  3.51,  2.51, 2.60, 4.27,  3.22, 1.56};
+  std::vector<int> atype = {0, 1, 1, 0, 1, 1};
+  std::vector<VALUETYPE> box = {};
+  std::vector<VALUETYPE> expected_e;
+  std::vector<VALUETYPE> expected_f;
+  std::vector<VALUETYPE> expected_v;
+  int natoms;
+  double expected_tot_e;
+  std::vector<VALUETYPE> expected_tot_v;
+
+  // One model instance per typed suite, loaded once in SetUpTestSuite().
+  static deepmd::DeepPot dp;
+
+  static void SetUpTestSuite() {
+#if defined(BUILD_PYTORCH) && BUILD_PT_EXPT
+    dp.init(kModelPath);
+#endif
+  }
+
+  void SetUp() override {
+#if !defined(BUILD_PYTORCH) || !BUILD_PT_EXPT
+    GTEST_SKIP() << "Skip because PyTorch support is not enabled.";
+#endif
+    deepmd_test::ExpectedRef ref;
+    ref.load(kRefPath);
+    expected_e = ref.get<VALUETYPE>("nopbc", "expected_e");
+    expected_f = ref.get<VALUETYPE>("nopbc", "expected_f");
+    expected_v = ref.get<VALUETYPE>("nopbc", "expected_v");
+
+    natoms = expected_e.size();
+    EXPECT_EQ(natoms * 3, expected_f.size());
+    EXPECT_EQ(natoms * 9, expected_v.size());
+    // Totals are the sums of the per-atom reference energies and virials.
+    expected_tot_e = 0.;
+    expected_tot_v.assign(9, 0.);
+    for (int ii = 0; ii < natoms; ++ii) {
+      expected_tot_e += expected_e[ii];
+      for (int dd = 0; dd < 9; ++dd) {
+        expected_tot_v[dd] += expected_v[ii * 9 + dd];
+      }
+    }
+  }
+
+  void TearDown() override {}
+
+  static void TearDownTestSuite() { dp = deepmd::DeepPot(); }
+};
+
+template <class VALUETYPE>
+deepmd::DeepPot TestInferDeepPotDpa4PtExptNoPbc<VALUETYPE>::dp;
+
+TYPED_TEST_SUITE(TestInferDeepPotDpa4PtExptNoPbc, ValueTypes);
+
+TYPED_TEST(TestInferDeepPotDpa4PtExptNoPbc, cpu_build_nlist) {
+  using VALUETYPE = TypeParam;
+  std::vector<VALUETYPE>& coord = this->coord;
+  std::vector<int>& atype = this->atype;
+  std::vector<VALUETYPE>& box = this->box;
+  std::vector<VALUETYPE>& expected_f = this->expected_f;
+  int& natoms = this->natoms;
+  double& expected_tot_e = this->expected_tot_e;
+  std::vector<VALUETYPE>& expected_tot_v = this->expected_tot_v;
+  deepmd::DeepPot& dp = this->dp;
+  double ener;
+  std::vector<VALUETYPE> force, virial;
+  dp.compute(ener, force, virial, coord, atype, box);
+
+  EXPECT_EQ(force.size(), natoms * 3);
+  EXPECT_EQ(virial.size(), 9);
+
+  EXPECT_LT(fabs(ener - expected_tot_e), EPSILON);
+  for (int ii = 0; ii < natoms * 3; ++ii) {
+    EXPECT_LT(fabs(force[ii] - expected_f[ii]), EPSILON);
+  }
+  for (int ii = 0; ii < 3 * 3; ++ii) {
+    EXPECT_LT(fabs(virial[ii] - expected_tot_v[ii]), EPSILON);
+  }
+}
+
+TYPED_TEST(TestInferDeepPotDpa4PtExptNoPbc, cpu_lmp_nlist) {
+  using VALUETYPE = TypeParam;
+  std::vector<VALUETYPE>& coord = this->coord;
+  std::vector<int>& atype = this->atype;
+  std::vector<VALUETYPE>& box = this->box;
+  std::vector<VALUETYPE>& expected_f = this->expected_f;
+  int& natoms = this->natoms;
+  double& expected_tot_e = this->expected_tot_e;
+  std::vector<VALUETYPE>& expected_tot_v = this->expected_tot_v;
+  deepmd::DeepPot& dp = this->dp;
+  double ener;
+  std::vector<VALUETYPE> force, virial;
+
+  std::vector<std::vector<int> > nlist_data = {
+      {1, 2, 3, 4, 5}, {0, 2, 3, 4, 5}, {0, 1, 3, 4, 5},
+      {0, 1, 2, 4, 5}, {0, 1, 2, 3, 5}, {0, 1, 2, 3, 4}};
+  std::vector<int> ilist(natoms), numneigh(natoms);
+  std::vector<int*> firstneigh(natoms);
+  deepmd::InputNlist inlist(natoms, &ilist[0], &numneigh[0], &firstneigh[0]);
+  convert_nlist(inlist, nlist_data);
+  dp.compute(ener, force, virial, coord, atype, box, 0, inlist, 0);
+
+  EXPECT_EQ(force.size(), natoms * 3);
+  EXPECT_EQ(virial.size(), 9);
+
+  EXPECT_LT(fabs(ener - expected_tot_e), EPSILON);
+  for (int ii = 0; ii < natoms * 3; ++ii) {
+    EXPECT_LT(fabs(force[ii] - expected_f[ii]), EPSILON);
+  }
+  for (int ii = 0; ii < 3 * 3; ++ii) {
+    EXPECT_LT(fabs(virial[ii] - expected_tot_v[ii]), EPSILON);
+  }
+}

From ac85cdbe64c445b4be1f3f21c83cf357dfb3c511 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Tue, 16 Jun 2026 07:58:05 +0800
Subject: [PATCH 08/10] test(lmp): DPA4 .pt2 single-rank LAMMPS

---
 source/lmp/tests/test_lammps_dpa4_pt2.py | 309 +++++++++++++++++++++++
 1 file changed, 309 insertions(+)
 create mode 100644 source/lmp/tests/test_lammps_dpa4_pt2.py

diff --git a/source/lmp/tests/test_lammps_dpa4_pt2.py b/source/lmp/tests/test_lammps_dpa4_pt2.py
new file mode 100644
index 0000000000..d68680a996
--- /dev/null
+++ b/source/lmp/tests/test_lammps_dpa4_pt2.py
@@ -0,0 +1,309 @@
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Test LAMMPS with .pt2 (AOTInductor) DPA4 model, single-rank only.
+
+Mirrors the single-rank portion of test_lammps_dpa3_pt2.py for the DPA4
+descriptor (PR-3 Task 5).
+
+Scope / coverage
+----------------
+``deeppot_dpa4.pt2`` (generated by source/tests/infer/gen_dpa4.py) is a
+DUAL-artifact archive: ``has_comm_artifact=True`` and
+``has_message_passing=True`` (GNN). It is therefore the analogue of
+``deeppot_dpa3_mpi.pt2`` (use_loc_mapping=False) — gen_dpa4.py does NOT
+produce a separate use_loc_mapping=True archive, so the dpa3 cells A/B
+(which need the no-with-comm .pt2) have no DPA4 counterpart.
+
+Single-rank cells covered here (all on the with-comm archive):
+
+- ``test_pair_deepmd`` — atom_modify map yes. Dispatch picks the regular
+  path because nswap==0 (single-rank PBC has an empty CommBrick
+  sendlist); the regular artifact uses the correct mapping built from the
+  LAMMPS atom-map. pe/forces must match the DeepPot reference. Mirrors
+  dpa3 cell C (``test_pair_deepmd_with_comm``).
+- ``test_pair_deepmd_no_atom_map_fails_fast`` — atom-map left disabled
+  (``atom_modify`` omitted). The with-comm artifact is available, but
+  single-rank PBC has nswap==0, so border_op cannot fill ghost features and
+  the GNN model has no reliable mapping. Must fail fast with the actionable
+  ``atom_modify map yes`` message. Mirrors dpa3 cell D
+  (``test_pair_deepmd_with_comm_no_atom_map_fails_fast``).
+- virial / type_map / real-units / si-units variants mirror the dpa3
+  single-rank set.
+
+Deferred (NOT covered): live multi-rank parity. DPA4 multi-rank
+inference is out of PR-3 scope and has no mpi runner script. The C++
+with-comm dispatch is exercised for DPA4 only at the single-rank level
+here; multi-rank DPA4 is left to a follow-up.
+
+Tolerances match test_lammps_dpa3_pt2.py exactly (pytest.approx defaults
+for pe/forces; per-atom virial compared with pytest.approx).
+"""
+
+import os
+from pathlib import (
+    Path,
+)
+
+import constants
+import numpy as np
+import pytest
+from expected_ref import (
+    read_expected_ref,
+)
+from lammps import (
+    PyLammps,
+)
+from write_lmp_data import (
+    write_lmp_data,
+)
+
+# Dual-artifact (with-comm) DPA4 .pt2 — the only archive gen_dpa4.py emits.
+pb_file = Path(__file__).parent.parent.parent / "tests" / "infer" / "deeppot_dpa4.pt2"
+ref_file = (
+    Path(__file__).parent.parent.parent / "tests" / "infer" / "deeppot_dpa4.expected"
+)
+data_file = Path(__file__).parent / "data_dpa4_pt2.lmp"
+data_file_si = Path(__file__).parent / "data_dpa4_pt2.si"
+data_type_map_file = Path(__file__).parent / "data_type_map_dpa4_pt2.lmp"
+
+# Reference values written by source/tests/infer/gen_dpa4.py (PBC case).
+# Guarded with try/except because gen_dpa4.py only runs when PyTorch is built;
+# CI matrix entries that disable PyTorch (e.g. paddle-only) skip the tests in
+# setup_module but still import this module at pytest collection time.
+try:
+    _ref = read_expected_ref(ref_file)["pbc"]
+    expected_e = float(np.sum(_ref["expected_e"]))
+    expected_f = _ref["expected_f"].reshape(6, 3)
+    # LAMMPS uses opposite sign convention for virial vs DeepPot atom_virial.
+    expected_v = -_ref["expected_v"].reshape(6, 9)
+except FileNotFoundError:
+    expected_e = expected_f = expected_v = None
+
+# Same 6-atom water system as the DPA4 fixture (source/tests/infer/gen_dpa4.py)
+# and the DPA3 LAMMPS test: type_map [O, H], box 13x13x13.
+box = np.array([0, 13, 0, 13, 0, 13, 0, 0, 0])
+coord = np.array(
+    [
+        [12.83, 2.56, 2.18],
+        [12.09, 2.87, 2.74],
+        [0.25, 3.32, 1.68],
+        [3.36, 3.00, 1.81],
+        [3.51, 2.51, 2.60],
+        [4.27, 3.22, 1.56],
+    ]
+)
+type_OH = np.array([1, 2, 2, 1, 2, 2])
+type_HO = np.array([2, 1, 1, 2, 1, 1])
+
+
+def setup_module() -> None:
+    if os.environ.get("ENABLE_PYTORCH", "1") != "1":
+        pytest.skip(
+            "Skip test because PyTorch support is not enabled.",
+        )
+    write_lmp_data(box, coord, type_OH, data_file)
+    write_lmp_data(box, coord, type_HO, data_type_map_file)
+    write_lmp_data(
+        box * constants.dist_metal2si,
+        coord * constants.dist_metal2si,
+        type_OH,
+        data_file_si,
+    )
+
+
+def teardown_module() -> None:
+    """Remove the LAMMPS data files written by ``setup_module``, if present."""
+    for f in [data_file, data_type_map_file, data_file_si]:
+        f.unlink(missing_ok=True)
+
+
+def _lammps(data_file, units="metal", atom_map: str = "yes") -> PyLammps:
+    """Create a PyLammps instance for ``data_file``; caller must ``close()`` it."""
+    # Per unit system: (neighbor command, the two mass commands, timestep).
+    settings = {
+        "metal": ("2.0 bin", ("1 16", "2 2"), 0.0005),
+        "real": ("2.0 bin", ("1 16", "2 2"), 0.5),
+        "si": (
+            "2.0e-10 bin",
+            (
+                f"1 {16 * constants.mass_metal2si:.10e}",
+                f"2 {2 * constants.mass_metal2si:.10e}",
+            ),
+            5e-16,
+        ),
+    }
+    # Validate up front so a bad ``units`` never leaves an open LAMMPS instance.
+    if units not in settings:
+        raise ValueError("units should be metal, real, or si")
+    skin, masses, timestep = settings[units]
+    lammps = PyLammps()
+    lammps.units(units)
+    lammps.boundary("p p p")
+    lammps.atom_style("atomic")
+    # LAMMPS rejects ``atom_modify map no``; to leave the atom-map disabled,
+    # omit the command (disabled is the default for ``atom_style atomic``).
+    if atom_map != "no":
+        lammps.atom_modify(f"map {atom_map}")
+    lammps.neighbor(skin)
+    lammps.neigh_modify("every 10 delay 0 check no")
+    lammps.read_data(data_file.resolve())
+    for mass in masses:
+        lammps.mass(mass)
+    lammps.timestep(timestep)
+    lammps.fix("1 all nve")
+    return lammps
+
+
+
+@pytest.fixture
+def lammps():
+    lmp = _lammps(data_file=data_file)
+    yield lmp
+    lmp.close()
+
+
+@pytest.fixture
+def lammps_type_map():
+    lmp = _lammps(data_file=data_type_map_file)
+    yield lmp
+    lmp.close()
+
+
+@pytest.fixture
+def lammps_real():
+    lmp = _lammps(data_file=data_file, units="real")
+    yield lmp
+    lmp.close()
+
+
+@pytest.fixture
+def lammps_si():
+    lmp = _lammps(data_file=data_file_si, units="si")
+    yield lmp
+    lmp.close()
+
+
+@pytest.fixture
+def lammps_no_atom_map():
+    # Same as the default ``lammps`` fixture but with the LAMMPS atom-map
+    # left disabled (``_lammps`` omits the ``atom_modify`` command). Exercises
+    # the C++ fail-fast branch in DeepPotPTExpt::compute_inner — single-rank
+    # .pt2 GNN inference without an atom-map cannot resolve the ghost-to-local
+    # mapping, so the regular path throws with an actionable error message.
+    lmp = _lammps(data_file=data_file, atom_map="no")
+    yield lmp
+    lmp.close()
+
+
+def test_pair_deepmd(lammps) -> None:
+    # Single-rank with-comm archive + atom_modify map yes. Dispatch picks
+    # the regular path because nswap==0; the regular artifact uses the
+    # correct mapping built from the LAMMPS atom-map. Mirrors dpa3 cell C.
+    lammps.pair_style(f"deepmd {pb_file.resolve()}")
+    lammps.pair_coeff("* *")
+    lammps.run(0)
+    assert lammps.eval("pe") == pytest.approx(expected_e)
+    for ii in range(6):
+        assert lammps.atoms[ii].force == pytest.approx(
+            expected_f[lammps.atoms[ii].id - 1]
+        )
+    lammps.run(1)
+
+
+def test_pair_deepmd_no_atom_map_fails_fast(lammps_no_atom_map) -> None:
+    # Single-rank with-comm archive + atom-map disabled (``atom_modify``
+    # omitted). Single-rank PBC has an empty CommBrick sendlist (nswap==0), so
+    # border_op cannot fill ghost features and the GNN model has no reliable
+    # mapping. Must fail fast with the single-rank ``atom_modify map yes``
+    # message. Mirrors dpa3 cell D.
+    lammps_no_atom_map.pair_style(f"deepmd {pb_file.resolve()}")
+    lammps_no_atom_map.pair_coeff("* *")
+    with pytest.raises(Exception, match=r"atom_modify map yes"):
+        lammps_no_atom_map.run(0)
+
+
+def test_pair_deepmd_virial(lammps) -> None:
+    lammps.pair_style(f"deepmd {pb_file.resolve()}")
+    lammps.pair_coeff("* *")
+    lammps.compute("virial all centroid/stress/atom NULL pair")
+    # c_virial[ii + 1] feeds v_virial{jj}, compared to column jj of expected_v.
+    for ii, jj in enumerate([0, 4, 8, 3, 6, 7, 1, 2, 5]):
+        lammps.variable(f"virial{jj} atom c_virial[{ii + 1}]")
+    lammps.dump(
+        "1 all custom 1 dump id " + " ".join([f"v_virial{ii}" for ii in range(9)])
+    )
+    lammps.run(0)
+    assert lammps.eval("pe") == pytest.approx(expected_e)
+    for ii in range(6):
+        assert lammps.atoms[ii].force == pytest.approx(
+            expected_f[lammps.atoms[ii].id - 1]
+        )
+    idx_map = lammps.lmp.numpy.extract_atom("id")[: coord.shape[0]] - 1
+    for ii in range(9):
+        assert np.array(
+            lammps.variables[f"virial{ii}"].value
+        ) / constants.nktv2p == pytest.approx(expected_v[idx_map, ii])
+
+
+def test_pair_deepmd_type_map(lammps_type_map) -> None:
+    lammps_type_map.pair_style(f"deepmd {pb_file.resolve()}")
+    lammps_type_map.pair_coeff("* * H O")
+    lammps_type_map.run(0)
+    assert lammps_type_map.eval("pe") == pytest.approx(expected_e)
+    for ii in range(6):
+        assert lammps_type_map.atoms[ii].force == pytest.approx(
+            expected_f[lammps_type_map.atoms[ii].id - 1]
+        )
+    lammps_type_map.run(1)
+
+
+def test_pair_deepmd_real(lammps_real) -> None:
+    lammps_real.pair_style(f"deepmd {pb_file.resolve()}")
+    lammps_real.pair_coeff("* *")
+    lammps_real.run(0)
+    assert lammps_real.eval("pe") == pytest.approx(
+        expected_e * constants.ener_metal2real
+    )
+    for ii in range(6):
+        assert lammps_real.atoms[ii].force == pytest.approx(
+            expected_f[lammps_real.atoms[ii].id - 1] * constants.force_metal2real
+        )
+    lammps_real.run(1)
+
+
+def test_pair_deepmd_virial_real(lammps_real) -> None:
+    lammps_real.pair_style(f"deepmd {pb_file.resolve()}")
+    lammps_real.pair_coeff("* *")
+    lammps_real.compute("virial all centroid/stress/atom NULL pair")
+    # c_virial[ii + 1] feeds v_virial{jj}, compared to column jj of expected_v.
+    for ii, jj in enumerate([0, 4, 8, 3, 6, 7, 1, 2, 5]):
+        lammps_real.variable(f"virial{jj} atom c_virial[{ii + 1}]")
+    lammps_real.dump(
+        "1 all custom 1 dump id " + " ".join([f"v_virial{ii}" for ii in range(9)])
+    )
+    lammps_real.run(0)
+    assert lammps_real.eval("pe") == pytest.approx(
+        expected_e * constants.ener_metal2real
+    )
+    for ii in range(6):
+        assert lammps_real.atoms[ii].force == pytest.approx(
+            expected_f[lammps_real.atoms[ii].id - 1] * constants.force_metal2real
+        )
+    idx_map = lammps_real.lmp.numpy.extract_atom("id")[: coord.shape[0]] - 1
+    for ii in range(9):
+        assert np.array(
+            lammps_real.variables[f"virial{ii}"].value
+        ) / constants.nktv2p_real == pytest.approx(
+            expected_v[idx_map, ii] * constants.ener_metal2real
+        )
+
+
+def test_pair_deepmd_si(lammps_si) -> None:
+    lammps_si.pair_style(f"deepmd {pb_file.resolve()}")
+    lammps_si.pair_coeff("* *")
+    lammps_si.run(0)
+    assert lammps_si.eval("pe") == pytest.approx(expected_e * constants.ener_metal2si)
+    for ii in range(6):
+        assert lammps_si.atoms[ii].force == pytest.approx(
+            expected_f[lammps_si.atoms[ii].id - 1] * constants.force_metal2si
+        )
+    lammps_si.run(1)

From 5ed93a15c233a02e2f6cf6729ae0d2954d78fae7 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 16 Jun 2026 00:26:58 +0000
Subject: [PATCH 09/10] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 source/tests/pt_expt/infer/test_dpa4_deep_eval.py | 8 ++------
 source/tests/pt_expt/model/test_dpa4_interop.py   | 4 +---
 2 files changed, 3 insertions(+), 9 deletions(-)

diff --git a/source/tests/pt_expt/infer/test_dpa4_deep_eval.py b/source/tests/pt_expt/infer/test_dpa4_deep_eval.py
index 8151c30c17..bcade0d77e 100644
--- a/source/tests/pt_expt/infer/test_dpa4_deep_eval.py
+++ b/source/tests/pt_expt/infer/test_dpa4_deep_eval.py
@@ -39,12 +39,8 @@
 from deepmd.infer import (
     DeepPot,
 )
-from deepmd.pt.model.model import (
-    get_model as pt_get_model,
-)
-from deepmd.pt.train.wrapper import (
-    ModelWrapper as PtModelWrapper,
-)
+from deepmd.pt.model.model import get_model as pt_get_model
+from deepmd.pt.train.wrapper import ModelWrapper as PtModelWrapper
 from deepmd.pt_expt.utils.serialization import (
     deserialize_to_file,
 )
diff --git a/source/tests/pt_expt/model/test_dpa4_interop.py b/source/tests/pt_expt/model/test_dpa4_interop.py
index 533ebdd862..15771f20a6 100644
--- a/source/tests/pt_expt/model/test_dpa4_interop.py
+++ b/source/tests/pt_expt/model/test_dpa4_interop.py
@@ -19,9 +19,7 @@
 import pytest
 import torch
 
-from deepmd.pt.model.model import (
-    get_model as pt_get_model,
-)
+from deepmd.pt.model.model import get_model as pt_get_model
 from deepmd.pt_expt.model.ener_model import (
     EnergyModel,
 )

From dfddcd50a5e1a2de667155f9cd54746e3b727fcb Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Tue, 16 Jun 2026 13:25:57 +0800
Subject: [PATCH 10/10] fix: address CodeQL findings on DPA4 inference PR
 (warn-once set, imports)

---
 deepmd/dpmodel/descriptor/dpa4.py             | 19 ++++++++-----------
 deepmd/pt_expt/model/get_model.py             | 19 ++++++++-----------
 source/tests/pt_expt/descriptor/test_dpa4.py  | 15 +++++++++------
 .../tests/pt_expt/model/test_dpa4_export.py   |  6 +++---
 .../pt_expt/model/test_get_model_dpa4.py      |  4 ++--
 5 files changed, 30 insertions(+), 33 deletions(-)

diff --git a/deepmd/dpmodel/descriptor/dpa4.py b/deepmd/dpmodel/descriptor/dpa4.py
index 617560e409..5c61dbc388 100644
--- a/deepmd/dpmodel/descriptor/dpa4.py
+++ b/deepmd/dpmodel/descriptor/dpa4.py
@@ -39,9 +39,8 @@
 
 log = logging.getLogger(__name__)
 
-# Warn at most once per process that ``use_amp`` has no effect on the
-# dpmodel/pt_expt backend (it is a pt-runtime CUDA autocast switch).
-_USE_AMP_WARNED = False
+# Warn at most once per process for backend-ignored switches (keyed by name).
+_WARNED_ONCE: set[str] = set()
 
 from deepmd.dpmodel import (
     NativeOP,
@@ -331,14 +330,12 @@ def __init__(
         # pt-runtime-only switch (CUDA bfloat16 autocast during training);
         # accepted for config compatibility and ignored by dpmodel.
         self.use_amp = bool(use_amp)
-        if self.use_amp:
-            global _USE_AMP_WARNED
-            if not _USE_AMP_WARNED:
-                log.warning(
-                    "`use_amp` has no effect on the dpmodel/pt_expt backend "
-                    "(it is a pt-runtime CUDA autocast switch); ignoring it."
-                )
-                _USE_AMP_WARNED = True
+        if self.use_amp and "use_amp" not in _WARNED_ONCE:
+            log.warning(
+                "`use_amp` has no effect on the dpmodel/pt_expt backend "
+                "(it is a pt-runtime CUDA autocast switch); ignoring it."
+            )
+            _WARNED_ONCE.add("use_amp")
         self.trainable = bool(trainable)
         self.seed = seed
         self.random_gamma = bool(random_gamma)
diff --git a/deepmd/pt_expt/model/get_model.py b/deepmd/pt_expt/model/get_model.py
index e5b00c10d9..7efa904f23 100644
--- a/deepmd/pt_expt/model/get_model.py
+++ b/deepmd/pt_expt/model/get_model.py
@@ -47,9 +47,8 @@
 
 log = logging.getLogger(__name__)
 
-# Warn at most once per process that ``enable_tf32`` has no effect on the
-# pt_expt backend (which always runs at "highest" matmul precision).
-_ENABLE_TF32_WARNED = False
+# Warn at most once per process for backend-ignored switches (keyed by name).
+_WARNED_ONCE: set[str] = set()
 
 
 def _get_standard_model_components(
@@ -135,14 +134,12 @@ def get_sezm_model(data: dict) -> EnergyModel:
     ("highest") matmul precision, which is numerically conservative.
     """
     data = copy.deepcopy(data)
-    if bool(data.get("enable_tf32", True)):
-        global _ENABLE_TF32_WARNED
-        if not _ENABLE_TF32_WARNED:
-            log.warning(
-                "`enable_tf32` has no effect on the pt_expt backend, which "
-                "always runs at full ('highest') matmul precision; ignoring it."
-            )
-            _ENABLE_TF32_WARNED = True
+    if bool(data.get("enable_tf32", True)) and "enable_tf32" not in _WARNED_ONCE:
+        log.warning(
+            "`enable_tf32` has no effect on the pt_expt backend, which "
+            "always runs at full ('highest') matmul precision; ignoring it."
+        )
+        _WARNED_ONCE.add("enable_tf32")
     if "spin" in data:
         raise NotImplementedError(
             "Spin DPA4/SeZM models are not supported in the pt_expt backend."
diff --git a/source/tests/pt_expt/descriptor/test_dpa4.py b/source/tests/pt_expt/descriptor/test_dpa4.py
index fc5d265dcf..67de02c2b9 100644
--- a/source/tests/pt_expt/descriptor/test_dpa4.py
+++ b/source/tests/pt_expt/descriptor/test_dpa4.py
@@ -213,23 +213,26 @@ def test_trainable_false_freezes_all_parameters(self, via_deserialize) -> None:
 def test_use_amp_warns_once(use_amp, caplog, monkeypatch) -> None:
     import logging
 
-    import deepmd.dpmodel.descriptor.dpa4 as dpa4_mod
+    # The descriptor module's logger name is its module path; derive it from the
+    # already-imported class to avoid a second (mixed-style) import of the module.
+    logger_name = DPDescrptDPA4.__module__
 
-    # reset the warn-once flag so the assertion is deterministic regardless of
-    # test ordering (other constructions in the suite may have already warned)
-    monkeypatch.setattr(dpa4_mod, "_USE_AMP_WARNED", False)
+    # reset the warn-once set so the assertion is deterministic regardless of
+    # test ordering (other constructions in the suite may have already warned).
+    # String target lets pytest resolve the module without an import statement.
+    monkeypatch.setattr(f"{logger_name}._WARNED_ONCE", set())
 
     def _construct() -> None:
         make_descriptor(2, [10, 10], 4.0, use_amp=use_amp)
 
-    with caplog.at_level(logging.WARNING, logger=dpa4_mod.log.name):
+    with caplog.at_level(logging.WARNING, logger=logger_name):
         _construct()
     matches = [r for r in caplog.records if "use_amp" in r.getMessage()]
     if use_amp:
         assert len(matches) == 1, caplog.text
         # second construction must NOT warn again (warn-once per process)
         caplog.clear()
-        with caplog.at_level(logging.WARNING, logger=dpa4_mod.log.name):
+        with caplog.at_level(logging.WARNING, logger=logger_name):
             _construct()
         assert not [r for r in caplog.records if "use_amp" in r.getMessage()]
     else:
diff --git a/source/tests/pt_expt/model/test_dpa4_export.py b/source/tests/pt_expt/model/test_dpa4_export.py
index a82ee9dc4c..bb06b25574 100644
--- a/source/tests/pt_expt/model/test_dpa4_export.py
+++ b/source/tests/pt_expt/model/test_dpa4_export.py
@@ -29,9 +29,9 @@
 import numpy as np
 import pytest
 
-# Trigger registration of the deepmd_export::border_op opaque wrapper
-# (needed by the with-comm artifact at runtime / load time).
-import deepmd.pt_expt.utils.comm  # noqa: F401  # lgtm[py/unused-import]
+# Note: registration of the deepmd_export::border_op opaque wrapper (needed by
+# the with-comm artifact) happens inside ``deserialize_to_file`` via
+# ``ensure_comm_registered()``; no explicit comm import is required here.
 from deepmd.pt_expt.model.get_model import (
     get_model,
 )
diff --git a/source/tests/pt_expt/model/test_get_model_dpa4.py b/source/tests/pt_expt/model/test_get_model_dpa4.py
index 4df8611aab..210cec431f 100644
--- a/source/tests/pt_expt/model/test_get_model_dpa4.py
+++ b/source/tests/pt_expt/model/test_get_model_dpa4.py
@@ -230,9 +230,9 @@ def test_enable_tf32_warns_once(enable_tf32, caplog, monkeypatch) -> None:
     # ``import ...get_model as`` would shadow the submodule; load it explicitly
     gm_mod = importlib.import_module("deepmd.pt_expt.model.get_model")
 
-    # reset the warn-once flag so the assertion is deterministic regardless of
+    # reset the warn-once set so the assertion is deterministic regardless of
     # test ordering (other get_sezm_model calls may have already warned)
-    monkeypatch.setattr(gm_mod, "_ENABLE_TF32_WARNED", False)
+    monkeypatch.setattr(gm_mod, "_WARNED_ONCE", set())
 
     raw = _make_raw_model_config(enable_tf32=enable_tf32)