diff --git a/.devcontainer/build_cxx.sh b/.devcontainer/build_cxx.sh index 007c99b9e5..0d7d62d2ed 100755 --- a/.devcontainer/build_cxx.sh +++ b/.devcontainer/build_cxx.sh @@ -5,7 +5,7 @@ NPROC=$(nproc --all) SCRIPT_PATH=$(dirname $(realpath -s $0)) export CMAKE_PREFIX_PATH=${SCRIPT_PATH}/../libtorch -TENSORFLOW_ROOT=$(python -c 'import importlib,pathlib;print(pathlib.Path(importlib.util.find_spec("tensorflow").origin).parent)') +TENSORFLOW_ROOT=$(python -c 'import importlib.util,pathlib;print(pathlib.Path(importlib.util.find_spec("tensorflow").origin).parent)') mkdir -p ${SCRIPT_PATH}/../buildcxx/ cd ${SCRIPT_PATH}/../buildcxx/ @@ -13,7 +13,7 @@ cmake -D ENABLE_TENSORFLOW=ON \ -D ENABLE_PYTORCH=ON \ -D ENABLE_PADDLE=ON \ -D CMAKE_INSTALL_PREFIX=${SCRIPT_PATH}/../dp/ \ - -D LAMMPS_VERSION=stable_29Aug2024_update1 \ + -D LAMMPS_VERSION=stable_22Jul2025_update1 \ -D CMAKE_BUILD_TYPE=Debug \ -D BUILD_TESTING:BOOL=TRUE \ -D TENSORFLOW_ROOT=${TENSORFLOW_ROOT} \ diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 27c40bbe6a..85d67db2a9 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -11,7 +11,6 @@ "PATH": "${containerEnv:PATH}:${containerWorkspaceFolder}/.venv/bin", "DP_ENABLE_PYTORCH": "1", "DP_VARIANT": "cpu", - "LMP_CXX11_ABI_0": "1", "UV_EXTRA_INDEX_URL": "https://download.pytorch.org/whl/cpu" } } diff --git a/.devcontainer/download_libtorch.sh b/.devcontainer/download_libtorch.sh index 43a46805c5..8c1e480b7c 100755 --- a/.devcontainer/download_libtorch.sh +++ b/.devcontainer/download_libtorch.sh @@ -4,5 +4,5 @@ set -ev SCRIPT_PATH=$(dirname $(realpath -s $0)) cd ${SCRIPT_PATH}/.. 
-wget https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-2.7.0%2Bcpu.zip -O ~/libtorch.zip +wget https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-2.8.0%2Bcpu.zip -O ~/libtorch.zip unzip ~/libtorch.zip diff --git a/.devcontainer/gdb_lmp b/.devcontainer/gdb_lmp index 33e883780b..fc1c8b90fe 100755 --- a/.devcontainer/gdb_lmp +++ b/.devcontainer/gdb_lmp @@ -2,7 +2,7 @@ SCRIPT_PATH=$(dirname $(realpath -s $0)) export CMAKE_PREFIX_PATH=${SCRIPT_PATH}/../libtorch -TENSORFLOW_ROOT=$(python -c 'import importlib,pathlib;print(pathlib.Path(importlib.util.find_spec("tensorflow").origin).parent)') +TENSORFLOW_ROOT=$(python -c 'import importlib.util,pathlib;print(pathlib.Path(importlib.util.find_spec("tensorflow").origin).parent)') env LAMMPS_PLUGIN_PATH=${SCRIPT_PATH}/../dp/lib/deepmd_lmp \ LD_LIBRARY_PATH=${SCRIPT_PATH}/../dp/lib:${CMAKE_PREFIX_PATH}/lib:${TENSORFLOW_ROOT} \ diff --git a/.devcontainer/gdb_pytest_lmp b/.devcontainer/gdb_pytest_lmp index e27e40d4b0..d27587ec43 100755 --- a/.devcontainer/gdb_pytest_lmp +++ b/.devcontainer/gdb_pytest_lmp @@ -2,7 +2,7 @@ SCRIPT_PATH=$(dirname $(realpath -s $0))/../.. 
export CMAKE_PREFIX_PATH=${SCRIPT_PATH}/../libtorch -TENSORFLOW_ROOT=$(python -c 'import importlib,pathlib;print(pathlib.Path(importlib.util.find_spec("tensorflow").origin).parent)') +TENSORFLOW_ROOT=$(python -c 'import importlib.util,pathlib;print(pathlib.Path(importlib.util.find_spec("tensorflow").origin).parent)') env LAMMPS_PLUGIN_PATH=${SCRIPT_PATH}/../dp/lib/deepmd_lmp \ LD_LIBRARY_PATH=${SCRIPT_PATH}/../dp/lib:${CMAKE_PREFIX_PATH}/lib:${TENSORFLOW_ROOT} \ diff --git a/.devcontainer/lmp b/.devcontainer/lmp index c8e781aa57..524f99b326 100755 --- a/.devcontainer/lmp +++ b/.devcontainer/lmp @@ -2,7 +2,7 @@ SCRIPT_PATH=$(dirname $(realpath -s $0)) export CMAKE_PREFIX_PATH=${SCRIPT_PATH}/../libtorch -TENSORFLOW_ROOT=$(python -c 'import importlib,pathlib;print(pathlib.Path(importlib.util.find_spec("tensorflow").origin).parent)') +TENSORFLOW_ROOT=$(python -c 'import importlib.util,pathlib;print(pathlib.Path(importlib.util.find_spec("tensorflow").origin).parent)') env LAMMPS_PLUGIN_PATH=${SCRIPT_PATH}/../dp/lib/deepmd_lmp \ LD_LIBRARY_PATH=${SCRIPT_PATH}/../dp/lib:${CMAKE_PREFIX_PATH}/lib:${TENSORFLOW_ROOT} \ diff --git a/.devcontainer/pytest_lmp b/.devcontainer/pytest_lmp index 9371ba72d5..bb88da883f 100755 --- a/.devcontainer/pytest_lmp +++ b/.devcontainer/pytest_lmp @@ -2,7 +2,7 @@ SCRIPT_PATH=$(dirname $(realpath -s $0))/../.. 
export CMAKE_PREFIX_PATH=${SCRIPT_PATH}/../libtorch -TENSORFLOW_ROOT=$(python -c 'import importlib,pathlib;print(pathlib.Path(importlib.util.find_spec("tensorflow").origin).parent)') +TENSORFLOW_ROOT=$(python -c 'import importlib.util,pathlib;print(pathlib.Path(importlib.util.find_spec("tensorflow").origin).parent)') env LAMMPS_PLUGIN_PATH=${SCRIPT_PATH}/../dp/lib/deepmd_lmp \ LD_LIBRARY_PATH=${SCRIPT_PATH}/../dp/lib:${CMAKE_PREFIX_PATH}/lib:${TENSORFLOW_ROOT} \ diff --git a/.github/workflows/build_cc.yml b/.github/workflows/build_cc.yml index 59873c5101..81f0ed01be 100644 --- a/.github/workflows/build_cc.yml +++ b/.github/workflows/build_cc.yml @@ -2,6 +2,9 @@ on: push: branches-ignore: - "gh-readonly-queue/**" + - "copilot/**" + - "dependabot/**" + - "pre-commit-ci-update-config" pull_request: merge_group: concurrency: @@ -26,14 +29,14 @@ jobs: - variant: clang dp_variant: clang steps: - - uses: actions/checkout@v4 - - uses: actions/setup-python@v5 + - uses: actions/checkout@v5 + - uses: actions/setup-python@v6 with: python-version: '3.11' - uses: lukka/get-cmake@latest - run: python -m pip install uv - run: source/install/uv_with_retry.sh pip install --system tensorflow - - run: source/install/uv_with_retry.sh pip install --system 'torch==2.7' --index-url https://download.pytorch.org/whl/cpu + - run: source/install/uv_with_retry.sh pip install --system 'torch==2.8.*' --index-url https://download.pytorch.org/whl/cpu - run: | wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb \ && sudo dpkg -i cuda-keyring_1.0-1_all.deb \ diff --git a/.github/workflows/build_wheel.yml b/.github/workflows/build_wheel.yml index c520b2af59..21b0319c56 100644 --- a/.github/workflows/build_wheel.yml +++ b/.github/workflows/build_wheel.yml @@ -4,6 +4,9 @@ on: push: branches-ignore: - "gh-readonly-queue/**" + - "copilot/**" + - "dependabot/**" + - "pre-commit-ci-update-config" tags: - "v*" pull_request: @@ -54,7 +57,7 @@ jobs: 
platform_id: manylinux_aarch64 dp_variant: cpu steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 with: # https://github.com/pypa/setuptools_scm/issues/480 fetch-depth: 0 @@ -70,7 +73,7 @@ jobs: rm -rf .git if: matrix.dp_pkg_name == 'deepmd-kit-cu11' - name: Build wheels - uses: pypa/cibuildwheel@v2.23 + uses: pypa/cibuildwheel@v3.1 env: CIBW_BUILD_VERBOSITY: 1 CIBW_ARCHS: all @@ -87,7 +90,7 @@ jobs: name: Build source distribution runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 with: fetch-depth: 0 - name: Build sdist @@ -106,7 +109,7 @@ jobs: id-token: write if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v') steps: - - uses: actions/download-artifact@v4 + - uses: actions/download-artifact@v5 with: pattern: cibw-* path: dist @@ -128,8 +131,8 @@ jobs: steps: - name: Delete huge unnecessary tools folder run: rm -rf /opt/hostedtoolcache - - uses: actions/checkout@v4 - - uses: actions/download-artifact@v4 + - uses: actions/checkout@v5 + - uses: actions/download-artifact@v5 with: path: source/install/docker/dist pattern: cibw-*-manylinux_x86_64-cu${{ matrix.cuda_version }}* @@ -162,12 +165,12 @@ jobs: needs: [build_wheels, build_sdist] runs-on: ubuntu-latest steps: - - uses: actions/download-artifact@v4 + - uses: actions/download-artifact@v5 with: path: dist/packages pattern: cibw-* merge-multiple: true - - uses: actions/setup-python@v5 + - uses: actions/setup-python@v6 name: Install Python with: python-version: '3.11' @@ -176,7 +179,7 @@ jobs: ls dist/packages > package_list.txt dumb-pypi --output-dir dist --packages-url ../../packages --package-list package_list.txt --title "DeePMD-kit Developed Packages" - name: Upload Pages artifact - uses: actions/upload-pages-artifact@v3 + uses: actions/upload-pages-artifact@v4 with: path: dist deploy_pypi_index: diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index f8005ab831..2caf615852 100644 --- 
a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -4,6 +4,9 @@ on: push: branches-ignore: - "gh-readonly-queue/**" + - "copilot/**" + - "dependabot/**" + - "pre-commit-ci-update-config" pull_request: schedule: - cron: '45 2 * * 2' @@ -27,8 +30,8 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v4 - - uses: actions/setup-python@v5 + uses: actions/checkout@v5 + - uses: actions/setup-python@v6 with: python-version: '3.11' cache: 'pip' @@ -41,7 +44,7 @@ jobs: && sudo apt-get update \ && sudo apt-get -y install cuda-cudart-dev-12-2 cuda-nvcc-12-2 python -m pip install tensorflow - python -m pip install 'torch==2.7' --index-url https://download.pytorch.org/whl/cpu + python -m pip install 'torch==2.8.*' --index-url https://download.pytorch.org/whl/cpu env: DEBIAN_FRONTEND: noninteractive # Initializes the CodeQL tools for scanning. diff --git a/.github/workflows/copilot-setup-steps.yml b/.github/workflows/copilot-setup-steps.yml new file mode 100644 index 0000000000..0468501433 --- /dev/null +++ b/.github/workflows/copilot-setup-steps.yml @@ -0,0 +1,69 @@ +name: "Copilot Setup Steps" + +# Automatically run the setup steps when they are changed to allow for easy validation, and +# allow manual testing through the repository's "Actions" tab +on: + workflow_dispatch: + push: + branches-ignore: + - "copilot/**" + - "dependabot/**" + - "pre-commit-ci-update-config" + paths: + - .github/workflows/copilot-setup-steps.yml + pull_request: + paths: + - .github/workflows/copilot-setup-steps.yml + +jobs: + # The job MUST be called `copilot-setup-steps` or it will not be picked up by Copilot. + copilot-setup-steps: + runs-on: ubuntu-latest + + # Set the permissions to the lowest permissions possible needed for your steps. + # Copilot will be given its own token for its operations. + permissions: + # If you want to clone the repository as part of your setup steps, for example to install dependencies, you'll need the `contents: read` permission. 
If you don't clone the repository in your setup steps, Copilot will do this for you automatically after the steps complete. + contents: read + + # You can define any steps you want, and they will run before the agent starts. + # If you do not check out your code, Copilot will do this for you. + steps: + - name: Checkout code + uses: actions/checkout@v5 + + - name: Set up Python + uses: actions/setup-python@v6 + with: + python-version: "3.10" + + - name: Install uv + uses: astral-sh/setup-uv@v6 + with: + enable-cache: true + + - name: Create virtual environment + run: uv venv venv + + - name: Activate virtual environment + run: echo "VIRTUAL_ENV=$PWD/venv" >> $GITHUB_ENV && echo "$PWD/venv/bin" >> $GITHUB_PATH + + - name: Install base dependencies + run: uv pip install tensorflow-cpu + + - name: Install PyTorch + run: uv pip install torch --index-url https://download.pytorch.org/whl/cpu + + - name: Build Python package + run: uv pip install -e .[cpu,test] + + - name: Install pre-commit tools + run: uv tool install pre-commit + + - name: Install pre-commit hooks + run: pre-commit install --install-hooks + + - name: Verify installation + run: | + dp --version + python -c "import deepmd; import deepmd.tf; print('DeePMD-kit installation verified')" diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml index be43c5cff2..77f06528fe 100644 --- a/.github/workflows/labeler.yml +++ b/.github/workflows/labeler.yml @@ -9,6 +9,6 @@ jobs: pull-requests: write runs-on: ubuntu-latest steps: - - uses: actions/labeler@v5 + - uses: actions/labeler@v6 with: repo-token: "${{ secrets.GITHUB_TOKEN }}" diff --git a/.github/workflows/mirror_gitee.yml b/.github/workflows/mirror_gitee.yml index 2d090c0539..79d1ce11a5 100644 --- a/.github/workflows/mirror_gitee.yml +++ b/.github/workflows/mirror_gitee.yml @@ -1,6 +1,13 @@ name: Mirror to Gitee Repo -on: [ push, delete, create ] +on: + push: + branches-ignore: + - "copilot/**" + - "dependabot/**" + - 
"pre-commit-ci-update-config" + delete: + create: # Ensures that only one mirror task will run at a time. concurrency: diff --git a/.github/workflows/package_c.yml b/.github/workflows/package_c.yml index f5e4a97d56..dae85816f5 100644 --- a/.github/workflows/package_c.yml +++ b/.github/workflows/package_c.yml @@ -4,6 +4,9 @@ on: push: branches-ignore: - "gh-readonly-queue/**" + - "copilot/**" + - "dependabot/**" + - "pre-commit-ci-update-config" tags: - "v*" pull_request: @@ -22,10 +25,10 @@ jobs: tensorflow_version: "" filename: libdeepmd_c.tar.gz - tensorflow_build_version: "2.14" - tensorflow_version: ">=2.5.0rc0,<2.15" + tensorflow_version: ">=2.5.0,<2.15" filename: libdeepmd_c_cu11.tar.gz steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 with: fetch-depth: 0 - name: Package C library @@ -53,9 +56,9 @@ jobs: needs: [build_c] runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - name: Download artifact - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v5 with: pattern: libdeepmd_c-* merge-multiple: true diff --git a/.github/workflows/test_cc.yml b/.github/workflows/test_cc.yml index 3407a58d1e..956090fe0c 100644 --- a/.github/workflows/test_cc.yml +++ b/.github/workflows/test_cc.yml @@ -2,6 +2,9 @@ on: push: branches-ignore: - "gh-readonly-queue/**" + - "copilot/**" + - "dependabot/**" + - "pre-commit-ci-update-config" pull_request: merge_group: concurrency: @@ -16,23 +19,19 @@ jobs: matrix: check_memleak: [true, false] steps: - - uses: actions/checkout@v4 - - uses: actions/setup-python@v5 + - uses: actions/checkout@v5 + - uses: actions/setup-python@v6 with: python-version: '3.11' cache: 'pip' - - name: Setup MPI - uses: mpi4py/setup-mpi@v1 - with: - mpi: mpich - uses: lukka/get-cmake@latest - run: python -m pip install uv - name: Install Python dependencies run: | source/install/uv_with_retry.sh pip install --system tensorflow-cpu~=2.18.0 jax==0.5.0 - export TENSORFLOW_ROOT=$(python -c 
'import importlib,pathlib;print(pathlib.Path(importlib.util.find_spec("tensorflow").origin).parent)') - source/install/uv_with_retry.sh pip install --system -e .[cpu,test,lmp,jax] mpi4py - source/install/uv_with_retry.sh pip install --system 'torch==2.7' --index-url https://download.pytorch.org/whl/cpu + export TENSORFLOW_ROOT=$(python -c 'import importlib.util,pathlib;print(pathlib.Path(importlib.util.find_spec("tensorflow").origin).parent)') + source/install/uv_with_retry.sh pip install --system -e .[cpu,test,lmp,jax] mpi4py mpich + source/install/uv_with_retry.sh pip install --system 'torch==2.8.*' --index-url https://download.pytorch.org/whl/cpu - name: Convert models run: source/tests/infer/convert-models.sh # https://github.com/actions/runner-images/issues/9491 @@ -45,7 +44,6 @@ jobs: OMP_NUM_THREADS: 1 TF_INTRA_OP_PARALLELISM_THREADS: 1 TF_INTER_OP_PARALLELISM_THREADS: 1 - LMP_CXX11_ABI_0: 1 CMAKE_GENERATOR: Ninja CXXFLAGS: ${{ matrix.check_memleak && '-fsanitize=leak' || '' }} LSAN_OPTIONS: suppressions=${{ github.workspace }}/.github/workflows/suppr.txt @@ -54,13 +52,13 @@ jobs: cp ${{ github.workspace }}/source/build_tests/paddle_inference_install_dir/paddle/lib/*.so ${{ github.workspace }}/dp_test/lib/ cp ${{ github.workspace }}/source/build_tests/paddle_inference_install_dir/third_party/install/onednn/lib/* ${{ github.workspace }}/dp_test/lib/ cp ${{ github.workspace }}/source/build_tests/paddle_inference_install_dir/third_party/install/mklml/lib/* ${{ github.workspace }}/dp_test/lib/ + export LD_LIBRARY_PATH=${{ github.workspace }}/dp_test/lib:$LD_LIBRARY_PATH pytest --cov=deepmd source/lmp/tests env: OMP_NUM_THREADS: 1 TF_INTRA_OP_PARALLELISM_THREADS: 1 TF_INTER_OP_PARALLELISM_THREADS: 1 LAMMPS_PLUGIN_PATH: ${{ github.workspace }}/dp_test/lib/deepmd_lmp - LD_LIBRARY_PATH: ${{ github.workspace }}/dp_test/lib if: ${{ !matrix.check_memleak }} # test ipi - run: | diff --git a/.github/workflows/test_cuda.yml b/.github/workflows/test_cuda.yml index 
5f6b0e73ab..2523f71197 100644 --- a/.github/workflows/test_cuda.yml +++ b/.github/workflows/test_cuda.yml @@ -25,15 +25,11 @@ jobs: steps: - name: Make sudo and git work run: apt-get update && apt-get install -y sudo git - - uses: actions/checkout@v4 - - uses: actions/setup-python@v5 + - uses: actions/checkout@v5 + - uses: actions/setup-python@v6 with: python-version: '3.11' # cache: 'pip' - - name: Setup MPI - uses: mpi4py/setup-mpi@v1 - with: - mpi: mpich - name: Install wget and unzip run: apt-get update && apt-get install -y wget unzip - uses: lukka/get-cmake@latest @@ -50,7 +46,7 @@ jobs: - run: source/install/uv_with_retry.sh pip install --system "tensorflow~=2.18.0rc2" "torch~=2.7.0" "jax[cuda12]==0.5.0" - run: | export PYTORCH_ROOT=$(python -c 'import torch;print(torch.__path__[0])') - export TENSORFLOW_ROOT=$(python -c 'import importlib,pathlib;print(pathlib.Path(importlib.util.find_spec("tensorflow").origin).parent)') + export TENSORFLOW_ROOT=$(python -c 'import importlib.util,pathlib;print(pathlib.Path(importlib.util.find_spec("tensorflow").origin).parent)') pip install "paddlepaddle-gpu==3.0.0" -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ source/install/uv_with_retry.sh pip install --system -v -e .[gpu,test,lmp,cu12,torch,jax] mpi4py --reinstall-package deepmd-kit env: @@ -74,7 +70,6 @@ jobs: OMP_NUM_THREADS: 1 TF_INTRA_OP_PARALLELISM_THREADS: 1 TF_INTER_OP_PARALLELISM_THREADS: 1 - LMP_CXX11_ABI_0: 1 CMAKE_GENERATOR: Ninja DP_VARIANT: cuda DP_USE_MPICH2: 1 diff --git a/.github/workflows/test_python.yml b/.github/workflows/test_python.yml index 0190dc002c..81738dcfe9 100644 --- a/.github/workflows/test_python.yml +++ b/.github/workflows/test_python.yml @@ -2,6 +2,9 @@ on: push: branches-ignore: - "gh-readonly-queue/**" + - "copilot/**" + - "dependabot/**" + - "pre-commit-ci-update-config" pull_request: merge_group: concurrency: @@ -19,15 +22,15 @@ jobs: python: ["3.9", "3.12"] steps: - - uses: actions/checkout@v4 - - uses: 
actions/setup-python@v5 + - uses: actions/checkout@v5 + - uses: actions/setup-python@v6 with: python-version: ${{ matrix.python }} - run: python -m pip install -U uv - run: | source/install/uv_with_retry.sh pip install --system openmpi tensorflow-cpu~=2.18.0 source/install/uv_with_retry.sh pip install --system torch -i https://download.pytorch.org/whl/cpu - export TENSORFLOW_ROOT=$(python -c 'import tensorflow;print(tensorflow.__path__[0])') + export TENSORFLOW_ROOT=$(python -c 'import importlib.util,pathlib;print(pathlib.Path(importlib.util.find_spec("tensorflow").origin).parent)') export PYTORCH_ROOT=$(python -c 'import torch;print(torch.__path__[0])') source/install/uv_with_retry.sh pip install --system -e .[test,jax] mpi4py "jax==0.5.0;python_version>='3.10'" source/install/uv_with_retry.sh pip install --system -U setuptools @@ -40,7 +43,6 @@ jobs: # changes, setting `TENSORFLOW_ROOT`. DP_ENABLE_PYTORCH: 1 DP_BUILD_TESTING: 1 - UV_EXTRA_INDEX_URL: "https://pypi.anaconda.org/mpi4py/simple" HOROVOD_WITH_TENSORFLOW: 1 HOROVOD_WITHOUT_PYTORCH: 1 HOROVOD_WITH_MPI: 1 @@ -97,7 +99,7 @@ jobs: key: test2-durations-combined-${{ matrix.python }}-${{ github.sha }} restore-keys: test2-durations-combined-${{ matrix.python }} - name: Download artifacts - uses: actions/download-artifact@v4 + uses: actions/download-artifact@v5 with: pattern: split-${{ matrix.python }}-* merge-multiple: true diff --git a/.github/workflows/todo.yml b/.github/workflows/todo.yml index 2608bb1071..edde7b4be5 100644 --- a/.github/workflows/todo.yml +++ b/.github/workflows/todo.yml @@ -8,7 +8,7 @@ jobs: if: github.repository_owner == 'deepmodeling' runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - name: Run tdg-github-action uses: ribtoks/tdg-github-action@master with: diff --git a/.gitignore b/.gitignore index c574da757a..6382ecedd2 100644 --- a/.gitignore +++ b/.gitignore @@ -28,7 +28,7 @@ venv* .vscode/** _build _templates -API_CC +doc/API_CC/ doc/api_py/ 
doc/api_core/ doc/api_c/ @@ -50,3 +50,24 @@ uv.lock buildcxx/ node_modules/ *.bib.original + +# Coverage files +.coverage +.coverage.* + +# Test output files (temporary) +test_dp_test/ +test_dp_test_*.out +*_detail.out + +# Training and model output files +*.pth +*.ckpt* +checkpoint +lcurve.out +out.json +input_v2_compat.json +frozen_model.* + +# Test system directories +system/ diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index cee3d7f2ce..9fdd20cc81 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,7 +2,7 @@ # See https://pre-commit.com/hooks.html for more hooks repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v5.0.0 + rev: v6.0.0 hooks: - id: trailing-whitespace exclude: "^.+\\.pbtxt$" @@ -29,7 +29,7 @@ repos: exclude: ^source/3rdparty - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. - rev: v0.11.13 + rev: v0.13.2 hooks: - id: ruff args: ["--fix"] @@ -40,7 +40,7 @@ repos: types_or: [python, pyi, jupyter] - repo: https://github.com/pycqa/flake8 # flake8 cannot autofix - rev: "7.2.0" + rev: "7.3.0" hooks: - id: flake8 additional_dependencies: @@ -55,12 +55,12 @@ repos: exclude: ^source/3rdparty # Python inside docs - repo: https://github.com/asottile/blacken-docs - rev: 1.19.1 + rev: 1.20.0 hooks: - id: blacken-docs # C++ - repo: https://github.com/pre-commit/mirrors-clang-format - rev: v20.1.5 + rev: v21.1.2 hooks: - id: clang-format exclude: ^(source/3rdparty|source/lib/src/gpu/cudart/.+\.inc|.+\.ipynb$|.+\.json$) @@ -74,7 +74,7 @@ repos: exclude: ^(source/3rdparty|\.github/workflows|\.clang-format) # Shell - repo: https://github.com/scop/pre-commit-shfmt - rev: v3.11.0-1 + rev: v3.12.0-2 hooks: - id: shfmt # CMake @@ -84,7 +84,7 @@ repos: - id: cmake-format #- id: cmake-lint - repo: https://github.com/njzjz/mirrors-bibtex-tidy - rev: v1.13.0 + rev: v1.14.0 hooks: - id: bibtex-tidy args: @@ -154,7 +154,7 @@ repos: exclude: .pre-commit-config.yaml|source/lmp # customized pylint rules 
- repo: https://github.com/pylint-dev/pylint/ -    rev: v3.3.7 +    rev: v3.3.8 hooks: - id: pylint entry: env PYTHONPATH=source/checker pylint diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000000..9b07cbd9e0 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,191 @@ +# DeePMD-kit + +DeePMD-kit is a deep learning package for many-body potential energy representation and molecular dynamics. It supports multiple backends (TensorFlow, PyTorch, JAX, Paddle) and integrates with MD packages like LAMMPS, GROMACS, and i-PI. + +**Always reference these instructions first and fall back to search or bash commands only when you encounter unexpected information that does not match the info here.** + +## Working Effectively + +### Bootstrap and Build Repository + +- Create virtual environment: `uv venv venv && source venv/bin/activate` +- Install base dependencies: `uv pip install tensorflow-cpu` (takes ~8 seconds) +- Install PyTorch: `uv pip install torch --index-url https://download.pytorch.org/whl/cpu` (takes ~5 seconds) +- Build Python package: `uv pip install -e .[cpu,test]` -- takes 67 seconds. **NEVER CANCEL. Set timeout to 120+ seconds.** +- Build C++ components: `export TENSORFLOW_ROOT=$(python -c 'import importlib.util,pathlib;print(pathlib.Path(importlib.util.find_spec("tensorflow").origin).parent)')` then `export PYTORCH_ROOT=$(python -c 'import torch;print(torch.__path__[0])')` then `./source/install/build_cc.sh` -- takes 164 seconds. **NEVER CANCEL. Set timeout to 300+ seconds.** + +### Test Repository + +- Run single test: `pytest source/tests/tf/test_dp_test.py::TestDPTestEner::test_1frame -v` -- takes 8-13 seconds +- Run test subset: `pytest source/tests/tf/test_dp_test.py -v` -- takes 15 seconds. **NEVER CANCEL. 
Set timeout to 60+ seconds.** +- **Recommended: Use single test cases for validation instead of full test suite** -- full suite has 314 test files and takes 60+ minutes + +### Lint and Format Code + +- Install linter: `uv pip install ruff` +- Run linting: `ruff check .` -- takes <1 second +- Format code: `ruff format .` -- takes <1 second +- **Always run `ruff check .` and `ruff format .` before committing changes or the CI will fail.** + +### Training and Validation + +- Test TensorFlow training: `cd examples/water/se_e2_a && dp train input.json --skip-neighbor-stat` -- training proceeds but is slow on CPU +- Test PyTorch training: `cd examples/water/se_e2_a && dp --pt train input_torch.json --skip-neighbor-stat` -- training proceeds but is slow on CPU +- **Training examples are for validation only. Real training takes hours/days. Timeout training tests after 60 seconds for validation.** + +## Validation Scenarios + +**ALWAYS manually validate any new code through at least one complete scenario:** + +### Basic Functionality Validation + +1. **CLI Interface**: Run `dp --version` and `dp -h` to verify installation +2. **Python Interface**: Run `python -c "import deepmd; import deepmd.tf; print('Both interfaces work')"` +3. **Backend Selection**: Test `dp --tf -h`, `dp --pt -h`, `dp --jax -h`, `dp --pd -h` + +### Training Workflow Validation + +1. **TensorFlow Training**: `cd examples/water/se_e2_a && timeout 60 dp train input.json --skip-neighbor-stat` -- should start training and show decreasing loss +2. **PyTorch Training**: `cd examples/water/se_e2_a && timeout 60 dp --pt train input_torch.json --skip-neighbor-stat` -- should start training and show decreasing loss +3. **Verify training output**: Look for "batch X: trn: rmse" messages showing decreasing error values + +### Test-Based Validation + +1. **Core Tests**: `pytest source/tests/tf/test_dp_test.py::TestDPTestEner::test_1frame -v` -- should pass in ~10 seconds +2. 
**Multi-backend**: Test both TensorFlow and PyTorch components work + +## Common Commands and Timing + +### Repository Structure + +``` +ls -la [repo-root] +.github/ # GitHub workflows and templates +CONTRIBUTING.md # Contributing guide +README.md # Project overview +deepmd/ # Python package source +doc/ # Documentation +examples/ # Training examples and configurations +pyproject.toml # Python build configuration +source/ # C++ source code and tests +``` + +### Key Directories and Files + +- `deepmd/` - Main Python package with backend implementations +- `source/lib/` - Core C++ library +- `source/op/` - Backend-specific operators (TF, PyTorch, etc.) +- `source/api_cc/` - C++ API +- `source/api_c/` - C API +- `source/tests/` - Test suite (314 test files) +- `examples/water/se_e2_a/` - Basic water training example +- `examples/` - Various model examples for different scenarios + +### Common CLI Commands + +- `dp --version` - Show version information +- `dp -h` - Show help and available commands +- `dp train input.json` - Train a model (TensorFlow backend) +- `dp --pt train input.json` - Train with PyTorch backend +- `dp --jax train input.json` - Train with JAX backend +- `dp --pd train input.json` - Train with Paddle backend +- `dp test -m model.pb -s system/` - Test a trained model +- `dp freeze -o model.pb` - Freeze/save a model + +### Build Dependencies and Setup + +- **Python 3.9+** required +- **Virtual environment** strongly recommended: `uv venv venv && source venv/bin/activate` +- **Backend dependencies**: TensorFlow, PyTorch, JAX, or Paddle (install before building) +- **Build tools**: CMake, C++ compiler, scikit-build-core +- **C++ build requires**: Both TensorFlow and PyTorch installed, set TENSORFLOW_ROOT and PYTORCH_ROOT environment variables + +### Key Configuration Files + +- `pyproject.toml` - Python build configuration and dependencies +- `source/CMakeLists.txt` - C++ build configuration +- `examples/water/se_e2_a/input.json` - Basic TensorFlow 
training config +- `examples/water/se_e2_a/input_torch.json` - Basic PyTorch training config + +## Frequent Patterns and Time Expectations + +### Installation and Build Times + +- **Virtual environment setup**: ~5 seconds +- **TensorFlow CPU install**: ~8 seconds +- **PyTorch CPU install**: ~5 seconds +- **Python package build**: ~67 seconds. **NEVER CANCEL.** +- **C++ components build**: ~164 seconds. **NEVER CANCEL.** +- **Full fresh setup**: ~3-4 minutes total + +### Testing Times + +- **Single test**: 8-13 seconds +- **Test file (~5 tests)**: ~15 seconds +- **Backend-specific test subset**: 15-30 minutes. **Use sparingly.** +- **Full test suite (314 files)**: 60+ minutes. **Avoid in development - use single tests instead.** + +### Linting and Formatting + +- **Ruff check**: <1 second +- **Ruff format**: <1 second +- **Pre-commit hooks**: May have network issues, use individual tools + +### Commit Messages and PR Titles + +**All commit messages and PR titles must follow [conventional commit specification](https://www.conventionalcommits.org/):** + +- **Format**: `type(scope): description` +- **Common types**: `feat`, `fix`, `docs`, `style`, `refactor`, `test`, `chore`, `ci` +- **Examples**: + - `feat(core): add new descriptor type` + - `fix(tf): resolve memory leak in training` + - `docs: update installation guide` + - `ci: add workflow for testing` + +### Training and Model Operations + +- **Training initialization**: 10-30 seconds +- **Training per batch**: 0.1-1 second (CPU), much faster on GPU +- **Model freezing**: 5-15 seconds +- **Model testing**: 10-30 seconds + +## Backend-Specific Notes + +### TensorFlow Backend + +- **Default backend** when no flag specified +- **Configuration**: Use `input.json` format +- **Training**: `dp train input.json` +- **Requirements**: `tensorflow` or `tensorflow-cpu` package + +### PyTorch Backend + +- **Activation**: Use `--pt` flag or `export DP_BACKEND=pytorch` +- **Configuration**: Use `input_torch.json` format 
typically +- **Training**: `dp --pt train input_torch.json` +- **Requirements**: `torch` package + +### JAX Backend + +- **Activation**: Use `--jax` flag +- **Training**: `dp --jax train input.json` +- **Requirements**: `jax` and related packages +- **Note**: Experimental backend, may have limitations + +### Paddle Backend + +- **Activation**: Use `--pd` flag +- **Training**: `dp --pd train input.json` +- **Requirements**: `paddlepaddle` package +- **Note**: Less commonly used + +## Critical Warnings + +- **NEVER CANCEL BUILD OPERATIONS**: Python build takes 67 seconds, C++ build takes 164 seconds +- **USE SINGLE TESTS FOR VALIDATION**: Run individual tests instead of full test suite for faster feedback +- **ALWAYS activate virtual environment**: Build and runtime failures occur without proper environment +- **ALWAYS install backend dependencies first**: TensorFlow/PyTorch required before building C++ components +- **ALWAYS run linting before commits**: `ruff check . && ruff format .` or CI will fail +- **ALWAYS test both Python and C++ components**: Some features require both to be built +- **ALWAYS follow conventional commit format**: All commit messages and PR titles must use conventional commit specification (`type(scope): description`) diff --git a/backend/dp_backend.py b/backend/dp_backend.py index 81c3f20f19..e32d5db38b 100644 --- a/backend/dp_backend.py +++ b/backend/dp_backend.py @@ -1,6 +1,8 @@ # SPDX-License-Identifier: LGPL-3.0-or-later """A PEP-517 backend to find TensorFlow.""" +import os + from scikit_build_core import build as _orig from .find_pytorch import ( @@ -39,10 +41,15 @@ def __dir__() -> list[str]: def get_requires_for_build_wheel( config_settings: dict, ) -> list[str]: + if os.environ.get("CIBUILDWHEEL", "0") == "1": + cibw_deps = ["mpich"] + else: + cibw_deps = [] return ( _orig.get_requires_for_build_wheel(config_settings) + find_tensorflow()[1] + find_pytorch()[1] + + cibw_deps ) diff --git a/backend/find_pytorch.py 
b/backend/find_pytorch.py index c51c8905b8..11a967b305 100644 --- a/backend/find_pytorch.py +++ b/backend/find_pytorch.py @@ -116,7 +116,7 @@ def get_pt_requirement(pt_version: str = "") -> dict: cuda_version = os.environ.get("CUDA_VERSION", "12.2") if cuda_version == "" or cuda_version in SpecifierSet(">=12,<13"): # CUDA 12.2, cudnn 9 - pt_version = "2.7.0" + pt_version = "2.8.0" elif cuda_version in SpecifierSet(">=11,<12"): # CUDA 11.8, cudnn 8 pt_version = "2.3.1" @@ -124,6 +124,11 @@ def get_pt_requirement(pt_version: str = "") -> dict: raise RuntimeError("Unsupported CUDA version") from None if pt_version == "": pt_version = os.environ.get("PYTORCH_VERSION", "") + if os.environ.get("CIBUILDWHEEL", "0") == "1": + # PyTorch OP library is built against mpich + mpi_requirement = ["mpich"] + else: + mpi_requirement = [] return { "torch": [ @@ -134,7 +139,8 @@ def get_pt_requirement(pt_version: str = "") -> dict: f"torch=={Version(pt_version).base_version}.*" if pt_version != "" # https://github.com/pytorch/pytorch/commit/7e0c26d4d80d6602aed95cb680dfc09c9ce533bc - else "torch>=2.1.0" + else "torch>=2.1.0", + *mpi_requirement, ], } diff --git a/backend/find_tensorflow.py b/backend/find_tensorflow.py index 1fc3a8a6d9..a0a1e65aca 100644 --- a/backend/find_tensorflow.py +++ b/backend/find_tensorflow.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import os +import re import site from functools import ( lru_cache, @@ -56,6 +57,10 @@ def find_tensorflow() -> tuple[Optional[str], list[str]]: ) is not None: site_packages = Path(os.environ.get("TENSORFLOW_ROOT")).parent.absolute() tf_spec = FileFinder(str(site_packages)).find_spec("tensorflow") + if tf_spec is None: + raise RuntimeError( + f"cannot find TensorFlow under TENSORFLOW_ROOT {os.environ.get('TENSORFLOW_ROOT')}" + ) # get tensorflow spec # note: isolated build will not work for backend @@ -88,14 +93,14 @@ def find_tensorflow() -> tuple[Optional[str], list[str]]: # CUDA 12.2, cudnn 9 
requires.extend( [ - "tensorflow-cpu>=2.18.0rc0; platform_machine=='x86_64' and platform_system == 'Linux'", + "tensorflow-cpu>=2.18.0; platform_machine=='x86_64' and platform_system == 'Linux'", ] ) elif cuda_version in SpecifierSet(">=11,<12"): # CUDA 11.8, cudnn 8 requires.extend( [ - "tensorflow-cpu>=2.5.0rc0,<2.15; platform_machine=='x86_64' and platform_system == 'Linux'", + "tensorflow-cpu>=2.5.0,<2.15; platform_machine=='x86_64' and platform_system == 'Linux'", ] ) tf_version = "2.14.1" @@ -153,7 +158,8 @@ def get_tf_requirement(tf_version: str = "") -> dict: "tensorflow-cpu; platform_machine!='aarch64' and (platform_machine!='arm64' or platform_system != 'Darwin')", "tensorflow; platform_machine=='aarch64' or (platform_machine=='arm64' and platform_system == 'Darwin')", # https://github.com/tensorflow/tensorflow/issues/61830 - "tensorflow-cpu!=2.15.*; platform_system=='Windows'", + # Since TF 2.20, not all symbols are exported to the public API. + "tensorflow-cpu!=2.15.*,<2.20; platform_system=='Windows'", # https://github.com/h5py/h5py/issues/2408 "h5py>=3.6.0,!=3.11.0; platform_system=='Linux' and platform_machine=='aarch64'", *extra_requires, @@ -228,6 +234,22 @@ def get_tf_version(tf_path: Optional[Union[str, Path]]) -> str: patch = line.split()[-1] elif line.startswith("#define TF_VERSION_SUFFIX"): suffix = line.split()[-1].strip('"') + if None in (major, minor, patch): + # since TF 2.20.0, version information is no more contained in version.h + # try to read version from tools/pip_package/setup.py + # _VERSION = '2.20.0' + setup_file = Path(tf_path) / "tools" / "pip_package" / "setup.py" + if setup_file.exists(): + with open(setup_file) as f: + for line in f: + # parse with regex + match = re.search( + r"_VERSION[ \t]*=[ \t]*'(\d+)\.(\d+)\.(\d+)([a-zA-Z0-9]*)?'", + line, + ) + if match: + major, minor, patch, suffix = match.groups() + break if None in (major, minor, patch): raise RuntimeError("Failed to read TF version") return ".".join((major, 
minor, patch)) + suffix diff --git a/backend/read_env.py b/backend/read_env.py index f28e2917f3..482f9766a0 100644 --- a/backend/read_env.py +++ b/backend/read_env.py @@ -119,6 +119,7 @@ def get_argument_from_env() -> tuple[str, list, list, dict, str, str]: cmake_args = [ "-DBUILD_PY_IF:BOOL=TRUE", + f"-DCIBUILDWHEEL={os.environ.get('CIBUILDWHEEL', '0')}", *cmake_args, ] return ( diff --git a/deepmd/__init__.py b/deepmd/__init__.py index 14d933da11..bc351ee59b 100644 --- a/deepmd/__init__.py +++ b/deepmd/__init__.py @@ -8,6 +8,14 @@ modules for performance. """ +from typing import ( + TYPE_CHECKING, + Any, +) + +if TYPE_CHECKING: + from deepmd.infer import DeepPotential as DeepPotentialType + try: from deepmd._version import version as __version__ except ImportError: @@ -16,7 +24,7 @@ ) -def DeepPotential(*args, **kwargs): +def DeepPotential(*args: Any, **kwargs: Any) -> "DeepPotentialType": """Factory function that forwards to DeepEval (for compatibility and performance). diff --git a/deepmd/calculator.py b/deepmd/calculator.py index cb386d091a..83ebd73e4c 100644 --- a/deepmd/calculator.py +++ b/deepmd/calculator.py @@ -6,6 +6,7 @@ ) from typing import ( TYPE_CHECKING, + Any, ClassVar, Optional, Union, @@ -25,6 +26,9 @@ from ase import ( Atoms, ) + from ase.neighborlist import ( + NeighborList, + ) __all__ = ["DP"] @@ -85,9 +89,9 @@ def __init__( model: Union[str, "Path"], label: str = "DP", type_dict: Optional[dict[str, int]] = None, - neighbor_list=None, - head=None, - **kwargs, + neighbor_list: Optional["NeighborList"] = None, + head: Optional[str] = None, + **kwargs: Any, ) -> None: Calculator.__init__(self, label=label, **kwargs) self.dp = DeepPot( @@ -130,7 +134,12 @@ def calculate( cell = None symbols = self.atoms.get_chemical_symbols() atype = [self.type_dict[k] for k in symbols] - e, f, v = self.dp.eval(coords=coord, cells=cell, atom_types=atype)[:3] + + fparam = self.atoms.info.get("fparam", None) + aparam = self.atoms.info.get("aparam", None) + e, f, v 
= self.dp.eval( + coords=coord, cells=cell, atom_types=atype, fparam=fparam, aparam=aparam + )[:3] self.results["energy"] = e[0][0] # see https://gitlab.com/ase/ase/-/merge_requests/2485 self.results["free_energy"] = e[0][0] diff --git a/deepmd/common.py b/deepmd/common.py index 3ab936db67..26b655f876 100644 --- a/deepmd/common.py +++ b/deepmd/common.py @@ -35,6 +35,7 @@ ) __all__ = [ + "GLOBAL_NP_FLOAT_PRECISION", "VALID_ACTIVATION", "VALID_PRECISION", "expand_sys_str", @@ -44,7 +45,7 @@ "select_idx_map", ] -_PRECISION = Literal["default", "float16", "float32", "float64"] +_PRECISION = Literal["default", "float16", "bfloat16", "float32", "float64"] _ACTIVATION = Literal[ "relu", "relu6", @@ -249,16 +250,11 @@ def get_np_precision(precision: "_PRECISION") -> np.dtype: RuntimeError if string is invalid """ - if precision == "default": - return GLOBAL_NP_FLOAT_PRECISION - elif precision == "float16": - return np.float16 - elif precision == "float32": - return np.float32 - elif precision == "float64": - return np.float64 - else: - raise RuntimeError(f"{precision} is not a valid precision") + from deepmd.dpmodel.common import ( + get_xp_precision, + ) + + return get_xp_precision(np, precision) def symlink_prefix_files(old_prefix: str, new_prefix: str) -> None: @@ -288,7 +284,7 @@ def symlink_prefix_files(old_prefix: str, new_prefix: str) -> None: shutil.copyfile(ori_ff, new_ff) -def get_hash(obj) -> str: +def get_hash(obj: Any) -> str: """Get hash of object. 
Parameters diff --git a/deepmd/dpmodel/array_api.py b/deepmd/dpmodel/array_api.py index 723718529c..6b52ba7f3e 100644 --- a/deepmd/dpmodel/array_api.py +++ b/deepmd/dpmodel/array_api.py @@ -1,14 +1,24 @@ # SPDX-License-Identifier: LGPL-3.0-or-later """Utilities for the array API.""" +from typing import ( + Any, + Callable, + Optional, + Union, +) + import array_api_compat import numpy as np from packaging.version import ( Version, ) +# Type alias for array_api compatible arrays +Array = Union[np.ndarray, Any] # Any to support JAX, PyTorch, etc. arrays + -def support_array_api(version: str) -> callable: +def support_array_api(version: str) -> Callable: """Mark a function as supporting the specific version of the array API. Parameters @@ -18,7 +28,7 @@ def support_array_api(version: str) -> callable: Returns ------- - callable + Callable The decorated function Examples @@ -28,7 +38,7 @@ def support_array_api(version: str) -> callable: ... pass """ - def set_version(func: callable) -> callable: + def set_version(func: Callable) -> Callable: func.array_api_version = version return func @@ -39,7 +49,7 @@ def set_version(func: callable) -> callable: # but it hasn't been released yet # below is a pure Python implementation of take_along_axis # https://github.com/data-apis/array-api/issues/177#issuecomment-2093630595 -def xp_swapaxes(a, axis1, axis2): +def xp_swapaxes(a: Array, axis1: int, axis2: int) -> Array: xp = array_api_compat.array_namespace(a) axes = list(range(a.ndim)) axes[axis1], axes[axis2] = axes[axis2], axes[axis1] @@ -47,7 +57,7 @@ def xp_swapaxes(a, axis1, axis2): return a -def xp_take_along_axis(arr, indices, axis): +def xp_take_along_axis(arr: Array, indices: Array, axis: int) -> Array: xp = array_api_compat.array_namespace(arr) if Version(xp.__array_api_version__) >= Version("2024.12"): # see: https://github.com/data-apis/array-api-strict/blob/d086c619a58f35c38240592ef994aa19ca7beebc/array_api_strict/_indexing_functions.py#L30-L39 @@ -60,7 +70,7 @@ def 
xp_take_along_axis(arr, indices, axis): shape = list(arr.shape) shape.pop(-1) - shape = [*shape, n] + shape = (*shape, n) arr = xp.reshape(arr, (-1,)) if n != 0: @@ -76,7 +86,7 @@ def xp_take_along_axis(arr, indices, axis): return xp_swapaxes(out, axis, -1) -def xp_scatter_sum(input, dim, index: np.ndarray, src: np.ndarray) -> np.ndarray: +def xp_scatter_sum(input: Array, dim: int, index: Array, src: Array) -> Array: """Reduces all values from the src tensor to the indices specified in the index tensor.""" # jax only if array_api_compat.is_jax_array(input): @@ -94,7 +104,7 @@ def xp_scatter_sum(input, dim, index: np.ndarray, src: np.ndarray) -> np.ndarray raise NotImplementedError("Only JAX arrays are supported.") -def xp_add_at(x, indices, values): +def xp_add_at(x: Array, indices: Array, values: Array) -> Array: """Adds values to the specified indices of x in place or returns new x (for JAX).""" xp = array_api_compat.array_namespace(x, indices, values) if array_api_compat.is_numpy_array(x): @@ -115,7 +125,7 @@ def xp_add_at(x, indices, values): return x -def xp_bincount(x, weights=None, minlength=0): +def xp_bincount(x: Array, weights: Optional[Array] = None, minlength: int = 0) -> Array: """Counts the number of occurrences of each value in x.""" xp = array_api_compat.array_namespace(x) if array_api_compat.is_numpy_array(x) or array_api_compat.is_jax_array(x): diff --git a/deepmd/dpmodel/atomic_model/base_atomic_model.py b/deepmd/dpmodel/atomic_model/base_atomic_model.py index eb95886598..f9b9f0a15e 100644 --- a/deepmd/dpmodel/atomic_model/base_atomic_model.py +++ b/deepmd/dpmodel/atomic_model/base_atomic_model.py @@ -1,12 +1,16 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import math from typing import ( + Any, Optional, ) import array_api_compat import numpy as np +from deepmd.dpmodel.array_api import ( + Array, +) from deepmd.dpmodel.common import ( NativeOP, to_numpy_array, @@ -42,7 +46,7 @@ def __init__( atom_exclude_types: list[int] = [], 
pair_exclude_types: list[tuple[int, int]] = [], rcond: Optional[float] = None, - preset_out_bias: Optional[dict[str, np.ndarray]] = None, + preset_out_bias: Optional[dict[str, Array]] = None, ) -> None: super().__init__() self.type_map = type_map @@ -68,7 +72,7 @@ def init_out_stat(self) -> None: self.out_bias = out_bias_data self.out_std = out_std_data - def __setitem__(self, key, value) -> None: + def __setitem__(self, key: str, value: Array) -> None: if key in ["out_bias"]: self.out_bias = value elif key in ["out_std"]: @@ -76,7 +80,7 @@ def __setitem__(self, key, value) -> None: else: raise KeyError(key) - def __getitem__(self, key): + def __getitem__(self, key: str) -> Array: if key in ["out_bias"]: return self.out_bias elif key in ["out_std"]: @@ -88,6 +92,10 @@ def get_type_map(self) -> list[str]: """Get the type map.""" return self.type_map + def has_default_fparam(self) -> bool: + """Check if the model has default frame parameters.""" + return False + def reinit_atom_exclude( self, exclude_types: list[int] = [], @@ -125,7 +133,7 @@ def atomic_output_def(self) -> FittingOutputDef: ) def change_type_map( - self, type_map: list[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat: Optional[Any] = None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. 
@@ -143,13 +151,13 @@ def change_type_map( def forward_common_atomic( self, - extended_coord: np.ndarray, - extended_atype: np.ndarray, - nlist: np.ndarray, - mapping: Optional[np.ndarray] = None, - fparam: Optional[np.ndarray] = None, - aparam: Optional[np.ndarray] = None, - ) -> dict[str, np.ndarray]: + extended_coord: Array, + extended_atype: Array, + nlist: Array, + mapping: Optional[Array] = None, + fparam: Optional[Array] = None, + aparam: Optional[Array] = None, + ) -> dict[str, Array]: """Common interface for atomic inference. This method accept extended coordinates, extended atom typs, neighbor list, @@ -219,13 +227,13 @@ def forward_common_atomic( def call( self, - extended_coord: np.ndarray, - extended_atype: np.ndarray, - nlist: np.ndarray, - mapping: Optional[np.ndarray] = None, - fparam: Optional[np.ndarray] = None, - aparam: Optional[np.ndarray] = None, - ) -> dict[str, np.ndarray]: + extended_coord: Array, + extended_atype: Array, + nlist: Array, + mapping: Optional[Array] = None, + fparam: Optional[Array] = None, + aparam: Optional[Array] = None, + ) -> dict[str, Array]: return self.forward_common_atomic( extended_coord, extended_atype, @@ -260,9 +268,9 @@ def deserialize(cls, data: dict) -> "BaseAtomicModel": def apply_out_stat( self, - ret: dict[str, np.ndarray], - atype: np.ndarray, - ): + ret: dict[str, Array], + atype: Array, + ) -> dict[str, Array]: """Apply the stat to each atomic output. The developer may override the method to define how the bias is applied to the atomic output of the model. 
@@ -305,7 +313,7 @@ def _get_bias_index( def _fetch_out_stat( self, keys: list[str], - ) -> tuple[dict[str, np.ndarray], dict[str, np.ndarray]]: + ) -> tuple[dict[str, Array], dict[str, Array]]: ret_bias = {} ret_std = {} ntypes = self.get_ntypes() diff --git a/deepmd/dpmodel/atomic_model/dipole_atomic_model.py b/deepmd/dpmodel/atomic_model/dipole_atomic_model.py index 00428f4e95..7cfa24526a 100644 --- a/deepmd/dpmodel/atomic_model/dipole_atomic_model.py +++ b/deepmd/dpmodel/atomic_model/dipole_atomic_model.py @@ -1,6 +1,17 @@ # SPDX-License-Identifier: LGPL-3.0-or-later -import numpy as np +from typing import ( + Any, +) +from deepmd.dpmodel.array_api import ( + Array, +) +from deepmd.dpmodel.descriptor.base_descriptor import ( + BaseDescriptor, +) +from deepmd.dpmodel.fitting.base_fitting import ( + BaseFitting, +) from deepmd.dpmodel.fitting.dipole_fitting import ( DipoleFitting, ) @@ -11,7 +22,13 @@ class DPDipoleAtomicModel(DPAtomicModel): - def __init__(self, descriptor, fitting, type_map, **kwargs): + def __init__( + self, + descriptor: BaseDescriptor, + fitting: BaseFitting, + type_map: list[str], + **kwargs: Any, + ) -> None: if not isinstance(fitting, DipoleFitting): raise TypeError( "fitting must be an instance of DipoleFitting for DPDipoleAtomicModel" @@ -20,8 +37,8 @@ def __init__(self, descriptor, fitting, type_map, **kwargs): def apply_out_stat( self, - ret: dict[str, np.ndarray], - atype: np.ndarray, - ): + ret: dict[str, Array], + atype: Array, + ) -> dict[str, Array]: # dipole not applying bias return ret diff --git a/deepmd/dpmodel/atomic_model/dos_atomic_model.py b/deepmd/dpmodel/atomic_model/dos_atomic_model.py index 7ef6d10ebf..ce457cb472 100644 --- a/deepmd/dpmodel/atomic_model/dos_atomic_model.py +++ b/deepmd/dpmodel/atomic_model/dos_atomic_model.py @@ -1,4 +1,14 @@ # SPDX-License-Identifier: LGPL-3.0-or-later +from typing import ( + Any, +) + +from deepmd.dpmodel.descriptor.base_descriptor import ( + BaseDescriptor, +) +from 
deepmd.dpmodel.fitting.base_fitting import ( + BaseFitting, +) from deepmd.dpmodel.fitting.dos_fitting import ( DOSFittingNet, ) @@ -9,7 +19,13 @@ class DPDOSAtomicModel(DPAtomicModel): - def __init__(self, descriptor, fitting, type_map, **kwargs): + def __init__( + self, + descriptor: BaseDescriptor, + fitting: BaseFitting, + type_map: list[str], + **kwargs: Any, + ) -> None: if not isinstance(fitting, DOSFittingNet): raise TypeError( "fitting must be an instance of DOSFittingNet for DPDOSAtomicModel" diff --git a/deepmd/dpmodel/atomic_model/dp_atomic_model.py b/deepmd/dpmodel/atomic_model/dp_atomic_model.py index 2fa072cc78..60db302667 100644 --- a/deepmd/dpmodel/atomic_model/dp_atomic_model.py +++ b/deepmd/dpmodel/atomic_model/dp_atomic_model.py @@ -1,10 +1,12 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( + Any, Optional, ) -import numpy as np - +from deepmd.dpmodel.array_api import ( + Array, +) from deepmd.dpmodel.descriptor.base_descriptor import ( BaseDescriptor, ) @@ -41,10 +43,10 @@ class DPAtomicModel(BaseAtomicModel): def __init__( self, - descriptor, - fitting, + descriptor: BaseDescriptor, + fitting: BaseFitting, type_map: list[str], - **kwargs, + **kwargs: Any, ) -> None: super().__init__(type_map, **kwargs) self.type_map = type_map @@ -65,7 +67,7 @@ def get_sel(self) -> list[int]: """Get the neighbor selection.""" return self.descriptor.get_sel() - def set_case_embd(self, case_idx: int): + def set_case_embd(self, case_idx: int) -> None: """ Set the case embedding of this atomic model by the given case_idx, typically concatenated with the output of the descriptor and fed into the fitting net. 
@@ -125,13 +127,13 @@ def enable_compression( def forward_atomic( self, - extended_coord: np.ndarray, - extended_atype: np.ndarray, - nlist: np.ndarray, - mapping: Optional[np.ndarray] = None, - fparam: Optional[np.ndarray] = None, - aparam: Optional[np.ndarray] = None, - ) -> dict[str, np.ndarray]: + extended_coord: Array, + extended_atype: Array, + nlist: Array, + mapping: Optional[Array] = None, + fparam: Optional[Array] = None, + aparam: Optional[Array] = None, + ) -> dict[str, Array]: """Models' atomic predictions. Parameters @@ -175,7 +177,7 @@ def forward_atomic( return ret def change_type_map( - self, type_map: list[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat: Optional[Any] = None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. @@ -213,7 +215,7 @@ def serialize(self) -> dict: """The base fitting class.""" @classmethod - def deserialize(cls, data) -> "DPAtomicModel": + def deserialize(cls, data: dict[str, Any]) -> "DPAtomicModel": data = data.copy() check_version_compatibility(data.pop("@version", 1), 2, 2) data.pop("@class") @@ -233,6 +235,10 @@ def get_dim_aparam(self) -> int: """Get the number (dimension) of atomic parameters of this atomic model.""" return self.fitting.get_dim_aparam() + def has_default_fparam(self) -> bool: + """Check if the model has default frame parameters.""" + return self.fitting.has_default_fparam() + def get_sel_type(self) -> list[int]: """Get the selected atom types of this model. 
diff --git a/deepmd/dpmodel/atomic_model/energy_atomic_model.py b/deepmd/dpmodel/atomic_model/energy_atomic_model.py index 4f9f8ec005..6deb87662d 100644 --- a/deepmd/dpmodel/atomic_model/energy_atomic_model.py +++ b/deepmd/dpmodel/atomic_model/energy_atomic_model.py @@ -1,4 +1,8 @@ # SPDX-License-Identifier: LGPL-3.0-or-later +from typing import ( + Any, +) + from deepmd.dpmodel.fitting.ener_fitting import ( EnergyFittingNet, InvarFitting, @@ -10,7 +14,9 @@ class DPEnergyAtomicModel(DPAtomicModel): - def __init__(self, descriptor, fitting, type_map, **kwargs): + def __init__( + self, descriptor: Any, fitting: Any, type_map: list[str], **kwargs: Any + ) -> None: if not ( isinstance(fitting, EnergyFittingNet) or isinstance(fitting, InvarFitting) ): diff --git a/deepmd/dpmodel/atomic_model/linear_atomic_model.py b/deepmd/dpmodel/atomic_model/linear_atomic_model.py index ce0f1d0cb9..ed63bb2db7 100644 --- a/deepmd/dpmodel/atomic_model/linear_atomic_model.py +++ b/deepmd/dpmodel/atomic_model/linear_atomic_model.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( + Any, Optional, Union, ) @@ -7,6 +8,9 @@ import array_api_compat import numpy as np +from deepmd.dpmodel.array_api import ( + Array, +) from deepmd.dpmodel.utils.nlist import ( build_multiple_neighbor_list, get_multiple_nlist_key, @@ -51,7 +55,7 @@ def __init__( self, models: list[BaseAtomicModel], type_map: list[str], - **kwargs, + **kwargs: Any, ) -> None: super().__init__(type_map, **kwargs) super().init_out_stat() @@ -111,7 +115,7 @@ def get_type_map(self) -> list[str]: return self.type_map def change_type_map( - self, type_map: list[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat: Optional[Any] = None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. 
If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. @@ -134,7 +138,7 @@ def get_model_rcuts(self) -> list[float]: def get_sel(self) -> list[int]: return [max([model.get_nsel() for model in self.models])] - def set_case_embd(self, case_idx: int): + def set_case_embd(self, case_idx: int) -> None: """ Set the case embedding of this atomic model by the given case_idx, typically concatenated with the output of the descriptor and fed into the fitting net. @@ -150,7 +154,7 @@ def get_model_sels(self) -> list[Union[int, list[int]]]: """Get the sels for each individual models.""" return [model.get_sel() for model in self.models] - def _sort_rcuts_sels(self) -> tuple[list[float], list[int]]: + def _sort_rcuts_sels(self) -> tuple[tuple[Array, Array], list[int]]: # sort the pair of rcut and sels in ascending order, first based on sel, then on rcut. zipped = sorted( zip(self.get_model_rcuts(), self.get_model_nsels()), @@ -192,13 +196,13 @@ def enable_compression( def forward_atomic( self, - extended_coord, - extended_atype, - nlist, - mapping: Optional[np.ndarray] = None, - fparam: Optional[np.ndarray] = None, - aparam: Optional[np.ndarray] = None, - ) -> dict[str, np.ndarray]: + extended_coord: Array, + extended_atype: Array, + nlist: Array, + mapping: Optional[Array] = None, + fparam: Optional[Array] = None, + aparam: Optional[Array] = None, + ) -> dict[str, Array]: """Return atomic prediction. Parameters @@ -262,7 +266,7 @@ def forward_atomic( return fit_ret @staticmethod - def remap_atype(ori_map: list[str], new_map: list[str]) -> np.ndarray: + def remap_atype(ori_map: list[str], new_map: list[str]) -> Array: """ This method is used to map the atype from the common type_map to the original type_map of indivial AtomicModels. 
@@ -325,10 +329,10 @@ def deserialize(cls, data: dict) -> "LinearEnergyAtomicModel": def _compute_weight( self, - extended_coord: np.ndarray, - extended_atype: np.ndarray, - nlists_: list[np.ndarray], - ) -> list[np.ndarray]: + extended_coord: Array, + extended_atype: Array, + nlists_: list[Array], + ) -> list[Array]: """This should be a list of user defined weights that matches the number of models to be combined.""" xp = array_api_compat.array_namespace(extended_coord, extended_atype, nlists_) nmodels = len(self.models) @@ -398,7 +402,7 @@ def __init__( sw_rmax: float, type_map: list[str], smin_alpha: Optional[float] = 0.1, - **kwargs, + **kwargs: Any, ) -> None: models = [dp_model, zbl_model] kwargs["models"] = models @@ -424,7 +428,7 @@ def serialize(self) -> dict: return dd @classmethod - def deserialize(cls, data) -> "DPZBLLinearEnergyAtomicModel": + def deserialize(cls, data: Any) -> "DPZBLLinearEnergyAtomicModel": data = data.copy() check_version_compatibility(data.pop("@version", 1), 2, 2) models = [ @@ -436,7 +440,7 @@ def deserialize(cls, data) -> "DPZBLLinearEnergyAtomicModel": data.pop("type", None) return super().deserialize(data) - def set_case_embd(self, case_idx: int): + def set_case_embd(self, case_idx: int) -> None: """ Set the case embedding of this atomic model by the given case_idx, typically concatenated with the output of the descriptor and fed into the fitting net. @@ -446,15 +450,15 @@ def set_case_embd(self, case_idx: int): def _compute_weight( self, - extended_coord: np.ndarray, - extended_atype: np.ndarray, - nlists_: list[np.ndarray], - ) -> list[np.ndarray]: + extended_coord: Array, + extended_atype: Array, + nlists_: list[Array], + ) -> list[Array]: """ZBL weight. Returns ------- - list[np.ndarray] + list[Array] the atomic ZBL weight for interpolation. 
(nframes, nloc, 1) """ assert self.sw_rmax > self.sw_rmin, ( diff --git a/deepmd/dpmodel/atomic_model/make_base_atomic_model.py b/deepmd/dpmodel/atomic_model/make_base_atomic_model.py index 01caa7cd64..fac18c2744 100644 --- a/deepmd/dpmodel/atomic_model/make_base_atomic_model.py +++ b/deepmd/dpmodel/atomic_model/make_base_atomic_model.py @@ -4,6 +4,7 @@ abstractmethod, ) from typing import ( + Any, Optional, ) @@ -17,9 +18,9 @@ def make_base_atomic_model( - t_tensor, + t_tensor: type, fwd_method_name: str = "forward_atomic", -): +) -> type: """Make the base class for the atomic model. Parameters @@ -147,12 +148,12 @@ def serialize(self) -> dict: @classmethod @abstractmethod - def deserialize(cls, data: dict): + def deserialize(cls, data: dict) -> Any: pass @abstractmethod def change_type_map( - self, type_map: list[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat: Optional[Any] = None ) -> None: pass diff --git a/deepmd/dpmodel/atomic_model/pairtab_atomic_model.py b/deepmd/dpmodel/atomic_model/pairtab_atomic_model.py index 9d7739d5c8..54a3712912 100644 --- a/deepmd/dpmodel/atomic_model/pairtab_atomic_model.py +++ b/deepmd/dpmodel/atomic_model/pairtab_atomic_model.py @@ -1,5 +1,7 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( + Any, + NoReturn, Optional, Union, ) @@ -8,6 +10,7 @@ import numpy as np from deepmd.dpmodel.array_api import ( + Array, xp_take_along_axis, ) from deepmd.dpmodel.output_def import ( @@ -65,7 +68,7 @@ def __init__( type_map: list[str], rcond: Optional[float] = None, atom_ener: Optional[list[float]] = None, - **kwargs, + **kwargs: Any, ) -> None: super().__init__(type_map, **kwargs) super().init_out_stat() @@ -120,7 +123,7 @@ def get_type_map(self) -> list[str]: def get_sel(self) -> list[int]: return [self.sel] - def set_case_embd(self, case_idx: int): + def set_case_embd(self, case_idx: int) -> NoReturn: """ Set the case embedding of this atomic model by the given case_idx, 
typically concatenated with the output of the descriptor and fed into the fitting net. @@ -154,7 +157,7 @@ def need_sorted_nlist_for_lower(self) -> bool: return False def change_type_map( - self, type_map: list[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat: Any = None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. @@ -181,7 +184,7 @@ def serialize(self) -> dict: return dd @classmethod - def deserialize(cls, data) -> "PairTabAtomicModel": + def deserialize(cls, data: dict) -> "PairTabAtomicModel": data = data.copy() check_version_compatibility(data.pop("@version", 1), 2, 2) data.pop("@class") @@ -198,13 +201,13 @@ def deserialize(cls, data) -> "PairTabAtomicModel": def forward_atomic( self, - extended_coord, - extended_atype, - nlist, - mapping: Optional[np.ndarray] = None, - fparam: Optional[np.ndarray] = None, - aparam: Optional[np.ndarray] = None, - ) -> dict[str, np.ndarray]: + extended_coord: Array, + extended_atype: Array, + nlist: Array, + mapping: Optional[Array] = None, + fparam: Optional[Array] = None, + aparam: Optional[Array] = None, + ) -> dict[str, Array]: xp = array_api_compat.array_namespace(extended_coord, extended_atype, nlist) nframes, nloc, nnei = nlist.shape extended_coord = xp.reshape(extended_coord, (nframes, -1, 3)) @@ -237,22 +240,22 @@ def forward_atomic( def _pair_tabulated_inter( self, - nlist: np.ndarray, - i_type: np.ndarray, - j_type: np.ndarray, - rr: np.ndarray, - ) -> np.ndarray: + nlist: Array, + i_type: Array, + j_type: Array, + rr: Array, + ) -> Array: """Pairwise tabulated energy. Parameters ---------- - nlist : np.ndarray + nlist : Array The unmasked neighbour list. 
(nframes, nloc) - i_type : np.ndarray + i_type : Array The integer representation of atom type for all local atoms for all frames. (nframes, nloc) - j_type : np.ndarray + j_type : Array The integer representation of atom type for all neighbour atoms of all local atoms for all frames. (nframes, nloc, nnei) - rr : np.ndarray + rr : Array The salar distance vector between two atoms. (nframes, nloc, nnei) Returns @@ -310,12 +313,12 @@ def _pair_tabulated_inter( return ener @staticmethod - def _get_pairwise_dist(coords: np.ndarray, nlist: np.ndarray) -> np.ndarray: + def _get_pairwise_dist(coords: Array, nlist: Array) -> Array: """Get pairwise distance `dr`. Parameters ---------- - coords : np.ndarray + coords : Array The coordinate of the atoms, shape of (nframes, nall, 3). nlist The masked nlist, shape of (nframes, nloc, nnei). @@ -337,23 +340,23 @@ def _get_pairwise_dist(coords: np.ndarray, nlist: np.ndarray) -> np.ndarray: @staticmethod def _extract_spline_coefficient( - i_type: np.ndarray, - j_type: np.ndarray, - idx: np.ndarray, - tab_data: np.ndarray, + i_type: Array, + j_type: Array, + idx: Array, + tab_data: Array, nspline: np.int64, - ) -> np.ndarray: + ) -> Array: """Extract the spline coefficient from the table. Parameters ---------- - i_type : np.ndarray + i_type : Array The integer representation of atom type for all local atoms for all frames. (nframes, nloc) - j_type : np.ndarray + j_type : Array The integer representation of atom type for all neighbour atoms of all local atoms for all frames. (nframes, nloc, nnei) - idx : np.ndarray + idx : Array The index of the spline coefficient. (nframes, nloc, nnei) - tab_data : np.ndarray + tab_data : Array The table storing all the spline coefficient. (ntype, ntype, nspline, 4) nspline : int The number of splines in the table. 
@@ -391,14 +394,14 @@ def _extract_spline_coefficient( return final_coef @staticmethod - def _calculate_ener(coef: np.ndarray, uu: np.ndarray) -> np.ndarray: + def _calculate_ener(coef: Array, uu: Array) -> Array: """Calculate energy using spline coeeficients. Parameters ---------- - coef : np.ndarray + coef : Array The spline coefficients. (nframes, nloc, nnei, 4) - uu : np.ndarray + uu : Array The atom displancemnt used in interpolation and extrapolation (nframes, nloc, nnei) Returns diff --git a/deepmd/dpmodel/atomic_model/polar_atomic_model.py b/deepmd/dpmodel/atomic_model/polar_atomic_model.py index bc7860491c..2180e48265 100644 --- a/deepmd/dpmodel/atomic_model/polar_atomic_model.py +++ b/deepmd/dpmodel/atomic_model/polar_atomic_model.py @@ -1,8 +1,13 @@ # SPDX-License-Identifier: LGPL-3.0-or-later +from typing import ( + Any, +) import array_api_compat -import numpy as np +from deepmd.dpmodel.array_api import ( + Array, +) from deepmd.dpmodel.fitting.polarizability_fitting import ( PolarFitting, ) @@ -13,7 +18,9 @@ class DPPolarAtomicModel(DPAtomicModel): - def __init__(self, descriptor, fitting, type_map, **kwargs): + def __init__( + self, descriptor: Any, fitting: Any, type_map: list[str], **kwargs: Any + ) -> None: if not isinstance(fitting, PolarFitting): raise TypeError( "fitting must be an instance of PolarFitting for DPPolarAtomicModel" @@ -22,9 +29,9 @@ def __init__(self, descriptor, fitting, type_map, **kwargs): def apply_out_stat( self, - ret: dict[str, np.ndarray], - atype: np.ndarray, - ): + ret: dict[str, Array], + atype: Array, + ) -> dict[str, Array]: """Apply the stat to each atomic output. 
Parameters diff --git a/deepmd/dpmodel/atomic_model/property_atomic_model.py b/deepmd/dpmodel/atomic_model/property_atomic_model.py index e3c038e695..ec65f949e0 100644 --- a/deepmd/dpmodel/atomic_model/property_atomic_model.py +++ b/deepmd/dpmodel/atomic_model/property_atomic_model.py @@ -1,6 +1,11 @@ # SPDX-License-Identifier: LGPL-3.0-or-later -import numpy as np +from typing import ( + Any, +) +from deepmd.dpmodel.array_api import ( + Array, +) from deepmd.dpmodel.fitting.property_fitting import ( PropertyFittingNet, ) @@ -11,7 +16,9 @@ class DPPropertyAtomicModel(DPAtomicModel): - def __init__(self, descriptor, fitting, type_map, **kwargs): + def __init__( + self, descriptor: Any, fitting: Any, type_map: list[str], **kwargs: Any + ) -> None: if not isinstance(fitting, PropertyFittingNet): raise TypeError( "fitting must be an instance of PropertyFittingNet for DPPropertyAtomicModel" @@ -20,9 +27,9 @@ def __init__(self, descriptor, fitting, type_map, **kwargs): def apply_out_stat( self, - ret: dict[str, np.ndarray], - atype: np.ndarray, - ): + ret: dict[str, Array], + atype: Array, + ) -> dict[str, Array]: """Apply the stat to each atomic output. In property fitting, each output will be multiplied by label std and then plus the label average value. 
diff --git a/deepmd/dpmodel/common.py b/deepmd/dpmodel/common.py index 1f9d4817a2..c1b766012c 100644 --- a/deepmd/dpmodel/common.py +++ b/deepmd/dpmodel/common.py @@ -7,6 +7,7 @@ wraps, ) from typing import ( + TYPE_CHECKING, Any, Callable, Optional, @@ -20,6 +21,10 @@ from deepmd.common import ( VALID_PRECISION, ) + +if TYPE_CHECKING: + from deepmd.dpmodel.array_api import Array + from deepmd.env import ( GLOBAL_ENER_FLOAT_PRECISION, GLOBAL_NP_FLOAT_PRECISION, @@ -59,7 +64,7 @@ def get_xp_precision( xp: Any, precision: str, -): +) -> Any: """Get the precision from the API compatible namespace.""" if precision == "float16" or precision == "half": return xp.float16 @@ -87,16 +92,16 @@ class NativeOP(ABC): """The unit operation of a native model.""" @abstractmethod - def call(self, *args, **kwargs): + def call(self, *args: Any, **kwargs: Any) -> "Array": """Forward pass in NumPy implementation.""" pass - def __call__(self, *args, **kwargs): + def __call__(self, *args: Any, **kwargs: Any) -> "Array": """Forward pass in NumPy implementation.""" return self.call(*args, **kwargs) -def to_numpy_array(x: Any) -> Optional[np.ndarray]: +def to_numpy_array(x: Optional["Array"]) -> Optional[np.ndarray]: """Convert an array to a NumPy array. Parameters @@ -158,7 +163,7 @@ def cast_precision(func: Callable[..., Any]) -> Callable[..., Any]: """ @wraps(func) - def wrapper(self, *args, **kwargs): + def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: # only convert tensors returned_tensor = func( self, @@ -185,13 +190,13 @@ def wrapper(self, *args, **kwargs): @overload def safe_cast_array( - input: np.ndarray, from_precision: str, to_precision: str -) -> np.ndarray: ... + input: "Array", from_precision: str, to_precision: str +) -> "Array": ... @overload def safe_cast_array(input: None, from_precision: str, to_precision: str) -> None: ... 
def safe_cast_array( - input: Optional[np.ndarray], from_precision: str, to_precision: str -) -> Optional[np.ndarray]: + input: Optional["Array"], from_precision: str, to_precision: str +) -> Optional["Array"]: """Convert an array from a precision to another precision. If input is not an array or without the specific precision, the method will not @@ -201,7 +206,7 @@ def safe_cast_array( Parameters ---------- - input : np.ndarray or None + input : Array or None Input array from_precision : str Array data type that is casted from diff --git a/deepmd/dpmodel/descriptor/descriptor.py b/deepmd/dpmodel/descriptor/descriptor.py index 443a2a66f1..417104c8c1 100644 --- a/deepmd/dpmodel/descriptor/descriptor.py +++ b/deepmd/dpmodel/descriptor/descriptor.py @@ -5,6 +5,7 @@ abstractmethod, ) from typing import ( + Any, Callable, NoReturn, Optional, @@ -13,6 +14,9 @@ import numpy as np +from deepmd.dpmodel.array_api import ( + Array, +) from deepmd.utils.env_mat_stat import ( StatItem, ) @@ -34,7 +38,7 @@ class DescriptorBlock(ABC, make_plugin_registry("DescriptorBlock")): local_cluster = False - def __new__(cls, *args, **kwargs): + def __new__(cls, *args: Any, **kwargs: Any) -> Any: if cls is DescriptorBlock: try: descrpt_type = kwargs["type"] @@ -107,7 +111,9 @@ def get_stats(self) -> dict[str, StatItem]: """Get the statistics of the descriptor.""" raise NotImplementedError - def share_params(self, base_class, shared_level, resume=False) -> NoReturn: + def share_params( + self, base_class: Any, shared_level: Any, resume: bool = False + ) -> NoReturn: """ Share the parameters of self to the base_class with shared_level during multitask training. 
If not start from checkpoint (resume is False), @@ -118,13 +124,13 @@ def share_params(self, base_class, shared_level, resume=False) -> NoReturn: @abstractmethod def call( self, - nlist: np.ndarray, - extended_coord: np.ndarray, - extended_atype: np.ndarray, - extended_atype_embd: Optional[np.ndarray] = None, - mapping: Optional[np.ndarray] = None, - type_embedding: Optional[np.ndarray] = None, - ): + nlist: Array, + extended_coord: Array, + extended_atype: Array, + extended_atype_embd: Optional[Array] = None, + mapping: Optional[Array] = None, + type_embedding: Optional[Array] = None, + ) -> Any: """Calculate DescriptorBlock.""" pass @@ -137,7 +143,9 @@ def need_sorted_nlist_for_lower(self) -> bool: """Returns whether the descriptor block needs sorted nlist when using `forward_lower`.""" -def extend_descrpt_stat(des, type_map, des_with_stat=None) -> None: +def extend_descrpt_stat( + des: Any, type_map: list[str], des_with_stat: Any = None +) -> None: r""" Extend the statistics of a descriptor block with types from newly provided `type_map`. 
diff --git a/deepmd/dpmodel/descriptor/dpa1.py b/deepmd/dpmodel/descriptor/dpa1.py index 51c56e9681..5fc04ddc30 100644 --- a/deepmd/dpmodel/descriptor/dpa1.py +++ b/deepmd/dpmodel/descriptor/dpa1.py @@ -1,7 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import math from typing import ( - Any, Callable, NoReturn, Optional, @@ -17,6 +16,7 @@ NativeOP, ) from deepmd.dpmodel.array_api import ( + Array, xp_take_along_axis, ) from deepmd.dpmodel.common import ( @@ -74,7 +74,7 @@ ) -def np_softmax(x, axis=-1): +def np_softmax(x: Array, axis: int = -1) -> Array: xp = array_api_compat.array_namespace(x) # x = xp.nan_to_num(x) # to avoid value warning x = xp.where(xp.isnan(x), xp.zeros_like(x), x) @@ -82,7 +82,7 @@ def np_softmax(x, axis=-1): return e_x / xp.sum(e_x, axis=axis, keepdims=True) -def np_normalize(x, axis=-1): +def np_normalize(x: Array, axis: int = -1) -> Array: xp = array_api_compat.array_namespace(x) return x / xp.linalg.vector_norm(x, axis=axis, keepdims=True) @@ -262,14 +262,14 @@ def __init__( set_davg_zero: bool = False, activation_function: str = "tanh", precision: str = DEFAULT_PRECISION, - scaling_factor=1.0, + scaling_factor: float = 1.0, normalize: bool = True, temperature: Optional[float] = None, trainable_ln: bool = True, ln_eps: Optional[float] = 1e-5, smooth_type_embedding: bool = True, concat_output_tebd: bool = True, - spin: Optional[Any] = None, + spin: None = None, stripped_type_embedding: Optional[bool] = None, use_econf_tebd: bool = False, use_tebd_bias: bool = False, @@ -319,6 +319,7 @@ def __init__( trainable_ln=trainable_ln, ln_eps=ln_eps, seed=child_seed(seed, 0), + trainable=trainable, ) self.use_econf_tebd = use_econf_tebd self.use_tebd_bias = use_tebd_bias @@ -333,6 +334,7 @@ def __init__( use_tebd_bias=use_tebd_bias, type_map=type_map, seed=child_seed(seed, 1), + trainable=trainable, ) self.tebd_dim = tebd_dim self.concat_output_tebd = concat_output_tebd @@ -397,7 +399,9 @@ def get_env_protection(self) -> float: """Returns the 
protection of building environment matrix.""" return self.se_atten.get_env_protection() - def share_params(self, base_class, shared_level, resume=False) -> NoReturn: + def share_params( + self, base_class: "DescrptDPA1", shared_level: int, resume: bool = False + ) -> NoReturn: """ Share the parameters of self to the base_class with shared_level during multitask training. If not start from checkpoint (resume is False), @@ -406,18 +410,18 @@ def share_params(self, base_class, shared_level, resume=False) -> NoReturn: raise NotImplementedError @property - def dim_out(self): + def dim_out(self) -> int: return self.get_dim_out() @property - def dim_emb(self): + def dim_emb(self) -> int: return self.get_dim_emb() def compute_input_stats( self, merged: Union[Callable[[], list[dict]], list[dict]], path: Optional[DPPath] = None, - ): + ) -> None: """ Compute the input statistics (e.g. mean and stddev) for the descriptors from packed data. @@ -438,19 +442,21 @@ def compute_input_stats( def set_stat_mean_and_stddev( self, - mean: np.ndarray, - stddev: np.ndarray, + mean: Array, + stddev: Array, ) -> None: """Update mean and stddev for descriptor.""" self.se_atten.mean = mean self.se_atten.stddev = stddev - def get_stat_mean_and_stddev(self) -> tuple[np.ndarray, np.ndarray]: + def get_stat_mean_and_stddev(self) -> tuple[Array, Array]: """Get mean and stddev for descriptor.""" return self.se_atten.mean, self.se_atten.stddev def change_type_map( - self, type_map: list[str], model_with_new_type_stat=None + self, + type_map: list[str], + model_with_new_type_stat: Optional["DescrptDPA1"] = None, ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. 
@@ -479,11 +485,11 @@ def change_type_map( @cast_precision def call( self, - coord_ext, - atype_ext, - nlist, - mapping: Optional[np.ndarray] = None, - ): + coord_ext: Array, + atype_ext: Array, + nlist: Array, + mapping: Optional[Array] = None, + ) -> Array: """Compute the descriptor. Parameters @@ -520,7 +526,7 @@ def call( type_embedding = self.type_embedding.call() # nf x nall x tebd_dim atype_embd_ext = xp.reshape( - xp.take(type_embedding, xp.reshape(atype_ext, [-1]), axis=0), + xp.take(type_embedding, xp.reshape(atype_ext, (-1,)), axis=0), (nf, nall, self.tebd_dim), ) # nfnl x tebd_dim @@ -634,7 +640,7 @@ def update_sel( train_data: DeepmdDataSystem, type_map: Optional[list[str]], local_jdata: dict, - ) -> tuple[dict, Optional[float]]: + ) -> tuple[Array, Array]: """Update the selection and perform neighbor statistics. Parameters @@ -684,13 +690,14 @@ def __init__( set_davg_zero: bool = False, activation_function: str = "tanh", precision: str = DEFAULT_PRECISION, - scaling_factor=1.0, + scaling_factor: float = 1.0, normalize: bool = True, temperature: Optional[float] = None, trainable_ln: bool = True, ln_eps: Optional[float] = 1e-5, smooth: bool = True, seed: Optional[Union[int, list[int]]] = None, + trainable: bool = True, ) -> None: self.rcut = rcut self.rcut_smth = rcut_smth @@ -741,6 +748,7 @@ def __init__( self.resnet_dt, self.precision, seed=child_seed(seed, 0), + trainable=trainable, ) self.embeddings = embeddings if self.tebd_input_mode in ["strip"]: @@ -756,6 +764,7 @@ def __init__( self.resnet_dt, self.precision, seed=child_seed(seed, 1), + trainable=trainable, ) self.embeddings_strip = embeddings_strip else: @@ -774,6 +783,7 @@ def __init__( smooth=self.smooth, precision=self.precision, seed=child_seed(seed, 2), + trainable=trainable, ) wanted_shape = (self.ntypes, self.nnei, 4) @@ -814,7 +824,7 @@ def get_dim_emb(self) -> int: """Returns the output dimension of embedding.""" return self.filter_neuron[-1] - def __setitem__(self, key, value) -> 
None: + def __setitem__(self, key: str, value: Array) -> None: if key in ("avg", "data_avg", "davg"): self.mean = value elif key in ("std", "data_std", "dstd"): @@ -822,7 +832,7 @@ def __setitem__(self, key, value) -> None: else: raise KeyError(key) - def __getitem__(self, key): + def __getitem__(self, key: str) -> Array: if key in ("avg", "data_avg", "davg"): return self.mean elif key in ("std", "data_std", "dstd"): @@ -847,17 +857,17 @@ def get_env_protection(self) -> float: return self.env_protection @property - def dim_out(self): + def dim_out(self) -> int: """Returns the output dimension of this descriptor.""" return self.filter_neuron[-1] * self.axis_neuron @property - def dim_in(self): + def dim_in(self) -> int: """Returns the atomic input dimension of this descriptor.""" return self.tebd_dim @property - def dim_emb(self): + def dim_emb(self) -> int: """Returns the output dimension of embedding.""" return self.get_dim_emb() @@ -918,9 +928,9 @@ def reinit_exclude( def cal_g( self, - ss, - embedding_idx, - ): + ss: Array, + embedding_idx: int, + ) -> Array: xp = array_api_compat.array_namespace(ss) nfnl, nnei = ss.shape[0:2] shape2 = math.prod(ss.shape[2:]) @@ -931,9 +941,9 @@ def cal_g( def cal_g_strip( self, - ss, - embedding_idx, - ): + ss: Array, + embedding_idx: int, + ) -> Array: assert self.embeddings_strip is not None # nfnl x nnei x ng gg = self.embeddings_strip[embedding_idx].call(ss) @@ -941,13 +951,13 @@ def cal_g_strip( def call( self, - nlist: np.ndarray, - coord_ext: np.ndarray, - atype_ext: np.ndarray, - atype_embd_ext: Optional[np.ndarray] = None, - mapping: Optional[np.ndarray] = None, - type_embedding: Optional[np.ndarray] = None, - ): + nlist: Array, + coord_ext: Array, + atype_ext: Array, + atype_embd_ext: Optional[Array] = None, + mapping: Optional[Array] = None, + type_embedding: Optional[Array] = None, + ) -> tuple[Array, Array]: xp = array_api_compat.array_namespace(nlist, coord_ext, atype_ext) # nf x nloc x nnei x 4 dmatrix, diff, sw 
= self.env_mat.call( @@ -1027,7 +1037,7 @@ def call( xp.tile( (xp.reshape(atype, (-1, 1)) * ntypes_with_padding), (1, nnei) ), - (-1), + (-1,), ) idx_j = xp.reshape(nei_type, (-1,)) # (nf x nl x nnei) x ng @@ -1186,6 +1196,7 @@ def __init__( smooth: bool = True, precision: str = DEFAULT_PRECISION, seed: Optional[Union[int, list[int]]] = None, + trainable: bool = True, ) -> None: """Construct a neighbor-wise attention net.""" super().__init__() @@ -1219,29 +1230,32 @@ def __init__( smooth=smooth, precision=precision, seed=child_seed(seed, ii), + trainable=trainable, ) for ii in range(layer_num) ] def call( self, - input_G, - nei_mask, - input_r: Optional[np.ndarray] = None, - sw: Optional[np.ndarray] = None, - ): + input_G: Array, + nei_mask: Array, + input_r: Optional[Array] = None, + sw: Optional[Array] = None, + ) -> Array: out = input_G for layer in self.attention_layers: out = layer(out, nei_mask, input_r=input_r, sw=sw) return out - def __getitem__(self, key): + def __getitem__(self, key: int) -> "NeighborGatedAttentionLayer": if isinstance(key, int): return self.attention_layers[key] else: raise TypeError(key) - def __setitem__(self, key, value) -> None: + def __setitem__( + self, key: int, value: Union["NeighborGatedAttentionLayer", dict] + ) -> None: if not isinstance(key, int): raise TypeError(key) if isinstance(value, self.network_type): @@ -1252,7 +1266,7 @@ def __setitem__(self, key, value) -> None: raise TypeError(value) self.attention_layers[key] = value - def serialize(self): + def serialize(self) -> dict: """Serialize the networks to a dict. 
Returns @@ -1314,6 +1328,7 @@ def __init__( smooth: bool = True, precision: str = DEFAULT_PRECISION, seed: Optional[Union[int, list[int]]] = None, + trainable: bool = True, ) -> None: """Construct a neighbor-wise attention layer.""" super().__init__() @@ -1340,6 +1355,7 @@ def __init__( smooth=smooth, precision=precision, seed=child_seed(seed, 0), + trainable=trainable, ) self.attn_layer_norm = LayerNorm( self.embed_dim, @@ -1351,11 +1367,11 @@ def __init__( def call( self, - x, - nei_mask, - input_r: Optional[np.ndarray] = None, - sw: Optional[np.ndarray] = None, - ): + x: Array, + nei_mask: Array, + input_r: Optional[Array] = None, + sw: Optional[Array] = None, + ) -> Array: residual = x x, _ = self.attention_layer(x, nei_mask, input_r=input_r, sw=sw) x = residual + x @@ -1387,7 +1403,7 @@ def serialize(self) -> dict: } @classmethod - def deserialize(cls, data) -> "NeighborGatedAttentionLayer": + def deserialize(cls, data: dict) -> "NeighborGatedAttentionLayer": """Deserialize the networks from a dict. 
Parameters @@ -1420,6 +1436,7 @@ def __init__( smooth: bool = True, precision: str = DEFAULT_PRECISION, seed: Optional[Union[int, list[int]]] = None, + trainable: bool = True, ) -> None: """Construct a multi-head neighbor-wise attention net.""" super().__init__() @@ -1449,6 +1466,7 @@ def __init__( use_timestep=False, precision=precision, seed=child_seed(seed, 0), + trainable=trainable, ) self.out_proj = NativeLayer( hidden_dim, @@ -1457,9 +1475,17 @@ def __init__( use_timestep=False, precision=precision, seed=child_seed(seed, 1), + trainable=trainable, ) - def call(self, query, nei_mask, input_r=None, sw=None, attnw_shift=20.0): + def call( + self, + query: Array, + nei_mask: Array, + input_r: Optional[Array] = None, + sw: Optional[Array] = None, + attnw_shift: float = 20.0, + ) -> tuple[Array, Array]: xp = array_api_compat.array_namespace(query, nei_mask) # Linear projection # q, k, v = xp.split(self.in_proj(query), 3, axis=-1) @@ -1520,7 +1546,7 @@ def call(self, query, nei_mask, input_r=None, sw=None, attnw_shift=20.0): output = self.out_proj(o) return output, attn_weights - def serialize(self): + def serialize(self) -> dict: return { "nnei": self.nnei, "embed_dim": self.embed_dim, @@ -1539,7 +1565,7 @@ def serialize(self): } @classmethod - def deserialize(cls, data): + def deserialize(cls, data: dict) -> "GatedAttentionLayer": data = data.copy() in_proj = data.pop("in_proj") out_proj = data.pop("out_proj") diff --git a/deepmd/dpmodel/descriptor/dpa2.py b/deepmd/dpmodel/descriptor/dpa2.py index 70accefa30..75bf519984 100644 --- a/deepmd/dpmodel/descriptor/dpa2.py +++ b/deepmd/dpmodel/descriptor/dpa2.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( + Any, Callable, NoReturn, Optional, @@ -7,12 +8,12 @@ ) import array_api_compat -import numpy as np from deepmd.dpmodel import ( NativeOP, ) from deepmd.dpmodel.array_api import ( + Array, xp_take_along_axis, ) from deepmd.dpmodel.common import ( @@ -83,7 +84,7 @@ def __init__( 
tebd_dim: int = 8, tebd_input_mode: str = "concat", set_davg_zero: bool = True, - activation_function="tanh", + activation_function: str = "tanh", resnet_dt: bool = False, type_one_side: bool = False, use_three_body: bool = False, @@ -151,7 +152,7 @@ def __init__( self.three_body_rcut = three_body_rcut self.three_body_rcut_smth = three_body_rcut_smth - def __getitem__(self, key): + def __getitem__(self, key: str) -> Any: if hasattr(self, key): return getattr(self, key) else: @@ -321,7 +322,7 @@ def __init__( ln_eps = 1e-5 self.ln_eps = ln_eps - def __getitem__(self, key): + def __getitem__(self, key: str) -> Any: if hasattr(self, key): return getattr(self, key) else: @@ -442,7 +443,7 @@ def __init__( Comput Mater 10, 293 (2024). https://doi.org/10.1038/s41524-024-01493-2 """ - def init_subclass_params(sub_data, sub_class): + def init_subclass_params(sub_data: Union[dict, Any], sub_class: type) -> Any: if isinstance(sub_data, dict): return sub_class(**sub_data) elif isinstance(sub_data, sub_class): @@ -474,6 +475,7 @@ def init_subclass_params(sub_data, sub_class): smooth=smooth, type_one_side=self.repinit_args.type_one_side, seed=child_seed(seed, 0), + trainable=trainable, ) self.use_three_body = self.repinit_args.use_three_body if self.use_three_body: @@ -493,6 +495,7 @@ def init_subclass_params(sub_data, sub_class): resnet_dt=self.repinit_args.resnet_dt, smooth=smooth, seed=child_seed(seed, 5), + trainable=trainable, ) else: self.repinit_three_body = None @@ -533,6 +536,7 @@ def init_subclass_params(sub_data, sub_class): g1_out_mlp=self.repformer_args.g1_out_mlp, ln_eps=self.repformer_args.ln_eps, seed=child_seed(seed, 1), + trainable=trainable, ) self.rcsl_list = [ (self.repformers.get_rcut(), self.repformers.get_nsel()), @@ -562,6 +566,7 @@ def init_subclass_params(sub_data, sub_class): use_tebd_bias=use_tebd_bias, type_map=type_map, seed=child_seed(seed, 2), + trainable=trainable, ) self.concat_output_tebd = concat_output_tebd self.precision = precision @@ 
-585,6 +590,7 @@ def init_subclass_params(sub_data, sub_class): bias=False, precision=precision, seed=child_seed(seed, 3), + trainable=trainable, ) self.tebd_transform = None if self.add_tebd_to_repinit_out: @@ -594,6 +600,7 @@ def init_subclass_params(sub_data, sub_class): bias=False, precision=precision, seed=child_seed(seed, 4), + trainable=trainable, ) assert self.repinit.rcut > self.repformers.rcut assert self.repinit.sel[0] > self.repformers.sel[0] @@ -665,7 +672,9 @@ def get_env_protection(self) -> float: """Returns the protection of building environment matrix.""" return self.env_protection - def share_params(self, base_class, shared_level, resume=False) -> NoReturn: + def share_params( + self, base_class: Any, shared_level: int, resume: bool = False + ) -> NoReturn: """ Share the parameters of self to the base_class with shared_level during multitask training. If not start from checkpoint (resume is False), @@ -674,7 +683,7 @@ def share_params(self, base_class, shared_level, resume=False) -> NoReturn: raise NotImplementedError def change_type_map( - self, type_map: list[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat: Any = None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. @@ -729,11 +738,11 @@ def change_type_map( repinit_three_body["dstd"] = repinit_three_body["dstd"][remap_index] @property - def dim_out(self): + def dim_out(self) -> int: return self.get_dim_out() @property - def dim_emb(self): + def dim_emb(self) -> int: """Returns the embedding dimension g2.""" return self.get_dim_emb() @@ -741,7 +750,7 @@ def compute_input_stats( self, merged: Union[Callable[[], list[dict]], list[dict]], path: Optional[DPPath] = None, - ): + ) -> None: """ Compute the input statistics (e.g. 
mean and stddev) for the descriptors from packed data. @@ -766,8 +775,8 @@ def compute_input_stats( def set_stat_mean_and_stddev( self, - mean: list[np.ndarray], - stddev: list[np.ndarray], + mean: list[Array], + stddev: list[Array], ) -> None: """Update mean and stddev for descriptor.""" descrpt_list = [self.repinit, self.repformers] @@ -777,7 +786,9 @@ def set_stat_mean_and_stddev( descrpt.mean = mean[ii] descrpt.stddev = stddev[ii] - def get_stat_mean_and_stddev(self) -> tuple[list[np.ndarray], list[np.ndarray]]: + def get_stat_mean_and_stddev( + self, + ) -> tuple[list[Array], list[Array]]: """Get mean and stddev for descriptor.""" mean_list = [self.repinit.mean, self.repformers.mean] stddev_list = [ @@ -792,11 +803,11 @@ def get_stat_mean_and_stddev(self) -> tuple[list[np.ndarray], list[np.ndarray]]: @cast_precision def call( self, - coord_ext: np.ndarray, - atype_ext: np.ndarray, - nlist: np.ndarray, - mapping: Optional[np.ndarray] = None, - ): + coord_ext: Array, + atype_ext: Array, + nlist: Array, + mapping: Optional[Array] = None, + ) -> tuple[Array, Array]: """Compute the descriptor. Parameters @@ -841,7 +852,7 @@ def call( type_embedding = self.type_embedding.call() # repinit g1_ext = xp.reshape( - xp.take(type_embedding, xp.reshape(atype_ext, [-1]), axis=0), + xp.take(type_embedding, xp.reshape(atype_ext, (-1,)), axis=0), (nframes, nall, self.tebd_dim), ) g1_inp = g1_ext[:, :nloc, :] @@ -1064,7 +1075,7 @@ def update_sel( train_data: DeepmdDataSystem, type_map: Optional[list[str]], local_jdata: dict, - ) -> tuple[dict, Optional[float]]: + ) -> tuple[Array, Array]: """Update the selection and perform neighbor statistics. 
Parameters diff --git a/deepmd/dpmodel/descriptor/dpa3.py b/deepmd/dpmodel/descriptor/dpa3.py index 25550be926..a54591339f 100644 --- a/deepmd/dpmodel/descriptor/dpa3.py +++ b/deepmd/dpmodel/descriptor/dpa3.py @@ -1,15 +1,18 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( + Any, Optional, Union, ) import array_api_compat -import numpy as np from deepmd.dpmodel import ( NativeOP, ) +from deepmd.dpmodel.array_api import ( + Array, +) from deepmd.dpmodel.common import ( cast_precision, to_numpy_array, @@ -208,7 +211,7 @@ def __init__( self.use_dynamic_sel = use_dynamic_sel self.sel_reduce_factor = sel_reduce_factor - def __getitem__(self, key): + def __getitem__(self, key: str) -> Any: if hasattr(self, key): return getattr(self, key) else: @@ -310,7 +313,7 @@ def __init__( ) -> None: super().__init__() - def init_subclass_params(sub_data, sub_class): + def init_subclass_params(sub_data: Union[dict, Any], sub_class: type) -> Any: if isinstance(sub_data, dict): return sub_class(**sub_data) elif isinstance(sub_data, sub_class): @@ -357,6 +360,7 @@ def init_subclass_params(sub_data, sub_class): env_protection=env_protection, precision=precision, seed=child_seed(seed, 1), + trainable=trainable, ) self.use_econf_tebd = use_econf_tebd @@ -374,6 +378,7 @@ def init_subclass_params(sub_data, sub_class): use_tebd_bias=use_tebd_bias, type_map=type_map, seed=child_seed(seed, 2), + trainable=trainable, ) self.concat_output_tebd = concat_output_tebd self.precision = precision @@ -448,7 +453,9 @@ def get_env_protection(self) -> float: """Returns the protection of building environment matrix.""" return self.repflows.get_env_protection() - def share_params(self, base_class, shared_level, resume=False) -> None: + def share_params( + self, base_class: Any, shared_level: int, resume: bool = False + ) -> None: """ Share the parameters of self to the base_class with shared_level during multitask training. 
If not start from checkpoint (resume is False), @@ -457,7 +464,7 @@ def share_params(self, base_class, shared_level, resume=False) -> None: raise NotImplementedError def change_type_map( - self, type_map: list[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat: Any = None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. @@ -486,15 +493,17 @@ def change_type_map( repflow["dstd"] = repflow["dstd"][remap_index] @property - def dim_out(self): + def dim_out(self) -> int: return self.get_dim_out() @property - def dim_emb(self): + def dim_emb(self) -> int: """Returns the embedding dimension g2.""" return self.get_dim_emb() - def compute_input_stats(self, merged: list[dict], path: Optional[DPPath] = None): + def compute_input_stats( + self, merged: list[dict], path: Optional[DPPath] = None + ) -> None: """Update mean and stddev for descriptor elements.""" descrpt_list = [self.repflows] for ii, descrpt in enumerate(descrpt_list): @@ -502,8 +511,8 @@ def compute_input_stats(self, merged: list[dict], path: Optional[DPPath] = None) def set_stat_mean_and_stddev( self, - mean: list[np.ndarray], - stddev: list[np.ndarray], + mean: list[Array], + stddev: list[Array], ) -> None: """Update mean and stddev for descriptor.""" descrpt_list = [self.repflows] @@ -511,7 +520,7 @@ def set_stat_mean_and_stddev( descrpt.mean = mean[ii] descrpt.stddev = stddev[ii] - def get_stat_mean_and_stddev(self) -> tuple[list[np.ndarray], list[np.ndarray]]: + def get_stat_mean_and_stddev(self) -> tuple[list[Array], list[Array]]: """Get mean and stddev for descriptor.""" mean_list = [self.repflows.mean] stddev_list = [self.repflows.stddev] @@ -520,11 +529,11 @@ def get_stat_mean_and_stddev(self) -> tuple[list[np.ndarray], list[np.ndarray]]: @cast_precision def call( self, - 
coord_ext: np.ndarray, - atype_ext: np.ndarray, - nlist: np.ndarray, - mapping: Optional[np.ndarray] = None, - ): + coord_ext: Array, + atype_ext: Array, + nlist: Array, + mapping: Optional[Array] = None, + ) -> tuple[Array, Array]: """Compute the descriptor. Parameters @@ -562,12 +571,12 @@ def call( type_embedding = self.type_embedding.call() if self.use_loc_mapping: node_ebd_ext = xp.reshape( - xp.take(type_embedding, xp.reshape(atype_ext[:, :nloc], [-1]), axis=0), + xp.take(type_embedding, xp.reshape(atype_ext[:, :nloc], (-1,)), axis=0), (nframes, nloc, self.tebd_dim), ) else: node_ebd_ext = xp.reshape( - xp.take(type_embedding, xp.reshape(atype_ext, [-1]), axis=0), + xp.take(type_embedding, xp.reshape(atype_ext, (-1,)), axis=0), (nframes, nall, self.tebd_dim), ) node_ebd_inp = node_ebd_ext[:, :nloc, :] @@ -656,7 +665,7 @@ def update_sel( train_data: DeepmdDataSystem, type_map: Optional[list[str]], local_jdata: dict, - ) -> tuple[dict, Optional[float]]: + ) -> tuple[Array, Array]: """Update the selection and perform neighbor statistics. 
Parameters diff --git a/deepmd/dpmodel/descriptor/hybrid.py b/deepmd/dpmodel/descriptor/hybrid.py index f050bb6222..083adf4240 100644 --- a/deepmd/dpmodel/descriptor/hybrid.py +++ b/deepmd/dpmodel/descriptor/hybrid.py @@ -10,6 +10,9 @@ import array_api_compat import numpy as np +from deepmd.dpmodel.array_api import ( + Array, +) from deepmd.dpmodel.common import ( NativeOP, ) @@ -76,7 +79,7 @@ def __init__( ) # if hybrid sel is larger than sub sel, the nlist needs to be cut for each type hybrid_sel = self.get_sel() - nlist_cut_idx: list[np.ndarray] = [] + nlist_cut_idx: list[Array] = [] if self.mixed_types() and not all( descrpt.mixed_types() for descrpt in self.descrpt_list ): @@ -144,7 +147,7 @@ def get_dim_emb(self) -> int: """Returns the output dimension.""" return np.sum([descrpt.get_dim_emb() for descrpt in self.descrpt_list]).item() - def mixed_types(self): + def mixed_types(self) -> bool: """Returns if the descriptor requires a neighbor list that distinguish different atomic types or not. """ @@ -168,7 +171,9 @@ def get_env_protection(self) -> float: ) return all_protection[0] - def share_params(self, base_class, shared_level, resume=False) -> NoReturn: + def share_params( + self, base_class: Any, shared_level: Any, resume: bool = False + ) -> NoReturn: """ Share the parameters of self to the base_class with shared_level during multitask training. If not start from checkpoint (resume is False), @@ -177,7 +182,7 @@ def share_params(self, base_class, shared_level, resume=False) -> NoReturn: raise NotImplementedError def change_type_map( - self, type_map: list[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat: Any = None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. 
@@ -199,8 +204,8 @@ def compute_input_stats( def set_stat_mean_and_stddev( self, - mean: list[Union[np.ndarray, list[np.ndarray]]], - stddev: list[Union[np.ndarray, list[np.ndarray]]], + mean: list[Union[np.ndarray, list[Array]]], + stddev: list[Union[np.ndarray, list[Array]]], ) -> None: """Update mean and stddev for descriptor.""" for ii, descrpt in enumerate(self.descrpt_list): @@ -209,8 +214,8 @@ def set_stat_mean_and_stddev( def get_stat_mean_and_stddev( self, ) -> tuple[ - list[Union[np.ndarray, list[np.ndarray]]], - list[Union[np.ndarray, list[np.ndarray]]], + list[Union[Array, list[Array]]], + list[Union[Array, list[Array]]], ]: """Get mean and stddev for descriptor.""" mean_list = [] @@ -255,11 +260,17 @@ def enable_compression( def call( self, - coord_ext, - atype_ext, - nlist, - mapping: Optional[np.ndarray] = None, - ): + coord_ext: Array, + atype_ext: Array, + nlist: Array, + mapping: Optional[Array] = None, + ) -> tuple[ + Array, + Optional[Array], + Optional[Array], + Optional[Array], + Optional[Array], + ]: """Compute the descriptor. Parameters @@ -324,7 +335,7 @@ def update_sel( train_data: DeepmdDataSystem, type_map: Optional[list[str]], local_jdata: dict, - ) -> tuple[dict, Optional[float]]: + ) -> tuple[Array, Array]: """Update the selection and perform neighbor statistics. 
Parameters diff --git a/deepmd/dpmodel/descriptor/make_base_descriptor.py b/deepmd/dpmodel/descriptor/make_base_descriptor.py index f45e85e516..e867ecdaa9 100644 --- a/deepmd/dpmodel/descriptor/make_base_descriptor.py +++ b/deepmd/dpmodel/descriptor/make_base_descriptor.py @@ -4,6 +4,7 @@ abstractmethod, ) from typing import ( + Any, Callable, NoReturn, Optional, @@ -13,6 +14,9 @@ from deepmd.common import ( j_get_type, ) +from deepmd.dpmodel.array_api import ( + Array, +) from deepmd.utils.data_system import ( DeepmdDataSystem, ) @@ -26,9 +30,9 @@ def make_base_descriptor( - t_tensor, + t_tensor: type, fwd_method_name: str = "forward", -): +) -> type: """Make the base class for the descriptor. Parameters @@ -44,7 +48,7 @@ def make_base_descriptor( class BD(ABC, PluginVariant, make_plugin_registry("descriptor")): """Base descriptor provides the interfaces of descriptor.""" - def __new__(cls, *args, **kwargs): + def __new__(cls, *args: Any, **kwargs: Any) -> Any: if cls is BD: cls = cls.get_class_by_type(j_get_type(kwargs, cls.__name__)) return super().__new__(cls) @@ -113,7 +117,9 @@ def get_env_protection(self) -> float: pass @abstractmethod - def share_params(self, base_class, shared_level, resume=False): + def share_params( + self, base_class: Any, shared_level: Any, resume: bool = False + ) -> None: """ Share the parameters of self to the base_class with shared_level during multitask training. If not start from checkpoint (resume is False), @@ -123,7 +129,7 @@ def share_params(self, base_class, shared_level, resume=False): @abstractmethod def change_type_map( - self, type_map: list[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat: Optional[Any] = None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. 
@@ -131,12 +137,12 @@ def change_type_map( pass @abstractmethod - def set_stat_mean_and_stddev(self, mean, stddev) -> None: + def set_stat_mean_and_stddev(self, mean: Any, stddev: Any) -> None: """Update mean and stddev for descriptor.""" pass @abstractmethod - def get_stat_mean_and_stddev(self): + def get_stat_mean_and_stddev(self) -> Any: """Get mean and stddev for descriptor.""" pass @@ -176,11 +182,11 @@ def enable_compression( @abstractmethod def fwd( self, - extended_coord, - extended_atype, - nlist, - mapping: Optional[t_tensor] = None, - ): + extended_coord: Array, + extended_atype: Array, + nlist: Array, + mapping: Optional[Array] = None, + ) -> Array: """Calculate descriptor.""" pass diff --git a/deepmd/dpmodel/descriptor/repflows.py b/deepmd/dpmodel/descriptor/repflows.py index 0a39e4c596..407bf95351 100644 --- a/deepmd/dpmodel/descriptor/repflows.py +++ b/deepmd/dpmodel/descriptor/repflows.py @@ -13,6 +13,7 @@ NativeOP, ) from deepmd.dpmodel.array_api import ( + Array, xp_take_along_axis, ) from deepmd.dpmodel.common import ( @@ -167,15 +168,17 @@ class DescrptBlockRepflows(NativeOP, DescriptorBlock): For example, when using paddings, there may be zero distances of neighbors, which may make division by zero error during environment matrix calculations without protection. seed : int, optional Random seed for parameter initialization. 
+ trainable : bool, default: True + Whether the block is trainable """ def __init__( self, - e_rcut, - e_rcut_smth, + e_rcut: float, + e_rcut_smth: float, e_sel: int, - a_rcut, - a_rcut_smth, + a_rcut: float, + a_rcut_smth: float, a_sel: int, ntypes: int, nlayers: int = 6, @@ -205,6 +208,7 @@ def __init__( sel_reduce_factor: float = 10.0, use_loc_mapping: bool = True, seed: Optional[Union[int, list[int]]] = None, + trainable: bool = True, ) -> None: super().__init__() self.e_rcut = float(e_rcut) @@ -269,10 +273,19 @@ def __init__( self.seed = seed self.edge_embd = NativeLayer( - 1, self.e_dim, precision=precision, seed=child_seed(seed, 0) + 1, + self.e_dim, + precision=precision, + seed=child_seed(seed, 0), + trainable=trainable, ) self.angle_embd = NativeLayer( - 1, self.a_dim, precision=precision, bias=False, seed=child_seed(seed, 1) + 1, + self.a_dim, + precision=precision, + bias=False, + seed=child_seed(seed, 1), + trainable=trainable, ) layers = [] for ii in range(nlayers): @@ -304,6 +317,7 @@ def __init__( sel_reduce_factor=self.sel_reduce_factor, smooth_edge_update=self.smooth_edge_update, seed=child_seed(child_seed(seed, 1), ii), + trainable=trainable, ) ) self.layers = layers @@ -358,7 +372,7 @@ def get_dim_emb(self) -> int: """Returns the embedding dimension e_dim.""" return self.e_dim - def __setitem__(self, key, value) -> None: + def __setitem__(self, key: str, value: Array) -> None: if key in ("avg", "data_avg", "davg"): self.mean = value elif key in ("std", "data_std", "dstd"): @@ -366,7 +380,7 @@ def __setitem__(self, key, value) -> None: else: raise KeyError(key) - def __getitem__(self, key): + def __getitem__(self, key: str) -> Array: if key in ("avg", "data_avg", "davg"): return self.mean elif key in ("std", "data_std", "dstd"): @@ -391,17 +405,17 @@ def get_env_protection(self) -> float: return self.env_protection @property - def dim_out(self): + def dim_out(self) -> int: """Returns the output dimension of this descriptor.""" return self.n_dim 
@property - def dim_in(self): + def dim_in(self) -> int: """Returns the atomic input dimension of this descriptor.""" return self.n_dim @property - def dim_emb(self): + def dim_emb(self) -> int: """Returns the embedding dimension e_dim.""" return self.get_dim_emb() @@ -462,12 +476,12 @@ def reinit_exclude( def call( self, - nlist: np.ndarray, - coord_ext: np.ndarray, - atype_ext: np.ndarray, - atype_embd_ext: Optional[np.ndarray] = None, - mapping: Optional[np.ndarray] = None, - ): + nlist: Array, + coord_ext: Array, + atype_ext: Array, + atype_embd_ext: Optional[Array] = None, + mapping: Optional[Array] = None, + ) -> tuple[Array, Array]: xp = array_api_compat.array_namespace(nlist, coord_ext, atype_ext) nframes, nloc, nnei = nlist.shape nall = xp.reshape(coord_ext, (nframes, -1)).shape[1] // 3 @@ -490,7 +504,7 @@ def call( sw = xp.where(nlist_mask, sw, xp.zeros_like(sw)) # get angle nlist (maybe smaller) - a_dist_mask = (xp.linalg.vector_norm(diff, axis=-1) < self.a_rcut)[ + a_dist_mask = (safe_for_vector_norm(diff, axis=-1) < self.a_rcut)[ :, :, : self.a_sel ] a_nlist = nlist[:, :, : self.a_sel] @@ -578,7 +592,8 @@ def call( # n_angle x 1 a_sw = (a_sw[:, :, :, None] * a_sw[:, :, None, :])[a_nlist_mask] else: - edge_index = angle_index = xp.zeros([1, 3], dtype=nlist.dtype) + edge_index = xp.zeros([2, 1], dtype=nlist.dtype) + angle_index = xp.zeros([3, 1], dtype=nlist.dtype) # get edge and angle embedding # nb x nloc x nnei x e_dim [OR] n_edge x e_dim @@ -622,7 +637,7 @@ def call( edge_ebd, h2, sw, - owner=edge_index[:, 0], + owner=edge_index[0, :], num_owner=nframes * nloc, nb=nframes, nloc=nloc, @@ -649,7 +664,7 @@ def need_sorted_nlist_for_lower(self) -> bool: return True @classmethod - def deserialize(cls, data): + def deserialize(cls, data: dict) -> "DescrptBlockRepflows": """Deserialize the descriptor block.""" data = data.copy() edge_embd = NativeLayer.deserialize(data.pop("edge_embd")) @@ -670,7 +685,7 @@ def deserialize(cls, data): obj.stddev = dstd 
return obj - def serialize(self): + def serialize(self) -> dict: """Serialize the descriptor block.""" return { "e_rcut": self.e_rcut, @@ -720,15 +735,15 @@ def serialize(self): def _cal_hg_dynamic( - flat_edge_ebd: np.ndarray, - flat_h2: np.ndarray, - flat_sw: np.ndarray, - owner: np.ndarray, + flat_edge_ebd: Array, + flat_h2: Array, + flat_sw: Array, + owner: Array, num_owner: int, nb: int, nloc: int, scale_factor: float, -) -> np.ndarray: +) -> Array: """ Calculate the transposed rotation matrix. @@ -775,16 +790,16 @@ def _cal_hg_dynamic( def symmetrization_op_dynamic( - flat_edge_ebd: np.ndarray, - flat_h2: np.ndarray, - flat_sw: np.ndarray, - owner: np.ndarray, + flat_edge_ebd: Array, + flat_h2: Array, + flat_sw: Array, + owner: Array, num_owner: int, nb: int, nloc: int, scale_factor: float, axis_neuron: int, -) -> np.ndarray: +) -> Array: """ Symmetrization operator to obtain atomic invariant rep. @@ -860,6 +875,7 @@ def __init__( update_residual_init: str = "const", precision: str = "float64", seed: Optional[Union[int, list[int]]] = None, + trainable: bool = True, ) -> None: super().__init__() self.epsilon = 1e-4 # protection of 1./nnei @@ -922,6 +938,7 @@ def __init__( n_dim, precision=precision, seed=child_seed(seed, 0), + trainable=trainable, ) if self.update_style == "res_residual": self.n_residual.append( @@ -931,6 +948,7 @@ def __init__( self.update_residual_init, precision=precision, seed=child_seed(seed, 1), + trainable=trainable, ) ) @@ -941,6 +959,7 @@ def __init__( n_dim, precision=precision, seed=child_seed(seed, 2), + trainable=trainable, ) if self.update_style == "res_residual": self.n_residual.append( @@ -950,6 +969,7 @@ def __init__( self.update_residual_init, precision=precision, seed=child_seed(seed, 3), + trainable=trainable, ) ) @@ -959,6 +979,7 @@ def __init__( self.n_multi_edge_message * n_dim, precision=precision, seed=child_seed(seed, 4), + trainable=trainable, ) if self.update_style == "res_residual": for head_index in 
range(self.n_multi_edge_message): @@ -969,6 +990,7 @@ def __init__( self.update_residual_init, precision=precision, seed=child_seed(child_seed(seed, 5), head_index), + trainable=trainable, ) ) @@ -978,6 +1000,7 @@ def __init__( e_dim, precision=precision, seed=child_seed(seed, 6), + trainable=trainable, ) if self.update_style == "res_residual": self.e_residual.append( @@ -987,6 +1010,7 @@ def __init__( self.update_residual_init, precision=precision, seed=child_seed(seed, 7), + trainable=trainable, ) ) @@ -1015,6 +1039,7 @@ def __init__( precision=precision, bias=False, seed=child_seed(seed, 8), + trainable=trainable, ) self.a_compress_e_linear = NativeLayer( self.e_dim, @@ -1022,6 +1047,7 @@ def __init__( precision=precision, bias=False, seed=child_seed(seed, 9), + trainable=trainable, ) else: self.a_compress_n_linear = None @@ -1033,12 +1059,14 @@ def __init__( self.e_dim, precision=precision, seed=child_seed(seed, 10), + trainable=trainable, ) self.edge_angle_linear2 = NativeLayer( self.e_dim, self.e_dim, precision=precision, seed=child_seed(seed, 11), + trainable=trainable, ) if self.update_style == "res_residual": self.e_residual.append( @@ -1048,6 +1076,7 @@ def __init__( self.update_residual_init, precision=precision, seed=child_seed(seed, 12), + trainable=trainable, ) ) @@ -1057,6 +1086,7 @@ def __init__( self.a_dim, precision=precision, seed=child_seed(seed, 13), + trainable=trainable, ) if self.update_style == "res_residual": self.a_residual.append( @@ -1066,6 +1096,7 @@ def __init__( self.update_residual_init, precision=precision, seed=child_seed(seed, 14), + trainable=trainable, ) ) else: @@ -1078,11 +1109,11 @@ def __init__( def optim_angle_update( self, - angle_ebd: np.ndarray, - node_ebd: np.ndarray, - edge_ebd: np.ndarray, + angle_ebd: Array, + node_ebd: Array, + edge_ebd: Array, feat: str = "edge", - ) -> np.ndarray: + ) -> Array: xp = array_api_compat.array_namespace(angle_ebd, node_ebd, edge_ebd) if feat == "edge": @@ -1126,14 +1157,14 @@ def 
optim_angle_update( def optim_angle_update_dynamic( self, - flat_angle_ebd: np.ndarray, - node_ebd: np.ndarray, - flat_edge_ebd: np.ndarray, - n2a_index: np.ndarray, - eij2a_index: np.ndarray, - eik2a_index: np.ndarray, - feat="edge", - ): + flat_angle_ebd: Array, + node_ebd: Array, + flat_edge_ebd: Array, + n2a_index: Array, + eij2a_index: Array, + eik2a_index: Array, + feat: str = "edge", + ) -> Array: xp = array_api_compat.array_namespace( flat_angle_ebd, node_ebd, flat_edge_ebd, n2a_index, eij2a_index, eik2a_index ) @@ -1185,12 +1216,12 @@ def optim_angle_update_dynamic( def optim_edge_update( self, - node_ebd: np.ndarray, - node_ebd_ext: np.ndarray, - edge_ebd: np.ndarray, - nlist: np.ndarray, + node_ebd: Array, + node_ebd_ext: Array, + edge_ebd: Array, + nlist: Array, feat: str = "node", - ) -> np.ndarray: + ) -> Array: xp = array_api_compat.array_namespace(node_ebd, node_ebd_ext, edge_ebd, nlist) if feat == "node": @@ -1228,13 +1259,13 @@ def optim_edge_update( def optim_edge_update_dynamic( self, - node_ebd: np.ndarray, - node_ebd_ext: np.ndarray, - flat_edge_ebd: np.ndarray, - n2e_index: np.ndarray, - n_ext2e_index: np.ndarray, + node_ebd: Array, + node_ebd_ext: Array, + flat_edge_ebd: Array, + n2e_index: Array, + n_ext2e_index: Array, feat: str = "node", - ): + ) -> Array: xp = array_api_compat.array_namespace( node_ebd, node_ebd_ext, flat_edge_ebd, n2e_index, n_ext2e_index ) @@ -1276,19 +1307,19 @@ def optim_edge_update_dynamic( def call( self, - node_ebd_ext: np.ndarray, # nf x nall x n_dim - edge_ebd: np.ndarray, # nf x nloc x nnei x e_dim - h2: np.ndarray, # nf x nloc x nnei x 3 - angle_ebd: np.ndarray, # nf x nloc x a_nnei x a_nnei x a_dim - nlist: np.ndarray, # nf x nloc x nnei - nlist_mask: np.ndarray, # nf x nloc x nnei - sw: np.ndarray, # switch func, nf x nloc x nnei - a_nlist: np.ndarray, # nf x nloc x a_nnei - a_nlist_mask: np.ndarray, # nf x nloc x a_nnei - a_sw: np.ndarray, # switch func, nf x nloc x a_nnei - edge_index: np.ndarray, # n_edge 
x 2 - angle_index: np.ndarray, # n_angle x 3 - ): + node_ebd_ext: Array, # nf x nall x n_dim + edge_ebd: Array, # nf x nloc x nnei x e_dim + h2: Array, # nf x nloc x nnei x 3 + angle_ebd: Array, # nf x nloc x a_nnei x a_nnei x a_dim + nlist: Array, # nf x nloc x nnei + nlist_mask: Array, # nf x nloc x nnei + sw: Array, # switch func, nf x nloc x nnei + a_nlist: Array, # nf x nloc x a_nnei + a_nlist_mask: Array, # nf x nloc x a_nnei + a_sw: Array, # switch func, nf x nloc x a_nnei + edge_index: Array, # 2 x n_edge + angle_index: Array, # 3 x n_angle + ) -> tuple[Array, Array]: """ Parameters ---------- @@ -1312,12 +1343,12 @@ def call( Masks of the neighbor list for angle. real nei 1 otherwise 0 a_sw : nf x nloc x a_nnei Switch function for angle. - edge_index : Optional for dynamic sel, n_edge x 2 + edge_index : Optional for dynamic sel, 2 x n_edge n2e_index : n_edge Broadcast indices from node(i) to edge(ij), or reduction indices from edge(ij) to node(i). n_ext2e_index : n_edge Broadcast indices from extended node(j) to edge(ij). - angle_index : Optional for dynamic sel, n_angle x 3 + angle_index : Optional for dynamic sel, 3 x n_angle n2a_index : n_angle Broadcast indices from extended node(j) to angle(ijk). 
eij2a_index : n_angle @@ -1362,11 +1393,11 @@ def call( assert (n_edge, 3) == h2.shape del a_nlist # may be used in the future - n2e_index, n_ext2e_index = edge_index[:, 0], edge_index[:, 1] + n2e_index, n_ext2e_index = edge_index[0, :], edge_index[1, :] n2a_index, eij2a_index, eik2a_index = ( - angle_index[:, 0], - angle_index[:, 1], - angle_index[:, 2], + angle_index[0, :], + angle_index[1, :], + angle_index[2, :], ) # nb x nloc x nnei x n_dim [OR] n_edge x n_dim @@ -1378,16 +1409,16 @@ def call( ) ) - n_update_list: list[np.ndarray] = [node_ebd] - e_update_list: list[np.ndarray] = [edge_ebd] - a_update_list: list[np.ndarray] = [angle_ebd] + n_update_list: list[Array] = [node_ebd] + e_update_list: list[Array] = [edge_ebd] + a_update_list: list[Array] = [angle_ebd] # node self mlp node_self_mlp = self.act(self.node_self_mlp(node_ebd)) n_update_list.append(node_self_mlp) # node sym (grrg + drrd) - node_sym_list: list[np.ndarray] = [] + node_sym_list: list[Array] = [] node_sym_list.append( symmetrization_op( edge_ebd, @@ -1757,15 +1788,15 @@ def call( def list_update_res_avg( self, - update_list: list[np.ndarray], - ) -> np.ndarray: + update_list: list[Array], + ) -> Array: nitem = len(update_list) uu = update_list[0] for ii in range(1, nitem): uu = uu + update_list[ii] return uu / (float(nitem) ** 0.5) - def list_update_res_incr(self, update_list: list[np.ndarray]) -> np.ndarray: + def list_update_res_incr(self, update_list: list[Array]) -> Array: nitem = len(update_list) uu = update_list[0] scale = 1.0 / (float(nitem - 1) ** 0.5) if nitem > 1 else 0.0 @@ -1774,8 +1805,8 @@ def list_update_res_incr(self, update_list: list[np.ndarray]) -> np.ndarray: return uu def list_update_res_residual( - self, update_list: list[np.ndarray], update_name: str = "node" - ) -> np.ndarray: + self, update_list: list[Array], update_name: str = "node" + ) -> Array: nitem = len(update_list) uu = update_list[0] if update_name == "node": @@ -1791,9 +1822,7 @@ def list_update_res_residual( 
raise NotImplementedError return uu - def list_update( - self, update_list: list[np.ndarray], update_name: str = "node" - ) -> np.ndarray: + def list_update(self, update_list: list[Array], update_name: str = "node") -> Array: if self.update_style == "res_avg": return self.list_update_res_avg(update_list) elif self.update_style == "res_incr": diff --git a/deepmd/dpmodel/descriptor/repformers.py b/deepmd/dpmodel/descriptor/repformers.py index 3d02054350..9b5b21c1ea 100644 --- a/deepmd/dpmodel/descriptor/repformers.py +++ b/deepmd/dpmodel/descriptor/repformers.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( + Any, Callable, Optional, Union, @@ -13,6 +14,7 @@ NativeOP, ) from deepmd.dpmodel.array_api import ( + Array, xp_take_along_axis, ) from deepmd.dpmodel.common import ( @@ -54,7 +56,7 @@ ) -def xp_transpose_01423(x): +def xp_transpose_01423(x: Array) -> Array: xp = array_api_compat.array_namespace(x) x_shape2 = x.shape[2] x_shape3 = x.shape[3] @@ -65,7 +67,7 @@ def xp_transpose_01423(x): return x -def xp_transpose_01342(x): +def xp_transpose_01342(x: Array) -> Array: xp = array_api_compat.array_namespace(x) x_shape2 = x.shape[2] x_shape3 = x.shape[3] @@ -164,17 +166,19 @@ class DescrptBlockRepformers(NativeOP, DescriptorBlock): The epsilon value for layer normalization. seed : int, optional The random seed for initialization. 
+ trainable : bool, default: True + Whether the block is trainable """ def __init__( self, - rcut, - rcut_smth, + rcut: float, + rcut_smth: float, sel: int, ntypes: int, nlayers: int = 3, - g1_dim=128, - g2_dim=16, + g1_dim: int = 128, + g2_dim: int = 16, axis_neuron: int = 4, direct_dist: bool = False, update_g1_has_conv: bool = True, @@ -204,6 +208,7 @@ def __init__( g1_out_mlp: bool = True, ln_eps: Optional[float] = 1e-5, seed: Optional[Union[int, list[int]]] = None, + trainable: bool = True, ) -> None: super().__init__() self.rcut = rcut @@ -252,7 +257,11 @@ def __init__( self.epsilon = 1e-4 self.g2_embd = NativeLayer( - 1, self.g2_dim, precision=precision, seed=child_seed(seed, 0) + 1, + self.g2_dim, + precision=precision, + seed=child_seed(seed, 0), + trainable=trainable, ) layers = [] for ii in range(nlayers): @@ -290,6 +299,7 @@ def __init__( g1_out_conv=self.g1_out_conv, g1_out_mlp=self.g1_out_mlp, seed=child_seed(child_seed(seed, 1), ii), + trainable=trainable, ) ) self.layers = layers @@ -328,7 +338,7 @@ def get_dim_emb(self) -> int: """Returns the embedding dimension g2.""" return self.g2_dim - def __setitem__(self, key, value) -> None: + def __setitem__(self, key: str, value: Array) -> None: if key in ("avg", "data_avg", "davg"): self.mean = value elif key in ("std", "data_std", "dstd"): @@ -336,7 +346,7 @@ def __setitem__(self, key, value) -> None: else: raise KeyError(key) - def __getitem__(self, key): + def __getitem__(self, key: str) -> Array: if key in ("avg", "data_avg", "davg"): return self.mean elif key in ("std", "data_std", "dstd"): @@ -357,17 +367,17 @@ def mixed_types(self) -> bool: return True @property - def dim_out(self): + def dim_out(self) -> int: """Returns the output dimension of this descriptor.""" return self.g1_dim @property - def dim_in(self): + def dim_in(self) -> int: """Returns the atomic input dimension of this descriptor.""" return self.g1_dim @property - def dim_emb(self): + def dim_emb(self) -> int: """Returns the 
embedding dimension g2.""" return self.get_dim_emb() @@ -428,13 +438,13 @@ def reinit_exclude( def call( self, - nlist: np.ndarray, - coord_ext: np.ndarray, - atype_ext: np.ndarray, - atype_embd_ext: Optional[np.ndarray] = None, - mapping: Optional[np.ndarray] = None, - type_embedding: Optional[np.ndarray] = None, - ): + nlist: Array, + coord_ext: Array, + atype_ext: Array, + atype_embd_ext: Optional[Array] = None, + mapping: Optional[Array] = None, + type_embedding: Optional[Array] = None, + ) -> Array: xp = array_api_compat.array_namespace(nlist, coord_ext, atype_ext) exclude_mask = self.emask.build_type_exclude_mask(nlist, atype_ext) exclude_mask = xp.astype(exclude_mask, xp.bool) @@ -509,7 +519,7 @@ def need_sorted_nlist_for_lower(self) -> bool: return False @classmethod - def deserialize(cls, data): + def deserialize(cls, data: dict[str, Any]) -> "DescrptBlockRepformers": """Deserialize the descriptor block.""" data = data.copy() g2_embd = NativeLayer.deserialize(data.pop("g2_embd")) @@ -526,7 +536,7 @@ def deserialize(cls, data): obj.stddev = dstd return obj - def serialize(self): + def serialize(self) -> dict[str, Any]: """Serialize the descriptor block.""" return { "rcut": self.rcut, @@ -583,7 +593,7 @@ def get_residual( trainable: bool = True, precision: str = "float64", seed: Optional[Union[int, list[int]]] = None, -) -> np.ndarray: +) -> Array: """ Get residual tensor for one update vector. @@ -617,9 +627,9 @@ def get_residual( def _make_nei_g1( - g1_ext: np.ndarray, - nlist: np.ndarray, -) -> np.ndarray: + g1_ext: Array, + nlist: Array, +) -> Array: """ Make neighbor-wise atomic invariant rep. @@ -632,7 +642,7 @@ def _make_nei_g1( Returns ------- - gg1: np.ndarray + gg1: Array Neighbor-wise atomic invariant rep, with shape [nf, nloc, nnei, ng1]. 
""" xp = array_api_compat.array_namespace(g1_ext, nlist) @@ -650,9 +660,9 @@ def _make_nei_g1( def _apply_nlist_mask( - gg: np.ndarray, - nlist_mask: np.ndarray, -) -> np.ndarray: + gg: Array, + nlist_mask: Array, +) -> Array: """ Apply nlist mask to neighbor-wise rep tensors. @@ -668,7 +678,7 @@ def _apply_nlist_mask( return masked_gg -def _apply_switch(gg: np.ndarray, sw: np.ndarray) -> np.ndarray: +def _apply_switch(gg: Array, sw: Array) -> Array: """ Apply switch function to neighbor-wise rep tensors. @@ -686,14 +696,14 @@ def _apply_switch(gg: np.ndarray, sw: np.ndarray) -> np.ndarray: def _cal_hg( - g: np.ndarray, - h: np.ndarray, - nlist_mask: np.ndarray, - sw: np.ndarray, + g: Array, + h: Array, + nlist_mask: Array, + sw: Array, smooth: bool = True, epsilon: float = 1e-4, use_sqrt_nnei: bool = True, -) -> np.ndarray: +) -> Array: """ Calculate the transposed rotation matrix. @@ -751,7 +761,7 @@ def _cal_hg( return hg -def _cal_grrg(hg: np.ndarray, axis_neuron: int) -> np.ndarray: +def _cal_grrg(hg: Array, axis_neuron: int) -> Array: """ Calculate the atomic invariant rep. @@ -780,15 +790,15 @@ def _cal_grrg(hg: np.ndarray, axis_neuron: int) -> np.ndarray: def symmetrization_op( - g: np.ndarray, - h: np.ndarray, - nlist_mask: np.ndarray, - sw: np.ndarray, + g: Array, + h: Array, + nlist_mask: Array, + sw: Array, axis_neuron: int, smooth: bool = True, epsilon: float = 1e-4, use_sqrt_nnei: bool = True, -) -> np.ndarray: +) -> Array: """ Symmetrization operator to obtain atomic invariant rep. 
@@ -847,6 +857,7 @@ def __init__( attnw_shift: float = 20.0, precision: str = "float64", seed: Optional[Union[int, list[int]]] = None, + trainable: bool = True, ) -> None: """Return neighbor-wise multi-head self-attention maps, with gate mechanism.""" super().__init__() @@ -859,6 +870,7 @@ def __init__( bias=False, precision=precision, seed=seed, + trainable=trainable, ) self.has_gate = has_gate self.smooth = smooth @@ -867,11 +879,11 @@ def __init__( def call( self, - g2: np.ndarray, # nf x nloc x nnei x ng2 - h2: np.ndarray, # nf x nloc x nnei x 3 - nlist_mask: np.ndarray, # nf x nloc x nnei - sw: np.ndarray, # nf x nloc x nnei - ) -> np.ndarray: + g2: Array, # nf x nloc x nnei x ng2 + h2: Array, # nf x nloc x nnei x 3 + nlist_mask: Array, # nf x nloc x nnei + sw: Array, # nf x nloc x nnei + ) -> Array: xp = array_api_compat.array_namespace(g2, h2, nlist_mask, sw) ( nf, @@ -970,6 +982,7 @@ def __init__( head_num: int, precision: str = "float64", seed: Optional[Union[int, list[int]]] = None, + trainable: bool = True, ) -> None: super().__init__() self.input_dim = input_dim @@ -980,20 +993,22 @@ def __init__( bias=False, precision=precision, seed=child_seed(seed, 0), + trainable=trainable, ) self.head_map = NativeLayer( input_dim * head_num, input_dim, precision=precision, seed=child_seed(seed, 1), + trainable=trainable, ) self.precision = precision def call( self, - AA: np.ndarray, # nf x nloc x nnei x nnei x nh - g2: np.ndarray, # nf x nloc x nnei x ng2 - ) -> np.ndarray: + AA: Array, # nf x nloc x nnei x nnei x nh + g2: Array, # nf x nloc x nnei x ng2 + ) -> Array: xp = array_api_compat.array_namespace(AA, g2) nf, nloc, nnei, ng2 = g2.shape nh = self.head_num @@ -1058,20 +1073,26 @@ def __init__( head_num: int, precision: str = "float64", seed: Optional[Union[int, list[int]]] = None, + trainable: bool = True, ) -> None: super().__init__() self.input_dim = input_dim self.head_num = head_num self.head_map = NativeLayer( - head_num, 1, bias=False, 
precision=precision, seed=seed + head_num, + 1, + bias=False, + precision=precision, + seed=seed, + trainable=trainable, ) self.precision = precision def call( self, - AA: np.ndarray, # nf x nloc x nnei x nnei x nh - h2: np.ndarray, # nf x nloc x nnei x 3 - ) -> np.ndarray: + AA: Array, # nf x nloc x nnei x nnei x nh + h2: Array, # nf x nloc x nnei x 3 + ) -> Array: xp = array_api_compat.array_namespace(AA, h2) nf, nloc, nnei, _ = h2.shape nh = self.head_num @@ -1133,6 +1154,7 @@ def __init__( attnw_shift: float = 20.0, precision: str = "float64", seed: Optional[Union[int, list[int]]] = None, + trainable: bool = True, ) -> None: super().__init__() self.input_dim = input_dim @@ -1144,6 +1166,7 @@ def __init__( bias=False, precision=precision, seed=child_seed(seed, 0), + trainable=trainable, ) self.mapkv = NativeLayer( input_dim, @@ -1151,12 +1174,14 @@ def __init__( bias=False, precision=precision, seed=child_seed(seed, 1), + trainable=trainable, ) self.head_map = NativeLayer( input_dim * head_num, input_dim, precision=precision, seed=child_seed(seed, 2), + trainable=trainable, ) self.smooth = smooth self.attnw_shift = attnw_shift @@ -1164,11 +1189,11 @@ def __init__( def call( self, - g1: np.ndarray, # nf x nloc x ng1 - gg1: np.ndarray, # nf x nloc x nnei x ng1 - nlist_mask: np.ndarray, # nf x nloc x nnei - sw: np.ndarray, # nf x nloc x nnei - ) -> np.ndarray: + g1: Array, # nf x nloc x ng1 + gg1: Array, # nf x nloc x nnei x ng1 + nlist_mask: Array, # nf x nloc x nnei + sw: Array, # nf x nloc x nnei + ) -> Array: xp = array_api_compat.array_namespace(g1, gg1, nlist_mask, sw) nf, nloc, nnei = nlist_mask.shape ni, nd, nh = self.input_dim, self.hidden_dim, self.head_num @@ -1263,12 +1288,12 @@ def deserialize(cls, data: dict) -> "LocalAtten": class RepformerLayer(NativeOP): def __init__( self, - rcut, - rcut_smth, + rcut: float, + rcut_smth: float, sel: int, ntypes: int, - g1_dim=128, - g2_dim=16, + g1_dim: int = 128, + g2_dim: int = 16, axis_neuron: int = 4, 
update_chnnl_2: bool = True, update_g1_has_conv: bool = True, @@ -1295,6 +1320,7 @@ def __init__( g1_out_mlp: bool = True, ln_eps: Optional[float] = 1e-5, seed: Optional[Union[int, list[int]]] = None, + trainable: bool = True, ) -> None: super().__init__() self.epsilon = 1e-4 # protection of 1./nnei @@ -1354,6 +1380,7 @@ def __init__( self.update_residual_init, precision=precision, seed=child_seed(seed, 0), + trainable=trainable, ) ) @@ -1363,6 +1390,7 @@ def __init__( g1_dim, precision=precision, seed=child_seed(seed, 1), + trainable=trainable, ) self.linear2 = None self.proj_g1g2 = None @@ -1379,6 +1407,7 @@ def __init__( g2_dim, precision=precision, seed=child_seed(seed, 2), + trainable=trainable, ) if self.update_style == "res_residual": g2_residual.append( @@ -1388,6 +1417,7 @@ def __init__( self.update_residual_init, precision=precision, seed=child_seed(seed, 3), + trainable=trainable, ) ) if self.g1_out_mlp: @@ -1396,6 +1426,7 @@ def __init__( g1_dim, precision=precision, seed=child_seed(seed, 15), + trainable=trainable, ) if self.update_style == "res_residual": g1_residual.append( @@ -1405,6 +1436,7 @@ def __init__( self.update_residual_init, precision=precision, seed=child_seed(seed, 16), + trainable=trainable, ) ) else: @@ -1417,6 +1449,7 @@ def __init__( bias=False, precision=precision, seed=child_seed(seed, 4), + trainable=trainable, ) else: self.proj_g1g2 = NativeLayer( @@ -1425,6 +1458,7 @@ def __init__( bias=False, precision=precision, seed=child_seed(seed, 4), + trainable=trainable, ) if self.update_style == "res_residual": g1_residual.append( @@ -1434,6 +1468,7 @@ def __init__( self.update_residual_init, precision=precision, seed=child_seed(seed, 17), + trainable=trainable, ) ) if self.update_g2_has_g1g1: @@ -1443,6 +1478,7 @@ def __init__( bias=False, precision=precision, seed=child_seed(seed, 5), + trainable=trainable, ) if self.update_style == "res_residual": g2_residual.append( @@ -1452,6 +1488,7 @@ def __init__( self.update_residual_init, 
precision=precision, seed=child_seed(seed, 6), + trainable=trainable, ) ) if self.update_g2_has_attn or self.update_h2: @@ -1463,10 +1500,15 @@ def __init__( self.smooth, precision=precision, seed=child_seed(seed, 7), + trainable=trainable, ) if self.update_g2_has_attn: self.attn2_mh_apply = Atten2MultiHeadApply( - g2_dim, attn2_nhead, precision=precision, seed=child_seed(seed, 8) + g2_dim, + attn2_nhead, + precision=precision, + seed=child_seed(seed, 8), + trainable=trainable, ) self.attn2_lm = LayerNorm( g2_dim, @@ -1483,12 +1525,17 @@ def __init__( self.update_residual_init, precision=precision, seed=child_seed(seed, 10), + trainable=trainable, ) ) if self.update_h2: self.attn2_ev_apply = Atten2EquiVarApply( - g2_dim, attn2_nhead, precision=precision, seed=child_seed(seed, 11) + g2_dim, + attn2_nhead, + precision=precision, + seed=child_seed(seed, 11), + trainable=trainable, ) if self.update_style == "res_residual": h2_residual.append( @@ -1498,6 +1545,7 @@ def __init__( self.update_residual_init, precision=precision, seed=child_seed(seed, 12), + trainable=trainable, ) ) if self.update_g1_has_attn: @@ -1508,6 +1556,7 @@ def __init__( self.smooth, precision=precision, seed=child_seed(seed, 13), + trainable=trainable, ) if self.update_style == "res_residual": g1_residual.append( @@ -1517,6 +1566,7 @@ def __init__( self.update_residual_init, precision=precision, seed=child_seed(seed, 14), + trainable=trainable, ) ) @@ -1536,9 +1586,9 @@ def cal_1_dim(self, g1d: int, g2d: int, ax: int) -> int: def _update_h2( self, - h2: np.ndarray, - attn: np.ndarray, - ) -> np.ndarray: + h2: Array, + attn: Array, + ) -> Array: """ Calculate the attention weights update for pair-wise equivariant rep. 
@@ -1556,11 +1606,11 @@ def _update_h2( def _update_g1_conv( self, - gg1: np.ndarray, - g2: np.ndarray, - nlist_mask: np.ndarray, - sw: np.ndarray, - ) -> np.ndarray: + gg1: Array, + g2: Array, + nlist_mask: Array, + sw: Array, + ) -> Array: """ Calculate the convolution update for atomic invariant rep. @@ -1614,11 +1664,11 @@ def _update_g1_conv( def _update_g2_g1g1( self, - g1: np.ndarray, # nf x nloc x ng1 - gg1: np.ndarray, # nf x nloc x nnei x ng1 - nlist_mask: np.ndarray, # nf x nloc x nnei - sw: np.ndarray, # nf x nloc x nnei - ) -> np.ndarray: + g1: Array, # nf x nloc x ng1 + gg1: Array, # nf x nloc x nnei x ng1 + nlist_mask: Array, # nf x nloc x nnei + sw: Array, # nf x nloc x nnei + ) -> Array: """ Update the g2 using element-wise dot g1_i * g1_j. @@ -1644,13 +1694,13 @@ def _update_g2_g1g1( def call( self, - g1_ext: np.ndarray, # nf x nall x ng1 - g2: np.ndarray, # nf x nloc x nnei x ng2 - h2: np.ndarray, # nf x nloc x nnei x 3 - nlist: np.ndarray, # nf x nloc x nnei - nlist_mask: np.ndarray, # nf x nloc x nnei - sw: np.ndarray, # switch func, nf x nloc x nnei - ): + g1_ext: Array, # nf x nall x ng1 + g2: Array, # nf x nloc x nnei x ng2 + h2: Array, # nf x nloc x nnei x 3 + nlist: Array, # nf x nloc x nnei + nlist_mask: Array, # nf x nloc x nnei + sw: Array, # switch func, nf x nloc x nnei + ) -> tuple[Array, Array]: """ Parameters ---------- @@ -1682,10 +1732,10 @@ def call( assert (nf, nloc) == g1.shape[:2] assert (nf, nloc, nnei) == h2.shape[:3] - g2_update: list[np.ndarray] = [g2] - h2_update: list[np.ndarray] = [h2] - g1_update: list[np.ndarray] = [g1] - g1_mlp: list[np.ndarray] = [g1] if not self.g1_out_mlp else [] + g2_update: list[Array] = [g2] + h2_update: list[Array] = [h2] + g1_update: list[Array] = [g1] + g1_mlp: list[Array] = [g1] if not self.g1_out_mlp else [] if self.g1_out_mlp: assert self.g1_self_mlp is not None g1_self_mlp = self.act(self.g1_self_mlp(g1)) @@ -1787,15 +1837,15 @@ def call( def list_update_res_avg( self, - update_list: 
list[np.ndarray], - ) -> np.ndarray: + update_list: list[Array], + ) -> Array: nitem = len(update_list) uu = update_list[0] for ii in range(1, nitem): uu = uu + update_list[ii] return uu / (float(nitem) ** 0.5) - def list_update_res_incr(self, update_list: list[np.ndarray]) -> np.ndarray: + def list_update_res_incr(self, update_list: list[Array]) -> Array: nitem = len(update_list) uu = update_list[0] scale = 1.0 / (float(nitem - 1) ** 0.5) if nitem > 1 else 0.0 @@ -1804,8 +1854,8 @@ def list_update_res_incr(self, update_list: list[np.ndarray]) -> np.ndarray: return uu def list_update_res_residual( - self, update_list: list[np.ndarray], update_name: str = "g1" - ) -> np.ndarray: + self, update_list: list[Array], update_name: str = "g1" + ) -> Array: nitem = len(update_list) uu = update_list[0] if update_name == "g1": @@ -1821,9 +1871,7 @@ def list_update_res_residual( raise NotImplementedError return uu - def list_update( - self, update_list: list[np.ndarray], update_name: str = "g1" - ) -> np.ndarray: + def list_update(self, update_list: list[Array], update_name: str = "g1") -> Array: if self.update_style == "res_avg": return self.list_update_res_avg(update_list) elif self.update_style == "res_incr": diff --git a/deepmd/dpmodel/descriptor/se_atten_v2.py b/deepmd/dpmodel/descriptor/se_atten_v2.py index 897863ec0f..f6c497d151 100644 --- a/deepmd/dpmodel/descriptor/se_atten_v2.py +++ b/deepmd/dpmodel/descriptor/se_atten_v2.py @@ -56,7 +56,7 @@ def __init__( set_davg_zero: bool = False, activation_function: str = "tanh", precision: str = DEFAULT_PRECISION, - scaling_factor=1.0, + scaling_factor: float = 1.0, normalize: bool = True, temperature: Optional[float] = None, trainable_ln: bool = True, diff --git a/deepmd/dpmodel/descriptor/se_e2_a.py b/deepmd/dpmodel/descriptor/se_e2_a.py index bd72d936e3..7cdfa963ee 100644 --- a/deepmd/dpmodel/descriptor/se_e2_a.py +++ b/deepmd/dpmodel/descriptor/se_e2_a.py @@ -16,6 +16,9 @@ PRECISION_DICT, NativeOP, ) +from 
deepmd.dpmodel.array_api import ( + Array, +) from deepmd.dpmodel.common import ( cast_precision, to_numpy_array, @@ -207,6 +210,7 @@ def __init__( self.resnet_dt, self.precision, seed=child_seed(seed, ii), + trainable=trainable, ) self.embeddings = embeddings self.env_mat = EnvMat(self.rcut, self.rcut_smth, protection=self.env_protection) @@ -221,7 +225,7 @@ def __init__( self.sel_cumsum = [0, *np.cumsum(self.sel).tolist()] self.ndescrpt = self.nnei * 4 - def __setitem__(self, key, value) -> None: + def __setitem__(self, key: str, value: Array) -> None: if key in ("avg", "data_avg", "davg"): self.davg = value elif key in ("std", "data_std", "dstd"): @@ -229,7 +233,7 @@ def __setitem__(self, key, value) -> None: else: raise KeyError(key) - def __getitem__(self, key): + def __getitem__(self, key: str) -> Array: if key in ("avg", "data_avg", "davg"): return self.davg elif key in ("std", "data_std", "dstd"): @@ -238,19 +242,19 @@ def __getitem__(self, key): raise KeyError(key) @property - def dim_out(self): + def dim_out(self) -> int: """Returns the output dimension of this descriptor.""" return self.get_dim_out() - def get_dim_out(self): + def get_dim_out(self) -> int: """Returns the output dimension of this descriptor.""" return self.neuron[-1] * self.axis_neuron - def get_dim_emb(self): + def get_dim_emb(self) -> int: """Returns the embedding (g2) dimension of this descriptor.""" return self.neuron[-1] - def get_rcut(self): + def get_rcut(self) -> float: """Returns cutoff radius.""" return self.rcut @@ -258,7 +262,7 @@ def get_rcut_smth(self) -> float: """Returns the radius where the neighbor information starts to smoothly decay to 0.""" return self.rcut_smth - def get_sel(self): + def get_sel(self) -> list[int]: """Returns cutoff radius.""" return self.sel @@ -280,7 +284,9 @@ def get_env_protection(self) -> float: """Returns the protection of building environment matrix.""" return self.env_protection - def share_params(self, base_class, shared_level, resume=False) 
-> NoReturn: + def share_params( + self, base_class: Any, shared_level: Any, resume: bool = False + ) -> NoReturn: """ Share the parameters of self to the base_class with shared_level during multitask training. If not start from checkpoint (resume is False), @@ -289,7 +295,7 @@ def share_params(self, base_class, shared_level, resume=False) -> NoReturn: raise NotImplementedError def change_type_map( - self, type_map: list[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat: Optional[Any] = None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. @@ -350,22 +356,22 @@ def compute_input_stats( def set_stat_mean_and_stddev( self, - mean: np.ndarray, - stddev: np.ndarray, + mean: Array, + stddev: Array, ) -> None: """Update mean and stddev for descriptor.""" self.davg = mean self.dstd = stddev - def get_stat_mean_and_stddev(self) -> tuple[np.ndarray, np.ndarray]: + def get_stat_mean_and_stddev(self) -> tuple[Array, Array]: """Get mean and stddev for descriptor.""" return self.davg, self.dstd def cal_g( self, - ss, - embedding_idx, - ): + ss: Array, + embedding_idx: int, + ) -> Array: xp = array_api_compat.array_namespace(ss) nf_times_nloc, nnei = ss.shape[0:2] ss = xp.reshape(ss, (nf_times_nloc, nnei, 1)) @@ -383,11 +389,11 @@ def reinit_exclude( @cast_precision def call( self, - coord_ext, - atype_ext, - nlist, - mapping: Optional[np.ndarray] = None, - ): + coord_ext: Array, + atype_ext: Array, + nlist: Array, + mapping: Optional[Array] = None, + ) -> Array: """Compute the descriptor. Parameters @@ -518,7 +524,7 @@ def update_sel( train_data: DeepmdDataSystem, type_map: Optional[list[str]], local_jdata: dict, - ) -> tuple[dict, Optional[float]]: + ) -> tuple[Array, Array]: """Update the selection and perform neighbor statistics. 
Parameters @@ -548,11 +554,11 @@ class DescrptSeAArrayAPI(DescrptSeA): @cast_precision def call( self, - coord_ext, - atype_ext, - nlist, - mapping: Optional[np.ndarray] = None, - ): + coord_ext: Array, + atype_ext: Array, + nlist: Array, + mapping: Optional[Array] = None, + ) -> Array: """Compute the descriptor. Parameters diff --git a/deepmd/dpmodel/descriptor/se_r.py b/deepmd/dpmodel/descriptor/se_r.py index 5b2931b23f..4287083442 100644 --- a/deepmd/dpmodel/descriptor/se_r.py +++ b/deepmd/dpmodel/descriptor/se_r.py @@ -15,6 +15,9 @@ PRECISION_DICT, NativeOP, ) +from deepmd.dpmodel.array_api import ( + Array, +) from deepmd.dpmodel.common import ( cast_precision, get_xp_precision, @@ -166,6 +169,7 @@ def __init__( self.resnet_dt, self.precision, seed=child_seed(seed, ii), + trainable=trainable, ) self.embeddings = embeddings self.env_mat = EnvMat(self.rcut, self.rcut_smth, protection=self.env_protection) @@ -180,7 +184,7 @@ def __init__( self.sel_cumsum = [0, *np.cumsum(self.sel).tolist()] self.ndescrpt = self.nnei - def __setitem__(self, key, value) -> None: + def __setitem__(self, key: str, value: Array) -> None: if key in ("avg", "data_avg", "davg"): self.davg = value elif key in ("std", "data_std", "dstd"): @@ -188,7 +192,7 @@ def __setitem__(self, key, value) -> None: else: raise KeyError(key) - def __getitem__(self, key): + def __getitem__(self, key: str) -> Array: if key in ("avg", "data_avg", "davg"): return self.davg elif key in ("std", "data_std", "dstd"): @@ -197,11 +201,11 @@ def __getitem__(self, key): raise KeyError(key) @property - def dim_out(self): + def dim_out(self) -> int: """Returns the output dimension of this descriptor.""" return self.get_dim_out() - def get_dim_out(self): + def get_dim_out(self) -> int: """Returns the output dimension of this descriptor.""" return self.neuron[-1] @@ -209,7 +213,7 @@ def get_dim_emb(self) -> NoReturn: """Returns the embedding (g2) dimension of this descriptor.""" raise NotImplementedError - def 
get_rcut(self): + def get_rcut(self) -> float: """Returns cutoff radius.""" return self.rcut @@ -217,7 +221,7 @@ def get_rcut_smth(self) -> float: """Returns the radius where the neighbor information starts to smoothly decay to 0.""" return self.rcut_smth - def get_sel(self): + def get_sel(self) -> list[int]: """Returns cutoff radius.""" return self.sel @@ -239,7 +243,9 @@ def get_env_protection(self) -> float: """Returns the protection of building environment matrix.""" return self.env_protection - def share_params(self, base_class, shared_level, resume=False) -> NoReturn: + def share_params( + self, base_class: Any, shared_level: Any, resume: bool = False + ) -> NoReturn: """ Share the parameters of self to the base_class with shared_level during multitask training. If not start from checkpoint (resume is False), @@ -248,7 +254,7 @@ def share_params(self, base_class, shared_level, resume=False) -> NoReturn: raise NotImplementedError def change_type_map( - self, type_map: list[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat: Optional[Any] = None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. 
@@ -309,22 +315,22 @@ def compute_input_stats( def set_stat_mean_and_stddev( self, - mean: np.ndarray, - stddev: np.ndarray, + mean: Array, + stddev: Array, ) -> None: """Update mean and stddev for descriptor.""" self.davg = mean self.dstd = stddev - def get_stat_mean_and_stddev(self) -> tuple[np.ndarray, np.ndarray]: + def get_stat_mean_and_stddev(self) -> tuple[Array, Array]: """Get mean and stddev for descriptor.""" return self.davg, self.dstd def cal_g( self, - ss, - ll, - ): + ss: Array, + ll: int, + ) -> Array: xp = array_api_compat.array_namespace(ss) nf, nloc, nnei = ss.shape[0:3] ss = xp.reshape(ss, (nf, nloc, nnei, 1)) @@ -335,11 +341,11 @@ def cal_g( @cast_precision def call( self, - coord_ext, - atype_ext, - nlist, - mapping: Optional[np.ndarray] = None, - ): + coord_ext: Array, + atype_ext: Array, + nlist: Array, + mapping: Optional[Array] = None, + ) -> Array: """Compute the descriptor. Parameters @@ -455,7 +461,7 @@ def update_sel( train_data: DeepmdDataSystem, type_map: Optional[list[str]], local_jdata: dict, - ) -> tuple[dict, Optional[float]]: + ) -> tuple[Array, Array]: """Update the selection and perform neighbor statistics. 
Parameters diff --git a/deepmd/dpmodel/descriptor/se_t.py b/deepmd/dpmodel/descriptor/se_t.py index fb30f04961..cfeb5d7735 100644 --- a/deepmd/dpmodel/descriptor/se_t.py +++ b/deepmd/dpmodel/descriptor/se_t.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import itertools from typing import ( + Any, Callable, NoReturn, Optional, @@ -15,6 +16,9 @@ PRECISION_DICT, NativeOP, ) +from deepmd.dpmodel.array_api import ( + Array, +) from deepmd.dpmodel.common import ( cast_precision, get_xp_precision, @@ -147,6 +151,7 @@ def __init__( self.resnet_dt, self.precision, seed=child_seed(self.seed, ii), + trainable=trainable, ) self.embeddings = embeddings self.env_mat = EnvMat(self.rcut, self.rcut_smth, protection=self.env_protection) @@ -160,7 +165,7 @@ def __init__( self.orig_sel = self.sel self.ndescrpt = self.nnei * 4 - def __setitem__(self, key, value) -> None: + def __setitem__(self, key: str, value: Array) -> None: if key in ("avg", "data_avg", "davg"): self.davg = value elif key in ("std", "data_std", "dstd"): @@ -168,7 +173,7 @@ def __setitem__(self, key, value) -> None: else: raise KeyError(key) - def __getitem__(self, key): + def __getitem__(self, key: str) -> Array: if key in ("avg", "data_avg", "davg"): return self.davg elif key in ("std", "data_std", "dstd"): @@ -177,12 +182,12 @@ def __getitem__(self, key): raise KeyError(key) @property - def dim_out(self): + def dim_out(self) -> int: """Returns the output dimension of this descriptor.""" return self.get_dim_out() def change_type_map( - self, type_map: list[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat: Any = None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. 
@@ -193,15 +198,15 @@ def change_type_map( "We may consider adding this support in the future if there is a clear demand for it." ) - def get_dim_out(self): + def get_dim_out(self) -> int: """Returns the output dimension of this descriptor.""" return self.neuron[-1] - def get_dim_emb(self): + def get_dim_emb(self) -> int: """Returns the embedding (g2) dimension of this descriptor.""" return self.neuron[-1] - def get_rcut(self): + def get_rcut(self) -> float: """Returns cutoff radius.""" return self.rcut @@ -209,7 +214,7 @@ def get_rcut_smth(self) -> float: """Returns the radius where the neighbor information starts to smoothly decay to 0.""" return self.rcut_smth - def get_sel(self): + def get_sel(self) -> list: """Returns cutoff radius.""" return self.sel @@ -231,7 +236,9 @@ def get_env_protection(self) -> float: """Returns the protection of building environment matrix.""" return self.env_protection - def share_params(self, base_class, shared_level, resume=False) -> NoReturn: + def share_params( + self, base_class: Any, shared_level: int, resume: bool = False + ) -> NoReturn: """ Share the parameters of self to the base_class with shared_level during multitask training. If not start from checkpoint (resume is False), @@ -289,14 +296,14 @@ def compute_input_stats( def set_stat_mean_and_stddev( self, - mean: np.ndarray, - stddev: np.ndarray, + mean: Array, + stddev: Array, ) -> None: """Update mean and stddev for descriptor.""" self.davg = mean self.dstd = stddev - def get_stat_mean_and_stddev(self) -> tuple[np.ndarray, np.ndarray]: + def get_stat_mean_and_stddev(self) -> tuple[Array, Array]: """Get mean and stddev for descriptor.""" return self.davg, self.dstd @@ -310,11 +317,11 @@ def reinit_exclude( @cast_precision def call( self, - coord_ext, - atype_ext, - nlist, - mapping: Optional[np.ndarray] = None, - ): + coord_ext: Array, + atype_ext: Array, + nlist: Array, + mapping: Optional[Array] = None, + ) -> tuple[Array, Array]: """Compute the descriptor. 
Parameters @@ -453,7 +460,7 @@ def update_sel( train_data: DeepmdDataSystem, type_map: Optional[list[str]], local_jdata: dict, - ) -> tuple[dict, Optional[float]]: + ) -> tuple[Array, Array]: """Update the selection and perform neighbor statistics. Parameters diff --git a/deepmd/dpmodel/descriptor/se_t_tebd.py b/deepmd/dpmodel/descriptor/se_t_tebd.py index ff26024aad..b9e0e62531 100644 --- a/deepmd/dpmodel/descriptor/se_t_tebd.py +++ b/deepmd/dpmodel/descriptor/se_t_tebd.py @@ -14,6 +14,7 @@ NativeOP, ) from deepmd.dpmodel.array_api import ( + Array, xp_take_along_axis, ) from deepmd.dpmodel.common import ( @@ -138,7 +139,7 @@ def __init__( type_map: Optional[list[str]] = None, concat_output_tebd: bool = True, use_econf_tebd: bool = False, - use_tebd_bias=False, + use_tebd_bias: bool = False, smooth: bool = True, ) -> None: self.se_ttebd = DescrptBlockSeTTebd( @@ -157,6 +158,7 @@ def __init__( env_protection=env_protection, smooth=smooth, seed=child_seed(seed, 0), + trainable=trainable, ) self.use_econf_tebd = use_econf_tebd self.type_map = type_map @@ -171,6 +173,7 @@ def __init__( use_tebd_bias=use_tebd_bias, type_map=type_map, seed=child_seed(seed, 1), + trainable=trainable, ) self.tebd_dim = tebd_dim self.concat_output_tebd = concat_output_tebd @@ -235,7 +238,9 @@ def get_env_protection(self) -> float: """Returns the protection of building environment matrix.""" return self.se_ttebd.get_env_protection() - def share_params(self, base_class, shared_level, resume=False) -> NoReturn: + def share_params( + self, base_class: "DescrptSeTTebd", shared_level: int, resume: bool = False + ) -> NoReturn: """ Share the parameters of self to the base_class with shared_level during multitask training. 
If not start from checkpoint (resume is False), @@ -244,18 +249,18 @@ def share_params(self, base_class, shared_level, resume=False) -> NoReturn: raise NotImplementedError @property - def dim_out(self): + def dim_out(self) -> int: return self.get_dim_out() @property - def dim_emb(self): + def dim_emb(self) -> int: return self.get_dim_emb() def compute_input_stats( self, merged: Union[Callable[[], list[dict]], list[dict]], path: Optional[DPPath] = None, - ): + ) -> None: """ Compute the input statistics (e.g. mean and stddev) for the descriptors from packed data. @@ -276,19 +281,21 @@ def compute_input_stats( def set_stat_mean_and_stddev( self, - mean: np.ndarray, - stddev: np.ndarray, + mean: Array, + stddev: Array, ) -> None: """Update mean and stddev for descriptor.""" self.se_ttebd.mean = mean self.se_ttebd.stddev = stddev - def get_stat_mean_and_stddev(self) -> tuple[np.ndarray, np.ndarray]: + def get_stat_mean_and_stddev(self) -> tuple[Array, Array]: """Get mean and stddev for descriptor.""" return self.se_ttebd.mean, self.se_ttebd.stddev def change_type_map( - self, type_map: list[str], model_with_new_type_stat=None + self, + type_map: list[str], + model_with_new_type_stat: Optional["DescrptSeTTebd"] = None, ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. @@ -317,11 +324,11 @@ def change_type_map( @cast_precision def call( self, - coord_ext, - atype_ext, - nlist, - mapping: Optional[np.ndarray] = None, - ): + coord_ext: Array, + atype_ext: Array, + nlist: Array, + mapping: Optional[Array] = None, + ) -> tuple[Array, Array]: """Compute the descriptor. 
Parameters @@ -358,7 +365,7 @@ def call( type_embedding = self.type_embedding.call() # nf x nall x tebd_dim atype_embd_ext = xp.reshape( - xp.take(type_embedding, xp.reshape(atype_ext, [-1]), axis=0), + xp.take(type_embedding, xp.reshape(atype_ext, (-1,)), axis=0), (nf, nall, self.tebd_dim), ) # nfnl x tebd_dim @@ -451,7 +458,7 @@ def update_sel( train_data: DeepmdDataSystem, type_map: Optional[list[str]], local_jdata: dict, - ) -> tuple[dict, Optional[float]]: + ) -> tuple[Array, Array]: """Update the selection and perform neighbor statistics. Parameters @@ -490,13 +497,14 @@ def __init__( tebd_dim: int = 8, tebd_input_mode: str = "concat", set_davg_zero: bool = True, - activation_function="tanh", + activation_function: str = "tanh", precision: str = "float64", resnet_dt: bool = False, exclude_types: list[tuple[int, int]] = [], env_protection: float = 0.0, smooth: bool = True, seed: Optional[Union[int, list[int]]] = None, + trainable: bool = True, ) -> None: self.rcut = rcut self.rcut_smth = rcut_smth @@ -542,6 +550,7 @@ def __init__( self.resnet_dt, self.precision, seed=child_seed(seed, 0), + trainable=trainable, ) self.embeddings = embeddings if self.tebd_input_mode in ["strip"]: @@ -557,6 +566,7 @@ def __init__( self.resnet_dt, self.precision, seed=child_seed(seed, 1), + trainable=trainable, ) self.embeddings_strip = embeddings_strip else: @@ -600,7 +610,7 @@ def get_dim_emb(self) -> int: """Returns the output dimension of embedding.""" return self.filter_neuron[-1] - def __setitem__(self, key, value) -> None: + def __setitem__(self, key: str, value: Array) -> None: if key in ("avg", "data_avg", "davg"): self.mean = value elif key in ("std", "data_std", "dstd"): @@ -608,7 +618,7 @@ def __setitem__(self, key, value) -> None: else: raise KeyError(key) - def __getitem__(self, key): + def __getitem__(self, key: str) -> Array: if key in ("avg", "data_avg", "davg"): return self.mean elif key in ("std", "data_std", "dstd"): @@ -633,17 +643,17 @@ def 
get_env_protection(self) -> float: return self.env_protection @property - def dim_out(self): + def dim_out(self) -> int: """Returns the output dimension of this descriptor.""" return self.filter_neuron[-1] @property - def dim_in(self): + def dim_in(self) -> int: """Returns the atomic input dimension of this descriptor.""" return self.tebd_dim @property - def dim_emb(self): + def dim_emb(self) -> int: """Returns the output dimension of embedding.""" return self.get_dim_emb() @@ -704,18 +714,18 @@ def reinit_exclude( def cal_g( self, - ss, - embedding_idx, - ): + ss: Array, + embedding_idx: int, + ) -> Array: # nfnl x nt_i x nt_j x ng gg = self.embeddings[embedding_idx].call(ss) return gg def cal_g_strip( self, - ss, - embedding_idx, - ): + ss: Array, + embedding_idx: int, + ) -> Array: assert self.embeddings_strip is not None # nfnl x nt_i x nt_j x ng gg = self.embeddings_strip[embedding_idx].call(ss) @@ -723,13 +733,13 @@ def cal_g_strip( def call( self, - nlist: np.ndarray, - coord_ext: np.ndarray, - atype_ext: np.ndarray, - atype_embd_ext: Optional[np.ndarray] = None, - mapping: Optional[np.ndarray] = None, - type_embedding: Optional[np.ndarray] = None, - ): + nlist: Array, + coord_ext: Array, + atype_ext: Array, + atype_embd_ext: Optional[Array] = None, + mapping: Optional[Array] = None, + type_embedding: Optional[Array] = None, + ) -> tuple[Array, Array]: xp = array_api_compat.array_namespace(nlist, coord_ext, atype_ext) # nf x nloc x nnei x 4 dmatrix, diff, sw = self.env_mat.call( diff --git a/deepmd/dpmodel/fitting/dipole_fitting.py b/deepmd/dpmodel/fitting/dipole_fitting.py index fcaea43338..e6bea408f8 100644 --- a/deepmd/dpmodel/fitting/dipole_fitting.py +++ b/deepmd/dpmodel/fitting/dipole_fitting.py @@ -6,11 +6,13 @@ ) import array_api_compat -import numpy as np from deepmd.dpmodel import ( DEFAULT_PRECISION, ) +from deepmd.dpmodel.array_api import ( + Array, +) from deepmd.dpmodel.common import ( cast_precision, ) @@ -84,6 +86,9 @@ class 
DipoleFitting(GeneralFitting): Only reducible variable are differentiable. type_map: list[str], Optional A list of strings. Give the name to each type of atoms. + default_fparam: list[float], optional + The default frame parameter. If set, when `fparam.npy` files are not included in the data system, + this value will be used as the default value for the frame parameter in the fitting net. """ def __init__( @@ -110,6 +115,7 @@ def __init__( c_differentiable: bool = True, type_map: Optional[list[str]] = None, seed: Optional[Union[int, list[int]]] = None, + default_fparam: Optional[list[float]] = None, ) -> None: if tot_ener_zero: raise NotImplementedError("tot_ener_zero is not implemented") @@ -144,9 +150,10 @@ def __init__( exclude_types=exclude_types, type_map=type_map, seed=seed, + default_fparam=default_fparam, ) - def _net_out_dim(self): + def _net_out_dim(self) -> int: """Set the FittingNet output dim.""" return self.embedding_width @@ -161,12 +168,12 @@ def serialize(self) -> dict: @classmethod def deserialize(cls, data: dict) -> "GeneralFitting": data = data.copy() - check_version_compatibility(data.pop("@version", 1), 3, 1) + check_version_compatibility(data.pop("@version", 1), 4, 1) var_name = data.pop("var_name", None) assert var_name == "dipole" return super().deserialize(data) - def output_def(self): + def output_def(self) -> FittingOutputDef: return FittingOutputDef( [ OutputVariableDef( @@ -182,14 +189,14 @@ def output_def(self): @cast_precision def call( self, - descriptor: np.ndarray, - atype: np.ndarray, - gr: Optional[np.ndarray] = None, - g2: Optional[np.ndarray] = None, - h2: Optional[np.ndarray] = None, - fparam: Optional[np.ndarray] = None, - aparam: Optional[np.ndarray] = None, - ) -> dict[str, np.ndarray]: + descriptor: Array, + atype: Array, + gr: Optional[Array] = None, + g2: Optional[Array] = None, + h2: Optional[Array] = None, + fparam: Optional[Array] = None, + aparam: Optional[Array] = None, + ) -> dict[str, Array]: """Calculate the 
fitting. Parameters diff --git a/deepmd/dpmodel/fitting/dos_fitting.py b/deepmd/dpmodel/fitting/dos_fitting.py index 2f6df77eac..b444e8ae13 100644 --- a/deepmd/dpmodel/fitting/dos_fitting.py +++ b/deepmd/dpmodel/fitting/dos_fitting.py @@ -7,6 +7,9 @@ import numpy as np +from deepmd.dpmodel.array_api import ( + Array, +) from deepmd.dpmodel.common import ( DEFAULT_PRECISION, to_numpy_array, @@ -37,7 +40,7 @@ def __init__( numb_fparam: int = 0, numb_aparam: int = 0, dim_case_embd: int = 0, - bias_dos: Optional[np.ndarray] = None, + bias_dos: Optional[Array] = None, rcond: Optional[float] = None, trainable: Union[bool, list[bool]] = True, activation_function: str = "tanh", @@ -46,6 +49,7 @@ def __init__( exclude_types: list[int] = [], type_map: Optional[list[str]] = None, seed: Optional[Union[int, list[int]]] = None, + default_fparam: Optional[list] = None, ) -> None: if bias_dos is not None: self.bias_dos = bias_dos @@ -70,12 +74,13 @@ def __init__( exclude_types=exclude_types, type_map=type_map, seed=seed, + default_fparam=default_fparam, ) @classmethod def deserialize(cls, data: dict) -> "GeneralFitting": data = data.copy() - check_version_compatibility(data.pop("@version", 1), 3, 1) + check_version_compatibility(data.pop("@version", 1), 4, 1) data["numb_dos"] = data.pop("dim_out") data.pop("tot_ener_zero", None) data.pop("var_name", None) diff --git a/deepmd/dpmodel/fitting/ener_fitting.py b/deepmd/dpmodel/fitting/ener_fitting.py index 6435b6468f..794c074485 100644 --- a/deepmd/dpmodel/fitting/ener_fitting.py +++ b/deepmd/dpmodel/fitting/ener_fitting.py @@ -46,6 +46,7 @@ def __init__( exclude_types: list[int] = [], type_map: Optional[list[str]] = None, seed: Optional[Union[int, list[int]]] = None, + default_fparam: Optional[list] = None, ) -> None: super().__init__( var_name="energy", @@ -70,12 +71,13 @@ def __init__( exclude_types=exclude_types, type_map=type_map, seed=seed, + default_fparam=default_fparam, ) @classmethod def deserialize(cls, data: dict) -> 
"GeneralFitting": data = data.copy() - check_version_compatibility(data.pop("@version", 1), 3, 1) + check_version_compatibility(data.pop("@version", 1), 4, 1) data.pop("var_name") data.pop("dim_out") return super().deserialize(data) diff --git a/deepmd/dpmodel/fitting/general_fitting.py b/deepmd/dpmodel/fitting/general_fitting.py index cd0d4e72d4..a380717927 100644 --- a/deepmd/dpmodel/fitting/general_fitting.py +++ b/deepmd/dpmodel/fitting/general_fitting.py @@ -16,6 +16,9 @@ PRECISION_DICT, NativeOP, ) +from deepmd.dpmodel.array_api import ( + Array, +) from deepmd.dpmodel.common import ( get_xp_precision, to_numpy_array, @@ -94,6 +97,9 @@ class GeneralFitting(NativeOP, BaseFitting): A list of strings. Give the name to each type of atoms. seed: Optional[Union[int, list[int]]] Random seed for initializing the network parameters. + default_fparam: list[float], optional + The default frame parameter. If set, when `fparam.npy` files are not included in the data system, + this value will be used as the default value for the frame parameter in the fitting net. 
""" def __init__( @@ -106,7 +112,7 @@ def __init__( numb_fparam: int = 0, numb_aparam: int = 0, dim_case_embd: int = 0, - bias_atom_e: Optional[np.ndarray] = None, + bias_atom_e: Optional[Array] = None, rcond: Optional[float] = None, tot_ener_zero: bool = False, trainable: Optional[list[bool]] = None, @@ -120,6 +126,7 @@ def __init__( remove_vaccum_contribution: Optional[list[bool]] = None, type_map: Optional[list[str]] = None, seed: Optional[Union[int, list[int]]] = None, + default_fparam: Optional[list[float]] = None, ) -> None: self.var_name = var_name self.ntypes = ntypes @@ -129,6 +136,7 @@ def __init__( self.numb_fparam = numb_fparam self.numb_aparam = numb_aparam self.dim_case_embd = dim_case_embd + self.default_fparam = default_fparam self.rcond = rcond self.tot_ener_zero = tot_ener_zero self.trainable = trainable @@ -177,6 +185,15 @@ def __init__( self.case_embd = np.zeros(self.dim_case_embd, dtype=self.prec) else: self.case_embd = None + + if self.default_fparam is not None: + if self.numb_fparam > 0: + assert len(self.default_fparam) == self.numb_fparam, ( + "default_fparam length mismatch!" + ) + self.default_fparam_tensor = np.array(self.default_fparam, dtype=self.prec) + else: + self.default_fparam_tensor = None # init networks in_dim = ( self.dim_descrpt @@ -198,13 +215,14 @@ def __init__( self.precision, bias_out=True, seed=child_seed(seed, ii), + trainable=trainable, ) for ii in range(self.ntypes if not self.mixed_types else 1) ], ) @abstractmethod - def _net_out_dim(self): + def _net_out_dim(self) -> int: """Set the FittingNet output dim.""" pass @@ -216,6 +234,10 @@ def get_dim_aparam(self) -> int: """Get the number (dimension) of atomic parameters of this atomic model.""" return self.numb_aparam + def has_default_fparam(self) -> bool: + """Check if the fitting has default frame parameters.""" + return self.default_fparam is not None + def get_sel_type(self) -> list[int]: """Get the selected atom types of this model. 
@@ -229,7 +251,7 @@ def get_type_map(self) -> list[str]: """Get the name to each type of atoms.""" return self.type_map - def set_case_embd(self, case_idx: int): + def set_case_embd(self, case_idx: int) -> None: """ Set the case embedding of this fitting net by the given case_idx, typically concatenated with the output of the descriptor and fed into the fitting net. @@ -237,7 +259,7 @@ def set_case_embd(self, case_idx: int): self.case_embd = np.eye(self.dim_case_embd, dtype=self.prec)[case_idx] def change_type_map( - self, type_map: list[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat: Optional[Any] = None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. @@ -258,7 +280,7 @@ def change_type_map( ) self.bias_atom_e = self.bias_atom_e[remap_index] - def __setitem__(self, key, value) -> None: + def __setitem__(self, key: str, value: Any) -> None: if key in ["bias_atom_e"]: self.bias_atom_e = value elif key in ["fparam_avg"]: @@ -273,10 +295,12 @@ def __setitem__(self, key, value) -> None: self.case_embd = value elif key in ["scale"]: self.scale = value + elif key in ["default_fparam_tensor"]: + self.default_fparam_tensor = value else: raise KeyError(key) - def __getitem__(self, key): + def __getitem__(self, key: str) -> Any: if key in ["bias_atom_e"]: return self.bias_atom_e elif key in ["fparam_avg"]: @@ -291,6 +315,8 @@ def __getitem__(self, key): return self.case_embd elif key in ["scale"]: return self.scale + elif key in ["default_fparam_tensor"]: + return self.default_fparam_tensor else: raise KeyError(key) @@ -305,7 +331,7 @@ def serialize(self) -> dict: """Serialize the fitting to dict.""" return { "@class": "Fitting", - "@version": 3, + "@version": 4, "var_name": self.var_name, "ntypes": self.ntypes, "dim_descrpt": 
self.dim_descrpt, @@ -314,6 +340,7 @@ def serialize(self) -> dict: "numb_fparam": self.numb_fparam, "numb_aparam": self.numb_aparam, "dim_case_embd": self.dim_case_embd, + "default_fparam": self.default_fparam, "rcond": self.rcond, "activation_function": self.activation_function, "precision": self.precision, @@ -352,14 +379,14 @@ def deserialize(cls, data: dict) -> "GeneralFitting": def _call_common( self, - descriptor: np.ndarray, - atype: np.ndarray, - gr: Optional[np.ndarray] = None, - g2: Optional[np.ndarray] = None, - h2: Optional[np.ndarray] = None, - fparam: Optional[np.ndarray] = None, - aparam: Optional[np.ndarray] = None, - ) -> dict[str, np.ndarray]: + descriptor: Array, + atype: Array, + gr: Optional[Array] = None, + g2: Optional[Array] = None, + h2: Optional[Array] = None, + fparam: Optional[Array] = None, + aparam: Optional[Array] = None, + ) -> dict[str, Array]: """Calculate the fitting. Parameters @@ -402,6 +429,14 @@ def _call_common( xx_zeros = xp.zeros_like(xx) else: xx_zeros = None + + if self.numb_fparam > 0 and fparam is None: + # use default fparam + assert self.default_fparam_tensor is not None + fparam = xp.tile( + xp.reshape(self.default_fparam_tensor, (1, self.numb_fparam)), (nf, 1) + ) + # check fparam dim, concate to input descriptor if self.numb_fparam > 0: assert fparam is not None, "fparam should not be None" @@ -412,7 +447,7 @@ def _call_common( ) fparam = (fparam - self.fparam_avg[...]) * self.fparam_inv_std[...] fparam = xp.tile( - xp.reshape(fparam, [nf, 1, self.numb_fparam]), (1, nloc, 1) + xp.reshape(fparam, (nf, 1, self.numb_fparam)), (1, nloc, 1) ) xx = xp.concat( [xx, fparam], @@ -431,7 +466,7 @@ def _call_common( f"get an input aparam of dim {aparam.shape[-1]}, " f"which is not consistent with {self.numb_aparam}." ) - aparam = xp.reshape(aparam, [nf, nloc, self.numb_aparam]) + aparam = xp.reshape(aparam, (nf, nloc, self.numb_aparam)) aparam = (aparam - self.aparam_avg[...]) * self.aparam_inv_std[...] 
xx = xp.concat( [xx, aparam], @@ -446,7 +481,7 @@ def _call_common( if self.dim_case_embd > 0: assert self.case_embd is not None case_embd = xp.tile( - xp.reshape(self.case_embd[...], [1, 1, -1]), [nf, nloc, 1] + xp.reshape(self.case_embd[...], (1, 1, -1)), (nf, nloc, 1) ) xx = xp.concat( [xx, case_embd], @@ -465,7 +500,7 @@ def _call_common( ) for type_i in range(self.ntypes): mask = xp.tile( - xp.reshape((atype == type_i), [nf, nloc, 1]), (1, 1, net_dim_out) + xp.reshape((atype == type_i), (nf, nloc, 1)), (1, 1, net_dim_out) ) atom_property = self.nets[(type_i,)](xx) if self.remove_vaccum_contribution is not None and not ( @@ -485,10 +520,10 @@ def _call_common( outs += xp.reshape( xp.take( xp.astype(self.bias_atom_e[...], outs.dtype), - xp.reshape(atype, [-1]), + xp.reshape(atype, (-1,)), axis=0, ), - [nf, nloc, net_dim_out], + (nf, nloc, net_dim_out), ) # nf x nloc exclude_mask = self.emask.build_type_exclude_mask(atype) diff --git a/deepmd/dpmodel/fitting/invar_fitting.py b/deepmd/dpmodel/fitting/invar_fitting.py index b5d3a02d86..15ecacbf56 100644 --- a/deepmd/dpmodel/fitting/invar_fitting.py +++ b/deepmd/dpmodel/fitting/invar_fitting.py @@ -6,11 +6,12 @@ Union, ) -import numpy as np - from deepmd.dpmodel import ( DEFAULT_PRECISION, ) +from deepmd.dpmodel.array_api import ( + Array, +) from deepmd.dpmodel.common import ( cast_precision, ) @@ -110,6 +111,9 @@ class InvarFitting(GeneralFitting): Atomic contributions of the excluded atom types are set zero. type_map: list[str], Optional A list of strings. Give the name to each type of atoms. + default_fparam: list[float], optional + The default frame parameter. If set, when `fparam.npy` files are not included in the data system, + this value will be used as the default value for the frame parameter in the fitting net. 
""" @@ -124,7 +128,7 @@ def __init__( numb_fparam: int = 0, numb_aparam: int = 0, dim_case_embd: int = 0, - bias_atom: Optional[np.ndarray] = None, + bias_atom: Optional[Array] = None, rcond: Optional[float] = None, tot_ener_zero: bool = False, trainable: Optional[list[bool]] = None, @@ -138,6 +142,7 @@ def __init__( exclude_types: list[int] = [], type_map: Optional[list[str]] = None, seed: Optional[Union[int, list[int]]] = None, + default_fparam: Optional[list[float]] = None, ) -> None: if tot_ener_zero: raise NotImplementedError("tot_ener_zero is not implemented") @@ -173,6 +178,7 @@ def __init__( else [x is not None for x in atom_ener], type_map=type_map, seed=seed, + default_fparam=default_fparam, ) def serialize(self) -> dict: @@ -185,18 +191,18 @@ def serialize(self) -> dict: @classmethod def deserialize(cls, data: dict) -> "GeneralFitting": data = data.copy() - check_version_compatibility(data.pop("@version", 1), 3, 1) + check_version_compatibility(data.pop("@version", 1), 4, 1) return super().deserialize(data) - def _net_out_dim(self): + def _net_out_dim(self) -> int: """Set the FittingNet output dim.""" return self.dim_out - def compute_output_stats(self, merged) -> NoReturn: + def compute_output_stats(self, merged: Any) -> NoReturn: """Update the output bias for fitting net.""" raise NotImplementedError - def output_def(self): + def output_def(self) -> FittingOutputDef: return FittingOutputDef( [ OutputVariableDef( @@ -212,14 +218,14 @@ def output_def(self): @cast_precision def call( self, - descriptor: np.ndarray, - atype: np.ndarray, - gr: Optional[np.ndarray] = None, - g2: Optional[np.ndarray] = None, - h2: Optional[np.ndarray] = None, - fparam: Optional[np.ndarray] = None, - aparam: Optional[np.ndarray] = None, - ) -> dict[str, np.ndarray]: + descriptor: Array, + atype: Array, + gr: Optional[Array] = None, + g2: Optional[Array] = None, + h2: Optional[Array] = None, + fparam: Optional[Array] = None, + aparam: Optional[Array] = None, + ) -> dict[str, 
Array]: """Calculate the fitting. Parameters diff --git a/deepmd/dpmodel/fitting/make_base_fitting.py b/deepmd/dpmodel/fitting/make_base_fitting.py index 201b5e27d1..be9c5edb1f 100644 --- a/deepmd/dpmodel/fitting/make_base_fitting.py +++ b/deepmd/dpmodel/fitting/make_base_fitting.py @@ -4,6 +4,7 @@ abstractmethod, ) from typing import ( + Any, NoReturn, Optional, ) @@ -21,9 +22,9 @@ def make_base_fitting( - t_tensor, + t_tensor: Any, fwd_method_name: str = "forward", -): +) -> type: """Make the base class for the fitting. Parameters @@ -39,7 +40,7 @@ def make_base_fitting( class BF(ABC, PluginVariant, make_plugin_registry("fitting")): """Base fitting provides the interfaces of fitting net.""" - def __new__(cls, *args, **kwargs): + def __new__(cls: type, *args: Any, **kwargs: Any) -> Any: if cls is BF: cls = cls.get_class_by_type(j_get_type(kwargs, cls.__name__)) return super().__new__(cls) @@ -63,7 +64,7 @@ def fwd( """Calculate fitting.""" pass - def compute_output_stats(self, merged) -> NoReturn: + def compute_output_stats(self, merged: Any) -> NoReturn: """Update the output bias for fitting net.""" raise NotImplementedError @@ -74,7 +75,7 @@ def get_type_map(self) -> list[str]: @abstractmethod def change_type_map( - self, type_map: list[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat: Optional[Any] = None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. 
diff --git a/deepmd/dpmodel/fitting/polarizability_fitting.py b/deepmd/dpmodel/fitting/polarizability_fitting.py index 8acb818a46..04a19b394c 100644 --- a/deepmd/dpmodel/fitting/polarizability_fitting.py +++ b/deepmd/dpmodel/fitting/polarizability_fitting.py @@ -14,6 +14,9 @@ from deepmd.dpmodel import ( DEFAULT_PRECISION, ) +from deepmd.dpmodel.array_api import ( + Array, +) from deepmd.dpmodel.common import ( cast_precision, to_numpy_array, @@ -90,6 +93,9 @@ class PolarFitting(GeneralFitting): Whether to shift the diagonal part of the polarizability matrix. The shift operation is carried out after scale. type_map: list[str], Optional A list of strings. Give the name to each type of atoms. + default_fparam: list[float], optional + The default frame parameter. If set, when `fparam.npy` files are not included in the data system, + this value will be used as the default value for the frame parameter in the fitting net. """ def __init__( @@ -117,6 +123,7 @@ def __init__( shift_diag: bool = True, type_map: Optional[list[str]] = None, seed: Optional[Union[int, list[int]]] = None, + default_fparam: Optional[list[float]] = None, ) -> None: if tot_ener_zero: raise NotImplementedError("tot_ener_zero is not implemented") @@ -164,9 +171,10 @@ def __init__( exclude_types=exclude_types, type_map=type_map, seed=seed, + default_fparam=default_fparam, ) - def _net_out_dim(self): + def _net_out_dim(self) -> int: """Set the FittingNet output dim.""" return ( self.embedding_width @@ -174,13 +182,13 @@ def _net_out_dim(self): else self.embedding_width * self.embedding_width ) - def __setitem__(self, key, value) -> None: + def __setitem__(self, key: str, value: Array) -> None: if key in ["constant_matrix"]: self.constant_matrix = value else: super().__setitem__(key, value) - def __getitem__(self, key): + def __getitem__(self, key: str) -> Array: if key in ["constant_matrix"]: return self.constant_matrix else: @@ -189,7 +197,7 @@ def __getitem__(self, key): def serialize(self) -> dict: 
data = super().serialize() data["type"] = "polar" - data["@version"] = 4 + data["@version"] = 5 data["embedding_width"] = self.embedding_width data["fit_diag"] = self.fit_diag data["shift_diag"] = self.shift_diag @@ -200,12 +208,12 @@ def serialize(self) -> dict: @classmethod def deserialize(cls, data: dict) -> "GeneralFitting": data = data.copy() - check_version_compatibility(data.pop("@version", 1), 4, 1) + check_version_compatibility(data.pop("@version", 1), 5, 1) var_name = data.pop("var_name", None) assert var_name == "polar" return super().deserialize(data) - def output_def(self): + def output_def(self) -> FittingOutputDef: return FittingOutputDef( [ OutputVariableDef( @@ -219,7 +227,7 @@ def output_def(self): ) def change_type_map( - self, type_map: list[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat: Any = None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. @@ -247,14 +255,14 @@ def change_type_map( @cast_precision def call( self, - descriptor: np.ndarray, - atype: np.ndarray, - gr: Optional[np.ndarray] = None, - g2: Optional[np.ndarray] = None, - h2: Optional[np.ndarray] = None, - fparam: Optional[np.ndarray] = None, - aparam: Optional[np.ndarray] = None, - ) -> dict[str, np.ndarray]: + descriptor: Array, + atype: Array, + gr: Optional[Array] = None, + g2: Optional[Array] = None, + h2: Optional[Array] = None, + fparam: Optional[Array] = None, + aparam: Optional[Array] = None, + ) -> dict[str, Array]: """Calculate the fitting. Parameters @@ -289,7 +297,7 @@ def call( ] # out = out * self.scale[atype, ...] 
scale_atype = xp.reshape( - xp.take(xp.astype(self.scale, out.dtype), xp.reshape(atype, [-1]), axis=0), + xp.take(xp.astype(self.scale, out.dtype), xp.reshape(atype, (-1,)), axis=0), (*atype.shape, 1), ) out = out * scale_atype @@ -315,7 +323,7 @@ def call( bias = xp.reshape( xp.take( xp.astype(self.constant_matrix, out.dtype), - xp.reshape(atype, [-1]), + xp.reshape(atype, (-1,)), axis=0, ), (nframes, nloc), diff --git a/deepmd/dpmodel/fitting/property_fitting.py b/deepmd/dpmodel/fitting/property_fitting.py index 6d0aa3546f..b4e8a4d10c 100644 --- a/deepmd/dpmodel/fitting/property_fitting.py +++ b/deepmd/dpmodel/fitting/property_fitting.py @@ -4,14 +4,19 @@ Union, ) -import numpy as np - +from deepmd.dpmodel.array_api import ( + Array, +) from deepmd.dpmodel.common import ( DEFAULT_PRECISION, ) from deepmd.dpmodel.fitting.invar_fitting import ( InvarFitting, ) +from deepmd.dpmodel.output_def import ( + FittingOutputDef, + OutputVariableDef, +) from deepmd.utils.version import ( check_version_compatibility, ) @@ -61,6 +66,9 @@ class PropertyFittingNet(InvarFitting): Atomic contributions of the excluded atom types are set zero. type_map: list[str], Optional A list of strings. Give the name to each type of atoms. + default_fparam: list[float], optional + The default frame parameter. If set, when `fparam.npy` files are not included in the data system, + this value will be used as the default value for the frame parameter in the fitting net. 
""" def __init__( @@ -69,7 +77,7 @@ def __init__( dim_descrpt: int, task_dim: int = 1, neuron: list[int] = [128, 128, 128], - bias_atom_p: Optional[np.ndarray] = None, + bias_atom_p: Optional[Array] = None, rcond: Optional[float] = None, trainable: Union[bool, list[bool]] = True, intensive: bool = False, @@ -83,6 +91,7 @@ def __init__( mixed_types: bool = True, exclude_types: list[int] = [], type_map: Optional[list[str]] = None, + default_fparam: Optional[list] = None, # not used seed: Optional[int] = None, ) -> None: @@ -106,12 +115,27 @@ def __init__( mixed_types=mixed_types, exclude_types=exclude_types, type_map=type_map, + default_fparam=default_fparam, + ) + + def output_def(self) -> FittingOutputDef: + return FittingOutputDef( + [ + OutputVariableDef( + self.var_name, + [self.dim_out], + reducible=True, + r_differentiable=False, + c_differentiable=False, + intensive=self.intensive, + ), + ] ) @classmethod def deserialize(cls, data: dict) -> "PropertyFittingNet": data = data.copy() - check_version_compatibility(data.pop("@version"), 4, 1) + check_version_compatibility(data.pop("@version"), 5, 1) data.pop("dim_out") data["property_name"] = data.pop("var_name") data.pop("tot_ener_zero") @@ -131,6 +155,6 @@ def serialize(self) -> dict: "task_dim": self.task_dim, "intensive": self.intensive, } - dd["@version"] = 4 + dd["@version"] = 5 return dd diff --git a/deepmd/dpmodel/infer/deep_eval.py b/deepmd/dpmodel/infer/deep_eval.py index 91fa0ac2ac..b307f2f15b 100644 --- a/deepmd/dpmodel/infer/deep_eval.py +++ b/deepmd/dpmodel/infer/deep_eval.py @@ -10,6 +10,9 @@ import numpy as np +from deepmd.dpmodel.array_api import ( + Array, +) from deepmd.dpmodel.model.base_model import ( BaseModel, ) @@ -62,7 +65,7 @@ class DeepEval(DeepEvalBackend): The output definition of the model. *args : list Positional arguments. 
- auto_batch_size : bool or int or AutomaticBatchSize, default: False + auto_batch_size : bool or int or AutomaticBatchSize, default: True If True, automatic batch size will be used. If int, it will be used as the initial batch size. neighbor_list : ase.neighborlist.NewPrimitiveNeighborList, optional @@ -120,6 +123,10 @@ def get_dim_aparam(self) -> int: """Get the number (dimension) of atomic parameters of this DP.""" return self.dp.get_dim_aparam() + def has_default_fparam(self) -> bool: + """Check if the model has default frame parameters.""" + return self.dp.has_default_fparam() + @property def model_type(self) -> type["DeepEvalWrapper"]: """The the evaluator of the model type.""" @@ -160,14 +167,14 @@ def get_ntypes_spin(self) -> int: def eval( self, - coords: np.ndarray, - cells: Optional[np.ndarray], - atom_types: np.ndarray, + coords: Array, + cells: Optional[Array], + atom_types: Array, atomic: bool = False, - fparam: Optional[np.ndarray] = None, - aparam: Optional[np.ndarray] = None, + fparam: Optional[Array] = None, + aparam: Optional[Array] = None, **kwargs: Any, - ) -> dict[str, np.ndarray]: + ) -> dict[str, Array]: """Evaluate the energy, force and virial by using this DP. 
Parameters @@ -273,7 +280,7 @@ def _eval_func(self, inner_func: Callable, numb_test: int, natoms: int) -> Calla """ if self.auto_batch_size is not None: - def eval_func(*args, **kwargs): + def eval_func(*args: Any, **kwargs: Any) -> Any: return self.auto_batch_size.execute_all( inner_func, numb_test, natoms, *args, **kwargs ) @@ -284,8 +291,8 @@ def eval_func(*args, **kwargs): def _get_natoms_and_nframes( self, - coords: np.ndarray, - atom_types: np.ndarray, + coords: Array, + atom_types: Array, mixed_type: bool = False, ) -> tuple[int, int]: if mixed_type: @@ -301,13 +308,13 @@ def _get_natoms_and_nframes( def _eval_model( self, - coords: np.ndarray, - cells: Optional[np.ndarray], - atom_types: np.ndarray, - fparam: Optional[np.ndarray], - aparam: Optional[np.ndarray], + coords: Array, + cells: Optional[Array], + atom_types: Array, + fparam: Optional[Array], + aparam: Optional[Array], request_defs: list[OutputVariableDef], - ): + ) -> dict[str, Array]: model = self.dp nframes = coords.shape[0] @@ -365,7 +372,9 @@ def _eval_model( ) # this is kinda hacky return tuple(results) - def _get_output_shape(self, odef, nframes, natoms): + def _get_output_shape( + self, odef: OutputVariableDef, nframes: int, natoms: int + ) -> list[int]: if odef.category == OutputVariableCategory.DERV_C_REDU: # virial return [nframes, *odef.shape[:-1], 9] @@ -391,4 +400,14 @@ def _get_output_shape(self, odef, nframes, natoms): def get_model_def_script(self) -> dict: """Get model definition script.""" - return json.loads(self.model.get_model_def_script()) + return json.loads(self.dp.get_model_def_script()) + + def get_model(self) -> "BaseModel": + """Get the dpmodel BaseModel. + + Returns + ------- + BaseModel + The dpmodel BaseModel. 
+ """ + return self.dp diff --git a/deepmd/dpmodel/loss/ener.py b/deepmd/dpmodel/loss/ener.py index 7a17fcfcf0..55e6c90a4e 100644 --- a/deepmd/dpmodel/loss/ener.py +++ b/deepmd/dpmodel/loss/ener.py @@ -1,11 +1,14 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( + Any, Optional, ) import array_api_compat -import numpy as np +from deepmd.dpmodel.array_api import ( + Array, +) from deepmd.dpmodel.loss.loss import ( Loss, ) @@ -17,7 +20,7 @@ ) -def custom_huber_loss(predictions, targets, delta=1.0): +def custom_huber_loss(predictions: Array, targets: Array, delta: float = 1.0) -> Array: xp = array_api_compat.array_namespace(predictions, targets) error = targets - predictions abs_error = xp.abs(error) @@ -46,9 +49,9 @@ def __init__( start_pref_gf: float = 0.0, limit_pref_gf: float = 0.0, numb_generalized_coord: int = 0, - use_huber=False, - huber_delta=0.01, - **kwargs, + use_huber: bool = False, + huber_delta: float = 0.01, + **kwargs: Any, ) -> None: self.starter_learning_rate = starter_learning_rate self.start_pref_e = start_pref_e @@ -89,9 +92,9 @@ def call( self, learning_rate: float, natoms: int, - model_dict: dict[str, np.ndarray], - label_dict: dict[str, np.ndarray], - ) -> dict[str, np.ndarray]: + model_dict: dict[str, Array], + label_dict: dict[str, Array], + ) -> dict[str, Array]: """Calculate loss from model results and labeled results.""" energy = model_dict["energy_redu"] force = model_dict["energy_derv_r"] @@ -132,18 +135,18 @@ def call( atom_ener_coeff = xp.reshape(atom_ener_coeff, xp.shape(atom_ener)) energy = xp.sum(atom_ener_coeff * atom_ener, 1) if self.has_f or self.has_pf or self.relative_f or self.has_gf: - force_reshape = xp.reshape(force, [-1]) - force_hat_reshape = xp.reshape(force_hat, [-1]) + force_reshape = xp.reshape(force, (-1,)) + force_hat_reshape = xp.reshape(force_hat, (-1,)) diff_f = force_hat_reshape - force_reshape else: diff_f = None if self.relative_f is not None: - force_hat_3 = xp.reshape(force_hat, [-1, 3]) 
- norm_f = xp.reshape(xp.norm(force_hat_3, axis=1), [-1, 1]) + self.relative_f - diff_f_3 = xp.reshape(diff_f, [-1, 3]) + force_hat_3 = xp.reshape(force_hat, (-1, 3)) + norm_f = xp.reshape(xp.norm(force_hat_3, axis=1), (-1, 1)) + self.relative_f + diff_f_3 = xp.reshape(diff_f, (-1, 3)) diff_f_3 = diff_f_3 / norm_f - diff_f = xp.reshape(diff_f_3, [-1]) + diff_f = xp.reshape(diff_f_3, (-1,)) atom_norm = 1.0 / natoms atom_norm_ener = 1.0 / natoms @@ -184,15 +187,15 @@ def call( loss += pref_f * l2_force_loss else: l_huber_loss = custom_huber_loss( - xp.reshape(force, [-1]), - xp.reshape(force_hat, [-1]), + xp.reshape(force, (-1,)), + xp.reshape(force_hat, (-1,)), delta=self.huber_delta, ) loss += pref_f * l_huber_loss more_loss["rmse_f"] = self.display_if_exist(l2_force_loss, find_force) if self.has_v: - virial_reshape = xp.reshape(virial, [-1]) - virial_hat_reshape = xp.reshape(virial_hat, [-1]) + virial_reshape = xp.reshape(virial, (-1,)) + virial_hat_reshape = xp.reshape(virial_hat, (-1,)) l2_virial_loss = xp.mean( xp.square(virial_hat_reshape - virial_reshape), ) @@ -207,8 +210,8 @@ def call( loss += pref_v * l_huber_loss more_loss["rmse_v"] = self.display_if_exist(l2_virial_loss, find_virial) if self.has_ae: - atom_ener_reshape = xp.reshape(atom_ener, [-1]) - atom_ener_hat_reshape = xp.reshape(atom_ener_hat, [-1]) + atom_ener_reshape = xp.reshape(atom_ener, (-1,)) + atom_ener_hat_reshape = xp.reshape(atom_ener_hat, (-1,)) l2_atom_ener_loss = xp.mean( xp.square(atom_ener_hat_reshape - atom_ener_reshape), ) @@ -225,7 +228,7 @@ def call( l2_atom_ener_loss, find_atom_ener ) if self.has_pf: - atom_pref_reshape = xp.reshape(atom_pref, [-1]) + atom_pref_reshape = xp.reshape(atom_pref, (-1,)) l2_pref_force_loss = xp.mean( xp.multiply(xp.square(diff_f), atom_pref_reshape), ) @@ -236,10 +239,10 @@ def call( if self.has_gf: find_drdq = label_dict["find_drdq"] drdq = label_dict["drdq"] - force_reshape_nframes = xp.reshape(force, [-1, natoms[0] * 3]) - 
force_hat_reshape_nframes = xp.reshape(force_hat, [-1, natoms[0] * 3]) + force_reshape_nframes = xp.reshape(force, (-1, natoms[0] * 3)) + force_hat_reshape_nframes = xp.reshape(force_hat, (-1, natoms[0] * 3)) drdq_reshape = xp.reshape( - drdq, [-1, natoms[0] * 3, self.numb_generalized_coord] + drdq, (-1, natoms[0] * 3, self.numb_generalized_coord) ) gen_force_hat = xp.einsum( "bij,bi->bj", drdq_reshape, force_hat_reshape_nframes diff --git a/deepmd/dpmodel/loss/loss.py b/deepmd/dpmodel/loss/loss.py index ff3a462cf1..6dc468582a 100644 --- a/deepmd/dpmodel/loss/loss.py +++ b/deepmd/dpmodel/loss/loss.py @@ -5,8 +5,10 @@ ) import array_api_compat -import numpy as np +from deepmd.dpmodel.array_api import ( + Array, +) from deepmd.dpmodel.common import ( NativeOP, ) @@ -24,9 +26,9 @@ def call( self, learning_rate: float, natoms: int, - model_dict: dict[str, np.ndarray], - label_dict: dict[str, np.ndarray], - ) -> dict[str, np.ndarray]: + model_dict: dict[str, Array], + label_dict: dict[str, Array], + ) -> dict[str, Array]: """Calculate loss from model results and labeled results.""" @property @@ -35,12 +37,12 @@ def label_requirement(self) -> list[DataRequirementItem]: """Return data label requirements needed for this loss calculation.""" @staticmethod - def display_if_exist(loss: np.ndarray, find_property: float) -> np.ndarray: + def display_if_exist(loss: Array, find_property: float) -> Array: """Display NaN if labeled property is not found. Parameters ---------- - loss : np.ndarray + loss : Array the loss scalar find_property : float whether the property is found diff --git a/deepmd/dpmodel/model/base_model.py b/deepmd/dpmodel/model/base_model.py index 15c0bfc083..f7a56437a4 100644 --- a/deepmd/dpmodel/model/base_model.py +++ b/deepmd/dpmodel/model/base_model.py @@ -36,7 +36,7 @@ class BaseBaseModel(ABC, PluginVariant, make_plugin_registry("model")): BaseModel class for DPModel backend. 
""" - def __new__(cls, *args, **kwargs): + def __new__(cls, *args: Any, **kwargs: Any) -> "BaseModel": if inspect.isabstract(cls): # getting model type based on fitting type model_type = kwargs.get("type", "standard") @@ -68,15 +68,15 @@ def get_type_map(self) -> list[str]: """Get the type map.""" @abstractmethod - def get_rcut(self): + def get_rcut(self) -> float: """Get the cut-off radius.""" @abstractmethod - def get_dim_fparam(self): + def get_dim_fparam(self) -> int: """Get the number (dimension) of frame parameters of this atomic model.""" @abstractmethod - def get_dim_aparam(self): + def get_dim_aparam(self) -> int: """Get the number (dimension) of atomic parameters of this atomic model.""" @abstractmethod diff --git a/deepmd/dpmodel/model/dipole_model.py b/deepmd/dpmodel/model/dipole_model.py index 4ca523f79b..d213514551 100644 --- a/deepmd/dpmodel/model/dipole_model.py +++ b/deepmd/dpmodel/model/dipole_model.py @@ -1,5 +1,7 @@ # SPDX-License-Identifier: LGPL-3.0-or-later - +from typing import ( + Any, +) from deepmd.dpmodel.atomic_model import ( DPDipoleAtomicModel, @@ -24,8 +26,8 @@ class DipoleModel(DPModelCommon, DPDipoleModel_): def __init__( self, - *args, - **kwargs, - ): + *args: Any, + **kwargs: Any, + ) -> None: DPModelCommon.__init__(self) DPDipoleModel_.__init__(self, *args, **kwargs) diff --git a/deepmd/dpmodel/model/dos_model.py b/deepmd/dpmodel/model/dos_model.py index 3df887b460..5c5d2a5e90 100644 --- a/deepmd/dpmodel/model/dos_model.py +++ b/deepmd/dpmodel/model/dos_model.py @@ -1,4 +1,7 @@ # SPDX-License-Identifier: LGPL-3.0-or-later +from typing import ( + Any, +) from deepmd.dpmodel.atomic_model import ( DPDOSAtomicModel, @@ -23,8 +26,8 @@ class DOSModel(DPModelCommon, DPDOSModel_): def __init__( self, - *args, - **kwargs, - ): + *args: Any, + **kwargs: Any, + ) -> None: DPModelCommon.__init__(self) DPDOSModel_.__init__(self, *args, **kwargs) diff --git a/deepmd/dpmodel/model/dp_model.py b/deepmd/dpmodel/model/dp_model.py index 
769bba0b20..9098d1c011 100644 --- a/deepmd/dpmodel/model/dp_model.py +++ b/deepmd/dpmodel/model/dp_model.py @@ -8,6 +8,9 @@ from deepmd.dpmodel.descriptor.base_descriptor import ( BaseDescriptor, ) +from deepmd.dpmodel.fitting.base_fitting import ( + BaseFitting, +) from deepmd.utils.data_system import ( DeepmdDataSystem, ) @@ -45,3 +48,7 @@ def update_sel( train_data, type_map, local_jdata["descriptor"] ) return local_jdata_cpy, min_nbor_dist + + def get_fitting_net(self) -> BaseFitting: + """Get the fitting network.""" + return self.atomic_model.fitting diff --git a/deepmd/dpmodel/model/dp_zbl_model.py b/deepmd/dpmodel/model/dp_zbl_model.py index 7bf22dfc6b..f3f106f1c7 100644 --- a/deepmd/dpmodel/model/dp_zbl_model.py +++ b/deepmd/dpmodel/model/dp_zbl_model.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( + Any, Optional, ) @@ -29,8 +30,8 @@ class DPZBLModel(DPZBLModel_): def __init__( self, - *args, - **kwargs, + *args: Any, + **kwargs: Any, ) -> None: super().__init__(*args, **kwargs) diff --git a/deepmd/dpmodel/model/ener_model.py b/deepmd/dpmodel/model/ener_model.py index 88e65a849a..9d38a17513 100644 --- a/deepmd/dpmodel/model/ener_model.py +++ b/deepmd/dpmodel/model/ener_model.py @@ -2,6 +2,9 @@ from copy import ( deepcopy, ) +from typing import ( + Any, +) from deepmd.dpmodel.atomic_model import ( DPEnergyAtomicModel, @@ -27,15 +30,15 @@ class EnergyModel(DPModelCommon, DPEnergyModel_): def __init__( self, - *args, - **kwargs, + *args: Any, + **kwargs: Any, ) -> None: DPModelCommon.__init__(self) DPEnergyModel_.__init__(self, *args, **kwargs) self._enable_hessian = False self.hess_fitting_def = None - def enable_hessian(self): + def enable_hessian(self) -> None: self.hess_fitting_def = deepcopy(self.atomic_output_def()) self.hess_fitting_def["energy"].r_hessian = True self._enable_hessian = True diff --git a/deepmd/dpmodel/model/make_model.py b/deepmd/dpmodel/model/make_model.py index ec0b986394..74d5dfd4bb 100644 --- 
a/deepmd/dpmodel/model/make_model.py +++ b/deepmd/dpmodel/model/make_model.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( + Any, Callable, Optional, ) @@ -7,6 +8,9 @@ import array_api_compat import numpy as np +from deepmd.dpmodel.array_api import ( + Array, +) from deepmd.dpmodel.atomic_model.base_atomic_model import ( BaseAtomicModel, ) @@ -51,19 +55,19 @@ def model_call_from_call_lower( Optional[np.ndarray], bool, ], - dict[str, np.ndarray], + dict[str, Array], ], rcut: float, sel: list[int], mixed_types: bool, model_output_def: ModelOutputDef, - coord: np.ndarray, - atype: np.ndarray, - box: Optional[np.ndarray] = None, - fparam: Optional[np.ndarray] = None, - aparam: Optional[np.ndarray] = None, + coord: Array, + atype: Array, + box: Optional[Array] = None, + fparam: Optional[Array] = None, + aparam: Optional[Array] = None, do_atomic_virial: bool = False, -): +) -> dict[str, Array]: """Return model prediction from lower interface. Parameters @@ -131,7 +135,7 @@ def model_call_from_call_lower( return model_predict -def make_model(T_AtomicModel: type[BaseAtomicModel]): +def make_model(T_AtomicModel: type[BaseAtomicModel]) -> type: """Make a model as a derived class of an atomic model. The model provide two interfaces. 
@@ -157,10 +161,10 @@ def make_model(T_AtomicModel: type[BaseAtomicModel]): class CM(NativeOP, BaseModel): def __init__( self, - *args, + *args: Any, # underscore to prevent conflict with normal inputs atomic_model_: Optional[T_AtomicModel] = None, - **kwargs, + **kwargs: Any, ) -> None: BaseModel.__init__(self) if atomic_model_ is not None: @@ -173,7 +177,7 @@ def __init__( self.global_np_float_precision = GLOBAL_NP_FLOAT_PRECISION self.global_ener_float_precision = GLOBAL_ENER_FLOAT_PRECISION - def model_output_def(self): + def model_output_def(self) -> ModelOutputDef: """Get the output def for the model.""" return ModelOutputDef(self.atomic_output_def()) @@ -218,13 +222,13 @@ def enable_compression( def call( self, - coord, - atype, - box: Optional[np.ndarray] = None, - fparam: Optional[np.ndarray] = None, - aparam: Optional[np.ndarray] = None, + coord: Array, + atype: Array, + box: Optional[Array] = None, + fparam: Optional[Array] = None, + aparam: Optional[Array] = None, do_atomic_virial: bool = False, - ) -> dict[str, np.ndarray]: + ) -> dict[str, Array]: """Return model prediction. Parameters @@ -272,14 +276,14 @@ def call( def call_lower( self, - extended_coord: np.ndarray, - extended_atype: np.ndarray, - nlist: np.ndarray, - mapping: Optional[np.ndarray] = None, - fparam: Optional[np.ndarray] = None, - aparam: Optional[np.ndarray] = None, + extended_coord: Array, + extended_atype: Array, + nlist: Array, + mapping: Optional[Array] = None, + fparam: Optional[Array] = None, + aparam: Optional[Array] = None, do_atomic_virial: bool = False, - ): + ) -> dict[str, Array]: """Return model prediction. Lower interface that takes extended atomic coordinates and types, nlist, and mapping as input, and returns the predictions on the extended region. 
@@ -334,14 +338,14 @@ def call_lower( def forward_common_atomic( self, - extended_coord: np.ndarray, - extended_atype: np.ndarray, - nlist: np.ndarray, - mapping: Optional[np.ndarray] = None, - fparam: Optional[np.ndarray] = None, - aparam: Optional[np.ndarray] = None, + extended_coord: Array, + extended_atype: Array, + nlist: Array, + mapping: Optional[Array] = None, + fparam: Optional[Array] = None, + aparam: Optional[Array] = None, do_atomic_virial: bool = False, - ): + ) -> dict[str, Array]: atomic_ret = self.atomic_model.forward_common_atomic( extended_coord, extended_atype, @@ -355,23 +359,18 @@ def forward_common_atomic( self.atomic_output_def(), extended_coord, do_atomic_virial=do_atomic_virial, + mask=atomic_ret["mask"] if "mask" in atomic_ret else None, ) forward_lower = call_lower def input_type_cast( self, - coord: np.ndarray, - box: Optional[np.ndarray] = None, - fparam: Optional[np.ndarray] = None, - aparam: Optional[np.ndarray] = None, - ) -> tuple[ - np.ndarray, - Optional[np.ndarray], - Optional[np.ndarray], - Optional[np.ndarray], - str, - ]: + coord: Array, + box: Optional[Array] = None, + fparam: Optional[Array] = None, + aparam: Optional[Array] = None, + ) -> tuple[Array, Array, Optional[np.ndarray], Optional[np.ndarray], str]: """Cast the input data to global float type.""" input_prec = RESERVED_PRECISION_DICT[self.precision_dict[coord.dtype.name]] ### @@ -396,9 +395,9 @@ def input_type_cast( def output_type_cast( self, - model_ret: dict[str, np.ndarray], + model_ret: dict[str, Array], input_prec: str, - ) -> dict[str, np.ndarray]: + ) -> dict[str, Array]: """Convert the model output to the input prec.""" do_cast = ( input_prec != RESERVED_PRECISION_DICT[self.global_np_float_precision] @@ -423,11 +422,11 @@ def output_type_cast( def format_nlist( self, - extended_coord: np.ndarray, - extended_atype: np.ndarray, - nlist: np.ndarray, + extended_coord: Array, + extended_atype: Array, + nlist: Array, extra_nlist_sort: bool = False, - ): + ) -> 
Array: """Format the neighbor list. 1. If the number of neighbors in the `nlist` is equal to sum(self.sel), @@ -475,11 +474,11 @@ def format_nlist( def _format_nlist( self, - extended_coord: np.ndarray, - nlist: np.ndarray, + extended_coord: Array, + nlist: Array, nnei: int, extra_nlist_sort: bool = False, - ): + ) -> Array: xp = array_api_compat.array_namespace(extended_coord, nlist) n_nf, n_nloc, n_nnei = nlist.shape extended_coord = extended_coord.reshape([n_nf, -1, 3]) @@ -538,7 +537,7 @@ def do_grad_c( return self.atomic_model.do_grad_c(var_name) def change_type_map( - self, type_map: list[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat: Any = None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. @@ -549,10 +548,10 @@ def serialize(self) -> dict: return self.atomic_model.serialize() @classmethod - def deserialize(cls, data) -> "CM": + def deserialize(cls, data: dict) -> "CM": return cls(atomic_model_=T_AtomicModel.deserialize(data)) - def set_case_embd(self, case_idx: int): + def set_case_embd(self, case_idx: int) -> None: self.atomic_model.set_case_embd(case_idx) def get_dim_fparam(self) -> int: @@ -563,6 +562,10 @@ def get_dim_aparam(self) -> int: """Get the number (dimension) of atomic parameters of this atomic model.""" return self.atomic_model.get_dim_aparam() + def has_default_fparam(self) -> bool: + """Check if the model has default frame parameters.""" + return self.atomic_model.has_default_fparam() + def get_sel_type(self) -> list[int]: """Get the selected atom types of this model. 
diff --git a/deepmd/dpmodel/model/model.py b/deepmd/dpmodel/model/model.py index 1d18b70e8e..339998aa89 100644 --- a/deepmd/dpmodel/model/model.py +++ b/deepmd/dpmodel/model/model.py @@ -1,5 +1,8 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import copy +from typing import ( + Any, +) from deepmd.dpmodel.atomic_model.dp_atomic_model import ( DPAtomicModel, @@ -45,7 +48,9 @@ ) -def _get_standard_model_components(data, ntypes): +def _get_standard_model_components( + data: dict[str, Any], ntypes: int +) -> tuple[BaseDescriptor, BaseFitting, str]: # descriptor data["descriptor"]["ntypes"] = ntypes data["descriptor"]["type_map"] = copy.deepcopy(data["type_map"]) @@ -181,7 +186,7 @@ def get_spin_model(data: dict) -> SpinModel: return SpinModel(backbone_model=backbone_model, spin=spin) -def get_model(data: dict): +def get_model(data: dict) -> BaseModel: """Get a model from a dictionary. Parameters diff --git a/deepmd/dpmodel/model/polar_model.py b/deepmd/dpmodel/model/polar_model.py index 994b3556c2..b898eababd 100644 --- a/deepmd/dpmodel/model/polar_model.py +++ b/deepmd/dpmodel/model/polar_model.py @@ -1,4 +1,7 @@ # SPDX-License-Identifier: LGPL-3.0-or-later +from typing import ( + Any, +) from deepmd.dpmodel.atomic_model import ( DPPolarAtomicModel, @@ -23,8 +26,8 @@ class PolarModel(DPModelCommon, DPPolarModel_): def __init__( self, - *args, - **kwargs, - ): + *args: Any, + **kwargs: Any, + ) -> None: DPModelCommon.__init__(self) DPPolarModel_.__init__(self, *args, **kwargs) diff --git a/deepmd/dpmodel/model/property_model.py b/deepmd/dpmodel/model/property_model.py index 9bd07bd349..20c884cd20 100644 --- a/deepmd/dpmodel/model/property_model.py +++ b/deepmd/dpmodel/model/property_model.py @@ -1,4 +1,8 @@ # SPDX-License-Identifier: LGPL-3.0-or-later +from typing import ( + Any, +) + from deepmd.dpmodel.atomic_model import ( DPPropertyAtomicModel, ) @@ -20,8 +24,12 @@ class PropertyModel(DPModelCommon, DPPropertyModel_): def __init__( self, - *args, - **kwargs, + 
*args: Any, + **kwargs: Any, ) -> None: DPModelCommon.__init__(self) DPPropertyModel_.__init__(self, *args, **kwargs) + + def get_var_name(self) -> str: + """Get the name of the property.""" + return self.get_fitting_net().var_name diff --git a/deepmd/dpmodel/model/spin_model.py b/deepmd/dpmodel/model/spin_model.py index d149d427e0..7706a009fc 100644 --- a/deepmd/dpmodel/model/spin_model.py +++ b/deepmd/dpmodel/model/spin_model.py @@ -1,10 +1,14 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( + Any, Optional, ) import numpy as np +from deepmd.dpmodel.array_api import ( + Array, +) from deepmd.dpmodel.atomic_model.dp_atomic_model import ( DPAtomicModel, ) @@ -27,7 +31,7 @@ class SpinModel(NativeOP): def __init__( self, - backbone_model, + backbone_model: DPAtomicModel, spin: Spin, ) -> None: super().__init__() @@ -37,7 +41,9 @@ def __init__( self.virtual_scale_mask = self.spin.get_virtual_scale_mask() self.spin_mask = self.spin.get_spin_mask() - def process_spin_input(self, coord, atype, spin): + def process_spin_input( + self, coord: Array, atype: Array, spin: Array + ) -> tuple[Array, Array]: """Generate virtual coordinates and types, concat into the input.""" nframes, nloc = coord.shape[:-1] atype_spin = np.concatenate([atype, atype + self.ntypes_real], axis=-1) @@ -49,12 +55,12 @@ def process_spin_input(self, coord, atype, spin): def process_spin_input_lower( self, - extended_coord: np.ndarray, - extended_atype: np.ndarray, - extended_spin: np.ndarray, - nlist: np.ndarray, - mapping: Optional[np.ndarray] = None, - ): + extended_coord: Array, + extended_atype: Array, + extended_spin: Array, + nlist: Array, + mapping: Optional[Array] = None, + ) -> tuple[Array, Array]: """ Add `extended_spin` into `extended_coord` to generate virtual atoms, and extend `nlist` and `mapping`. 
Note that the final `extended_coord_updated` with shape [nframes, nall + nall, 3] has the following order: @@ -92,8 +98,12 @@ def process_spin_input_lower( ) def process_spin_output( - self, atype, out_tensor, add_mag: bool = True, virtual_scale: bool = True - ): + self, + atype: Array, + out_tensor: Array, + add_mag: bool = True, + virtual_scale: bool = True, + ) -> tuple[Array, Array]: """Split the output both real and virtual atoms, and scale the latter.""" nframes, nloc_double = out_tensor.shape[:2] nloc = nloc_double // 2 @@ -112,12 +122,12 @@ def process_spin_output( def process_spin_output_lower( self, - extended_atype, - extended_out_tensor, + extended_atype: Array, + extended_out_tensor: Array, nloc: int, add_mag: bool = True, virtual_scale: bool = True, - ): + ) -> tuple[Array, Array]: """Split the extended output of both real and virtual atoms with switch, and scale the latter.""" nframes, nall_double = extended_out_tensor.shape[:2] nall = nall_double // 2 @@ -148,7 +158,7 @@ def process_spin_output_lower( return extended_out_real, extended_out_mag, atomic_mask > 0.0 @staticmethod - def extend_nlist(extended_atype, nlist): + def extend_nlist(extended_atype: Array, nlist: Array) -> Array: nframes, nloc, nnei = nlist.shape nall = extended_atype.shape[1] nlist_mask = nlist != -1 @@ -178,7 +188,9 @@ def extend_nlist(extended_atype, nlist): return extended_nlist @staticmethod - def concat_switch_virtual(extended_tensor, extended_tensor_virtual, nloc: int): + def concat_switch_virtual( + extended_tensor: Array, extended_tensor_virtual: Array, nloc: int + ) -> Array: nframes, nall = extended_tensor.shape[:2] out_shape = list(extended_tensor.shape) out_shape[1] *= 2 @@ -197,7 +209,7 @@ def concat_switch_virtual(extended_tensor, extended_tensor_virtual, nloc: int): return extended_tensor_updated.reshape(out_shape) @staticmethod - def expand_aparam(aparam, nloc: int): + def expand_aparam(aparam: Array, nloc: int) -> Array: """Expand the atom parameters for virtual 
atoms if necessary.""" nframes, natom, numb_aparam = aparam.shape if natom == nloc: # good @@ -226,19 +238,19 @@ def get_type_map(self) -> list[str]: ntypes = len(tmap) // 2 # ignore the virtual type return tmap[:ntypes] - def get_ntypes(self): + def get_ntypes(self) -> int: """Returns the number of element types.""" return len(self.get_type_map()) - def get_rcut(self): + def get_rcut(self) -> float: """Get the cut-off radius.""" return self.backbone_model.get_rcut() - def get_dim_fparam(self): + def get_dim_fparam(self) -> int: """Get the number (dimension) of frame parameters of this atomic model.""" return self.backbone_model.get_dim_fparam() - def get_dim_aparam(self): + def get_dim_aparam(self) -> int: """Get the number (dimension) of atomic parameters of this atomic model.""" return self.backbone_model.get_dim_aparam() @@ -288,7 +300,7 @@ def has_spin() -> bool: """Returns whether it has spin input and output.""" return True - def model_output_def(self): + def model_output_def(self) -> ModelOutputDef: """Get the output def for the model.""" model_output_type = self.backbone_model.model_output_type() if "mask" in model_output_type: @@ -298,7 +310,7 @@ def model_output_def(self): backbone_model_atomic_output_def[var_name].magnetic = True return ModelOutputDef(backbone_model_atomic_output_def) - def __getattr__(self, name): + def __getattr__(self, name: str) -> Any: """Get attribute from the wrapped model.""" if name in self.__dict__: return self.__dict__[name] @@ -312,7 +324,7 @@ def serialize(self) -> dict: } @classmethod - def deserialize(cls, data) -> "SpinModel": + def deserialize(cls, data: dict) -> "SpinModel": backbone_model_obj = make_model(DPAtomicModel).deserialize( data["backbone_model"] ) @@ -324,14 +336,14 @@ def deserialize(cls, data) -> "SpinModel": def call( self, - coord, - atype, - spin, - box: Optional[np.ndarray] = None, - fparam: Optional[np.ndarray] = None, - aparam: Optional[np.ndarray] = None, + coord: Array, + atype: Array, + spin: 
Array, + box: Optional[Array] = None, + fparam: Optional[Array] = None, + aparam: Optional[Array] = None, do_atomic_virial: bool = False, - ) -> dict[str, np.ndarray]: + ) -> dict[str, Array]: """Return model prediction. Parameters @@ -386,15 +398,15 @@ def call( def call_lower( self, - extended_coord: np.ndarray, - extended_atype: np.ndarray, - extended_spin: np.ndarray, - nlist: np.ndarray, - mapping: Optional[np.ndarray] = None, - fparam: Optional[np.ndarray] = None, - aparam: Optional[np.ndarray] = None, + extended_coord: Array, + extended_atype: Array, + extended_spin: Array, + nlist: Array, + mapping: Optional[Array] = None, + fparam: Optional[Array] = None, + aparam: Optional[Array] = None, do_atomic_virial: bool = False, - ): + ) -> dict[str, Array]: """Return model prediction. Lower interface that takes extended atomic coordinates, types and spins, nlist, and mapping as input, and returns the predictions on the extended region. diff --git a/deepmd/dpmodel/model/transform_output.py b/deepmd/dpmodel/model/transform_output.py index 9d7873f081..f35faf444e 100644 --- a/deepmd/dpmodel/model/transform_output.py +++ b/deepmd/dpmodel/model/transform_output.py @@ -1,9 +1,14 @@ # SPDX-License-Identifier: LGPL-3.0-or-later +from typing import ( + Optional, +) + import array_api_compat import numpy as np from deepmd.dpmodel.array_api import ( + Array, xp_scatter_sum, ) from deepmd.dpmodel.common import ( @@ -20,11 +25,12 @@ def fit_output_to_model_output( - fit_ret: dict[str, np.ndarray], + fit_ret: dict[str, Array], fit_output_def: FittingOutputDef, - coord_ext: np.ndarray, + coord_ext: Array, do_atomic_virial: bool = False, -) -> dict[str, np.ndarray]: + mask: Optional[Array] = None, +) -> dict[str, Array]: """Transform the output of the fitting network to the model output. 
@@ -38,9 +44,19 @@ def fit_output_to_model_output( if vdef.reducible: kk_redu = get_reduce_name(kk) # cast to energy prec before reduction - model_ret[kk_redu] = xp.sum( - vv.astype(GLOBAL_ENER_FLOAT_PRECISION), axis=atom_axis - ) + if vdef.intensive: + if mask is not None: + model_ret[kk_redu] = xp.sum( + vv.astype(GLOBAL_ENER_FLOAT_PRECISION), axis=atom_axis + ) / np.sum(mask, axis=-1, keepdims=True) + else: + model_ret[kk_redu] = xp.mean( + vv.astype(GLOBAL_ENER_FLOAT_PRECISION), axis=atom_axis + ) + else: + model_ret[kk_redu] = xp.sum( + vv.astype(GLOBAL_ENER_FLOAT_PRECISION), axis=atom_axis + ) if vdef.r_differentiable: kk_derv_r, kk_derv_c = get_deriv_name(kk) # name-holders @@ -53,14 +69,14 @@ def fit_output_to_model_output( def get_leading_dims( - vv: np.ndarray, + vv: Array, vdef: OutputVariableDef, -): +) -> list[int]: """Get the dimensions of nf x nloc. Parameters ---------- - vv : np.ndarray + vv : Array The input array from which to compute the leading dimensions. vdef : OutputVariableDef The output variable definition containing the shape to exclude from `vv`. @@ -75,11 +91,11 @@ def get_leading_dims( def communicate_extended_output( - model_ret: dict[str, np.ndarray], + model_ret: dict[str, Array], model_output_def: ModelOutputDef, - mapping: np.ndarray, # nf x nloc + mapping: Array, # nf x nloc do_atomic_virial: bool = False, -) -> dict[str, np.ndarray]: +) -> dict[str, Array]: """Transform the output of the model network defined on local and ghost (extended) atoms to local atoms. 
@@ -100,7 +116,9 @@ def communicate_extended_output( if vdef.r_differentiable: if model_ret[kk_derv_r] is not None: derv_r_ext_dims = list(vdef.shape) + [3] # noqa:RUF005 - mapping = xp.reshape(mapping, (mldims + [1] * len(derv_r_ext_dims))) + mapping = xp.reshape( + mapping, tuple(mldims + [1] * len(derv_r_ext_dims)) + ) mapping = xp.tile(mapping, [1] * len(mldims) + derv_r_ext_dims) force = xp.zeros(vldims + derv_r_ext_dims, dtype=vv.dtype) force = xp_scatter_sum( diff --git a/deepmd/dpmodel/modifier/base_modifier.py b/deepmd/dpmodel/modifier/base_modifier.py index 9edc4722e1..febb9b75e8 100644 --- a/deepmd/dpmodel/modifier/base_modifier.py +++ b/deepmd/dpmodel/modifier/base_modifier.py @@ -4,6 +4,9 @@ ABC, abstractmethod, ) +from typing import ( + Any, +) from deepmd.utils.plugin import ( PluginVariant, @@ -15,7 +18,7 @@ def make_base_modifier() -> type[object]: class BaseModifier(ABC, PluginVariant, make_plugin_registry("modifier")): """Base class for data modifier.""" - def __new__(cls, *args, **kwargs): + def __new__(cls, *args: Any, **kwargs: Any) -> "BaseModifier": if cls is BaseModifier: cls = cls.get_class_by_type(kwargs["type"]) return super().__new__(cls) diff --git a/deepmd/dpmodel/output_def.py b/deepmd/dpmodel/output_def.py index c2a1147786..5028bc43a3 100644 --- a/deepmd/dpmodel/output_def.py +++ b/deepmd/dpmodel/output_def.py @@ -3,6 +3,9 @@ from enum import ( IntEnum, ) +from typing import ( + Any, +) def check_shape( @@ -19,7 +22,7 @@ def check_shape( raise ValueError(f"{shape} shape not matching def {def_shape}") -def check_var(var, var_def) -> None: +def check_var(var: Any, var_def: Any) -> None: if var_def.atomic: # var.shape == [nf, nloc, *var_def.shape] if len(var.shape) != len(var_def.shape) + 2: @@ -32,7 +35,7 @@ def check_var(var, var_def) -> None: check_shape(list(var.shape[1:]), var_def.shape) -def model_check_output(cls): +def model_check_output(cls: type) -> type: """Check if the output of the Model is consistent with the definition. 
Two methods are assumed to be provided by the Model: @@ -45,17 +48,17 @@ def model_check_output(cls): class wrapper(cls): def __init__( self, - *args, - **kwargs, + *args: Any, + **kwargs: Any, ) -> None: super().__init__(*args, **kwargs) self.md = self.output_def() def __call__( self, - *args, - **kwargs, - ): + *args: Any, + **kwargs: Any, + ) -> Any: ret = cls.__call__(self, *args, **kwargs) for kk in self.md.keys_outp(): dd = self.md[kk] @@ -74,7 +77,7 @@ def __call__( return wrapper -def fitting_check_output(cls): +def fitting_check_output(cls: type) -> type: """Check if the output of the Fitting is consistent with the definition. Two methods are assumed to be provided by the Fitting: @@ -87,17 +90,17 @@ def fitting_check_output(cls): class wrapper(cls): def __init__( self, - *args, - **kwargs, + *args: Any, + **kwargs: Any, ) -> None: super().__init__(*args, **kwargs) self.md = self.output_def() def __call__( self, - *args, - **kwargs, - ): + *args: Any, + **kwargs: Any, + ) -> Any: ret = cls.__call__(self, *args, **kwargs) for kk in self.md.keys(): dd = self.md[kk] @@ -227,10 +230,10 @@ def __init__( raise ValueError("only r_differentiable variable can calculate hessian") @property - def size(self): + def size(self) -> int: return self.output_size - def squeeze(self, dim) -> None: + def squeeze(self, dim: int) -> None: # squeeze the shape on given dimension if -len(self.shape) <= dim < len(self.shape) and self.shape[dim] == 1: self.shape.pop(dim) @@ -264,7 +267,7 @@ def __getitem__( def get_data(self) -> dict[str, OutputVariableDef]: return self.var_defs - def keys(self): + def keys(self): # noqa: ANN201 return self.var_defs.keys() @@ -316,25 +319,25 @@ def get_data( ) -> dict[str, OutputVariableDef]: return self.var_defs - def keys(self): + def keys(self): # noqa: ANN201 return self.var_defs.keys() - def keys_outp(self): + def keys_outp(self): # noqa: ANN201 return self.def_outp.keys() - def keys_redu(self): + def keys_redu(self): # noqa: ANN201 return 
self.def_redu.keys() - def keys_derv_r(self): + def keys_derv_r(self): # noqa: ANN201 return self.def_derv_r.keys() - def keys_hess_r(self): + def keys_hess_r(self): # noqa: ANN201 return self.def_hess_r.keys() - def keys_derv_c(self): + def keys_derv_c(self): # noqa: ANN201 return self.def_derv_c.keys() - def keys_derv_c_redu(self): + def keys_derv_c_redu(self): # noqa: ANN201 return self.def_derv_c_redu.keys() diff --git a/deepmd/dpmodel/utils/env_mat.py b/deepmd/dpmodel/utils/env_mat.py index ee11678d3a..2302e24c71 100644 --- a/deepmd/dpmodel/utils/env_mat.py +++ b/deepmd/dpmodel/utils/env_mat.py @@ -1,15 +1,16 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( + Any, Optional, ) import array_api_compat -import numpy as np from deepmd.dpmodel import ( NativeOP, ) from deepmd.dpmodel.array_api import ( + Array, support_array_api, xp_take_along_axis, ) @@ -20,10 +21,10 @@ @support_array_api(version="2023.12") def compute_smooth_weight( - distance: np.ndarray, + distance: Array, rmin: float, rmax: float, -): +) -> Array: """Compute smooth weight for descriptor elements.""" if rmin >= rmax: raise ValueError("rmin should be less than rmax.") @@ -37,10 +38,10 @@ def compute_smooth_weight( @support_array_api(version="2023.12") def compute_exp_sw( - distance: np.ndarray, + distance: Array, rmin: float, rmax: float, -): +) -> Array: """Compute the exponential switch function for neighbor update.""" if rmin >= rmax: raise ValueError("rmin should be less than rmax.") @@ -54,14 +55,14 @@ def compute_exp_sw( def _make_env_mat( - nlist, - coord, + nlist: Any, + coord: Any, rcut: float, ruct_smth: float, radial_only: bool = False, protection: float = 0.0, use_exp_switch: bool = False, -): +) -> tuple[Any, Any, Any]: """Make smooth environment matrix.""" xp = array_api_compat.array_namespace(nlist) nf, nloc, nnei = nlist.shape @@ -101,8 +102,8 @@ def _make_env_mat( class EnvMat(NativeOP): def __init__( self, - rcut, - rcut_smth, + rcut: float, + rcut_smth: 
float, protection: float = 0.0, use_exp_switch: bool = False, ) -> None: @@ -113,13 +114,13 @@ def __init__( def call( self, - coord_ext: np.ndarray, - atype_ext: np.ndarray, - nlist: np.ndarray, - davg: Optional[np.ndarray] = None, - dstd: Optional[np.ndarray] = None, + coord_ext: Array, + atype_ext: Array, + nlist: Array, + davg: Optional[Array] = None, + dstd: Optional[Array] = None, radial_only: bool = False, - ) -> tuple[np.ndarray, np.ndarray, np.ndarray]: + ) -> tuple[Array, Array, Array]: """Compute the environment matrix. Parameters @@ -159,7 +160,9 @@ def call( em /= xp.reshape(xp.take(dstd, xp.reshape(atype, (-1,)), axis=0), em.shape) return em, diff, sw - def _call(self, nlist, coord_ext, radial_only): + def _call( + self, nlist: Any, coord_ext: Any, radial_only: bool + ) -> tuple[Any, Any, Any]: em, diff, ww = _make_env_mat( nlist, coord_ext, diff --git a/deepmd/dpmodel/utils/env_mat_stat.py b/deepmd/dpmodel/utils/env_mat_stat.py index e25739fa56..a26a99f2c2 100644 --- a/deepmd/dpmodel/utils/env_mat_stat.py +++ b/deepmd/dpmodel/utils/env_mat_stat.py @@ -13,6 +13,9 @@ from deepmd.common import ( get_hash, ) +from deepmd.dpmodel.array_api import ( + Array, +) from deepmd.dpmodel.common import ( get_xp_precision, ) @@ -38,12 +41,12 @@ class EnvMatStat(BaseEnvMatStat): - def compute_stat(self, env_mat: dict[str, np.ndarray]) -> dict[str, StatItem]: + def compute_stat(self, env_mat: dict[str, Array]) -> dict[str, StatItem]: """Compute the statistics of the environment matrix for a single system. Parameters ---------- - env_mat : np.ndarray + env_mat : Array The environment matrix. 
Returns @@ -166,7 +169,7 @@ def iter( self.last_dim, ), ) - atype = xp.reshape(atype, (coord.shape[0] * coord.shape[1])) + atype = xp.reshape(atype, (coord.shape[0] * coord.shape[1],)) # (1, nloc) eq (ntypes, 1), so broadcast is possible # shape: (ntypes, nloc) type_idx = xp.equal( @@ -189,7 +192,7 @@ def iter( for type_i in range(self.descriptor.get_ntypes()): dd = env_mat[type_idx[type_i, ...]] dd = xp.reshape( - dd, [-1, self.last_dim] + dd, (-1, self.last_dim) ) # typen_atoms * unmasked_nnei, 4 env_mats = {} env_mats[f"r_{type_i}"] = dd[:, :1] @@ -218,7 +221,7 @@ def get_hash(self) -> str: } ) - def __call__(self): + def __call__(self) -> tuple[Array, Array]: avgs = self.get_avg() stds = self.get_std() diff --git a/deepmd/dpmodel/utils/exclude_mask.py b/deepmd/dpmodel/utils/exclude_mask.py index f390bbc7c1..9d8f0c8572 100644 --- a/deepmd/dpmodel/utils/exclude_mask.py +++ b/deepmd/dpmodel/utils/exclude_mask.py @@ -4,6 +4,7 @@ import numpy as np from deepmd.dpmodel.array_api import ( + Array, xp_take_along_axis, ) @@ -25,16 +26,16 @@ def __init__( # (ntypes) self.type_mask = type_mask.reshape([-1]) - def get_exclude_types(self): + def get_exclude_types(self) -> list[int]: return self.exclude_types - def get_type_mask(self): + def get_type_mask(self) -> Array: return self.type_mask def build_type_exclude_mask( self, - atype: np.ndarray, - ): + atype: Array, + ) -> Array: """Compute type exclusion mask for atoms. 
Parameters @@ -53,7 +54,7 @@ def build_type_exclude_mask( xp = array_api_compat.array_namespace(atype) nf, natom = atype.shape return xp.reshape( - xp.take(self.type_mask[...], xp.reshape(atype, [-1]), axis=0), + xp.take(self.type_mask[...], xp.reshape(atype, (-1,)), axis=0), (nf, natom), ) @@ -86,14 +87,14 @@ def __init__( # (ntypes+1 x ntypes+1) self.type_mask = type_mask.reshape([-1]) - def get_exclude_types(self): + def get_exclude_types(self) -> list[tuple[int, int]]: return self.exclude_types def build_type_exclude_mask( self, - nlist: np.ndarray, - atype_ext: np.ndarray, - ): + nlist: Array, + atype_ext: Array, + ) -> Array: """Compute type exclusion mask for atom pairs. Parameters @@ -137,5 +138,5 @@ def build_type_exclude_mask( ) return mask - def __contains__(self, item) -> bool: + def __contains__(self, item: tuple[int, int]) -> bool: return item in self.exclude_types diff --git a/deepmd/dpmodel/utils/learning_rate.py b/deepmd/dpmodel/utils/learning_rate.py index 90c18fca22..499c068a93 100644 --- a/deepmd/dpmodel/utils/learning_rate.py +++ b/deepmd/dpmodel/utils/learning_rate.py @@ -1,16 +1,21 @@ # SPDX-License-Identifier: LGPL-3.0-or-later +from typing import ( + Any, + Optional, +) + import numpy as np class LearningRateExp: def __init__( self, - start_lr, - stop_lr, - decay_steps, - stop_steps, - decay_rate=None, - **kwargs, + start_lr: float, + stop_lr: float, + decay_steps: int, + stop_steps: int, + decay_rate: Optional[float] = None, + **kwargs: Any, ) -> None: """ Construct an exponential-decayed learning rate. 
@@ -45,7 +50,7 @@ def __init__( self.decay_rate = decay_rate self.min_lr = stop_lr - def value(self, step) -> np.float64: + def value(self, step: int) -> np.float64: """Get the learning rate at the given step.""" step_lr = self.start_lr * np.power(self.decay_rate, step // self.decay_steps) if step_lr < self.min_lr: diff --git a/deepmd/dpmodel/utils/neighbor_stat.py b/deepmd/dpmodel/utils/neighbor_stat.py index 3aea8ceeb9..289e047cf2 100644 --- a/deepmd/dpmodel/utils/neighbor_stat.py +++ b/deepmd/dpmodel/utils/neighbor_stat.py @@ -9,6 +9,9 @@ import array_api_compat import numpy as np +from deepmd.dpmodel.array_api import ( + Array, +) from deepmd.dpmodel.common import ( NativeOP, ) @@ -46,10 +49,10 @@ def __init__( def call( self, - coord: np.ndarray, - atype: np.ndarray, - cell: Optional[np.ndarray], - ) -> tuple[float, np.ndarray]: + coord: Array, + atype: Array, + cell: Optional[Array], + ) -> tuple[Array, Array]: """Calculate the neareest neighbor distance between atoms, maximum nbor size of atoms and the output data range of the environment matrix. 
@@ -82,8 +85,8 @@ def call( nall = coord1.shape[1] // 3 coord0 = coord1[:, : nloc * 3] diff = ( - xp.reshape(coord1, [nframes, -1, 3])[:, None, :, :] - - xp.reshape(coord0, [nframes, -1, 3])[:, :, None, :] + xp.reshape(coord1, (nframes, -1, 3))[:, None, :, :] + - xp.reshape(coord0, (nframes, -1, 3))[:, :, None, :] ) assert list(diff.shape) == [nframes, nloc, nall, 3] # remove the diagonal elements diff --git a/deepmd/dpmodel/utils/network.py b/deepmd/dpmodel/utils/network.py index bf28b66b7b..d48c42ad08 100644 --- a/deepmd/dpmodel/utils/network.py +++ b/deepmd/dpmodel/utils/network.py @@ -6,6 +6,7 @@ import itertools from typing import ( + Any, Callable, ClassVar, Optional, @@ -21,6 +22,7 @@ NativeOP, ) from deepmd.dpmodel.array_api import ( + Array, support_array_api, xp_add_at, xp_bincount, @@ -36,7 +38,7 @@ ) -def sigmoid_t(x: np.ndarray) -> np.ndarray: +def sigmoid_t(x): # noqa: ANN001, ANN201 """Sigmoid.""" if array_api_compat.is_jax_array(x): from deepmd.jax.env import ( @@ -53,7 +55,7 @@ class Identity(NativeOP): def __init__(self) -> None: super().__init__() - def call(self, x: np.ndarray) -> np.ndarray: + def call(self, x): # noqa: ANN001, ANN201 """The Identity operation layer.""" return x @@ -73,11 +75,11 @@ class NativeLayer(NativeOP): Parameters ---------- - w : np.ndarray, optional + w : Array, optional The weights of the layer. - b : np.ndarray, optional + b : Array, optional The biases of the layer. - idt : np.ndarray, optional + idt : Array, optional The identity matrix of the layer. activation_function : str, optional The activation function of the layer. @@ -87,19 +89,24 @@ class NativeLayer(NativeOP): The precision of the layer. seed : int, optional Random seed. + trainable : bool, default=True + Whether the layer is trainable. 
""" def __init__( self, - num_in, - num_out, + num_in: int, + num_out: int, bias: bool = True, use_timestep: bool = False, activation_function: Optional[str] = None, resnet: bool = False, precision: str = DEFAULT_PRECISION, seed: Optional[Union[int, list[int]]] = None, + trainable: bool = True, ) -> None: + # trainable must be set before any array attribute is set + self.trainable = trainable prec = PRECISION_DICT[precision.lower()] self.precision = precision # only use_timestep when skip connection is established. @@ -139,13 +146,14 @@ def serialize(self) -> dict: } return { "@class": "Layer", - "@version": 1, + "@version": 2, "bias": self.b is not None, "use_timestep": self.idt is not None, "activation_function": self.activation_function, "resnet": self.resnet, # make deterministic "precision": np.dtype(PRECISION_DICT[self.precision]).name, + "trainable": self.trainable, "@variables": data, } @@ -159,7 +167,7 @@ def deserialize(cls, data: dict) -> "NativeLayer": The dict to deserialize from. 
""" data = data.copy() - check_version_compatibility(data.pop("@version", 1), 1, 1) + check_version_compatibility(data.pop("@version", 1), 2, 1) data.pop("@class", None) variables = data.pop("@variables") assert variables["w"] is not None and len(variables["w"].shape) == 2 @@ -199,7 +207,7 @@ def check_shape_consistency(self) -> None: def check_type_consistency(self) -> None: precision = self.precision - def check_var(var) -> None: + def check_var(var: Optional[Array]) -> None: if var is not None: # array api standard doesn't provide a API to get the dtype name # this is really hacked @@ -211,7 +219,7 @@ def check_var(var) -> None: check_var(self.b) check_var(self.idt) - def __setitem__(self, key, value) -> None: + def __setitem__(self, key: str, value: Any) -> None: if key in ("w", "matrix"): self.w = value elif key in ("b", "bias"): @@ -227,7 +235,7 @@ def __setitem__(self, key, value) -> None: else: raise KeyError(key) - def __getitem__(self, key): + def __getitem__(self, key: str) -> Any: if key in ("w", "matrix"): return self.w elif key in ("b", "bias"): @@ -240,6 +248,8 @@ def __getitem__(self, key): return self.resnet elif key == "precision": return self.precision + elif key == "trainable": + return self.trainable else: raise KeyError(key) @@ -250,12 +260,12 @@ def dim_out(self) -> int: return self.w.shape[1] @support_array_api(version="2022.12") - def call(self, x: np.ndarray) -> np.ndarray: + def call(self, x): # noqa: ANN001, ANN201 """Forward pass. Parameters ---------- - x : np.ndarray + x : Array The input. 
Returns @@ -291,14 +301,14 @@ def get_activation_fn(activation_function: str) -> Callable[[np.ndarray], np.nda activation_function = activation_function.lower() if activation_function == "tanh": - def fn(x): + def fn(x): # noqa: ANN001, ANN202 # noqa: ANN001, ANN202 xp = array_api_compat.array_namespace(x) return xp.tanh(x) return fn elif activation_function == "relu": - def fn(x): + def fn(x): # noqa: ANN001, ANN202 xp = array_api_compat.array_namespace(x) # https://stackoverflow.com/a/47936476/9567349 return x * xp.astype(x > 0, x.dtype) @@ -306,7 +316,7 @@ def fn(x): return fn elif activation_function in ("gelu", "gelu_tf"): - def fn(x): + def fn(x): # noqa: ANN001, ANN202 xp = array_api_compat.array_namespace(x) # generated by GitHub Copilot return ( @@ -318,7 +328,7 @@ def fn(x): return fn elif activation_function == "relu6": - def fn(x): + def fn(x): # noqa: ANN001, ANN202 xp = array_api_compat.array_namespace(x) # generated by GitHub Copilot return xp.where( @@ -328,7 +338,7 @@ def fn(x): return fn elif activation_function == "softplus": - def fn(x): + def fn(x): # noqa: ANN001, ANN202 xp = array_api_compat.array_namespace(x) # generated by GitHub Copilot return xp.log(1 + xp.exp(x)) @@ -336,14 +346,14 @@ def fn(x): return fn elif activation_function == "sigmoid": - def fn(x): + def fn(x): # noqa: ANN001, ANN202 # generated by GitHub Copilot return sigmoid_t(x) return fn elif activation_function == "silu": - def fn(x): + def fn(x): # noqa: ANN001, ANN202 # generated by GitHub Copilot return x * sigmoid_t(x) @@ -352,13 +362,13 @@ def fn(x): "custom_silu" ): - def sigmoid(x): + def sigmoid(x): # noqa: ANN001, ANN202 return 1 / (1 + np.exp(-x)) - def silu(x): + def silu(x): # noqa: ANN001, ANN202 return x * sigmoid(x) - def silu_grad(x): + def silu_grad(x): # noqa: ANN001, ANN202 sig = sigmoid(x) return sig + x * sig * (1 - sig) @@ -370,7 +380,7 @@ def silu_grad(x): slope = float(silu_grad(threshold)) const = float(silu(threshold)) - def fn(x): + def fn(x): # 
noqa: ANN001, ANN202 xp = array_api_compat.array_namespace(x) return xp.where( x < threshold, @@ -381,7 +391,7 @@ def fn(x): return fn elif activation_function.lower() in ("none", "linear"): - def fn(x): + def fn(x): # noqa: ANN001, ANN202 return x return fn @@ -429,6 +439,7 @@ def __init__( resnet=False, precision=precision, seed=seed, + trainable=trainable, ) xp = array_api_compat.array_namespace(self.w, self.b) self.w = xp.squeeze(self.w, 0) # keep the weight shape to be [num_in] @@ -493,7 +504,7 @@ def _check_shape_consistency(self) -> None: f"of b {self.b.shape[0]}", ) - def __setitem__(self, key, value) -> None: + def __setitem__(self, key: str, value: Any) -> None: if key in ("w", "matrix"): self.w = value elif key in ("b", "bias"): @@ -507,7 +518,7 @@ def __setitem__(self, key, value) -> None: else: raise KeyError(key) - def __getitem__(self, key): + def __getitem__(self, key: str) -> Any: if key in ("w", "matrix"): return self.w elif key in ("b", "bias"): @@ -524,12 +535,12 @@ def __getitem__(self, key): def dim_out(self) -> int: return self.w.shape[0] - def call(self, x: np.ndarray) -> np.ndarray: + def call(self, x): # noqa: ANN001, ANN201 """Forward pass. Parameters ---------- - x : np.ndarray + x : Array The input. 
Returns @@ -541,7 +552,13 @@ def call(self, x: np.ndarray) -> np.ndarray: return y @staticmethod - def layer_norm_numpy(x, shape, weight=None, bias=None, eps=1e-5): + def layer_norm_numpy( # noqa: ANN205 + x, # noqa: ANN001 + shape: tuple[int, ...], + weight=None, # noqa: ANN001 + bias=None, # noqa: ANN001 + eps: float = 1e-5, + ): xp = array_api_compat.array_namespace(x) # mean and variance mean = xp.mean(x, axis=tuple(range(-len(shape), 0)), keepdims=True) @@ -554,7 +571,7 @@ def layer_norm_numpy(x, shape, weight=None, bias=None, eps=1e-5): return x_normalized -def make_multilayer_network(T_NetworkLayer, ModuleBase): +def make_multilayer_network(T_NetworkLayer: type, ModuleBase: type) -> type: class NN(ModuleBase): """Native representation of a neural network. @@ -599,11 +616,11 @@ def deserialize(cls, data: dict) -> "NN": data.pop("@class", None) return cls(data["layers"]) - def __getitem__(self, key): + def __getitem__(self, key: int) -> Any: assert isinstance(key, int) return self.layers[key] - def __setitem__(self, key, value) -> None: + def __setitem__(self, key: int, value: Any) -> None: assert isinstance(key, int) self.layers[key] = value @@ -616,12 +633,12 @@ def check_shape_consistency(self) -> None: f"output {self.layers[ii].dim_out}", ) - def call(self, x): + def call(self, x): # noqa: ANN001, ANN202 """Forward pass. Parameters ---------- - x : np.ndarray + x : Array The input. Returns @@ -633,6 +650,25 @@ def call(self, x): x = layer(x) return x + def call_until_last(self, x): # noqa: ANN001, ANN202 + """Return the output before last layer. + + Parameters + ---------- + x : Array + The input. + + Returns + ------- + np.ndarray + The output before last layer. 
+ """ + # avoid slice (self.layers[:-1]) for jit + for ii, layer in enumerate(self.layers): + if ii < len(self.layers) - 1: + x = layer(x) + return x + def clear(self) -> None: """Clear the network parameters to zero.""" for layer in self.layers: @@ -649,7 +685,7 @@ def clear(self) -> None: NativeNet = make_multilayer_network(NativeLayer, NativeOP) -def make_embedding_network(T_Network, T_NetworkLayer): +def make_embedding_network(T_Network: type, T_NetworkLayer: type) -> type: class EN(T_Network): """The embedding network. @@ -674,16 +710,19 @@ class EN(T_Network): def __init__( self, - in_dim, + in_dim: int, neuron: list[int] = [24, 48, 96], activation_function: str = "tanh", resnet_dt: bool = False, precision: str = DEFAULT_PRECISION, seed: Optional[Union[int, list[int]]] = None, bias: bool = True, + trainable: Union[bool, list[bool]] = True, ) -> None: layers = [] i_in = in_dim + if isinstance(trainable, bool): + trainable = [trainable] * len(neuron) for idx, ii in enumerate(neuron): i_ot = ii layers.append( @@ -696,6 +735,7 @@ def __init__( resnet=True, precision=precision, seed=child_seed(seed, idx), + trainable=trainable[idx], ).serialize() ) i_in = i_ot @@ -751,7 +791,9 @@ def deserialize(cls, data: dict) -> "EmbeddingNet": EmbeddingNet = make_embedding_network(NativeNet, NativeLayer) -def make_fitting_network(T_EmbeddingNet, T_Network, T_NetworkLayer): +def make_fitting_network( + T_EmbeddingNet: type, T_Network: type, T_NetworkLayer: type +) -> type: class FN(T_EmbeddingNet): """The fitting network. It may be implemented as an embedding net connected with a linear output layer. 
@@ -778,15 +820,22 @@ class FN(T_EmbeddingNet): def __init__( self, - in_dim, - out_dim, + in_dim: int, + out_dim: int, neuron: list[int] = [24, 48, 96], activation_function: str = "tanh", resnet_dt: bool = False, precision: str = DEFAULT_PRECISION, bias_out: bool = True, seed: Optional[Union[int, list[int]]] = None, + trainable: Union[bool, list[bool]] = True, ) -> None: + if trainable is None: + trainable = [True] * (len(neuron) + 1) + elif isinstance(trainable, bool): + trainable = [trainable] * (len(neuron) + 1) + else: + pass super().__init__( in_dim, neuron=neuron, @@ -794,6 +843,7 @@ def __init__( resnet_dt=resnet_dt, precision=precision, seed=seed, + trainable=trainable[:-1], ) i_in = neuron[-1] if len(neuron) > 0 else in_dim i_ot = out_dim @@ -807,6 +857,7 @@ def __init__( resnet=False, precision=precision, seed=child_seed(seed, len(neuron)), + trainable=trainable[-1], ) ) self.out_dim = out_dim @@ -894,7 +945,7 @@ def __init__( self._networks = [None for ii in range(ntypes**ndim)] for ii, network in enumerate(networks): self[ii] = network - if len(networks): + if len(networks) and all(net is not None for net in networks): self.check_completeness() def check_completeness(self) -> None: @@ -909,7 +960,7 @@ def check_completeness(self) -> None: if self[tuple(tt)] is None: raise RuntimeError(f"network for {tt} not found") - def _convert_key(self, key): + def _convert_key(self, key: Union[int, tuple]) -> int: if isinstance(key, int): idx = key else: @@ -924,11 +975,13 @@ def _convert_key(self, key): idx = sum([tt * self.ntypes**ii for ii, tt in enumerate(key)]) return idx - def __getitem__(self, key): + def __getitem__(self, key: Union[int, tuple]) -> Any: return self._networks[self._convert_key(key)] - def __setitem__(self, key, value) -> None: - if isinstance(value, self.network_type): + def __setitem__(self, key: Union[int, tuple], value: Any) -> None: + if value is None: + pass + elif isinstance(value, self.network_type): pass elif isinstance(value, dict): 
value = self.network_type.deserialize(value) @@ -952,7 +1005,9 @@ def serialize(self) -> dict: "ndim": self.ndim, "ntypes": self.ntypes, "network_type": network_type_name, - "networks": [nn.serialize() for nn in self._networks], + "networks": [ + nn.serialize() if nn is not None else None for nn in self._networks + ], } @classmethod @@ -970,11 +1025,11 @@ def deserialize(cls, data: dict) -> "NetworkCollection": return cls(**data) -def aggregate( - data: np.ndarray, - owners: np.ndarray, - average=True, - num_owner=None, +def aggregate( # noqa: ANN201 + data, # noqa: ANN001 + owners, # noqa: ANN001 + average: bool = True, + num_owner: Optional[int] = None, ): """ Aggregate rows in data by specifying the owners. @@ -1010,10 +1065,10 @@ def aggregate( return output -def get_graph_index( - nlist: np.ndarray, - nlist_mask: np.ndarray, - a_nlist_mask: np.ndarray, +def get_graph_index( # noqa: ANN201 + nlist, # noqa: ANN001 + nlist_mask, # noqa: ANN001 + a_nlist_mask, # noqa: ANN001 nall: int, use_loc_mapping: bool = True, ): @@ -1036,12 +1091,12 @@ def get_graph_index( Returns ------- - edge_index : n_edge x 2 + edge_index : 2 x n_edge n2e_index : n_edge Broadcast indices from node(i) to edge(ij), or reduction indices from edge(ij) to node(i). n_ext2e_index : n_edge Broadcast indices from extended node(j) to edge(ij). - angle_index : n_angle x 3 + angle_index : 3 x n_angle n2a_index : n_angle Broadcast indices from extended node(j) to angle(ijk). 
eij2a_index : n_angle @@ -1111,7 +1166,7 @@ def get_graph_index( # n_angle eik2a_index = edge_index_ik[a_nlist_mask_3d] - edge_index_result = xp.stack([n2e_index, n_ext2e_index], axis=-1) - angle_index_result = xp.stack([n2a_index, eij2a_index, eik2a_index], axis=-1) + edge_index_result = xp.stack([n2e_index, n_ext2e_index], axis=0) + angle_index_result = xp.stack([n2a_index, eij2a_index, eik2a_index], axis=0) return edge_index_result, angle_index_result diff --git a/deepmd/dpmodel/utils/nlist.py b/deepmd/dpmodel/utils/nlist.py index 4115871f3b..86b1353485 100644 --- a/deepmd/dpmodel/utils/nlist.py +++ b/deepmd/dpmodel/utils/nlist.py @@ -5,9 +5,9 @@ ) import array_api_compat -import numpy as np from deepmd.dpmodel.array_api import ( + Array, xp_take_along_axis, ) @@ -18,13 +18,13 @@ def extend_input_and_build_neighbor_list( - coord, - atype, + coord: Array, + atype: Array, rcut: float, sel: list[int], mixed_types: bool = False, - box: Optional[np.ndarray] = None, -): + box: Optional[Array] = None, +) -> tuple[Array, Array]: xp = array_api_compat.array_namespace(coord, atype) nframes, nloc = atype.shape[:2] if box is not None: @@ -51,20 +51,20 @@ def extend_input_and_build_neighbor_list( ## translated from torch implementation by chatgpt def build_neighbor_list( - coord: np.ndarray, - atype: np.ndarray, + coord: Array, + atype: Array, nloc: int, rcut: float, sel: Union[int, list[int]], distinguish_types: bool = True, -) -> np.ndarray: +) -> Array: """Build neighbor list for a single frame. keeps nsel neighbors. Parameters ---------- - coord : np.ndarray + coord : Array exptended coordinates of shape [batch_size, nall x 3] - atype : np.ndarray + atype : Array extended atomic types of shape [batch_size, nall] type < 0 the atom is treat as virtual atoms. 
nloc : int @@ -81,7 +81,7 @@ def build_neighbor_list( Returns ------- - neighbor_list : np.ndarray + neighbor_list : Array Neighbor list of shape [batch_size, nloc, nsel], the neighbors are stored in an ascending order. If the number of neighbors is less than nsel, the positions are masked @@ -115,8 +115,8 @@ def build_neighbor_list( nsel = sum(sel) coord0 = coord1[:, : nloc * 3] diff = ( - xp.reshape(coord1, [batch_size, -1, 3])[:, None, :, :] - - xp.reshape(coord0, [batch_size, -1, 3])[:, :, None, :] + xp.reshape(coord1, (batch_size, -1, 3))[:, None, :, :] + - xp.reshape(coord0, (batch_size, -1, 3))[:, :, None, :] ) assert list(diff.shape) == [batch_size, nloc, nall, 3] rr = xp.linalg.vector_norm(diff, axis=-1) @@ -153,10 +153,10 @@ def build_neighbor_list( def nlist_distinguish_types( - nlist: np.ndarray, - atype: np.ndarray, + nlist: Array, + atype: Array, sel: list[int], -): +) -> Array: """Given a nlist that does not distinguish atom types, return a nlist that distinguish atom types. @@ -188,20 +188,20 @@ def get_multiple_nlist_key(rcut: float, nsel: int) -> str: ## translated from torch implementation by chatgpt def build_multiple_neighbor_list( - coord: np.ndarray, - nlist: np.ndarray, + coord: Array, + nlist: Array, rcuts: list[float], nsels: list[int], -) -> dict[str, np.ndarray]: +) -> dict[str, Array]: """Input one neighbor list, and produce multiple neighbor lists with different cutoff radius and numbers of selection out of it. The required rcuts and nsels should be smaller or equal to the input nlist. Parameters ---------- - coord : np.ndarray + coord : Array exptended coordinates of shape [batch_size, nall x 3] - nlist : np.ndarray + nlist : Array Neighbor list of shape [batch_size, nloc, nsel], the neighbors should be stored in an ascending order. 
rcuts : list[float] @@ -211,7 +211,7 @@ def build_multiple_neighbor_list( Returns ------- - nlist_dict : dict[str, np.ndarray] + nlist_dict : dict[str, Array] A dict of nlists, key given by get_multiple_nlist_key(rc, nsel) value being the corresponding nlist. @@ -247,33 +247,33 @@ def build_multiple_neighbor_list( ## translated from torch implementation by chatgpt def extend_coord_with_ghosts( - coord: np.ndarray, - atype: np.ndarray, - cell: Optional[np.ndarray], + coord: Array, + atype: Array, + cell: Optional[Array], rcut: float, -): +) -> tuple[Array, Array]: """Extend the coordinates of the atoms by appending peridoc images. The number of images is large enough to ensure all the neighbors within rcut are appended. Parameters ---------- - coord : np.ndarray + coord : Array original coordinates of shape [-1, nloc*3]. - atype : np.ndarray + atype : Array atom type of shape [-1, nloc]. - cell : np.ndarray + cell : Array simulation cell tensor of shape [-1, 9]. rcut : float the cutoff radius Returns ------- - extended_coord: np.ndarray + extended_coord: Array extended coordinates of shape [-1, nall*3]. - extended_atype: np.ndarray + extended_atype: Array extended atom type of shape [-1, nall]. - index_mapping: np.ndarray + index_mapping: Array mapping extended index to the local index """ diff --git a/deepmd/dpmodel/utils/region.py b/deepmd/dpmodel/utils/region.py index bc9b9479a0..6d8dfebf88 100644 --- a/deepmd/dpmodel/utils/region.py +++ b/deepmd/dpmodel/utils/region.py @@ -1,24 +1,27 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import array_api_compat -import numpy as np + +from deepmd.dpmodel.array_api import ( + Array, +) def phys2inter( - coord: np.ndarray, - cell: np.ndarray, -) -> np.ndarray: + coord: Array, + cell: Array, +) -> Array: """Convert physical coordinates to internal(direct) coordinates. Parameters ---------- - coord : np.ndarray + coord : Array physical coordinates of shape [*, na, 3]. 
- cell : np.ndarray + cell : Array simulation cell tensor of shape [*, 3, 3]. Returns ------- - inter_coord: np.ndarray + inter_coord: Array the internal coordinates """ @@ -28,21 +31,21 @@ def phys2inter( def inter2phys( - coord: np.ndarray, - cell: np.ndarray, -) -> np.ndarray: + coord: Array, + cell: Array, +) -> Array: """Convert internal(direct) coordinates to physical coordinates. Parameters ---------- - coord : np.ndarray + coord : Array internal coordinates of shape [*, na, 3]. - cell : np.ndarray + cell : Array simulation cell tensor of shape [*, 3, 3]. Returns ------- - phys_coord: np.ndarray + phys_coord: Array the physical coordinates """ @@ -51,21 +54,21 @@ def inter2phys( def normalize_coord( - coord: np.ndarray, - cell: np.ndarray, -) -> np.ndarray: + coord: Array, + cell: Array, +) -> Array: """Apply PBC according to the atomic coordinates. Parameters ---------- - coord : np.ndarray + coord : Array original coordinates of shape [*, na, 3]. - cell : np.ndarray + cell : Array simulation cell shape [*, 3, 3]. Returns ------- - wrapped_coord: np.ndarray + wrapped_coord: Array wrapped coordinates of shape [*, na, 3]. """ @@ -76,28 +79,28 @@ def normalize_coord( def to_face_distance( - cell: np.ndarray, -) -> np.ndarray: + cell: Array, +) -> Array: """Compute the to-face-distance of the simulation cell. Parameters ---------- - cell : np.ndarray + cell : Array simulation cell tensor of shape [*, 3, 3]. 
Returns ------- - dist: np.ndarray + dist: Array the to face distances of shape [*, 3] """ xp = array_api_compat.array_namespace(cell) cshape = cell.shape - dist = b_to_face_distance(xp.reshape(cell, [-1, 3, 3])) - return xp.reshape(dist, list(cshape[:-2]) + [3]) # noqa:RUF005 + dist = b_to_face_distance(xp.reshape(cell, (-1, 3, 3))) + return xp.reshape(dist, tuple(list(cshape[:-2]) + [3])) # noqa:RUF005 -def b_to_face_distance(cell): +def b_to_face_distance(cell: Array) -> Array: xp = array_api_compat.array_namespace(cell) volume = xp.linalg.det(cell) c_yz = xp.linalg.cross(cell[:, 1, ...], cell[:, 2, ...], axis=-1) diff --git a/deepmd/dpmodel/utils/safe_gradient.py b/deepmd/dpmodel/utils/safe_gradient.py index 2baf530c08..08ffa9bb10 100644 --- a/deepmd/dpmodel/utils/safe_gradient.py +++ b/deepmd/dpmodel/utils/safe_gradient.py @@ -5,17 +5,24 @@ for more information. """ +from typing import ( + Any, + Optional, +) + import array_api_compat -def safe_for_sqrt(x): +def safe_for_sqrt(x: Any) -> Any: """Safe version of sqrt that has a gradient of 0 at x = 0.""" xp = array_api_compat.array_namespace(x) mask = x > 0.0 return xp.where(mask, xp.sqrt(xp.where(mask, x, xp.ones_like(x))), xp.zeros_like(x)) -def safe_for_vector_norm(x, /, *, axis=None, keepdims=False, ord=2): +def safe_for_vector_norm( + x: Any, /, *, axis: Optional[Any] = None, keepdims: bool = False, ord: Any = 2 +) -> Any: """Safe version of sqrt that has a gradient of 0 at x = 0.""" xp = array_api_compat.array_namespace(x) mask = xp.sum(xp.square(x), axis=axis, keepdims=True) > 0 diff --git a/deepmd/dpmodel/utils/serialization.py b/deepmd/dpmodel/utils/serialization.py index 5520933753..b765e2eca3 100644 --- a/deepmd/dpmodel/utils/serialization.py +++ b/deepmd/dpmodel/utils/serialization.py @@ -5,6 +5,7 @@ Path, ) from typing import ( + Any, Callable, ) @@ -18,7 +19,9 @@ __version__ = "unknown" -def traverse_model_dict(model_obj, callback: Callable, is_variable: bool = False): +def traverse_model_dict( + 
model_obj: Any, callback: Callable, is_variable: bool = False +) -> Any: """Traverse a model dict and call callback on each variable. Parameters @@ -67,7 +70,7 @@ class Counter: def __init__(self) -> None: self.count = -1 - def __call__(self): + def __call__(self) -> int: self.count += 1 return self.count @@ -149,7 +152,7 @@ def load_dp_model(filename: str) -> dict: model_dict = traverse_model_dict(model_dict, lambda x: f[x][()].copy()) elif filename_extension in {".yaml", ".yml"}: - def convert_numpy_ndarray(x): + def convert_numpy_ndarray(x: Any) -> Any: if isinstance(x, dict) and x.get("@class") == "np.ndarray": dtype = np.dtype(x["dtype"]) value = np.asarray(x["value"], dtype=dtype) diff --git a/deepmd/dpmodel/utils/type_embed.py b/deepmd/dpmodel/utils/type_embed.py index 17e40f3592..33c70c5763 100644 --- a/deepmd/dpmodel/utils/type_embed.py +++ b/deepmd/dpmodel/utils/type_embed.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( + Any, Optional, Union, ) @@ -8,6 +9,7 @@ import numpy as np from deepmd.dpmodel.array_api import ( + Array, support_array_api, ) from deepmd.dpmodel.common import ( @@ -93,10 +95,11 @@ def __init__( self.precision, seed=self.seed, bias=self.use_tebd_bias, + trainable=trainable, ) @support_array_api(version="2022.12") - def call(self) -> np.ndarray: + def call(self) -> Array: """Compute the type embedding network.""" sample_array = self.embedding_net[0]["w"] xp = array_api_compat.array_namespace(sample_array) @@ -110,7 +113,7 @@ def call(self) -> np.ndarray: return embed @classmethod - def deserialize(cls, data: dict): + def deserialize(cls, data: dict) -> "TypeEmbedNet": """Deserialize the model. 
Parameters @@ -161,7 +164,7 @@ def serialize(self) -> dict: } def change_type_map( - self, type_map: list[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat: Any = None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. @@ -218,7 +221,9 @@ def change_type_map( self.ntypes = len(type_map) -def get_econf_tebd(type_map, precision: str = "default"): +def get_econf_tebd( + type_map: list[str], precision: str = "default" +) -> tuple[Array, int]: from deepmd.utils.econf_embd import ( ECONF_DIM, ) diff --git a/deepmd/entrypoints/convert_backend.py b/deepmd/entrypoints/convert_backend.py index 39967d565c..a8cf20c6b3 100644 --- a/deepmd/entrypoints/convert_backend.py +++ b/deepmd/entrypoints/convert_backend.py @@ -1,4 +1,8 @@ # SPDX-License-Identifier: LGPL-3.0-or-later +from typing import ( + Any, +) + from deepmd.backend.backend import ( Backend, ) @@ -8,7 +12,7 @@ def convert_backend( *, # Enforce keyword-only arguments INPUT: str, OUTPUT: str, - **kwargs, + **kwargs: Any, ) -> None: """Convert a model file from one backend to another. 
diff --git a/deepmd/entrypoints/doc.py b/deepmd/entrypoints/doc.py index 5679d838ac..74fdd90ebd 100644 --- a/deepmd/entrypoints/doc.py +++ b/deepmd/entrypoints/doc.py @@ -1,6 +1,10 @@ # SPDX-License-Identifier: LGPL-3.0-or-later """Module that prints train input arguments docstrings.""" +from typing import ( + Any, +) + from deepmd.utils.argcheck import ( gen_doc, gen_json, @@ -11,7 +15,7 @@ def doc_train_input( - *, out_type: str = "rst", multi_task: bool = False, **kwargs + *, out_type: str = "rst", multi_task: bool = False, **kwargs: Any ) -> None: """Print out trining input arguments to console.""" if out_type == "rst": diff --git a/deepmd/entrypoints/eval_desc.py b/deepmd/entrypoints/eval_desc.py new file mode 100644 index 0000000000..da47b6d065 --- /dev/null +++ b/deepmd/entrypoints/eval_desc.py @@ -0,0 +1,145 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +"""Evaluate descriptors using trained DeePMD model.""" + +import logging +import os +from pathlib import ( + Path, +) +from typing import ( + Any, + Optional, +) + +import numpy as np + +from deepmd.common import ( + expand_sys_str, +) +from deepmd.infer.deep_eval import ( + DeepEval, +) +from deepmd.utils.data import ( + DeepmdData, +) + +__all__ = ["eval_desc"] + +log = logging.getLogger(__name__) + + +def eval_desc( + *, + model: str, + system: str, + datafile: str, + output: str = "desc", + head: Optional[str] = None, + **kwargs: Any, +) -> None: + """Evaluate descriptors for given systems. + + Parameters + ---------- + model : str + path where model is stored + system : str + system directory + datafile : str + the path to the list of systems to process + output : str + output directory for descriptor files + head : Optional[str], optional + (Supported backend: PyTorch) Task head if in multi-task mode. + **kwargs + additional arguments + + Notes + ----- + Descriptors are saved as 3D numpy arrays with shape (nframes, natoms, ndesc) + where each frame contains the descriptors for all atoms. 
+ + Raises + ------ + RuntimeError + if no valid system was found + """ + if datafile is not None: + with open(datafile) as datalist: + all_sys = datalist.read().splitlines() + else: + all_sys = expand_sys_str(system) + + if len(all_sys) == 0: + raise RuntimeError("Did not find valid system") + + # init model + dp = DeepEval(model, head=head) + + # create output directory + output_dir = Path(output) + output_dir.mkdir(parents=True, exist_ok=True) + + for cc, system_path in enumerate(all_sys): + log.info("# -------output of dp eval_desc------- ") + log.info(f"# processing system : {system_path}") + + # create data class + tmap = dp.get_type_map() + data = DeepmdData( + system_path, + set_prefix="set", + shuffle_test=False, + type_map=tmap, + sort_atoms=False, + ) + + # get test data + test_data = data.get_test() + mixed_type = data.mixed_type + natoms = len(test_data["type"][0]) + nframes = test_data["box"].shape[0] + + # prepare input data + coord = test_data["coord"].reshape([nframes, -1]) + box = test_data["box"] + if not data.pbc: + box = None + if mixed_type: + atype = test_data["type"].reshape([nframes, -1]) + else: + atype = test_data["type"][0] + + # handle optional parameters + fparam = None + if dp.get_dim_fparam() > 0: + if "fparam" in test_data: + fparam = test_data["fparam"] + + aparam = None + if dp.get_dim_aparam() > 0: + if "aparam" in test_data: + aparam = test_data["aparam"] + + # evaluate descriptors + log.info(f"# evaluating descriptors for {nframes} frames") + descriptors = dp.eval_descriptor( + coord, + box, + atype, + fparam=fparam, + aparam=aparam, + ) + + # descriptors are kept in 3D format (nframes, natoms, ndesc) + + # save descriptors + system_name = os.path.basename(system_path.rstrip("/")) + desc_file = output_dir / f"{system_name}.npy" + np.save(desc_file, descriptors) + + log.info(f"# descriptors saved to {desc_file}") + log.info(f"# descriptor shape: {descriptors.shape}") + log.info("# ----------------------------------- ") + + 
log.info("# eval_desc completed successfully") diff --git a/deepmd/entrypoints/gui.py b/deepmd/entrypoints/gui.py index 7a61eb07ee..3d17810bec 100644 --- a/deepmd/entrypoints/gui.py +++ b/deepmd/entrypoints/gui.py @@ -1,8 +1,12 @@ # SPDX-License-Identifier: LGPL-3.0-or-later """DP-GUI entrypoint.""" +from typing import ( + Any, +) -def start_dpgui(*, port: int, bind_all: bool, **kwargs) -> None: + +def start_dpgui(*, port: int, bind_all: bool, **kwargs: Any) -> None: """Host DP-GUI server. Parameters diff --git a/deepmd/entrypoints/ipi.py b/deepmd/entrypoints/ipi.py index 2f7e1f5458..bd527defb1 100644 --- a/deepmd/entrypoints/ipi.py +++ b/deepmd/entrypoints/ipi.py @@ -15,7 +15,7 @@ ROOT_DIR = get_op_dir() -def _program(name: str, args: list[str]): +def _program(name: str, args: list[str]) -> None: """Execute a program. Parameters diff --git a/deepmd/entrypoints/main.py b/deepmd/entrypoints/main.py index 2c91ca5f29..34ebe4d2e3 100644 --- a/deepmd/entrypoints/main.py +++ b/deepmd/entrypoints/main.py @@ -18,6 +18,9 @@ from deepmd.entrypoints.doc import ( doc_train_input, ) +from deepmd.entrypoints.eval_desc import ( + eval_desc, +) from deepmd.entrypoints.gui import ( start_dpgui, ) @@ -65,6 +68,14 @@ def main(args: argparse.Namespace) -> None: strict_prefer=False, ) test(**dict_args) + elif args.command == "eval-desc": + dict_args["model"] = format_model_suffix( + dict_args["model"], + feature=Backend.Feature.DEEP_EVAL, + preferred_backend=args.backend, + strict_prefer=False, + ) + eval_desc(**dict_args) elif args.command == "doc-train-input": doc_train_input(**dict_args) elif args.command == "model-devi": diff --git a/deepmd/entrypoints/neighbor_stat.py b/deepmd/entrypoints/neighbor_stat.py index 62dceb24fd..d492b072ad 100644 --- a/deepmd/entrypoints/neighbor_stat.py +++ b/deepmd/entrypoints/neighbor_stat.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import logging from typing import ( + Any, Optional, ) @@ -24,8 +25,8 @@ def neighbor_stat( 
type_map: Optional[list[str]], mixed_type: bool = False, backend: str = "tensorflow", - **kwargs, -): + **kwargs: Any, +) -> None: """Calculate neighbor statistics. Parameters diff --git a/deepmd/entrypoints/show.py b/deepmd/entrypoints/show.py index e279fbe3e7..ddd097d22c 100644 --- a/deepmd/entrypoints/show.py +++ b/deepmd/entrypoints/show.py @@ -1,9 +1,19 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import logging +from typing import ( + Any, +) from deepmd.infer.deep_eval import ( DeepEval, ) +from deepmd.utils.econf_embd import ( + sort_element_type, +) +from deepmd.utils.model_branch_dict import ( + OrderedDictTableWrapper, + get_model_dict, +) log = logging.getLogger(__name__) @@ -12,7 +22,7 @@ def show( *, INPUT: str, ATTRIBUTES: list[str], - **kwargs, + **kwargs: Any, ) -> None: model = DeepEval(INPUT, head=0) model_params = model.get_model_def_script() @@ -30,10 +40,15 @@ def show( ) model_branches = list(model_params["model_dict"].keys()) model_branches += ["RANDOM"] + _, model_branch_dict = get_model_dict(model_params["model_dict"]) log.info( f"Available model branches are {model_branches}, " f"where 'RANDOM' means using a randomly initialized fitting net." 
) + log.info( + "Detailed information: \n" + + OrderedDictTableWrapper(model_branch_dict).as_table() + ) if "type-map" in ATTRIBUTES: if model_is_multi_task: model_branches = list(model_params["model_dict"].keys()) @@ -69,3 +84,48 @@ def show( log.info(f"Parameter counts{log_prefix}:") for k in sorted(size_dict): log.info(f"Parameters in {k}: {size_dict[k]:,}") + + if "observed-type" in ATTRIBUTES: + if model_is_multi_task: + log.info("The observed types for each branch: ") + total_observed_types_list = [] + model_branches = list(model_params["model_dict"].keys()) + for branch in model_branches: + if ( + model_params["model_dict"][branch] + .get("info", {}) + .get("observed_type", None) + is not None + ): + observed_type_list = model_params["model_dict"][branch]["info"][ + "observed_type" + ] + observed_types = { + "type_num": len(observed_type_list), + "observed_type": observed_type_list, + } + else: + tmp_model = DeepEval(INPUT, head=branch, no_jit=True) + observed_types = tmp_model.get_observed_types() + log.info( + f"{branch}: Number of observed types: {observed_types['type_num']} " + ) + log.info( + f"{branch}: Observed types: {observed_types['observed_type']} " + ) + total_observed_types_list += [ + tt + for tt in observed_types["observed_type"] + if tt not in total_observed_types_list + ] + log.info( + f"TOTAL number of observed types in the model: {len(total_observed_types_list)} " + ) + log.info( + f"TOTAL observed types in the model: {sort_element_type(total_observed_types_list)} " + ) + else: + log.info("The observed types for this model: ") + observed_types = model.get_observed_types() + log.info(f"Number of observed types: {observed_types['type_num']} ") + log.info(f"Observed types: {observed_types['observed_type']} ") diff --git a/deepmd/entrypoints/test.py b/deepmd/entrypoints/test.py index 919d23f757..16c097f743 100644 --- a/deepmd/entrypoints/test.py +++ b/deepmd/entrypoints/test.py @@ -7,6 +7,7 @@ ) from typing import ( TYPE_CHECKING, + Any, 
Optional, ) @@ -14,6 +15,7 @@ from deepmd.common import ( expand_sys_str, + j_loader, ) from deepmd.infer.deep_dipole import ( DeepDipole, @@ -38,9 +40,15 @@ DeepWFC, ) from deepmd.utils import random as dp_random +from deepmd.utils.compat import ( + update_deepmd_input, +) from deepmd.utils.data import ( DeepmdData, ) +from deepmd.utils.data_system import ( + process_systems, +) from deepmd.utils.weight_avg import ( weighted_average, ) @@ -58,15 +66,17 @@ def test( *, model: str, - system: str, - datafile: str, + system: Optional[str], + datafile: Optional[str], + train_json: Optional[str] = None, + valid_json: Optional[str] = None, numb_test: int, rand_seed: Optional[int], shuffle_test: bool, detail_file: str, atomic: bool, head: Optional[str] = None, - **kwargs, + **kwargs: Any, ) -> None: """Test model predictions. @@ -74,12 +84,16 @@ def test( ---------- model : str path where model is stored - system : str + system : str, optional system directory - datafile : str + datafile : str, optional the path to the list of systems to test + train_json : Optional[str] + Path to the input.json file provided via ``--train-data``. Training systems will be used for testing. + valid_json : Optional[str] + Path to the input.json file provided via ``--valid-data``. Validation systems will be used for testing. numb_test : int - munber of tests to do. 0 means all data. + number of tests to do. 0 means all data. 
rand_seed : Optional[int] seed for random generator shuffle_test : bool @@ -101,11 +115,41 @@ def test( if numb_test == 0: # only float has inf, but should work for min numb_test = float("inf") - if datafile is not None: + if train_json is not None: + jdata = j_loader(train_json) + jdata = update_deepmd_input(jdata) + data_params = jdata.get("training", {}).get("training_data", {}) + systems = data_params.get("systems") + if not systems: + raise RuntimeError("No training data found in input json") + root = Path(train_json).parent + if isinstance(systems, str): + systems = str((root / Path(systems)).resolve()) + else: + systems = [str((root / Path(ss)).resolve()) for ss in systems] + patterns = data_params.get("rglob_patterns", None) + all_sys = process_systems(systems, patterns=patterns) + elif valid_json is not None: + jdata = j_loader(valid_json) + jdata = update_deepmd_input(jdata) + data_params = jdata.get("training", {}).get("validation_data", {}) + systems = data_params.get("systems") + if not systems: + raise RuntimeError("No validation data found in input json") + root = Path(valid_json).parent + if isinstance(systems, str): + systems = str((root / Path(systems)).resolve()) + else: + systems = [str((root / Path(ss)).resolve()) for ss in systems] + patterns = data_params.get("rglob_patterns", None) + all_sys = process_systems(systems, patterns=patterns) + elif datafile is not None: with open(datafile) as datalist: all_sys = datalist.read().splitlines() - else: + elif system is not None: all_sys = expand_sys_str(system) + else: + raise RuntimeError("No data source specified for testing") if len(all_sys) == 0: raise RuntimeError("Did not find valid system") @@ -291,6 +335,7 @@ def test_ener( data.add("energy", 1, atomic=False, must=False, high_prec=True) data.add("force", 3, atomic=True, must=False, high_prec=False) + data.add("atom_pref", 1, atomic=True, must=False, high_prec=False, repeat=3) data.add("virial", 9, atomic=False, must=False, high_prec=False) if 
dp.has_efield: data.add("efield", 3, atomic=True, must=True, high_prec=False) @@ -298,7 +343,11 @@ def test_ener( data.add("atom_ener", 1, atomic=True, must=True, high_prec=False) if dp.get_dim_fparam() > 0: data.add( - "fparam", dp.get_dim_fparam(), atomic=False, must=True, high_prec=False + "fparam", + dp.get_dim_fparam(), + atomic=False, + must=not dp.has_default_fparam(), + high_prec=False, ) if dp.get_dim_aparam() > 0: data.add("aparam", dp.get_dim_aparam(), atomic=True, must=True, high_prec=False) @@ -313,6 +362,7 @@ def test_ener( find_force = test_data.get("find_force") find_virial = test_data.get("find_virial") find_force_mag = test_data.get("find_force_mag") + find_atom_pref = test_data.get("find_atom_pref") mixed_type = data.mixed_type natoms = len(test_data["type"][0]) nframes = test_data["box"].shape[0] @@ -334,7 +384,7 @@ def test_ener( atype = test_data["type"][:numb_test].reshape([numb_test, -1]) else: atype = test_data["type"][0] - if dp.get_dim_fparam() > 0: + if dp.get_dim_fparam() > 0 and test_data["find_fparam"] != 0.0: fparam = test_data["fparam"][:numb_test] else: fparam = None @@ -419,6 +469,16 @@ def test_ener( diff_f = force - test_data["force"][:numb_test] mae_f = mae(diff_f) rmse_f = rmse(diff_f) + size_f = diff_f.size + if find_atom_pref == 1: + atom_weight = test_data["atom_pref"][:numb_test] + weight_sum = np.sum(atom_weight) + if weight_sum > 0: + mae_fw = np.sum(np.abs(diff_f) * atom_weight) / weight_sum + rmse_fw = np.sqrt(np.sum(diff_f * diff_f * atom_weight) / weight_sum) + else: + mae_fw = 0.0 + rmse_fw = 0.0 diff_v = virial - test_data["virial"][:numb_test] mae_v = mae(diff_v) rmse_v = rmse(diff_v) @@ -451,13 +511,18 @@ def test_ener( dict_to_return["rmse_e"] = (rmse_e, energy.size) dict_to_return["rmse_ea"] = (rmse_ea, energy.size) if not out_put_spin and find_force == 1: - log.info(f"Force MAE : {mae_f:e} eV/A") - log.info(f"Force RMSE : {rmse_f:e} eV/A") - dict_to_return["mae_f"] = (mae_f, force.size) - 
dict_to_return["rmse_f"] = (rmse_f, force.size) + log.info(f"Force MAE : {mae_f:e} eV/Ã…") + log.info(f"Force RMSE : {rmse_f:e} eV/Ã…") + dict_to_return["mae_f"] = (mae_f, size_f) + dict_to_return["rmse_f"] = (rmse_f, size_f) + if find_atom_pref == 1: + log.info(f"Force weighted MAE : {mae_fw:e} eV/Ã…") + log.info(f"Force weighted RMSE: {rmse_fw:e} eV/Ã…") + dict_to_return["mae_fw"] = (mae_fw, weight_sum) + dict_to_return["rmse_fw"] = (rmse_fw, weight_sum) if out_put_spin and find_force == 1: - log.info(f"Force atom MAE : {mae_fr:e} eV/A") - log.info(f"Force atom RMSE : {rmse_fr:e} eV/A") + log.info(f"Force atom MAE : {mae_fr:e} eV/Ã…") + log.info(f"Force atom RMSE : {rmse_fr:e} eV/Ã…") dict_to_return["mae_fr"] = (mae_fr, force_r.size) dict_to_return["rmse_fr"] = (rmse_fr, force_r.size) if out_put_spin and find_force_mag == 1: @@ -478,8 +543,8 @@ def test_ener( log.info(f"Atomic ener MAE : {mae_ae:e} eV") log.info(f"Atomic ener RMSE : {rmse_ae:e} eV") if dp.has_hessian: - log.info(f"Hessian MAE : {mae_h:e} eV/A^2") - log.info(f"Hessian RMSE : {rmse_h:e} eV/A^2") + log.info(f"Hessian MAE : {mae_h:e} eV/Ã…^2") + log.info(f"Hessian RMSE : {rmse_h:e} eV/Ã…^2") dict_to_return["mae_h"] = (mae_h, hessian.size) dict_to_return["rmse_h"] = (rmse_h, hessian.size) @@ -597,21 +662,25 @@ def print_ener_sys_avg(avg: dict[str, float]) -> None: log.info(f"Energy RMSE : {avg['rmse_e']:e} eV") log.info(f"Energy MAE/Natoms : {avg['mae_ea']:e} eV") log.info(f"Energy RMSE/Natoms : {avg['rmse_ea']:e} eV") - if "rmse_f" in avg.keys(): - log.info(f"Force MAE : {avg['mae_f']:e} eV/A") - log.info(f"Force RMSE : {avg['rmse_f']:e} eV/A") + if "rmse_f" in avg: + log.info(f"Force MAE : {avg['mae_f']:e} eV/Ã…") + log.info(f"Force RMSE : {avg['rmse_f']:e} eV/Ã…") + if "rmse_fw" in avg: + log.info(f"Force weighted MAE : {avg['mae_fw']:e} eV/Ã…") + log.info(f"Force weighted RMSE: {avg['rmse_fw']:e} eV/Ã…") else: - log.info(f"Force atom MAE : {avg['mae_fr']:e} eV/A") + log.info(f"Force atom MAE : 
{avg['mae_fr']:e} eV/Ã…") log.info(f"Force spin MAE : {avg['mae_fm']:e} eV/uB") - log.info(f"Force atom RMSE : {avg['rmse_fr']:e} eV/A") + log.info(f"Force atom RMSE : {avg['rmse_fr']:e} eV/Ã…") log.info(f"Force spin RMSE : {avg['rmse_fm']:e} eV/uB") - log.info(f"Virial MAE : {avg['mae_v']:e} eV") - log.info(f"Virial RMSE : {avg['rmse_v']:e} eV") - log.info(f"Virial MAE/Natoms : {avg['mae_va']:e} eV") - log.info(f"Virial RMSE/Natoms : {avg['rmse_va']:e} eV") - if "rmse_h" in avg.keys(): - log.info(f"Hessian MAE : {avg['mae_h']:e} eV/A^2") - log.info(f"Hessian RMSE : {avg['rmse_h']:e} eV/A^2") + if "rmse_v" in avg: + log.info(f"Virial MAE : {avg['mae_v']:e} eV") + log.info(f"Virial RMSE : {avg['rmse_v']:e} eV") + log.info(f"Virial MAE/Natoms : {avg['mae_va']:e} eV") + log.info(f"Virial RMSE/Natoms : {avg['rmse_va']:e} eV") + if "rmse_h" in avg: + log.info(f"Hessian MAE : {avg['mae_h']:e} eV/Ã…^2") + log.info(f"Hessian RMSE : {avg['rmse_h']:e} eV/Ã…^2") def test_dos( @@ -934,7 +1003,9 @@ def print_property_sys_avg(avg: dict[str, float]) -> None: log.info(f"PROPERTY RMSE : {avg['rmse_property']:e} units") -def run_test(dp: "DeepTensor", test_data: dict, numb_test: int, test_sys: DeepmdData): +def run_test( + dp: "DeepTensor", test_data: dict, numb_test: int, test_sys: DeepmdData +) -> dict: """Run tests. Parameters @@ -998,8 +1069,8 @@ def test_wfc( wfc, numb_test, _ = run_test(dp, test_data, numb_test, data) rmse_f = rmse(wfc - test_data["wfc"][:numb_test]) - log.info("# number of test data : {numb_test:d} ") - log.info("WFC RMSE : {rmse_f:e} eV/A") + log.info(f"# number of test data : {numb_test:d} ") + log.info(f"WFC RMSE : {rmse_f:e}") if detail_file is not None: detail_path = Path(detail_file) @@ -1018,7 +1089,7 @@ def test_wfc( return {"rmse": (rmse_f, wfc.size)} -def print_wfc_sys_avg(avg) -> None: +def print_wfc_sys_avg(avg: dict) -> None: """Print errors summary for wfc type potential. 
Parameters @@ -1026,7 +1097,7 @@ def print_wfc_sys_avg(avg) -> None: avg : np.ndarray array with summaries """ - log.info(f"WFC RMSE : {avg['rmse']:e} eV/A") + log.info(f"WFC RMSE : {avg['rmse']:e}") def test_polar( @@ -1160,7 +1231,7 @@ def test_polar( return {"rmse": (rmse_f, polar.size)} -def print_polar_sys_avg(avg) -> None: +def print_polar_sys_avg(avg: dict) -> None: """Print errors summary for polar type potential. Parameters @@ -1168,7 +1239,7 @@ def print_polar_sys_avg(avg) -> None: avg : np.ndarray array with summaries """ - log.info(f"Polarizability RMSE : {avg['rmse']:e} eV/A") + log.info(f"Polarizability RMSE : {avg['rmse']:e}") def test_dipole( @@ -1274,7 +1345,7 @@ def test_dipole( return {"rmse": (rmse_f, dipole.size)} -def print_dipole_sys_avg(avg) -> None: +def print_dipole_sys_avg(avg: dict) -> None: """Print errors summary for dipole type potential. Parameters @@ -1282,4 +1353,4 @@ def print_dipole_sys_avg(avg) -> None: avg : np.ndarray array with summaries """ - log.info(f"Dipole RMSE : {avg['rmse']:e} eV/A") + log.info(f"Dipole RMSE : {avg['rmse']:e}") diff --git a/deepmd/infer/__init__.py b/deepmd/infer/__init__.py index a0330c4ffb..77780d5922 100644 --- a/deepmd/infer/__init__.py +++ b/deepmd/infer/__init__.py @@ -17,7 +17,7 @@ ] -def DeepPotential(*args, **kwargs) -> "DeepEval": +def DeepPotential(*args: object, **kwargs: object) -> "DeepEval": """Factory function that forwards to DeepEval (for compatibility). 
Parameters diff --git a/deepmd/infer/deep_eval.py b/deepmd/infer/deep_eval.py index ee61abe58c..5f29f08330 100644 --- a/deepmd/infer/deep_eval.py +++ b/deepmd/infer/deep_eval.py @@ -90,7 +90,9 @@ def __init__( ) -> None: pass - def __new__(cls, model_file: str, *args, **kwargs): + def __new__( + cls, model_file: str, *args: object, **kwargs: object + ) -> "DeepEvalBackend": if cls is DeepEvalBackend: backend = Backend.detect_backend_by_model(model_file) return super().__new__(backend().deep_eval) @@ -160,6 +162,10 @@ def get_type_map(self) -> list[str]: def get_dim_fparam(self) -> int: """Get the number (dimension) of frame parameters of this DP.""" + def has_default_fparam(self) -> bool: + """Check if the model has default frame parameters.""" + return False + @abstractmethod def get_dim_aparam(self) -> int: """Get the number (dimension) of atomic parameters of this DP.""" @@ -215,6 +221,48 @@ def eval_descriptor( """ raise NotImplementedError + def eval_fitting_last_layer( + self, + coords: np.ndarray, + cells: Optional[np.ndarray], + atom_types: np.ndarray, + fparam: Optional[np.ndarray] = None, + aparam: Optional[np.ndarray] = None, + **kwargs: Any, + ) -> np.ndarray: + """Evaluate fitting before last layer by using this DP. + + Parameters + ---------- + coords + The coordinates of atoms. + The array should be of size nframes x natoms x 3 + cells + The cell of the region. + If None then non-PBC is assumed, otherwise using PBC. + The array should be of size nframes x 9 + atom_types + The atom types + The list should contain natoms ints + fparam + The frame parameter. + The array can be of size : + - nframes x dim_fparam. + - dim_fparam. Then all frames are assumed to be provided with the same fparam. + aparam + The atomic parameter + The array can be of size : + - nframes x natoms x dim_aparam. + - natoms x dim_aparam. Then all frames are assumed to be provided with the same aparam. + - dim_aparam. Then all frames and atoms are provided with the same aparam. 
+ + Returns + ------- + fitting + Fitting output before last layer. + """ + raise NotImplementedError + def eval_typeebd(self) -> np.ndarray: """Evaluate output of type embedding network by using this model. @@ -275,7 +323,7 @@ def get_has_spin(self) -> bool: """Check if the model has spin atom types.""" return False - def get_has_hessian(self): + def get_has_hessian(self) -> bool: """Check if the model has hessian.""" return False @@ -295,6 +343,24 @@ def get_model_size(self) -> dict: """Get model parameter count.""" raise NotImplementedError("Not implemented in this backend.") + def get_observed_types(self) -> dict: + """Get observed types (elements) of the model during data statistics.""" + raise NotImplementedError("Not implemented in this backend.") + + @abstractmethod + def get_model(self) -> Any: + """Get the model module implemented by the deep learning framework. + + For PyTorch, this returns the nn.Module. For Paddle, this returns + the paddle.nn.Layer. For TensorFlow, this returns the graph. + For dpmodel, this returns the BaseModel. + + Returns + ------- + model + The model module implemented by the deep learning framework. + """ + class DeepEval(ABC): """High-level Deep Evaluator interface. @@ -319,7 +385,7 @@ class DeepEval(ABC): Keyword arguments. 
""" - def __new__(cls, model_file: str, *args, **kwargs): + def __new__(cls, model_file: str, *args: object, **kwargs: object) -> "DeepEval": if cls is DeepEval: deep_eval = DeepEvalBackend( model_file, @@ -370,6 +436,10 @@ def get_dim_fparam(self) -> int: """Get the number (dimension) of frame parameters of this DP.""" return self.deep_eval.get_dim_fparam() + def has_default_fparam(self) -> bool: + """Check if the model has default frame parameters.""" + return self.deep_eval.has_default_fparam() + def get_dim_aparam(self) -> int: """Get the number (dimension) of atomic parameters of this DP.""" return self.deep_eval.get_dim_aparam() @@ -391,7 +461,9 @@ def _get_natoms_and_nframes( nframes = coords.shape[0] return natoms, nframes - def _expande_atype(self, atype: np.ndarray, nframes: int, mixed_type: bool): + def _expande_atype( + self, atype: np.ndarray, nframes: int, mixed_type: bool + ) -> np.ndarray: if not mixed_type: atype = np.tile(atype.reshape(1, -1), (nframes, 1)) return atype @@ -463,6 +535,73 @@ def eval_descriptor( ) return descriptor + def eval_fitting_last_layer( + self, + coords: np.ndarray, + cells: Optional[np.ndarray], + atom_types: np.ndarray, + fparam: Optional[np.ndarray] = None, + aparam: Optional[np.ndarray] = None, + mixed_type: bool = False, + **kwargs: Any, + ) -> np.ndarray: + """Evaluate fitting before last layer by using this DP. + + Parameters + ---------- + coords + The coordinates of atoms. + The array should be of size nframes x natoms x 3 + cells + The cell of the region. + If None then non-PBC is assumed, otherwise using PBC. + The array should be of size nframes x 9 + atom_types + The atom types + The list should contain natoms ints + fparam + The frame parameter. + The array can be of size : + - nframes x dim_fparam. + - dim_fparam. Then all frames are assumed to be provided with the same fparam. + aparam + The atomic parameter + The array can be of size : + - nframes x natoms x dim_aparam. + - natoms x dim_aparam. 
Then all frames are assumed to be provided with the same aparam. + - dim_aparam. Then all frames and atoms are provided with the same aparam. + efield + The external field on atoms. + The array should be of size nframes x natoms x 3 + mixed_type + Whether to perform the mixed_type mode. + If True, the input data has the mixed_type format (see doc/model/train_se_atten.md), + in which frames in a system may have different natoms_vec(s), with the same nloc. + + Returns + ------- + fitting + Fitting output before last layer. + """ + ( + coords, + cells, + atom_types, + fparam, + aparam, + nframes, + natoms, + ) = self._standard_input(coords, cells, atom_types, fparam, aparam, mixed_type) + fitting = self.deep_eval.eval_fitting_last_layer( + coords, + cells, + atom_types, + fparam=fparam, + aparam=aparam, + **kwargs, + ) + return fitting + def eval_typeebd(self) -> np.ndarray: """Evaluate output of type embedding network by using this model. @@ -492,7 +631,21 @@ def eval_typeebd(self) -> np.ndarray: """ return self.deep_eval.eval_typeebd() - def _standard_input(self, coords, cells, atom_types, fparam, aparam, mixed_type): + def _standard_input( + self, + coords: Union[np.ndarray, list], + cells: Optional[Union[np.ndarray, list]], + atom_types: Union[np.ndarray, list], + fparam: Optional[Union[np.ndarray, list]], + aparam: Optional[Union[np.ndarray, list]], + mixed_type: bool, + ) -> tuple[ + np.ndarray, + Optional[np.ndarray], + np.ndarray, + Optional[np.ndarray], + Optional[np.ndarray], + ]: coords = np.array(coords) if cells is not None: cells = np.array(cells) @@ -539,7 +692,7 @@ def get_sel_type(self) -> list[int]: """ return self.deep_eval.get_sel_type() - def _get_sel_natoms(self, atype) -> int: + def _get_sel_natoms(self, atype: np.ndarray) -> int: return np.sum(np.isin(atype, self.get_sel_type()).astype(int)) @property @@ -568,3 +721,21 @@ def get_model_def_script(self) -> dict: def get_model_size(self) -> dict: """Get model parameter count.""" return 
self.deep_eval.get_model_size() + + def get_observed_types(self) -> dict: + """Get observed types (elements) of the model during data statistics.""" + return self.deep_eval.get_observed_types() + + def get_model(self) -> Any: + """Get the model module implemented by the deep learning framework. + + For PyTorch, this returns the nn.Module. For Paddle, this returns + the paddle.nn.Layer. For TensorFlow, this returns the graph. + For dpmodel, this returns the BaseModel. + + Returns + ------- + model + The model module implemented by the deep learning framework. + """ + return self.deep_eval.get_model() diff --git a/deepmd/infer/deep_polar.py b/deepmd/infer/deep_polar.py index 7220e53637..52ffd78b26 100644 --- a/deepmd/infer/deep_polar.py +++ b/deepmd/infer/deep_polar.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( + Any, Optional, Union, ) @@ -55,7 +56,7 @@ def eval( fparam: Optional[np.ndarray] = None, aparam: Optional[np.ndarray] = None, mixed_type: bool = False, - **kwargs, + **kwargs: Any, ) -> np.ndarray: """Evaluate the model. diff --git a/deepmd/infer/model_devi.py b/deepmd/infer/model_devi.py index 1828d40c49..c025297fa1 100644 --- a/deepmd/infer/model_devi.py +++ b/deepmd/infer/model_devi.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( + Any, Optional, overload, ) @@ -186,7 +187,7 @@ def calc_model_devi_v( def write_model_devi_out( devi: np.ndarray, fname: str, header: str = "", atomic: bool = False -): +) -> np.ndarray: """Write output of model deviation. 
Parameters @@ -225,7 +226,7 @@ def write_model_devi_out( return devi -def _check_tmaps(tmaps, ref_tmap=None): +def _check_tmaps(tmaps: list[list[str]], ref_tmap: Optional[list[str]] = None) -> bool: """Check whether type maps are identical.""" assert isinstance(tmaps, list) if ref_tmap is None: @@ -241,20 +242,20 @@ def _check_tmaps(tmaps, ref_tmap=None): def calc_model_devi( - coord, - box, - atype, - models, - fname=None, - frequency=1, - mixed_type=False, + coord: np.ndarray, + box: Optional[np.ndarray], + atype: np.ndarray, + models: list[DeepPot], + fname: Optional[str] = None, + frequency: int = 1, + mixed_type: bool = False, fparam: Optional[np.ndarray] = None, aparam: Optional[np.ndarray] = None, real_data: Optional[dict] = None, atomic: bool = False, relative: Optional[float] = None, relative_v: Optional[float] = None, -): +) -> np.ndarray: """Python interface to calculate model deviation. Parameters @@ -363,8 +364,8 @@ def make_model_devi( atomic: bool = False, relative: Optional[float] = None, relative_v: Optional[float] = None, - **kwargs, -): + **kwargs: Any, +) -> None: """Make model deviation calculation. 
Parameters diff --git a/deepmd/jax/atomic_model/base_atomic_model.py b/deepmd/jax/atomic_model/base_atomic_model.py index ffd58daf5e..474fcb03c7 100644 --- a/deepmd/jax/atomic_model/base_atomic_model.py +++ b/deepmd/jax/atomic_model/base_atomic_model.py @@ -1,4 +1,8 @@ # SPDX-License-Identifier: LGPL-3.0-or-later +from typing import ( + Any, +) + from deepmd.jax.common import ( ArrayAPIVariable, to_jax_array, @@ -9,7 +13,7 @@ ) -def base_atomic_model_set_attr(name, value): +def base_atomic_model_set_attr(name: str, value: Any) -> Any: if name in {"out_bias", "out_std"}: value = to_jax_array(value) if value is not None: diff --git a/deepmd/jax/common.py b/deepmd/jax/common.py index 59f36d11ad..14ae1cad9d 100644 --- a/deepmd/jax/common.py +++ b/deepmd/jax/common.py @@ -70,11 +70,11 @@ def flax_module( metas.add(type(nnx.Module)) class MixedMetaClass(*metas): - def __call__(self, *args, **kwargs): + def __call__(self, *args: Any, **kwargs: Any) -> Any: return type(nnx.Module).__call__(self, *args, **kwargs) class FlaxModule(module, nnx.Module, metaclass=MixedMetaClass): - def __init_subclass__(cls, **kwargs) -> None: + def __init_subclass__(cls, **kwargs: Any) -> None: return super().__init_subclass__(**kwargs) def __setattr__(self, name: str, value: Any) -> None: @@ -84,20 +84,22 @@ def __setattr__(self, name: str, value: Any) -> None: class ArrayAPIVariable(nnx.Variable): - def __array__(self, *args, **kwargs): + def __array__(self, *args: Any, **kwargs: Any) -> np.ndarray: return self.value.__array__(*args, **kwargs) - def __array_namespace__(self, *args, **kwargs): + def __array_namespace__(self, *args: Any, **kwargs: Any) -> Any: return self.value.__array_namespace__(*args, **kwargs) - def __dlpack__(self, *args, **kwargs): + def __dlpack__(self, *args: Any, **kwargs: Any) -> Any: return self.value.__dlpack__(*args, **kwargs) - def __dlpack_device__(self, *args, **kwargs): + def __dlpack_device__(self, *args: Any, **kwargs: Any) -> Any: return 
self.value.__dlpack_device__(*args, **kwargs) -def scatter_sum(input, dim, index: jnp.ndarray, src: jnp.ndarray) -> jnp.ndarray: +def scatter_sum( + input: jnp.ndarray, dim: int, index: jnp.ndarray, src: jnp.ndarray +) -> jnp.ndarray: """Reduces all values from the src tensor to the indices specified in the index tensor.""" idx = jnp.arange(input.size, dtype=jnp.int64).reshape(input.shape) new_idx = jnp.take_along_axis(idx, index, axis=dim).ravel() diff --git a/deepmd/jax/fitting/fitting.py b/deepmd/jax/fitting/fitting.py index d62681490c..e69bded640 100644 --- a/deepmd/jax/fitting/fitting.py +++ b/deepmd/jax/fitting/fitting.py @@ -35,6 +35,7 @@ def setattr_for_general_fitting(name: str, value: Any) -> Any: "fparam_inv_std", "aparam_avg", "aparam_inv_std", + "default_fparam_tensor", }: value = to_jax_array(value) if value is not None: diff --git a/deepmd/jax/infer/deep_eval.py b/deepmd/jax/infer/deep_eval.py index acfd42b66a..92ed78a13e 100644 --- a/deepmd/jax/infer/deep_eval.py +++ b/deepmd/jax/infer/deep_eval.py @@ -301,7 +301,7 @@ def _eval_func(self, inner_func: Callable, numb_test: int, natoms: int) -> Calla """ if self.auto_batch_size is not None: - def eval_func(*args, **kwargs): + def eval_func(*args: Any, **kwargs: Any) -> Any: return self.auto_batch_size.execute_all( inner_func, numb_test, natoms, *args, **kwargs ) @@ -335,7 +335,7 @@ def _eval_model( fparam: Optional[np.ndarray], aparam: Optional[np.ndarray], request_defs: list[OutputVariableDef], - ): + ) -> tuple[np.ndarray, ...]: model = self.dp nframes = coords.shape[0] @@ -395,7 +395,9 @@ def _eval_model( ) # this is kinda hacky return tuple(results) - def _get_output_shape(self, odef, nframes, natoms): + def _get_output_shape( + self, odef: OutputVariableDef, nframes: int, natoms: int + ) -> list[int]: if odef.category == OutputVariableCategory.DERV_C_REDU: # virial return [nframes, *odef.shape[:-1], 9] @@ -420,3 +422,13 @@ def _get_output_shape(self, odef, nframes, natoms): def 
get_model_def_script(self) -> dict: """Get model definition script.""" return json.loads(self.dp.get_model_def_script()) + + def get_model(self) -> Any: + """Get the JAX model as BaseModel. + + Returns + ------- + BaseModel + The JAX model as BaseModel instance. + """ + return self.dp diff --git a/deepmd/jax/jax2tf/format_nlist.py b/deepmd/jax/jax2tf/format_nlist.py index f0c630206f..5cf93610e7 100644 --- a/deepmd/jax/jax2tf/format_nlist.py +++ b/deepmd/jax/jax2tf/format_nlist.py @@ -9,7 +9,7 @@ def format_nlist( nlist: tnp.ndarray, nsel: int, rcut: float, -): +) -> tnp.ndarray: """Format neighbor list. If nnei == nsel, do nothing; diff --git a/deepmd/jax/jax2tf/make_model.py b/deepmd/jax/jax2tf/make_model.py index 29ed131f8e..341fdf0d1f 100644 --- a/deepmd/jax/jax2tf/make_model.py +++ b/deepmd/jax/jax2tf/make_model.py @@ -44,7 +44,7 @@ def model_call_from_call_lower( fparam: tnp.ndarray, aparam: tnp.ndarray, do_atomic_virial: bool = False, -): +) -> dict[str, tnp.ndarray]: """Return model prediction from lower interface. Parameters diff --git a/deepmd/jax/jax2tf/nlist.py b/deepmd/jax/jax2tf/nlist.py index 5a0ed58b63..f85526f1e9 100644 --- a/deepmd/jax/jax2tf/nlist.py +++ b/deepmd/jax/jax2tf/nlist.py @@ -115,7 +115,7 @@ def nlist_distinguish_types( nlist: tnp.ndarray, atype: tnp.ndarray, sel: list[int], -): +) -> tnp.ndarray: """Given a nlist that does not distinguish atom types, return a nlist that distinguish atom types. @@ -140,7 +140,7 @@ def nlist_distinguish_types( return ret -def tf_outer(a, b): +def tf_outer(a: tnp.ndarray, b: tnp.ndarray) -> tnp.ndarray: return tf.einsum("i,j->ij", a, b) @@ -150,7 +150,7 @@ def extend_coord_with_ghosts( atype: tnp.ndarray, cell: tnp.ndarray, rcut: float, -): +) -> tuple[tnp.ndarray, tnp.ndarray, tnp.ndarray]: """Extend the coordinates of the atoms by appending peridoc images. The number of images is large enough to ensure all the neighbors within rcut are appended. 
diff --git a/deepmd/jax/jax2tf/region.py b/deepmd/jax/jax2tf/region.py index 96024bd79a..a90e693478 100644 --- a/deepmd/jax/jax2tf/region.py +++ b/deepmd/jax/jax2tf/region.py @@ -93,7 +93,7 @@ def to_face_distance( return tnp.reshape(dist, tf.concat([cshape[:-2], [3]], axis=0)) -def b_to_face_distance(cell): +def b_to_face_distance(cell: tnp.ndarray) -> tnp.ndarray: volume = tf.linalg.det(cell) c_yz = tf.linalg.cross(cell[:, 1, ...], cell[:, 2, ...]) _h2yz = volume / tf.linalg.norm(c_yz, axis=-1) diff --git a/deepmd/jax/jax2tf/serialization.py b/deepmd/jax/jax2tf/serialization.py index aac022ace9..096fc41e5a 100644 --- a/deepmd/jax/jax2tf/serialization.py +++ b/deepmd/jax/jax2tf/serialization.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import json from typing import ( + Callable, Optional, ) @@ -38,10 +39,17 @@ def deserialize_to_file(model_file: str, data: dict) -> None: tf_model = tf.Module() - def exported_whether_do_atomic_virial(do_atomic_virial, has_ghost_atoms): + def exported_whether_do_atomic_virial( + do_atomic_virial: bool, has_ghost_atoms: bool + ) -> Callable: def call_lower_with_fixed_do_atomic_virial( - coord, atype, nlist, mapping, fparam, aparam - ): + coord: tnp.ndarray, + atype: tnp.ndarray, + nlist: tnp.ndarray, + mapping: tnp.ndarray, + fparam: tnp.ndarray, + aparam: tnp.ndarray, + ) -> dict[str, tnp.ndarray]: return call_lower( coord, atype, @@ -86,8 +94,13 @@ def call_lower_with_fixed_do_atomic_virial( ], ) def call_lower_without_atomic_virial( - coord, atype, nlist, mapping, fparam, aparam - ): + coord: tnp.ndarray, + atype: tnp.ndarray, + nlist: tnp.ndarray, + mapping: tnp.ndarray, + fparam: tnp.ndarray, + aparam: tnp.ndarray, + ) -> dict[str, tnp.ndarray]: nlist = format_nlist(coord, nlist, model.get_nnei(), model.get_rcut()) return tf.cond( tf.shape(coord)[1] == tf.shape(nlist)[1], @@ -112,7 +125,14 @@ def call_lower_without_atomic_virial( tf.TensorSpec([None, None, model.get_dim_aparam()], tf.float64), ], ) - def 
call_lower_with_atomic_virial(coord, atype, nlist, mapping, fparam, aparam): + def call_lower_with_atomic_virial( + coord: tnp.ndarray, + atype: tnp.ndarray, + nlist: tnp.ndarray, + mapping: tnp.ndarray, + fparam: tnp.ndarray, + aparam: tnp.ndarray, + ) -> dict[str, tnp.ndarray]: nlist = format_nlist(coord, nlist, model.get_nnei(), model.get_rcut()) return tf.cond( tf.shape(coord)[1] == tf.shape(nlist)[1], @@ -126,7 +146,7 @@ def call_lower_with_atomic_virial(coord, atype, nlist, mapping, fparam, aparam): tf_model.call_lower_atomic_virial = call_lower_with_atomic_virial - def make_call_whether_do_atomic_virial(do_atomic_virial: bool): + def make_call_whether_do_atomic_virial(do_atomic_virial: bool) -> Callable: if do_atomic_virial: call_lower = call_lower_with_atomic_virial else: @@ -138,7 +158,7 @@ def call( box: Optional[tnp.ndarray] = None, fparam: Optional[tnp.ndarray] = None, aparam: Optional[tnp.ndarray] = None, - ): + ) -> dict[str, tnp.ndarray]: """Return model prediction. Parameters @@ -194,7 +214,7 @@ def call_with_atomic_virial( box: tnp.ndarray, fparam: tnp.ndarray, aparam: tnp.ndarray, - ): + ) -> dict[str, tnp.ndarray]: return make_call_whether_do_atomic_virial(do_atomic_virial=True)( coord, atype, box, fparam, aparam ) @@ -217,7 +237,7 @@ def call_without_atomic_virial( box: tnp.ndarray, fparam: tnp.ndarray, aparam: tnp.ndarray, - ): + ) -> dict[str, tnp.ndarray]: return make_call_whether_do_atomic_virial(do_atomic_virial=False)( coord, atype, box, fparam, aparam ) @@ -226,49 +246,49 @@ def call_without_atomic_virial( # set functions to export other attributes @tf.function - def get_type_map(): + def get_type_map() -> tf.Tensor: return tf.constant(model.get_type_map(), dtype=tf.string) tf_model.get_type_map = get_type_map @tf.function - def get_rcut(): + def get_rcut() -> tf.Tensor: return tf.constant(model.get_rcut(), dtype=tf.double) tf_model.get_rcut = get_rcut @tf.function - def get_dim_fparam(): + def get_dim_fparam() -> tf.Tensor: return 
tf.constant(model.get_dim_fparam(), dtype=tf.int64) tf_model.get_dim_fparam = get_dim_fparam @tf.function - def get_dim_aparam(): + def get_dim_aparam() -> tf.Tensor: return tf.constant(model.get_dim_aparam(), dtype=tf.int64) tf_model.get_dim_aparam = get_dim_aparam @tf.function - def get_sel_type(): + def get_sel_type() -> tf.Tensor: return tf.constant(model.get_sel_type(), dtype=tf.int64) tf_model.get_sel_type = get_sel_type @tf.function - def is_aparam_nall(): + def is_aparam_nall() -> tf.Tensor: return tf.constant(model.is_aparam_nall(), dtype=tf.bool) tf_model.is_aparam_nall = is_aparam_nall @tf.function - def model_output_type(): + def model_output_type() -> tf.Tensor: return tf.constant(model.model_output_type(), dtype=tf.string) tf_model.model_output_type = model_output_type @tf.function - def mixed_types(): + def mixed_types() -> tf.Tensor: return tf.constant(model.mixed_types(), dtype=tf.bool) tf_model.mixed_types = mixed_types @@ -276,19 +296,19 @@ def mixed_types(): if model.get_min_nbor_dist() is not None: @tf.function - def get_min_nbor_dist(): + def get_min_nbor_dist() -> tf.Tensor: return tf.constant(model.get_min_nbor_dist(), dtype=tf.double) tf_model.get_min_nbor_dist = get_min_nbor_dist @tf.function - def get_sel(): + def get_sel() -> tf.Tensor: return tf.constant(model.get_sel(), dtype=tf.int64) tf_model.get_sel = get_sel @tf.function - def get_model_def_script(): + def get_model_def_script() -> tf.Tensor: return tf.constant( json.dumps(model_def_script, separators=(",", ":")), dtype=tf.string ) diff --git a/deepmd/jax/jax2tf/tfmodel.py b/deepmd/jax/jax2tf/tfmodel.py index 0d7b13ba1f..61c83fa028 100644 --- a/deepmd/jax/jax2tf/tfmodel.py +++ b/deepmd/jax/jax2tf/tfmodel.py @@ -45,7 +45,7 @@ def decode_list_of_bytes(list_of_bytes: list[bytes]) -> list[str]: class TFModelWrapper(tf.Module): def __init__( self, - model, + model: str, ) -> None: self.model = tf.saved_model.load(model) self._call_lower = jax2tf.call_tf(self.model.call_lower) @@ -115,7 
+115,7 @@ def call( fparam: Optional[jnp.ndarray] = None, aparam: Optional[jnp.ndarray] = None, do_atomic_virial: bool = False, - ): + ) -> dict[str, jnp.ndarray]: """Return model prediction. Parameters @@ -165,7 +165,7 @@ def call( aparam, ) - def model_output_def(self): + def model_output_def(self) -> ModelOutputDef: return ModelOutputDef( FittingOutputDef([OUTPUT_DEFS[tt] for tt in self.model_output_type()]) ) @@ -179,7 +179,7 @@ def call_lower( fparam: Optional[jnp.ndarray] = None, aparam: Optional[jnp.ndarray] = None, do_atomic_virial: bool = False, - ): + ) -> dict[str, jnp.ndarray]: if do_atomic_virial: call_lower = self._call_lower_atomic_virial else: @@ -207,15 +207,15 @@ def get_type_map(self) -> list[str]: """Get the type map.""" return self.type_map - def get_rcut(self): + def get_rcut(self) -> float: """Get the cut-off radius.""" return self.rcut - def get_dim_fparam(self): + def get_dim_fparam(self) -> int: """Get the number (dimension) of frame parameters of this atomic model.""" return self.dim_fparam - def get_dim_aparam(self): + def get_dim_aparam(self) -> int: """Get the number (dimension) of atomic parameters of this atomic model.""" return self.dim_aparam diff --git a/deepmd/jax/model/base_model.py b/deepmd/jax/model/base_model.py index 7c97ff692f..203da40d07 100644 --- a/deepmd/jax/model/base_model.py +++ b/deepmd/jax/model/base_model.py @@ -20,7 +20,7 @@ def forward_common_atomic( - self, + self: "BaseModel", extended_coord: jnp.ndarray, extended_atype: jnp.ndarray, nlist: jnp.ndarray, @@ -28,7 +28,7 @@ def forward_common_atomic( fparam: Optional[jnp.ndarray] = None, aparam: Optional[jnp.ndarray] = None, do_atomic_virial: bool = False, -): +) -> dict[str, jnp.ndarray]: atomic_ret = self.atomic_model.forward_common_atomic( extended_coord, extended_atype, @@ -46,21 +46,30 @@ def forward_common_atomic( atom_axis = -(len(shap) + 1) if vdef.reducible: kk_redu = get_reduce_name(kk) - model_predict[kk_redu] = jnp.sum(vv, axis=atom_axis) + if 
vdef.intensive: + mask = atomic_ret["mask"] if "mask" in atomic_ret else None + if mask is not None: + model_predict[kk_redu] = jnp.sum(vv, axis=atom_axis) / jnp.sum( + mask, axis=-1, keepdims=True + ) + else: + model_predict[kk_redu] = jnp.mean(vv, axis=atom_axis) + else: + model_predict[kk_redu] = jnp.sum(vv, axis=atom_axis) kk_derv_r, kk_derv_c = get_deriv_name(kk) if vdef.r_differentiable: def eval_output( - cc_ext, - extended_atype, - nlist, - mapping, - fparam, - aparam, + cc_ext: jnp.ndarray, + extended_atype: jnp.ndarray, + nlist: jnp.ndarray, + mapping: Optional[jnp.ndarray], + fparam: Optional[jnp.ndarray], + aparam: Optional[jnp.ndarray], *, - _kk=kk, - _atom_axis=atom_axis, - ): + _kk: str = kk, + _atom_axis: int = atom_axis, + ) -> jnp.ndarray: atomic_ret = self.atomic_model.forward_common_atomic( cc_ext[None, ...], extended_atype[None, ...], @@ -108,16 +117,16 @@ def eval_output( if do_atomic_virial: def eval_ce( - cc_ext, - extended_atype, - nlist, - mapping, - fparam, - aparam, + cc_ext: jnp.ndarray, + extended_atype: jnp.ndarray, + nlist: jnp.ndarray, + mapping: Optional[jnp.ndarray], + fparam: Optional[jnp.ndarray], + aparam: Optional[jnp.ndarray], *, - _kk=kk, - _atom_axis=atom_axis - 1, - ): + _kk: str = kk, + _atom_axis: int = atom_axis - 1, + ) -> jnp.ndarray: # atomic_ret[_kk]: [nf, nloc, *def] atomic_ret = self.atomic_model.forward_common_atomic( cc_ext[None, ...], diff --git a/deepmd/jax/model/dp_model.py b/deepmd/jax/model/dp_model.py index 436582f22b..ee98a689e4 100644 --- a/deepmd/jax/model/dp_model.py +++ b/deepmd/jax/model/dp_model.py @@ -56,7 +56,7 @@ def forward_common_atomic( fparam: Optional[jnp.ndarray] = None, aparam: Optional[jnp.ndarray] = None, do_atomic_virial: bool = False, - ): + ) -> dict[str, jnp.ndarray]: return forward_common_atomic( self, extended_coord, @@ -74,7 +74,7 @@ def format_nlist( extended_atype: jnp.ndarray, nlist: jnp.ndarray, extra_nlist_sort: bool = False, - ): + ) -> jnp.ndarray: return 
dpmodel_model.format_nlist( self, jax.lax.stop_gradient(extended_coord), diff --git a/deepmd/jax/model/dp_zbl_model.py b/deepmd/jax/model/dp_zbl_model.py index babbc65233..065dbc7aa7 100644 --- a/deepmd/jax/model/dp_zbl_model.py +++ b/deepmd/jax/model/dp_zbl_model.py @@ -38,7 +38,7 @@ def forward_common_atomic( fparam: Optional[jnp.ndarray] = None, aparam: Optional[jnp.ndarray] = None, do_atomic_virial: bool = False, - ): + ) -> dict[str, jnp.ndarray]: return forward_common_atomic( self, extended_coord, @@ -56,7 +56,7 @@ def format_nlist( extended_atype: jnp.ndarray, nlist: jnp.ndarray, extra_nlist_sort: bool = False, - ): + ) -> jnp.ndarray: return DPZBLModelDP.format_nlist( self, jax.lax.stop_gradient(extended_coord), diff --git a/deepmd/jax/model/hlo.py b/deepmd/jax/model/hlo.py index 4d59957456..cbeb915329 100644 --- a/deepmd/jax/model/hlo.py +++ b/deepmd/jax/model/hlo.py @@ -44,21 +44,21 @@ class HLO(BaseModel): def __init__( self, - stablehlo, - stablehlo_atomic_virial, - stablehlo_no_ghost, - stablehlo_atomic_virial_no_ghost, - model_def_script, - type_map, - rcut, - dim_fparam, - dim_aparam, - sel_type, - is_aparam_nall, - model_output_type, - mixed_types, - min_nbor_dist, - sel, + stablehlo: bytearray, + stablehlo_atomic_virial: bytearray, + stablehlo_no_ghost: bytearray, + stablehlo_atomic_virial_no_ghost: bytearray, + model_def_script: str, + type_map: list[str], + rcut: float, + dim_fparam: int, + dim_aparam: int, + sel_type: list[int], + is_aparam_nall: bool, + model_output_type: str, + mixed_types: bool, + min_nbor_dist: Optional[float], + sel: list[int], ) -> None: self._call_lower = jax_export.deserialize(stablehlo).call self._call_lower_atomic_virial = jax_export.deserialize( @@ -125,7 +125,7 @@ def call( fparam: Optional[jnp.ndarray] = None, aparam: Optional[jnp.ndarray] = None, do_atomic_virial: bool = False, - ): + ) -> dict[str, jnp.ndarray]: """Return model prediction. 
Parameters @@ -165,7 +165,7 @@ def call( do_atomic_virial=do_atomic_virial, ) - def model_output_def(self): + def model_output_def(self) -> ModelOutputDef: return ModelOutputDef( FittingOutputDef([OUTPUT_DEFS[tt] for tt in self.model_output_type()]) ) @@ -179,7 +179,7 @@ def call_lower( fparam: Optional[jnp.ndarray] = None, aparam: Optional[jnp.ndarray] = None, do_atomic_virial: bool = False, - ): + ) -> dict[str, jnp.ndarray]: if extended_coord.shape[1] > nlist.shape[1]: if do_atomic_virial: call_lower = self._call_lower_atomic_virial @@ -203,15 +203,15 @@ def get_type_map(self) -> list[str]: """Get the type map.""" return self.type_map - def get_rcut(self): + def get_rcut(self) -> float: """Get the cut-off radius.""" return self.rcut - def get_dim_fparam(self): + def get_dim_fparam(self) -> int: """Get the number (dimension) of frame parameters of this atomic model.""" return self.dim_fparam - def get_dim_aparam(self): + def get_dim_aparam(self) -> int: """Get the number (dimension) of atomic parameters of this atomic model.""" return self.dim_aparam diff --git a/deepmd/jax/model/model.py b/deepmd/jax/model/model.py index dc350e968c..321f33b315 100644 --- a/deepmd/jax/model/model.py +++ b/deepmd/jax/model/model.py @@ -26,7 +26,7 @@ ) -def get_standard_model(data: dict): +def get_standard_model(data: dict) -> BaseModel: """Get a Model from a dictionary. Parameters @@ -103,7 +103,7 @@ def get_zbl_model(data: dict) -> DPZBLModel: ) -def get_model(data: dict): +def get_model(data: dict) -> BaseModel: """Get a model from a dictionary. 
Parameters diff --git a/deepmd/jax/utils/auto_batch_size.py b/deepmd/jax/utils/auto_batch_size.py index ef93d85d96..1ecf020086 100644 --- a/deepmd/jax/utils/auto_batch_size.py +++ b/deepmd/jax/utils/auto_batch_size.py @@ -1,6 +1,5 @@ # SPDX-License-Identifier: LGPL-3.0-or-later -import jaxlib from deepmd.jax.env import ( jax, @@ -52,7 +51,7 @@ def is_oom_error(self, e: Exception) -> bool: # several sources think CUSOLVER_STATUS_INTERNAL_ERROR is another out-of-memory error, # such as https://github.com/JuliaGPU/CUDA.jl/issues/1924 # (the meaningless error message should be considered as a bug in cusolver) - if isinstance(e, (jaxlib.xla_extension.XlaRuntimeError, ValueError)) and ( + if isinstance(e, (RuntimeError, ValueError)) and ( "RESOURCE_EXHAUSTED:" in e.args[0] ): return True diff --git a/deepmd/jax/utils/neighbor_stat.py b/deepmd/jax/utils/neighbor_stat.py index 6d9bc872e8..ddfc4199a3 100644 --- a/deepmd/jax/utils/neighbor_stat.py +++ b/deepmd/jax/utils/neighbor_stat.py @@ -82,7 +82,7 @@ def _execute( coord: np.ndarray, atype: np.ndarray, cell: Optional[np.ndarray], - ): + ) -> tuple[np.ndarray, np.ndarray]: """Execute the operation. 
Parameters diff --git a/deepmd/jax/utils/network.py b/deepmd/jax/utils/network.py index 2c406095cd..5a42323b90 100644 --- a/deepmd/jax/utils/network.py +++ b/deepmd/jax/utils/network.py @@ -4,6 +4,8 @@ ClassVar, ) +import numpy as np + from deepmd.dpmodel.common import ( NativeOP, ) @@ -16,6 +18,7 @@ make_multilayer_network, ) from deepmd.jax.common import ( + ArrayAPIVariable, flax_module, to_jax_array, ) @@ -25,16 +28,16 @@ class ArrayAPIParam(nnx.Param): - def __array__(self, *args, **kwargs): + def __array__(self, *args: Any, **kwargs: Any) -> np.ndarray: return self.value.__array__(*args, **kwargs) - def __array_namespace__(self, *args, **kwargs): + def __array_namespace__(self, *args: Any, **kwargs: Any) -> Any: return self.value.__array_namespace__(*args, **kwargs) - def __dlpack__(self, *args, **kwargs): + def __dlpack__(self, *args: Any, **kwargs: Any) -> Any: return self.value.__dlpack__(*args, **kwargs) - def __dlpack_device__(self, *args, **kwargs): + def __dlpack_device__(self, *args: Any, **kwargs: Any) -> Any: return self.value.__dlpack_device__(*args, **kwargs) @@ -44,7 +47,10 @@ def __setattr__(self, name: str, value: Any) -> None: if name in {"w", "b", "idt"}: value = to_jax_array(value) if value is not None: - value = ArrayAPIParam(value) + if self.trainable: + value = ArrayAPIParam(value) + else: + value = ArrayAPIVariable(value) return super().__setattr__(name, value) diff --git a/deepmd/jax/utils/serialization.py b/deepmd/jax/utils/serialization.py index 5d4da49e08..6a3c839608 100644 --- a/deepmd/jax/utils/serialization.py +++ b/deepmd/jax/utils/serialization.py @@ -55,10 +55,15 @@ def deserialize_to_file(model_file: str, data: dict) -> None: def exported_whether_do_atomic_virial( do_atomic_virial: bool, has_ghost_atoms: bool - ): + ) -> "jax_export.Exported": def call_lower_with_fixed_do_atomic_virial( - coord, atype, nlist, mapping, fparam, aparam - ): + coord: jnp.ndarray, + atype: jnp.ndarray, + nlist: jnp.ndarray, + mapping: jnp.ndarray, 
+ fparam: jnp.ndarray, + aparam: jnp.ndarray, + ) -> dict[str, jnp.ndarray]: return call_lower( coord, atype, diff --git a/deepmd/lmp.py b/deepmd/lmp.py index 15959cf243..7ac1570f0f 100644 --- a/deepmd/lmp.py +++ b/deepmd/lmp.py @@ -81,7 +81,6 @@ def get_library_path(module: str, filename: str) -> list[str]: pt_dir = os.path.join(torch.__path__[0], "lib") op_dir = str(SHARED_LIB_DIR) - cuda_library_paths = [] if platform.system() == "Linux": cuda_library_paths.extend( diff --git a/deepmd/lmp_check_build.py b/deepmd/lmp_check_build.py new file mode 100644 index 0000000000..dc81a31d8e --- /dev/null +++ b/deepmd/lmp_check_build.py @@ -0,0 +1,19 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +from deepmd.env import ( + GLOBAL_CONFIG, +) + +if GLOBAL_CONFIG.get("lammps_version", "") == "": + + def get_op_dir() -> str: + """Get the directory of the deepmd-kit OP library.""" + # empty + return "" +else: + from deepmd.lmp import ( + get_op_dir, + ) + +__all__ = [ + "get_op_dir", +] diff --git a/deepmd/loggers/loggers.py b/deepmd/loggers/loggers.py index 060ac1a4b0..a57d63be59 100644 --- a/deepmd/loggers/loggers.py +++ b/deepmd/loggers/loggers.py @@ -44,7 +44,7 @@ class _AppFilter(logging.Filter): """Add field `app_name` to log messages.""" - def filter(self, record) -> bool: + def filter(self, record: logging.LogRecord) -> bool: record.app_name = "DEEPMD" return True @@ -56,7 +56,7 @@ def __init__(self, rank: int) -> None: super().__init__(name="MPI_rank_id") self.mpi_rank = str(rank) - def filter(self, record) -> bool: + def filter(self, record: logging.LogRecord) -> bool: record.rank = self.mpi_rank return True @@ -68,7 +68,7 @@ def __init__(self, rank: int) -> None: super().__init__(name="MPI_master_log") self.mpi_rank = rank - def filter(self, record) -> bool: + def filter(self, record: logging.LogRecord) -> bool: if self.mpi_rank == 0: return True else: @@ -135,10 +135,10 @@ def __init__( self.MPI = MPI super().__init__(filename, mode=mode, encoding=None, 
delay=False) - def _open(self): + def _open(self) -> "_MPIFileStream": return _MPIFileStream(self.baseFilename, self.MPI, self.mode) - def setStream(self, stream) -> NoReturn: + def setStream(self, stream: "_MPIFileStream") -> NoReturn: """Stream cannot be reasigned in MPI mode.""" raise NotImplementedError("Unable to do for MPI file handler!") diff --git a/deepmd/main.py b/deepmd/main.py index 14c0390bdc..d829f11ba2 100644 --- a/deepmd/main.py +++ b/deepmd/main.py @@ -14,6 +14,7 @@ defaultdict, ) from typing import ( + Any, Optional, ) @@ -63,19 +64,31 @@ class RawTextArgumentDefaultsHelpFormatter( class BackendOption(argparse.Action): """Map backend alias to unique name.""" - def __call__(self, parser, namespace, values, option_string=None): + def __call__( + self, + parser: argparse.ArgumentParser, + namespace: argparse.Namespace, + values: Any, + option_string: Optional[str] = None, + ) -> None: setattr(namespace, self.dest, BACKEND_TABLE[values]) class DeprecateAction(argparse.Action): # See https://stackoverflow.com/a/69052677/9567349 by Ibolit under CC BY-SA 4.0 - def __init__(self, *args, **kwargs) -> None: + def __init__(self, *args: Any, **kwargs: Any) -> None: self.call_count = 0 if "help" in kwargs: kwargs["help"] = f"[DEPRECATED] {kwargs['help']}" super().__init__(*args, **kwargs) - def __call__(self, parser, namespace, values, option_string=None): + def __call__( + self, + parser: argparse.ArgumentParser, + namespace: argparse.Namespace, + values: Any, + option_string: Optional[str] = None, + ) -> None: if self.call_count == 0: warnings.warn( f"The option `{option_string}` is deprecated. It will be ignored.", @@ -112,7 +125,7 @@ def main_parser() -> argparse.ArgumentParser: if default_backend not in BACKEND_TABLE.keys(): raise ValueError( f"Unknown backend {default_backend}. " - "Please set DP_BACKEND to either tensorflow or pytorch." + "Please set DP_BACKEND to either tensorflow, pytorch, or paddle." 
) parser_backend = parser.add_mutually_exclusive_group() @@ -312,7 +325,7 @@ def main_parser() -> argparse.ArgumentParser: "--output", type=str, default="frozen_model", - help="Filename (prefix) of the output model file. TensorFlow backend: suffix is .pb; PyTorch backend: suffix is .pth", + help="Filename (prefix) of the output model file. TensorFlow backend: suffix is .pb; PyTorch backend: suffix is .pth; Paddle backend: suffix is .json and .pdiparams", ) parser_frz.add_argument( "-n", @@ -371,6 +384,24 @@ def main_parser() -> argparse.ArgumentParser: type=str, help="The path to the datafile, each line of which is a path to one data system.", ) + parser_tst_subgroup.add_argument( + "--train-data", + dest="train_json", + default=None, + type=str, + help=( + "The input json file. Training data in the file will be used for testing." + ), + ) + parser_tst_subgroup.add_argument( + "--valid-data", + dest="valid_json", + default=None, + type=str, + help=( + "The input json file. Validation data in the file will be used for testing." + ), + ) parser_tst.add_argument( "-S", "--set-prefix", @@ -416,6 +447,56 @@ def main_parser() -> argparse.ArgumentParser: help="(Supported backend: PyTorch) Task head (alias: model branch) to test if in multi-task mode.", ) + # * eval_desc script *************************************************************** + parser_eval_desc = subparsers.add_parser( + "eval-desc", + parents=[parser_log], + help="evaluate descriptors using the model", + formatter_class=RawTextArgumentDefaultsHelpFormatter, + epilog=textwrap.dedent( + """\ + examples: + dp eval-desc -m graph.pb -s /path/to/system -o desc + """ + ), + ) + parser_eval_desc.add_argument( + "-m", + "--model", + default="frozen_model", + type=str, + help="Frozen model file (prefix) to import. 
TensorFlow backend: suffix is .pb; PyTorch backend: suffix is .pth.", + ) + parser_eval_desc_subgroup = parser_eval_desc.add_mutually_exclusive_group() + parser_eval_desc_subgroup.add_argument( + "-s", + "--system", + default=".", + type=str, + help="The system dir. Recursively detect systems in this directory", + ) + parser_eval_desc_subgroup.add_argument( + "-f", + "--datafile", + default=None, + type=str, + help="The path to the datafile, each line of which is a path to one data system.", + ) + parser_eval_desc.add_argument( + "-o", + "--output", + default="desc", + type=str, + help="Output directory for descriptor files. Descriptors will be saved as desc/(system_name).npy", + ) + parser_eval_desc.add_argument( + "--head", + "--model-branch", + default=None, + type=str, + help="(Supported backend: PyTorch) Task head (alias: model branch) to use if in multi-task mode.", + ) + # * compress model ***************************************************************** # Compress a model, which including tabulating the embedding-net. 
# The table is composed of fifth-order polynomial coefficients and is assembled @@ -671,12 +752,13 @@ def main_parser() -> argparse.ArgumentParser: parser_change_bias = subparsers.add_parser( "change-bias", parents=[parser_log], - help="(Supported backend: PyTorch) Change model out bias according to the input data.", + help="Change model out bias according to the input data.", formatter_class=RawTextArgumentDefaultsHelpFormatter, epilog=textwrap.dedent( """\ examples: - dp change-bias model.pt -s data -n 10 -m change + dp --pt change-bias model.pt -s data -n 10 -m change + dp --tf change-bias model.ckpt -s data -n 10 -m change """ ), ) @@ -851,7 +933,14 @@ def main_parser() -> argparse.ArgumentParser: ) parser_show.add_argument( "ATTRIBUTES", - choices=["model-branch", "type-map", "descriptor", "fitting-net", "size"], + choices=[ + "model-branch", + "type-map", + "descriptor", + "fitting-net", + "size", + "observed-type", + ], nargs="+", ) return parser @@ -902,6 +991,7 @@ def main(args: Optional[list[str]] = None) -> None: if args.command in ( "test", + "eval-desc", "doc-train-input", "model-devi", "neighbor-stat", diff --git a/deepmd/pd/cxx_op.py b/deepmd/pd/cxx_op.py new file mode 100644 index 0000000000..9962bc0d24 --- /dev/null +++ b/deepmd/pd/cxx_op.py @@ -0,0 +1,35 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import importlib +from types import ( + ModuleType, +) + + +def load_library(module_name: str) -> tuple[bool, ModuleType]: + """Load OP library and return the module if success. 
+ + Parameters + ---------- + module_name : str + Name of the module + + Returns + ------- + bool + Whether the library is loaded successfully + ModuleType + loaded custom operator module + """ + if importlib.util.find_spec(module_name) is not None: + module = importlib.import_module(module_name) + return True, module + + return False, None + + +ENABLE_CUSTOMIZED_OP, paddle_ops_deepmd = load_library("deepmd_op_pd") + +__all__ = [ + "ENABLE_CUSTOMIZED_OP", + "paddle_ops_deepmd", +] diff --git a/deepmd/pd/entrypoints/main.py b/deepmd/pd/entrypoints/main.py index 8d96c4e6f2..fe092111b1 100644 --- a/deepmd/pd/entrypoints/main.py +++ b/deepmd/pd/entrypoints/main.py @@ -7,6 +7,7 @@ Path, ) from typing import ( + Any, Optional, Union, ) @@ -22,6 +23,7 @@ ) from deepmd.common import ( expand_sys_str, + j_loader, ) from deepmd.loggers.loggers import ( set_log_handles, @@ -79,15 +81,15 @@ def get_trainer( - config, - init_model=None, - restart_model=None, - finetune_model=None, - force_load=False, - init_frz_model=None, - shared_links=None, - finetune_links=None, -): + config: dict[str, Any], + init_model: Optional[str] = None, + restart_model: Optional[str] = None, + finetune_model: Optional[str] = None, + force_load: bool = False, + init_frz_model: Optional[str] = None, + shared_links: Optional[dict[str, Any]] = None, + finetune_links: Optional[dict[str, Any]] = None, +) -> training.Trainer: multi_task = "model_dict" in config.get("model", {}) # Initialize DDP @@ -97,17 +99,22 @@ def get_trainer( fleet.init(is_collective=True) def prepare_trainer_input_single( - model_params_single, data_dict_single, rank=0, seed=None - ): + model_params_single: dict[str, Any], + data_dict_single: dict[str, Any], + rank: int = 0, + seed: Optional[int] = None, + ) -> tuple[DpLoaderSet, Optional[DpLoaderSet], Optional[DPPath]]: training_dataset_params = data_dict_single["training_data"] validation_dataset_params = data_dict_single.get("validation_data", None) validation_systems = ( 
validation_dataset_params["systems"] if validation_dataset_params else None ) training_systems = training_dataset_params["systems"] - training_systems = process_systems(training_systems) + trn_patterns = training_dataset_params.get("rglob_patterns", None) + training_systems = process_systems(training_systems, patterns=trn_patterns) if validation_systems is not None: - validation_systems = process_systems(validation_systems) + val_patterns = validation_dataset_params.get("rglob_patterns", None) + validation_systems = process_systems(validation_systems, val_patterns) # stat files stat_file_path_single = data_dict_single.get("stat_file", None) @@ -235,8 +242,7 @@ def train( log.info("Configuration path: %s", input_file) if LOCAL_RANK == 0: SummaryPrinter()() - with open(input_file) as fin: - config = json.load(fin) + config = j_loader(input_file) # ensure suffix, as in the command line help, we say "path prefix of checkpoint files" if init_model is not None and not init_model.endswith(".pd"): init_model += ".pd" @@ -342,6 +348,7 @@ def freeze( model: str, output: str = "frozen_model.json", head: Optional[str] = None, + do_atomic_virial: bool = False, ) -> None: paddle.set_flags( { @@ -368,12 +375,13 @@ def freeze( model.forward = paddle.jit.to_static( model.forward, input_spec=[ - InputSpec([1, -1, 3], dtype="float64", name="coord"), # coord - InputSpec([1, -1], dtype="int64", name="atype"), # atype - InputSpec([1, 9], dtype="float64", name="box"), # box + InputSpec([-1, -1, 3], dtype="float64", name="coord"), # coord + InputSpec([-1, -1], dtype="int64", name="atype"), # atype + InputSpec([-1, 9], dtype="float64", name="box"), # box None, # fparam None, # aparam - True, # do_atomic_virial + # InputSpec([], dtype="bool", name="do_atomic_virial"), # do_atomic_virial + do_atomic_virial, # do_atomic_virial ], full_graph=True, ) @@ -388,17 +396,46 @@ def freeze( model.forward_lower = paddle.jit.to_static( model.forward_lower, input_spec=[ - InputSpec([1, -1, 3], 
dtype="float64", name="coord"), # extended_coord - InputSpec([1, -1], dtype="int32", name="atype"), # extended_atype - InputSpec([1, -1, -1], dtype="int32", name="nlist"), # nlist - InputSpec([1, -1], dtype="int64", name="mapping"), # mapping + InputSpec([-1, -1, 3], dtype="float64", name="coord"), # extended_coord + InputSpec([-1, -1], dtype="int32", name="atype"), # extended_atype + InputSpec([-1, -1, -1], dtype="int32", name="nlist"), # nlist + InputSpec([-1, -1], dtype="int64", name="mapping"), # mapping None, # fparam None, # aparam - True, # do_atomic_virial - None, # comm_dict + # InputSpec([], dtype="bool", name="do_atomic_virial"), # do_atomic_virial + do_atomic_virial, # do_atomic_virial + ( + InputSpec([-1], "int64", name="send_list"), + InputSpec([-1], "int32", name="send_proc"), + InputSpec([-1], "int32", name="recv_proc"), + InputSpec([-1], "int32", name="send_num"), + InputSpec([-1], "int32", name="recv_num"), + InputSpec([-1], "int64", name="communicator"), + # InputSpec([1], "int64", name="has_spin"), + ), # comm_dict ], full_graph=True, ) + for method_name in [ + "get_buffer_rcut", + "get_buffer_type_map", + "get_buffer_dim_fparam", + "get_buffer_dim_aparam", + "get_buffer_intensive", + "get_buffer_sel_type", + "get_buffer_numb_dos", + "get_buffer_task_dim", + ]: + if hasattr(model, method_name): + setattr( + model, + method_name, + paddle.jit.to_static( + getattr(model, method_name), + input_spec=[], + full_graph=True, + ), + ) if output.endswith(".json"): output = output[:-5] paddle.jit.save( diff --git a/deepmd/pd/infer/deep_eval.py b/deepmd/pd/infer/deep_eval.py index b0b4bab980..696531ed7f 100644 --- a/deepmd/pd/infer/deep_eval.py +++ b/deepmd/pd/infer/deep_eval.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: LGPL-3.0-or-later +import logging from typing import ( TYPE_CHECKING, Any, @@ -9,6 +10,7 @@ import numpy as np import paddle +from paddle import inference as paddle_inference from deepmd.dpmodel.common import PRECISION_DICT as 
NP_PRECISION_DICT from deepmd.dpmodel.output_def import ( @@ -16,16 +18,35 @@ OutputVariableCategory, OutputVariableDef, ) +from deepmd.infer.deep_dipole import ( + DeepDipole, +) +from deepmd.infer.deep_dos import ( + DeepDOS, +) from deepmd.infer.deep_eval import DeepEval as DeepEvalWrapper from deepmd.infer.deep_eval import ( DeepEvalBackend, ) +from deepmd.infer.deep_polar import ( + DeepGlobalPolar, + DeepPolar, +) from deepmd.infer.deep_pot import ( DeepPot, ) +from deepmd.infer.deep_property import ( + DeepProperty, +) +from deepmd.infer.deep_wfc import ( + DeepWFC, +) from deepmd.pd.model.model import ( get_model, ) +from deepmd.pd.model.network.network import ( + TypeEmbedNetConsistent, +) from deepmd.pd.train.wrapper import ( ModelWrapper, ) @@ -42,10 +63,22 @@ to_numpy_array, to_paddle_tensor, ) +from deepmd.utils.econf_embd import ( + sort_element_type, +) +from deepmd.utils.model_branch_dict import ( + get_model_dict, +) if TYPE_CHECKING: import ase.neighborlist + from deepmd.pd.model.model.model import ( + BaseModel, + ) + +log = logging.getLogger(__name__) + class DeepEval(DeepEvalBackend): """Paddle backend implementation of DeepEval. @@ -58,7 +91,7 @@ class DeepEval(DeepEvalBackend): The output definition of the model. *args : list Positional arguments. - auto_batch_size : bool or int or AutomaticBatchSize, default: False + auto_batch_size : bool or int or AutomaticBatchSize, default: True If True, automatic batch size will be used. If int, it will be used as the initial batch size. 
neighbor_list : ase.neighborlist.NewPrimitiveNeighborList, optional @@ -76,8 +109,9 @@ def __init__( auto_batch_size: Union[bool, int, AutoBatchSize] = True, neighbor_list: Optional["ase.neighborlist.NewPrimitiveNeighborList"] = None, head: Optional[Union[str, int]] = None, + no_jit: bool = False, **kwargs: Any, - ): + ) -> None: enable_prim(True) self.output_def = output_def self.model_path = model_file @@ -110,12 +144,42 @@ def __init__( # model = paddle.jit.to_static(model) self.dp = ModelWrapper(model) self.dp.set_state_dict(state_dict) + self.rcut = self.dp.model["Default"].get_rcut() + self.type_map: list[str] = self.dp.model["Default"].get_type_map() + self.dp.eval() + self.static_model = False + elif str(self.model_path).endswith(".json"): + self.dp = paddle.jit.load(self.model_path[:-5]) + self.rcut = self.dp.get_buffer_rcut().item() + self.type_map: list[str] = "".join( + [chr(x) for x in self.dp.get_buffer_type_map().numpy()] + ).split(" ") + config = paddle_inference.Config( + self.model_path, self.model_path.replace(".json", ".pdiparams") + ) + config.enable_custom_passes( + ["add_shadow_output_after_dead_parameter_pass"], True + ) + config.enable_use_gpu(4096, 0) + config.disable_glog_info() + + self.predictor = paddle_inference.create_predictor(config) + self.coord_handle = self.predictor.get_input_handle("coord") + self.atype_handle = self.predictor.get_input_handle("atype") + self.box_handle = self.predictor.get_input_handle("box") + + self.atom_energy_handle = self.predictor.get_output_handle("fetch_name_0") + self.atom_virial_handle = self.predictor.get_output_handle("fetch_name_1") + self.energy_handle = self.predictor.get_output_handle("fetch_name_2") + self.force_handle = self.predictor.get_output_handle("fetch_name_3") + self.mask_handle = self.predictor.get_output_handle("fetch_name_4") + self.virial_handle = self.predictor.get_output_handle("fetch_name_5") + self.static_model = True else: - # self.dp = 
paddle.jit.load(self.model_path.split(".json")[0]) - raise ValueError(f"Unknown model file format: {self.model_path}!") - self.dp.eval() - self.rcut = self.dp.model["Default"].get_rcut() - self.type_map = self.dp.model["Default"].get_type_map() + raise NotImplementedError( + f"Only support .pd or .json format, but got {model_file}" + ) + if isinstance(auto_batch_size, bool): if auto_batch_size: self.auto_batch_size = AutoBatchSize() @@ -127,9 +191,14 @@ def __init__( self.auto_batch_size = auto_batch_size else: raise TypeError("auto_batch_size should be bool, int, or AutoBatchSize") - self._has_spin = getattr(self.dp.model["Default"], "has_spin", False) + self._has_spin = ( + getattr(self.dp.model["Default"], "has_spin", False) + if isinstance(self.dp, ModelWrapper) + else False + ) if callable(self._has_spin): - self._has_spin = self._has_spin() + self._has_spin = False + self._has_hessian = False def get_rcut(self) -> float: """Get the cutoff radius of this model.""" @@ -145,21 +214,56 @@ def get_type_map(self) -> list[str]: def get_dim_fparam(self) -> int: """Get the number (dimension) of frame parameters of this DP.""" + if self.static_model: + return self.dp.get_buffer_dim_fparam() return self.dp.model["Default"].get_dim_fparam() def get_dim_aparam(self) -> int: """Get the number (dimension) of atomic parameters of this DP.""" + if self.static_model: + return self.dp.get_buffer_dim_aparam() return self.dp.model["Default"].get_dim_aparam() + def has_default_fparam(self) -> bool: + """Check if the model has default frame parameters.""" + try: + return self.dp.model["Default"].has_default_fparam() + except AttributeError: + # for compatibility with old models + return False + def get_intensive(self) -> bool: return self.dp.model["Default"].get_intensive() + def get_var_name(self) -> str: + """Get the name of the property.""" + if hasattr(self.dp.model["Default"], "get_var_name") and callable( + getattr(self.dp.model["Default"], "get_var_name") + ): + return 
self.dp.model["Default"].get_var_name() + else: + raise NotImplementedError + @property def model_type(self) -> type["DeepEvalWrapper"]: """The the evaluator of the model type.""" + if self.static_model: + return DeepPot model_output_type = self.dp.model["Default"].model_output_type() if "energy" in model_output_type: return DeepPot + elif "dos" in model_output_type: + return DeepDOS + elif "dipole" in model_output_type: + return DeepDipole + elif "polar" in model_output_type: + return DeepPolar + elif "global_polar" in model_output_type: + return DeepGlobalPolar + elif "wfc" in model_output_type: + return DeepWFC + elif self.get_var_name() in model_output_type: + return DeepProperty else: raise RuntimeError("Unknown model type") @@ -180,18 +284,33 @@ def get_task_dim(self) -> int: """Get the output dimension.""" return self.dp.model["Default"].get_task_dim() - def get_has_efield(self): + def get_has_efield(self) -> bool: """Check if the model has efield.""" return False - def get_ntypes_spin(self): + def get_ntypes_spin(self) -> int: """Get the number of spin atom types of this model. 
Only used in old implement.""" return 0 - def get_has_spin(self): + def get_has_spin(self) -> bool: """Check if the model has spin atom types.""" return self._has_spin + def get_has_hessian(self) -> bool: + """Check if the model has hessian.""" + return self._has_hessian + + def get_model_branch(self) -> tuple[dict[str, str], dict[str, dict[str, Any]]]: + """Get the model branch information.""" + if "model_dict" in self.model_def_script: + model_alias_dict, model_branch_dict = get_model_dict( + self.model_def_script["model_dict"] + ) + return model_alias_dict, model_branch_dict + else: + # single-task model + return {"Default": "Default"}, {"Default": {"alias": [], "info": {}}} + def eval( self, coords: np.ndarray, @@ -297,6 +416,7 @@ def _get_request_defs(self, atomic: bool) -> list[OutputVariableDef]: OutputVariableCategory.REDU, OutputVariableCategory.DERV_R, OutputVariableCategory.DERV_C_REDU, + OutputVariableCategory.DERV_R_DERV_R, ) ] @@ -319,7 +439,7 @@ def _eval_func(self, inner_func: Callable, numb_test: int, natoms: int) -> Calla """ if self.auto_batch_size is not None: - def eval_func(*args, **kwargs): + def eval_func(*args: Any, **kwargs: Any) -> Any: return self.auto_batch_size.execute_all( inner_func, numb_test, natoms, *args, **kwargs ) @@ -354,9 +474,127 @@ def _eval_model( aparam: Optional[np.ndarray], request_defs: list[OutputVariableDef], ): - model = self.dp.to(DEVICE) + if not self.static_model: + model = self.dp.to(DEVICE) prec = NP_PRECISION_DICT[RESERVED_PRECISION_DICT[GLOBAL_PD_FLOAT_PRECISION]] + nframes = coords.shape[0] + if len(atom_types.shape) == 1: + natoms = len(atom_types) + atom_types = np.tile(atom_types, nframes).reshape([nframes, -1]) + else: + natoms = len(atom_types[0]) + + if self.static_model: + self.coord_handle.copy_from_cpu( + coords.reshape([nframes, natoms, 3]).astype(prec) + ) + self.atype_handle.copy_from_cpu( + atom_types.astype( + NP_PRECISION_DICT[RESERVED_PRECISION_DICT[paddle.int64]] + ) + ) + if cells is not 
None: + self.box_handle.copy_from_cpu(cells.reshape([nframes, 3, 3])) + + if fparam is not None: + raise NotImplementedError( + "fparam_input is not supported for .json files. Please use a .pd file instead." + ) + + if aparam is not None: + raise NotImplementedError( + "aparam_input is not supported for .json files. Please use a .pd file instead." + ) + + else: + coord_input = paddle.to_tensor( + coords.reshape([nframes, natoms, 3]).astype(prec), + dtype=GLOBAL_PD_FLOAT_PRECISION, + place=DEVICE, + ) + type_input = paddle.to_tensor( + atom_types.astype( + NP_PRECISION_DICT[RESERVED_PRECISION_DICT[paddle.int64]] + ), + dtype=paddle.int64, + place=DEVICE, + ) + if cells is not None: + box_input = paddle.to_tensor( + cells.reshape([nframes, 3, 3]), + dtype=GLOBAL_PD_FLOAT_PRECISION, + place=DEVICE, + ) + else: + box_input = None + if fparam is not None: + fparam_input = to_paddle_tensor( + fparam.reshape([nframes, self.get_dim_fparam()]) + ) + else: + fparam_input = None + if aparam is not None: + aparam_input = to_paddle_tensor( + aparam.reshape([nframes, natoms, self.get_dim_aparam()]) + ) + else: + aparam_input = None + + do_atomic_virial = any( + x.category == OutputVariableCategory.DERV_C for x in request_defs + ) + if self.static_model: + self.predictor.run() + batch_output = { + "atom_energy": self.atom_energy_handle.copy_to_cpu(), + "energy": self.energy_handle.copy_to_cpu(), + "force": self.force_handle.copy_to_cpu(), + "mask": self.mask_handle.copy_to_cpu(), + "virial": self.virial_handle.copy_to_cpu(), + } + if do_atomic_virial: + batch_output["atom_virial"] = self.atom_virial_handle.copy_to_cpu() + else: + batch_output = model( + coord_input, + type_input, + box=box_input, + do_atomic_virial=do_atomic_virial, + fparam=fparam_input, + aparam=aparam_input, + ) + if isinstance(batch_output, tuple): + batch_output = batch_output[0] + + results = [] + for odef in request_defs: + pd_name = self._OUTDEF_DP2BACKEND[odef.name] + if pd_name in batch_output: + shape 
= self._get_output_shape(odef, nframes, natoms) + out = batch_output[pd_name].reshape(shape) + if not self.static_model: + out = out.numpy() + results.append(out) + else: + shape = self._get_output_shape(odef, nframes, natoms) + results.append( + np.full(np.abs(shape), np.nan, dtype=prec) + ) # this is kinda hacky + return tuple(results) + + def _eval_model_spin( + self, + coords: np.ndarray, + cells: Optional[np.ndarray], + atom_types: np.ndarray, + spins: np.ndarray, + fparam: Optional[np.ndarray], + aparam: Optional[np.ndarray], + request_defs: list[OutputVariableDef], + ) -> tuple[np.ndarray, ...]: + model = self.dp.to(DEVICE) + nframes = coords.shape[0] if len(atom_types.shape) == 1: natoms = len(atom_types) @@ -365,13 +603,14 @@ def _eval_model( natoms = len(atom_types[0]) coord_input = paddle.to_tensor( - coords.reshape([nframes, natoms, 3]).astype(prec), + coords.reshape([nframes, natoms, 3]), dtype=GLOBAL_PD_FLOAT_PRECISION, place=DEVICE, ) - type_input = paddle.to_tensor( - atom_types.astype(NP_PRECISION_DICT[RESERVED_PRECISION_DICT[paddle.int64]]), - dtype=paddle.int64, + type_input = paddle.to_tensor(atom_types, dtype=paddle.int64, place=DEVICE) + spin_input = paddle.to_tensor( + spins.reshape([nframes, natoms, 3]), + dtype=GLOBAL_PD_FLOAT_PRECISION, place=DEVICE, ) if cells is not None: @@ -394,12 +633,14 @@ def _eval_model( ) else: aparam_input = None + do_atomic_virial = any( - x.category == OutputVariableCategory.DERV_C for x in request_defs + x.category == OutputVariableCategory.DERV_C_REDU for x in request_defs ) batch_output = model( coord_input, type_input, + spin=spin_input, box=box_input, do_atomic_virial=do_atomic_virial, fparam=fparam_input, @@ -413,28 +654,24 @@ def _eval_model( pd_name = self._OUTDEF_DP2BACKEND[odef.name] if pd_name in batch_output: shape = self._get_output_shape(odef, nframes, natoms) - out = batch_output[pd_name].reshape(shape).numpy() + out = batch_output[pd_name].reshape(shape).detach().cpu().numpy() 
results.append(out) else: shape = self._get_output_shape(odef, nframes, natoms) results.append( - np.full(np.abs(shape), np.nan, dtype=prec) + np.full( + np.abs(shape), + np.nan, + dtype=NP_PRECISION_DICT[ + RESERVED_PRECISION_DICT[GLOBAL_PD_FLOAT_PRECISION] + ], + ) ) # this is kinda hacky return tuple(results) - def _eval_model_spin( - self, - coords: np.ndarray, - cells: Optional[np.ndarray], - atom_types: np.ndarray, - spins: np.ndarray, - fparam: Optional[np.ndarray], - aparam: Optional[np.ndarray], - request_defs: list[OutputVariableDef], - ): - raise NotImplementedError("_eval_model_spin is not supported yet.") - - def _get_output_shape(self, odef, nframes, natoms): + def _get_output_shape( + self, odef: OutputVariableDef, nframes: int, natoms: int + ) -> list[int]: if odef.category == OutputVariableCategory.DERV_C_REDU: # virial return [nframes, *odef.shape[:-1], 9] @@ -452,6 +689,9 @@ def _get_output_shape(self, odef, nframes, natoms): # Something wrong here? # return [nframes, *shape, natoms, 1] return [nframes, natoms, *odef.shape, 1] + elif odef.category == OutputVariableCategory.DERV_R_DERV_R: + return [nframes, 3 * natoms, 3 * natoms] + # return [nframes, *odef.shape, 3 * natoms, 3 * natoms] else: raise RuntimeError("unknown category") @@ -476,7 +716,14 @@ def eval_typeebd(self) -> np.ndarray: deepmd.pd.model.network.network.TypeEmbedNetConsistent : The type embedding network. 
""" - raise NotImplementedError("eval_typeebd is not supported yet.") + out = [] + for mm in self.dp.model["Default"].sublayers(): + if mm.__class__.__name__ == TypeEmbedNetConsistent.__name__: + out.append(mm(DEVICE)) + if not out: + raise KeyError("The model has no type embedding networks.") + typeebd = paddle.concat(out, axis=1) + return to_numpy_array(typeebd) def get_model_def_script(self) -> str: """Get model definition script.""" @@ -506,6 +753,32 @@ def get_model_size(self) -> dict: "total": sum_param_des + sum_param_fit, } + def get_observed_types(self) -> dict: + """Get observed types (elements) of the model during data statistics. + + Returns + ------- + dict + A dictionary containing the information of observed type in the model: + - 'type_num': the total number of observed types in this model. + - 'observed_type': a list of the observed types in this model. + """ + observed_type_list = self.dp.model["Default"].get_observed_type_list() + return { + "type_num": len(observed_type_list), + "observed_type": sort_element_type(observed_type_list), + } + + def get_model(self) -> "BaseModel": + """Get the Paddle model. + + Returns + ------- + BaseModel + The Paddle model instance. + """ + return self.dp.model["Default"] + def eval_descriptor( self, coords: np.ndarray, @@ -546,7 +819,9 @@ def eval_descriptor( descriptor Descriptors. """ - model = self.dp.model["Default"] + model = ( + self.dp.model["Default"] if isinstance(self.dp, ModelWrapper) else self.dp + ) model.set_eval_descriptor_hook(True) self.eval( coords, @@ -560,3 +835,58 @@ def eval_descriptor( descriptor = model.eval_descriptor() model.set_eval_descriptor_hook(False) return to_numpy_array(descriptor) + + def eval_fitting_last_layer( + self, + coords: np.ndarray, + cells: Optional[np.ndarray], + atom_types: np.ndarray, + fparam: Optional[np.ndarray] = None, + aparam: Optional[np.ndarray] = None, + **kwargs: Any, + ) -> np.ndarray: + """Evaluate fitting before last layer by using this DP. 
+ + Parameters + ---------- + coords + The coordinates of atoms. + The array should be of size nframes x natoms x 3 + cells + The cell of the region. + If None then non-PBC is assumed, otherwise using PBC. + The array should be of size nframes x 9 + atom_types + The atom types + The list should contain natoms ints + fparam + The frame parameter. + The array can be of size : + - nframes x dim_fparam. + - dim_fparam. Then all frames are assumed to be provided with the same fparam. + aparam + The atomic parameter + The array can be of size : + - nframes x natoms x dim_aparam. + - natoms x dim_aparam. Then all frames are assumed to be provided with the same aparam. + - dim_aparam. Then all frames and atoms are provided with the same aparam. + + Returns + ------- + fitting + Fitting output before last layer. + """ + model = self.dp.model["Default"] + model.set_eval_fitting_last_layer_hook(True) + self.eval( + coords, + cells, + atom_types, + atomic=False, + fparam=fparam, + aparam=aparam, + **kwargs, + ) + fitting_net = model.eval_fitting_last_layer() + model.set_eval_fitting_last_layer_hook(False) + return to_numpy_array(fitting_net) diff --git a/deepmd/pd/model/atomic_model/base_atomic_model.py b/deepmd/pd/model/atomic_model/base_atomic_model.py index 1100813fb4..4f40117fb7 100644 --- a/deepmd/pd/model/atomic_model/base_atomic_model.py +++ b/deepmd/pd/model/atomic_model/base_atomic_model.py @@ -1,9 +1,9 @@ # SPDX-License-Identifier: LGPL-3.0-or-later -import copy import logging from typing import ( Callable, + NoReturn, Optional, Union, ) @@ -64,9 +64,9 @@ class BaseAtomicModel(paddle.nn.Layer, BaseAtomicModel_): of the atomic model. Implemented by removing the pairs from the nlist. rcond : float, optional The condition number for the regression of atomic energy. - preset_out_bias : Dict[str, list[Optional[paddle.Tensor]]], optional + preset_out_bias : dict[str, list[Optional[np.ndarray]]], optional Specifying atomic energy contribution in vacuum. 
Given by key:value pairs. - The value is a list specifying the bias. the elements can be None or np.array of output shape. + The value is a list specifying the bias. the elements can be None or np.ndarray of output shape. For example: [None, [2.]] means type 0 is not set, type 1 is set to [2.] The `set_davg_zero` key in the descriptor should be set. @@ -79,18 +79,28 @@ def __init__( pair_exclude_types: list[tuple[int, int]] = [], rcond: Optional[float] = None, preset_out_bias: Optional[dict[str, np.ndarray]] = None, - ): + data_stat_protect: float = 1e-2, + ) -> None: paddle.nn.Layer.__init__(self) BaseAtomicModel_.__init__(self) self.type_map = type_map + if type_map is not None: + self.register_buffer( + "buffer_type_map", + paddle.to_tensor([ord(c) for c in " ".join(type_map)]), + ) + self.ntypes = len(self.type_map) + self.register_buffer( + "buffer_ntypes", paddle.to_tensor(self.ntypes, dtype="int64") + ) self.reinit_atom_exclude(atom_exclude_types) self.reinit_pair_exclude(pair_exclude_types) self.rcond = rcond self.preset_out_bias = preset_out_bias + self.data_stat_protect = data_stat_protect - def init_out_stat(self): + def init_out_stat(self) -> None: """Initialize the output bias.""" - ntypes = self.get_ntypes() self.bias_keys: list[str] = list(self.fitting_output_def().keys()) self.max_out_size = max( [self.atomic_output_def()[kk].size for kk in self.bias_keys] @@ -104,7 +114,7 @@ def init_out_stat(self): def set_out_bias(self, out_bias: paddle.Tensor) -> None: self.out_bias = out_bias - def __setitem__(self, key, value): + def __setitem__(self, key: str, value: paddle.Tensor) -> None: if key in ["out_bias"]: self.out_bias = value elif key in ["out_std"]: @@ -112,7 +122,7 @@ def __setitem__(self, key, value): else: raise KeyError(key) - def __getitem__(self, key): + def __getitem__(self, key: str) -> paddle.Tensor: if key in ["out_bias"]: return self.out_bias elif key in ["out_std"]: @@ -124,10 +134,36 @@ def get_type_map(self) -> list[str]: """Get the 
type map.""" return self.type_map + def get_buffer_type_map(self) -> paddle.Tensor: + """ + Return the type map as a buffer-style Tensor for JIT saving. + + The original type map (e.g., ['Ni', 'O']) is first joined into a single space-separated string + (e.g., "Ni O"). Each character in this string is then converted to its ASCII code using `ord()`, + and the resulting integer sequence is stored as a 1D paddle.Tensor of dtype int. + + This format allows the type map to be serialized as a raw byte buffer during JIT model saving. + """ + return self.buffer_type_map + + def get_compute_stats_distinguish_types(self) -> bool: + """Get whether the fitting net computes stats which are not distinguished between different types of atoms.""" + return True + + def get_intensive(self) -> bool: + """Whether the fitting property is intensive.""" + return False + + def has_default_fparam(self) -> bool: + """Check if the model has default frame parameters.""" + return False + def reinit_atom_exclude( self, - exclude_types: list[int] = [], - ): + exclude_types: Optional[list[int]] = None, + ) -> None: + if exclude_types is None: + exclude_types = [] self.atom_exclude_types = exclude_types if exclude_types == []: self.atom_excl = None @@ -137,7 +173,7 @@ def reinit_atom_exclude( def reinit_pair_exclude( self, exclude_types: list[tuple[int, int]] = [], - ): + ) -> None: self.pair_exclude_types = exclude_types if exclude_types == []: self.pair_excl = None @@ -191,7 +227,7 @@ def forward_common_atomic( mapping: Optional[paddle.Tensor] = None, fparam: Optional[paddle.Tensor] = None, aparam: Optional[paddle.Tensor] = None, - comm_dict: Optional[dict[str, paddle.Tensor]] = None, + comm_dict: Optional[list[paddle.Tensor]] = None, ) -> dict[str, paddle.Tensor]: """Common interface for atomic inference. 
@@ -232,7 +268,7 @@ def forward_common_atomic( if self.pair_excl is not None: pair_mask = self.pair_excl(nlist, extended_atype) # exclude neighbors in the nlist - nlist = paddle.where(pair_mask == 1, nlist, -1) + nlist = paddle.where(pair_mask == 1, nlist, paddle.full_like(nlist, -1)) ext_atom_mask = self.make_atom_mask(extended_atype) ret_dict = self.forward_atomic( @@ -247,7 +283,6 @@ def forward_common_atomic( comm_dict=comm_dict, ) ret_dict = self.apply_out_stat(ret_dict, atype) - # nf x nloc atom_mask = ext_atom_mask[:, :nloc].astype(paddle.int32) if self.atom_excl is not None: @@ -260,10 +295,10 @@ def forward_common_atomic( out_shape2 *= ss ret_dict[kk] = ( ret_dict[kk].reshape([out_shape[0], out_shape[1], out_shape2]) - * atom_mask.unsqueeze(2).astype(ret_dict[kk].dtype) + * atom_mask[:, :, None].astype(ret_dict[kk].dtype) ).reshape(out_shape) ret_dict["mask"] = atom_mask - + # raise return ret_dict def forward( @@ -274,7 +309,7 @@ def forward( mapping: Optional[paddle.Tensor] = None, fparam: Optional[paddle.Tensor] = None, aparam: Optional[paddle.Tensor] = None, - comm_dict: Optional[dict[str, paddle.Tensor]] = None, + comm_dict: Optional[list[paddle.Tensor]] = None, ) -> dict[str, paddle.Tensor]: return self.forward_common_atomic( extended_coord, @@ -287,7 +322,9 @@ def forward( ) def change_type_map( - self, type_map: list[str], model_with_new_type_stat=None + self, + type_map: list[str], + model_with_new_type_stat: Optional["BaseAtomicModel"] = None, ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. 
@@ -332,7 +369,7 @@ def serialize(self) -> dict: @classmethod def deserialize(cls, data: dict) -> "BaseAtomicModel": - data = copy.deepcopy(data) + data = data.copy() variables = data.pop("@variables", None) variables = ( {"out_bias": None, "out_std": None} if variables is None else variables @@ -354,21 +391,25 @@ def compute_or_load_stat( self, merged: Union[Callable[[], list[dict]], list[dict]], stat_file_path: Optional[DPPath] = None, - ): + compute_or_load_out_stat: bool = True, + ) -> NoReturn: """ - Compute the output statistics (e.g. energy bias) for the fitting net from packed data. + Compute or load the statistics parameters of the model, + such as mean and standard deviation of descriptors or the energy bias of the fitting net. + When `sampled` is provided, all the statistics parameters will be calculated (or re-calculated for update), + and saved in the `stat_file_path`(s). + When `sampled` is not provided, it will check the existence of `stat_file_path`(s) + and load the calculated statistics parameters. Parameters ---------- - merged : Union[Callable[[], list[dict]], list[dict]] - - list[dict]: A list of data samples from various data systems. - Each element, `merged[i]`, is a data dictionary containing `keys`: `paddle.Tensor` - originating from the `i`-th data system. - - Callable[[], list[dict]]: A lazy function that returns data samples in the above format - only when needed. Since the sampling process can be slow and memory-intensive, - the lazy function helps by only sampling once. - stat_file_path : Optional[DPPath] - The path to the stat file. + merged + The lazy sampled function to get data frames from different data systems. + stat_file_path + The dictionary of paths to the statistics files. + compute_or_load_out_stat : bool + Whether to compute the output statistics. + If False, it will only compute the input statistics (e.g. mean and standard deviation of descriptors). 
""" raise NotImplementedError @@ -377,7 +418,7 @@ def compute_or_load_out_stat( self, merged: Union[Callable[[], list[dict]], list[dict]], stat_file_path: Optional[DPPath] = None, - ): + ) -> None: """ Compute the output statistics (e.g. energy bias) for the fitting net from packed data. @@ -404,7 +445,7 @@ def apply_out_stat( self, ret: dict[str, paddle.Tensor], atype: paddle.Tensor, - ): + ) -> dict[str, paddle.Tensor]: """Apply the stat to each atomic output. The developer may override the method to define how the bias is applied to the atomic output of the model. @@ -425,9 +466,9 @@ def apply_out_stat( def change_out_bias( self, - sample_merged, + sample_merged: Union[Callable[[], list[dict]], list[dict]], stat_file_path: Optional[DPPath] = None, - bias_adjust_mode="change-by-statistic", + bias_adjust_mode: str = "change-by-statistic", ) -> None: """Change the output bias according to the input data and the pretrained model. @@ -457,7 +498,6 @@ def change_out_bias( model_forward=self._get_forward_wrapper_func(), rcond=self.rcond, preset_bias=self.preset_out_bias, - atomic_output=self.atomic_output_def(), ) self._store_out_stat(delta_bias, out_std, add=True) elif bias_adjust_mode == "set-by-statistic": @@ -468,7 +508,8 @@ def change_out_bias( stat_file_path=stat_file_path, rcond=self.rcond, preset_bias=self.preset_out_bias, - atomic_output=self.atomic_output_def(), + stats_distinguish_types=self.get_compute_stats_distinguish_types(), + intensive=self.get_intensive(), ) self._store_out_stat(bias_out, std_out) else: @@ -477,7 +518,13 @@ def change_out_bias( def _get_forward_wrapper_func(self) -> Callable[..., paddle.Tensor]: """Get a forward wrapper of the atomic model for output bias calculation.""" - def model_forward(coord, atype, box, fparam=None, aparam=None): + def model_forward( + coord: paddle.Tensor, + atype: paddle.Tensor, + box: Optional[paddle.Tensor], + fparam: Optional[paddle.Tensor] = None, + aparam: Optional[paddle.Tensor] = None, + ) -> dict[str, 
paddle.Tensor]: with ( paddle.no_grad() ): # it's essential for pure paddle forward function to use auto_batchsize @@ -506,7 +553,7 @@ def model_forward(coord, atype, box, fparam=None, aparam=None): return model_forward - def _default_bias(self): + def _default_bias(self) -> paddle.Tensor: ntypes = self.get_ntypes() return paddle.zeros([self.n_out, ntypes, self.max_out_size], dtype=dtype).to( device=device @@ -544,7 +591,7 @@ def _store_out_stat( out_bias: dict[str, paddle.Tensor], out_std: dict[str, paddle.Tensor], add: bool = False, - ): + ) -> None: ntypes = self.get_ntypes() out_bias_data = paddle.clone(self.out_bias) out_std_data = paddle.clone(self.out_std) @@ -560,6 +607,12 @@ def _store_out_stat( paddle.assign(out_bias_data, self.out_bias) paddle.assign(out_std_data, self.out_std) + def get_ntypes(self): + return len(self.type_map) + + def get_buffer_ntypes(self) -> paddle.Tensor: + return self.buffer_ntypes + def _fetch_out_stat( self, keys: list[str], diff --git a/deepmd/pd/model/atomic_model/dp_atomic_model.py b/deepmd/pd/model/atomic_model/dp_atomic_model.py index 1089b93a68..816245c28a 100644 --- a/deepmd/pd/model/atomic_model/dp_atomic_model.py +++ b/deepmd/pd/model/atomic_model/dp_atomic_model.py @@ -2,7 +2,10 @@ import functools import logging from typing import ( + Any, + Callable, Optional, + Union, ) import paddle @@ -47,10 +50,10 @@ class DPAtomicModel(BaseAtomicModel): def __init__( self, - descriptor, - fitting, + descriptor: BaseDescriptor, + fitting: BaseFitting, type_map: list[str], - **kwargs, + **kwargs: Any, ) -> None: super().__init__(type_map, **kwargs) ntypes = len(type_map) @@ -62,17 +65,22 @@ def __init__( self.fitting_net = fitting super().init_out_stat() self.enable_eval_descriptor_hook = False + self.enable_eval_fitting_last_layer_hook = False self.eval_descriptor_list = [] + self.eval_fitting_last_layer_list = [] # register 'type_map' as buffer - def _string_to_array(s: str) -> list[int]: + def _string_to_array(s: Union[str, 
list[str]]) -> list[int]: return [ord(c) for c in s] - self.register_buffer( - "buffer_type_map", - paddle.to_tensor(_string_to_array(" ".join(self.type_map)), dtype="int32"), - ) - self.buffer_type_map.name = "buffer_type_map" + if type_map is not None: + self.register_buffer( + "buffer_type_map", + paddle.to_tensor( + _string_to_array(" ".join(self.type_map)), dtype="int32" + ), + ) + self.buffer_type_map.name = "buffer_type_map" if hasattr(self.descriptor, "has_message_passing"): # register 'has_message_passing' as buffer(cast to int32 as problems may meets with vector) self.register_buffer( @@ -112,16 +120,29 @@ def _string_to_array(s: str) -> list[int]: self.buffer_aparam_nall.name = "buffer_aparam_nall" eval_descriptor_list: list[paddle.Tensor] + eval_fitting_last_layer_list: list[paddle.Tensor] def set_eval_descriptor_hook(self, enable: bool) -> None: """Set the hook for evaluating descriptor and clear the cache for descriptor list.""" self.enable_eval_descriptor_hook = enable - self.eval_descriptor_list = [] + # = [] does not work; See #4533 + self.eval_descriptor_list.clear() def eval_descriptor(self) -> paddle.Tensor: """Evaluate the descriptor.""" return paddle.concat(self.eval_descriptor_list) + def set_eval_fitting_last_layer_hook(self, enable: bool) -> None: + """Set the hook for evaluating fitting last layer output and clear the cache for fitting last layer output list.""" + self.enable_eval_fitting_last_layer_hook = enable + self.fitting_net.set_return_middle_output(enable) + # = [] does not work; See #4533 + self.eval_fitting_last_layer_list.clear() + + def eval_fitting_last_layer(self) -> paddle.Tensor: + """Evaluate the fitting last layer output.""" + return paddle.concat(self.eval_fitting_last_layer_list) + def fitting_output_def(self) -> FittingOutputDef: """Get the output def of the fitting net.""" return ( @@ -138,7 +159,27 @@ def get_sel(self) -> list[int]: """Get the neighbor selection.""" return self.sel - def set_case_embd(self, case_idx: 
int): + def get_buffer_type_map(self) -> paddle.Tensor: + """ + Return the type map as a buffer-style Tensor for JIT saving. + + The original type map (e.g., ['Ni', 'O']) is first joined into a single space-separated string + (e.g., "Ni O"). Each character in this string is then converted to its ASCII code using `ord()`, + and the resulting integer sequence is stored as a 1D paddle.Tensor of dtype int. + + This format allows the type map to be serialized as a raw byte buffer during JIT model saving. + """ + return self.buffer_type_map + + def get_buffer_rcut(self) -> paddle.Tensor: + """Get the cut-off radius as a buffer-style Tensor.""" + return self.descriptor.get_buffer_rcut() + + def get_buffer_sel(self) -> paddle.Tensor: + """Get the neighbor selection as a buffer-style Tensor.""" + return self.descriptor.get_buffer_sel() + + def set_case_embd(self, case_idx: int) -> None: """ Set the case embedding of this atomic model by the given case_idx, typically concatenated with the output of the descriptor and fed into the fitting net. @@ -158,7 +199,9 @@ def mixed_types(self) -> bool: return self.descriptor.mixed_types() def change_type_map( - self, type_map: list[str], model_with_new_type_stat=None + self, + type_map: list[str], + model_with_new_type_stat: Optional["DPAtomicModel"] = None, ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. 
@@ -199,7 +242,7 @@ def serialize(self) -> dict: return dd @classmethod - def deserialize(cls, data) -> "DPAtomicModel": + def deserialize(cls, data: dict) -> "DPAtomicModel": data = data.copy() check_version_compatibility(data.pop("@version", 1), 2, 1) data.pop("@class", None) @@ -244,9 +287,9 @@ def enable_compression( def forward_atomic( self, - extended_coord, - extended_atype, - nlist, + extended_coord: paddle.Tensor, + extended_atype: paddle.Tensor, + nlist: paddle.Tensor, mapping: Optional[paddle.Tensor] = None, fparam: Optional[paddle.Tensor] = None, aparam: Optional[paddle.Tensor] = None, @@ -288,7 +331,7 @@ def forward_atomic( ) assert descriptor is not None if self.enable_eval_descriptor_hook: - self.eval_descriptor_list.append(descriptor) + self.eval_descriptor_list.append(descriptor.detach()) # energy, force fit_ret = self.fitting_net( descriptor, @@ -299,6 +342,13 @@ def forward_atomic( fparam=fparam, aparam=aparam, ) + if self.enable_eval_fitting_last_layer_hook: + assert "middle_output" in fit_ret, ( + "eval_fitting_last_layer not supported for this fitting net!" + ) + self.eval_fitting_last_layer_list.append( + fit_ret.pop("middle_output").detach() + ) return fit_ret def get_out_bias(self) -> paddle.Tensor: @@ -306,8 +356,9 @@ def get_out_bias(self) -> paddle.Tensor: def compute_or_load_stat( self, - sampled_func, + sampled_func: Callable[[], list[dict]], stat_file_path: Optional[DPPath] = None, + compute_or_load_out_stat: bool = True, ) -> None: """ Compute or load the statistics parameters of the model, @@ -323,6 +374,9 @@ def compute_or_load_stat( The lazy sampled function to get data frames from different data systems. stat_file_path The dictionary of paths to the statistics files. + compute_or_load_out_stat : bool + Whether to compute the output statistics. + If False, it will only compute the input statistics (e.g. mean and standard deviation of descriptors). 
""" if stat_file_path is not None and self.type_map is not None: # descriptors and fitting net with different type_map @@ -343,16 +397,32 @@ def wrapped_sampler(): return sampled self.descriptor.compute_input_stats(wrapped_sampler, stat_file_path) - self.compute_or_load_out_stat(wrapped_sampler, stat_file_path) + self.fitting_net.compute_input_stats( + wrapped_sampler, protection=self.data_stat_protect + ) + if compute_or_load_out_stat: + self.compute_or_load_out_stat(wrapped_sampler, stat_file_path) def get_dim_fparam(self) -> int: """Get the number (dimension) of frame parameters of this atomic model.""" return self.fitting_net.get_dim_fparam() + def get_buffer_dim_fparam(self) -> paddle.Tensor: + """Get the number (dimension) of frame parameters of this atomic model as a buffer-style Tensor.""" + return self.fitting_net.get_buffer_dim_fparam() + + def has_default_fparam(self) -> bool: + """Check if the model has default frame parameters.""" + return self.fitting_net.has_default_fparam() + def get_dim_aparam(self) -> int: """Get the number (dimension) of atomic parameters of this atomic model.""" return self.fitting_net.get_dim_aparam() + def get_buffer_dim_aparam(self) -> paddle.Tensor: + """Get the number (dimension) of atomic parameters of this atomic model as a buffer-style Tensor.""" + return self.fitting_net.get_buffer_dim_aparam() + def get_sel_type(self) -> list[int]: """Get the selected atom types of this model. 
diff --git a/deepmd/pd/model/descriptor/dpa1.py b/deepmd/pd/model/descriptor/dpa1.py index 6942b096c9..1722140316 100644 --- a/deepmd/pd/model/descriptor/dpa1.py +++ b/deepmd/pd/model/descriptor/dpa1.py @@ -292,10 +292,16 @@ def __init__( trainable_ln=trainable_ln, ln_eps=ln_eps, seed=child_seed(seed, 1), + trainable=trainable, ) self.use_econf_tebd = use_econf_tebd self.use_tebd_bias = use_tebd_bias self.type_map = type_map + if type_map is not None: + self.register_buffer( + "buffer_type_map", + paddle.to_tensor([ord(c) for c in " ".join(type_map)]), + ) self.compress = False self.type_embedding = TypeEmbedNet( ntypes, @@ -305,6 +311,7 @@ def __init__( use_econf_tebd=use_econf_tebd, use_tebd_bias=use_tebd_bias, type_map=type_map, + trainable=trainable, ) self.prec = PRECISION_DICT[precision] self.tebd_dim = tebd_dim @@ -318,10 +325,18 @@ def get_rcut(self) -> float: """Returns the cut-off radius.""" return self.se_atten.get_rcut() + def get_buffer_rcut(self) -> paddle.Tensor: + """Returns the cut-off radius as a buffer-style Tensor.""" + return self.se_atten.get_buffer_rcut() + def get_rcut_smth(self) -> float: """Returns the radius where the neighbor information starts to smoothly decay to 0.""" return self.se_atten.get_rcut_smth() + def get_buffer_rcut_smth(self) -> paddle.Tensor: + """Returns the radius where the neighbor information starts to smoothly decay to 0 as a buffer-style Tensor.""" + return self.se_atten.get_buffer_rcut_smth() + def get_nsel(self) -> int: """Returns the number of selected atoms in the cut-off radius.""" return self.se_atten.get_nsel() @@ -338,6 +353,18 @@ def get_type_map(self) -> list[str]: """Get the name to each type of atoms.""" return self.type_map + def get_buffer_type_map(self) -> paddle.Tensor: + """ + Return the type map as a buffer-style Tensor for JIT saving. + + The original type map (e.g., ['Ni', 'O']) is first joined into a single space-separated string + (e.g., "Ni O"). 
Each character in this string is then converted to its ASCII code using `ord()`, + and the resulting integer sequence is stored as a 1D paddle.Tensor of dtype int. + + This format allows the type map to be serialized as a raw byte buffer during JIT model saving. + """ + return self.buffer_type_map + def get_dim_out(self) -> int: """Returns the output dimension.""" ret = self.se_atten.get_dim_out() @@ -594,7 +621,7 @@ def forward( extended_atype: paddle.Tensor, nlist: paddle.Tensor, mapping: Optional[paddle.Tensor] = None, - comm_dict: Optional[dict[str, paddle.Tensor]] = None, + comm_dict: Optional[list[paddle.Tensor]] = None, ): """Compute the descriptor. diff --git a/deepmd/pd/model/descriptor/dpa2.py b/deepmd/pd/model/descriptor/dpa2.py index 0e3b24397f..c8eb4ca117 100644 --- a/deepmd/pd/model/descriptor/dpa2.py +++ b/deepmd/pd/model/descriptor/dpa2.py @@ -184,6 +184,7 @@ def init_subclass_params(sub_data, sub_class): smooth=smooth, type_one_side=self.repinit_args.type_one_side, seed=child_seed(seed, 0), + trainable=trainable, ) self.use_three_body = self.repinit_args.use_three_body if self.use_three_body: @@ -203,6 +204,7 @@ def init_subclass_params(sub_data, sub_class): resnet_dt=self.repinit_args.resnet_dt, smooth=smooth, seed=child_seed(seed, 5), + trainable=trainable, ) else: self.repinit_three_body = None @@ -243,6 +245,7 @@ def init_subclass_params(sub_data, sub_class): g1_out_conv=self.repformer_args.g1_out_conv, g1_out_mlp=self.repformer_args.g1_out_mlp, seed=child_seed(seed, 1), + trainable=trainable, ) self.rcsl_list = [ (self.repformers.get_rcut(), self.repformers.get_nsel()), @@ -262,6 +265,11 @@ def init_subclass_params(sub_data, sub_class): self.use_econf_tebd = use_econf_tebd self.use_tebd_bias = use_tebd_bias self.type_map = type_map + if type_map is not None: + self.register_buffer( + "buffer_type_map", + paddle.to_tensor([ord(c) for c in " ".join(type_map)]), + ) self.type_embedding = TypeEmbedNet( ntypes, self.repinit_args.tebd_dim, @@ -270,6 
+278,7 @@ def init_subclass_params(sub_data, sub_class): use_econf_tebd=self.use_econf_tebd, use_tebd_bias=use_tebd_bias, type_map=type_map, + trainable=trainable, ) self.concat_output_tebd = concat_output_tebd self.precision = precision @@ -295,6 +304,7 @@ def init_subclass_params(sub_data, sub_class): precision=precision, init="glorot", seed=child_seed(seed, 3), + trainable=trainable, ) self.tebd_transform = None if self.add_tebd_to_repinit_out: @@ -304,6 +314,7 @@ def init_subclass_params(sub_data, sub_class): bias=False, precision=precision, seed=child_seed(seed, 4), + trainable=trainable, ) assert self.repinit.rcut > self.repformers.rcut assert self.repinit.sel[0] > self.repformers.sel[0] @@ -312,6 +323,9 @@ def init_subclass_params(sub_data, sub_class): self.rcut = self.repinit.get_rcut() self.rcut_smth = self.repinit.get_rcut_smth() self.ntypes = ntypes + self.register_buffer( + "buffer_ntypes", paddle.to_tensor(self.ntypes, dtype="int64") + ) self.sel = self.repinit.sel # set trainable for param in self.parameters(): @@ -326,6 +340,14 @@ def get_rcut_smth(self) -> float: """Returns the radius where the neighbor information starts to smoothly decay to 0.""" return self.rcut_smth + def get_buffer_rcut(self) -> paddle.Tensor: + """Returns the cut-off radius.""" + return self.repinit.get_buffer_rcut() + + def get_buffer_rcut_smth(self) -> paddle.Tensor: + """Returns the radius where the neighbor information starts to smoothly decay to 0 as a buffer-style Tensor.""" + return self.repinit.get_buffer_rcut_smth() + def get_nsel(self) -> int: """Returns the number of selected atoms in the cut-off radius.""" return sum(self.sel) @@ -336,7 +358,7 @@ def get_sel(self) -> list[int]: def get_ntypes(self) -> int: """Returns the number of element types.""" - return self.ntypes + return self.ntypes if paddle.in_dynamic_mode() else self.buffer_ntypes def get_type_map(self) -> list[str]: """Get the name to each type of atoms.""" @@ -706,7 +728,7 @@ def forward( 
extended_atype: paddle.Tensor, nlist: paddle.Tensor, mapping: Optional[paddle.Tensor] = None, - comm_dict: Optional[dict[str, paddle.Tensor]] = None, + comm_dict: Optional[list[paddle.Tensor]] = None, ): """Compute the descriptor. @@ -741,7 +763,7 @@ def forward( """ # cast the input to internal precsion - extended_coord = extended_coord.to(dtype=self.prec) + extended_coord = extended_coord.astype(dtype=self.prec) use_three_body = self.use_three_body nframes, nloc, nnei = nlist.shape @@ -762,7 +784,7 @@ def forward( type_embedding = None g1, _, _, _, _ = self.repinit( nlist_dict[ - get_multiple_nlist_key(self.repinit.get_rcut(), self.repinit.get_nsel()) + get_multiple_nlist_key(self.repinit.rcut, sum(self.repinit.sel)) ], extended_coord, extended_atype, @@ -792,14 +814,15 @@ def forward( assert self.tebd_transform is not None g1 = g1 + self.tebd_transform(g1_inp) # mapping g1 - if comm_dict is None: - assert mapping is not None + if comm_dict is None or len(comm_dict) == 0: + if paddle.in_dynamic_mode(): + assert mapping is not None mapping_ext = ( mapping.reshape([nframes, nall]) .unsqueeze(-1) .expand([-1, -1, g1.shape[-1]]) ) - g1_ext = paddle.take_along_axis(g1, mapping_ext, 1) + g1_ext = paddle.take_along_axis(g1, mapping_ext, 1, broadcast=False) g1 = g1_ext # repformer g1, g2, h2, rot_mat, sw = self.repformers( @@ -817,11 +840,11 @@ def forward( if self.concat_output_tebd: g1 = paddle.concat([g1, g1_inp], axis=-1) return ( - g1.to(dtype=env.GLOBAL_PD_FLOAT_PRECISION), - rot_mat.to(dtype=env.GLOBAL_PD_FLOAT_PRECISION), - g2.to(dtype=env.GLOBAL_PD_FLOAT_PRECISION), - h2.to(dtype=env.GLOBAL_PD_FLOAT_PRECISION), - sw.to(dtype=env.GLOBAL_PD_FLOAT_PRECISION), + g1.astype(dtype=env.GLOBAL_PD_FLOAT_PRECISION), + rot_mat.astype(dtype=env.GLOBAL_PD_FLOAT_PRECISION), + g2.astype(dtype=env.GLOBAL_PD_FLOAT_PRECISION), + h2.astype(dtype=env.GLOBAL_PD_FLOAT_PRECISION), + sw.astype(dtype=env.GLOBAL_PD_FLOAT_PRECISION), ) @classmethod diff --git 
a/deepmd/pd/model/descriptor/dpa3.py b/deepmd/pd/model/descriptor/dpa3.py index 99fd78c62f..80c5b1c000 100644 --- a/deepmd/pd/model/descriptor/dpa3.py +++ b/deepmd/pd/model/descriptor/dpa3.py @@ -91,7 +91,7 @@ class DescrptDPA3(BaseDescriptor, paddle.nn.Layer): Whether to use bias in the type embedding layer. use_loc_mapping : bool, Optional Whether to use local atom index mapping in training or non-parallel inference. - Not supported yet in Paddle. + When True, local indexing and mapping are applied to neighbor lists and embeddings during descriptor computation. type_map : list[str], Optional A list of strings. Give the name to each type of atoms. @@ -117,7 +117,7 @@ def __init__( seed: Optional[Union[int, list[int]]] = None, use_econf_tebd: bool = False, use_tebd_bias: bool = False, - use_loc_mapping: bool = False, + use_loc_mapping: bool = True, type_map: Optional[list[str]] = None, ) -> None: super().__init__() @@ -160,6 +160,8 @@ def init_subclass_params(sub_data, sub_class): fix_stat_std=self.repflow_args.fix_stat_std, optim_update=self.repflow_args.optim_update, smooth_edge_update=self.repflow_args.smooth_edge_update, + edge_init_use_dist=self.repflow_args.edge_init_use_dist, + use_exp_switch=self.repflow_args.use_exp_switch, use_dynamic_sel=self.repflow_args.use_dynamic_sel, sel_reduce_factor=self.repflow_args.sel_reduce_factor, use_loc_mapping=use_loc_mapping, @@ -167,12 +169,18 @@ def init_subclass_params(sub_data, sub_class): env_protection=env_protection, precision=precision, seed=child_seed(seed, 1), + trainable=trainable, ) self.use_econf_tebd = use_econf_tebd - self.use_tebd_bias = use_tebd_bias self.use_loc_mapping = use_loc_mapping + self.use_tebd_bias = use_tebd_bias self.type_map = type_map + if type_map is not None: + self.register_buffer( + "buffer_type_map", + paddle.to_tensor([ord(c) for c in " ".join(self.type_map)]), + ) self.tebd_dim = self.repflow_args.n_dim self.type_embedding = TypeEmbedNet( ntypes, @@ -182,6 +190,7 @@ def 
init_subclass_params(sub_data, sub_class): use_econf_tebd=self.use_econf_tebd, use_tebd_bias=use_tebd_bias, type_map=type_map, + trainable=trainable, ) self.concat_output_tebd = concat_output_tebd self.precision = precision @@ -203,6 +212,9 @@ def init_subclass_params(sub_data, sub_class): self.rcut_smth = self.repflows.get_rcut_smth() self.sel = self.repflows.get_sel() self.ntypes = ntypes + self.register_buffer( + "buffer_ntypes", paddle.to_tensor(self.ntypes, dtype="int64") + ) # set trainable for param in self.parameters(): @@ -217,6 +229,14 @@ def get_rcut_smth(self) -> float: """Returns the radius where the neighbor information starts to smoothly decay to 0.""" return self.rcut_smth + def get_buffer_rcut(self) -> paddle.Tensor: + """Returns the cut-off radius as a buffer-style Tensor.""" + return self.repflows.get_buffer_rcut() + + def get_buffer_rcut_smth(self) -> paddle.Tensor: + """Returns the radius where the neighbor information starts to smoothly decay to 0 as a buffer-style Tensor.""" + return self.repflows.get_buffer_rcut_smth() + def get_nsel(self) -> int: """Returns the number of selected atoms in the cut-off radius.""" return sum(self.sel) @@ -225,14 +245,30 @@ def get_sel(self) -> list[int]: """Returns the number of selected atoms for each type.""" return self.sel + def get_buffer_sel(self) -> paddle.Tensor: + """Returns the number of selected atoms for each type as a buffer-style Tensor.""" + return self.repflows.get_sel() + def get_ntypes(self) -> int: """Returns the number of element types.""" - return self.ntypes + return self.ntypes if paddle.in_dynamic_mode() else self.buffer_ntypes def get_type_map(self) -> list[str]: """Get the name to each type of atoms.""" return self.type_map + def get_buffer_type_map(self) -> paddle.Tensor: + """ + Return the type map as a buffer-style Tensor for JIT saving. + + The original type map (e.g., ['Ni', 'O']) is first joined into a single space-separated string + (e.g., "Ni O"). 
Each character in this string is then converted to its ASCII code using `ord()`, + and the resulting integer sequence is stored as a 1D paddle.Tensor of dtype int. + + This format allows the type map to be serialized as a raw byte buffer during JIT model saving. + """ + return self.buffer_type_map + def get_dim_out(self) -> int: """Returns the output dimension of this descriptor.""" ret = self.repflows.dim_out @@ -453,7 +489,7 @@ def forward( extended_atype: paddle.Tensor, nlist: paddle.Tensor, mapping: Optional[paddle.Tensor] = None, - comm_dict: Optional[dict[str, paddle.Tensor]] = None, + comm_dict: Optional[list[paddle.Tensor]] = None, ): """Compute the descriptor. @@ -487,12 +523,16 @@ def forward( The smooth switch function. shape: nf x nloc x nnei """ + parallel_mode = comm_dict is not None # cast the input to internal precsion extended_coord = extended_coord.to(dtype=self.prec) nframes, nloc, nnei = nlist.shape nall = extended_coord.reshape([nframes, -1]).shape[1] // 3 - node_ebd_ext = self.type_embedding(extended_atype) + if not parallel_mode and self.use_loc_mapping: + node_ebd_ext = self.type_embedding(extended_atype[:, :nloc]) + else: + node_ebd_ext = self.type_embedding(extended_atype) node_ebd_inp = node_ebd_ext[:, :nloc, :] # repflows node_ebd, edge_ebd, h2, rot_mat, sw = self.repflows( diff --git a/deepmd/pd/model/descriptor/env_mat.py b/deepmd/pd/model/descriptor/env_mat.py index 9b72da0b16..2cc23fcadf 100644 --- a/deepmd/pd/model/descriptor/env_mat.py +++ b/deepmd/pd/model/descriptor/env_mat.py @@ -3,6 +3,7 @@ import paddle from deepmd.pd.utils.preprocess import ( + compute_exp_sw, compute_smooth_weight, ) @@ -14,6 +15,7 @@ def _make_env_mat( ruct_smth: float, radial_only: bool = False, protection: float = 0.0, + use_exp_switch: bool = False, ): """Make smooth environment matrix.""" bsz, natoms, nnei = nlist.shape @@ -21,10 +23,11 @@ def _make_env_mat( nall = coord.shape[1] mask = nlist >= 0 # nlist = nlist * mask ## this impl will contribute nans 
in Hessian calculation. - nlist = paddle.where(mask, nlist, nall - 1) + nlist = paddle.where(mask, nlist, paddle.full_like(nlist, nall - 1)) coord_l = coord[:, :natoms].reshape([bsz, -1, 1, 3]) index = nlist.reshape([bsz, -1]).unsqueeze(-1).expand([-1, -1, 3]) - coord_r = paddle.take_along_axis(coord, axis=1, indices=index) + coord_pad = paddle.concat([coord, coord[:, -1:, :] + rcut], axis=1) + coord_r = paddle.take_along_axis(coord_pad, axis=1, indices=index, broadcast=False) coord_r = coord_r.reshape([bsz, natoms, nnei, 3]) diff = coord_r - coord_l length = paddle.linalg.norm(diff, axis=-1, keepdim=True) @@ -32,7 +35,11 @@ def _make_env_mat( length = length + (~mask.unsqueeze(-1)).astype(length.dtype) t0 = 1 / (length + protection) t1 = diff / (length + protection) ** 2 - weight = compute_smooth_weight(length, ruct_smth, rcut) + weight = ( + compute_smooth_weight(length, ruct_smth, rcut) + if not use_exp_switch + else compute_exp_sw(length, ruct_smth, rcut) + ) weight = weight * mask.unsqueeze(-1).astype(weight.dtype) if radial_only: env_mat = t0 * weight @@ -51,6 +58,7 @@ def prod_env_mat( rcut_smth: float, radial_only: bool = False, protection: float = 0.0, + use_exp_switch: bool = False, ): """Generate smooth environment matrix from atom coordinates and other context. @@ -63,6 +71,7 @@ def prod_env_mat( - rcut_smth: Smooth hyper-parameter for pair force & energy. - radial_only: Whether to return a full description or a radial-only descriptor. - protection: Protection parameter to prevent division by zero errors during calculations. + - use_exp_switch: Whether to use the exponential switch function. 
Returns ------- @@ -75,6 +84,7 @@ def prod_env_mat( rcut_smth, radial_only, protection=protection, + use_exp_switch=use_exp_switch, ) # shape [n_atom, dim, 4 or 1] t_avg = mean[atype] # [n_atom, dim, 4 or 1] t_std = stddev[atype] # [n_atom, dim, 4 or 1] diff --git a/deepmd/pd/model/descriptor/repflow_layer.py b/deepmd/pd/model/descriptor/repflow_layer.py index f1bdd0439d..71e4d44ce2 100644 --- a/deepmd/pd/model/descriptor/repflow_layer.py +++ b/deepmd/pd/model/descriptor/repflow_layer.py @@ -19,6 +19,9 @@ from deepmd.pd.model.network.mlp import ( MLPLayer, ) +from deepmd.pd.model.network.utils import ( + aggregate, +) from deepmd.pd.utils.env import ( PRECISION_DICT, ) @@ -61,6 +64,7 @@ def __init__( update_residual_init: str = "const", precision: str = "float64", seed: Optional[Union[int, list[int]]] = None, + trainable: bool = True, ) -> None: super().__init__() self.epsilon = 1e-4 # protection of 1./nnei @@ -123,6 +127,7 @@ def __init__( n_dim, precision=precision, seed=child_seed(seed, 0), + trainable=trainable, ) if self.update_style == "res_residual": self.n_residual.append( @@ -132,6 +137,7 @@ def __init__( self.update_residual_init, precision=precision, seed=child_seed(seed, 1), + trainable=trainable, ) ) @@ -142,6 +148,7 @@ def __init__( n_dim, precision=precision, seed=child_seed(seed, 2), + trainable=trainable, ) if self.update_style == "res_residual": self.n_residual.append( @@ -151,6 +158,7 @@ def __init__( self.update_residual_init, precision=precision, seed=child_seed(seed, 3), + trainable=trainable, ) ) @@ -160,6 +168,7 @@ def __init__( self.n_multi_edge_message * n_dim, precision=precision, seed=child_seed(seed, 4), + trainable=trainable, ) if self.update_style == "res_residual": for head_index in range(self.n_multi_edge_message): @@ -170,6 +179,7 @@ def __init__( self.update_residual_init, precision=precision, seed=child_seed(child_seed(seed, 5), head_index), + trainable=trainable, ) ) @@ -179,6 +189,7 @@ def __init__( e_dim, precision=precision, 
seed=child_seed(seed, 6), + trainable=trainable, ) if self.update_style == "res_residual": self.e_residual.append( @@ -188,6 +199,7 @@ def __init__( self.update_residual_init, precision=precision, seed=child_seed(seed, 7), + trainable=trainable, ) ) @@ -216,6 +228,7 @@ def __init__( precision=precision, bias=False, seed=child_seed(seed, 8), + trainable=trainable, ) self.a_compress_e_linear = MLPLayer( self.e_dim, @@ -223,6 +236,7 @@ def __init__( precision=precision, bias=False, seed=child_seed(seed, 9), + trainable=trainable, ) else: self.a_compress_n_linear = None @@ -234,12 +248,14 @@ def __init__( self.e_dim, precision=precision, seed=child_seed(seed, 10), + trainable=trainable, ) self.edge_angle_linear2 = MLPLayer( self.e_dim, self.e_dim, precision=precision, seed=child_seed(seed, 11), + trainable=trainable, ) if self.update_style == "res_residual": self.e_residual.append( @@ -249,6 +265,7 @@ def __init__( self.update_residual_init, precision=precision, seed=child_seed(seed, 12), + trainable=trainable, ) ) @@ -258,6 +275,7 @@ def __init__( self.a_dim, precision=precision, seed=child_seed(seed, 13), + trainable=trainable, ) if self.update_style == "res_residual": self.a_residual.append( @@ -267,6 +285,7 @@ def __init__( self.update_residual_init, precision=precision, seed=child_seed(seed, 14), + trainable=trainable, ) ) else: @@ -326,6 +345,61 @@ def _cal_hg( h2g2 = paddle.matmul(paddle.matrix_transpose(h2), edge_ebd) * invnnei return h2g2 + @staticmethod + def _cal_hg_dynamic( + flat_edge_ebd: paddle.Tensor, + flat_h2: paddle.Tensor, + flat_sw: paddle.Tensor, + owner: paddle.Tensor, + num_owner: int, + nb: int, + nloc: int, + scale_factor: float, + ) -> paddle.Tensor: + """ + Calculate the transposed rotation matrix. + + Parameters + ---------- + flat_edge_ebd + Flatted neighbor-wise/pair-wise invariant rep tensors, with shape n_edge x e_dim. + flat_h2 + Flatted neighbor-wise/pair-wise equivariant rep tensors, with shape n_edge x 3. 
+ flat_sw + Flatted switch function, which equals 1 within the rcut_smth range, smoothly decays from 1 to 0 between rcut_smth and rcut, + and remains 0 beyond rcut, with shape n_edge. + owner + The owner index of the neighbor to reduce on. + num_owner : int + The total number of the owner. + nb : int + The number of batches. + nloc : int + The number of local atoms. + scale_factor : float + The scale factor to apply after reduce. + + Returns + ------- + hg + The transposed rotation matrix, with shape nf x nloc x 3 x e_dim. + """ + n_edge, e_dim = flat_edge_ebd.shape + # n_edge x e_dim + flat_edge_ebd = flat_edge_ebd * flat_sw.unsqueeze(-1) + # n_edge x 3 x e_dim + flat_h2g2 = (flat_h2.unsqueeze(-1) * flat_edge_ebd.unsqueeze(-2)).reshape( + [-1, 3 * e_dim] + ) + # nf x nloc x 3 x e_dim + h2g2 = ( + aggregate(flat_h2g2, owner, average=False, num_owner=num_owner).reshape( + [nb, nloc, 3, e_dim] + ) + * scale_factor + ) + return h2g2 + @staticmethod def _cal_grrg(h2g2: paddle.Tensor, axis_neuron: int) -> paddle.Tensor: """ @@ -398,6 +472,63 @@ def symmetrization_op( g1_13 = self._cal_grrg(h2g2, axis_neuron) return g1_13 + def symmetrization_op_dynamic( + self, + flat_edge_ebd: paddle.Tensor, + flat_h2: paddle.Tensor, + flat_sw: paddle.Tensor, + owner: paddle.Tensor, + num_owner: int, + nb: int, + nloc: int, + scale_factor: float, + axis_neuron: int, + ) -> paddle.Tensor: + """ + Symmetrization operator to obtain atomic invariant rep. + + Parameters + ---------- + flat_edge_ebd + Flatted neighbor-wise/pair-wise invariant rep tensors, with shape n_edge x e_dim. + flat_h2 + Flatted neighbor-wise/pair-wise equivariant rep tensors, with shape n_edge x 3. + flat_sw + Flatted switch function, which equals 1 within the rcut_smth range, smoothly decays from 1 to 0 between rcut_smth and rcut, + and remains 0 beyond rcut, with shape n_edge. + owner + The owner index of the neighbor to reduce on. + num_owner : int + The total number of the owner. 
+ nb : int + The number of batches. + nloc : int + The number of local atoms. + scale_factor : float + The scale factor to apply after reduce. + axis_neuron + Size of the submatrix. + + Returns + ------- + grrg + Atomic invariant rep, with shape nb x nloc x (axis_neuron x e_dim) + """ + # nb x nloc x 3 x e_dim + h2g2 = self._cal_hg_dynamic( + flat_edge_ebd, + flat_h2, + flat_sw, + owner, + num_owner, + nb, + nloc, + scale_factor, + ) + # nb x nloc x (axis x e_dim) + grrg = self._cal_grrg(h2g2, axis_neuron) + return grrg + def optim_angle_update( self, angle_ebd: paddle.Tensor, @@ -419,7 +550,7 @@ def optim_angle_update( node_dim = node_ebd.shape[-1] edge_dim = edge_ebd.shape[-1] # angle_dim, node_dim, edge_dim, edge_dim - sub_angle, sub_node, sub_edge_ij, sub_edge_ik = paddle.split( + sub_angle, sub_node, sub_edge_ik, sub_edge_ij = paddle.split( matrix, [angle_dim, node_dim, edge_dim, edge_dim] ) @@ -428,14 +559,66 @@ def optim_angle_update( # nf * nloc * angle_dim sub_node_update = paddle.matmul(node_ebd, sub_node) # nf * nloc * a_nnei * angle_dim - sub_edge_update_ij = paddle.matmul(edge_ebd, sub_edge_ij) sub_edge_update_ik = paddle.matmul(edge_ebd, sub_edge_ik) + sub_edge_update_ij = paddle.matmul(edge_ebd, sub_edge_ij) result_update = ( bias + sub_node_update.unsqueeze(2).unsqueeze(3) - + sub_edge_update_ij.unsqueeze(2) - + sub_edge_update_ik.unsqueeze(3) + + sub_edge_update_ik.unsqueeze(2) + + sub_edge_update_ij.unsqueeze(3) + + sub_angle_update + ) + return result_update + + def optim_angle_update_dynamic( + self, + flat_angle_ebd: paddle.Tensor, + node_ebd: paddle.Tensor, + flat_edge_ebd: paddle.Tensor, + n2a_index: paddle.Tensor, + eij2a_index: paddle.Tensor, + eik2a_index: paddle.Tensor, + feat: str = "edge", + ) -> paddle.Tensor: + if feat == "edge": + matrix, bias = self.edge_angle_linear1.matrix, self.edge_angle_linear1.bias + elif feat == "angle": + matrix, bias = self.angle_self_linear.matrix, self.angle_self_linear.bias + else: + raise 
NotImplementedError + nf, nloc, node_dim = node_ebd.shape + edge_dim = flat_edge_ebd.shape[-1] + angle_dim = flat_angle_ebd.shape[-1] + # angle_dim, node_dim, edge_dim, edge_dim + sub_angle, sub_node, sub_edge_ik, sub_edge_ij = paddle.split( + matrix, [angle_dim, node_dim, edge_dim, edge_dim] + ) + + # n_angle * angle_dim + sub_angle_update = paddle.matmul(flat_angle_ebd, sub_angle) + + # nf * nloc * angle_dim + sub_node_update = paddle.matmul(node_ebd, sub_node) + # n_angle * angle_dim + sub_node_update = paddle.index_select( + sub_node_update.reshape([nf * nloc, sub_node_update.shape[-1]]), + n2a_index, + 0, + ) + + # n_edge * angle_dim + sub_edge_update_ik = paddle.matmul(flat_edge_ebd, sub_edge_ik) + sub_edge_update_ij = paddle.matmul(flat_edge_ebd, sub_edge_ij) + # n_angle * angle_dim + sub_edge_update_ik = paddle.index_select(sub_edge_update_ik, eik2a_index, 0) + sub_edge_update_ij = paddle.index_select(sub_edge_update_ij, eij2a_index, 0) + + result_update = ( + bias + + sub_node_update + + sub_edge_update_ik + + sub_edge_update_ij + sub_angle_update ) return result_update @@ -475,9 +658,55 @@ def optim_edge_update( ) return result_update + def optim_edge_update_dynamic( + self, + node_ebd: paddle.Tensor, + node_ebd_ext: paddle.Tensor, + flat_edge_ebd: paddle.Tensor, + n2e_index: paddle.Tensor, + n_ext2e_index: paddle.Tensor, + feat: str = "node", + ) -> paddle.Tensor: + if feat == "node": + matrix, bias = self.node_edge_linear.matrix, self.node_edge_linear.bias + elif feat == "edge": + matrix, bias = self.edge_self_linear.matrix, self.edge_self_linear.bias + else: + raise NotImplementedError + assert bias is not None + nf, nall, node_dim = node_ebd_ext.shape + _, nloc, _ = node_ebd.shape + edge_dim = flat_edge_ebd.shape[-1] + # node_dim, node_dim, edge_dim + node, node_ext, edge = paddle.split(matrix, [node_dim, node_dim, edge_dim]) + + # nf * nloc * node/edge_dim + sub_node_update = paddle.matmul(node_ebd, node) + # n_edge * node/edge_dim + sub_node_update 
= paddle.index_select( + sub_node_update.reshape([nf * nloc, sub_node_update.shape[-1]]), + n2e_index, + 0, + ) + + # nf * nall * node/edge_dim + sub_node_ext_update = paddle.matmul(node_ebd_ext, node_ext) + # n_edge * node/edge_dim + sub_node_ext_update = paddle.index_select( + sub_node_ext_update.reshape([nf * nall, sub_node_update.shape[-1]]), + n_ext2e_index, + 0, + ) + + # n_edge * node/edge_dim + sub_edge_update = paddle.matmul(flat_edge_ebd, edge) + + result_update = bias + sub_node_update + sub_edge_update + sub_node_ext_update + return result_update + def forward( self, - node_ebd_ext: paddle.Tensor, # nf x nall x n_dim + node_ebd_ext: paddle.Tensor, # nf x nall x n_dim [OR] nf x nloc x n_dim when not parallel_mode edge_ebd: paddle.Tensor, # nf x nloc x nnei x e_dim h2: paddle.Tensor, # nf x nloc x nnei x 3 angle_ebd: paddle.Tensor, # nf x nloc x a_nnei x a_nnei x a_dim @@ -487,6 +716,8 @@ def forward( a_nlist: paddle.Tensor, # nf x nloc x a_nnei a_nlist_mask: paddle.Tensor, # nf x nloc x a_nnei a_sw: paddle.Tensor, # switch func, nf x nloc x a_nnei + edge_index: paddle.Tensor, # 2 x n_edge + angle_index: paddle.Tensor, # 3 x n_angle ): """ Parameters @@ -511,6 +742,18 @@ def forward( Masks of the neighbor list for angle. real nei 1 otherwise 0 a_sw : nf x nloc x a_nnei Switch function for angle. + edge_index : Optional for dynamic sel, 2 x n_edge + n2e_index : n_edge + Broadcast indices from node(i) to edge(ij), or reduction indices from edge(ij) to node(i). + n_ext2e_index : n_edge + Broadcast indices from extended node(j) to edge(ij). + angle_index : Optional for dynamic sel, 3 x n_angle + n2a_index : n_angle + Broadcast indices from extended node(j) to angle(ijk). + eij2a_index : n_angle + Broadcast indices from extended edge(ij) to angle(ijk), or reduction indices from angle(ijk) to edge(ij). + eik2a_index : n_angle + Broadcast indices from extended edge(ik) to angle(ijk). 
Returns ------- @@ -521,15 +764,37 @@ def forward( a_updated : nf x nloc x a_nnei x a_nnei x a_dim Updated angle embedding. """ - nb, nloc, nnei, _ = edge_ebd.shape + nb, nloc, nnei = nlist.shape nall = node_ebd_ext.shape[1] node_ebd = node_ebd_ext[:, :nloc, :] if paddle.in_dynamic_mode(): assert [nb, nloc] == node_ebd.shape[:2] - if paddle.in_dynamic_mode(): - assert [nb, nloc, nnei] == h2.shape[:3] + if not self.use_dynamic_sel: + if paddle.in_dynamic_mode(): + assert [nb, nloc, nnei, 3] == h2.shape + n_edge = None + else: + n_edge = h2.shape[0] del a_nlist # may be used in the future + n2e_index, n_ext2e_index = edge_index[0], edge_index[1] + n2a_index, eij2a_index, eik2a_index = ( + angle_index[0], + angle_index[1], + angle_index[2], + ) + + # nb x nloc x nnei x n_dim [OR] n_edge x n_dim + nei_node_ebd = ( + _make_nei_g1(node_ebd_ext, nlist) + if not self.use_dynamic_sel + else paddle.index_select( + node_ebd_ext.reshape([-1, self.n_dim]), + n_ext2e_index, + 0, + ) + ) + n_update_list: list[paddle.Tensor] = [node_ebd] e_update_list: list[paddle.Tensor] = [edge_ebd] a_update_list: list[paddle.Tensor] = [angle_ebd] @@ -538,8 +803,6 @@ def forward( node_self_mlp = self.act(self.node_self_mlp(node_ebd)) n_update_list.append(node_self_mlp) - nei_node_ebd = _make_nei_g1(node_ebd_ext, nlist) - # node sym (grrg + drrd) node_sym_list: list[paddle.Tensor] = [] node_sym_list.append( @@ -550,6 +813,18 @@ def forward( sw, self.axis_neuron, ) + if not self.use_dynamic_sel + else self.symmetrization_op_dynamic( + edge_ebd, + h2, + sw, + owner=n2e_index, + num_owner=nb * nloc, + nb=nb, + nloc=nloc, + scale_factor=self.dynamic_e_sel ** (-0.5), + axis_neuron=self.axis_neuron, + ) ) node_sym_list.append( self.symmetrization_op( @@ -559,20 +834,47 @@ def forward( sw, self.axis_neuron, ) + if not self.use_dynamic_sel + else self.symmetrization_op_dynamic( + nei_node_ebd, + h2, + sw, + owner=n2e_index, + num_owner=nb * nloc, + nb=nb, + nloc=nloc, + scale_factor=self.dynamic_e_sel ** 
(-0.5), + axis_neuron=self.axis_neuron, + ) ) node_sym = self.act(self.node_sym_linear(paddle.concat(node_sym_list, axis=-1))) n_update_list.append(node_sym) if not self.optim_update: - # nb x nloc x nnei x (n_dim * 2 + e_dim) - edge_info = paddle.concat( - [ - paddle.tile(node_ebd.unsqueeze(-2), [1, 1, self.nnei, 1]), - nei_node_ebd, - edge_ebd, - ], - axis=-1, - ) + if not self.use_dynamic_sel: + # nb x nloc x nnei x (n_dim * 2 + e_dim) + edge_info = paddle.concat( + [ + paddle.tile(node_ebd.unsqueeze(-2), [1, 1, self.nnei, 1]), + nei_node_ebd, + edge_ebd, + ], + axis=-1, + ) + else: + # n_edge x (n_dim * 2 + e_dim) + edge_info = paddle.concat( + [ + paddle.index_select( + node_ebd.reshape([-1, self.n_dim]), + n2e_index, + 0, + ), + nei_node_ebd, + edge_ebd, + ], + axis=-1, + ) else: edge_info = None @@ -592,16 +894,37 @@ def forward( nlist, "node", ) + if not self.use_dynamic_sel + else self.optim_edge_update_dynamic( + node_ebd, + node_ebd_ext, + edge_ebd, + n2e_index, + n_ext2e_index, + "node", + ) ) * sw.unsqueeze(-1) + node_edge_update = ( + (paddle.sum(node_edge_update, axis=-2) / self.nnei) + if not self.use_dynamic_sel + else ( + aggregate( + node_edge_update, + n2e_index, + average=False, + num_owner=nb * nloc, + ).reshape([nb, nloc, node_edge_update.shape[-1]]) + / self.dynamic_e_sel + ) + ) - node_edge_update = paddle.sum(node_edge_update, axis=-2) / self.nnei if self.n_multi_edge_message > 1: - # nb x nloc x nnei x h x n_dim + # nb x nloc x h x n_dim node_edge_update_mul_head = node_edge_update.reshape( [nb, nloc, self.n_multi_edge_message, self.n_dim] ) for head_index in range(self.n_multi_edge_message): - n_update_list.append(node_edge_update_mul_head[:, :, head_index, :]) + n_update_list.append(node_edge_update_mul_head[..., head_index, :]) else: n_update_list.append(node_edge_update) # update node_ebd @@ -620,6 +943,15 @@ def forward( nlist, "edge", ) + if not self.use_dynamic_sel + else self.optim_edge_update_dynamic( + node_ebd, + node_ebd_ext, 
+ edge_ebd, + n2e_index, + n_ext2e_index, + "edge", + ) ) e_update_list.append(edge_self_update) @@ -641,48 +973,66 @@ def forward( edge_ebd_for_angle = self.a_compress_e_linear(edge_ebd) else: # use the first a_compress_dim dim for node and edge - node_ebd_for_angle = node_ebd[:, :, : self.n_a_compress_dim] - edge_ebd_for_angle = edge_ebd[:, :, :, : self.e_a_compress_dim] + node_ebd_for_angle = node_ebd[..., : self.n_a_compress_dim] + edge_ebd_for_angle = edge_ebd[..., : self.e_a_compress_dim] else: node_ebd_for_angle = node_ebd edge_ebd_for_angle = edge_ebd - # nb x nloc x a_nnei x e_dim - edge_for_angle = edge_ebd_for_angle[:, :, : self.a_sel, :] - # nb x nloc x a_nnei x e_dim - edge_for_angle = paddle.where( - a_nlist_mask.unsqueeze(-1), - edge_for_angle, - paddle.zeros_like(edge_for_angle), - ).astype(edge_for_angle.dtype) + if not self.use_dynamic_sel: + # nb x nloc x a_nnei x e_dim + edge_ebd_for_angle = edge_ebd_for_angle[..., : self.a_sel, :] + # nb x nloc x a_nnei x e_dim + edge_ebd_for_angle = edge_ebd_for_angle.masked_fill( + ~a_nlist_mask.unsqueeze(-1), 0.0 + ) if not self.optim_update: - # nb x nloc x a_nnei x a_nnei x n_dim - node_for_angle_info = paddle.tile( - node_ebd_for_angle.unsqueeze(2).unsqueeze(2), - [1, 1, self.a_sel, self.a_sel, 1], + # nb x nloc x a_nnei x a_nnei x n_dim [OR] n_angle x n_dim + node_for_angle_info = ( + paddle.tile( + node_ebd_for_angle.unsqueeze(2).unsqueeze(2), + (1, 1, self.a_sel, self.a_sel, 1), + ) + if not self.use_dynamic_sel + else paddle.index_select( + node_ebd_for_angle.reshape([-1, self.n_a_compress_dim]), + n2a_index, + 0, + ) ) - # nb x nloc x (a_nnei) x a_nnei x edge_ebd - edge_for_angle_i = paddle.tile( - edge_for_angle.unsqueeze(2), (1, 1, self.a_sel, 1, 1) + + # nb x nloc x (a_nnei) x a_nnei x e_dim [OR] n_angle x e_dim + edge_for_angle_k = ( + paddle.tile( + edge_ebd_for_angle.unsqueeze(2), (1, 1, self.a_sel, 1, 1) + ) + if not self.use_dynamic_sel + else paddle.index_select(edge_ebd_for_angle, 
eik2a_index, 0) ) - # nb x nloc x a_nnei x (a_nnei) x e_dim - edge_for_angle_j = paddle.tile( - edge_for_angle.unsqueeze(3), (1, 1, 1, self.a_sel, 1) + # nb x nloc x a_nnei x (a_nnei) x e_dim [OR] n_angle x e_dim + edge_for_angle_j = ( + paddle.tile( + edge_ebd_for_angle.unsqueeze(3), (1, 1, 1, self.a_sel, 1) + ) + if not self.use_dynamic_sel + else paddle.index_select(edge_ebd_for_angle, eij2a_index, 0) ) - # nb x nloc x a_nnei x a_nnei x (e_dim + e_dim) + # nb x nloc x a_nnei x a_nnei x (e_dim + e_dim) [OR] n_angle x (e_dim + e_dim) edge_for_angle_info = paddle.concat( - [edge_for_angle_i, edge_for_angle_j], axis=-1 + [edge_for_angle_k, edge_for_angle_j], axis=-1 ) angle_info_list = [angle_ebd] angle_info_list.append(node_for_angle_info) angle_info_list.append(edge_for_angle_info) # nb x nloc x a_nnei x a_nnei x (a + n_dim + e_dim*2) or (a + a/c + a/c) + # [OR] + # n_angle x (a + n_dim + e_dim*2) or (a + a/c + a/c) angle_info = paddle.concat(angle_info_list, axis=-1) else: angle_info = None # edge angle message - # nb x nloc x a_nnei x a_nnei x e_dim + # nb x nloc x a_nnei x a_nnei x e_dim [OR] n_angle x e_dim if not self.optim_update: assert angle_info is not None edge_angle_update = self.act(self.edge_angle_linear1(angle_info)) @@ -691,32 +1041,61 @@ def forward( self.optim_angle_update( angle_ebd, node_ebd_for_angle, - edge_for_angle, + edge_ebd_for_angle, + "edge", + ) + if not self.use_dynamic_sel + else self.optim_angle_update_dynamic( + angle_ebd, + node_ebd_for_angle, + edge_ebd_for_angle, + n2a_index, + eij2a_index, + eik2a_index, "edge", ) ) - # nb x nloc x a_nnei x a_nnei x e_dim - weighted_edge_angle_update = ( - a_sw[..., None, None] * a_sw[..., None, :, None] * edge_angle_update - ) - # nb x nloc x a_nnei x e_dim - reduced_edge_angle_update = paddle.sum( - weighted_edge_angle_update, axis=-2 - ) / (self.a_sel**0.5) - # nb x nloc x nnei x e_dim - padding_edge_angle_update = paddle.concat( - [ - reduced_edge_angle_update, - paddle.zeros( - [nb, nloc, 
self.nnei - self.a_sel, self.e_dim], - dtype=edge_ebd.dtype, - ).to(device=edge_ebd.place), - ], - axis=2, - ) + if not self.use_dynamic_sel: + # nb x nloc x a_nnei x a_nnei x e_dim + weighted_edge_angle_update = ( + a_sw.unsqueeze(-1).unsqueeze(-1) + * a_sw.unsqueeze(-2).unsqueeze(-1) + * edge_angle_update + ) + # nb x nloc x a_nnei x e_dim + reduced_edge_angle_update = paddle.sum( + weighted_edge_angle_update, axis=-2 + ) / (self.a_sel**0.5) + # nb x nloc x nnei x e_dim + padding_edge_angle_update = paddle.concat( + [ + reduced_edge_angle_update, + paddle.zeros( + [nb, nloc, self.nnei - self.a_sel, self.e_dim], + dtype=edge_ebd.dtype, + ), + ], + axis=2, + ) + else: + # n_angle x e_dim + weighted_edge_angle_update = edge_angle_update * a_sw.unsqueeze(-1) + # n_edge x e_dim + padding_edge_angle_update = aggregate( + weighted_edge_angle_update, + eij2a_index, + average=False, + num_owner=n_edge, + ) / (self.dynamic_a_sel**0.5) + if not self.smooth_edge_update: # will be deprecated in the future + # not support dynamic index, will pass anyway + if self.use_dynamic_sel: + raise NotImplementedError( + "smooth_edge_update must be True when use_dynamic_sel is True!" 
+ ) full_mask = paddle.concat( [ a_nlist_mask, @@ -727,8 +1106,8 @@ def forward( ], axis=-1, ) - padding_edge_angle_update = paddle.where( - full_mask.unsqueeze(-1), padding_edge_angle_update, edge_ebd + padding_edge_angle_update = padding_edge_angle_update.masked_fill( + ~full_mask.unsqueeze(-1), edge_ebd ) e_update_list.append( self.act(self.edge_angle_linear2(padding_edge_angle_update)) @@ -746,7 +1125,17 @@ def forward( self.optim_angle_update( angle_ebd, node_ebd_for_angle, - edge_for_angle, + edge_ebd_for_angle, + "angle", + ) + if not self.use_dynamic_sel + else self.optim_angle_update_dynamic( + angle_ebd, + node_ebd_for_angle, + edge_ebd_for_angle, + n2a_index, + eij2a_index, + eik2a_index, "angle", ) ) diff --git a/deepmd/pd/model/descriptor/repflows.py b/deepmd/pd/model/descriptor/repflows.py index 3200c26dba..2b9760bbe6 100644 --- a/deepmd/pd/model/descriptor/repflows.py +++ b/deepmd/pd/model/descriptor/repflows.py @@ -10,6 +10,10 @@ from deepmd.dpmodel.utils.seed import ( child_seed, ) +from deepmd.pd.cxx_op import ( + ENABLE_CUSTOMIZED_OP, + paddle_ops_deepmd, +) from deepmd.pd.model.descriptor.descriptor import ( DescriptorBlock, ) @@ -19,6 +23,9 @@ from deepmd.pd.model.network.mlp import ( MLPLayer, ) +from deepmd.pd.model.network.utils import ( + get_graph_index, +) from deepmd.pd.utils import ( env, ) @@ -31,6 +38,9 @@ from deepmd.pd.utils.exclude_mask import ( PairExcludeMask, ) +from deepmd.pd.utils.spin import ( + concat_switch_virtual, +) from deepmd.pd.utils.utils import ( ActivationFn, ) @@ -45,6 +55,30 @@ RepFlowLayer, ) +if not ENABLE_CUSTOMIZED_OP: + + def border_op( + argument0, + argument1, + argument2, + argument3, + argument4, + argument5, + argument6, + argument7, + argument8, + ) -> paddle.Tensor: + raise NotImplementedError( + "The 'border_op' operator is unavailable because the custom Paddle OP library was not built when freezing the model.\n" + "To install 'border_op', run: python source/op/pd/setup.py install\n" + "For more 
information, please refer to the DPA3 documentation." + ) + + # Note: this hack cannot actually save a model that can be run using LAMMPS. + paddle_ops_deepmd_border_op = border_op +else: + paddle_ops_deepmd_border_op = paddle_ops_deepmd.border_op + @DescriptorBlock.register("se_repflow") class DescrptBlockRepflows(DescriptorBlock): @@ -109,12 +143,35 @@ class DescrptBlockRepflows(DescriptorBlock): smooth_edge_update : bool, optional Whether to make edge update smooth. If True, the edge update from angle message will not use self as padding. + edge_init_use_dist : bool, optional + Whether to use direct distance r to initialize the edge features instead of 1/r. + Note that when using this option, the activation function will not be used when initializing edge features. + use_exp_switch : bool, optional + Whether to use an exponential switch function instead of a polynomial one in the neighbor update. + The exponential switch function ensures neighbor contributions smoothly diminish as the interatomic distance + `r` approaches the cutoff radius `rcut`. Specifically, the function is defined as: + s(r) = \\exp(-\\exp(20 * (r - rcut_smth) / rcut_smth)) for 0 < r \\leq rcut, and s(r) = 0 for r > rcut. + Here, `rcut_smth` is an adjustable smoothing factor and `rcut_smth` should be chosen carefully + according to `rcut`, ensuring s(r) approaches zero smoothly at the cutoff. + Typical recommended values are `rcut_smth` = 5.3 for `rcut` = 6.0, and 3.5 for `rcut` = 4.0. + use_dynamic_sel : bool, optional + Whether to dynamically select neighbors within the cutoff radius. + If True, the exact number of neighbors within the cutoff radius is used + without padding to a fixed selection numbers. + When enabled, users can safely set larger values for `e_sel` or `a_sel` (e.g., 1200 or 300, respectively) + to guarantee capturing all neighbors within the cutoff radius. + Note that when using dynamic selection, the `smooth_edge_update` must be True. 
+ sel_reduce_factor : float, optional + Reduction factor applied to neighbor-scale normalization when `use_dynamic_sel` is True. + In the dynamic selection case, neighbor-scale normalization will use `e_sel / sel_reduce_factor` + or `a_sel / sel_reduce_factor` instead of the raw `e_sel` or `a_sel` values, + accommodating larger selection numbers. + use_loc_mapping : bool, Optional + Whether to use local atom index mapping in training or non-parallel inference. + When True, local indexing and mapping are applied to neighbor lists and embeddings during descriptor computation. optim_update : bool, optional Whether to enable the optimized update method. Uses a more efficient process when enabled. Defaults to True - use_loc_mapping : bool, Optional - Whether to use local atom index mapping in training or non-parallel inference. - Not supported yet in Paddle. ntypes : int Number of element types activation_function : str, optional @@ -131,6 +188,8 @@ class DescrptBlockRepflows(DescriptorBlock): For example, when using paddings, there may be zero distances of neighbors, which may make division by zero error during environment matrix calculations without protection. seed : int, optional Random seed for parameter initialization. 
+ trainable : bool, default: True + Whether this block is trainable """ def __init__( @@ -162,11 +221,14 @@ def __init__( precision: str = "float64", fix_stat_std: float = 0.3, smooth_edge_update: bool = False, + edge_init_use_dist: bool = False, + use_exp_switch: bool = False, use_dynamic_sel: bool = False, sel_reduce_factor: float = 10.0, - use_loc_mapping: bool = False, + use_loc_mapping: bool = True, optim_update: bool = True, seed: Optional[Union[int, list[int]]] = None, + trainable: bool = True, ) -> None: super().__init__() self.e_rcut = float(e_rcut) @@ -176,6 +238,9 @@ def __init__( self.a_rcut_smth = float(a_rcut_smth) self.a_sel = a_sel self.ntypes = ntypes + self.register_buffer( + "buffer_ntypes", paddle.to_tensor(self.ntypes, dtype="int64") + ) self.nlayers = nlayers # for other common desciptor method sel = [e_sel] if isinstance(e_sel, int) else e_sel @@ -183,7 +248,9 @@ def __init__( self.ndescrpt = self.nnei * 4 # use full descriptor. assert len(sel) == 1 self.sel = sel + self.register_buffer("buffer_sel", paddle.to_tensor(sel)) self.rcut = e_rcut + self.register_buffer("buffer_rcut", paddle.to_tensor(self.e_rcut)) self.rcut_smth = e_rcut_smth self.sec = self.sel self.split_sel = self.sel @@ -195,13 +262,21 @@ def __init__( self.fix_stat_std = fix_stat_std self.set_stddev_constant = fix_stat_std != 0.0 self.a_compress_use_split = a_compress_use_split + self.use_loc_mapping = use_loc_mapping self.optim_update = optim_update self.smooth_edge_update = smooth_edge_update - self.use_dynamic_sel = use_dynamic_sel # not supported yet + self.edge_init_use_dist = edge_init_use_dist + self.use_exp_switch = use_exp_switch + self.use_dynamic_sel = use_dynamic_sel self.sel_reduce_factor = sel_reduce_factor - assert not self.use_dynamic_sel, "Dynamic selection is not supported yet." - self.use_loc_mapping = use_loc_mapping - assert not self.use_loc_mapping, "Local mapping is not supported yet." 
+ if self.use_dynamic_sel and not self.smooth_edge_update: + raise NotImplementedError( + "smooth_edge_update must be True when use_dynamic_sel is True!" + ) + if self.sel_reduce_factor <= 0: + raise ValueError( + f"`sel_reduce_factor` must be > 0, got {self.sel_reduce_factor}" + ) self.n_dim = n_dim self.e_dim = e_dim @@ -223,10 +298,19 @@ def __init__( self.seed = seed self.edge_embd = MLPLayer( - 1, self.e_dim, precision=precision, seed=child_seed(seed, 0) + 1, + self.e_dim, + precision=precision, + seed=child_seed(seed, 0), + trainable=trainable, ) self.angle_embd = MLPLayer( - 1, self.a_dim, precision=precision, bias=False, seed=child_seed(seed, 1) + 1, + self.a_dim, + precision=precision, + bias=False, + seed=child_seed(seed, 1), + trainable=trainable, ) layers = [] for ii in range(nlayers): @@ -258,6 +342,7 @@ def __init__( sel_reduce_factor=self.sel_reduce_factor, smooth_edge_update=self.smooth_edge_update, seed=child_seed(child_seed(seed, 1), ii), + trainable=trainable, ) ) self.layers = paddle.nn.LayerList(layers) @@ -275,6 +360,10 @@ def get_rcut(self) -> float: """Returns the cut-off radius.""" return self.e_rcut + def get_buffer_rcut(self) -> paddle.Tensor: + """Returns the cut-off radius as a buffer-style Tensor.""" + return self.buffer_rcut + def get_rcut_smth(self) -> float: """Returns the radius where the neighbor information starts to smoothly decay to 0.""" return self.e_rcut_smth @@ -287,9 +376,13 @@ def get_sel(self) -> list[int]: """Returns the number of selected atoms for each type.""" return self.sel + def get_buffer_sel(self) -> paddle.Tensor: + """Returns the number of selected atoms for each type as a buffer-style Tensor.""" + return self.buffer_sel + def get_ntypes(self) -> int: """Returns the number of element types.""" - return self.ntypes + return self.ntypes if paddle.in_dynamic_mode() else self.buffer_ntypes def get_dim_out(self) -> int: """Returns the output dimension.""" @@ -364,17 +457,18 @@ def forward( extended_atype: 
paddle.Tensor, extended_atype_embd: Optional[paddle.Tensor] = None, mapping: Optional[paddle.Tensor] = None, - comm_dict: Optional[dict[str, paddle.Tensor]] = None, + comm_dict: Optional[list[paddle.Tensor]] = None, ): - if comm_dict is None: - assert mapping is not None - assert extended_atype_embd is not None + parallel_mode = comm_dict is not None + if not parallel_mode: + if paddle.in_dynamic_mode(): + assert mapping is not None nframes, nloc, nnei = nlist.shape nall = extended_coord.reshape([nframes, -1]).shape[1] // 3 atype = extended_atype[:, :nloc] # nb x nloc x nnei exclude_mask = self.emask(nlist, extended_atype) - nlist = paddle.where(exclude_mask != 0, nlist, -1) + nlist = paddle.where(exclude_mask != 0, nlist, paddle.full_like(nlist, -1)) # nb x nloc x nnei x 4, nb x nloc x nnei x 3, nb x nloc x nnei x 1 dmatrix, diff, sw = prod_env_mat( extended_coord, @@ -385,36 +479,19 @@ def forward( self.e_rcut, self.e_rcut_smth, protection=self.env_protection, + use_exp_switch=self.use_exp_switch, ) nlist_mask = nlist != -1 sw = paddle.squeeze(sw, -1) # beyond the cutoff sw should be 0.0 sw = sw.masked_fill(~nlist_mask, 0.0) - # [nframes, nloc, tebd_dim] - if comm_dict is None: - if paddle.in_dynamic_mode(): - assert isinstance(extended_atype_embd, paddle.Tensor) - atype_embd = extended_atype_embd[:, :nloc, :] - if paddle.in_dynamic_mode(): - assert atype_embd.shape == [nframes, nloc, self.n_dim] - else: - atype_embd = extended_atype_embd - if paddle.in_dynamic_mode(): - assert isinstance(atype_embd, paddle.Tensor) - node_ebd = self.act(atype_embd) - n_dim = node_ebd.shape[-1] - # nb x nloc x nnei x 1, nb x nloc x nnei x 3 - edge_input, h2 = paddle.split(dmatrix, [1, 3], axis=-1) - # nb x nloc x nnei x e_dim - edge_ebd = self.act(self.edge_embd(edge_input)) - # get angle nlist (maybe smaller) a_dist_mask = (paddle.linalg.norm(diff, axis=-1) < self.a_rcut)[ :, :, : self.a_sel ] a_nlist = nlist[:, :, : self.a_sel] - a_nlist = paddle.where(a_dist_mask, a_nlist, -1) 
+ a_nlist = paddle.where(a_dist_mask, a_nlist, paddle.full_like(a_nlist, -1)) _, a_diff, a_sw = prod_env_mat( extended_coord, a_nlist, @@ -424,13 +501,34 @@ def forward( self.a_rcut, self.a_rcut_smth, protection=self.env_protection, + use_exp_switch=self.use_exp_switch, ) a_nlist_mask = a_nlist != -1 a_sw = paddle.squeeze(a_sw, -1) # beyond the cutoff sw should be 0.0 a_sw = a_sw.masked_fill(~a_nlist_mask, 0.0) + # set all padding positions to index of 0 + # if the a neighbor is real or not is indicated by nlist_mask + nlist[nlist == -1] = 0 a_nlist[a_nlist == -1] = 0 + # get node embedding + # [nframes, nloc, tebd_dim] + assert extended_atype_embd is not None + atype_embd = extended_atype_embd[:, :nloc, :] + if paddle.in_dynamic_mode(): + assert list(atype_embd.shape) == [nframes, nloc, self.n_dim] + assert isinstance(atype_embd, paddle.Tensor) # for jit + node_ebd = self.act(atype_embd) + n_dim = node_ebd.shape[-1] + + # get edge and angle embedding input + # nb x nloc x nnei x 1, nb x nloc x nnei x 3 + edge_input, h2 = paddle.split(dmatrix, [1, 3], axis=-1) + if self.edge_init_use_dist: + # nb x nloc x nnei x 1 + edge_input = paddle.linalg.norm(diff, axis=-1, keepdim=True) + # nf x nloc x a_nnei x 3 normalized_diff_i = a_diff / ( paddle.linalg.norm(a_diff, axis=-1, keepdim=True) + 1e-6 @@ -440,19 +538,57 @@ def forward( # nf x nloc x a_nnei x a_nnei # 1 - 1e-6 for paddle.acos stability cosine_ij = paddle.matmul(normalized_diff_i, normalized_diff_j) * (1 - 1e-6) - # nf x nloc x a_nnei x a_nnei x 1 - cosine_ij = cosine_ij.unsqueeze(-1) / (paddle.pi**0.5) - # nf x nloc x a_nnei x a_nnei x a_dim - angle_ebd = self.angle_embd(cosine_ij).reshape( - [nframes, nloc, self.a_sel, self.a_sel, self.a_dim] - ) + angle_input = cosine_ij.unsqueeze(-1) / (paddle.pi**0.5) + + if not parallel_mode and self.use_loc_mapping: + if paddle.in_dynamic_mode(): + assert mapping is not None + # convert nlist from nall to nloc index + nlist = paddle.take_along_axis( + mapping, + 
nlist.reshape([nframes, -1]), + 1, + broadcast=False, + ).reshape(nlist.shape) + if self.use_dynamic_sel: + # get graph index + edge_index, angle_index = get_graph_index( + nlist, + nlist_mask, + a_nlist_mask, + nall, + use_loc_mapping=self.use_loc_mapping, + ) + # flat all the tensors + # n_edge x 1 + edge_input = edge_input[nlist_mask] + # n_edge x 3 + h2 = h2[nlist_mask] + # n_edge x 1 + sw = sw[nlist_mask] + # nb x nloc x a_nnei x a_nnei + a_nlist_mask = a_nlist_mask[:, :, :, None] & a_nlist_mask[:, :, None, :] + # n_angle x 1 + angle_input = angle_input[a_nlist_mask] + # n_angle x 1 + a_sw = (a_sw[:, :, :, None] * a_sw[:, :, None, :])[a_nlist_mask] + else: + # avoid jit assertion + edge_index = paddle.zeros([2, 1], dtype=nlist.dtype) + angle_index = paddle.zeros([3, 1], dtype=nlist.dtype) + # get edge and angle embedding + # nb x nloc x nnei x e_dim [OR] n_edge x e_dim + if not self.edge_init_use_dist: + edge_ebd = self.act(self.edge_embd(edge_input)) + else: + edge_ebd = self.edge_embd(edge_input) + # nf x nloc x a_nnei x a_nnei x a_dim [OR] n_angle x a_dim + angle_ebd = self.angle_embd(angle_input) - # set all padding positions to index of 0 - # if the a neighbor is real or not is indicated by nlist_mask - nlist[nlist == -1] = 0 # nb x nall x n_dim - if comm_dict is None: - assert mapping is not None + if not parallel_mode: + if paddle.in_dynamic_mode(): + assert mapping is not None mapping = ( mapping.reshape([nframes, nall]) .unsqueeze(-1) @@ -460,14 +596,97 @@ def forward( ) for idx, ll in enumerate(self.layers): # node_ebd: nb x nloc x n_dim - # node_ebd_ext: nb x nall x n_dim - if comm_dict is None: - assert mapping is not None - node_ebd_ext = paddle.take_along_axis( - node_ebd, mapping, 1, broadcast=False + # node_ebd_ext: nb x nall x n_dim [OR] nb x nloc x n_dim when not parallel_mode + if not parallel_mode: + if paddle.in_dynamic_mode(): + assert mapping is not None + node_ebd_ext = ( + paddle.take_along_axis(node_ebd, mapping, 1, broadcast=False) + 
if not self.use_loc_mapping + else node_ebd ) else: - raise NotImplementedError("Not implemented") + assert len(comm_dict) >= 6 + has_spin = len(comm_dict) >= 7 + if not has_spin: + n_padding = nall - nloc + if paddle.in_dynamic_mode(): + node_ebd = paddle.nn.functional.pad( + node_ebd.squeeze(0), [0, 0, 0, n_padding], value=0.0 + ) + else: + _fill_shape = node_ebd.shape[1:] + _fill_shape[0] = n_padding + node_ebd = paddle.concat( + [ + node_ebd.squeeze(0), + paddle.zeros(_fill_shape, dtype=node_ebd.dtype), + ], + axis=0, + ) + # [nframes, nloc, tebd_dim] + real_nloc = nloc + real_nall = nall + else: + # for spin + real_nloc = nloc // 2 + real_nall = nall // 2 + real_n_padding = real_nall - real_nloc + node_ebd_real, node_ebd_virtual = paddle.split( + node_ebd, [real_nloc, real_nloc], axis=1 + ) + # mix_node_ebd: nb x real_nloc x (n_dim * 2) + mix_node_ebd = paddle.concat( + [node_ebd_real, node_ebd_virtual], axis=2 + ) + # nb x real_nall x (n_dim * 2) + if paddle.in_dynamic_mode(): + node_ebd = paddle.nn.functional.pad( + mix_node_ebd.squeeze(0), + (0, 0, 0, real_n_padding), + value=0.0, + ) + else: + _fill_shape = mix_node_ebd.shape[1:] + _fill_shape[0] = real_n_padding + node_ebd = paddle.concat( + [ + mix_node_ebd.squeeze(0), + paddle.zeros(_fill_shape, dtype=mix_node_ebd.dtype), + ], + axis=0, + ) + + assert len(comm_dict) >= 6 + ret = paddle_ops_deepmd_border_op( + comm_dict[0], + comm_dict[1], + comm_dict[2], + comm_dict[3], + comm_dict[4], + node_ebd, + comm_dict[5], + paddle.to_tensor( + real_nloc, + dtype=paddle.int32, + place=paddle.CPUPlace(), + ), # should be int of c++, placed on cpu + paddle.to_tensor( + real_nall - real_nloc, + dtype=paddle.int32, + place=paddle.CPUPlace(), + ), # should be int of c++, placed on cpu + ) + if not paddle.in_dynamic_mode(): + ret = paddle.assign(ret) + node_ebd_ext = ret.unsqueeze(0) + if has_spin: + node_ebd_real_ext, node_ebd_virtual_ext = paddle.split( + node_ebd_ext, [n_dim, n_dim], axis=2 + ) + node_ebd_ext = 
concat_switch_virtual( + node_ebd_real_ext, node_ebd_virtual_ext, real_nloc + ) node_ebd, edge_ebd, angle_ebd = ll.forward( node_ebd_ext, edge_ebd, @@ -479,12 +698,27 @@ def forward( a_nlist, a_nlist_mask, a_sw, + edge_index=edge_index, + angle_index=angle_index, ) # nb x nloc x 3 x e_dim - h2g2 = RepFlowLayer._cal_hg(edge_ebd, h2, nlist_mask, sw) + h2g2 = ( + RepFlowLayer._cal_hg(edge_ebd, h2, nlist_mask, sw) + if not self.use_dynamic_sel + else RepFlowLayer._cal_hg_dynamic( + edge_ebd, + h2, + sw, + owner=edge_index[0], + num_owner=nframes * nloc, + nb=nframes, + nloc=nloc, + scale_factor=(self.nnei / self.sel_reduce_factor) ** (-0.5), + ) + ) # (nb x nloc) x e_dim x 3 - rot_mat = paddle.transpose(h2g2, (0, 1, 3, 2)) + rot_mat = paddle.transpose(h2g2, [0, 1, 3, 2]) return ( node_ebd, diff --git a/deepmd/pd/model/descriptor/repformer_layer.py b/deepmd/pd/model/descriptor/repformer_layer.py index b4d93d8301..fc66e1d6af 100644 --- a/deepmd/pd/model/descriptor/repformer_layer.py +++ b/deepmd/pd/model/descriptor/repformer_layer.py @@ -163,6 +163,7 @@ def __init__( attnw_shift: float = 20.0, precision: str = "float64", seed: Optional[Union[int, list[int]]] = None, + trainable: bool = True, ) -> None: """Return neighbor-wise multi-head self-attention maps, with gate mechanism.""" super().__init__() @@ -175,6 +176,7 @@ def __init__( bias=False, precision=precision, seed=seed, + trainable=trainable, ) self.has_gate = has_gate self.smooth = smooth @@ -288,6 +290,7 @@ def __init__( head_num: int, precision: str = "float64", seed: Optional[Union[int, list[int]]] = None, + trainable: bool = True, ) -> None: super().__init__() self.input_dim = input_dim @@ -298,12 +301,14 @@ def __init__( bias=False, precision=precision, seed=child_seed(seed, 0), + trainable=trainable, ) self.head_map = MLPLayer( input_dim * head_num, input_dim, precision=precision, seed=child_seed(seed, 1), + trainable=trainable, ) self.precision = precision @@ -375,12 +380,18 @@ def __init__( head_num: int, 
precision: str = "float64", seed: Optional[Union[int, list[int]]] = None, + trainable: bool = True, ) -> None: super().__init__() self.input_dim = input_dim self.head_num = head_num self.head_map = MLPLayer( - head_num, 1, bias=False, precision=precision, seed=seed + head_num, + 1, + bias=False, + precision=precision, + seed=seed, + trainable=trainable, ) self.precision = precision @@ -448,6 +459,7 @@ def __init__( attnw_shift: float = 20.0, precision: str = "float64", seed: Optional[Union[int, list[int]]] = None, + trainable: bool = True, ) -> None: super().__init__() self.input_dim = input_dim @@ -459,6 +471,7 @@ def __init__( bias=False, precision=precision, seed=child_seed(seed, 0), + trainable=trainable, ) self.mapkv = MLPLayer( input_dim, @@ -466,12 +479,14 @@ def __init__( bias=False, precision=precision, seed=child_seed(seed, 1), + trainable=trainable, ) self.head_map = MLPLayer( input_dim * head_num, input_dim, precision=precision, seed=child_seed(seed, 2), + trainable=trainable, ) self.smooth = smooth self.attnw_shift = attnw_shift @@ -612,6 +627,7 @@ def __init__( g1_out_conv: bool = True, g1_out_mlp: bool = True, seed: Optional[Union[int, list[int]]] = None, + trainable: bool = True, ) -> None: super().__init__() self.epsilon = 1e-4 # protection of 1./nnei @@ -672,6 +688,7 @@ def __init__( self.update_residual_init, precision=precision, seed=child_seed(seed, 0), + trainable=trainable, ) ) @@ -681,6 +698,7 @@ def __init__( g1_dim, precision=precision, seed=child_seed(seed, 1), + trainable=trainable, ) self.linear2 = None self.proj_g1g2 = None @@ -697,6 +715,7 @@ def __init__( g2_dim, precision=precision, seed=child_seed(seed, 2), + trainable=trainable, ) if self.update_style == "res_residual": self.g2_residual.append( @@ -706,6 +725,7 @@ def __init__( self.update_residual_init, precision=precision, seed=child_seed(seed, 3), + trainable=trainable, ) ) if self.g1_out_mlp: @@ -714,6 +734,7 @@ def __init__( g1_dim, precision=precision, seed=child_seed(seed, 
15), + trainable=trainable, ) if self.update_style == "res_residual": self.g1_residual.append( @@ -723,6 +744,7 @@ def __init__( self.update_residual_init, precision=precision, seed=child_seed(seed, 16), + trainable=trainable, ) ) else: @@ -735,6 +757,7 @@ def __init__( bias=False, precision=precision, seed=child_seed(seed, 4), + trainable=trainable, ) else: self.proj_g1g2 = MLPLayer( @@ -743,6 +766,7 @@ def __init__( bias=False, precision=precision, seed=child_seed(seed, 4), + trainable=trainable, ) if self.update_style == "res_residual": self.g1_residual.append( @@ -752,6 +776,7 @@ def __init__( self.update_residual_init, precision=precision, seed=child_seed(seed, 17), + trainable=trainable, ) ) if self.update_g2_has_g1g1: @@ -761,6 +786,7 @@ def __init__( bias=False, precision=precision, seed=child_seed(seed, 5), + trainable=trainable, ) if self.update_style == "res_residual": self.g2_residual.append( @@ -770,6 +796,7 @@ def __init__( self.update_residual_init, precision=precision, seed=child_seed(seed, 6), + trainable=trainable, ) ) if self.update_g2_has_attn or self.update_h2: @@ -781,10 +808,15 @@ def __init__( self.smooth, precision=precision, seed=child_seed(seed, 7), + trainable=trainable, ) if self.update_g2_has_attn: self.attn2_mh_apply = Atten2MultiHeadApply( - g2_dim, attn2_nhead, precision=precision, seed=child_seed(seed, 8) + g2_dim, + attn2_nhead, + precision=precision, + seed=child_seed(seed, 8), + trainable=trainable, ) self.attn2_lm = LayerNorm( g2_dim, @@ -801,12 +833,17 @@ def __init__( self.update_residual_init, precision=precision, seed=child_seed(seed, 10), + trainable=trainable, ) ) if self.update_h2: self.attn2_ev_apply = Atten2EquiVarApply( - g2_dim, attn2_nhead, precision=precision, seed=child_seed(seed, 11) + g2_dim, + attn2_nhead, + precision=precision, + seed=child_seed(seed, 11), + trainable=trainable, ) if self.update_style == "res_residual": self.h2_residual.append( @@ -816,6 +853,7 @@ def __init__( self.update_residual_init, 
precision=precision, seed=child_seed(seed, 12), + trainable=trainable, ) ) if self.update_g1_has_attn: @@ -826,6 +864,7 @@ def __init__( self.smooth, precision=precision, seed=child_seed(seed, 13), + trainable=trainable, ) if self.update_style == "res_residual": self.g1_residual.append( @@ -835,6 +874,7 @@ def __init__( self.update_residual_init, precision=precision, seed=child_seed(seed, 14), + trainable=trainable, ) ) @@ -914,9 +954,7 @@ def _update_g1_conv( ).unsqueeze(-1) else: gg1 = _apply_switch(gg1, sw) - invnnei = (1.0 / float(nnei)) * paddle.ones( - (nb, nloc, 1), dtype=gg1.dtype - ).to(device=gg1.place) + invnnei = (1.0 / float(nnei)) * paddle.ones((nb, nloc, 1), dtype=gg1.dtype) if not self.g1_out_conv: # nb x nloc x ng2 g1_11 = paddle.sum(g2 * gg1, axis=2) * invnnei @@ -986,11 +1024,10 @@ def _cal_hg( if not use_sqrt_nnei: invnnei = (1.0 / float(nnei)) * paddle.ones( (nb, nloc, 1, 1), dtype=g2.dtype - ).to(device=g2.place) + ) else: invnnei = paddle.rsqrt( - float(nnei) - * paddle.ones([nb, nloc, 1, 1], dtype=g2.dtype).to(device=g2.place) + float(nnei) * paddle.ones([nb, nloc, 1, 1], dtype=g2.dtype) ) # nb x nloc x 3 x ng2 h2g2 = paddle.matmul(paddle.transpose(h2, [0, 1, 3, 2]), g2) * invnnei diff --git a/deepmd/pd/model/descriptor/repformers.py b/deepmd/pd/model/descriptor/repformers.py index 32f88dd1d3..f0d18a0908 100644 --- a/deepmd/pd/model/descriptor/repformers.py +++ b/deepmd/pd/model/descriptor/repformers.py @@ -10,6 +10,10 @@ from deepmd.dpmodel.utils.seed import ( child_seed, ) +from deepmd.pd.cxx_op import ( + ENABLE_CUSTOMIZED_OP, + paddle_ops_deepmd, +) from deepmd.pd.model.descriptor.descriptor import ( DescriptorBlock, ) @@ -31,6 +35,9 @@ from deepmd.pd.utils.exclude_mask import ( PairExcludeMask, ) +from deepmd.pd.utils.spin import ( + concat_switch_virtual, +) from deepmd.pd.utils.utils import ( ActivationFn, ) @@ -45,6 +52,30 @@ RepformerLayer, ) +if not ENABLE_CUSTOMIZED_OP: + + def border_op( + argument0, + argument1, + argument2, + 
argument3, + argument4, + argument5, + argument6, + argument7, + argument8, + ) -> paddle.Tensor: + raise NotImplementedError( + "The 'border_op' operator is unavailable because the custom Paddle OP library was not built when freezing the model.\n" + "To install 'border_op', run: python source/op/pd/setup.py install\n" + "For more information, please refer to the DPA3 documentation." + ) + + # Note: this hack cannot actually save a model that can be run using LAMMPS. + paddle_ops_deepmd_border_op = border_op +else: + paddle_ops_deepmd_border_op = paddle_ops_deepmd.border_op + @DescriptorBlock.register("se_repformer") @DescriptorBlock.register("se_uni") @@ -87,6 +118,7 @@ def __init__( use_sqrt_nnei: bool = True, g1_out_conv: bool = True, g1_out_mlp: bool = True, + trainable: bool = True, ) -> None: r""" The repformer descriptor block. @@ -173,11 +205,19 @@ def __init__( The epsilon value for layer normalization. seed : int, optional Random seed for parameter initialization. + trainable : bool + Whether the block is trainable """ super().__init__() self.rcut = float(rcut) + self.register_buffer("buffer_rcut", paddle.to_tensor(self.rcut)) self.rcut_smth = float(rcut_smth) + self.register_buffer("buffer_rcut_smth", paddle.to_tensor(self.rcut_smth)) self.ntypes = ntypes + self.register_buffer( + "buffer_ntypes", paddle.to_tensor(self.ntypes, dtype="int64") + ) + self.nlayers = nlayers sel = [sel] if isinstance(sel, int) else sel self.nnei = sum(sel) @@ -223,7 +263,11 @@ def __init__( self.seed = seed self.g2_embd = MLPLayer( - 1, self.g2_dim, precision=precision, seed=child_seed(seed, 0) + 1, + self.g2_dim, + precision=precision, + seed=child_seed(seed, 0), + trainable=trainable, ) layers = [] for ii in range(nlayers): @@ -261,16 +305,17 @@ def __init__( g1_out_conv=self.g1_out_conv, g1_out_mlp=self.g1_out_mlp, seed=child_seed(child_seed(seed, 1), ii), + trainable=trainable, ) ) self.layers = paddle.nn.LayerList(layers) wanted_shape = (self.ntypes, self.nnei, 4) mean = 
paddle.zeros(wanted_shape, dtype=env.GLOBAL_PD_FLOAT_PRECISION).to( - device=env.DEVICE + env.DEVICE ) stddev = paddle.ones(wanted_shape, dtype=env.GLOBAL_PD_FLOAT_PRECISION).to( - device=env.DEVICE + env.DEVICE ) self.register_buffer("mean", mean) self.register_buffer("stddev", stddev) @@ -284,6 +329,14 @@ def get_rcut_smth(self) -> float: """Returns the radius where the neighbor information starts to smoothly decay to 0.""" return self.rcut_smth + def get_buffer_rcut(self) -> paddle.Tensor: + """Returns the cut-off radius as a buffer-style Tensor.""" + return self.buffer_rcut + + def get_buffer_rcut_smth(self) -> paddle.Tensor: + """Returns the radius where the neighbor information starts to smoothly decay to 0 as a buffer-style Tensor.""" + return self.buffer_rcut_smth + def get_nsel(self) -> int: """Returns the number of selected atoms in the cut-off radius.""" return sum(self.sel) @@ -294,7 +347,7 @@ def get_sel(self) -> list[int]: def get_ntypes(self) -> int: """Returns the number of element types.""" - return self.ntypes + return self.ntypes if paddle.in_dynamic_mode() else self.buffer_ntypes def get_dim_out(self) -> int: """Returns the output dimension.""" @@ -370,9 +423,9 @@ def forward( extended_atype_embd: Optional[paddle.Tensor] = None, mapping: Optional[paddle.Tensor] = None, type_embedding: Optional[paddle.Tensor] = None, - comm_dict: Optional[dict[str, paddle.Tensor]] = None, + comm_dict: Optional[list[paddle.Tensor]] = None, ): - if comm_dict is None: + if (comm_dict is None or len(comm_dict) == 0) and paddle.in_dynamic_mode(): assert mapping is not None assert extended_atype_embd is not None nframes, nloc, nnei = nlist.shape @@ -398,7 +451,7 @@ def forward( sw = sw.masked_fill(~nlist_mask, 0.0) # [nframes, nloc, tebd_dim] - if comm_dict is None: + if comm_dict is None or len(comm_dict) == 0: if paddle.in_dynamic_mode(): assert isinstance(extended_atype_embd, paddle.Tensor) # for jit atype_embd = extended_atype_embd[:, :nloc, :] @@ -406,8 +459,8 @@ 
def forward( assert list(atype_embd.shape) == [nframes, nloc, self.g1_dim] else: atype_embd = extended_atype_embd - if paddle.in_dynamic_mode(): - assert isinstance(atype_embd, paddle.Tensor) # for jit + if paddle.in_dynamic_mode(): + assert isinstance(atype_embd, paddle.Tensor) # for jit g1 = self.act(atype_embd) ng1 = g1.shape[-1] # nb x nloc x nnei x 1, nb x nloc x nnei x 3 @@ -424,8 +477,9 @@ def forward( # if the a neighbor is real or not is indicated by nlist_mask nlist[nlist == -1] = 0 # nb x nall x ng1 - if comm_dict is None: - assert mapping is not None + if comm_dict is None or len(comm_dict) == 0: + if paddle.in_dynamic_mode(): + assert mapping is not None mapping = ( mapping.reshape([nframes, nall]) .unsqueeze(-1) @@ -434,14 +488,91 @@ def forward( for idx, ll in enumerate(self.layers): # g1: nb x nloc x ng1 # g1_ext: nb x nall x ng1 - if comm_dict is None: - assert mapping is not None + if comm_dict is None or len(comm_dict) == 0: + if paddle.in_dynamic_mode(): + assert mapping is not None g1_ext = paddle.take_along_axis( g1, axis=1, indices=mapping, broadcast=False ) else: - raise NotImplementedError("Not implemented yet") - + has_spin = len(comm_dict) >= 7 + if not has_spin: + n_padding = nall - nloc + if paddle.in_dynamic_mode(): + g1 = paddle.nn.functional.pad( + g1.squeeze(0), + (0, 0, 0, n_padding), + value=0.0, + pad_from_left_axis=False, + ) + else: + _fill_shape = g1.shape[1:] + _fill_shape[0] = n_padding + g1 = paddle.concat( + [g1.squeeze(0), paddle.zeros(_fill_shape, dtype=g1.dtype)], + axis=0, + ) + real_nloc = nloc + real_nall = nall + else: + # for spin + real_nloc = nloc // 2 + real_nall = nall // 2 + real_n_padding = real_nall - real_nloc + g1_real, g1_virtual = paddle.split( + g1, [real_nloc, real_nloc], axis=1 + ) + # mix_g1: nb x real_nloc x (ng1 * 2) + mix_g1 = paddle.concat([g1_real, g1_virtual], axis=2) + # nb x real_nall x (ng1 * 2) + if paddle.in_dynamic_mode(): + g1 = paddle.nn.functional.pad( + mix_g1.squeeze(0), + (0, 0, 0, 
real_n_padding), + value=0.0, + pad_from_left_axis=False, + ) + else: + _fill_shape = mix_g1.shape[1:] + _fill_shape[0] = real_n_padding + g1 = paddle.concat( + [ + mix_g1.squeeze(0), + paddle.zeros(_fill_shape, dtype=mix_g1.dtype), + ], + axis=0, + ) + + assert len(comm_dict) >= 6 + ret = paddle_ops_deepmd_border_op( + comm_dict[0], + comm_dict[1], + comm_dict[2], + comm_dict[3], + comm_dict[4], + g1, + comm_dict[5], + paddle.to_tensor( + [real_nloc], + dtype=paddle.int32, + place=paddle.CPUPlace(), + ), # should be int of c++, placed on cpu + paddle.to_tensor( + [real_nall - real_nloc], + dtype=paddle.int32, + place=paddle.CPUPlace(), + ), # should be int of c++, placed on cpu + ) + if not paddle.in_dynamic_mode(): + ret = paddle.assign(ret) + g1_ext = ret.unsqueeze(0) + if has_spin: + g1_real_ext, g1_virtual_ext = paddle.split( + g1_ext, [ng1, ng1], axis=2 + ) + g1_ext = concat_switch_virtual( + g1_real_ext, g1_virtual_ext, real_nloc + ) g1, g2, h2 = ll.forward( g1_ext, g2, @@ -503,11 +634,11 @@ def compute_input_stats( mean, stddev = env_mat_stat() if not self.set_davg_zero: paddle.assign( - paddle.to_tensor(mean, dtype=self.mean.dtype).to(device=env.DEVICE), + paddle.to_tensor(mean, dtype=self.mean.dtype).to(env.DEVICE), self.mean, ) # pylint: disable=no-explicit-dtype paddle.assign( - paddle.to_tensor(stddev, dtype=self.stddev.dtype).to(device=env.DEVICE), + paddle.to_tensor(stddev, dtype=self.stddev.dtype).to(env.DEVICE), self.stddev, ) # pylint: disable=no-explicit-dtype diff --git a/deepmd/pd/model/descriptor/se_a.py b/deepmd/pd/model/descriptor/se_a.py index 7b70a742ce..109c7ba3c4 100644 --- a/deepmd/pd/model/descriptor/se_a.py +++ b/deepmd/pd/model/descriptor/se_a.py @@ -95,6 +95,11 @@ def __init__( raise NotImplementedError("old implementation of spin is not supported.") super().__init__() self.type_map = type_map + if type_map is not None: + self.register_buffer( + "buffer_type_map", + paddle.to_tensor([ord(c) for c in " ".join(type_map)]), + ) 
self.compress = False self.prec = PRECISION_DICT[precision] self.sea = DescrptBlockSeA( @@ -122,6 +127,14 @@ def get_rcut_smth(self) -> float: """Returns the radius where the neighbor information starts to smoothly decay to 0.""" return self.sea.get_rcut_smth() + def get_buffer_rcut(self) -> paddle.Tensor: + """Returns the cut-off radius as a buffer-style Tensor.""" + return self.sea.get_buffer_rcut() + + def get_buffer_rcut_smth(self) -> paddle.Tensor: + """Returns the radius where the neighbor information starts to smoothly decay to 0 as a buffer-style Tensor.""" + return self.sea.get_buffer_rcut_smth() + def get_nsel(self) -> int: """Returns the number of selected atoms in the cut-off radius.""" return self.sea.get_nsel() @@ -138,6 +151,18 @@ def get_type_map(self) -> list[str]: """Get the name to each type of atoms.""" return self.type_map + def get_buffer_type_map(self) -> paddle.Tensor: + """ + Return the type map as a buffer-style Tensor for JIT saving. + + The original type map (e.g., ['Ni', 'O']) is first joined into a single space-separated string + (e.g., "Ni O"). Each character in this string is then converted to its ASCII code using `ord()`, + and the resulting integer sequence is stored as a 1D paddle.Tensor of dtype int. + + This format allows the type map to be serialized as a raw byte buffer during JIT model saving. + """ + return self.buffer_type_map + def get_dim_out(self) -> int: """Returns the output dimension.""" return self.sea.get_dim_out() @@ -260,7 +285,7 @@ def forward( atype_ext: paddle.Tensor, nlist: paddle.Tensor, mapping: Optional[paddle.Tensor] = None, - comm_dict: Optional[dict[str, paddle.Tensor]] = None, + comm_dict: Optional[list[paddle.Tensor]] = None, ): """Compute the descriptor. 
@@ -438,7 +463,9 @@ def __init__( """ super().__init__() self.rcut = float(rcut) + self.register_buffer("buffer_rcut", paddle.to_tensor(self.rcut)) self.rcut_smth = float(rcut_smth) + self.register_buffer("buffer_rcut_smth", paddle.to_tensor(self.rcut_smth)) self.neuron = neuron self.filter_neuron = self.neuron self.axis_neuron = axis_neuron @@ -449,6 +476,9 @@ def __init__( self.resnet_dt = resnet_dt self.env_protection = env_protection self.ntypes = len(sel) + self.register_buffer( + "buffer_ntypes", paddle.to_tensor(self.ntypes, dtype="int64") + ) self.type_one_side = type_one_side self.seed = seed # order matters, placed after the assignment of self.ntypes @@ -481,6 +511,7 @@ def __init__( precision=self.precision, resnet_dt=self.resnet_dt, seed=child_seed(self.seed, ii), + trainable=trainable, ) self.filter_layers = filter_layers self.stats = None @@ -512,6 +543,14 @@ def get_rcut_smth(self) -> float: """Returns the radius where the neighbor information starts to smoothly decay to 0.""" return self.rcut_smth + def get_buffer_rcut(self) -> paddle.Tensor: + """Returns the cut-off radius as a buffer-style Tensor.""" + return self.buffer_rcut + + def get_buffer_rcut_smth(self) -> paddle.Tensor: + """Returns the radius where the neighbor information starts to smoothly decay to 0 as a buffer-style Tensor.""" + return self.buffer_rcut_smth + def get_nsel(self) -> int: """Returns the number of selected atoms in the cut-off radius.""" return sum(self.sel) @@ -522,7 +561,7 @@ def get_sel(self) -> list[int]: def get_ntypes(self) -> int: """Returns the number of element types.""" - return self.ntypes + return self.ntypes if paddle.in_dynamic_mode() else self.buffer_ntypes def get_dim_out(self) -> int: """Returns the output dimension.""" diff --git a/deepmd/pd/model/descriptor/se_atten.py b/deepmd/pd/model/descriptor/se_atten.py index 6bec47b12e..ceae16f409 100644 --- a/deepmd/pd/model/descriptor/se_atten.py +++ b/deepmd/pd/model/descriptor/se_atten.py @@ -81,6 +81,7 @@ 
def __init__( ln_eps: Optional[float] = 1e-5, seed: Optional[Union[int, list[int]]] = None, type: Optional[str] = None, + trainable: bool = True, ) -> None: r"""Construct an embedding net of type `se_atten`. @@ -146,11 +147,15 @@ def __init__( If not None, the scaling of attention weights is `temperature` itself. seed : int, Optional Random seed for parameter initialization. + trainable : bool, default: True + Whether this block is trainable """ super().__init__() del type self.rcut = float(rcut) + self.register_buffer("buffer_rcut", paddle.to_tensor(self.rcut)) self.rcut_smth = float(rcut_smth) + self.register_buffer("buffer_rcut_smth", paddle.to_tensor(self.rcut_smth)) self.neuron = neuron self.filter_neuron = self.neuron self.axis_neuron = axis_neuron @@ -182,6 +187,10 @@ def __init__( sel = [sel] self.ntypes = ntypes + self.register_buffer( + "buffer_ntypes", paddle.to_tensor(self.ntypes, dtype="int64") + ) + self.sel = sel self.sec = self.sel self.split_sel = self.sel @@ -205,6 +214,7 @@ def __init__( smooth=self.smooth, precision=self.precision, seed=child_seed(self.seed, 0), + trainable=trainable, ) wanted_shape = (self.ntypes, self.nnei, 4) @@ -229,6 +239,7 @@ def __init__( precision=self.precision, resnet_dt=self.resnet_dt, seed=child_seed(self.seed, 1), + trainable=trainable, ) self.filter_layers = filter_layers if self.tebd_input_mode in ["strip"]: @@ -242,6 +253,7 @@ def __init__( precision=self.precision, resnet_dt=self.resnet_dt, seed=child_seed(self.seed, 2), + trainable=trainable, ) self.filter_layers_strip = filter_layers_strip self.stats = None @@ -272,6 +284,14 @@ def get_rcut_smth(self) -> float: """Returns the radius where the neighbor information starts to smoothly decay to 0.""" return self.rcut_smth + def get_buffer_rcut(self) -> paddle.Tensor: + """Returns the cut-off radius as a buffer-style Tensor.""" + return self.buffer_rcut + + def get_buffer_rcut_smth(self) -> paddle.Tensor: + """Returns the radius where the neighbor information 
starts to smoothly decay to 0 as a buffer-style Tensor.""" + return self.buffer_rcut_smth + def get_nsel(self) -> int: """Returns the number of selected atoms in the cut-off radius.""" return sum(self.sel) @@ -282,7 +302,7 @@ def get_sel(self) -> list[int]: def get_ntypes(self) -> int: """Returns the number of element types.""" - return self.ntypes + return self.ntypes if paddle.in_dynamic_mode() else self.buffer_ntypes def get_dim_in(self) -> int: """Returns the input dimension.""" @@ -519,7 +539,10 @@ def forward( index = nlist.reshape([nb, nloc * nnei]).unsqueeze(-1).expand([-1, -1, nt]) # nb x (nloc x nnei) x nt atype_tebd_nlist = paddle.take_along_axis( - atype_tebd_ext, axis=1, indices=index + atype_tebd_ext, + axis=1, + indices=index, + broadcast=False, ) # j # nb x nloc x nnei x nt atype_tebd_nlist = atype_tebd_nlist.reshape([nb, nloc, nnei, nt]) @@ -557,14 +580,16 @@ def forward( nlist_index = nlist.reshape([nb, nloc * nnei]) # nf x (nl x nnei) nei_type = paddle.take_along_axis( - extended_atype, indices=nlist_index, axis=1 + extended_atype, indices=nlist_index, axis=1, broadcast=False ) # (nf x nl x nnei) x ng nei_type_index = nei_type.reshape([-1, 1]).expand([-1, ng]).to(paddle.int64) if self.type_one_side: tt_full = self.filter_layers_strip.networks[0](type_embedding) # (nf x nl x nnei) x ng - gg_t = paddle.take_along_axis(tt_full, indices=nei_type_index, axis=0) + gg_t = paddle.take_along_axis( + tt_full, indices=nei_type_index, axis=0, broadcast=False + ) else: idx_i = paddle.tile( atype.reshape([-1, 1]) * ntypes_with_padding, [1, nnei] @@ -588,7 +613,9 @@ def forward( ).reshape([-1, nt * 2]) tt_full = self.filter_layers_strip.networks[0](two_side_type_embedding) # (nf x nl x nnei) x ng - gg_t = paddle.take_along_axis(tt_full, axis=0, indices=idx) + gg_t = paddle.take_along_axis( + tt_full, axis=0, indices=idx, broadcast=False + ) # (nf x nl) x nnei x ng gg_t = gg_t.reshape([nfnl, nnei, ng]) if self.smooth: @@ -655,6 +682,7 @@ def __init__( smooth: 
bool = True, precision: str = DEFAULT_PRECISION, seed: Optional[Union[int, list[int]]] = None, + trainable: bool = True, ) -> None: """Construct a neighbor-wise attention net.""" super().__init__() @@ -690,6 +718,7 @@ def __init__( smooth=smooth, precision=precision, seed=child_seed(seed, i), + trainable=trainable, ) ) self.attention_layers = nn.LayerList(attention_layers) @@ -797,6 +826,7 @@ def __init__( ln_eps: float = 1e-5, precision: str = DEFAULT_PRECISION, seed: Optional[Union[int, list[int]]] = None, + trainable: bool = True, ) -> None: """Construct a neighbor-wise attention layer.""" super().__init__() @@ -824,6 +854,7 @@ def __init__( smooth=smooth, precision=precision, seed=child_seed(seed, 0), + trainable=trainable, ) self.attn_layer_norm = LayerNorm( self.embed_dim, @@ -904,6 +935,7 @@ def __init__( smooth: bool = True, precision: str = DEFAULT_PRECISION, seed: Optional[Union[int, list[int]]] = None, + trainable: bool = True, ) -> None: """Construct a multi-head neighbor-wise attention net.""" super().__init__() @@ -936,6 +968,7 @@ def __init__( stddev=1.0, precision=precision, seed=child_seed(seed, 0), + trainable=trainable, ) self.out_proj = MLPLayer( hidden_dim, @@ -946,6 +979,7 @@ def __init__( stddev=1.0, precision=precision, seed=child_seed(seed, 1), + trainable=trainable, ) def forward( diff --git a/deepmd/pd/model/descriptor/se_t_tebd.py b/deepmd/pd/model/descriptor/se_t_tebd.py index 2898283f0c..e9d4053612 100644 --- a/deepmd/pd/model/descriptor/se_t_tebd.py +++ b/deepmd/pd/model/descriptor/se_t_tebd.py @@ -160,10 +160,16 @@ def __init__( env_protection=env_protection, smooth=smooth, seed=child_seed(seed, 1), + trainable=trainable, ) self.prec = PRECISION_DICT[precision] self.use_econf_tebd = use_econf_tebd self.type_map = type_map + if type_map is not None: + self.register_buffer( + "buffer_type_map", + paddle.to_tensor([ord(c) for c in " ".join(type_map)]), + ) self.smooth = smooth self.type_embedding = TypeEmbedNet( ntypes, @@ -173,6 +179,7 
@@ def __init__( use_econf_tebd=use_econf_tebd, type_map=type_map, use_tebd_bias=use_tebd_bias, + trainable=trainable, ) self.tebd_dim = tebd_dim self.tebd_input_mode = tebd_input_mode @@ -206,6 +213,18 @@ def get_type_map(self) -> list[str]: """Get the name to each type of atoms.""" return self.type_map + def get_buffer_type_map(self) -> paddle.Tensor: + """ + Return the type map as a buffer-style Tensor for JIT saving. + + The original type map (e.g., ['Ni', 'O']) is first joined into a single space-separated string + (e.g., "Ni O"). Each character in this string is then converted to its ASCII code using `ord()`, + and the resulting integer sequence is stored as a 1D paddle.Tensor of dtype int. + + This format allows the type map to be serialized as a raw byte buffer during JIT model saving. + """ + return self.buffer_type_map + def get_dim_out(self) -> int: """Returns the output dimension.""" ret = self.se_ttebd.get_dim_out() @@ -413,7 +432,7 @@ def forward( extended_atype: paddle.Tensor, nlist: paddle.Tensor, mapping: Optional[paddle.Tensor] = None, - comm_dict: Optional[dict[str, paddle.Tensor]] = None, + comm_dict: Optional[list[paddle.Tensor]] = None, ): """Compute the descriptor. 
@@ -529,10 +548,13 @@ def __init__( env_protection: float = 0.0, smooth: bool = True, seed: Optional[Union[int, list[int]]] = None, + trainable: bool = True, ) -> None: super().__init__() self.rcut = float(rcut) + self.register_buffer("buffer_rcut", paddle.to_tensor(self.rcut)) self.rcut_smth = float(rcut_smth) + self.register_buffer("buffer_rcut_smth", paddle.to_tensor(self.rcut_smth)) self.neuron = neuron self.filter_neuron = self.neuron self.tebd_dim = tebd_dim @@ -550,6 +572,10 @@ def __init__( sel = [sel] self.ntypes = ntypes + self.register_buffer( + "buffer_ntypes", paddle.to_tensor(self.ntypes, dtype="int64") + ) + self.sel = sel self.sec = self.sel self.split_sel = self.sel @@ -585,6 +611,7 @@ def __init__( precision=self.precision, resnet_dt=self.resnet_dt, seed=child_seed(self.seed, 1), + trainable=trainable, ) self.filter_layers = filter_layers if self.tebd_input_mode in ["strip"]: @@ -598,6 +625,7 @@ def __init__( precision=self.precision, resnet_dt=self.resnet_dt, seed=child_seed(self.seed, 2), + trainable=trainable, ) self.filter_layers_strip = filter_layers_strip self.stats = None @@ -610,6 +638,14 @@ def get_rcut_smth(self) -> float: """Returns the radius where the neighbor information starts to smoothly decay to 0.""" return self.rcut_smth + def get_buffer_rcut(self) -> paddle.Tensor: + """Returns the cut-off radius as a buffer-style Tensor.""" + return self.buffer_rcut + + def get_buffer_rcut_smth(self) -> paddle.Tensor: + """Returns the radius where the neighbor information starts to smoothly decay to 0 as a buffer-style Tensor.""" + return self.buffer_rcut_smth + def get_nsel(self) -> int: """Returns the number of selected atoms in the cut-off radius.""" return sum(self.sel) @@ -620,7 +656,7 @@ def get_sel(self) -> list[int]: def get_ntypes(self) -> int: """Returns the number of element types.""" - return self.ntypes + return self.ntypes if paddle.in_dynamic_mode() else self.buffer_ntypes def get_dim_in(self) -> int: """Returns the input 
dimension.""" @@ -840,7 +876,7 @@ def forward( # nb x (nloc x nnei) x nt # atype_tebd_nlist = paddle.take_along_axis(atype_tebd_ext, axis=1, index=index) atype_tebd_nlist = paddle.take_along_axis( - atype_tebd_ext, axis=1, indices=index + atype_tebd_ext, axis=1, indices=index, broadcast=False ) # nb x nloc x nnei x nt atype_tebd_nlist = atype_tebd_nlist.reshape([nb, nloc, nnei, nt]) @@ -864,7 +900,7 @@ def forward( nlist_index = nlist.reshape([nb, nloc * nnei]) # nf x (nl x nnei) nei_type = paddle.take_along_axis( - extended_atype, indices=nlist_index, axis=1 + extended_atype, indices=nlist_index, axis=1, broadcast=False ) # nfnl x nnei nei_type = nei_type.reshape([nfnl, nnei]) @@ -897,7 +933,7 @@ def forward( ).reshape([-1, nt * 2]) tt_full = self.filter_layers_strip.networks[0](two_side_type_embedding) # (nfnl x nt_i x nt_j) x ng - gg_t = paddle.take_along_axis(tt_full, indices=idx, axis=0) + gg_t = paddle.take_along_axis(tt_full, indices=idx, axis=0, broadcast=False) # (nfnl x nt_i x nt_j) x ng gg_t = gg_t.reshape([nfnl, nnei, nnei, ng]) if self.smooth: diff --git a/deepmd/pd/model/model/ener_model.py b/deepmd/pd/model/model/ener_model.py index a5b1b9d4b3..36cd1211ba 100644 --- a/deepmd/pd/model/model/ener_model.py +++ b/deepmd/pd/model/model/ener_model.py @@ -34,6 +34,18 @@ def __init__( DPModelCommon.__init__(self) DPEnergyModel_.__init__(self, *args, **kwargs) + def get_buffer_type_map(self) -> paddle.Tensor: + """ + Return the type map as a buffer-style Tensor for JIT saving. + + The original type map (e.g., ['Ni', 'O']) is first joined into a single space-separated string + (e.g., "Ni O"). Each character in this string is then converted to its ASCII code using `ord()`, + and the resulting integer sequence is stored as a 1D paddle.Tensor of dtype int. + + This format allows the type map to be serialized as a raw byte buffer during JIT model saving. 
+ """ + return super().get_buffer_type_map() + def translated_output_def(self): out_def_data = self.model_output_def().get_data() output_def = { @@ -81,6 +93,10 @@ def forward( model_predict["atom_virial"] = model_ret["energy_derv_c"].squeeze( -3 ) + else: + model_predict["atom_virial"] = paddle.zeros( + [model_predict["energy"].shape[0], 1, 9], dtype=paddle.float64 + ) else: model_predict["force"] = model_ret["dforce"] if "mask" in model_ret: @@ -99,7 +115,7 @@ def forward_lower( fparam: Optional[paddle.Tensor] = None, aparam: Optional[paddle.Tensor] = None, do_atomic_virial: bool = False, - comm_dict: Optional[dict[str, paddle.Tensor]] = None, + comm_dict: Optional[list[paddle.Tensor]] = None, ): model_ret = self.forward_common_lower( extended_coord, @@ -124,6 +140,10 @@ def forward_lower( model_predict["extended_virial"] = model_ret[ "energy_derv_c" ].squeeze(-3) + else: + model_predict["extended_virial"] = paddle.zeros( + [model_predict["energy"].shape[0], 1, 9], dtype=paddle.float64 + ) else: assert model_ret["dforce"] is not None model_predict["dforce"] = model_ret["dforce"] diff --git a/deepmd/pd/model/model/make_model.py b/deepmd/pd/model/model/make_model.py index acb237b5ac..42c406f8d7 100644 --- a/deepmd/pd/model/model/make_model.py +++ b/deepmd/pd/model/model/make_model.py @@ -238,7 +238,7 @@ def forward_common_lower( fparam: Optional[paddle.Tensor] = None, aparam: Optional[paddle.Tensor] = None, do_atomic_virial: bool = False, - comm_dict: Optional[dict[str, paddle.Tensor]] = None, + comm_dict: Optional[list[paddle.Tensor]] = None, extra_nlist_sort: bool = False, ): """Return model prediction. 
Lower interface that takes @@ -364,7 +364,7 @@ def output_type_cast( continue if check_operation_applied(odef[kk], OutputVariableOperation.REDU): model_ret[kk] = ( - model_ret[kk].to(self.global_pd_ener_float_precision) + model_ret[kk].astype(self.global_pd_ener_float_precision) if model_ret[kk] is not None else None ) @@ -445,7 +445,7 @@ def _format_nlist( * paddle.ones( [n_nf, n_nloc, nnei - n_nnei], dtype=nlist.dtype, - ).to(nlist.place), + ), ], axis=-1, ) @@ -458,17 +458,21 @@ def _format_nlist( coord0 = extended_coord[:, :n_nloc, :] # nf x (nloc x nnei) x 3 index = nlist.reshape([n_nf, n_nloc * n_nnei, 1]).expand([-1, -1, 3]) - coord1 = paddle.take_along_axis(extended_coord, axis=1, indices=index) + coord1 = paddle.take_along_axis( + extended_coord, axis=1, indices=index, broadcast=False + ) # nf x nloc x nnei x 3 coord1 = coord1.reshape([n_nf, n_nloc, n_nnei, 3]) # nf x nloc x nnei rr = paddle.linalg.norm(coord0[:, :, None, :] - coord1, axis=-1) - rr = paddle.where(m_real_nei, rr, float("inf")) + rr = paddle.where(m_real_nei, rr, paddle.full_like(rr, float("inf"))) rr, nlist_mapping = ( paddle.sort(rr, axis=-1), paddle.argsort(rr, axis=-1), ) - nlist = paddle.take_along_axis(nlist, axis=2, indices=nlist_mapping) + nlist = paddle.take_along_axis( + nlist, axis=2, indices=nlist_mapping, broadcast=False + ) nlist = paddle.where(rr > rcut, paddle.full_like(nlist, -1), nlist) nlist = nlist[..., :nnei] else: # not extra_nlist_sort and n_nnei <= nnei: @@ -525,6 +529,14 @@ def get_dim_aparam(self) -> int: """Get the number (dimension) of atomic parameters of this atomic model.""" return self.atomic_model.get_dim_aparam() + def get_buffer_dim_fparam(self) -> paddle.Tensor: + """Get the number (dimension) of frame parameters of this atomic model as a buffer-style Tensor.""" + return self.atomic_model.get_buffer_dim_fparam() + + def get_buffer_dim_aparam(self) -> paddle.Tensor: + """Get the number (dimension) of atomic parameters of this atomic model as a buffer-style 
Tensor.""" + return self.atomic_model.get_buffer_dim_aparam() + def get_sel_type(self) -> list[int]: """Get the selected atom types of this model. @@ -549,6 +561,22 @@ def get_type_map(self) -> list[str]: """Get the type map.""" return self.atomic_model.get_type_map() + def get_buffer_rcut(self) -> paddle.Tensor: + """Get the cut-off radius as a buffer-style Tensor.""" + return self.atomic_model.get_buffer_rcut() + + def get_buffer_type_map(self) -> paddle.Tensor: + """ + Return the type map as a buffer-style Tensor for JIT saving. + + The original type map (e.g., ['Ni', 'O']) is first joined into a single space-separated string + (e.g., "Ni O"). Each character in this string is then converted to its ASCII code using `ord()`, + and the resulting integer sequence is stored as a 1D paddle.Tensor of dtype int. + + This format allows the type map to be serialized as a raw byte buffer during JIT model saving. + """ + return self.atomic_model.get_buffer_type_map() + def get_nsel(self) -> int: """Returns the total number of selected neighboring atoms in the cut-off radius.""" return self.atomic_model.get_nsel() diff --git a/deepmd/pd/model/model/model.py b/deepmd/pd/model/model/model.py index 06a2c6910f..0151a9c36b 100644 --- a/deepmd/pd/model/model/model.py +++ b/deepmd/pd/model/model/model.py @@ -53,3 +53,7 @@ def get_min_nbor_dist(self) -> Optional[float]: def get_ntypes(self): """Returns the number of element types.""" return len(self.get_type_map()) + + def get_buffer_ntypes(self) -> paddle.Tensor: + """Returns the number of element types as a buffer-style Tensor.""" + return self.get_buffer_ntypes() diff --git a/deepmd/pd/model/model/transform_output.py b/deepmd/pd/model/model/transform_output.py index 469bfd3168..47004265c7 100644 --- a/deepmd/pd/model/model/transform_output.py +++ b/deepmd/pd/model/model/transform_output.py @@ -223,9 +223,7 @@ def communicate_extended_output( mapping = mapping.reshape(mldims + [1] * len(derv_r_ext_dims)).expand( [-1] * len(mldims) 
+ derv_r_ext_dims ) - force = paddle.zeros(vldims + derv_r_ext_dims, dtype=vv.dtype).to( - device=vv.place - ) + force = paddle.zeros(vldims + derv_r_ext_dims, dtype=vv.dtype) # nf x nloc x nvar x 3 new_ret[kk_derv_r] = decomp.scatter_reduce( force, @@ -242,9 +240,7 @@ def communicate_extended_output( mapping, [1] * (len(mldims) + len(vdef.shape)) + [3], ) - virial = paddle.zeros(vldims + derv_c_ext_dims, dtype=vv.dtype).to( - device=vv.place - ) + virial = paddle.zeros(vldims + derv_c_ext_dims, dtype=vv.dtype) # nf x nloc x nvar x 9 new_ret[kk_derv_c] = decomp.scatter_reduce( virial, @@ -254,9 +250,9 @@ def communicate_extended_output( reduce="sum", ) new_ret[kk_derv_c + "_redu"] = paddle.sum( - new_ret[kk_derv_c].to(redu_prec), axis=1 + new_ret[kk_derv_c].astype(redu_prec), axis=1 ) - if not do_atomic_virial: + if not do_atomic_virial and paddle.in_dynamic_mode(): # pop atomic virial, because it is not correctly calculated. new_ret.pop(kk_derv_c) return new_ret diff --git a/deepmd/pd/model/network/mlp.py b/deepmd/pd/model/network/mlp.py index 41286fbbae..ee408b8719 100644 --- a/deepmd/pd/model/network/mlp.py +++ b/deepmd/pd/model/network/mlp.py @@ -85,8 +85,10 @@ def __init__( precision: str = DEFAULT_PRECISION, init: str = "default", seed: int | list[int] | None = None, + trainable: bool = True, ): super().__init__() + self.trainable = trainable # only use_timestep when skip connection is established. 
self.use_timestep = use_timestep and ( num_out == num_in or num_out == num_in * 2 @@ -251,6 +253,7 @@ def serialize(self) -> dict: activation_function=self.activate_name, resnet=self.resnet, precision=self.precision, + trainable=self.trainable, ) nl.w, nl.b, nl.idt = ( to_numpy_array(self.matrix), @@ -277,6 +280,7 @@ def deserialize(cls, data: dict) -> MLPLayer: activation_function=nl["activation_function"], resnet=nl["resnet"], precision=nl["precision"], + trainable=nl["trainable"], ) prec = PRECISION_DICT[obj.precision] diff --git a/deepmd/pd/model/network/network.py b/deepmd/pd/model/network/network.py index 9cdb7b3adc..68053896d1 100644 --- a/deepmd/pd/model/network/network.py +++ b/deepmd/pd/model/network/network.py @@ -45,6 +45,7 @@ def __init__( use_econf_tebd=False, use_tebd_bias: bool = False, type_map=None, + trainable: bool = True, ) -> None: """Construct a type embedding net.""" super().__init__() @@ -65,6 +66,7 @@ def __init__( type_map=type_map, precision=precision, seed=seed, + trainable=trainable, ) # init.normal_(self.embedding.weight[:-1], mean=bavg, std=stddev) @@ -195,6 +197,7 @@ def __init__( self.precision, self.seed, bias=self.use_tebd_bias, + trainable=trainable, ) for param in self.parameters(): param.stop_gradient = not trainable @@ -208,9 +211,7 @@ def forward(self, device: str): Type embedding network. 
""" if not self.use_econf_tebd: - embed = self.embedding_net( - paddle.eye(self.ntypes, dtype=self.prec).to(device=device) - ) + embed = self.embedding_net(paddle.eye(self.ntypes, dtype=self.prec)) else: assert self.econf_tebd is not None embed = self.embedding_net(self.econf_tebd.to(device)) @@ -218,9 +219,7 @@ def forward(self, device: str): embed = paddle.concat( [ embed, - paddle.zeros([1, embed.shape[1]], dtype=self.prec).to( - device=device - ), + paddle.zeros([1, embed.shape[1]], dtype=self.prec), ] ) return embed diff --git a/deepmd/pd/model/network/utils.py b/deepmd/pd/model/network/utils.py new file mode 100644 index 0000000000..9fae72c2cc --- /dev/null +++ b/deepmd/pd/model/network/utils.py @@ -0,0 +1,143 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +from typing import ( + Optional, +) + +import paddle + + +def aggregate( + data: paddle.Tensor, + owners: paddle.Tensor, + average: bool = True, + num_owner: Optional[int] = None, +) -> paddle.Tensor: + """ + Aggregate rows in data by specifying the owners. + + Parameters + ---------- + data : data tensor to aggregate [n_row, feature_dim] + owners : specify the owner of each row [n_row, 1] + average : if True, average the rows, if False, sum the rows. 
+ Default = True + num_owner : the number of owners, this is needed if the + max idx of owner is not presented in owners tensor + Default = None + + Returns + ------- + output: [num_owner, feature_dim] + """ + if num_owner is None or average: + # requires bincount + bin_count = paddle.bincount(owners) + bin_count = bin_count.where(bin_count != 0, paddle.ones_like(bin_count)) + + if (num_owner is not None) and (bin_count.shape[0] != num_owner): + difference = num_owner - bin_count.shape[0] + bin_count = paddle.concat( + [bin_count, paddle.ones([difference], dtype=bin_count.dtype)] + ) + else: + bin_count = None + + # make sure this operation is done on the same device of data and owners + output = paddle.zeros([num_owner, data.shape[1]]) + output = output.index_add_(owners, 0, data.astype(output.dtype)) + if average: + assert bin_count is not None + output = (output.T / bin_count).T + return output + + +def get_graph_index( + nlist: paddle.Tensor, + nlist_mask: paddle.Tensor, + a_nlist_mask: paddle.Tensor, + nall: int, + use_loc_mapping: bool = True, +): + """ + Get the index mapping for edge graph and angle graph, ready in `aggregate` or `index_select`. + + Parameters + ---------- + nlist : nf x nloc x nnei + Neighbor list. (padded neis are set to 0) + nlist_mask : nf x nloc x nnei + Masks of the neighbor list. real nei 1 otherwise 0 + a_nlist_mask : nf x nloc x a_nnei + Masks of the neighbor list for angle. real nei 1 otherwise 0 + nall + The number of extended atoms. + + Returns + ------- + edge_index : 2 x n_edge + n2e_index : n_edge + Broadcast indices from node(i) to edge(ij), or reduction indices from edge(ij) to node(i). + n_ext2e_index : n_edge + Broadcast indices from extended node(j) to edge(ij). + angle_index : 3 x n_angle + n2a_index : n_angle + Broadcast indices from extended node(j) to angle(ijk). + eij2a_index : n_angle + Broadcast indices from extended edge(ij) to angle(ijk), or reduction indices from angle(ijk) to edge(ij). 
+ eik2a_index : n_angle + Broadcast indices from extended edge(ik) to angle(ijk). + """ + nf, nloc, nnei = nlist.shape + _, _, a_nnei = a_nlist_mask.shape + # nf x nloc x nnei x nnei + # nlist_mask_3d = nlist_mask[:, :, :, None] & nlist_mask[:, :, None, :] + a_nlist_mask_3d = a_nlist_mask[:, :, :, None] & a_nlist_mask[:, :, None, :] + n_edge = nlist_mask.sum().item() + # n_angle = a_nlist_mask_3d.sum().item() + + # following: get n2e_index, n_ext2e_index, n2a_index, eij2a_index, eik2a_index + + # 1. atom graph + # node(i) to edge(ij) index_select; edge(ij) to node aggregate + nlist_loc_index = paddle.arange(0, nf * nloc, dtype=nlist.dtype).to(nlist.place) + # nf x nloc x nnei + n2e_index = nlist_loc_index.reshape([nf, nloc, 1]).expand([-1, -1, nnei]) + # n_edge + n2e_index = n2e_index[nlist_mask] # graph node index, atom_graph[:, 0] + + # node_ext(j) to edge(ij) index_select + frame_shift = paddle.arange(0, nf, dtype=nlist.dtype) * ( + nall if not use_loc_mapping else nloc + ) + shifted_nlist = nlist + frame_shift[:, None, None] + # n_edge + n_ext2e_index = shifted_nlist[nlist_mask] # graph neighbor index, atom_graph[:, 1] + + # 2. 
edge graph + # node(i) to angle(ijk) index_select + n2a_index = nlist_loc_index.reshape([nf, nloc, 1, 1]).expand( + [-1, -1, a_nnei, a_nnei] + ) + # n_angle + n2a_index = n2a_index[a_nlist_mask_3d] + + # edge(ij) to angle(ijk) index_select; angle(ijk) to edge(ij) aggregate + edge_id = paddle.arange(0, n_edge, dtype=nlist.dtype) + # nf x nloc x nnei + edge_index = paddle.zeros([nf, nloc, nnei], dtype=nlist.dtype) + edge_index[nlist_mask] = edge_id + # only cut a_nnei neighbors, to avoid nnei x nnei + edge_index = edge_index[:, :, :a_nnei] + edge_index_ij = edge_index.unsqueeze(-1).expand([-1, -1, -1, a_nnei]) + # n_angle + eij2a_index = edge_index_ij[a_nlist_mask_3d] + + # edge(ik) to angle(ijk) index_select + edge_index_ik = edge_index.unsqueeze(-2).expand([-1, -1, a_nnei, -1]) + # n_angle + eik2a_index = edge_index_ik[a_nlist_mask_3d] + + edge_index_result = paddle.stack([n2e_index, n_ext2e_index], axis=0) + angle_index_result = paddle.stack([n2a_index, eij2a_index, eik2a_index], axis=0) + + return edge_index_result, angle_index_result diff --git a/deepmd/pd/model/task/ener.py b/deepmd/pd/model/task/ener.py index 789ef75066..738990b2d8 100644 --- a/deepmd/pd/model/task/ener.py +++ b/deepmd/pd/model/task/ener.py @@ -72,7 +72,7 @@ def __init__( @classmethod def deserialize(cls, data: dict) -> "GeneralFitting": data = copy.deepcopy(data) - check_version_compatibility(data.pop("@version", 1), 3, 1) + check_version_compatibility(data.pop("@version", 1), 4, 1) data.pop("var_name") data.pop("dim_out") return super().deserialize(data) diff --git a/deepmd/pd/model/task/fitting.py b/deepmd/pd/model/task/fitting.py index a478c12f97..398630e1d2 100644 --- a/deepmd/pd/model/task/fitting.py +++ b/deepmd/pd/model/task/fitting.py @@ -4,6 +4,7 @@ abstractmethod, ) from typing import ( + Callable, Optional, Union, ) @@ -71,6 +72,93 @@ def share_params(self, base_class, shared_level, resume=False) -> None: else: raise NotImplementedError + def compute_input_stats( + self, + merged: 
Union[Callable[[], list[dict]], list[dict]], + protection: float = 1e-2, + ) -> None: + """ + Compute the input statistics (e.g. mean and stddev) for the fittings from packed data. + + Parameters + ---------- + merged : Union[Callable[[], list[dict]], list[dict]] + - list[dict]: A list of data samples from various data systems. + Each element, `merged[i]`, is a data dictionary containing `keys`: `paddle.Tensor` + originating from the `i`-th data system. + - Callable[[], list[dict]]: A lazy function that returns data samples in the above format + only when needed. Since the sampling process can be slow and memory-intensive, + the lazy function helps by only sampling once. + protection : float + Divided-by-zero protection + """ + if self.numb_fparam == 0 and self.numb_aparam == 0: + # skip data statistics + return + if callable(merged): + sampled = merged() + else: + sampled = merged + # stat fparam + if self.numb_fparam > 0: + cat_data = paddle.concat([frame["fparam"] for frame in sampled], axis=0) + cat_data = paddle.reshape(cat_data, [-1, self.numb_fparam]) + fparam_avg = paddle.mean(cat_data, axis=0) + fparam_std = paddle.std(cat_data, axis=0, unbiased=False) + fparam_std = paddle.where( + fparam_std < protection, + paddle.to_tensor(protection, dtype=fparam_std.dtype), + fparam_std, + ) + fparam_inv_std = 1.0 / fparam_std + paddle.assign( + paddle.to_tensor( + fparam_avg, place=env.DEVICE, dtype=self.fparam_avg.dtype + ), + self.fparam_avg, + ) + paddle.assign( + paddle.to_tensor( + fparam_inv_std, place=env.DEVICE, dtype=self.fparam_inv_std.dtype + ), + self.fparam_inv_std, + ) + # stat aparam + if self.numb_aparam > 0: + sys_sumv = [] + sys_sumv2 = [] + sys_sumn = [] + for ss_ in [frame["aparam"] for frame in sampled]: + ss = paddle.reshape(ss_, [-1, self.numb_aparam]) + sys_sumv.append(paddle.sum(ss, axis=0)) + sys_sumv2.append(paddle.sum(ss * ss, axis=0)) + sys_sumn.append(ss.shape[0]) + sumv = paddle.sum(paddle.stack(sys_sumv), axis=0) + sumv2 = 
paddle.sum(paddle.stack(sys_sumv2), axis=0) + sumn = sum(sys_sumn) + aparam_avg = sumv / sumn + aparam_std = paddle.sqrt(sumv2 / sumn - (sumv / sumn) ** 2) + aparam_std = paddle.where( + aparam_std < protection, + paddle.to_tensor( + protection, dtype=aparam_std.dtype, place=aparam_std.device + ), + aparam_std, + ) + aparam_inv_std = 1.0 / aparam_std + paddle.assign( + paddle.to_tensor( + aparam_avg, place=env.DEVICE, dtype=self.aparam_avg.dtype + ), + self.aparam_avg, + ) + paddle.assign( + paddle.to_tensor( + aparam_inv_std, place=env.DEVICE, dtype=self.aparam_inv_std.dtype + ), + self.aparam_inv_std, + ) + class GeneralFitting(Fitting): """Construct a general fitting net. @@ -95,6 +183,10 @@ class GeneralFitting(Fitting): Number of frame parameters. numb_aparam : int Number of atomic parameters. + default_fparam: list[float], optional + The default frame parameter. If set, when `fparam.npy` files are not included in the data system, + this value will be used as the default value for the frame parameter in the fitting net. + This parameter is not supported in PaddlePaddle. dim_case_embd : int Dimension of case specific embedding. 
activation_function : str @@ -145,6 +237,7 @@ def __init__( remove_vaccum_contribution: Optional[list[bool]] = None, type_map: Optional[list[str]] = None, use_aparam_as_mask: bool = False, + default_fparam: Optional[list[float]] = None, **kwargs, ) -> None: super().__init__() @@ -155,14 +248,26 @@ def __init__( self.mixed_types = mixed_types self.resnet_dt = resnet_dt self.numb_fparam = numb_fparam + self.register_buffer( + "buffer_numb_fparam", paddle.to_tensor([numb_fparam], dtype=paddle.int64) + ) self.numb_aparam = numb_aparam + self.register_buffer( + "buffer_numb_aparam", paddle.to_tensor([numb_aparam], dtype=paddle.int64) + ) self.dim_case_embd = dim_case_embd + self.default_fparam = default_fparam self.activation_function = activation_function self.precision = precision self.prec = PRECISION_DICT[self.precision] self.rcond = rcond self.seed = seed self.type_map = type_map + if type_map is not None: + self.register_buffer( + "buffer_type_map", + paddle.to_tensor([ord(c) for c in " ".join(self.type_map)]), + ) self.use_aparam_as_mask = use_aparam_as_mask # order matters, should be place after the assignment of ntypes self.reinit_exclude(exclude_types) @@ -246,6 +351,8 @@ def __init__( for param in self.parameters(): param.stop_gradient = not self.trainable + self.eval_return_middle_output = False + def reinit_exclude( self, exclude_types: list[int] = [], @@ -282,7 +389,7 @@ def serialize(self) -> dict: """Serialize the fitting to dict.""" return { "@class": "Fitting", - "@version": 3, + "@version": 4, "var_name": self.var_name, "ntypes": self.ntypes, "dim_descrpt": self.dim_descrpt, @@ -291,6 +398,7 @@ def serialize(self) -> dict: "numb_fparam": self.numb_fparam, "numb_aparam": self.numb_aparam, "dim_case_embd": self.dim_case_embd, + "default_fparam": self.default_fparam, "activation_function": self.activation_function, "precision": self.precision, "mixed_types": self.mixed_types, @@ -338,6 +446,14 @@ def get_dim_aparam(self) -> int: """Get the number 
(dimension) of atomic parameters of this atomic model.""" return self.numb_aparam + def get_buffer_dim_fparam(self) -> paddle.Tensor: + """Get the number (dimension) of frame parameters of this atomic model as a buffer-style Tensor.""" + return self.buffer_numb_fparam + + def get_buffer_dim_aparam(self) -> paddle.Tensor: + """Get the number (dimension) of atomic parameters of this atomic model as a buffer-style Tensor.""" + return self.buffer_numb_aparam + # make jit happy exclude_types: list[int] @@ -359,6 +475,18 @@ def get_type_map(self) -> list[str]: """Get the name to each type of atoms.""" return self.type_map + def get_buffer_type_map(self) -> paddle.Tensor: + """ + Return the type map as a buffer-style Tensor for JIT saving. + + The original type map (e.g., ['Ni', 'O']) is first joined into a single space-separated string + (e.g., "Ni O"). Each character in this string is then converted to its ASCII code using `ord()`, + and the resulting integer sequence is stored as a 1D paddle.Tensor of dtype int. + + This format allows the type map to be serialized as a raw byte buffer during JIT model saving. 
+ """ + return self.buffer_type_map + def set_case_embd(self, case_idx: int): """ Set the case embedding of this fitting net by the given case_idx, @@ -368,6 +496,9 @@ def set_case_embd(self, case_idx: int): case_idx ] + def set_return_middle_output(self, return_middle_output: bool = True) -> None: + self.eval_return_middle_output = return_middle_output + def __setitem__(self, key, value) -> None: if key in ["bias_atom_e"]: value = value.reshape([self.ntypes, self._net_out_dim()]) @@ -427,9 +558,9 @@ def _forward_common( aparam: Optional[paddle.Tensor] = None, ): # cast the input to internal precsion - xx = descriptor.to(self.prec) - fparam = fparam.to(self.prec) if fparam is not None else None - aparam = aparam.to(self.prec) if aparam is not None else None + xx = descriptor.astype(self.prec) + fparam = fparam.astype(self.prec) if fparam is not None else None + aparam = aparam.astype(self.prec) if aparam is not None else None if self.remove_vaccum_contribution is not None: # TODO: compute the input for vaccm when remove_vaccum_contribution is set @@ -514,15 +645,37 @@ def _forward_common( outs = paddle.zeros( (nf, nloc, net_dim_out), dtype=env.GLOBAL_PD_FLOAT_PRECISION, - ).to(device=descriptor.place) + ) + results = {} + if self.mixed_types: - atom_property = self.filter_layers.networks[0](xx) + self.bias_atom_e[atype] + atom_property = self.filter_layers.networks[0](xx) + if self.eval_return_middle_output: + results["middle_output"] = self.filter_layers.networks[ + 0 + ].call_until_last(xx) if xx_zeros is not None: atom_property -= self.filter_layers.networks[0](xx_zeros) outs = ( - outs + atom_property + self.bias_atom_e[atype].to(self.prec) + outs + atom_property + self.bias_atom_e[atype].astype(self.prec) ) # Shape is [nframes, natoms[0], net_dim_out] else: + if self.eval_return_middle_output: + outs_middle = paddle.zeros( + (nf, nloc, self.neuron[-1]), + dtype=self.prec, + ).to(device=descriptor.place) # jit assertion + for type_i, ll in 
enumerate(self.filter_layers.networks): + mask = (atype == type_i).unsqueeze(-1) + mask = paddle.tile(mask, (1, 1, net_dim_out)) + middle_output_type = ll.call_until_last(xx) + middle_output_type = paddle.where( + paddle.tile(mask, (1, 1, self.neuron[-1])), + middle_output_type, + paddle.zeros_like(middle_output_type), + ) + outs_middle = outs_middle + middle_output_type + results["middle_output"] = outs_middle for type_i, ll in enumerate(self.filter_layers.networks): mask = (atype == type_i).unsqueeze(-1) mask.stop_gradient = True @@ -537,12 +690,15 @@ def _forward_common( ): atom_property -= ll(xx_zeros) atom_property = atom_property + self.bias_atom_e[type_i] - atom_property = paddle.where(mask, atom_property, 0.0) + atom_property = paddle.where( + mask, atom_property, paddle.full_like(atom_property, 0.0) + ) outs = ( outs + atom_property ) # Shape is [nframes, natoms[0], net_dim_out] # nf x nloc - mask = self.emask(atype).to("bool") + mask = self.emask(atype).astype("bool") # nf x nloc x nod - outs = paddle.where(mask[:, :, None], outs, 0.0) - return {self.var_name: outs.astype(env.GLOBAL_PD_FLOAT_PRECISION)} + outs = paddle.where(mask[:, :, None], outs, paddle.zeros_like(outs)) + results.update({self.var_name: outs}) + return results diff --git a/deepmd/pd/model/task/invar_fitting.py b/deepmd/pd/model/task/invar_fitting.py index b92c862dc8..176acdeb20 100644 --- a/deepmd/pd/model/task/invar_fitting.py +++ b/deepmd/pd/model/task/invar_fitting.py @@ -147,7 +147,7 @@ def serialize(self) -> dict: @classmethod def deserialize(cls, data: dict) -> "GeneralFitting": data = copy.deepcopy(data) - check_version_compatibility(data.pop("@version", 1), 3, 1) + check_version_compatibility(data.pop("@version", 1), 4, 1) return super().deserialize(data) def output_def(self) -> FittingOutputDef: diff --git a/deepmd/pd/train/training.py b/deepmd/pd/train/training.py index d72c270667..4e5fea081f 100644 --- a/deepmd/pd/train/training.py +++ b/deepmd/pd/train/training.py @@ -1,4 
+1,5 @@ # SPDX-License-Identifier: LGPL-3.0-or-later +import contextlib import functools import logging import time @@ -18,6 +19,7 @@ from paddle.distributed import ( fleet, ) +from paddle.distributed.fleet.utils import hybrid_parallel_util as hpu from paddle.framework import ( core, ) @@ -52,6 +54,7 @@ ) from deepmd.pd.utils.env import ( CINN, + CINN_ALLOW_DYNAMIC_SHAPE, DEFAULT_PRECISION, DEVICE, JIT, @@ -130,6 +133,9 @@ def __init__( # Iteration config self.num_steps = training_params["numb_steps"] + self.acc_freq: int = training_params.get( + "acc_freq", 1 + ) # gradient accumulation steps self.disp_file = training_params.get("disp_file", "lcurve.out") self.disp_freq = training_params.get("disp_freq", 1000) self.save_ckpt = training_params.get("save_ckpt", "model.ckpt") @@ -607,40 +613,65 @@ def warm_up_linear(step, warmup_steps): ) backend = "CINN" if CINN else None - self.wrapper.forward = jit.to_static( - backend=backend, - input_spec=[ - static.InputSpec([1, -1, 3], "float64", name="coord"), # coord - static.InputSpec([1, -1], "int32", name="atype"), # atype - None, # spin - static.InputSpec([1, 9], "float64", name="box"), # box - static.InputSpec([], "float64", name="cur_lr"), # cur_lr - { - "find_box": np.float32(1.0), - "find_coord": np.float32(1.0), - "find_numb_copy": np.float32(0.0), - "numb_copy": static.InputSpec( - [1, 1], "int64", name="numb_copy" - ), - "find_energy": np.float32(1.0), - "energy": static.InputSpec([1, 1], "float64", name="energy"), - "find_force": np.float32(1.0), - "force": static.InputSpec([1, -1, 3], "float64", name="force"), - "natoms": static.InputSpec([1, -1], "int32", name="natoms"), - }, # label, - # None, # task_key - # False, # inference_only - # False, # do_atomic_virial - # None, # fparam - # None, # aparam - ], - full_graph=True, - )(self.wrapper.forward) + if CINN_ALLOW_DYNAMIC_SHAPE: + # Build spec only for keys present in sample data + # NOTE: This is a trick to decide the right input_spec for wrapper.forward + _, 
label_dict, _ = self.get_data(is_train=True) + # Define specification templates + spec_templates = { + "find_box": np.float32(1.0), + "find_coord": np.float32(1.0), + "find_numb_copy": np.float32(0.0), + "numb_copy": static.InputSpec([1, 1], "int64", name="numb_copy"), + "find_energy": np.float32(1.0), + "energy": static.InputSpec([1, 1], "float64", name="energy"), + "find_force": np.float32(1.0), + "force": static.InputSpec([1, -1, 3], "float64", name="force"), + "find_virial": np.float32(0.0), + "virial": static.InputSpec([1, 9], "float64", name="virial"), + "natoms": static.InputSpec([1, -1], "int32", name="natoms"), + } + label_dict_spec = { + k: spec_templates[k] + for k in label_dict.keys() + if k in spec_templates + } + self.wrapper.forward = jit.to_static( + backend=backend, + input_spec=[ + static.InputSpec([1, -1, 3], "float64", name="coord"), # coord + static.InputSpec([1, -1], "int32", name="atype"), # atype + None, # spin + static.InputSpec([1, 9], "float64", name="box"), # box + static.InputSpec([], "float64", name="cur_lr"), # cur_lr + label_dict_spec, # label, + # None, # task_key + # False, # inference_only + # False, # do_atomic_virial + # None, # fparam + # None, # aparam + ], + full_graph=True, + )(self.wrapper.forward) + else: + self.wrapper.forward = jit.to_static(full_graph=True, backend=backend)( + self.wrapper.forward + ) log.info( - "Enable CINN during training, there may be some additional " - "compilation time in the first traning step." + "[CINN] Enable CINN during training, there may be some additional " + "compilation time in the first training step." ) + if not CINN_ALLOW_DYNAMIC_SHAPE: + log.info( + "[CINN] Dynamic shape is disabled (CINN_ALLOW_DYNAMIC_SHAPE=0). " + "Make sure the input batch shapes are fixed during training. " + "This is recommended for optimal performance, e.g., as in examples/water." 
+ ) + log.info( + "[CINN] If batch data from your dataset(s) has varying input shapes, consider setting " + "CINN_ALLOW_DYNAMIC_SHAPE=1 to enable dynamic shape support." + ) if dist.is_available() and dist.is_initialized(): # DDP will guarantee the model parameters are identical across all processes @@ -716,7 +747,6 @@ def step(_step_id, task_key="Default") -> None: _lr = self.lr_exp cur_lr = _lr.value(_step_id) pref_lr = cur_lr - self.optimizer.clear_grad(set_to_zero=False) with nvprof_context(enable_profiling, "Fetching data"): input_dict, label_dict, log_dict = self.get_data( @@ -732,28 +762,47 @@ def step(_step_id, task_key="Default") -> None: pref_lr = _lr.start_lr else: pref_lr = cur_lr - with nvprof_context(enable_profiling, "Forward pass"): - model_pred, loss, more_loss = self.wrapper( - **input_dict, - cur_lr=paddle.full([], pref_lr, DEFAULT_PRECISION), - label=label_dict, - task_key=task_key, - ) - with nvprof_context(enable_profiling, "Backward pass"): - loss.backward() + # disable synchronization in forward-backward manually + # as derivatives exist in model forward + no_sync_context = ( + self.wrapper.no_sync + if self.world_size > 1 + else contextlib.nullcontext + ) + with no_sync_context(): + with nvprof_context(enable_profiling, "Forward pass"): + model_pred, loss, more_loss = self.wrapper( + **input_dict, + cur_lr=paddle.full([], pref_lr, DEFAULT_PRECISION), + label=label_dict, + task_key=task_key, + ) + + with nvprof_context(enable_profiling, "Backward pass"): + loss.backward() - if self.gradient_max_norm > 0.0: - with nvprof_context(enable_profiling, "Gradient clip"): - paddle.nn.utils.clip_grad_norm_( - self.wrapper.parameters(), - self.gradient_max_norm, - error_if_nonfinite=True, + # gradient accumulation + if (_step_id + 1) % self.acc_freq == 0: + # fuse + allreduce manually before optimization if use DDP + no_sync + # details in https://github.com/PaddlePaddle/Paddle/issues/48898#issuecomment-1343838622 + if self.world_size > 1: + 
hpu.fused_allreduce_gradients( + list(self.wrapper.parameters()), None ) - with nvprof_context(enable_profiling, "Adam update"): - self.optimizer.step() - self.scheduler.step() + if self.gradient_max_norm > 0.0: + with nvprof_context(enable_profiling, "Gradient clip"): + paddle.nn.utils.clip_grad_norm_( + self.wrapper.parameters(), + self.gradient_max_norm, + error_if_nonfinite=True, + ) + + with nvprof_context(enable_profiling, "Adam update"): + self.optimizer.step() + self.optimizer.clear_grad(set_to_zero=False) + self.scheduler.step() else: raise ValueError(f"Not supported optimizer type '{self.opt_type}'") @@ -876,7 +925,9 @@ def log_loss_valid(_task_key="Default"): self.t0 = current_time if self.rank == 0 and self.timing_in_training: eta = int( - (self.num_steps - display_step_id) / self.disp_freq * train_time + (self.num_steps - display_step_id) + / min(self.disp_freq, display_step_id - self.start_step) + * train_time ) log.info( format_training_message( diff --git a/deepmd/pd/utils/decomp.py b/deepmd/pd/utils/decomp.py index 3b7bddbcd1..ab9a57dbfd 100644 --- a/deepmd/pd/utils/decomp.py +++ b/deepmd/pd/utils/decomp.py @@ -112,10 +112,7 @@ def masked_add__decomp( """ assert mask.dtype == paddle.bool, f"mask must be bool type, but got {mask.dtype}" # indices is bool mask - mask_coord = paddle.concat( - paddle.nonzero(mask, as_tuple=True), - axis=1, - ) # [nz, dim] + mask_coord = paddle.nonzero(mask, as_tuple=False) # [nz, dim] if not paddle.is_tensor(v): v = paddle.full([mask_coord.shape[0]], v, dtype=x.dtype) t = paddle.scatter_nd_add( diff --git a/deepmd/pd/utils/env.py b/deepmd/pd/utils/env.py index cf5b1f835c..28606d0945 100644 --- a/deepmd/pd/utils/env.py +++ b/deepmd/pd/utils/env.py @@ -27,7 +27,7 @@ ncpus = os.cpu_count() NUM_WORKERS = int(os.environ.get("NUM_WORKERS", min(0, ncpus))) # Make sure DDP uses correct device if applicable -LOCAL_RANK = paddle.distributed.get_rank() +LOCAL_RANK = int(os.environ.get("PADDLE_LOCAL_RANK", 0)) if 
os.environ.get("DEVICE") == "cpu" or paddle.device.cuda.device_count() <= 0: DEVICE = "cpu" @@ -69,8 +69,17 @@ def to_bool(flag: int | bool | str) -> bool: "installation or recompiling with CINN enabled." ) +# NOTE: Allow the CINN compiler to optimize inputs with dynamic shapes, +# may lead to a slight performance decrease compared to static shapes. + +# If you can confirm that the shape of the input tensors will not change, +# you can set it to False to further enhance performance. +# Otherwise, please use the default value(True) to improve runtime compatibility. +CINN_ALLOW_DYNAMIC_SHAPE = to_bool(os.environ.get("CINN_ALLOW_DYNAMIC_SHAPE", True)) + CACHE_PER_SYS = 5 # keep at most so many sets per sys in memory ENERGY_BIAS_TRAINABLE = True +CUSTOM_OP_USE_JIT = to_bool(os.environ.get("CUSTOM_OP_USE_JIT", False)) PRECISION_DICT = { "float16": paddle.float16, @@ -198,6 +207,8 @@ def enable_prim(enable: bool = True): __all__ = [ "CACHE_PER_SYS", "CINN", + "CINN_ALLOW_DYNAMIC_SHAPE", + "CUSTOM_OP_USE_JIT", "DEFAULT_PRECISION", "DEVICE", "ENERGY_BIAS_TRAINABLE", diff --git a/deepmd/pd/utils/exclude_mask.py b/deepmd/pd/utils/exclude_mask.py index 29c9cc3501..cde8730c9a 100644 --- a/deepmd/pd/utils/exclude_mask.py +++ b/deepmd/pd/utils/exclude_mask.py @@ -58,7 +58,7 @@ def forward( """ nf, natom = atype.shape - return self.type_mask[atype].reshape([nf, natom]).to(atype.place) + return self.type_mask[atype].reshape([nf, natom]) class PairExcludeMask(paddle.nn.Layer): @@ -126,31 +126,25 @@ def forward( """ if self.no_exclusion: # safely return 1 if nothing is excluded. - return paddle.ones_like(nlist, dtype=paddle.int32).to(device=nlist.place) + return paddle.ones_like(nlist, dtype=paddle.int32) nf, nloc, nnei = nlist.shape nall = atype_ext.shape[1] # add virtual atom of type ntypes. 
nf x nall+1 ae = paddle.concat( [ atype_ext, - self.ntypes - * paddle.ones([nf, 1], dtype=atype_ext.dtype).to(atype_ext.place), + self.ntypes * paddle.ones([nf, 1], dtype=atype_ext.dtype), ], axis=-1, ) type_i = atype_ext[:, :nloc].reshape([nf, nloc]) * (self.ntypes + 1) # nf x nloc x nnei index = paddle.where(nlist == -1, nall, nlist).reshape([nf, nloc * nnei]) - type_j = paddle.take_along_axis(ae, axis=1, indices=index).reshape( - [nf, nloc, nnei] - ) + type_j = paddle.take_along_axis( + ae, axis=1, indices=index, broadcast=False + ).reshape([nf, nloc, nnei]) type_ij = type_i[:, :, None] + type_j # nf x (nloc x nnei) type_ij = type_ij.reshape([nf, nloc * nnei]) - mask = ( - self.type_mask[type_ij] - .reshape([nf, nloc, nnei]) - .to(atype_ext.place) - .astype("bool") - ) + mask = self.type_mask[type_ij].reshape([nf, nloc, nnei]).astype("bool") return mask diff --git a/deepmd/pd/utils/nlist.py b/deepmd/pd/utils/nlist.py index 707cbd125b..9157fba61a 100644 --- a/deepmd/pd/utils/nlist.py +++ b/deepmd/pd/utils/nlist.py @@ -94,36 +94,36 @@ def build_neighbor_list( """ batch_size = coord.shape[0] - coord = coord.reshape([batch_size, -1]) nall = coord.shape[1] // 3 # fill virtual atoms with large coords so they are not neighbors of any # real atom. 
- # NOTE: control flow with double backward is not supported well yet by paddle.jit if not paddle.in_dynamic_mode() or decomp.numel(coord) > 0: xmax = paddle.max(coord) + 2.0 * rcut else: xmax = paddle.zeros([], dtype=coord.dtype).to(device=coord.place) + 2.0 * rcut + coord_xyz = coord.reshape([batch_size, nall, 3]) # nf x nall is_vir = atype < 0 - coord1 = paddle.where( - is_vir[:, :, None], xmax, coord.reshape([batch_size, nall, 3]) - ).reshape([batch_size, nall * 3]) + # batch_size x nall x 3 + vcoord_xyz = paddle.where(is_vir.unsqueeze(2), xmax, coord_xyz) if isinstance(sel, int): sel = [sel] - # nloc x 3 - coord0 = coord1[:, : nloc * 3] - # nloc x nall x 3 - diff = coord1.reshape([batch_size, -1, 3]).unsqueeze(1) - coord0.reshape( - [batch_size, -1, 3] - ).unsqueeze(2) + + # Get the coordinates for the local atoms (first nloc atoms) + # batch_size x nloc x 3 + vcoord_local_xyz = vcoord_xyz[:, :nloc, :] + + # Calculate displacement vectors. + diff = vcoord_xyz.unsqueeze(1) - vcoord_local_xyz.unsqueeze(2) if paddle.in_dynamic_mode(): assert list(diff.shape) == [batch_size, nloc, nall, 3] # nloc x nall rr = paddle.linalg.norm(diff, axis=-1) # if central atom has two zero distances, sorting sometimes can not exclude itself - rr = rr - paddle.eye(nloc, nall, dtype=rr.dtype).to(device=rr.place).unsqueeze(0) + rr = rr - paddle.eye(nloc, nall, dtype=rr.dtype).unsqueeze(0) rr, nlist = paddle.sort(rr, axis=-1), paddle.argsort(rr, axis=-1) + # nloc x (nall-1) rr = rr[:, :, 1:] nlist = nlist[:, :, 1:] @@ -155,20 +155,13 @@ def _trim_mask_distinguish_nlist( rr = paddle.concat( [ rr, - paddle.ones([batch_size, nloc, nsel - nnei]).to( - device=rr.place, dtype=rr.dtype - ) + paddle.ones([batch_size, nloc, nsel - nnei]).astype(dtype=rr.dtype) + rcut, ], axis=-1, ) nlist = paddle.concat( - [ - nlist, - paddle.ones([batch_size, nloc, nsel - nnei], dtype=nlist.dtype).to( - device=rr.place - ), - ], + [nlist, paddle.ones([batch_size, nloc, nsel - nnei], dtype=nlist.dtype)], 
axis=-1, ) if paddle.in_dynamic_mode(): @@ -318,7 +311,11 @@ def nlist_distinguish_types( paddle.argsort(pick_mask, axis=-1, descending=True, stable=True), ) # nloc x s(nsel) - inlist = paddle.take_along_axis(nlist, axis=2, indices=imap) + inlist = paddle.take_along_axis( + nlist, + axis=2, + indices=imap, + ) inlist = inlist.masked_fill(~(pick_mask.to(paddle.bool)), -1) # nloc x nsel[ii] ret_nlist.append(paddle.split(inlist, [ss, snsel - ss], axis=-1)[0]) @@ -377,7 +374,7 @@ def build_multiple_neighbor_list( pad = -paddle.ones( [nb, nloc, nsels[-1] - nsel], dtype=nlist.dtype, - ).to(device=nlist.place) + ) # nb x nloc x nsel nlist = paddle.concat([nlist, pad], axis=-1) if paddle.is_tensor(nsel): @@ -399,9 +396,11 @@ def build_multiple_neighbor_list( .expand([-1, -1, 3]) ) # nb x nloc x nsel x 3 - coord2 = paddle.take_along_axis(coord1, axis=1, indices=index).reshape( - [nb, nloc, nsel, 3] - ) + coord2 = paddle.take_along_axis( + coord1, + axis=1, + indices=index, + ).reshape([nb, nloc, nsel, 3]) # nb x nloc x nsel x 3 diff = coord2 - coord0[:, :, None, :] # nb x nloc x nsel @@ -452,7 +451,7 @@ def extend_coord_with_ghosts( device = coord.place nf, nloc = atype.shape[:2] # int64 for index - aidx = paddle.tile(paddle.arange(nloc).to(device=device).unsqueeze(0), [nf, 1]) # pylint: disable=no-explicit-dtype + aidx = paddle.tile(paddle.arange(nloc).unsqueeze(0), [nf, 1]) # pylint: disable=no-explicit-dtype if cell is None: nall = nloc extend_coord = coord.clone() @@ -496,14 +495,12 @@ def extend_coord_with_ghosts( # .cpu() ) # pylint: disable=no-explicit-dtype eye_3 = ( - paddle.eye(3, dtype=env.GLOBAL_PD_FLOAT_PRECISION).to( - dtype=env.GLOBAL_PD_FLOAT_PRECISION - ) + paddle.eye(3) # .cpu() - ) - xyz = xi.reshape([-1, 1, 1, 1]) * eye_3[0] - xyz = xyz + yi.reshape([1, -1, 1, 1]) * eye_3[1] - xyz = xyz + zi.reshape([1, 1, -1, 1]) * eye_3[2] + ).to(dtype=env.GLOBAL_PD_FLOAT_PRECISION) + xyz = xi.reshape([-1, 1, 1, 1]).astype(eye_3.dtype) * eye_3[0] + xyz = xyz + 
yi.reshape([1, -1, 1, 1]).astype(eye_3.dtype) * eye_3[1] + xyz = xyz + zi.reshape([1, 1, -1, 1]).astype(eye_3.dtype) * eye_3[2] xyz = xyz.reshape([-1, 3]) # xyz = xyz.to(device=device) # ns x 3 @@ -519,7 +516,7 @@ def extend_coord_with_ghosts( # nf x ns x nloc extend_aidx = paddle.tile(aidx.unsqueeze(-2), [1, ns, 1]) return ( - extend_coord.reshape([nf, nall * 3]).to(device), - extend_atype.reshape([nf, nall]).to(device), - extend_aidx.reshape([nf, nall]).to(device), + extend_coord.reshape([nf, nall * 3]), + extend_atype.reshape([nf, nall]), + extend_aidx.reshape([nf, nall]), ) diff --git a/deepmd/pd/utils/preprocess.py b/deepmd/pd/utils/preprocess.py index 3e047c1b8b..3be42b522e 100644 --- a/deepmd/pd/utils/preprocess.py +++ b/deepmd/pd/utils/preprocess.py @@ -10,9 +10,20 @@ def compute_smooth_weight(distance, rmin: float, rmax: float): """Compute smooth weight for descriptor elements.""" if rmin >= rmax: raise ValueError("rmin should be less than rmax.") - min_mask = distance <= rmin - max_mask = distance >= rmax - mid_mask = paddle.logical_not(paddle.logical_or(min_mask, max_mask)) + distance = paddle.clip(distance, min=rmin, max=rmax) uu = (distance - rmin) / (rmax - rmin) - vv = uu * uu * uu * (-6 * uu * uu + 15 * uu - 10) + 1 - return vv * mid_mask.astype(vv.dtype) + min_mask.astype(vv.dtype) + uu2 = uu * uu + vv = uu2 * uu * (-6 * uu2 + 15 * uu - 10) + 1 + return vv + + +def compute_exp_sw(distance, rmin: float, rmax: float): + """Compute the exponential switch function for neighbor update.""" + if rmin >= rmax: + raise ValueError("rmin should be less than rmax.") + distance = paddle.clip(distance, min=0.0, max=rmax) + C = 20 + a = C / rmin + b = rmin + exp_sw = paddle.exp(-paddle.exp(a * (distance - b))) + return exp_sw diff --git a/deepmd/pd/utils/serialization.py b/deepmd/pd/utils/serialization.py index f4206ce993..bd70deb75c 100644 --- a/deepmd/pd/utils/serialization.py +++ b/deepmd/pd/utils/serialization.py @@ -69,9 +69,9 @@ def 
deserialize_to_file(model_file: str, data: dict) -> None: model.forward, full_graph=True, input_spec=[ - InputSpec([1, -1, 3], dtype="float64", name="coord"), - InputSpec([1, -1], dtype="int64", name="atype"), - InputSpec([1, 9], dtype="float64", name="box"), + InputSpec([-1, -1, 3], dtype="float64", name="coord"), + InputSpec([-1, -1], dtype="int64", name="atype"), + InputSpec([-1, 9], dtype="float64", name="box"), None, None, True, @@ -88,9 +88,9 @@ def deserialize_to_file(model_file: str, data: dict) -> None: model.forward_lower, full_graph=True, input_spec=[ - InputSpec([1, -1, 3], dtype="float64", name="coord"), - InputSpec([1, -1], dtype="int32", name="atype"), - InputSpec([1, -1, -1], dtype="int32", name="nlist"), + InputSpec([-1, -1, 3], dtype="float64", name="coord"), + InputSpec([-1, -1], dtype="int32", name="atype"), + InputSpec([-1, -1, -1], dtype="int32", name="nlist"), None, None, None, @@ -101,4 +101,5 @@ def deserialize_to_file(model_file: str, data: dict) -> None: paddle.jit.save( model, model_file.split(".json")[0], + skip_prune_program=True, ) diff --git a/deepmd/pd/utils/spin.py b/deepmd/pd/utils/spin.py index 934fb3762a..27bc355877 100644 --- a/deepmd/pd/utils/spin.py +++ b/deepmd/pd/utils/spin.py @@ -21,7 +21,6 @@ def concat_switch_virtual( extended_tensor_updated = paddle.zeros( out_shape, dtype=extended_tensor.dtype, - device=extended_tensor.place, ) extended_tensor_updated[:, :nloc] = extended_tensor[:, :nloc] extended_tensor_updated[:, nloc : nloc + nloc] = extended_tensor_virtual[:, :nloc] diff --git a/deepmd/pd/utils/stat.py b/deepmd/pd/utils/stat.py index e0abb1b289..4132d0a5f7 100644 --- a/deepmd/pd/utils/stat.py +++ b/deepmd/pd/utils/stat.py @@ -4,6 +4,7 @@ defaultdict, ) from typing import ( + Any, Callable, Optional, Union, @@ -12,9 +13,6 @@ import numpy as np import paddle -from deepmd.dpmodel.output_def import ( - FittingOutputDef, -) from deepmd.pd.utils import ( AtomExcludeMask, ) @@ -27,6 +25,7 @@ to_paddle_tensor, ) from 
deepmd.utils.out_stat import ( + compute_stats_do_not_distinguish_types, compute_stats_from_atomic, compute_stats_from_redu, ) @@ -37,7 +36,9 @@ log = logging.getLogger(__name__) -def make_stat_input(datasets, dataloaders, nbatches): +def make_stat_input( + datasets: list[Any], dataloaders: list[Any], nbatches: int +) -> dict[str, Any]: """Pack data for statistics. Args: @@ -61,6 +62,14 @@ def make_stat_input(datasets, dataloaders, nbatches): except StopIteration: iterator = iter(dataloaders[i]) stat_data = next(iterator) + if ( + "find_fparam" in stat_data + and "fparam" in stat_data + and stat_data["find_fparam"] == 0.0 + ): + # for model using default fparam + stat_data.pop("fparam") + stat_data.pop("find_fparam") for dd in stat_data: if stat_data[dd] is None: sys_stat[dd] = None @@ -117,7 +126,7 @@ def _save_to_file( stat_file_path: DPPath, bias_out: dict, std_out: dict, -): +) -> None: assert stat_file_path is not None stat_file_path.mkdir(exist_ok=True, parents=True) for kk, vv in bias_out.items(): @@ -129,18 +138,23 @@ def _save_to_file( def _post_process_stat( - out_bias, - out_std, -): + out_bias: paddle.Tensor, + out_std: paddle.Tensor, +) -> tuple[paddle.Tensor, paddle.Tensor]: """Post process the statistics. For global statistics, we do not have the std for each type of atoms, thus fake the output std by ones for all the types. + If the shape of out_std is already the same as out_bias, + we do not need to do anything. 
""" new_std = {} for kk, vv in out_bias.items(): - new_std[kk] = np.ones_like(vv) + if vv.shape == out_std[kk].shape: + new_std[kk] = out_std[kk] + else: + new_std[kk] = np.ones_like(vv) return out_bias, new_std @@ -148,7 +162,7 @@ def _compute_model_predict( sampled: Union[Callable[[], list[dict]], list[dict]], keys: list[str], model_forward: Callable[..., paddle.Tensor], -): +) -> dict[str, list[paddle.Tensor]]: auto_batch_size = AutoBatchSize() model_predict = {kk: [] for kk in keys} for system in sampled: @@ -211,7 +225,7 @@ def _make_preset_out_bias( def _fill_stat_with_global( atomic_stat: Union[np.ndarray, None], global_stat: np.ndarray, -): +) -> Union[np.ndarray, None]: """This function is used to fill atomic stat with global stat. Parameters @@ -242,8 +256,9 @@ def compute_output_stats( rcond: Optional[float] = None, preset_bias: Optional[dict[str, list[Optional[np.ndarray]]]] = None, model_forward: Optional[Callable[..., paddle.Tensor]] = None, - atomic_output: Optional[FittingOutputDef] = None, -): + stats_distinguish_types: bool = True, + intensive: bool = False, +) -> dict[str, Any]: """ Compute the output statistics (e.g. energy bias) for the fitting net from packed data. @@ -272,8 +287,10 @@ def compute_output_stats( If not None, the model will be utilized to generate the original energy prediction, which will be subtracted from the energy label of the data. The difference will then be used to calculate the delta complement energy bias for each type. - atomic_output : FittingOutputDef, optional - The output of atomic model. + stats_distinguish_types : bool, optional + Whether to distinguish different element types in the statistics. + intensive : bool, optional + Whether the fitting target is intensive. 
""" # try to restore the bias from stat file bias_atom_e, std_atom_e = _restore_from_file(stat_file_path, keys) @@ -362,7 +379,8 @@ def compute_output_stats( rcond, preset_bias, model_pred_g, - atomic_output, + stats_distinguish_types, + intensive, ) bias_atom_a, std_atom_a = compute_output_stats_atomic( sampled, @@ -405,8 +423,9 @@ def compute_output_stats_global( rcond: Optional[float] = None, preset_bias: Optional[dict[str, list[Optional[paddle.Tensor]]]] = None, model_pred: Optional[dict[str, np.ndarray]] = None, - atomic_output: Optional[FittingOutputDef] = None, -): + stats_distinguish_types: bool = True, + intensive: bool = False, +) -> tuple[dict[str, np.ndarray], dict[str, np.ndarray]]: """This function only handle stat computation from reduced global labels.""" # return directly if model predict is empty for global if model_pred == {}: @@ -469,26 +488,31 @@ def compute_output_stats_global( # subtract the model bias and output the delta bias stats_input = { - kk: merged_output[kk] - model_pred[kk] for kk in keys if kk in merged_output + kk: merged_output[kk] - model_pred[kk].reshape(merged_output[kk].shape) + for kk in keys + if kk in merged_output } bias_atom_e = {} std_atom_e = {} for kk in keys: if kk in stats_input: - if atomic_output is not None and atomic_output.get_data()[kk].intensive: - task_dim = stats_input[kk].shape[1] - assert merged_natoms[kk].shape == (nf[kk], ntypes) - stats_input[kk] = ( - merged_natoms[kk].sum(axis=1).reshape([-1, 1]) * stats_input[kk] + if not stats_distinguish_types: + bias_atom_e[kk], std_atom_e[kk] = ( + compute_stats_do_not_distinguish_types( + stats_input[kk], + merged_natoms[kk], + assigned_bias=assigned_atom_ener[kk], + intensive=intensive, + ) + ) + else: + bias_atom_e[kk], std_atom_e[kk] = compute_stats_from_redu( + stats_input[kk], + merged_natoms[kk], + assigned_bias=assigned_atom_ener[kk], + rcond=rcond, ) - assert stats_input[kk].shape == (nf[kk], task_dim) - bias_atom_e[kk], std_atom_e[kk] = 
compute_stats_from_redu( - stats_input[kk], - merged_natoms[kk], - assigned_bias=assigned_atom_ener[kk], - rcond=rcond, - ) else: # this key does not have global labels, skip it. continue @@ -509,16 +533,16 @@ def compute_output_stats_global( } atom_numbs = {kk: merged_natoms[kk].sum(-1) for kk in bias_atom_e.keys()} - def rmse(x): + def rmse(x: np.ndarray) -> float: return np.sqrt(np.mean(np.square(x))) for kk in bias_atom_e.keys(): rmse_ae = rmse( ( - unbias_e[kk].reshape([nf[kk], -1]).astype(merged_output[kk].dtype) + unbias_e[kk].reshape([nf[kk], -1]) - merged_output[kk].reshape([nf[kk], -1]) ) - / atom_numbs[kk][:, None].astype(merged_output[kk].dtype) + / atom_numbs[kk][:, None] ) log.info( f"RMSE of {kk} per atom after linear regression is: {rmse_ae} in the unit of {kk}." @@ -531,7 +555,7 @@ def compute_output_stats_atomic( ntypes: int, keys: list[str], model_pred: Optional[dict[str, np.ndarray]] = None, -): +) -> tuple[dict[str, np.ndarray], dict[str, np.ndarray]]: # get label dict from sample; for each key, only picking the system with atomic labels. 
outputs = { kk: [ @@ -549,7 +573,17 @@ def compute_output_stats_atomic( ] for kk in keys } - # shape: (nframes, nloc, ndim) + # reshape outputs [nframes, nloc * ndim] --> reshape to [nframes * nloc, 1, ndim] for concatenation + # reshape natoms [nframes, nloc] --> reshape to [nframes * nolc, 1] for concatenation + natoms = {k: [sys_v.reshape([-1, 1]) for sys_v in v] for k, v in natoms.items()} + outputs = { + k: [ + sys.reshape([natoms[k][sys_idx].shape[0], 1, -1]) + for sys_idx, sys in enumerate(v) + ] + for k, v in outputs.items() + } + merged_output = { kk: to_numpy_array(paddle.concat(outputs[kk])) for kk in keys diff --git a/deepmd/pd/utils/utils.py b/deepmd/pd/utils/utils.py index a756491a8d..175ac5019b 100644 --- a/deepmd/pd/utils/utils.py +++ b/deepmd/pd/utils/utils.py @@ -20,6 +20,9 @@ ) from deepmd.dpmodel.common import PRECISION_DICT as NP_PRECISION_DICT +from deepmd.pd.utils import ( + env, +) from .env import ( DEVICE, @@ -32,15 +35,126 @@ ) +def silut_forward( + x: paddle.Tensor, threshold: float, slope: float, const_val: float +) -> paddle.Tensor: + sig = F.sigmoid(x) + silu = x * sig + tanh = paddle.tanh(slope * (x - threshold)) + const_val + return paddle.where(x >= threshold, tanh, silu) + + +def silut_backward( + x: paddle.Tensor, grad_output: paddle.Tensor, threshold: float, slope: float +) -> paddle.Tensor: + sig = F.sigmoid(x) + grad_silu = sig * (1 + x * (1 - sig)) + + tanh = paddle.tanh(slope * (x - threshold)) + grad_tanh = slope * (1 - tanh * tanh) + + grad = paddle.where(x >= threshold, grad_tanh, grad_silu) + return grad * grad_output + + +def silut_double_backward( + x: paddle.Tensor, + grad_grad_output: paddle.Tensor, + grad_output: paddle.Tensor, + threshold: float, + slope: float, +) -> tuple[paddle.Tensor, paddle.Tensor]: + # SiLU branch + sig = F.sigmoid(x) + + sig_prime = sig * (1 - sig) + grad_silu = sig + x * sig_prime + grad_grad_silu = sig_prime * (2 + x * (1 - 2 * sig)) + + # Tanh branch + tanh = paddle.tanh(slope * (x - 
threshold)) + tanh_square = tanh * tanh # .square is slow for jit.script! + grad_tanh = slope * (1 - tanh_square) + grad_grad_tanh = -2 * slope * tanh * grad_tanh + + grad = paddle.where(x >= threshold, grad_tanh, grad_silu) + grad_grad = paddle.where(x >= threshold, grad_grad_tanh, grad_grad_silu) + return grad_output * grad_grad * grad_grad_output, grad * grad_grad_output + + +class SiLUTScript(paddle.nn.Layer): + def __init__(self, threshold: float = 3.0): + super().__init__() + self.threshold = threshold + + # Precompute parameters for the tanh replacement + sigmoid_threshold = 1 / (1 + np.exp(-threshold)) + self.slope = float( + sigmoid_threshold + threshold * sigmoid_threshold * (1 - sigmoid_threshold) + ) + self.const_val = float(threshold * sigmoid_threshold) + self.get_script_code() + + def get_script_code(self): + silut_forward_script = paddle.jit.to_static(silut_forward, full_graph=True) + silut_backward_script = paddle.jit.to_static(silut_backward, full_graph=True) + silut_double_backward_script = paddle.jit.to_static( + silut_double_backward, full_graph=True + ) + + class SiLUTFunction(paddle.autograd.PyLayer): + @staticmethod + def forward(ctx, x, threshold, slope, const_val): + ctx.save_for_backward(x) + ctx.threshold = threshold + ctx.slope = slope + ctx.const_val = const_val + return silut_forward_script(x, threshold, slope, const_val) + + @staticmethod + def backward(ctx, grad_output): + (x,) = ctx.saved_tensor() + threshold = ctx.threshold + slope = ctx.slope + + grad_input = SiLUTGradFunction.apply(x, grad_output, threshold, slope) + return grad_input + + class SiLUTGradFunction(paddle.autograd.PyLayer): + @staticmethod + def forward(ctx, x, grad_output, threshold, slope): + ctx.threshold = threshold + ctx.slope = slope + grad_input = silut_backward_script(x, grad_output, threshold, slope) + ctx.save_for_backward(x, grad_output) + return grad_input + + @staticmethod + def backward(ctx, grad_grad_output): + (x, grad_output) = ctx.saved_tensor() + 
threshold = ctx.threshold + slope = ctx.slope + + grad_input, grad_mul_grad_grad_output = silut_double_backward_script( + x, grad_grad_output, grad_output, threshold, slope + ) + return grad_input, grad_mul_grad_grad_output + + self.SiLUTFunction = SiLUTFunction + + def forward(self, x): + return self.SiLUTFunction.apply(x, self.threshold, self.slope, self.const_val) + + class SiLUT(paddle.nn.Layer): def __init__(self, threshold=3.0): super().__init__() def sigmoid(x): - return paddle.nn.functional.sigmoid(x) + return F.sigmoid(x) def silu(x): - return paddle.nn.functional.silu(x) + return F.silu(x) def silu_grad(x): sig = sigmoid(x) @@ -76,7 +190,11 @@ def __init__(self, activation: str | None): threshold = ( float(self.activation.split(":")[-1]) if ":" in self.activation else 3.0 ) - self.silut = SiLUT(threshold=threshold) + if env.CUSTOM_OP_USE_JIT: + # for efficient training but can not be jit + self.silut = SiLUTScript(threshold=threshold) + else: + self.silut = SiLUT(threshold=threshold) else: self.silut = None diff --git a/deepmd/pt/cxx_op.py b/deepmd/pt/cxx_op.py index f7922a5c52..1106a8887f 100644 --- a/deepmd/pt/cxx_op.py +++ b/deepmd/pt/cxx_op.py @@ -1,5 +1,12 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import platform +from ctypes import ( + CDLL, + RTLD_GLOBAL, +) +from importlib import ( + metadata, +) import torch from packaging.version import ( @@ -87,6 +94,29 @@ def load_library(module_name: str) -> bool: return False +def load_mpi_library() -> None: + """Load MPI library. + + When building with cibuildwheel, the link to the MPI library is lost + after the wheel is repaired. 
+ """ + if platform.system() == "Linux": + libname = "libmpi.so.*" + elif platform.system() == "Darwin": + libname = "libmpi.*.dylib" + else: + raise RuntimeError("Unsupported platform") + MPI_LIB = next(p for p in metadata.files("mpich") if p.match(libname)).locate() + # use CDLL to load the library + CDLL(MPI_LIB, mode=RTLD_GLOBAL) + + +if GLOBAL_CONFIG.get("cibuildwheel", "0") == "1" and platform.system() in ( + "Linux", + "Darwin", +): + load_mpi_library() + ENABLE_CUSTOMIZED_OP = load_library("deepmd_op_pt") __all__ = [ diff --git a/deepmd/pt/entrypoints/main.py b/deepmd/pt/entrypoints/main.py index 0e248583ec..06a7603cc0 100644 --- a/deepmd/pt/entrypoints/main.py +++ b/deepmd/pt/entrypoints/main.py @@ -8,6 +8,7 @@ Path, ) from typing import ( + Any, Optional, Union, ) @@ -25,6 +26,7 @@ ) from deepmd.common import ( expand_sys_str, + j_loader, ) from deepmd.env import ( GLOBAL_CONFIG, @@ -94,20 +96,23 @@ def get_trainer( - config, - init_model=None, - restart_model=None, - finetune_model=None, - force_load=False, - init_frz_model=None, - shared_links=None, - finetune_links=None, -): + config: dict[str, Any], + init_model: Optional[str] = None, + restart_model: Optional[str] = None, + finetune_model: Optional[str] = None, + force_load: bool = False, + init_frz_model: Optional[str] = None, + shared_links: Optional[dict[str, Any]] = None, + finetune_links: Optional[dict[str, Any]] = None, +) -> training.Trainer: multi_task = "model_dict" in config.get("model", {}) def prepare_trainer_input_single( - model_params_single, data_dict_single, rank=0, seed=None - ): + model_params_single: dict[str, Any], + data_dict_single: dict[str, Any], + rank: int = 0, + seed: Optional[int] = None, + ) -> tuple[DpLoaderSet, Optional[DpLoaderSet], Optional[DPPath]]: training_dataset_params = data_dict_single["training_data"] validation_dataset_params = data_dict_single.get("validation_data", None) validation_systems = ( @@ -254,8 +259,7 @@ def train( env.CUSTOM_OP_USE_JIT = True if 
LOCAL_RANK == 0: SummaryPrinter()() - with open(input_file) as fin: - config = json.load(fin) + config = j_loader(input_file) # ensure suffix, as in the command line help, we say "path prefix of checkpoint files" if init_model is not None and not init_model.endswith(".pt"): init_model += ".pt" diff --git a/deepmd/pt/infer/deep_eval.py b/deepmd/pt/infer/deep_eval.py index c4d5d028ce..f3e52cdac0 100644 --- a/deepmd/pt/infer/deep_eval.py +++ b/deepmd/pt/infer/deep_eval.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import json +import logging from typing import ( TYPE_CHECKING, Any, @@ -64,10 +65,22 @@ to_numpy_array, to_torch_tensor, ) +from deepmd.utils.econf_embd import ( + sort_element_type, +) +from deepmd.utils.model_branch_dict import ( + get_model_dict, +) if TYPE_CHECKING: import ase.neighborlist + from deepmd.pt.model.model.model import ( + BaseModel, + ) + +log = logging.getLogger(__name__) + class DeepEval(DeepEvalBackend): """PyTorch backend implementation of DeepEval. @@ -80,7 +93,7 @@ class DeepEval(DeepEvalBackend): The output definition of the model. *args : list Positional arguments. - auto_batch_size : bool or int or AutomaticBatchSize, default: False + auto_batch_size : bool or int or AutomaticBatchSize, default: True If True, automatic batch size will be used. If int, it will be used as the initial batch size. 
neighbor_list : ase.neighborlist.NewPrimitiveNeighborList, optional @@ -98,6 +111,7 @@ def __init__( auto_batch_size: Union[bool, int, AutoBatchSize] = True, neighbor_list: Optional["ase.neighborlist.NewPrimitiveNeighborList"] = None, head: Optional[Union[str, int]] = None, + no_jit: bool = False, **kwargs: Any, ) -> None: self.output_def = output_def @@ -112,15 +126,36 @@ def __init__( self.model_def_script = self.input_param self.multi_task = "model_dict" in self.input_param if self.multi_task: + model_alias_dict, model_branch_dict = get_model_dict( + self.input_param["model_dict"] + ) model_keys = list(self.input_param["model_dict"].keys()) + if head is None and "Default" in model_alias_dict: + head = "Default" + log.info( + f"Using default head {model_alias_dict[head]} for multitask model." + ) if isinstance(head, int): head = model_keys[0] assert head is not None, ( - f"Head must be set for multitask model! Available heads are: {model_keys}" + f"Head must be set for multitask model! Available heads are: {model_keys}, " + f"use `dp --pt show your_model.pt model-branch` to show detail information." ) - assert head in model_keys, ( - f"No head named {head} in model! Available heads are: {model_keys}" + if head not in model_alias_dict: + # preprocess with potentially case-insensitive input + head_lower = head.lower() + for mk in model_alias_dict: + if mk.lower() == head_lower: + # mapped the first matched head + head = mk + break + # replace with alias + assert head in model_alias_dict, ( + f"No head or alias named {head} in model! Available heads are: {model_keys}," + f"use `dp --pt show your_model.pt model-branch` to show detail information." 
) + head = model_alias_dict[head] + self.input_param = self.input_param["model_dict"][head] state_dict_head = {"_extra_state": state_dict["_extra_state"]} for item in state_dict: @@ -130,7 +165,7 @@ def __init__( ] = state_dict[item].clone() state_dict = state_dict_head model = get_model(self.input_param).to(DEVICE) - if not self.input_param.get("hessian_mode"): + if not self.input_param.get("hessian_mode") and not no_jit: model = torch.jit.script(model) self.dp = ModelWrapper(model) self.dp.load_state_dict(state_dict) @@ -183,6 +218,14 @@ def get_dim_aparam(self) -> int: """Get the number (dimension) of atomic parameters of this DP.""" return self.dp.model["Default"].get_dim_aparam() + def has_default_fparam(self) -> bool: + """Check if the model has default frame parameters.""" + try: + return self.dp.model["Default"].has_default_fparam() + except AttributeError: + # for compatibility with old models + return False + def get_intensive(self) -> bool: return self.dp.model["Default"].get_intensive() @@ -241,14 +284,25 @@ def get_ntypes_spin(self) -> int: """Get the number of spin atom types of this model. 
Only used in old implement.""" return 0 - def get_has_spin(self): + def get_has_spin(self) -> bool: """Check if the model has spin atom types.""" return self._has_spin - def get_has_hessian(self): + def get_has_hessian(self) -> bool: """Check if the model has hessian.""" return self._has_hessian + def get_model_branch(self) -> tuple[dict[str, str], dict[str, dict[str, Any]]]: + """Get the model branch information.""" + if "model_dict" in self.model_def_script: + model_alias_dict, model_branch_dict = get_model_dict( + self.model_def_script["model_dict"] + ) + return model_alias_dict, model_branch_dict + else: + # single-task model + return {"Default": "Default"}, {"Default": {"alias": [], "info": {}}} + def eval( self, coords: np.ndarray, @@ -377,7 +431,7 @@ def _eval_func(self, inner_func: Callable, numb_test: int, natoms: int) -> Calla """ if self.auto_batch_size is not None: - def eval_func(*args, **kwargs): + def eval_func(*args: Any, **kwargs: Any) -> Any: return self.auto_batch_size.execute_all( inner_func, numb_test, natoms, *args, **kwargs ) @@ -411,7 +465,7 @@ def _eval_model( fparam: Optional[np.ndarray], aparam: Optional[np.ndarray], request_defs: list[OutputVariableDef], - ): + ) -> tuple[np.ndarray, ...]: model = self.dp.to(DEVICE) prec = NP_PRECISION_DICT[RESERVED_PRECISION_DICT[GLOBAL_PT_FLOAT_PRECISION]] @@ -489,7 +543,7 @@ def _eval_model_spin( fparam: Optional[np.ndarray], aparam: Optional[np.ndarray], request_defs: list[OutputVariableDef], - ): + ) -> tuple[np.ndarray, ...]: model = self.dp.to(DEVICE) nframes = coords.shape[0] @@ -566,7 +620,9 @@ def _eval_model_spin( ) # this is kinda hacky return tuple(results) - def _get_output_shape(self, odef, nframes, natoms): + def _get_output_shape( + self, odef: OutputVariableDef, nframes: int, natoms: int + ) -> list[int]: if odef.category == OutputVariableCategory.DERV_C_REDU: # virial return [nframes, *odef.shape[:-1], 9] @@ -648,6 +704,32 @@ def get_model_size(self) -> dict: "total": sum_param_des + 
sum_param_fit, } + def get_observed_types(self) -> dict: + """Get observed types (elements) of the model during data statistics. + + Returns + ------- + dict + A dictionary containing the information of observed type in the model: + - 'type_num': the total number of observed types in this model. + - 'observed_type': a list of the observed types in this model. + """ + observed_type_list = self.dp.model["Default"].get_observed_type_list() + return { + "type_num": len(observed_type_list), + "observed_type": sort_element_type(observed_type_list), + } + + def get_model(self) -> "BaseModel": + """Get the PyTorch model. + + Returns + ------- + BaseModel + The PyTorch model instance. + """ + return self.dp.model["Default"] + def eval_descriptor( self, coords: np.ndarray, @@ -702,3 +784,58 @@ def eval_descriptor( descriptor = model.eval_descriptor() model.set_eval_descriptor_hook(False) return to_numpy_array(descriptor) + + def eval_fitting_last_layer( + self, + coords: np.ndarray, + cells: Optional[np.ndarray], + atom_types: np.ndarray, + fparam: Optional[np.ndarray] = None, + aparam: Optional[np.ndarray] = None, + **kwargs: Any, + ) -> np.ndarray: + """Evaluate fitting before last layer by using this DP. + + Parameters + ---------- + coords + The coordinates of atoms. + The array should be of size nframes x natoms x 3 + cells + The cell of the region. + If None then non-PBC is assumed, otherwise using PBC. + The array should be of size nframes x 9 + atom_types + The atom types + The list should contain natoms ints + fparam + The frame parameter. + The array can be of size : + - nframes x dim_fparam. + - dim_fparam. Then all frames are assumed to be provided with the same fparam. + aparam + The atomic parameter + The array can be of size : + - nframes x natoms x dim_aparam. + - natoms x dim_aparam. Then all frames are assumed to be provided with the same aparam. + - dim_aparam. Then all frames and atoms are provided with the same aparam. 
+ + Returns + ------- + fitting + Fitting output before last layer. + """ + model = self.dp.model["Default"] + model.set_eval_fitting_last_layer_hook(True) + self.eval( + coords, + cells, + atom_types, + atomic=False, + fparam=fparam, + aparam=aparam, + **kwargs, + ) + fitting_net = model.eval_fitting_last_layer() + model.set_eval_fitting_last_layer_hook(False) + return to_numpy_array(fitting_net) diff --git a/deepmd/pt/infer/inference.py b/deepmd/pt/infer/inference.py index dd0e7eaccb..ac11d160aa 100644 --- a/deepmd/pt/infer/inference.py +++ b/deepmd/pt/infer/inference.py @@ -3,6 +3,10 @@ from copy import ( deepcopy, ) +from typing import ( + Optional, + Union, +) import torch @@ -25,8 +29,8 @@ class Tester: def __init__( self, - model_ckpt, - head=None, + model_ckpt: Union[str, torch.nn.Module], + head: Optional[str] = None, ) -> None: """Construct a DeePMD tester. diff --git a/deepmd/pt/loss/denoise.py b/deepmd/pt/loss/denoise.py index 574210adb6..c8eeff6185 100644 --- a/deepmd/pt/loss/denoise.py +++ b/deepmd/pt/loss/denoise.py @@ -1,4 +1,8 @@ # SPDX-License-Identifier: LGPL-3.0-or-later +from typing import ( + Any, +) + import torch import torch.nn.functional as F @@ -13,15 +17,15 @@ class DenoiseLoss(TaskLoss): def __init__( self, - ntypes, - masked_token_loss=1.0, - masked_coord_loss=1.0, - norm_loss=0.01, - use_l1=True, - beta=1.00, - mask_loss_coord=True, - mask_loss_token=True, - **kwargs, + ntypes: int, + masked_token_loss: float = 1.0, + masked_coord_loss: float = 1.0, + norm_loss: float = 0.01, + use_l1: bool = True, + beta: float = 1.00, + mask_loss_coord: bool = True, + mask_loss_token: bool = True, + **kwargs: Any, ) -> None: """Construct a layer to compute loss on coord, and type reconstruction.""" super().__init__() @@ -38,7 +42,14 @@ def __init__( self.mask_loss_coord = mask_loss_coord self.mask_loss_token = mask_loss_token - def forward(self, model_pred, label, natoms, learning_rate, mae=False): + def forward( + self, + model_pred: dict[str, 
torch.Tensor], + label: dict[str, torch.Tensor], + natoms: int, + learning_rate: float, + mae: bool = False, + ) -> tuple[torch.Tensor, dict[str, torch.Tensor]]: """Return loss on coord and type denoise. Returns diff --git a/deepmd/pt/loss/dos.py b/deepmd/pt/loss/dos.py index 493cc85694..bc77f34437 100644 --- a/deepmd/pt/loss/dos.py +++ b/deepmd/pt/loss/dos.py @@ -1,4 +1,7 @@ # SPDX-License-Identifier: LGPL-3.0-or-later +from typing import ( + Any, +) import torch @@ -26,8 +29,8 @@ def __init__( limit_pref_ados: float = 0.0, start_pref_acdf: float = 0.0, limit_pref_acdf: float = 0.0, - inference=False, - **kwargs, + inference: bool = False, + **kwargs: Any, ) -> None: r"""Construct a loss for local and global tensors. @@ -85,7 +88,15 @@ def __init__( ) ) - def forward(self, input_dict, model, label, natoms, learning_rate=0.0, mae=False): + def forward( + self, + input_dict: dict[str, torch.Tensor], + model: torch.nn.Module, + label: dict[str, torch.Tensor], + natoms: int, + learning_rate: float = 0.0, + mae: bool = False, + ) -> tuple[dict[str, torch.Tensor], torch.Tensor, dict[str, torch.Tensor]]: """Return loss on local and global tensors. 
Parameters diff --git a/deepmd/pt/loss/ener.py b/deepmd/pt/loss/ener.py index 10e2bf9971..cccdc8949e 100644 --- a/deepmd/pt/loss/ener.py +++ b/deepmd/pt/loss/ener.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( + Any, Optional, ) @@ -23,7 +24,9 @@ ) -def custom_huber_loss(predictions, targets, delta=1.0): +def custom_huber_loss( + predictions: torch.Tensor, targets: torch.Tensor, delta: float = 1.0 +) -> torch.Tensor: error = targets - predictions abs_error = torch.abs(error) quadratic_loss = 0.5 * torch.pow(error, 2) @@ -35,13 +38,13 @@ def custom_huber_loss(predictions, targets, delta=1.0): class EnergyStdLoss(TaskLoss): def __init__( self, - starter_learning_rate=1.0, - start_pref_e=0.0, - limit_pref_e=0.0, - start_pref_f=0.0, - limit_pref_f=0.0, - start_pref_v=0.0, - limit_pref_v=0.0, + starter_learning_rate: float = 1.0, + start_pref_e: float = 0.0, + limit_pref_e: float = 0.0, + start_pref_f: float = 0.0, + limit_pref_f: float = 0.0, + start_pref_v: float = 0.0, + limit_pref_v: float = 0.0, start_pref_ae: float = 0.0, limit_pref_ae: float = 0.0, start_pref_pf: float = 0.0, @@ -52,10 +55,10 @@ def __init__( limit_pref_gf: float = 0.0, numb_generalized_coord: int = 0, use_l1_all: bool = False, - inference=False, - use_huber=False, - huber_delta=0.01, - **kwargs, + inference: bool = False, + use_huber: bool = False, + huber_delta: float = 0.01, + **kwargs: Any, ) -> None: r"""Construct a layer to compute loss on energy, force and virial. @@ -149,7 +152,15 @@ def __init__( "Huber loss is not implemented for force with atom_pref, generalized force and relative force. 
" ) - def forward(self, input_dict, model, label, natoms, learning_rate, mae=False): + def forward( + self, + input_dict: dict[str, torch.Tensor], + model: torch.nn.Module, + label: dict[str, torch.Tensor], + natoms: int, + learning_rate: float, + mae: bool = False, + ) -> tuple[dict[str, torch.Tensor], torch.Tensor, dict[str, torch.Tensor]]: """Return loss on energy and force. Parameters @@ -528,10 +539,10 @@ def deserialize(cls, data: dict) -> "TaskLoss": class EnergyHessianStdLoss(EnergyStdLoss): def __init__( self, - start_pref_h=0.0, - limit_pref_h=0.0, - **kwargs, - ): + start_pref_h: float = 0.0, + limit_pref_h: float = 0.0, + **kwargs: Any, + ) -> None: r"""Enable the layer to compute loss on hessian. Parameters @@ -549,7 +560,15 @@ def __init__( self.start_pref_h = start_pref_h self.limit_pref_h = limit_pref_h - def forward(self, input_dict, model, label, natoms, learning_rate, mae=False): + def forward( + self, + input_dict: dict[str, torch.Tensor], + model: torch.nn.Module, + label: dict[str, torch.Tensor], + natoms: int, + learning_rate: float, + mae: bool = False, + ) -> tuple[dict[str, torch.Tensor], torch.Tensor, dict[str, torch.Tensor]]: model_pred, loss, more_loss = super().forward( input_dict, model, label, natoms, learning_rate, mae=mae ) diff --git a/deepmd/pt/loss/ener_spin.py b/deepmd/pt/loss/ener_spin.py index 6a926f4051..9b87d4234f 100644 --- a/deepmd/pt/loss/ener_spin.py +++ b/deepmd/pt/loss/ener_spin.py @@ -1,4 +1,7 @@ # SPDX-License-Identifier: LGPL-3.0-or-later +from typing import ( + Any, +) import torch import torch.nn.functional as F @@ -20,21 +23,21 @@ class EnergySpinLoss(TaskLoss): def __init__( self, - starter_learning_rate=1.0, - start_pref_e=0.0, - limit_pref_e=0.0, - start_pref_fr=0.0, - limit_pref_fr=0.0, - start_pref_fm=0.0, - limit_pref_fm=0.0, - start_pref_v=0.0, - limit_pref_v=0.0, + starter_learning_rate: float = 1.0, + start_pref_e: float = 0.0, + limit_pref_e: float = 0.0, + start_pref_fr: float = 0.0, + limit_pref_fr: 
float = 0.0, + start_pref_fm: float = 0.0, + limit_pref_fm: float = 0.0, + start_pref_v: float = 0.0, + limit_pref_v: float = 0.0, start_pref_ae: float = 0.0, limit_pref_ae: float = 0.0, enable_atom_ener_coeff: bool = False, use_l1_all: bool = False, - inference=False, - **kwargs, + inference: bool = False, + **kwargs: Any, ) -> None: r"""Construct a layer to compute loss on energy, real force, magnetic force and virial. @@ -93,7 +96,15 @@ def __init__( self.use_l1_all = use_l1_all self.inference = inference - def forward(self, input_dict, model, label, natoms, learning_rate, mae=False): + def forward( + self, + input_dict: dict[str, torch.Tensor], + model: torch.nn.Module, + label: dict[str, torch.Tensor], + natoms: int, + learning_rate: float, + mae: bool = False, + ) -> tuple[dict[str, torch.Tensor], torch.Tensor, dict[str, torch.Tensor]]: """Return energy loss with magnetic labels. Parameters diff --git a/deepmd/pt/loss/loss.py b/deepmd/pt/loss/loss.py index d1777a29b3..13cad6f59b 100644 --- a/deepmd/pt/loss/loss.py +++ b/deepmd/pt/loss/loss.py @@ -4,7 +4,9 @@ abstractmethod, ) from typing import ( + Any, NoReturn, + Union, ) import torch @@ -18,11 +20,18 @@ class TaskLoss(torch.nn.Module, ABC, make_plugin_registry("loss")): - def __init__(self, **kwargs) -> None: + def __init__(self, **kwargs: Any) -> None: """Construct loss.""" super().__init__() - def forward(self, input_dict, model, label, natoms, learning_rate) -> NoReturn: + def forward( + self, + input_dict: dict[str, torch.Tensor], + model: torch.nn.Module, + label: dict[str, torch.Tensor], + natoms: int, + learning_rate: Union[float, torch.Tensor], + ) -> NoReturn: """Return loss .""" raise NotImplementedError diff --git a/deepmd/pt/loss/property.py b/deepmd/pt/loss/property.py index 9d42c81b45..1cd842650d 100644 --- a/deepmd/pt/loss/property.py +++ b/deepmd/pt/loss/property.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import logging from typing import ( + Any, Union, ) @@ -23,15 
+24,15 @@ class PropertyLoss(TaskLoss): def __init__( self, - task_dim, + task_dim: int, var_name: str, loss_func: str = "smooth_mae", - metric: list = ["mae"], + metric: list[str] = ["mae"], beta: float = 1.00, out_bias: Union[list, None] = None, out_std: Union[list, None] = None, intensive: bool = False, - **kwargs, + **kwargs: Any, ) -> None: r"""Construct a layer to compute loss on property. @@ -42,7 +43,7 @@ def __init__( var_name : str The atomic property to fit, 'energy', 'dipole', and 'polar'. loss_func : str - The loss function, such as "smooth_mae", "mae", "rmse". + The loss function, such as "smooth_mae", "mae", "rmse", "mape". metric : list The metric such as mae, rmse which will be printed. beta : float @@ -66,7 +67,15 @@ def __init__( self.intensive = intensive self.var_name = var_name - def forward(self, input_dict, model, label, natoms, learning_rate=0.0, mae=False): + def forward( + self, + input_dict: dict[str, torch.Tensor], + model: torch.nn.Module, + label: dict[str, torch.Tensor], + natoms: int, + learning_rate: float = 0.0, + mae: bool = False, + ) -> tuple[dict[str, torch.Tensor], torch.Tensor, dict[str, torch.Tensor]]: """Return loss on properties . 
Parameters @@ -151,6 +160,12 @@ def forward(self, input_dict, model, label, natoms, learning_rate=0.0, mae=False reduction="mean", ) ) + elif self.loss_func == "mape": + loss += torch.mean( + torch.abs( + (label[var_name] - model_pred[var_name]) / (label[var_name] + 1e-3) + ) + ) else: raise RuntimeError(f"Unknown loss function : {self.loss_func}") @@ -182,6 +197,12 @@ def forward(self, input_dict, model, label, natoms, learning_rate=0.0, mae=False reduction="mean", ) ).detach() + if "mape" in self.metric: + more_loss["mape"] = torch.mean( + torch.abs( + (label[var_name] - model_pred[var_name]) / (label[var_name] + 1e-3) + ) + ).detach() return model_pred, loss, more_loss diff --git a/deepmd/pt/loss/tensor.py b/deepmd/pt/loss/tensor.py index 0acc3989be..625a9b30bc 100644 --- a/deepmd/pt/loss/tensor.py +++ b/deepmd/pt/loss/tensor.py @@ -1,4 +1,7 @@ # SPDX-License-Identifier: LGPL-3.0-or-later +from typing import ( + Any, +) import torch @@ -21,9 +24,9 @@ def __init__( label_name: str, pref_atomic: float = 0.0, pref: float = 0.0, - inference=False, + inference: bool = False, enable_atomic_weight: bool = False, - **kwargs, + **kwargs: Any, ) -> None: r"""Construct a loss for local and global tensors. @@ -64,7 +67,15 @@ def __init__( "Can not assian zero weight both to `pref` and `pref_atomic`" ) - def forward(self, input_dict, model, label, natoms, learning_rate=0.0, mae=False): + def forward( + self, + input_dict: dict[str, torch.Tensor], + model: torch.nn.Module, + label: dict[str, torch.Tensor], + natoms: int, + learning_rate: float = 0.0, + mae: bool = False, + ) -> tuple[dict[str, torch.Tensor], torch.Tensor, dict[str, torch.Tensor]]: """Return loss on local and global tensors. 
Parameters diff --git a/deepmd/pt/model/atomic_model/base_atomic_model.py b/deepmd/pt/model/atomic_model/base_atomic_model.py index 56af5f4f43..b8ba0a1981 100644 --- a/deepmd/pt/model/atomic_model/base_atomic_model.py +++ b/deepmd/pt/model/atomic_model/base_atomic_model.py @@ -106,7 +106,7 @@ def init_out_stat(self) -> None: def set_out_bias(self, out_bias: torch.Tensor) -> None: self.out_bias = out_bias - def __setitem__(self, key, value) -> None: + def __setitem__(self, key: str, value: torch.Tensor) -> None: if key in ["out_bias"]: self.out_bias = value elif key in ["out_std"]: @@ -114,7 +114,7 @@ def __setitem__(self, key, value) -> None: else: raise KeyError(key) - def __getitem__(self, key): + def __getitem__(self, key: str) -> torch.Tensor: if key in ["out_bias"]: return self.out_bias elif key in ["out_std"]: @@ -135,6 +135,10 @@ def get_intensive(self) -> bool: """Whether the fitting property is intensive.""" return False + def has_default_fparam(self) -> bool: + """Check if the model has default frame parameters.""" + return False + def reinit_atom_exclude( self, exclude_types: list[int] = [], @@ -296,7 +300,9 @@ def forward( ) def change_type_map( - self, type_map: list[str], model_with_new_type_stat=None + self, + type_map: list[str], + model_with_new_type_stat: Optional["BaseAtomicModel"] = None, ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. @@ -363,21 +369,25 @@ def compute_or_load_stat( self, merged: Union[Callable[[], list[dict]], list[dict]], stat_file_path: Optional[DPPath] = None, + compute_or_load_out_stat: bool = True, ) -> NoReturn: """ - Compute the output statistics (e.g. energy bias) for the fitting net from packed data. 
+ Compute or load the statistics parameters of the model, + such as mean and standard deviation of descriptors or the energy bias of the fitting net. + When `sampled` is provided, all the statistics parameters will be calculated (or re-calculated for update), + and saved in the `stat_file_path`(s). + When `sampled` is not provided, it will check the existence of `stat_file_path`(s) + and load the calculated statistics parameters. Parameters ---------- - merged : Union[Callable[[], list[dict]], list[dict]] - - list[dict]: A list of data samples from various data systems. - Each element, `merged[i]`, is a data dictionary containing `keys`: `torch.Tensor` - originating from the `i`-th data system. - - Callable[[], list[dict]]: A lazy function that returns data samples in the above format - only when needed. Since the sampling process can be slow and memory-intensive, - the lazy function helps by only sampling once. - stat_file_path : Optional[DPPath] - The path to the stat file. + merged + The lazy sampled function to get data frames from different data systems. + stat_file_path + The dictionary of paths to the statistics files. + compute_or_load_out_stat : bool + Whether to compute the output statistics. + If False, it will only compute the input statistics (e.g. mean and standard deviation of descriptors). """ raise NotImplementedError @@ -413,7 +423,7 @@ def apply_out_stat( self, ret: dict[str, torch.Tensor], atype: torch.Tensor, - ): + ) -> dict[str, torch.Tensor]: """Apply the stat to each atomic output. The developer may override the method to define how the bias is applied to the atomic output of the model. 
@@ -434,9 +444,9 @@ def apply_out_stat( def change_out_bias( self, - sample_merged, + sample_merged: Union[Callable[[], list[dict]], list[dict]], stat_file_path: Optional[DPPath] = None, - bias_adjust_mode="change-by-statistic", + bias_adjust_mode: str = "change-by-statistic", ) -> None: """Change the output bias according to the input data and the pretrained model. @@ -486,7 +496,13 @@ def change_out_bias( def _get_forward_wrapper_func(self) -> Callable[..., torch.Tensor]: """Get a forward wrapper of the atomic model for output bias calculation.""" - def model_forward(coord, atype, box, fparam=None, aparam=None): + def model_forward( + coord: torch.Tensor, + atype: torch.Tensor, + box: Optional[torch.Tensor], + fparam: Optional[torch.Tensor] = None, + aparam: Optional[torch.Tensor] = None, + ) -> dict[str, torch.Tensor]: with ( torch.no_grad() ): # it's essential for pure torch forward function to use auto_batchsize @@ -515,13 +531,13 @@ def model_forward(coord, atype, box, fparam=None, aparam=None): return model_forward - def _default_bias(self): + def _default_bias(self) -> torch.Tensor: ntypes = self.get_ntypes() return torch.zeros( [self.n_out, ntypes, self.max_out_size], dtype=dtype, device=device ) - def _default_std(self): + def _default_std(self) -> torch.Tensor: ntypes = self.get_ntypes() return torch.ones( [self.n_out, ntypes, self.max_out_size], dtype=dtype, device=device diff --git a/deepmd/pt/model/atomic_model/dipole_atomic_model.py b/deepmd/pt/model/atomic_model/dipole_atomic_model.py index 3796aa2e83..c9badefcad 100644 --- a/deepmd/pt/model/atomic_model/dipole_atomic_model.py +++ b/deepmd/pt/model/atomic_model/dipole_atomic_model.py @@ -1,4 +1,7 @@ # SPDX-License-Identifier: LGPL-3.0-or-later +from typing import ( + Any, +) import torch @@ -12,7 +15,9 @@ class DPDipoleAtomicModel(DPAtomicModel): - def __init__(self, descriptor, fitting, type_map, **kwargs): + def __init__( + self, descriptor: Any, fitting: Any, type_map: Any, **kwargs: Any + ) -> 
None: if not isinstance(fitting, DipoleFittingNet): raise TypeError( "fitting must be an instance of DipoleFittingNet for DPDipoleAtomicModel" @@ -23,6 +28,6 @@ def apply_out_stat( self, ret: dict[str, torch.Tensor], atype: torch.Tensor, - ): + ) -> dict[str, torch.Tensor]: # dipole not applying bias return ret diff --git a/deepmd/pt/model/atomic_model/dos_atomic_model.py b/deepmd/pt/model/atomic_model/dos_atomic_model.py index 2af1a4e052..7bc0108fc5 100644 --- a/deepmd/pt/model/atomic_model/dos_atomic_model.py +++ b/deepmd/pt/model/atomic_model/dos_atomic_model.py @@ -1,4 +1,8 @@ # SPDX-License-Identifier: LGPL-3.0-or-later +from typing import ( + Any, +) + from deepmd.pt.model.task.dos import ( DOSFittingNet, ) @@ -9,7 +13,9 @@ class DPDOSAtomicModel(DPAtomicModel): - def __init__(self, descriptor, fitting, type_map, **kwargs): + def __init__( + self, descriptor: Any, fitting: Any, type_map: Any, **kwargs: Any + ) -> None: if not isinstance(fitting, DOSFittingNet): raise TypeError( "fitting must be an instance of DOSFittingNet for DPDOSAtomicModel" diff --git a/deepmd/pt/model/atomic_model/dp_atomic_model.py b/deepmd/pt/model/atomic_model/dp_atomic_model.py index 5a5655b72c..5b7d96560f 100644 --- a/deepmd/pt/model/atomic_model/dp_atomic_model.py +++ b/deepmd/pt/model/atomic_model/dp_atomic_model.py @@ -2,6 +2,8 @@ import functools import logging from typing import ( + Any, + Callable, Optional, ) @@ -47,10 +49,10 @@ class DPAtomicModel(BaseAtomicModel): def __init__( self, - descriptor, - fitting, + descriptor: BaseDescriptor, + fitting: BaseFitting, type_map: list[str], - **kwargs, + **kwargs: Any, ) -> None: super().__init__(type_map, **kwargs) ntypes = len(type_map) @@ -62,9 +64,12 @@ def __init__( self.fitting_net = fitting super().init_out_stat() self.enable_eval_descriptor_hook = False + self.enable_eval_fitting_last_layer_hook = False self.eval_descriptor_list = [] + self.eval_fitting_last_layer_list = [] eval_descriptor_list: list[torch.Tensor] + 
eval_fitting_last_layer_list: list[torch.Tensor] def set_eval_descriptor_hook(self, enable: bool) -> None: """Set the hook for evaluating descriptor and clear the cache for descriptor list.""" @@ -76,6 +81,17 @@ def eval_descriptor(self) -> torch.Tensor: """Evaluate the descriptor.""" return torch.concat(self.eval_descriptor_list) + def set_eval_fitting_last_layer_hook(self, enable: bool) -> None: + """Set the hook for evaluating fitting last layer output and clear the cache for fitting last layer output list.""" + self.enable_eval_fitting_last_layer_hook = enable + self.fitting_net.set_return_middle_output(enable) + # = [] does not work; See #4533 + self.eval_fitting_last_layer_list.clear() + + def eval_fitting_last_layer(self) -> torch.Tensor: + """Evaluate the fitting last layer output.""" + return torch.concat(self.eval_fitting_last_layer_list) + @torch.jit.export def fitting_output_def(self) -> FittingOutputDef: """Get the output def of the fitting net.""" @@ -94,7 +110,7 @@ def get_sel(self) -> list[int]: """Get the neighbor selection.""" return self.sel - def set_case_embd(self, case_idx: int): + def set_case_embd(self, case_idx: int) -> None: """ Set the case embedding of this atomic model by the given case_idx, typically concatenated with the output of the descriptor and fed into the fitting net. @@ -114,7 +130,9 @@ def mixed_types(self) -> bool: return self.descriptor.mixed_types() def change_type_map( - self, type_map: list[str], model_with_new_type_stat=None + self, + type_map: list[str], + model_with_new_type_stat: Optional["DPAtomicModel"] = None, ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. 
@@ -155,7 +173,7 @@ def serialize(self) -> dict: return dd @classmethod - def deserialize(cls, data) -> "DPAtomicModel": + def deserialize(cls, data: dict) -> "DPAtomicModel": data = data.copy() check_version_compatibility(data.pop("@version", 1), 2, 1) data.pop("@class", None) @@ -200,9 +218,9 @@ def enable_compression( def forward_atomic( self, - extended_coord, - extended_atype, - nlist, + extended_coord: torch.Tensor, + extended_atype: torch.Tensor, + nlist: torch.Tensor, mapping: Optional[torch.Tensor] = None, fparam: Optional[torch.Tensor] = None, aparam: Optional[torch.Tensor] = None, @@ -255,6 +273,13 @@ def forward_atomic( fparam=fparam, aparam=aparam, ) + if self.enable_eval_fitting_last_layer_hook: + assert "middle_output" in fit_ret, ( + "eval_fitting_last_layer not supported for this fitting net!" + ) + self.eval_fitting_last_layer_list.append( + fit_ret.pop("middle_output").detach() + ) return fit_ret def get_out_bias(self) -> torch.Tensor: @@ -262,8 +287,9 @@ def get_out_bias(self) -> torch.Tensor: def compute_or_load_stat( self, - sampled_func, + sampled_func: Callable[[], list[dict]], stat_file_path: Optional[DPPath] = None, + compute_or_load_out_stat: bool = True, ) -> None: """ Compute or load the statistics parameters of the model, @@ -279,6 +305,9 @@ def compute_or_load_stat( The lazy sampled function to get data frames from different data systems. stat_file_path The dictionary of paths to the statistics files. + compute_or_load_out_stat : bool + Whether to compute the output statistics. + If False, it will only compute the input statistics (e.g. mean and standard deviation of descriptors). 
""" if stat_file_path is not None and self.type_map is not None: # descriptors and fitting net with different type_map @@ -286,7 +315,7 @@ def compute_or_load_stat( stat_file_path /= " ".join(self.type_map) @functools.lru_cache - def wrapped_sampler(): + def wrapped_sampler() -> list[dict]: sampled = sampled_func() if self.pair_excl is not None: pair_exclude_types = self.pair_excl.get_exclude_types() @@ -302,12 +331,17 @@ def wrapped_sampler(): self.fitting_net.compute_input_stats( wrapped_sampler, protection=self.data_stat_protect ) - self.compute_or_load_out_stat(wrapped_sampler, stat_file_path) + if compute_or_load_out_stat: + self.compute_or_load_out_stat(wrapped_sampler, stat_file_path) def get_dim_fparam(self) -> int: """Get the number (dimension) of frame parameters of this atomic model.""" return self.fitting_net.get_dim_fparam() + def has_default_fparam(self) -> bool: + """Check if the model has default frame parameters.""" + return self.fitting_net.has_default_fparam() + def get_dim_aparam(self) -> int: """Get the number (dimension) of atomic parameters of this atomic model.""" return self.fitting_net.get_dim_aparam() diff --git a/deepmd/pt/model/atomic_model/energy_atomic_model.py b/deepmd/pt/model/atomic_model/energy_atomic_model.py index 6d894b4aab..9f513fc53d 100644 --- a/deepmd/pt/model/atomic_model/energy_atomic_model.py +++ b/deepmd/pt/model/atomic_model/energy_atomic_model.py @@ -1,4 +1,8 @@ # SPDX-License-Identifier: LGPL-3.0-or-later +from typing import ( + Any, +) + from deepmd.pt.model.task.ener import ( EnergyFittingNet, EnergyFittingNetDirect, @@ -11,7 +15,9 @@ class DPEnergyAtomicModel(DPAtomicModel): - def __init__(self, descriptor, fitting, type_map, **kwargs): + def __init__( + self, descriptor: Any, fitting: Any, type_map: Any, **kwargs: Any + ) -> None: if not ( isinstance(fitting, EnergyFittingNet) or isinstance(fitting, EnergyFittingNetDirect) diff --git a/deepmd/pt/model/atomic_model/linear_atomic_model.py 
b/deepmd/pt/model/atomic_model/linear_atomic_model.py index 3d894dc3a0..b510448ec3 100644 --- a/deepmd/pt/model/atomic_model/linear_atomic_model.py +++ b/deepmd/pt/model/atomic_model/linear_atomic_model.py @@ -1,5 +1,7 @@ # SPDX-License-Identifier: LGPL-3.0-or-later +import functools from typing import ( + Any, Callable, Optional, Union, @@ -56,7 +58,7 @@ def __init__( models: list[BaseAtomicModel], type_map: list[str], weights: Optional[Union[str, list[float]]] = "mean", - **kwargs, + **kwargs: Any, ) -> None: super().__init__(type_map, **kwargs) super().init_out_stat() @@ -135,7 +137,9 @@ def get_type_map(self) -> list[str]: return self.type_map def change_type_map( - self, type_map: list[str], model_with_new_type_stat=None + self, + type_map: list[str], + model_with_new_type_stat: Optional["LinearEnergyAtomicModel"] = None, ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. @@ -158,7 +162,7 @@ def get_model_rcuts(self) -> list[float]: def get_sel(self) -> list[int]: return [max([model.get_nsel() for model in self.models])] - def set_case_embd(self, case_idx: int): + def set_case_embd(self, case_idx: int) -> None: """ Set the case embedding of this atomic model by the given case_idx, typically concatenated with the output of the descriptor and fed into the fitting net. @@ -290,6 +294,7 @@ def forward_atomic( mapping, fparam, aparam, + comm_dict=comm_dict, )["energy"] ) weights = self._compute_weight(extended_coord, extended_atype, nlists_) @@ -306,7 +311,7 @@ def apply_out_stat( self, ret: dict[str, torch.Tensor], atype: torch.Tensor, - ): + ) -> dict[str, torch.Tensor]: """Apply the stat to each atomic output. The developer may override the method to define how the bias is applied to the atomic output of the model. @@ -319,6 +324,10 @@ def apply_out_stat( The atom types. 
nf x nloc """ + out_bias, out_std = self._fetch_out_stat(self.bias_keys) + for kk in self.bias_keys: + # nf x nloc x odims, out_bias: ntypes x odims + ret[kk] = ret[kk] + out_bias[kk][atype] return ret @staticmethod @@ -464,34 +473,11 @@ def is_aparam_nall(self) -> bool: """ return False - def compute_or_load_out_stat( - self, - merged: Union[Callable[[], list[dict]], list[dict]], - stat_file_path: Optional[DPPath] = None, - ) -> None: - """ - Compute the output statistics (e.g. energy bias) for the fitting net from packed data. - - Parameters - ---------- - merged : Union[Callable[[], list[dict]], list[dict]] - - list[dict]: A list of data samples from various data systems. - Each element, `merged[i]`, is a data dictionary containing `keys`: `torch.Tensor` - originating from the `i`-th data system. - - Callable[[], list[dict]]: A lazy function that returns data samples in the above format - only when needed. Since the sampling process can be slow and memory-intensive, - the lazy function helps by only sampling once. - stat_file_path : Optional[DPPath] - The path to the stat file. - - """ - for md in self.models: - md.compute_or_load_out_stat(merged, stat_file_path) - def compute_or_load_stat( self, - sampled_func, + sampled_func: Callable[[], list[dict[str, Any]]], stat_file_path: Optional[DPPath] = None, + compute_or_load_out_stat: bool = True, ) -> None: """ Compute or load the statistics parameters of the model, @@ -507,9 +493,34 @@ def compute_or_load_stat( The lazy sampled function to get data frames from different data systems. stat_file_path The dictionary of paths to the statistics files. + compute_or_load_out_stat : bool + Whether to compute the output statistics. + If False, it will only compute the input statistics (e.g. mean and standard deviation of descriptors). 
""" for md in self.models: - md.compute_or_load_stat(sampled_func, stat_file_path) + md.compute_or_load_stat( + sampled_func, stat_file_path, compute_or_load_out_stat=False + ) + + if stat_file_path is not None and self.type_map is not None: + # descriptors and fitting net with different type_map + # should not share the same parameters + stat_file_path /= " ".join(self.type_map) + + @functools.lru_cache + def wrapped_sampler() -> list[dict[str, Any]]: + sampled = sampled_func() + if self.pair_excl is not None: + pair_exclude_types = self.pair_excl.get_exclude_types() + for sample in sampled: + sample["pair_exclude_types"] = list(pair_exclude_types) + if self.atom_excl is not None: + atom_exclude_types = self.atom_excl.get_exclude_types() + for sample in sampled: + sample["atom_exclude_types"] = list(atom_exclude_types) + return sampled + + self.compute_or_load_out_stat(wrapped_sampler, stat_file_path) class DPZBLLinearEnergyAtomicModel(LinearEnergyAtomicModel): @@ -541,7 +552,7 @@ def __init__( sw_rmax: float, type_map: list[str], smin_alpha: Optional[float] = 0.1, - **kwargs, + **kwargs: Any, ) -> None: models = [dp_model, zbl_model] kwargs["models"] = models @@ -569,7 +580,7 @@ def serialize(self) -> dict: ) return dd - def set_case_embd(self, case_idx: int): + def set_case_embd(self, case_idx: int) -> None: """ Set the case embedding of this atomic model by the given case_idx, typically concatenated with the output of the descriptor and fed into the fitting net. 
@@ -578,7 +589,7 @@ def set_case_embd(self, case_idx: int): self.models[0].set_case_embd(case_idx) @classmethod - def deserialize(cls, data) -> "DPZBLLinearEnergyAtomicModel": + def deserialize(cls, data: dict[str, Any]) -> "DPZBLLinearEnergyAtomicModel": data = data.copy() check_version_compatibility(data.pop("@version", 1), 2, 1) models = [ diff --git a/deepmd/pt/model/atomic_model/pairtab_atomic_model.py b/deepmd/pt/model/atomic_model/pairtab_atomic_model.py index 62b47afb32..b022e6bfc9 100644 --- a/deepmd/pt/model/atomic_model/pairtab_atomic_model.py +++ b/deepmd/pt/model/atomic_model/pairtab_atomic_model.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( + Any, Callable, Optional, Union, @@ -68,7 +69,7 @@ def __init__( rcut: float, sel: Union[int, list[int]], type_map: list[str], - **kwargs, + **kwargs: Any, ) -> None: super().__init__(type_map, **kwargs) super().init_out_stat() @@ -141,7 +142,7 @@ def get_type_map(self) -> list[str]: def get_sel(self) -> list[int]: return [self.sel] - def set_case_embd(self, case_idx: int): + def set_case_embd(self, case_idx: int) -> None: """ Set the case embedding of this atomic model by the given case_idx, typically concatenated with the output of the descriptor and fed into the fitting net. @@ -175,7 +176,9 @@ def need_sorted_nlist_for_lower(self) -> bool: return False def change_type_map( - self, type_map: list[str], model_with_new_type_stat=None + self, + type_map: list[str], + model_with_new_type_stat: Optional["PairTabAtomicModel"] = None, ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. 
@@ -202,7 +205,7 @@ def serialize(self) -> dict: return dd @classmethod - def deserialize(cls, data) -> "PairTabAtomicModel": + def deserialize(cls, data: dict[str, Any]) -> "PairTabAtomicModel": data = data.copy() check_version_compatibility(data.pop("@version", 1), 2, 1) tab = PairTab.deserialize(data.pop("tab")) @@ -224,26 +227,31 @@ def deserialize(cls, data) -> "PairTabAtomicModel": def compute_or_load_stat( self, - merged: Union[Callable[[], list[dict]], list[dict]], + sampled_func: Union[Callable[[], list[dict]], list[dict]], stat_file_path: Optional[DPPath] = None, + compute_or_load_out_stat: bool = True, ) -> None: """ - Compute the output statistics (e.g. energy bias) for the fitting net from packed data. + Compute or load the statistics parameters of the model, + such as mean and standard deviation of descriptors or the energy bias of the fitting net. + When `sampled` is provided, all the statistics parameters will be calculated (or re-calculated for update), + and saved in the `stat_file_path`(s). + When `sampled` is not provided, it will check the existence of `stat_file_path`(s) + and load the calculated statistics parameters. Parameters ---------- - merged : Union[Callable[[], list[dict]], list[dict]] - - list[dict]: A list of data samples from various data systems. - Each element, `merged[i]`, is a data dictionary containing `keys`: `torch.Tensor` - originating from the `i`-th data system. - - Callable[[], list[dict]]: A lazy function that returns data samples in the above format - only when needed. Since the sampling process can be slow and memory-intensive, - the lazy function helps by only sampling once. - stat_file_path : Optional[DPPath] - The path to the stat file. + sampled_func + The lazy sampled function to get data frames from different data systems. + stat_file_path + The dictionary of paths to the statistics files. + compute_or_load_out_stat : bool + Whether to compute the output statistics. 
+ If False, it will only compute the input statistics (e.g. mean and standard deviation of descriptors). """ - self.compute_or_load_out_stat(merged, stat_file_path) + if compute_or_load_out_stat: + self.compute_or_load_out_stat(sampled_func, stat_file_path) def forward_atomic( self, diff --git a/deepmd/pt/model/atomic_model/polar_atomic_model.py b/deepmd/pt/model/atomic_model/polar_atomic_model.py index 6bd063591f..4484d1945b 100644 --- a/deepmd/pt/model/atomic_model/polar_atomic_model.py +++ b/deepmd/pt/model/atomic_model/polar_atomic_model.py @@ -1,4 +1,7 @@ # SPDX-License-Identifier: LGPL-3.0-or-later +from typing import ( + Any, +) import torch @@ -12,7 +15,9 @@ class DPPolarAtomicModel(DPAtomicModel): - def __init__(self, descriptor, fitting, type_map, **kwargs): + def __init__( + self, descriptor: Any, fitting: Any, type_map: Any, **kwargs: Any + ) -> None: if not isinstance(fitting, PolarFittingNet): raise TypeError( "fitting must be an instance of PolarFittingNet for DPPolarAtomicModel" @@ -23,7 +28,7 @@ def apply_out_stat( self, ret: dict[str, torch.Tensor], atype: torch.Tensor, - ): + ) -> dict[str, torch.Tensor]: """Apply the stat to each atomic output. 
Parameters diff --git a/deepmd/pt/model/atomic_model/property_atomic_model.py b/deepmd/pt/model/atomic_model/property_atomic_model.py index 3622c9f476..baf9c5b7fc 100644 --- a/deepmd/pt/model/atomic_model/property_atomic_model.py +++ b/deepmd/pt/model/atomic_model/property_atomic_model.py @@ -1,4 +1,7 @@ # SPDX-License-Identifier: LGPL-3.0-or-later +from typing import ( + Any, +) import torch @@ -12,7 +15,9 @@ class DPPropertyAtomicModel(DPAtomicModel): - def __init__(self, descriptor, fitting, type_map, **kwargs): + def __init__( + self, descriptor: Any, fitting: Any, type_map: Any, **kwargs: Any + ) -> None: if not isinstance(fitting, PropertyFittingNet): raise TypeError( "fitting must be an instance of PropertyFittingNet for DPPropertyAtomicModel" @@ -31,7 +36,7 @@ def apply_out_stat( self, ret: dict[str, torch.Tensor], atype: torch.Tensor, - ): + ) -> dict[str, torch.Tensor]: """Apply the stat to each atomic output. In property fitting, each output will be multiplied by label std and then plus the label average value. 
diff --git a/deepmd/pt/model/descriptor/descriptor.py b/deepmd/pt/model/descriptor/descriptor.py index 3b374751c7..c1a3529ae0 100644 --- a/deepmd/pt/model/descriptor/descriptor.py +++ b/deepmd/pt/model/descriptor/descriptor.py @@ -5,6 +5,7 @@ abstractmethod, ) from typing import ( + Any, Callable, NoReturn, Optional, @@ -43,7 +44,7 @@ class DescriptorBlock(torch.nn.Module, ABC, make_plugin_registry("DescriptorBloc local_cluster = False - def __new__(cls, *args, **kwargs): + def __new__(cls, *args: Any, **kwargs: Any) -> "DescriptorBlock": if cls is DescriptorBlock: try: descrpt_type = kwargs["type"] @@ -126,7 +127,9 @@ def get_stats(self) -> dict[str, StatItem]: """Get the statistics of the descriptor.""" raise NotImplementedError - def share_params(self, base_class, shared_level, resume=False) -> None: + def share_params( + self, base_class: "DescriptorBlock", shared_level: int, resume: bool = False + ) -> None: """ Share the parameters of self to the base_class with shared_level during multitask training. If not start from checkpoint (resume is False), @@ -178,7 +181,13 @@ def forward( extended_atype_embd: Optional[torch.Tensor] = None, mapping: Optional[torch.Tensor] = None, type_embedding: Optional[torch.Tensor] = None, - ): + ) -> tuple[ + torch.Tensor, + Optional[torch.Tensor], + Optional[torch.Tensor], + Optional[torch.Tensor], + Optional[torch.Tensor], + ]: """Calculate DescriptorBlock.""" pass @@ -192,14 +201,18 @@ def need_sorted_nlist_for_lower(self) -> bool: def make_default_type_embedding( - ntypes, -): + ntypes: int, +) -> tuple[TypeEmbedNet, dict[str, Any]]: aux = {} aux["tebd_dim"] = 8 return TypeEmbedNet(ntypes, aux["tebd_dim"]), aux -def extend_descrpt_stat(des, type_map, des_with_stat=None) -> None: +def extend_descrpt_stat( + des: DescriptorBlock, + type_map: list[str], + des_with_stat: Optional[DescriptorBlock] = None, +) -> None: r""" Extend the statistics of a descriptor block with types from newly provided `type_map`. 
diff --git a/deepmd/pt/model/descriptor/dpa1.py b/deepmd/pt/model/descriptor/dpa1.py index 9c1e144f48..e158dd3725 100644 --- a/deepmd/pt/model/descriptor/dpa1.py +++ b/deepmd/pt/model/descriptor/dpa1.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( + Any, Callable, Optional, Union, @@ -236,8 +237,8 @@ def __init__( exclude_types: list[tuple[int, int]] = [], env_protection: float = 0.0, scaling_factor: int = 1.0, - normalize=True, - temperature=None, + normalize: bool = True, + temperature: Optional[float] = None, concat_output_tebd: bool = True, trainable: bool = True, trainable_ln: bool = True, @@ -250,7 +251,7 @@ def __init__( use_tebd_bias: bool = False, type_map: Optional[list[str]] = None, # not implemented - spin=None, + spin: Optional[Any] = None, type: Optional[str] = None, ) -> None: super().__init__() @@ -298,6 +299,7 @@ def __init__( trainable_ln=trainable_ln, ln_eps=ln_eps, seed=child_seed(seed, 1), + trainable=trainable, ) self.use_econf_tebd = use_econf_tebd self.use_tebd_bias = use_tebd_bias @@ -311,6 +313,7 @@ def __init__( use_econf_tebd=use_econf_tebd, use_tebd_bias=use_tebd_bias, type_map=type_map, + trainable=trainable, ) self.prec = PRECISION_DICT[precision] self.tebd_dim = tebd_dim @@ -378,7 +381,9 @@ def get_env_protection(self) -> float: """Returns the protection of building environment matrix.""" return self.se_atten.get_env_protection() - def share_params(self, base_class, shared_level, resume=False) -> None: + def share_params( + self, base_class: Any, shared_level: int, resume: bool = False + ) -> None: """ Share the parameters of self to the base_class with shared_level during multitask training. 
If not start from checkpoint (resume is False), @@ -402,18 +407,18 @@ def share_params(self, base_class, shared_level, resume=False) -> None: raise NotImplementedError @property - def dim_out(self): + def dim_out(self) -> int: return self.get_dim_out() @property - def dim_emb(self): + def dim_emb(self) -> int: return self.get_dim_emb() def compute_input_stats( self, merged: Union[Callable[[], list[dict]], list[dict]], path: Optional[DPPath] = None, - ): + ) -> None: """ Compute the input statistics (e.g. mean and stddev) for the descriptors from packed data. @@ -446,7 +451,7 @@ def get_stat_mean_and_stddev(self) -> tuple[torch.Tensor, torch.Tensor]: return self.se_atten.mean, self.se_atten.stddev def change_type_map( - self, type_map: list[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat: Optional[Any] = None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. @@ -546,7 +551,7 @@ def deserialize(cls, data: dict) -> "DescrptDPA1": data["use_tebd_bias"] = True obj = cls(**data) - def t_cvt(xx): + def t_cvt(xx: Any) -> torch.Tensor: return torch.tensor(xx, dtype=obj.se_atten.prec, device=env.DEVICE) obj.type_embedding.embedding = TypeEmbedNetConsistent.deserialize( @@ -649,7 +654,13 @@ def forward( nlist: torch.Tensor, mapping: Optional[torch.Tensor] = None, comm_dict: Optional[dict[str, torch.Tensor]] = None, - ): + ) -> tuple[ + torch.Tensor, + Optional[torch.Tensor], + Optional[torch.Tensor], + Optional[torch.Tensor], + Optional[torch.Tensor], + ]: """Compute the descriptor. 
Parameters @@ -706,10 +717,12 @@ def forward( return ( g1.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION), - rot_mat.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION), + rot_mat.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION) + if rot_mat is not None + else None, g2.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION) if g2 is not None else None, - h2.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION), - sw.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION), + h2.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION) if h2 is not None else None, + sw.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION) if sw is not None else None, ) @classmethod diff --git a/deepmd/pt/model/descriptor/dpa2.py b/deepmd/pt/model/descriptor/dpa2.py index 969fdca5fc..5858206cc3 100644 --- a/deepmd/pt/model/descriptor/dpa2.py +++ b/deepmd/pt/model/descriptor/dpa2.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( + Any, Callable, Optional, Union, @@ -155,7 +156,7 @@ def __init__( """ super().__init__() - def init_subclass_params(sub_data, sub_class): + def init_subclass_params(sub_data: Any, sub_class: Any) -> Any: if isinstance(sub_data, dict): return sub_class(**sub_data) elif isinstance(sub_data, sub_class): @@ -188,6 +189,7 @@ def init_subclass_params(sub_data, sub_class): smooth=smooth, type_one_side=self.repinit_args.type_one_side, seed=child_seed(seed, 0), + trainable=trainable, ) self.use_three_body = self.repinit_args.use_three_body if self.use_three_body: @@ -207,6 +209,7 @@ def init_subclass_params(sub_data, sub_class): resnet_dt=self.repinit_args.resnet_dt, smooth=smooth, seed=child_seed(seed, 5), + trainable=trainable, ) else: self.repinit_three_body = None @@ -247,6 +250,7 @@ def init_subclass_params(sub_data, sub_class): g1_out_conv=self.repformer_args.g1_out_conv, g1_out_mlp=self.repformer_args.g1_out_mlp, seed=child_seed(seed, 1), + trainable=trainable, ) self.rcsl_list = [ (self.repformers.get_rcut(), self.repformers.get_nsel()), @@ -274,6 +278,7 @@ def init_subclass_params(sub_data, sub_class): 
use_econf_tebd=self.use_econf_tebd, use_tebd_bias=use_tebd_bias, type_map=type_map, + trainable=trainable, ) self.concat_output_tebd = concat_output_tebd self.precision = precision @@ -299,6 +304,7 @@ def init_subclass_params(sub_data, sub_class): precision=precision, init="glorot", seed=child_seed(seed, 3), + trainable=trainable, ) self.tebd_transform = None if self.add_tebd_to_repinit_out: @@ -308,6 +314,7 @@ def init_subclass_params(sub_data, sub_class): bias=False, precision=precision, seed=child_seed(seed, 4), + trainable=trainable, ) assert self.repinit.rcut > self.repformers.rcut assert self.repinit.sel[0] > self.repformers.sel[0] @@ -384,7 +391,9 @@ def get_env_protection(self) -> float: # the env_protection of repinit is the same as that of the repformer return self.repinit.get_env_protection() - def share_params(self, base_class, shared_level, resume=False) -> None: + def share_params( + self, base_class: Any, shared_level: int, resume: bool = False + ) -> None: """ Share the parameters of self to the base_class with shared_level during multitask training. If not start from checkpoint (resume is False), @@ -416,7 +425,7 @@ def share_params(self, base_class, shared_level, resume=False) -> None: raise NotImplementedError def change_type_map( - self, type_map: list[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat: Optional[Any] = None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. 
@@ -471,11 +480,11 @@ def change_type_map( repinit_three_body["dstd"] = repinit_three_body["dstd"][remap_index] @property - def dim_out(self): + def dim_out(self) -> int: return self.get_dim_out() @property - def dim_emb(self): + def dim_emb(self) -> int: """Returns the embedding dimension g2.""" return self.get_dim_emb() @@ -650,7 +659,7 @@ def deserialize(cls, data: dict) -> "DescrptDPA2": if obj.repinit.dim_out != obj.repformers.dim_in: obj.g1_shape_tranform = MLPLayer.deserialize(g1_shape_tranform) - def t_cvt(xx): + def t_cvt(xx: Any) -> torch.Tensor: return torch.tensor(xx, dtype=obj.repinit.prec, device=env.DEVICE) # deserialize repinit @@ -705,7 +714,13 @@ def forward( nlist: torch.Tensor, mapping: Optional[torch.Tensor] = None, comm_dict: Optional[dict[str, torch.Tensor]] = None, - ): + ) -> tuple[ + torch.Tensor, + Optional[torch.Tensor], + Optional[torch.Tensor], + Optional[torch.Tensor], + Optional[torch.Tensor], + ]: """Compute the descriptor. Parameters @@ -814,10 +829,12 @@ def forward( g1 = torch.cat([g1, g1_inp], dim=-1) return ( g1.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION), - rot_mat.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION), - g2.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION), - h2.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION), - sw.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION), + rot_mat.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION) + if rot_mat is not None + else None, + g2.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION) if g2 is not None else None, + h2.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION) if h2 is not None else None, + sw.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION) if sw is not None else None, ) @classmethod diff --git a/deepmd/pt/model/descriptor/dpa3.py b/deepmd/pt/model/descriptor/dpa3.py index dd2da9a3c8..2de7851a51 100644 --- a/deepmd/pt/model/descriptor/dpa3.py +++ b/deepmd/pt/model/descriptor/dpa3.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( + Any, Callable, Optional, Union, @@ -122,7 +123,7 @@ def __init__( ) -> None: 
super().__init__() - def init_subclass_params(sub_data, sub_class): + def init_subclass_params(sub_data: Any, sub_class: Any) -> Any: if isinstance(sub_data, dict): return sub_class(**sub_data) elif isinstance(sub_data, sub_class): @@ -169,6 +170,7 @@ def init_subclass_params(sub_data, sub_class): env_protection=env_protection, precision=precision, seed=child_seed(seed, 1), + trainable=trainable, ) self.use_econf_tebd = use_econf_tebd @@ -184,6 +186,7 @@ def init_subclass_params(sub_data, sub_class): use_econf_tebd=self.use_econf_tebd, use_tebd_bias=use_tebd_bias, type_map=type_map, + trainable=trainable, ) self.concat_output_tebd = concat_output_tebd self.precision = precision @@ -270,7 +273,9 @@ def get_env_protection(self) -> float: """Returns the protection of building environment matrix.""" return self.repflows.get_env_protection() - def share_params(self, base_class, shared_level, resume=False) -> None: + def share_params( + self, base_class: Any, shared_level: int, resume: bool = False + ) -> None: """ Share the parameters of self to the base_class with shared_level during multitask training. If not start from checkpoint (resume is False), @@ -294,7 +299,7 @@ def share_params(self, base_class, shared_level, resume=False) -> None: raise NotImplementedError def change_type_map( - self, type_map: list[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat: Optional[Any] = None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. 
@@ -323,11 +328,11 @@ def change_type_map( repflow["dstd"] = repflow["dstd"][remap_index] @property - def dim_out(self): + def dim_out(self) -> int: return self.get_dim_out() @property - def dim_emb(self): + def dim_emb(self) -> int: """Returns the embedding dimension g2.""" return self.get_dim_emb() @@ -425,7 +430,7 @@ def deserialize(cls, data: dict) -> "DescrptDPA3": type_embedding ) - def t_cvt(xx): + def t_cvt(xx: Any) -> torch.Tensor: return torch.tensor(xx, dtype=obj.repflows.prec, device=env.DEVICE) # deserialize repflow @@ -450,7 +455,13 @@ def forward( nlist: torch.Tensor, mapping: Optional[torch.Tensor] = None, comm_dict: Optional[dict[str, torch.Tensor]] = None, - ): + ) -> tuple[ + torch.Tensor, + Optional[torch.Tensor], + Optional[torch.Tensor], + Optional[torch.Tensor], + Optional[torch.Tensor], + ]: """Compute the descriptor. Parameters @@ -507,10 +518,14 @@ def forward( node_ebd = torch.cat([node_ebd, node_ebd_inp], dim=-1) return ( node_ebd.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION), - rot_mat.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION), - edge_ebd.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION), - h2.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION), - sw.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION), + rot_mat.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION) + if rot_mat is not None + else None, + edge_ebd.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION) + if edge_ebd is not None + else None, + h2.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION) if h2 is not None else None, + sw.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION) if sw is not None else None, ) @classmethod diff --git a/deepmd/pt/model/descriptor/env_mat.py b/deepmd/pt/model/descriptor/env_mat.py index c57ae209fd..0ffdbb7dbb 100644 --- a/deepmd/pt/model/descriptor/env_mat.py +++ b/deepmd/pt/model/descriptor/env_mat.py @@ -9,14 +9,14 @@ def _make_env_mat( - nlist, - coord, + nlist: torch.Tensor, + coord: torch.Tensor, rcut: float, ruct_smth: float, radial_only: bool = False, protection: float = 0.0, use_exp_switch: bool = False, -): +) -> 
tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """Make smooth environment matrix.""" bsz, natoms, nnei = nlist.shape coord = coord.view(bsz, -1, 3) @@ -49,17 +49,17 @@ def _make_env_mat( def prod_env_mat( - extended_coord, - nlist, - atype, - mean, - stddev, + extended_coord: torch.Tensor, + nlist: torch.Tensor, + atype: torch.Tensor, + mean: torch.Tensor, + stddev: torch.Tensor, rcut: float, rcut_smth: float, radial_only: bool = False, protection: float = 0.0, use_exp_switch: bool = False, -): +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """Generate smooth environment matrix from atom coordinates and other context. Args: diff --git a/deepmd/pt/model/descriptor/hybrid.py b/deepmd/pt/model/descriptor/hybrid.py index e13b014037..545fba7019 100644 --- a/deepmd/pt/model/descriptor/hybrid.py +++ b/deepmd/pt/model/descriptor/hybrid.py @@ -45,7 +45,7 @@ class DescrptHybrid(BaseDescriptor, torch.nn.Module): def __init__( self, list: list[Union[BaseDescriptor, dict[str, Any]]], - **kwargs, + **kwargs: Any, ) -> None: super().__init__() # warning: list is conflict with built-in list @@ -140,7 +140,7 @@ def get_dim_emb(self) -> int: """Returns the output dimension.""" return sum([descrpt.get_dim_emb() for descrpt in self.descrpt_list]) - def mixed_types(self): + def mixed_types(self) -> bool: """Returns if the descriptor requires a neighbor list that distinguish different atomic types or not. """ @@ -164,7 +164,9 @@ def get_env_protection(self) -> float: ) return all_protection[0] - def share_params(self, base_class, shared_level, resume=False) -> None: + def share_params( + self, base_class: "DescrptHybrid", shared_level: int, resume: bool = False + ) -> None: """ Share the parameters of self to the base_class with shared_level during multitask training. 
If not start from checkpoint (resume is False), @@ -182,7 +184,9 @@ def share_params(self, base_class, shared_level, resume=False) -> None: raise NotImplementedError def change_type_map( - self, type_map: list[str], model_with_new_type_stat=None + self, + type_map: list[str], + model_with_new_type_stat: Optional["DescrptHybrid"] = None, ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. @@ -265,7 +269,13 @@ def forward( nlist: torch.Tensor, mapping: Optional[torch.Tensor] = None, comm_dict: Optional[dict[str, torch.Tensor]] = None, - ): + ) -> tuple[ + torch.Tensor, + Optional[torch.Tensor], + Optional[torch.Tensor], + Optional[torch.Tensor], + Optional[torch.Tensor], + ]: """Compute the descriptor. Parameters diff --git a/deepmd/pt/model/descriptor/repflow_layer.py b/deepmd/pt/model/descriptor/repflow_layer.py index 37d4f07bb4..62145958c8 100644 --- a/deepmd/pt/model/descriptor/repflow_layer.py +++ b/deepmd/pt/model/descriptor/repflow_layer.py @@ -64,6 +64,7 @@ def __init__( update_residual_init: str = "const", precision: str = "float64", seed: Optional[Union[int, list[int]]] = None, + trainable: bool = True, ) -> None: super().__init__() self.epsilon = 1e-4 # protection of 1./nnei @@ -126,6 +127,7 @@ def __init__( n_dim, precision=precision, seed=child_seed(seed, 0), + trainable=trainable, ) if self.update_style == "res_residual": self.n_residual.append( @@ -135,6 +137,7 @@ def __init__( self.update_residual_init, precision=precision, seed=child_seed(seed, 1), + trainable=trainable, ) ) @@ -145,6 +148,7 @@ def __init__( n_dim, precision=precision, seed=child_seed(seed, 2), + trainable=trainable, ) if self.update_style == "res_residual": self.n_residual.append( @@ -154,6 +158,7 @@ def __init__( self.update_residual_init, precision=precision, seed=child_seed(seed, 3), + 
trainable=trainable, ) ) @@ -163,6 +168,7 @@ def __init__( self.n_multi_edge_message * n_dim, precision=precision, seed=child_seed(seed, 4), + trainable=trainable, ) if self.update_style == "res_residual": for head_index in range(self.n_multi_edge_message): @@ -173,6 +179,7 @@ def __init__( self.update_residual_init, precision=precision, seed=child_seed(child_seed(seed, 5), head_index), + trainable=trainable, ) ) @@ -182,6 +189,7 @@ def __init__( e_dim, precision=precision, seed=child_seed(seed, 6), + trainable=trainable, ) if self.update_style == "res_residual": self.e_residual.append( @@ -191,6 +199,7 @@ def __init__( self.update_residual_init, precision=precision, seed=child_seed(seed, 7), + trainable=trainable, ) ) @@ -219,6 +228,7 @@ def __init__( precision=precision, bias=False, seed=child_seed(seed, 8), + trainable=trainable, ) self.a_compress_e_linear = MLPLayer( self.e_dim, @@ -226,6 +236,7 @@ def __init__( precision=precision, bias=False, seed=child_seed(seed, 9), + trainable=trainable, ) else: self.a_compress_n_linear = None @@ -237,12 +248,14 @@ def __init__( self.e_dim, precision=precision, seed=child_seed(seed, 10), + trainable=trainable, ) self.edge_angle_linear2 = MLPLayer( self.e_dim, self.e_dim, precision=precision, seed=child_seed(seed, 11), + trainable=trainable, ) if self.update_style == "res_residual": self.e_residual.append( @@ -252,6 +265,7 @@ def __init__( self.update_residual_init, precision=precision, seed=child_seed(seed, 12), + trainable=trainable, ) ) @@ -261,6 +275,7 @@ def __init__( self.a_dim, precision=precision, seed=child_seed(seed, 13), + trainable=trainable, ) if self.update_style == "res_residual": self.a_residual.append( @@ -270,6 +285,7 @@ def __init__( self.update_residual_init, precision=precision, seed=child_seed(seed, 14), + trainable=trainable, ) ) else: @@ -370,7 +386,7 @@ def _cal_hg_dynamic( # n_edge x e_dim flat_edge_ebd = flat_edge_ebd * flat_sw.unsqueeze(-1) # n_edge x 3 x e_dim - flat_h2g2 = (flat_h2[..., None] * 
flat_edge_ebd[:, None, :]).reshape( + flat_h2g2 = (flat_h2.unsqueeze(-1) * flat_edge_ebd.unsqueeze(-2)).reshape( -1, 3 * e_dim ) # nf x nloc x 3 x e_dim @@ -694,9 +710,9 @@ def forward( a_nlist: torch.Tensor, # nf x nloc x a_nnei a_nlist_mask: torch.Tensor, # nf x nloc x a_nnei a_sw: torch.Tensor, # switch func, nf x nloc x a_nnei - edge_index: torch.Tensor, # n_edge x 2 - angle_index: torch.Tensor, # n_angle x 3 - ): + edge_index: torch.Tensor, # 2 x n_edge + angle_index: torch.Tensor, # 3 x n_angle + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ Parameters ---------- @@ -720,12 +736,12 @@ def forward( Masks of the neighbor list for angle. real nei 1 otherwise 0 a_sw : nf x nloc x a_nnei Switch function for angle. - edge_index : Optional for dynamic sel, n_edge x 2 + edge_index : Optional for dynamic sel, 2 x n_edge n2e_index : n_edge Broadcast indices from node(i) to edge(ij), or reduction indices from edge(ij) to node(i). n_ext2e_index : n_edge Broadcast indices from extended node(j) to edge(ij). - angle_index : Optional for dynamic sel, n_angle x 3 + angle_index : Optional for dynamic sel, 3 x n_angle n2a_index : n_angle Broadcast indices from extended node(j) to angle(ijk). 
eij2a_index : n_angle @@ -745,19 +761,21 @@ def forward( nb, nloc, nnei = nlist.shape nall = node_ebd_ext.shape[1] node_ebd = node_ebd_ext[:, :nloc, :] - n_edge = int(nlist_mask.sum().item()) assert (nb, nloc) == node_ebd.shape[:2] if not self.use_dynamic_sel: assert (nb, nloc, nnei, 3) == h2.shape + n_edge = None else: - assert (n_edge, 3) == h2.shape + # n_edge = int(nlist_mask.sum().item()) + # assert (n_edge, 3) == h2.shape + n_edge = h2.shape[0] del a_nlist # may be used in the future - n2e_index, n_ext2e_index = edge_index[:, 0], edge_index[:, 1] + n2e_index, n_ext2e_index = edge_index[0], edge_index[1] n2a_index, eij2a_index, eik2a_index = ( - angle_index[:, 0], - angle_index[:, 1], - angle_index[:, 2], + angle_index[0], + angle_index[1], + angle_index[2], ) # nb x nloc x nnei x n_dim [OR] n_edge x n_dim @@ -1026,7 +1044,9 @@ def forward( if not self.use_dynamic_sel: # nb x nloc x a_nnei x a_nnei x e_dim weighted_edge_angle_update = ( - a_sw[..., None, None] * a_sw[..., None, :, None] * edge_angle_update + a_sw.unsqueeze(-1).unsqueeze(-1) + * a_sw.unsqueeze(-2).unsqueeze(-1) + * edge_angle_update ) # nb x nloc x a_nnei x e_dim reduced_edge_angle_update = torch.sum( diff --git a/deepmd/pt/model/descriptor/repflows.py b/deepmd/pt/model/descriptor/repflows.py index 5408c49482..69b5e3b593 100644 --- a/deepmd/pt/model/descriptor/repflows.py +++ b/deepmd/pt/model/descriptor/repflows.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( + Any, Callable, Optional, Union, @@ -54,15 +55,15 @@ if not hasattr(torch.ops.deepmd, "border_op"): def border_op( - argument0, - argument1, - argument2, - argument3, - argument4, - argument5, - argument6, - argument7, - argument8, + argument0: Any, + argument1: Any, + argument2: Any, + argument3: Any, + argument4: Any, + argument5: Any, + argument6: Any, + argument7: Any, + argument8: Any, ) -> torch.Tensor: raise NotImplementedError( "border_op is not available since customized PyTorch OP library is 
not built when freezing the model. " @@ -181,15 +182,17 @@ class DescrptBlockRepflows(DescriptorBlock): For example, when using paddings, there may be zero distances of neighbors, which may make division by zero error during environment matrix calculations without protection. seed : int, optional Random seed for parameter initialization. + trainable : bool, default: True + Whether this block is trainable """ def __init__( self, - e_rcut, - e_rcut_smth, + e_rcut: float, + e_rcut_smth: float, e_sel: int, - a_rcut, - a_rcut_smth, + a_rcut: float, + a_rcut_smth: float, a_sel: int, ntypes: int, nlayers: int = 6, @@ -219,6 +222,7 @@ def __init__( use_loc_mapping: bool = True, optim_update: bool = True, seed: Optional[Union[int, list[int]]] = None, + trainable: bool = True, ) -> None: super().__init__() self.e_rcut = float(e_rcut) @@ -283,10 +287,19 @@ def __init__( self.seed = seed self.edge_embd = MLPLayer( - 1, self.e_dim, precision=precision, seed=child_seed(seed, 0) + 1, + self.e_dim, + precision=precision, + seed=child_seed(seed, 0), + trainable=trainable, ) self.angle_embd = MLPLayer( - 1, self.a_dim, precision=precision, bias=False, seed=child_seed(seed, 1) + 1, + self.a_dim, + precision=precision, + bias=False, + seed=child_seed(seed, 1), + trainable=trainable, ) layers = [] for ii in range(nlayers): @@ -318,6 +331,7 @@ def __init__( sel_reduce_factor=self.sel_reduce_factor, smooth_edge_update=self.smooth_edge_update, seed=child_seed(child_seed(seed, 1), ii), + trainable=trainable, ) ) self.layers = torch.nn.ModuleList(layers) @@ -363,7 +377,7 @@ def get_dim_emb(self) -> int: """Returns the embedding dimension e_dim.""" return self.e_dim - def __setitem__(self, key, value) -> None: + def __setitem__(self, key: str, value: Any) -> None: if key in ("avg", "data_avg", "davg"): self.mean = value elif key in ("std", "data_std", "dstd"): @@ -371,7 +385,7 @@ def __setitem__(self, key, value) -> None: else: raise KeyError(key) - def __getitem__(self, key): + def 
__getitem__(self, key: str) -> Any: if key in ("avg", "data_avg", "davg"): return self.mean elif key in ("std", "data_std", "dstd"): @@ -396,17 +410,17 @@ def get_env_protection(self) -> float: return self.env_protection @property - def dim_out(self): + def dim_out(self) -> int: """Returns the output dimension of this descriptor.""" return self.n_dim @property - def dim_in(self): + def dim_in(self) -> int: """Returns the atomic input dimension of this descriptor.""" return self.n_dim @property - def dim_emb(self): + def dim_emb(self) -> int: """Returns the embedding dimension e_dim.""" return self.get_dim_emb() @@ -425,7 +439,13 @@ def forward( extended_atype_embd: Optional[torch.Tensor] = None, mapping: Optional[torch.Tensor] = None, comm_dict: Optional[dict[str, torch.Tensor]] = None, - ): + ) -> tuple[ + torch.Tensor, + Optional[torch.Tensor], + Optional[torch.Tensor], + Optional[torch.Tensor], + Optional[torch.Tensor], + ]: parallel_mode = comm_dict is not None if not parallel_mode: assert mapping is not None @@ -537,9 +557,8 @@ def forward( a_sw = (a_sw[:, :, :, None] * a_sw[:, :, None, :])[a_nlist_mask] else: # avoid jit assertion - edge_index = angle_index = torch.zeros( - [1, 3], device=nlist.device, dtype=nlist.dtype - ) + edge_index = torch.zeros([2, 1], device=nlist.device, dtype=nlist.dtype) + angle_index = torch.zeros([3, 1], device=nlist.device, dtype=nlist.dtype) # get edge and angle embedding # nb x nloc x nnei x e_dim [OR] n_edge x e_dim if not self.edge_init_use_dist: @@ -646,7 +665,7 @@ def forward( edge_ebd, h2, sw, - owner=edge_index[:, 0], + owner=edge_index[0], num_owner=nframes * nloc, nb=nframes, nloc=nloc, diff --git a/deepmd/pt/model/descriptor/repformer_layer.py b/deepmd/pt/model/descriptor/repformer_layer.py index 1e2cba66d6..32012af92d 100644 --- a/deepmd/pt/model/descriptor/repformer_layer.py +++ b/deepmd/pt/model/descriptor/repformer_layer.py @@ -160,6 +160,7 @@ def __init__( attnw_shift: float = 20.0, precision: str = "float64", 
seed: Optional[Union[int, list[int]]] = None, + trainable: bool = True, ) -> None: """Return neighbor-wise multi-head self-attention maps, with gate mechanism.""" super().__init__() @@ -172,6 +173,7 @@ def __init__( bias=False, precision=precision, seed=seed, + trainable=trainable, ) self.has_gate = has_gate self.smooth = smooth @@ -285,6 +287,7 @@ def __init__( head_num: int, precision: str = "float64", seed: Optional[Union[int, list[int]]] = None, + trainable: bool = True, ) -> None: super().__init__() self.input_dim = input_dim @@ -295,12 +298,14 @@ def __init__( bias=False, precision=precision, seed=child_seed(seed, 0), + trainable=trainable, ) self.head_map = MLPLayer( input_dim * head_num, input_dim, precision=precision, seed=child_seed(seed, 1), + trainable=trainable, ) self.precision = precision @@ -370,12 +375,18 @@ def __init__( head_num: int, precision: str = "float64", seed: Optional[Union[int, list[int]]] = None, + trainable: bool = True, ) -> None: super().__init__() self.input_dim = input_dim self.head_num = head_num self.head_map = MLPLayer( - head_num, 1, bias=False, precision=precision, seed=seed + head_num, + 1, + bias=False, + precision=precision, + seed=seed, + trainable=trainable, ) self.precision = precision @@ -443,6 +454,7 @@ def __init__( attnw_shift: float = 20.0, precision: str = "float64", seed: Optional[Union[int, list[int]]] = None, + trainable: bool = True, ) -> None: super().__init__() self.input_dim = input_dim @@ -454,6 +466,7 @@ def __init__( bias=False, precision=precision, seed=child_seed(seed, 0), + trainable=trainable, ) self.mapkv = MLPLayer( input_dim, @@ -461,12 +474,14 @@ def __init__( bias=False, precision=precision, seed=child_seed(seed, 1), + trainable=trainable, ) self.head_map = MLPLayer( input_dim * head_num, input_dim, precision=precision, seed=child_seed(seed, 2), + trainable=trainable, ) self.smooth = smooth self.attnw_shift = attnw_shift @@ -570,12 +585,12 @@ def deserialize(cls, data: dict) -> "LocalAtten": 
class RepformerLayer(torch.nn.Module): def __init__( self, - rcut, - rcut_smth, + rcut: float, + rcut_smth: float, sel: int, ntypes: int, - g1_dim=128, - g2_dim=16, + g1_dim: int = 128, + g2_dim: int = 16, axis_neuron: int = 4, update_chnnl_2: bool = True, update_g1_has_conv: bool = True, @@ -602,6 +617,7 @@ def __init__( g1_out_conv: bool = True, g1_out_mlp: bool = True, seed: Optional[Union[int, list[int]]] = None, + trainable: bool = True, ) -> None: super().__init__() self.epsilon = 1e-4 # protection of 1./nnei @@ -662,6 +678,7 @@ def __init__( self.update_residual_init, precision=precision, seed=child_seed(seed, 0), + trainable=trainable, ) ) @@ -671,6 +688,7 @@ def __init__( g1_dim, precision=precision, seed=child_seed(seed, 1), + trainable=trainable, ) self.linear2 = None self.proj_g1g2 = None @@ -687,6 +705,7 @@ def __init__( g2_dim, precision=precision, seed=child_seed(seed, 2), + trainable=trainable, ) if self.update_style == "res_residual": self.g2_residual.append( @@ -696,6 +715,7 @@ def __init__( self.update_residual_init, precision=precision, seed=child_seed(seed, 3), + trainable=trainable, ) ) if self.g1_out_mlp: @@ -704,6 +724,7 @@ def __init__( g1_dim, precision=precision, seed=child_seed(seed, 15), + trainable=trainable, ) if self.update_style == "res_residual": self.g1_residual.append( @@ -713,6 +734,7 @@ def __init__( self.update_residual_init, precision=precision, seed=child_seed(seed, 16), + trainable=trainable, ) ) else: @@ -725,6 +747,7 @@ def __init__( bias=False, precision=precision, seed=child_seed(seed, 4), + trainable=trainable, ) else: self.proj_g1g2 = MLPLayer( @@ -733,6 +756,7 @@ def __init__( bias=False, precision=precision, seed=child_seed(seed, 4), + trainable=trainable, ) if self.update_style == "res_residual": self.g1_residual.append( @@ -742,6 +766,7 @@ def __init__( self.update_residual_init, precision=precision, seed=child_seed(seed, 17), + trainable=trainable, ) ) if self.update_g2_has_g1g1: @@ -751,6 +776,7 @@ def __init__( 
bias=False, precision=precision, seed=child_seed(seed, 5), + trainable=trainable, ) if self.update_style == "res_residual": self.g2_residual.append( @@ -760,6 +786,7 @@ def __init__( self.update_residual_init, precision=precision, seed=child_seed(seed, 6), + trainable=trainable, ) ) if self.update_g2_has_attn or self.update_h2: @@ -771,10 +798,15 @@ def __init__( self.smooth, precision=precision, seed=child_seed(seed, 7), + trainable=trainable, ) if self.update_g2_has_attn: self.attn2_mh_apply = Atten2MultiHeadApply( - g2_dim, attn2_nhead, precision=precision, seed=child_seed(seed, 8) + g2_dim, + attn2_nhead, + precision=precision, + seed=child_seed(seed, 8), + trainable=trainable, ) self.attn2_lm = LayerNorm( g2_dim, @@ -791,12 +823,17 @@ def __init__( self.update_residual_init, precision=precision, seed=child_seed(seed, 10), + trainable=trainable, ) ) if self.update_h2: self.attn2_ev_apply = Atten2EquiVarApply( - g2_dim, attn2_nhead, precision=precision, seed=child_seed(seed, 11) + g2_dim, + attn2_nhead, + precision=precision, + seed=child_seed(seed, 11), + trainable=trainable, ) if self.update_style == "res_residual": self.h2_residual.append( @@ -806,6 +843,7 @@ def __init__( self.update_residual_init, precision=precision, seed=child_seed(seed, 12), + trainable=trainable, ) ) if self.update_g1_has_attn: @@ -816,6 +854,7 @@ def __init__( self.smooth, precision=precision, seed=child_seed(seed, 13), + trainable=trainable, ) if self.update_style == "res_residual": self.g1_residual.append( @@ -825,6 +864,7 @@ def __init__( self.update_residual_init, precision=precision, seed=child_seed(seed, 14), + trainable=trainable, ) ) @@ -1101,7 +1141,7 @@ def forward( nlist: torch.Tensor, # nf x nloc x nnei nlist_mask: torch.Tensor, # nf x nloc x nnei sw: torch.Tensor, # switch func, nf x nloc x nnei - ): + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ Parameters ---------- diff --git a/deepmd/pt/model/descriptor/repformers.py 
b/deepmd/pt/model/descriptor/repformers.py index 82773d1a78..2c383640f1 100644 --- a/deepmd/pt/model/descriptor/repformers.py +++ b/deepmd/pt/model/descriptor/repformers.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( + Any, Callable, Optional, Union, @@ -51,15 +52,15 @@ if not hasattr(torch.ops.deepmd, "border_op"): def border_op( - argument0, - argument1, - argument2, - argument3, - argument4, - argument5, - argument6, - argument7, - argument8, + argument0: Any, + argument1: Any, + argument2: Any, + argument3: Any, + argument4: Any, + argument5: Any, + argument6: Any, + argument7: Any, + argument8: Any, ) -> torch.Tensor: raise NotImplementedError( "border_op is not available since customized PyTorch OP library is not built when freezing the model. " @@ -75,13 +76,13 @@ def border_op( class DescrptBlockRepformers(DescriptorBlock): def __init__( self, - rcut, - rcut_smth, + rcut: float, + rcut_smth: float, sel: int, ntypes: int, nlayers: int = 3, - g1_dim=128, - g2_dim=16, + g1_dim: int = 128, + g2_dim: int = 16, axis_neuron: int = 4, direct_dist: bool = False, update_g1_has_conv: bool = True, @@ -111,6 +112,7 @@ def __init__( use_sqrt_nnei: bool = True, g1_out_conv: bool = True, g1_out_mlp: bool = True, + trainable: bool = True, ) -> None: r""" The repformer descriptor block. @@ -197,6 +199,8 @@ def __init__( The epsilon value for layer normalization. seed : int, optional Random seed for parameter initialization. 
+ trainable : bool + Whether the block is trainable """ super().__init__() self.rcut = float(rcut) @@ -247,7 +251,11 @@ def __init__( self.seed = seed self.g2_embd = MLPLayer( - 1, self.g2_dim, precision=precision, seed=child_seed(seed, 0) + 1, + self.g2_dim, + precision=precision, + seed=child_seed(seed, 0), + trainable=trainable, ) layers = [] for ii in range(nlayers): @@ -285,6 +293,7 @@ def __init__( g1_out_conv=self.g1_out_conv, g1_out_mlp=self.g1_out_mlp, seed=child_seed(child_seed(seed, 1), ii), + trainable=trainable, ) ) self.layers = torch.nn.ModuleList(layers) @@ -328,7 +337,7 @@ def get_dim_emb(self) -> int: """Returns the embedding dimension g2.""" return self.g2_dim - def __setitem__(self, key, value) -> None: + def __setitem__(self, key: str, value: Any) -> None: if key in ("avg", "data_avg", "davg"): self.mean = value elif key in ("std", "data_std", "dstd"): @@ -336,7 +345,7 @@ def __setitem__(self, key, value) -> None: else: raise KeyError(key) - def __getitem__(self, key): + def __getitem__(self, key: str) -> Any: if key in ("avg", "data_avg", "davg"): return self.mean elif key in ("std", "data_std", "dstd"): @@ -361,17 +370,17 @@ def get_env_protection(self) -> float: return self.env_protection @property - def dim_out(self): + def dim_out(self) -> int: """Returns the output dimension of this descriptor.""" return self.g1_dim @property - def dim_in(self): + def dim_in(self) -> int: """Returns the atomic input dimension of this descriptor.""" return self.g1_dim @property - def dim_emb(self): + def dim_emb(self) -> int: """Returns the embedding dimension g2.""" return self.get_dim_emb() @@ -391,7 +400,13 @@ def forward( mapping: Optional[torch.Tensor] = None, type_embedding: Optional[torch.Tensor] = None, comm_dict: Optional[dict[str, torch.Tensor]] = None, - ): + ) -> tuple[ + torch.Tensor, + Optional[torch.Tensor], + Optional[torch.Tensor], + Optional[torch.Tensor], + Optional[torch.Tensor], + ]: if comm_dict is None: assert mapping is not None 
assert extended_atype_embd is not None diff --git a/deepmd/pt/model/descriptor/se_a.py b/deepmd/pt/model/descriptor/se_a.py index fc3e14bd25..17fa6a830e 100644 --- a/deepmd/pt/model/descriptor/se_a.py +++ b/deepmd/pt/model/descriptor/se_a.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import itertools from typing import ( + Any, Callable, ClassVar, Optional, @@ -93,11 +94,11 @@ def tabulate_fusion_se_a( class DescrptSeA(BaseDescriptor, torch.nn.Module): def __init__( self, - rcut, - rcut_smth, - sel, - neuron=[25, 50, 100], - axis_neuron=16, + rcut: float, + rcut_smth: float, + sel: Union[list[int], int], + neuron: list[int] = [25, 50, 100], + axis_neuron: int = 16, set_davg_zero: bool = False, activation_function: str = "tanh", precision: str = "float64", @@ -110,7 +111,7 @@ def __init__( ntypes: Optional[int] = None, # to be compat with input type_map: Optional[list[str]] = None, # not implemented - spin=None, + spin: Optional[Any] = None, ) -> None: del ntypes if spin is not None: @@ -168,7 +169,7 @@ def get_dim_emb(self) -> int: """Returns the output dimension.""" return self.sea.get_dim_emb() - def mixed_types(self): + def mixed_types(self) -> bool: """Returns if the descriptor requires a neighbor list that distinguish different atomic types or not. """ @@ -186,7 +187,9 @@ def get_env_protection(self) -> float: """Returns the protection of building environment matrix.""" return self.sea.get_env_protection() - def share_params(self, base_class, shared_level, resume=False) -> None: + def share_params( + self, base_class: Any, shared_level: int, resume: bool = False + ) -> None: """ Share the parameters of self to the base_class with shared_level during multitask training. 
If not start from checkpoint (resume is False), @@ -205,12 +208,12 @@ def share_params(self, base_class, shared_level, resume=False) -> None: raise NotImplementedError @property - def dim_out(self): + def dim_out(self) -> int: """Returns the output dimension of this descriptor.""" return self.sea.dim_out def change_type_map( - self, type_map: list[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat: Optional[Any] = None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. @@ -225,7 +228,7 @@ def compute_input_stats( self, merged: Union[Callable[[], list[dict]], list[dict]], path: Optional[DPPath] = None, - ): + ) -> None: """ Compute the input statistics (e.g. mean and stddev) for the descriptors from packed data. @@ -305,7 +308,13 @@ def forward( nlist: torch.Tensor, mapping: Optional[torch.Tensor] = None, comm_dict: Optional[dict[str, torch.Tensor]] = None, - ): + ) -> tuple[ + torch.Tensor, + Optional[torch.Tensor], + Optional[torch.Tensor], + Optional[torch.Tensor], + Optional[torch.Tensor], + ]: """Compute the descriptor. 
Parameters @@ -345,10 +354,12 @@ def forward( ) return ( g1.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION), - rot_mat.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION), + rot_mat.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION) + if rot_mat is not None + else None, None, None, - sw.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION), + sw.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION) if sw is not None else None, ) def set_stat_mean_and_stddev( @@ -408,7 +419,7 @@ def deserialize(cls, data: dict) -> "DescrptSeA": env_mat = data.pop("env_mat") obj = cls(**data) - def t_cvt(xx): + def t_cvt(xx: Any) -> torch.Tensor: return torch.tensor(xx, dtype=obj.sea.prec, device=env.DEVICE) obj.sea["davg"] = t_cvt(variables["davg"]) @@ -455,11 +466,11 @@ class DescrptBlockSeA(DescriptorBlock): def __init__( self, - rcut, - rcut_smth, - sel, - neuron=[25, 50, 100], - axis_neuron=16, + rcut: float, + rcut_smth: float, + sel: Union[int, list[int]], + neuron: list[int] = [25, 50, 100], + axis_neuron: int = 16, set_davg_zero: bool = False, activation_function: str = "tanh", precision: str = "float64", @@ -469,7 +480,7 @@ def __init__( type_one_side: bool = True, trainable: bool = True, seed: Optional[Union[int, list[int]]] = None, - **kwargs, + **kwargs: Any, ) -> None: """Construct an embedding net of type `se_a`. 
@@ -525,6 +536,7 @@ def __init__( precision=self.precision, resnet_dt=self.resnet_dt, seed=child_seed(self.seed, ii), + trainable=trainable, ) self.filter_layers = filter_layers self.stats = None @@ -601,7 +613,7 @@ def get_env_protection(self) -> float: return self.env_protection @property - def dim_out(self): + def dim_out(self) -> int: """Returns the output dimension of this descriptor.""" return self.filter_neuron[-1] * self.axis_neuron @@ -610,7 +622,7 @@ def dim_in(self) -> int: """Returns the atomic input dimension of this descriptor.""" return 0 - def __setitem__(self, key, value) -> None: + def __setitem__(self, key: str, value: torch.Tensor) -> None: if key in ("avg", "data_avg", "davg"): self.mean = value elif key in ("std", "data_std", "dstd"): @@ -618,7 +630,7 @@ def __setitem__(self, key, value) -> None: else: raise KeyError(key) - def __getitem__(self, key): + def __getitem__(self, key: str) -> torch.Tensor: if key in ("avg", "data_avg", "davg"): return self.mean elif key in ("std", "data_std", "dstd"): @@ -728,7 +740,13 @@ def forward( extended_atype_embd: Optional[torch.Tensor] = None, mapping: Optional[torch.Tensor] = None, type_embedding: Optional[torch.Tensor] = None, - ): + ) -> tuple[ + torch.Tensor, + Optional[torch.Tensor], + Optional[torch.Tensor], + Optional[torch.Tensor], + Optional[torch.Tensor], + ]: """Calculate decoded embedding for each atom. 
Args: diff --git a/deepmd/pt/model/descriptor/se_atten.py b/deepmd/pt/model/descriptor/se_atten.py index 1ce6ad4583..bfcb510810 100644 --- a/deepmd/pt/model/descriptor/se_atten.py +++ b/deepmd/pt/model/descriptor/se_atten.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( + Any, Callable, Optional, Union, @@ -86,12 +87,12 @@ def __init__( attn_layer: int = 2, attn_dotr: bool = True, attn_mask: bool = False, - activation_function="tanh", + activation_function: str = "tanh", precision: str = "float64", resnet_dt: bool = False, - scaling_factor=1.0, - normalize=True, - temperature=None, + scaling_factor: float = 1.0, + normalize: bool = True, + temperature: Optional[float] = None, smooth: bool = True, type_one_side: bool = False, exclude_types: list[tuple[int, int]] = [], @@ -100,6 +101,7 @@ def __init__( ln_eps: Optional[float] = 1e-5, seed: Optional[Union[int, list[int]]] = None, type: Optional[str] = None, + trainable: bool = True, ) -> None: r"""Construct an embedding net of type `se_atten`. @@ -165,6 +167,8 @@ def __init__( If not None, the scaling of attention weights is `temperature` itself. seed : int, Optional Random seed for parameter initialization. 
+ trainable : bool, default: True + Whether this block is trainable """ super().__init__() del type @@ -224,6 +228,7 @@ def __init__( smooth=self.smooth, precision=self.precision, seed=child_seed(self.seed, 0), + trainable=trainable, ) wanted_shape = (self.ntypes, self.nnei, 4) @@ -248,6 +253,7 @@ def __init__( precision=self.precision, resnet_dt=self.resnet_dt, seed=child_seed(self.seed, 1), + trainable=trainable, ) self.filter_layers = filter_layers if self.tebd_input_mode in ["strip"]: @@ -261,6 +267,7 @@ def __init__( precision=self.precision, resnet_dt=self.resnet_dt, seed=child_seed(self.seed, 2), + trainable=trainable, ) self.filter_layers_strip = filter_layers_strip self.stats = None @@ -311,7 +318,7 @@ def get_dim_emb(self) -> int: """Returns the output dimension of embedding.""" return self.filter_neuron[-1] - def __setitem__(self, key, value) -> None: + def __setitem__(self, key: str, value: Any) -> None: if key in ("avg", "data_avg", "davg"): self.mean = value elif key in ("std", "data_std", "dstd"): @@ -319,7 +326,7 @@ def __setitem__(self, key, value) -> None: else: raise KeyError(key) - def __getitem__(self, key): + def __getitem__(self, key: str) -> Any: if key in ("avg", "data_avg", "davg"): return self.mean elif key in ("std", "data_std", "dstd"): @@ -344,17 +351,17 @@ def get_env_protection(self) -> float: return self.env_protection @property - def dim_out(self): + def dim_out(self) -> int: """Returns the output dimension of this descriptor.""" return self.filter_neuron[-1] * self.axis_neuron @property - def dim_in(self): + def dim_in(self) -> int: """Returns the atomic input dimension of this descriptor.""" return self.tebd_dim @property - def dim_emb(self): + def dim_emb(self) -> int: """Returns the output dimension of embedding.""" return self.get_dim_emb() @@ -419,10 +426,10 @@ def reinit_exclude( def enable_compression( self, - table_data, - table_config, - lower, - upper, + table_data: dict, + table_config: dict, + lower: dict, + upper: 
dict, ) -> None: net = "filter_net" self.compress_info[0] = torch.as_tensor( @@ -448,7 +455,13 @@ def forward( extended_atype_embd: Optional[torch.Tensor] = None, mapping: Optional[torch.Tensor] = None, type_embedding: Optional[torch.Tensor] = None, - ): + ) -> tuple[ + torch.Tensor, + Optional[torch.Tensor], + Optional[torch.Tensor], + Optional[torch.Tensor], + Optional[torch.Tensor], + ]: """Compute the descriptor. Parameters @@ -680,6 +693,7 @@ def __init__( smooth: bool = True, precision: str = DEFAULT_PRECISION, seed: Optional[Union[int, list[int]]] = None, + trainable: bool = True, ) -> None: """Construct a neighbor-wise attention net.""" super().__init__() @@ -715,17 +729,18 @@ def __init__( smooth=smooth, precision=precision, seed=child_seed(seed, i), + trainable=trainable, ) ) self.attention_layers = nn.ModuleList(attention_layers) def forward( self, - input_G, - nei_mask, + input_G: torch.Tensor, + nei_mask: torch.Tensor, input_r: Optional[torch.Tensor] = None, sw: Optional[torch.Tensor] = None, - ): + ) -> torch.Tensor: """Compute the multi-layer gated self-attention. 
Parameters @@ -745,13 +760,13 @@ def forward( out = layer(out, nei_mask, input_r=input_r, sw=sw) return out - def __getitem__(self, key): + def __getitem__(self, key: int) -> Any: if isinstance(key, int): return self.attention_layers[key] else: raise TypeError(key) - def __setitem__(self, key, value) -> None: + def __setitem__(self, key: int, value: Any) -> None: if not isinstance(key, int): raise TypeError(key) if isinstance(value, self.network_type): @@ -823,6 +838,7 @@ def __init__( ln_eps: float = 1e-5, precision: str = DEFAULT_PRECISION, seed: Optional[Union[int, list[int]]] = None, + trainable: bool = True, ) -> None: """Construct a neighbor-wise attention layer.""" super().__init__() @@ -850,6 +866,7 @@ def __init__( smooth=smooth, precision=precision, seed=child_seed(seed, 0), + trainable=trainable, ) self.attn_layer_norm = LayerNorm( self.embed_dim, @@ -861,11 +878,11 @@ def __init__( def forward( self, - x, - nei_mask, + x: torch.Tensor, + nei_mask: torch.Tensor, input_r: Optional[torch.Tensor] = None, sw: Optional[torch.Tensor] = None, - ): + ) -> torch.Tensor: residual = x x, _ = self.attention_layer(x, nei_mask, input_r=input_r, sw=sw) x = residual + x @@ -930,6 +947,7 @@ def __init__( smooth: bool = True, precision: str = DEFAULT_PRECISION, seed: Optional[Union[int, list[int]]] = None, + trainable: bool = True, ) -> None: """Construct a multi-head neighbor-wise attention net.""" super().__init__() @@ -962,6 +980,7 @@ def __init__( stddev=1.0, precision=precision, seed=child_seed(seed, 0), + trainable=trainable, ) self.out_proj = MLPLayer( hidden_dim, @@ -972,16 +991,17 @@ def __init__( stddev=1.0, precision=precision, seed=child_seed(seed, 1), + trainable=trainable, ) def forward( self, - query, - nei_mask, + query: torch.Tensor, + nei_mask: torch.Tensor, input_r: Optional[torch.Tensor] = None, sw: Optional[torch.Tensor] = None, attnw_shift: float = 20.0, - ): + ) -> tuple[torch.Tensor, torch.Tensor]: """Compute the multi-head gated self-attention. 
Parameters diff --git a/deepmd/pt/model/descriptor/se_atten_v2.py b/deepmd/pt/model/descriptor/se_atten_v2.py index 533d7887e0..5377d919b0 100644 --- a/deepmd/pt/model/descriptor/se_atten_v2.py +++ b/deepmd/pt/model/descriptor/se_atten_v2.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( + Any, Optional, Union, ) @@ -56,8 +57,8 @@ def __init__( exclude_types: list[tuple[int, int]] = [], env_protection: float = 0.0, scaling_factor: int = 1.0, - normalize=True, - temperature=None, + normalize: bool = True, + temperature: Optional[float] = None, concat_output_tebd: bool = True, trainable: bool = True, trainable_ln: bool = True, @@ -69,7 +70,7 @@ def __init__( use_tebd_bias: bool = False, type_map: Optional[list[str]] = None, # not implemented - spin=None, + spin: Optional[Any] = None, type: Optional[str] = None, ) -> None: r"""Construct smooth version of embedding net of type `se_atten_v2`. @@ -257,7 +258,7 @@ def deserialize(cls, data: dict) -> "DescrptSeAttenV2": data["use_tebd_bias"] = True obj = cls(**data) - def t_cvt(xx): + def t_cvt(xx: Any) -> torch.Tensor: return torch.tensor(xx, dtype=obj.se_atten.prec, device=env.DEVICE) obj.type_embedding.embedding = TypeEmbedNetConsistent.deserialize( diff --git a/deepmd/pt/model/descriptor/se_r.py b/deepmd/pt/model/descriptor/se_r.py index a91757460c..294323a48c 100644 --- a/deepmd/pt/model/descriptor/se_r.py +++ b/deepmd/pt/model/descriptor/se_r.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( + Any, Callable, Optional, Union, @@ -81,10 +82,10 @@ def tabulate_fusion_se_r( class DescrptSeR(BaseDescriptor, torch.nn.Module): def __init__( self, - rcut, - rcut_smth, - sel, - neuron=[25, 50, 100], + rcut: float, + rcut_smth: float, + sel: Union[list[int], int], + neuron: list[int] = [25, 50, 100], set_davg_zero: bool = False, activation_function: str = "tanh", precision: str = "float64", @@ -94,7 +95,7 @@ def __init__( trainable: bool = True, seed: 
Optional[Union[int, list[int]]] = None, type_map: Optional[list[str]] = None, - **kwargs, + **kwargs: Any, ) -> None: super().__init__() self.rcut = float(rcut) @@ -142,6 +143,7 @@ def __init__( precision=self.precision, resnet_dt=self.resnet_dt, seed=child_seed(self.seed, ii), + trainable=trainable, ) self.filter_layers = filter_layers self.stats = None @@ -225,7 +227,9 @@ def get_env_protection(self) -> float: """Returns the protection of building environment matrix.""" return self.env_protection - def share_params(self, base_class, shared_level, resume=False) -> None: + def share_params( + self, base_class: Any, shared_level: int, resume: bool = False + ) -> None: """ Share the parameters of self to the base_class with shared_level during multitask training. If not start from checkpoint (resume is False), @@ -267,7 +271,7 @@ def share_params(self, base_class, shared_level, resume=False) -> None: raise NotImplementedError def change_type_map( - self, type_map: list[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat: Optional[Any] = None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. 
@@ -329,7 +333,7 @@ def get_stats(self) -> dict[str, StatItem]: ) return self.stats - def __setitem__(self, key, value) -> None: + def __setitem__(self, key: str, value: Any) -> None: if key in ("avg", "data_avg", "davg"): self.mean = value elif key in ("std", "data_std", "dstd"): @@ -337,7 +341,7 @@ def __setitem__(self, key, value) -> None: else: raise KeyError(key) - def __getitem__(self, key): + def __getitem__(self, key: str) -> Any: if key in ("avg", "data_avg", "davg"): return self.mean elif key in ("std", "data_std", "dstd"): @@ -423,7 +427,13 @@ def forward( nlist: torch.Tensor, mapping: Optional[torch.Tensor] = None, comm_dict: Optional[dict[str, torch.Tensor]] = None, - ): + ) -> tuple[ + torch.Tensor, + Optional[torch.Tensor], + Optional[torch.Tensor], + Optional[torch.Tensor], + Optional[torch.Tensor], + ]: """Compute the descriptor. Parameters @@ -574,7 +584,7 @@ def deserialize(cls, data: dict) -> "DescrptSeR": env_mat = data.pop("env_mat") obj = cls(**data) - def t_cvt(xx): + def t_cvt(xx: Any) -> torch.Tensor: return torch.tensor(xx, dtype=obj.prec, device=env.DEVICE) obj["davg"] = t_cvt(variables["davg"]) diff --git a/deepmd/pt/model/descriptor/se_t.py b/deepmd/pt/model/descriptor/se_t.py index 6e075a04e4..c489d0be06 100644 --- a/deepmd/pt/model/descriptor/se_t.py +++ b/deepmd/pt/model/descriptor/se_t.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import itertools from typing import ( + Any, Callable, ClassVar, Optional, @@ -146,7 +147,7 @@ def __init__( type_map: Optional[list[str]] = None, ntypes: Optional[int] = None, # to be compat with input # not implemented - spin=None, + spin: Optional[dict] = None, ) -> None: del ntypes if spin is not None: @@ -202,7 +203,7 @@ def get_dim_emb(self) -> int: """Returns the output dimension.""" return self.seat.get_dim_emb() - def mixed_types(self): + def mixed_types(self) -> bool: """Returns if the descriptor requires a neighbor list that distinguish different atomic types or not. 
""" @@ -220,7 +221,9 @@ def get_env_protection(self) -> float: """Returns the protection of building environment matrix.""" return self.seat.get_env_protection() - def share_params(self, base_class, shared_level, resume=False) -> None: + def share_params( + self, base_class: Any, shared_level: int, resume: bool = False + ) -> None: """ Share the parameters of self to the base_class with shared_level during multitask training. If not start from checkpoint (resume is False), @@ -239,12 +242,12 @@ def share_params(self, base_class, shared_level, resume=False) -> None: raise NotImplementedError @property - def dim_out(self): + def dim_out(self) -> int: """Returns the output dimension of this descriptor.""" return self.seat.dim_out def change_type_map( - self, type_map: list[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat: Optional[Any] = None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. @@ -259,7 +262,7 @@ def compute_input_stats( self, merged: Union[Callable[[], list[dict]], list[dict]], path: Optional[DPPath] = None, - ): + ) -> None: """ Compute the input statistics (e.g. mean and stddev) for the descriptors from packed data. @@ -340,7 +343,13 @@ def forward( nlist: torch.Tensor, mapping: Optional[torch.Tensor] = None, comm_dict: Optional[dict[str, torch.Tensor]] = None, - ): + ) -> tuple[ + torch.Tensor, + Optional[torch.Tensor], + Optional[torch.Tensor], + Optional[torch.Tensor], + Optional[torch.Tensor], + ]: """Compute the descriptor. 
Parameters @@ -384,7 +393,7 @@ def forward( None, None, None, - sw.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION), + sw.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION) if sw is not None else None, ) def set_stat_mean_and_stddev( @@ -439,7 +448,7 @@ def deserialize(cls, data: dict) -> "DescrptSeT": env_mat = data.pop("env_mat") obj = cls(**data) - def t_cvt(xx): + def t_cvt(xx: Any) -> torch.Tensor: return torch.tensor(xx, dtype=obj.seat.prec, device=env.DEVICE) obj.seat["davg"] = t_cvt(variables["davg"]) @@ -575,6 +584,7 @@ def __init__( precision=self.precision, resnet_dt=self.resnet_dt, seed=child_seed(self.seed, ii), + trainable=trainable, ) self.filter_layers = filter_layers self.stats = None @@ -647,7 +657,7 @@ def get_env_protection(self) -> float: return self.env_protection @property - def dim_out(self): + def dim_out(self) -> int: """Returns the output dimension of this descriptor.""" return self.filter_neuron[-1] @@ -656,7 +666,7 @@ def dim_in(self) -> int: """Returns the atomic input dimension of this descriptor.""" return 0 - def __setitem__(self, key, value) -> None: + def __setitem__(self, key: str, value: Any) -> None: if key in ("avg", "data_avg", "davg"): self.mean = value elif key in ("std", "data_std", "dstd"): @@ -664,7 +674,7 @@ def __setitem__(self, key, value) -> None: else: raise KeyError(key) - def __getitem__(self, key): + def __getitem__(self, key: str) -> Any: if key in ("avg", "data_avg", "davg"): return self.mean elif key in ("std", "data_std", "dstd"): @@ -732,10 +742,10 @@ def reinit_exclude( def enable_compression( self, - table_data, - table_config, - lower, - upper, + table_data: dict, + table_config: dict, + lower: dict, + upper: dict, ) -> None: for embedding_idx, ll in enumerate(self.filter_layers.networks): ti = embedding_idx % self.ntypes @@ -767,7 +777,13 @@ def forward( extended_atype_embd: Optional[torch.Tensor] = None, mapping: Optional[torch.Tensor] = None, type_embedding: Optional[torch.Tensor] = None, - ): + ) -> tuple[ + 
torch.Tensor, + Optional[torch.Tensor], + Optional[torch.Tensor], + Optional[torch.Tensor], + Optional[torch.Tensor], + ]: """Compute the descriptor. Parameters diff --git a/deepmd/pt/model/descriptor/se_t_tebd.py b/deepmd/pt/model/descriptor/se_t_tebd.py index 7e27805bd5..f7de1c3015 100644 --- a/deepmd/pt/model/descriptor/se_t_tebd.py +++ b/deepmd/pt/model/descriptor/se_t_tebd.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( + Any, Callable, Optional, Union, @@ -140,7 +141,7 @@ def __init__( type_map: Optional[list[str]] = None, concat_output_tebd: bool = True, use_econf_tebd: bool = False, - use_tebd_bias=False, + use_tebd_bias: bool = False, smooth: bool = True, ) -> None: super().__init__() @@ -160,6 +161,7 @@ def __init__( env_protection=env_protection, smooth=smooth, seed=child_seed(seed, 1), + trainable=trainable, ) self.prec = PRECISION_DICT[precision] self.use_econf_tebd = use_econf_tebd @@ -170,6 +172,7 @@ def __init__( tebd_dim, precision=precision, seed=child_seed(seed, 2), + trainable=trainable, use_econf_tebd=use_econf_tebd, type_map=type_map, use_tebd_bias=use_tebd_bias, @@ -240,7 +243,9 @@ def get_env_protection(self) -> float: """Returns the protection of building environment matrix.""" return self.se_ttebd.get_env_protection() - def share_params(self, base_class, shared_level, resume=False) -> None: + def share_params( + self, base_class: Any, shared_level: int, resume: bool = False + ) -> None: """ Share the parameters of self to the base_class with shared_level during multitask training. 
If not start from checkpoint (resume is False), @@ -264,18 +269,18 @@ def share_params(self, base_class, shared_level, resume=False) -> None: raise NotImplementedError @property - def dim_out(self): + def dim_out(self) -> int: return self.get_dim_out() @property - def dim_emb(self): + def dim_emb(self) -> int: return self.get_dim_emb() def compute_input_stats( self, merged: Union[Callable[[], list[dict]], list[dict]], path: Optional[DPPath] = None, - ): + ) -> None: """ Compute the input statistics (e.g. mean and stddev) for the descriptors from packed data. @@ -308,7 +313,7 @@ def get_stat_mean_and_stddev(self) -> tuple[torch.Tensor, torch.Tensor]: return self.se_ttebd.mean, self.se_ttebd.stddev def change_type_map( - self, type_map: list[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat: Optional[Any] = None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. @@ -388,7 +393,7 @@ def deserialize(cls, data: dict) -> "DescrptSeTTebd": embeddings_strip = None obj = cls(**data) - def t_cvt(xx): + def t_cvt(xx: Any) -> torch.Tensor: return torch.tensor(xx, dtype=obj.se_ttebd.prec, device=env.DEVICE) obj.type_embedding.embedding = TypeEmbedNetConsistent.deserialize( @@ -410,7 +415,13 @@ def forward( nlist: torch.Tensor, mapping: Optional[torch.Tensor] = None, comm_dict: Optional[dict[str, torch.Tensor]] = None, - ): + ) -> tuple[ + torch.Tensor, + Optional[torch.Tensor], + Optional[torch.Tensor], + Optional[torch.Tensor], + Optional[torch.Tensor], + ]: """Compute the descriptor. 
Parameters @@ -470,7 +481,7 @@ def forward( None, None, None, - sw.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION), + sw.to(dtype=env.GLOBAL_PT_FLOAT_PRECISION) if sw is not None else None, ) @classmethod @@ -518,13 +529,14 @@ def __init__( tebd_dim: int = 8, tebd_input_mode: str = "concat", set_davg_zero: bool = True, - activation_function="tanh", + activation_function: str = "tanh", precision: str = "float64", resnet_dt: bool = False, exclude_types: list[tuple[int, int]] = [], env_protection: float = 0.0, smooth: bool = True, seed: Optional[Union[int, list[int]]] = None, + trainable: bool = True, ) -> None: super().__init__() self.rcut = float(rcut) @@ -577,6 +589,7 @@ def __init__( precision=self.precision, resnet_dt=self.resnet_dt, seed=child_seed(self.seed, 1), + trainable=trainable, ) self.filter_layers = filter_layers if self.tebd_input_mode in ["strip"]: @@ -590,6 +603,7 @@ def __init__( precision=self.precision, resnet_dt=self.resnet_dt, seed=child_seed(self.seed, 2), + trainable=trainable, ) self.filter_layers_strip = filter_layers_strip self.stats = None @@ -626,7 +640,7 @@ def get_dim_emb(self) -> int: """Returns the output dimension of embedding.""" return self.filter_neuron[-1] - def __setitem__(self, key, value) -> None: + def __setitem__(self, key: str, value: Any) -> None: if key in ("avg", "data_avg", "davg"): self.mean = value elif key in ("std", "data_std", "dstd"): @@ -634,7 +648,7 @@ def __setitem__(self, key, value) -> None: else: raise KeyError(key) - def __getitem__(self, key): + def __getitem__(self, key: str) -> Any: if key in ("avg", "data_avg", "davg"): return self.mean elif key in ("std", "data_std", "dstd"): @@ -659,17 +673,17 @@ def get_env_protection(self) -> float: return self.env_protection @property - def dim_out(self): + def dim_out(self) -> int: """Returns the output dimension of this descriptor.""" return self.filter_neuron[-1] @property - def dim_in(self): + def dim_in(self) -> int: """Returns the atomic input dimension of this 
descriptor.""" return self.tebd_dim @property - def dim_emb(self): + def dim_emb(self) -> int: """Returns the output dimension of embedding.""" return self.get_dim_emb() @@ -739,7 +753,13 @@ def forward( extended_atype_embd: Optional[torch.Tensor] = None, mapping: Optional[torch.Tensor] = None, type_embedding: Optional[torch.Tensor] = None, - ): + ) -> tuple[ + torch.Tensor, + Optional[torch.Tensor], + Optional[torch.Tensor], + Optional[torch.Tensor], + Optional[torch.Tensor], + ]: """Compute the descriptor. Parameters diff --git a/deepmd/pt/model/model/__init__.py b/deepmd/pt/model/model/__init__.py index 8d451f087f..1be46e084a 100644 --- a/deepmd/pt/model/model/__init__.py +++ b/deepmd/pt/model/model/__init__.py @@ -14,6 +14,7 @@ import copy import json from typing import ( + Any, Optional, ) @@ -75,7 +76,7 @@ ) -def _get_standard_model_components(model_params, ntypes): +def _get_standard_model_components(model_params: dict, ntypes: int) -> tuple: if "type_embedding" in model_params: raise ValueError( "In the PyTorch backend, type_embedding is not at the model level, but within the descriptor. See type embedding documentation for details." 
@@ -102,7 +103,7 @@ def _get_standard_model_components(model_params, ntypes): return descriptor, fitting, fitting_net["type"] -def get_spin_model(model_params): +def get_spin_model(model_params: dict) -> SpinModel: model_params = copy.deepcopy(model_params) if not model_params["spin"]["use_spin"] or isinstance( model_params["spin"]["use_spin"][0], int @@ -138,7 +139,7 @@ def get_spin_model(model_params): return SpinEnergyModel(backbone_model=backbone_model, spin=spin) -def get_linear_model(model_params): +def get_linear_model(model_params: dict) -> LinearEnergyModel: model_params = copy.deepcopy(model_params) weights = model_params.get("weights", "mean") list_of_models = [] @@ -178,7 +179,7 @@ def get_linear_model(model_params): ) -def get_zbl_model(model_params): +def get_zbl_model(model_params: dict) -> DPZBLModel: model_params = copy.deepcopy(model_params) ntypes = len(model_params["type_map"]) descriptor, fitting, _ = _get_standard_model_components(model_params, ntypes) @@ -209,7 +210,7 @@ def get_zbl_model(model_params): return model -def _can_be_converted_to_float(value) -> Optional[bool]: +def _can_be_converted_to_float(value: Any) -> Optional[bool]: try: float(value) return True @@ -218,7 +219,9 @@ def _can_be_converted_to_float(value) -> Optional[bool]: return False -def _convert_preset_out_bias_to_array(preset_out_bias, type_map): +def _convert_preset_out_bias_to_array( + preset_out_bias: Optional[dict], type_map: list[str] +) -> Optional[dict]: if preset_out_bias is not None: for kk in preset_out_bias: if len(preset_out_bias[kk]) != len(type_map): @@ -241,7 +244,7 @@ def _convert_preset_out_bias_to_array(preset_out_bias, type_map): return preset_out_bias -def get_standard_model(model_params): +def get_standard_model(model_params: dict) -> BaseModel: model_params_old = model_params model_params = copy.deepcopy(model_params) ntypes = len(model_params["type_map"]) @@ -284,7 +287,7 @@ def get_standard_model(model_params): return model -def 
get_model(model_params): +def get_model(model_params: dict) -> Any: model_type = model_params.get("type", "standard") if model_type == "standard": if "spin" in model_params: diff --git a/deepmd/pt/model/model/dipole_model.py b/deepmd/pt/model/model/dipole_model.py index a24820b74a..de089e7de7 100644 --- a/deepmd/pt/model/model/dipole_model.py +++ b/deepmd/pt/model/model/dipole_model.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( + Any, Optional, ) @@ -28,13 +29,13 @@ class DipoleModel(DPModelCommon, DPDipoleModel_): def __init__( self, - *args, - **kwargs, + *args: Any, + **kwargs: Any, ) -> None: DPModelCommon.__init__(self) DPDipoleModel_.__init__(self, *args, **kwargs) - def translated_output_def(self): + def translated_output_def(self) -> dict[str, Any]: out_def_data = self.model_output_def().get_data() output_def = { "dipole": out_def_data["dipole"], @@ -54,8 +55,8 @@ def translated_output_def(self): def forward( self, - coord, - atype, + coord: torch.Tensor, + atype: torch.Tensor, box: Optional[torch.Tensor] = None, fparam: Optional[torch.Tensor] = None, aparam: Optional[torch.Tensor] = None, @@ -91,14 +92,15 @@ def forward( @torch.jit.export def forward_lower( self, - extended_coord, - extended_atype, - nlist, + extended_coord: torch.Tensor, + extended_atype: torch.Tensor, + nlist: torch.Tensor, mapping: Optional[torch.Tensor] = None, fparam: Optional[torch.Tensor] = None, aparam: Optional[torch.Tensor] = None, do_atomic_virial: bool = False, - ): + comm_dict: Optional[dict[str, torch.Tensor]] = None, + ) -> dict[str, torch.Tensor]: model_ret = self.forward_common_lower( extended_coord, extended_atype, @@ -107,6 +109,7 @@ def forward_lower( fparam=fparam, aparam=aparam, do_atomic_virial=do_atomic_virial, + comm_dict=comm_dict, extra_nlist_sort=self.need_sorted_nlist_for_lower(), ) if self.get_fitting_net() is not None: diff --git a/deepmd/pt/model/model/dos_model.py b/deepmd/pt/model/model/dos_model.py index 
41d85a559e..a68735984f 100644 --- a/deepmd/pt/model/model/dos_model.py +++ b/deepmd/pt/model/model/dos_model.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( + Any, Optional, ) @@ -28,13 +29,13 @@ class DOSModel(DPModelCommon, DPDOSModel_): def __init__( self, - *args, - **kwargs, + *args: Any, + **kwargs: Any, ) -> None: DPModelCommon.__init__(self) DPDOSModel_.__init__(self, *args, **kwargs) - def translated_output_def(self): + def translated_output_def(self) -> dict[str, Any]: out_def_data = self.model_output_def().get_data() output_def = { "atom_dos": out_def_data["dos"], @@ -46,8 +47,8 @@ def translated_output_def(self): def forward( self, - coord, - atype, + coord: torch.Tensor, + atype: torch.Tensor, box: Optional[torch.Tensor] = None, fparam: Optional[torch.Tensor] = None, aparam: Optional[torch.Tensor] = None, @@ -81,14 +82,15 @@ def get_numb_dos(self) -> int: @torch.jit.export def forward_lower( self, - extended_coord, - extended_atype, - nlist, + extended_coord: torch.Tensor, + extended_atype: torch.Tensor, + nlist: torch.Tensor, mapping: Optional[torch.Tensor] = None, fparam: Optional[torch.Tensor] = None, aparam: Optional[torch.Tensor] = None, do_atomic_virial: bool = False, - ): + comm_dict: Optional[dict[str, torch.Tensor]] = None, + ) -> dict[str, torch.Tensor]: model_ret = self.forward_common_lower( extended_coord, extended_atype, @@ -97,6 +99,7 @@ def forward_lower( fparam=fparam, aparam=aparam, do_atomic_virial=do_atomic_virial, + comm_dict=comm_dict, extra_nlist_sort=self.need_sorted_nlist_for_lower(), ) if self.get_fitting_net() is not None: diff --git a/deepmd/pt/model/model/dp_linear_model.py b/deepmd/pt/model/model/dp_linear_model.py index fe4487a495..b71c8a10c3 100644 --- a/deepmd/pt/model/model/dp_linear_model.py +++ b/deepmd/pt/model/model/dp_linear_model.py @@ -1,10 +1,14 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( + Any, Optional, ) import torch +from deepmd.dpmodel.output_def 
import ( + OutputVariableDef, +) from deepmd.pt.model.atomic_model import ( LinearEnergyAtomicModel, ) @@ -31,12 +35,12 @@ class LinearEnergyModel(DPLinearModel_): def __init__( self, - *args, - **kwargs, + *args: Any, + **kwargs: Any, ) -> None: super().__init__(*args, **kwargs) - def translated_output_def(self): + def translated_output_def(self) -> dict[str, OutputVariableDef]: out_def_data = self.model_output_def().get_data() output_def = { "atom_energy": out_def_data["energy"], @@ -56,8 +60,8 @@ def translated_output_def(self): def forward( self, - coord, - atype, + coord: torch.Tensor, + atype: torch.Tensor, box: Optional[torch.Tensor] = None, fparam: Optional[torch.Tensor] = None, aparam: Optional[torch.Tensor] = None, @@ -90,14 +94,15 @@ def forward( @torch.jit.export def forward_lower( self, - extended_coord, - extended_atype, - nlist, + extended_coord: torch.Tensor, + extended_atype: torch.Tensor, + nlist: torch.Tensor, mapping: Optional[torch.Tensor] = None, fparam: Optional[torch.Tensor] = None, aparam: Optional[torch.Tensor] = None, do_atomic_virial: bool = False, - ): + comm_dict: Optional[dict[str, torch.Tensor]] = None, + ) -> dict[str, torch.Tensor]: model_ret = self.forward_common_lower( extended_coord, extended_atype, @@ -106,6 +111,7 @@ def forward_lower( fparam=fparam, aparam=aparam, do_atomic_virial=do_atomic_virial, + comm_dict=comm_dict, extra_nlist_sort=self.need_sorted_nlist_for_lower(), ) diff --git a/deepmd/pt/model/model/dp_model.py b/deepmd/pt/model/model/dp_model.py index e71c5e08de..875dc0dca0 100644 --- a/deepmd/pt/model/model/dp_model.py +++ b/deepmd/pt/model/model/dp_model.py @@ -47,11 +47,12 @@ def update_sel( ) return local_jdata_cpy, min_nbor_dist - def get_fitting_net(self): + # sadly, use -> BaseFitting here will not make torchscript happy + def get_fitting_net(self): # noqa: ANN201 """Get the fitting network.""" return self.atomic_model.fitting_net - def get_descriptor(self): + def get_descriptor(self): # noqa: ANN201 """Get 
the descriptor.""" return self.atomic_model.descriptor @@ -64,3 +65,13 @@ def set_eval_descriptor_hook(self, enable: bool) -> None: def eval_descriptor(self) -> torch.Tensor: """Evaluate the descriptor.""" return self.atomic_model.eval_descriptor() + + @torch.jit.export + def set_eval_fitting_last_layer_hook(self, enable: bool) -> None: + """Set the hook for evaluating fitting_last_layer and clear the cache for fitting_last_layer list.""" + self.atomic_model.set_eval_fitting_last_layer_hook(enable) + + @torch.jit.export + def eval_fitting_last_layer(self) -> torch.Tensor: + """Evaluate the fitting_last_layer.""" + return self.atomic_model.eval_fitting_last_layer() diff --git a/deepmd/pt/model/model/dp_zbl_model.py b/deepmd/pt/model/model/dp_zbl_model.py index 3c019b9376..7f84d8abec 100644 --- a/deepmd/pt/model/model/dp_zbl_model.py +++ b/deepmd/pt/model/model/dp_zbl_model.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( + Any, Optional, ) @@ -31,12 +32,12 @@ class DPZBLModel(DPZBLModel_): def __init__( self, - *args, - **kwargs, + *args: Any, + **kwargs: Any, ) -> None: super().__init__(*args, **kwargs) - def translated_output_def(self): + def translated_output_def(self) -> dict[str, Any]: out_def_data = self.model_output_def().get_data() output_def = { "atom_energy": out_def_data["energy"], @@ -56,8 +57,8 @@ def translated_output_def(self): def forward( self, - coord, - atype, + coord: torch.Tensor, + atype: torch.Tensor, box: Optional[torch.Tensor] = None, fparam: Optional[torch.Tensor] = None, aparam: Optional[torch.Tensor] = None, @@ -90,14 +91,15 @@ def forward( @torch.jit.export def forward_lower( self, - extended_coord, - extended_atype, - nlist, + extended_coord: torch.Tensor, + extended_atype: torch.Tensor, + nlist: torch.Tensor, mapping: Optional[torch.Tensor] = None, fparam: Optional[torch.Tensor] = None, aparam: Optional[torch.Tensor] = None, do_atomic_virial: bool = False, - ): + comm_dict: Optional[dict[str, 
torch.Tensor]] = None, + ) -> dict[str, torch.Tensor]: model_ret = self.forward_common_lower( extended_coord, extended_atype, @@ -106,6 +108,7 @@ def forward_lower( fparam=fparam, aparam=aparam, do_atomic_virial=do_atomic_virial, + comm_dict=comm_dict, extra_nlist_sort=self.need_sorted_nlist_for_lower(), ) diff --git a/deepmd/pt/model/model/ener_model.py b/deepmd/pt/model/model/ener_model.py index 8064d3eac7..dfe68d537f 100644 --- a/deepmd/pt/model/model/ener_model.py +++ b/deepmd/pt/model/model/ener_model.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( + Any, Optional, ) @@ -31,20 +32,46 @@ class EnergyModel(DPModelCommon, DPEnergyModel_): def __init__( self, - *args, - **kwargs, + *args: Any, + **kwargs: Any, ) -> None: DPModelCommon.__init__(self) DPEnergyModel_.__init__(self, *args, **kwargs) self._hessian_enabled = False - def enable_hessian(self): + def enable_hessian(self) -> None: self.__class__ = make_hessian_model(type(self)) self.hess_fitting_def = super(type(self), self).atomic_output_def() self.requires_hessian("energy") self._hessian_enabled = True - def translated_output_def(self): + @torch.jit.export + def get_observed_type_list(self) -> list[str]: + """Get observed types (elements) of the model during data statistics. + + Returns + ------- + observed_type_list: a list of the observed types in this model. + """ + type_map = self.get_type_map() + out_bias = self.atomic_model.get_out_bias()[0] + + assert out_bias is not None, "No out_bias found in the model." + assert out_bias.dim() == 2, "The supported out_bias should be a 2D tensor." + assert out_bias.size(0) == len(type_map), ( + "The out_bias shape does not match the type_map length." 
+ ) + bias_mask = ( + torch.gt(torch.abs(out_bias), 1e-6).any(dim=-1).detach().cpu() + ) # 1e-6 for stability + + observed_type_list: list[str] = [] + for i in range(len(type_map)): + if bias_mask[i]: + observed_type_list.append(type_map[i]) + return observed_type_list + + def translated_output_def(self) -> dict[str, Any]: out_def_data = self.model_output_def().get_data() output_def = { "atom_energy": out_def_data["energy"], @@ -66,8 +93,8 @@ def translated_output_def(self): def forward( self, - coord, - atype, + coord: torch.Tensor, + atype: torch.Tensor, box: Optional[torch.Tensor] = None, fparam: Optional[torch.Tensor] = None, aparam: Optional[torch.Tensor] = None, @@ -107,15 +134,15 @@ def forward( @torch.jit.export def forward_lower( self, - extended_coord, - extended_atype, - nlist, + extended_coord: torch.Tensor, + extended_atype: torch.Tensor, + nlist: torch.Tensor, mapping: Optional[torch.Tensor] = None, fparam: Optional[torch.Tensor] = None, aparam: Optional[torch.Tensor] = None, do_atomic_virial: bool = False, comm_dict: Optional[dict[str, torch.Tensor]] = None, - ): + ) -> dict[str, torch.Tensor]: model_ret = self.forward_common_lower( extended_coord, extended_atype, diff --git a/deepmd/pt/model/model/frozen.py b/deepmd/pt/model/model/frozen.py index 27284ec276..2a63b093db 100644 --- a/deepmd/pt/model/model/frozen.py +++ b/deepmd/pt/model/model/frozen.py @@ -2,6 +2,7 @@ import json import tempfile from typing import ( + Any, NoReturn, Optional, ) @@ -32,7 +33,7 @@ class FrozenModel(BaseModel): The path to the frozen model """ - def __init__(self, model_file: str, **kwargs) -> None: + def __init__(self, model_file: str, **kwargs: Any) -> None: super().__init__(**kwargs) self.model_file = model_file if model_file.endswith(".pth"): @@ -116,8 +117,8 @@ def need_sorted_nlist_for_lower(self) -> bool: @torch.jit.export def forward( self, - coord, - atype, + coord: torch.Tensor, + atype: torch.Tensor, box: Optional[torch.Tensor] = None, fparam: 
Optional[torch.Tensor] = None, aparam: Optional[torch.Tensor] = None, diff --git a/deepmd/pt/model/model/make_hessian_model.py b/deepmd/pt/model/model/make_hessian_model.py index 000b9abea4..b84e63ebd7 100644 --- a/deepmd/pt/model/model/make_hessian_model.py +++ b/deepmd/pt/model/model/make_hessian_model.py @@ -2,6 +2,7 @@ import copy import math from typing import ( + Any, Optional, Union, ) @@ -11,9 +12,12 @@ from deepmd.dpmodel import ( get_hessian_name, ) +from deepmd.dpmodel.output_def import ( + FittingOutputDef, +) -def make_hessian_model(T_Model): +def make_hessian_model(T_Model: type) -> type: """Make a model that can compute Hessian. LIMITATION: this model is not jitable due to the restrictions of torch jit script. @@ -34,8 +38,8 @@ def make_hessian_model(T_Model): class CM(T_Model): def __init__( self, - *args, - **kwargs, + *args: Any, + **kwargs: Any, ) -> None: super().__init__( *args, @@ -54,14 +58,14 @@ def requires_hessian( if kk in keys: self.hess_fitting_def[kk].r_hessian = True - def atomic_output_def(self): + def atomic_output_def(self) -> FittingOutputDef: """Get the fitting output def.""" return self.hess_fitting_def def forward_common( self, - coord, - atype, + coord: torch.Tensor, + atype: torch.Tensor, box: Optional[torch.Tensor] = None, fparam: Optional[torch.Tensor] = None, aparam: Optional[torch.Tensor] = None, @@ -159,9 +163,9 @@ def _cal_hessian_all( def _cal_hessian_one_component( self, - ci, - coord, - atype, + ci: int, + coord: torch.Tensor, + atype: torch.Tensor, box: Optional[torch.Tensor] = None, fparam: Optional[torch.Tensor] = None, aparam: Optional[torch.Tensor] = None, @@ -195,8 +199,8 @@ def __init__( def __call__( self, - xx, - ): + xx: torch.Tensor, + ) -> torch.Tensor: ci = self.ci atype, box, fparam, aparam = self.atype, self.box, self.fparam, self.aparam res = super(CM, self.obj).forward_common( diff --git a/deepmd/pt/model/model/make_model.py b/deepmd/pt/model/model/make_model.py index c32abaa095..53d32977b0 100644 
--- a/deepmd/pt/model/model/make_model.py +++ b/deepmd/pt/model/model/make_model.py @@ -1,5 +1,7 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( + Any, + Callable, Optional, ) @@ -39,7 +41,7 @@ ) -def make_model(T_AtomicModel: type[BaseAtomicModel]): +def make_model(T_AtomicModel: type[BaseAtomicModel]) -> type: """Make a model as a derived class of an atomic model. The model provide two interfaces. @@ -65,10 +67,10 @@ def make_model(T_AtomicModel: type[BaseAtomicModel]): class CM(BaseModel): def __init__( self, - *args, + *args: Any, # underscore to prevent conflict with normal inputs atomic_model_: Optional[T_AtomicModel] = None, - **kwargs, + **kwargs: Any, ) -> None: super().__init__(*args, **kwargs) if atomic_model_ is not None: @@ -80,7 +82,7 @@ def __init__( self.global_pt_float_precision = GLOBAL_PT_FLOAT_PRECISION self.global_pt_ener_float_precision = GLOBAL_PT_ENER_FLOAT_PRECISION - def model_output_def(self): + def model_output_def(self) -> ModelOutputDef: """Get the output def for the model.""" return ModelOutputDef(self.atomic_output_def()) @@ -129,8 +131,8 @@ def enable_compression( # cannot use the name forward. torch script does not work def forward_common( self, - coord, - atype, + coord: torch.Tensor, + atype: torch.Tensor, box: Optional[torch.Tensor] = None, fparam: Optional[torch.Tensor] = None, aparam: Optional[torch.Tensor] = None, @@ -206,8 +208,8 @@ def set_out_bias(self, out_bias: torch.Tensor) -> None: def change_out_bias( self, - merged, - bias_adjust_mode="change-by-statistic", + merged: Any, + bias_adjust_mode: str = "change-by-statistic", ) -> None: """Change the output bias of atomic model according to the input data and the pretrained model. 
@@ -233,16 +235,16 @@ def change_out_bias( def forward_common_lower( self, - extended_coord, - extended_atype, - nlist, + extended_coord: torch.Tensor, + extended_atype: torch.Tensor, + nlist: torch.Tensor, mapping: Optional[torch.Tensor] = None, fparam: Optional[torch.Tensor] = None, aparam: Optional[torch.Tensor] = None, do_atomic_virial: bool = False, comm_dict: Optional[dict[str, torch.Tensor]] = None, extra_nlist_sort: bool = False, - ): + ) -> dict[str, torch.Tensor]: """Return model prediction. Lower interface that takes extended atomic coordinates and types, nlist, and mapping as input, and returns the predictions on the extended region. @@ -299,6 +301,7 @@ def forward_common_lower( cc_ext, do_atomic_virial=do_atomic_virial, create_graph=self.training, + mask=atomic_ret["mask"] if "mask" in atomic_ret else None, ) model_predict = self.output_type_cast(model_predict, input_prec) return model_predict @@ -382,7 +385,7 @@ def format_nlist( extended_atype: torch.Tensor, nlist: torch.Tensor, extra_nlist_sort: bool = False, - ): + ) -> torch.Tensor: """Format the neighbor list. 1. If the number of neighbors in the `nlist` is equal to sum(self.sel), @@ -433,7 +436,7 @@ def _format_nlist( nlist: torch.Tensor, nnei: int, extra_nlist_sort: bool = False, - ): + ) -> torch.Tensor: n_nf, n_nloc, n_nnei = nlist.shape # nf x nall x 3 extended_coord = extended_coord.view([n_nf, -1, 3]) @@ -495,7 +498,7 @@ def do_grad_c( return self.atomic_model.do_grad_c(var_name) def change_type_map( - self, type_map: list[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat: Optional[Any] = None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. 
@@ -511,10 +514,10 @@ def serialize(self) -> dict: return self.atomic_model.serialize() @classmethod - def deserialize(cls, data) -> "CM": + def deserialize(cls, data: Any) -> "CM": return cls(atomic_model_=T_AtomicModel.deserialize(data)) - def set_case_embd(self, case_idx: int): + def set_case_embd(self, case_idx: int) -> None: self.atomic_model.set_case_embd(case_idx) @torch.jit.export @@ -522,6 +525,11 @@ def get_dim_fparam(self) -> int: """Get the number (dimension) of frame parameters of this atomic model.""" return self.atomic_model.get_dim_fparam() + @torch.jit.export + def has_default_fparam(self) -> bool: + """Check if the model has default frame parameters.""" + return self.atomic_model.has_default_fparam() + @torch.jit.export def get_dim_aparam(self) -> int: """Get the number (dimension) of atomic parameters of this atomic model.""" @@ -571,9 +579,9 @@ def atomic_output_def(self) -> FittingOutputDef: def compute_or_load_stat( self, - sampled_func, + sampled_func: Callable[[], Any], stat_file_path: Optional[DPPath] = None, - ): + ) -> None: """Compute or load the statistics.""" return self.atomic_model.compute_or_load_stat(sampled_func, stat_file_path) @@ -604,8 +612,8 @@ def need_sorted_nlist_for_lower(self) -> bool: def forward( self, - coord, - atype, + coord: torch.Tensor, + atype: torch.Tensor, box: Optional[torch.Tensor] = None, fparam: Optional[torch.Tensor] = None, aparam: Optional[torch.Tensor] = None, diff --git a/deepmd/pt/model/model/model.py b/deepmd/pt/model/model/model.py index 44600ab804..e3cf7bde17 100644 --- a/deepmd/pt/model/model/model.py +++ b/deepmd/pt/model/model/model.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( + Any, NoReturn, Optional, ) @@ -18,7 +19,7 @@ class BaseModel(torch.nn.Module, make_base_model()): - def __init__(self, *args, **kwargs) -> None: + def __init__(self, *args: Any, **kwargs: Any) -> None: """Construct a basic model for different tasks.""" 
torch.nn.Module.__init__(self) self.model_def_script = "" @@ -28,7 +29,7 @@ def __init__(self, *args, **kwargs) -> None: def compute_or_load_stat( self, - sampled_func, + sampled_func: Any, stat_file_path: Optional[DPPath] = None, ) -> NoReturn: """ @@ -48,6 +49,16 @@ def compute_or_load_stat( """ raise NotImplementedError + @torch.jit.export + def get_observed_type_list(self) -> list[str]: + """Get observed types (elements) of the model during data statistics. + + Returns + ------- + observed_type_list: a list of the observed types in this model. + """ + raise NotImplementedError + @torch.jit.export def get_model_def_script(self) -> str: """Get the model definition script.""" @@ -61,6 +72,6 @@ def get_min_nbor_dist(self) -> Optional[float]: return self.min_nbor_dist.item() @torch.jit.export - def get_ntypes(self): + def get_ntypes(self) -> int: """Returns the number of element types.""" return len(self.get_type_map()) diff --git a/deepmd/pt/model/model/polar_model.py b/deepmd/pt/model/model/polar_model.py index cb72532366..18eac5d24c 100644 --- a/deepmd/pt/model/model/polar_model.py +++ b/deepmd/pt/model/model/polar_model.py @@ -1,10 +1,14 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( + Any, Optional, ) import torch +from deepmd.dpmodel.output_def import ( + OutputVariableDef, +) from deepmd.pt.model.atomic_model import ( DPPolarAtomicModel, ) @@ -28,13 +32,13 @@ class PolarModel(DPModelCommon, DPPolarModel_): def __init__( self, - *args, - **kwargs, + *args: Any, + **kwargs: Any, ) -> None: DPModelCommon.__init__(self) DPPolarModel_.__init__(self, *args, **kwargs) - def translated_output_def(self): + def translated_output_def(self) -> dict[str, OutputVariableDef]: out_def_data = self.model_output_def().get_data() output_def = { "polar": out_def_data["polarizability"], @@ -46,8 +50,8 @@ def translated_output_def(self): def forward( self, - coord, - atype, + coord: torch.Tensor, + atype: torch.Tensor, box: Optional[torch.Tensor] = None, 
fparam: Optional[torch.Tensor] = None, aparam: Optional[torch.Tensor] = None, @@ -75,14 +79,15 @@ def forward( @torch.jit.export def forward_lower( self, - extended_coord, - extended_atype, - nlist, + extended_coord: torch.Tensor, + extended_atype: torch.Tensor, + nlist: torch.Tensor, mapping: Optional[torch.Tensor] = None, fparam: Optional[torch.Tensor] = None, aparam: Optional[torch.Tensor] = None, do_atomic_virial: bool = False, - ): + comm_dict: Optional[dict[str, torch.Tensor]] = None, + ) -> dict[str, torch.Tensor]: model_ret = self.forward_common_lower( extended_coord, extended_atype, @@ -91,6 +96,7 @@ def forward_lower( fparam=fparam, aparam=aparam, do_atomic_virial=do_atomic_virial, + comm_dict=comm_dict, extra_nlist_sort=self.need_sorted_nlist_for_lower(), ) if self.get_fitting_net() is not None: diff --git a/deepmd/pt/model/model/property_model.py b/deepmd/pt/model/model/property_model.py index 7c50c75ff1..0931862ae8 100644 --- a/deepmd/pt/model/model/property_model.py +++ b/deepmd/pt/model/model/property_model.py @@ -1,10 +1,14 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( + Any, Optional, ) import torch +from deepmd.dpmodel.output_def import ( + OutputVariableDef, +) from deepmd.pt.model.atomic_model import ( DPPropertyAtomicModel, ) @@ -28,13 +32,13 @@ class PropertyModel(DPModelCommon, DPPropertyModel_): def __init__( self, - *args, - **kwargs, + *args: Any, + **kwargs: Any, ) -> None: DPModelCommon.__init__(self) DPPropertyModel_.__init__(self, *args, **kwargs) - def translated_output_def(self): + def translated_output_def(self) -> dict[str, OutputVariableDef]: out_def_data = self.model_output_def().get_data() output_def = { f"atom_{self.get_var_name()}": out_def_data[self.get_var_name()], @@ -46,8 +50,8 @@ def translated_output_def(self): def forward( self, - coord, - atype, + coord: torch.Tensor, + atype: torch.Tensor, box: Optional[torch.Tensor] = None, fparam: Optional[torch.Tensor] = None, aparam: Optional[torch.Tensor] = 
None, @@ -86,15 +90,15 @@ def get_var_name(self) -> str: @torch.jit.export def forward_lower( self, - extended_coord, - extended_atype, - nlist, + extended_coord: torch.Tensor, + extended_atype: torch.Tensor, + nlist: torch.Tensor, mapping: Optional[torch.Tensor] = None, fparam: Optional[torch.Tensor] = None, aparam: Optional[torch.Tensor] = None, do_atomic_virial: bool = False, comm_dict: Optional[dict[str, torch.Tensor]] = None, - ): + ) -> dict[str, torch.Tensor]: model_ret = self.forward_common_lower( extended_coord, extended_atype, diff --git a/deepmd/pt/model/model/spin_model.py b/deepmd/pt/model/model/spin_model.py index ac94668039..bd7158fb8f 100644 --- a/deepmd/pt/model/model/spin_model.py +++ b/deepmd/pt/model/model/spin_model.py @@ -4,6 +4,8 @@ deepcopy, ) from typing import ( + Any, + Callable, Optional, ) @@ -38,7 +40,7 @@ class SpinModel(torch.nn.Module): def __init__( self, - backbone_model, + backbone_model: DPAtomicModel, spin: Spin, ) -> None: super().__init__() @@ -48,7 +50,9 @@ def __init__( self.virtual_scale_mask = to_torch_tensor(self.spin.get_virtual_scale_mask()) self.spin_mask = to_torch_tensor(self.spin.get_spin_mask()) - def process_spin_input(self, coord, atype, spin): + def process_spin_input( + self, coord: torch.Tensor, atype: torch.Tensor, spin: torch.Tensor + ) -> tuple[torch.Tensor, torch.Tensor]: """Generate virtual coordinates and types, concat into the input.""" nframes, nloc = atype.shape coord = coord.reshape(nframes, nloc, 3) @@ -62,12 +66,12 @@ def process_spin_input(self, coord, atype, spin): def process_spin_input_lower( self, - extended_coord, - extended_atype, - extended_spin, - nlist, + extended_coord: torch.Tensor, + extended_atype: torch.Tensor, + extended_spin: torch.Tensor, + nlist: torch.Tensor, mapping: Optional[torch.Tensor] = None, - ): + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]: """ Add `extended_spin` into `extended_coord` to generate virtual atoms, and extend `nlist` and 
`mapping`. Note that the final `extended_coord_updated` with shape [nframes, nall + nall, 3] has the following order: @@ -103,8 +107,12 @@ def process_spin_input_lower( ) def process_spin_output( - self, atype, out_tensor, add_mag: bool = True, virtual_scale: bool = True - ): + self, + atype: torch.Tensor, + out_tensor: torch.Tensor, + add_mag: bool = True, + virtual_scale: bool = True, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ Split the output both real and virtual atoms, and scale the latter. add_mag: whether to add magnetic tensor onto the real tensor. @@ -132,12 +140,12 @@ def process_spin_output( def process_spin_output_lower( self, - extended_atype, - extended_out_tensor, + extended_atype: torch.Tensor, + extended_out_tensor: torch.Tensor, nloc: int, add_mag: bool = True, virtual_scale: bool = True, - ): + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """ Split the extended output of both real and virtual atoms with switch, and scale the latter. add_mag: whether to add magnetic tensor onto the real tensor. 
@@ -177,7 +185,7 @@ def process_spin_output_lower( return extended_out_real, extended_out_mag, atomic_mask > 0.0 @staticmethod - def extend_nlist(extended_atype, nlist): + def extend_nlist(extended_atype: torch.Tensor, nlist: torch.Tensor) -> torch.Tensor: nframes, nloc, nnei = nlist.shape nall = extended_atype.shape[1] nlist_mask = nlist != -1 @@ -207,7 +215,7 @@ def extend_nlist(extended_atype, nlist): return extended_nlist @staticmethod - def expand_aparam(aparam, nloc: int): + def expand_aparam(aparam: torch.Tensor, nloc: int) -> torch.Tensor: """Expand the atom parameters for virtual atoms if necessary.""" nframes, natom, numb_aparam = aparam.shape if natom == nloc: # good @@ -239,22 +247,22 @@ def get_type_map(self) -> list[str]: return tmap[:ntypes] @torch.jit.export - def get_ntypes(self): + def get_ntypes(self) -> int: """Returns the number of element types.""" return len(self.get_type_map()) @torch.jit.export - def get_rcut(self): + def get_rcut(self) -> float: """Get the cut-off radius.""" return self.backbone_model.get_rcut() @torch.jit.export - def get_dim_fparam(self): + def get_dim_fparam(self) -> int: """Get the number (dimension) of frame parameters of this atomic model.""" return self.backbone_model.get_dim_fparam() @torch.jit.export - def get_dim_aparam(self): + def get_dim_aparam(self) -> int: """Get the number (dimension) of atomic parameters of this atomic model.""" return self.backbone_model.get_dim_aparam() @@ -320,7 +328,7 @@ def need_sorted_nlist_for_lower(self) -> bool: """Returns whether the model needs sorted nlist when using `forward_lower`.""" return self.backbone_model.need_sorted_nlist_for_lower() - def model_output_def(self): + def model_output_def(self) -> ModelOutputDef: """Get the output def for the model.""" model_output_type = self.backbone_model.model_output_type() if "mask" in model_output_type: @@ -330,7 +338,7 @@ def model_output_def(self): backbone_model_atomic_output_def[var_name].magnetic = True return 
ModelOutputDef(backbone_model_atomic_output_def) - def __getattr__(self, name): + def __getattr__(self, name: str) -> Any: """Get attribute from the wrapped model.""" if ( name == "backbone_model" @@ -343,7 +351,7 @@ def __getattr__(self, name): def compute_or_load_stat( self, - sampled_func, + sampled_func: Callable[[], list[dict[str, Any]]], stat_file_path: Optional[DPPath] = None, ) -> None: """ @@ -363,7 +371,7 @@ def compute_or_load_stat( """ @functools.lru_cache - def spin_sampled_func(): + def spin_sampled_func() -> list[dict[str, Any]]: sampled = sampled_func() spin_sampled = [] for sys in sampled: @@ -389,9 +397,9 @@ def spin_sampled_func(): def forward_common( self, - coord, - atype, - spin, + coord: torch.Tensor, + atype: torch.Tensor, + spin: torch.Tensor, box: Optional[torch.Tensor] = None, fparam: Optional[torch.Tensor] = None, aparam: Optional[torch.Tensor] = None, @@ -437,17 +445,17 @@ def forward_common( def forward_common_lower( self, - extended_coord, - extended_atype, - extended_spin, - nlist, + extended_coord: torch.Tensor, + extended_atype: torch.Tensor, + extended_spin: torch.Tensor, + nlist: torch.Tensor, mapping: Optional[torch.Tensor] = None, fparam: Optional[torch.Tensor] = None, aparam: Optional[torch.Tensor] = None, do_atomic_virial: bool = False, comm_dict: Optional[dict[str, torch.Tensor]] = None, extra_nlist_sort: bool = False, - ): + ) -> dict[str, torch.Tensor]: nframes, nloc = nlist.shape[:2] ( extended_coord_updated, @@ -506,7 +514,7 @@ def serialize(self) -> dict: } @classmethod - def deserialize(cls, data) -> "SpinModel": + def deserialize(cls, data: dict[str, Any]) -> "SpinModel": backbone_model_obj = make_model(DPAtomicModel).deserialize( data["backbone_model"] ) @@ -524,12 +532,12 @@ class SpinEnergyModel(SpinModel): def __init__( self, - backbone_model, + backbone_model: DPAtomicModel, spin: Spin, ) -> None: super().__init__(backbone_model, spin) - def translated_output_def(self): + def translated_output_def(self) -> 
dict[str, Any]: out_def_data = self.model_output_def().get_data() output_def = { "atom_energy": out_def_data["energy"], @@ -545,9 +553,9 @@ def translated_output_def(self): def forward( self, - coord, - atype, - spin, + coord: torch.Tensor, + atype: torch.Tensor, + spin: torch.Tensor, box: Optional[torch.Tensor] = None, fparam: Optional[torch.Tensor] = None, aparam: Optional[torch.Tensor] = None, @@ -575,16 +583,16 @@ def forward( @torch.jit.export def forward_lower( self, - extended_coord, - extended_atype, - extended_spin, - nlist, + extended_coord: torch.Tensor, + extended_atype: torch.Tensor, + extended_spin: torch.Tensor, + nlist: torch.Tensor, mapping: Optional[torch.Tensor] = None, fparam: Optional[torch.Tensor] = None, aparam: Optional[torch.Tensor] = None, do_atomic_virial: bool = False, comm_dict: Optional[dict[str, torch.Tensor]] = None, - ): + ) -> dict[str, torch.Tensor]: model_ret = self.forward_common_lower( extended_coord, extended_atype, diff --git a/deepmd/pt/model/model/transform_output.py b/deepmd/pt/model/model/transform_output.py index b8f1e024e0..cd88e4cb40 100644 --- a/deepmd/pt/model/model/transform_output.py +++ b/deepmd/pt/model/model/transform_output.py @@ -20,7 +20,7 @@ def atomic_virial_corr( extended_coord: torch.Tensor, atom_energy: torch.Tensor, -): +) -> torch.Tensor: nall = extended_coord.shape[1] nloc = atom_energy.shape[1] coord, _ = torch.split(extended_coord, [nloc, nall - nloc], dim=1) @@ -72,7 +72,7 @@ def task_deriv_one( do_virial: bool = True, do_atomic_virial: bool = False, create_graph: bool = True, -): +) -> tuple[torch.Tensor, Optional[torch.Tensor]]: faked_grad = torch.ones_like(energy) lst = torch.jit.annotate(list[Optional[torch.Tensor]], [faked_grad]) extended_force = torch.autograd.grad( @@ -102,7 +102,7 @@ def task_deriv_one( def get_leading_dims( vv: torch.Tensor, vdef: OutputVariableDef, -): +) -> list[int]: """Get the dimensions of nf x nloc.""" vshape = vv.shape return list(vshape[: (len(vshape) - 
len(vdef.shape))]) @@ -116,7 +116,7 @@ def take_deriv( do_virial: bool = False, do_atomic_virial: bool = False, create_graph: bool = True, -): +) -> tuple[torch.Tensor, Optional[torch.Tensor]]: size = 1 for ii in vdef.shape: size *= ii @@ -158,6 +158,7 @@ def fit_output_to_model_output( coord_ext: torch.Tensor, do_atomic_virial: bool = False, create_graph: bool = True, + mask: Optional[torch.Tensor] = None, ) -> dict[str, torch.Tensor]: """Transform the output of the fitting network to the model output. @@ -172,7 +173,12 @@ def fit_output_to_model_output( if vdef.reducible: kk_redu = get_reduce_name(kk) if vdef.intensive: - model_ret[kk_redu] = torch.mean(vv.to(redu_prec), dim=atom_axis) + if mask is not None: + model_ret[kk_redu] = torch.sum( + vv.to(redu_prec), dim=atom_axis + ) / torch.sum(mask, dim=-1, keepdim=True) + else: + model_ret[kk_redu] = torch.mean(vv.to(redu_prec), dim=atom_axis) else: model_ret[kk_redu] = torch.sum(vv.to(redu_prec), dim=atom_axis) if vdef.r_differentiable: diff --git a/deepmd/pt/model/network/init.py b/deepmd/pt/model/network/init.py index 53e2c70892..6bdff61eea 100644 --- a/deepmd/pt/model/network/init.py +++ b/deepmd/pt/model/network/init.py @@ -18,19 +18,36 @@ # functions that use `with torch.no_grad()`. The JIT doesn't support context # managers, so these need to be implemented as builtins. Using these wrappers # lets us keep those builtins small and reusable. 
-def _no_grad_uniform_(tensor, a, b, generator=None): +def _no_grad_uniform_( + tensor: torch.Tensor, + a: float, + b: float, + generator: _Optional[torch.Generator] = None, +) -> torch.Tensor: with torch.no_grad(): return tensor.uniform_(a, b, generator=generator) -def _no_grad_normal_(tensor, mean, std, generator=None): +def _no_grad_normal_( + tensor: torch.Tensor, + mean: float, + std: float, + generator: _Optional[torch.Generator] = None, +) -> torch.Tensor: with torch.no_grad(): return tensor.normal_(mean, std, generator=generator) -def _no_grad_trunc_normal_(tensor, mean, std, a, b, generator=None): +def _no_grad_trunc_normal_( + tensor: torch.Tensor, + mean: float, + std: float, + a: float, + b: float, + generator: _Optional[torch.Generator] = None, +) -> torch.Tensor: # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf - def norm_cdf(x): + def norm_cdf(x: float) -> float: # Computes standard normal cumulative distribution function return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0 @@ -65,17 +82,17 @@ def norm_cdf(x): return tensor -def _no_grad_zero_(tensor): +def _no_grad_zero_(tensor: torch.Tensor) -> torch.Tensor: with torch.no_grad(): return tensor.zero_() -def _no_grad_fill_(tensor, val): +def _no_grad_fill_(tensor: torch.Tensor, val: float) -> torch.Tensor: with torch.no_grad(): return tensor.fill_(val) -def calculate_gain(nonlinearity, param=None): +def calculate_gain(nonlinearity: str, param: _Optional[float] = None) -> float: r"""Return the recommended gain value for the given nonlinearity function. 
The values are as follows: @@ -146,7 +163,7 @@ def calculate_gain(nonlinearity, param=None): raise ValueError(f"Unsupported nonlinearity {nonlinearity}") -def _calculate_fan_in_and_fan_out(tensor): +def _calculate_fan_in_and_fan_out(tensor: torch.Tensor) -> tuple[int, int]: dimensions = tensor.dim() if dimensions < 2: raise ValueError( @@ -167,7 +184,7 @@ def _calculate_fan_in_and_fan_out(tensor): return fan_in, fan_out -def _calculate_correct_fan(tensor, mode): +def _calculate_correct_fan(tensor: torch.Tensor, mode: str) -> int: mode = mode.lower() valid_modes = ["fan_in", "fan_out"] if mode not in valid_modes: @@ -290,7 +307,7 @@ def kaiming_uniform_( mode: str = "fan_in", nonlinearity: str = "leaky_relu", generator: _Optional[torch.Generator] = None, -): +) -> Tensor: r"""Fill the input `Tensor` with values using a Kaiming uniform distribution. The method is described in `Delving deep into rectifiers: Surpassing @@ -348,7 +365,7 @@ def kaiming_normal_( mode: str = "fan_in", nonlinearity: str = "leaky_relu", generator: _Optional[torch.Generator] = None, -): +) -> Tensor: r"""Fill the input `Tensor` with values using a Kaiming normal distribution. 
The method is described in `Delving deep into rectifiers: Surpassing diff --git a/deepmd/pt/model/network/layernorm.py b/deepmd/pt/model/network/layernorm.py index 89bd16d569..fdf31d0ffd 100644 --- a/deepmd/pt/model/network/layernorm.py +++ b/deepmd/pt/model/network/layernorm.py @@ -30,14 +30,14 @@ device = env.DEVICE -def empty_t(shape, precision): +def empty_t(shape: tuple[int, ...], precision: torch.dtype) -> torch.Tensor: return torch.empty(shape, dtype=precision, device=device) class LayerNorm(nn.Module): def __init__( self, - num_in, + num_in: int, eps: float = 1e-5, uni_init: bool = True, bavg: float = 0.0, @@ -141,7 +141,7 @@ def deserialize(cls, data: dict) -> "LayerNorm": ) prec = PRECISION_DICT[obj.precision] - def check_load_param(ss): + def check_load_param(ss: str) -> Optional[nn.Parameter]: return ( nn.Parameter(data=to_torch_tensor(nl[ss])) if nl[ss] is not None diff --git a/deepmd/pt/model/network/mlp.py b/deepmd/pt/model/network/mlp.py index 22675d6163..a850c85a9b 100644 --- a/deepmd/pt/model/network/mlp.py +++ b/deepmd/pt/model/network/mlp.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( + Any, ClassVar, Optional, Union, @@ -43,7 +44,7 @@ ) -def empty_t(shape, precision): +def empty_t(shape: tuple[int, ...], precision: torch.dtype) -> torch.Tensor: return torch.empty(shape, dtype=precision, device=device) @@ -72,8 +73,8 @@ def deserialize(cls, data: dict) -> "Identity": class MLPLayer(nn.Module): def __init__( self, - num_in, - num_out, + num_in: int, + num_out: int, bias: bool = True, use_timestep: bool = False, activation_function: Optional[str] = None, @@ -83,8 +84,10 @@ def __init__( precision: str = DEFAULT_PRECISION, init: str = "default", seed: Optional[Union[int, list[int]]] = None, + trainable: bool = True, ) -> None: super().__init__() + self.trainable = trainable # only use_timestep when skip connection is established. 
self.use_timestep = use_timestep and ( num_out == num_in or num_out == num_in * 2 @@ -130,7 +133,7 @@ def __init__( def check_type_consistency(self) -> None: precision = self.precision - def check_var(var) -> None: + def check_var(var: Optional[torch.Tensor]) -> None: if var is not None: # assertion "float64" == "double" would fail assert PRECISION_DICT[var.dtype.name] is PRECISION_DICT[precision] @@ -162,7 +165,7 @@ def _default_normal_init( normal_(self.idt.data, mean=0.1, std=0.001, generator=generator) def _trunc_normal_init( - self, scale=1.0, generator: Optional[torch.Generator] = None + self, scale: float = 1.0, generator: Optional[torch.Generator] = None ) -> None: # Constant from scipy.stats.truncnorm.std(a=-2, b=2, loc=0., scale=1.) TRUNCATED_NORMAL_STDDEV_FACTOR = 0.87962566103423978 @@ -174,7 +177,7 @@ def _trunc_normal_init( def _glorot_uniform_init(self, generator: Optional[torch.Generator] = None) -> None: xavier_uniform_(self.matrix, gain=1, generator=generator) - def _zero_init(self, use_bias=True) -> None: + def _zero_init(self, use_bias: bool = True) -> None: with torch.no_grad(): self.matrix.fill_(0.0) if use_bias and self.bias is not None: @@ -233,6 +236,7 @@ def serialize(self) -> dict: activation_function=self.activate_name, resnet=self.resnet, precision=self.precision, + trainable=self.trainable, ) nl.w, nl.b, nl.idt = ( to_numpy_array(self.matrix), @@ -259,10 +263,11 @@ def deserialize(cls, data: dict) -> "MLPLayer": activation_function=nl["activation_function"], resnet=nl["resnet"], precision=nl["precision"], + trainable=nl["trainable"], ) prec = PRECISION_DICT[obj.precision] - def check_load_param(ss): + def check_load_param(ss: str) -> Optional[nn.Parameter]: return ( nn.Parameter(data=to_torch_tensor(nl[ss])) if nl[ss] is not None @@ -279,7 +284,7 @@ def check_load_param(ss): class MLP(MLP_): - def __init__(self, *args, **kwargs) -> None: + def __init__(self, *args: Any, **kwargs: Any) -> None: super().__init__(*args, **kwargs) 
self.layers = torch.nn.ModuleList(self.layers) @@ -300,7 +305,7 @@ class NetworkCollection(DPNetworkCollection, nn.Module): "fitting_network": FittingNet, } - def __init__(self, *args, **kwargs) -> None: + def __init__(self, *args: Any, **kwargs: Any) -> None: # init both two base classes DPNetworkCollection.__init__(self, *args, **kwargs) nn.Module.__init__(self) diff --git a/deepmd/pt/model/network/network.py b/deepmd/pt/model/network/network.py index ab01a90774..d95741b05c 100644 --- a/deepmd/pt/model/network/network.py +++ b/deepmd/pt/model/network/network.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( + Any, Final, Optional, Union, @@ -32,7 +33,7 @@ ) -def Tensor(*shape): +def Tensor(*shape: int) -> torch.Tensor: return torch.empty(shape, dtype=env.GLOBAL_PT_FLOAT_PRECISION, device=env.DEVICE) @@ -41,12 +42,12 @@ class SimpleLinear(nn.Module): def __init__( self, - num_in, - num_out, - bavg=0.0, - stddev=1.0, - use_timestep=False, - activate=None, + num_in: int, + num_out: int, + bavg: float = 0.0, + stddev: float = 1.0, + use_timestep: bool = False, + activate: Optional[str] = None, bias: bool = True, ) -> None: """Construct a linear layer. @@ -74,7 +75,7 @@ def __init__( self.idt = nn.Parameter(data=Tensor(1, num_out)) nn.init.normal_(self.idt.data, mean=0.1, std=0.001) - def forward(self, inputs): + def forward(self, inputs: torch.Tensor) -> torch.Tensor: """Return X*W+b.""" xw = torch.matmul(inputs, self.matrix) hidden = xw + self.bias if self.bias is not None else xw @@ -121,7 +122,7 @@ def __init__( else: raise ValueError("Invalid init method.") - def _trunc_normal_init(self, scale=1.0) -> None: + def _trunc_normal_init(self, scale: float = 1.0) -> None: # Constant from scipy.stats.truncnorm.std(a=-2, b=2, loc=0., scale=1.) 
TRUNCATED_NORMAL_STDDEV_FACTOR = 0.87962566103423978 _, fan_in = self.weight.shape @@ -132,7 +133,7 @@ def _trunc_normal_init(self, scale=1.0) -> None: def _glorot_uniform_init(self) -> None: nn.init.xavier_uniform_(self.weight, gain=1) - def _zero_init(self, use_bias=True) -> None: + def _zero_init(self, use_bias: bool = True) -> None: with torch.no_grad(): self.weight.fill_(0.0) if use_bias: @@ -144,13 +145,19 @@ def _normal_init(self) -> None: class NonLinearHead(nn.Module): - def __init__(self, input_dim, out_dim, activation_fn, hidden=None) -> None: + def __init__( + self, + input_dim: int, + out_dim: int, + activation_fn: str, + hidden: Optional[int] = None, + ) -> None: super().__init__() hidden = input_dim if not hidden else hidden self.linear1 = SimpleLinear(input_dim, hidden, activate=activation_fn) self.linear2 = SimpleLinear(hidden, out_dim) - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: x = self.linear1(x) x = self.linear2(x) return x @@ -159,7 +166,13 @@ def forward(self, x): class MaskLMHead(nn.Module): """Head for masked language modeling.""" - def __init__(self, embed_dim, output_dim, activation_fn, weight=None) -> None: + def __init__( + self, + embed_dim: int, + output_dim: int, + activation_fn: str, + weight: Optional[torch.Tensor] = None, + ) -> None: super().__init__() self.dense = SimpleLinear(embed_dim, embed_dim) self.activation_fn = ActivationFn(activation_fn) @@ -174,7 +187,12 @@ def __init__(self, embed_dim, output_dim, activation_fn, weight=None) -> None: torch.zeros(output_dim, dtype=env.GLOBAL_PT_FLOAT_PRECISION) # pylint: disable=no-explicit-dtype,no-explicit-device ) - def forward(self, features, masked_tokens: Optional[torch.Tensor] = None, **kwargs): + def forward( + self, + features: torch.Tensor, + masked_tokens: Optional[torch.Tensor] = None, + **kwargs: Any, + ) -> torch.Tensor: # Only project the masked tokens while training, # saves both memory and computation if masked_tokens is not None: @@ 
-190,7 +208,13 @@ def forward(self, features, masked_tokens: Optional[torch.Tensor] = None, **kwar class ResidualDeep(nn.Module): def __init__( - self, type_id, embedding_width, neuron, bias_atom_e, out_dim=1, resnet_dt=False + self, + type_id: int, + embedding_width: int, + neuron: list[int], + bias_atom_e: float, + out_dim: int = 1, + resnet_dt: bool = False, ) -> None: """Construct a filter on the given element as neighbor. @@ -221,7 +245,7 @@ def __init__( bias_atom_e = 0 self.final_layer = SimpleLinear(self.neuron[-1], self.out_dim, bias_atom_e) - def forward(self, inputs): + def forward(self, inputs: torch.Tensor) -> torch.Tensor: """Calculate decoded embedding for each atom. Args: @@ -244,15 +268,16 @@ def forward(self, inputs): class TypeEmbedNet(nn.Module): def __init__( self, - type_nums, - embed_dim, - bavg=0.0, - stddev=1.0, - precision="default", + type_nums: int, + embed_dim: int, + bavg: float = 0.0, + stddev: float = 1.0, + precision: str = "default", seed: Optional[Union[int, list[int]]] = None, - use_econf_tebd=False, + use_econf_tebd: bool = False, use_tebd_bias: bool = False, - type_map=None, + type_map: Optional[list[str]] = None, + trainable: bool = True, ) -> None: """Construct a type embedding net.""" super().__init__() @@ -273,10 +298,11 @@ def __init__( type_map=type_map, precision=precision, seed=seed, + trainable=trainable, ) # nn.init.normal_(self.embedding.weight[:-1], mean=bavg, std=stddev) - def forward(self, atype): + def forward(self, atype: torch.Tensor) -> torch.Tensor: """ Args: atype: Type of each input, [nframes, nloc] or [nframes, nloc, nnei]. @@ -288,7 +314,7 @@ def forward(self, atype): """ return torch.embedding(self.embedding(atype.device), atype) - def get_full_embedding(self, device: torch.device): + def get_full_embedding(self, device: torch.device) -> torch.Tensor: """ Get the type embeddings of all types. 
@@ -305,7 +331,9 @@ def get_full_embedding(self, device: torch.device): """ return self.embedding(device) - def share_params(self, base_class, shared_level, resume=False) -> None: + def share_params( + self, base_class: Any, shared_level: int, resume: bool = False + ) -> None: """ Share the parameters of self to the base_class with shared_level during multitask training. If not start from checkpoint (resume is False), @@ -322,7 +350,7 @@ def share_params(self, base_class, shared_level, resume=False) -> None: raise NotImplementedError def change_type_map( - self, type_map: list[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat: Optional[Any] = None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. @@ -407,7 +435,7 @@ def __init__( for param in self.parameters(): param.requires_grad = trainable - def forward(self, device: torch.device): + def forward(self, device: torch.device) -> torch.Tensor: """Caulate type embedding network. Returns @@ -429,7 +457,7 @@ def forward(self, device: torch.device): return embed def change_type_map( - self, type_map: list[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat: Optional[Any] = None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. @@ -491,7 +519,7 @@ def change_type_map( self.ntypes = len(type_map) @classmethod - def deserialize(cls, data: dict): + def deserialize(cls, data: dict) -> "TypeEmbedNetConsistent": """Deserialize the model. 
Parameters diff --git a/deepmd/pt/model/network/utils.py b/deepmd/pt/model/network/utils.py index 2047efec2b..7af8b7c032 100644 --- a/deepmd/pt/model/network/utils.py +++ b/deepmd/pt/model/network/utils.py @@ -57,7 +57,7 @@ def get_graph_index( a_nlist_mask: torch.Tensor, nall: int, use_loc_mapping: bool = True, -): +) -> tuple[torch.Tensor, torch.Tensor]: """ Get the index mapping for edge graph and angle graph, ready in `aggregate` or `index_select`. @@ -74,12 +74,12 @@ def get_graph_index( Returns ------- - edge_index : n_edge x 2 + edge_index : 2 x n_edge n2e_index : n_edge Broadcast indices from node(i) to edge(ij), or reduction indices from edge(ij) to node(i). n_ext2e_index : n_edge Broadcast indices from extended node(j) to edge(ij). - angle_index : n_angle x 3 + angle_index : 3 x n_angle n2a_index : n_angle Broadcast indices from extended node(j) to angle(ijk). eij2a_index : n_angle @@ -135,9 +135,7 @@ def get_graph_index( # n_angle eik2a_index = edge_index_ik[a_nlist_mask_3d] - return torch.cat( - [n2e_index.unsqueeze(-1), n_ext2e_index.unsqueeze(-1)], dim=-1 - ), torch.cat( - [n2a_index.unsqueeze(-1), eij2a_index.unsqueeze(-1), eik2a_index.unsqueeze(-1)], - dim=-1, - ) + edge_index_result = torch.stack([n2e_index, n_ext2e_index], dim=0) + angle_index_result = torch.stack([n2a_index, eij2a_index, eik2a_index], dim=0) + + return edge_index_result, angle_index_result diff --git a/deepmd/pt/model/task/denoise.py b/deepmd/pt/model/task/denoise.py index fc9e8943e9..50cae4fb12 100644 --- a/deepmd/pt/model/task/denoise.py +++ b/deepmd/pt/model/task/denoise.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( + Any, Optional, ) @@ -26,12 +27,12 @@ class DenoiseNet(Fitting): def __init__( self, - feature_dim, - ntypes, - attn_head=8, - prefactor=[0.5, 0.5], - activation_function="gelu", - **kwargs, + feature_dim: int, + ntypes: int, + attn_head: int = 8, + prefactor: list[float] = [0.5, 0.5], + activation_function: str = "gelu", + 
**kwargs: Any, ) -> None: """Construct a denoise net. @@ -71,7 +72,7 @@ def __init__( self.pair2coord_proj.append(_pair2coord_proj) self.pair2coord_proj = torch.nn.ModuleList(self.pair2coord_proj) - def output_def(self): + def output_def(self) -> FittingOutputDef: return FittingOutputDef( [ OutputVariableDef( @@ -93,13 +94,13 @@ def output_def(self): def forward( self, - pair_weights, - diff, - nlist_mask, - features, - sw, + pair_weights: torch.Tensor, + diff: torch.Tensor, + nlist_mask: torch.Tensor, + features: torch.Tensor, + sw: torch.Tensor, masked_tokens: Optional[torch.Tensor] = None, - ): + ) -> dict[str, torch.Tensor]: """Calculate the updated coord. Args: - coord: Input noisy coord with shape [nframes, nloc, 3]. diff --git a/deepmd/pt/model/task/dipole.py b/deepmd/pt/model/task/dipole.py index 65b64220ae..b6a1477f7a 100644 --- a/deepmd/pt/model/task/dipole.py +++ b/deepmd/pt/model/task/dipole.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import logging from typing import ( + Any, Callable, Optional, Union, @@ -72,6 +73,9 @@ class DipoleFittingNet(GeneralFitting): Only reducible variable are differentiable. type_map: list[str], Optional A list of strings. Give the name to each type of atoms. + default_fparam: list[float], optional + The default frame parameter. If set, when `fparam.npy` files are not included in the data system, + this value will be used as the default value for the frame parameter in the fitting net. 
""" def __init__( @@ -93,7 +97,8 @@ def __init__( r_differentiable: bool = True, c_differentiable: bool = True, type_map: Optional[list[str]] = None, - **kwargs, + default_fparam: Optional[list] = None, + **kwargs: Any, ) -> None: self.embedding_width = embedding_width self.r_differentiable = r_differentiable @@ -114,10 +119,11 @@ def __init__( seed=seed, exclude_types=exclude_types, type_map=type_map, + default_fparam=default_fparam, **kwargs, ) - def _net_out_dim(self): + def _net_out_dim(self) -> int: """Set the FittingNet output dim.""" return self.embedding_width @@ -132,7 +138,7 @@ def serialize(self) -> dict: @classmethod def deserialize(cls, data: dict) -> "GeneralFitting": data = data.copy() - check_version_compatibility(data.pop("@version", 1), 3, 1) + check_version_compatibility(data.pop("@version", 1), 4, 1) data.pop("var_name", None) return super().deserialize(data) @@ -181,7 +187,7 @@ def forward( h2: Optional[torch.Tensor] = None, fparam: Optional[torch.Tensor] = None, aparam: Optional[torch.Tensor] = None, - ): + ) -> dict[str, torch.Tensor]: nframes, nloc, _ = descriptor.shape assert gr is not None, "Must provide the rotation matrix for dipole fitting." 
# cast the input to internal precsion diff --git a/deepmd/pt/model/task/dos.py b/deepmd/pt/model/task/dos.py index 568ef81c92..afbed5f748 100644 --- a/deepmd/pt/model/task/dos.py +++ b/deepmd/pt/model/task/dos.py @@ -57,6 +57,7 @@ def __init__( exclude_types: list[int] = [], mixed_types: bool = True, type_map: Optional[list[str]] = None, + default_fparam: Optional[list] = None, ) -> None: if bias_dos is not None: self.bias_dos = bias_dos @@ -83,6 +84,7 @@ def __init__( exclude_types=exclude_types, trainable=trainable, type_map=type_map, + default_fparam=default_fparam, ) def output_def(self) -> FittingOutputDef: @@ -101,7 +103,7 @@ def output_def(self) -> FittingOutputDef: @classmethod def deserialize(cls, data: dict) -> "DOSFittingNet": data = data.copy() - check_version_compatibility(data.pop("@version", 1), 3, 1) + check_version_compatibility(data.pop("@version", 1), 4, 1) data.pop("@class", None) data.pop("var_name", None) data.pop("tot_ener_zero", None) diff --git a/deepmd/pt/model/task/ener.py b/deepmd/pt/model/task/ener.py index 07351b33f6..af288bec10 100644 --- a/deepmd/pt/model/task/ener.py +++ b/deepmd/pt/model/task/ener.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import logging from typing import ( + Any, Optional, Union, ) @@ -56,7 +57,8 @@ def __init__( mixed_types: bool = True, seed: Optional[Union[int, list[int]]] = None, type_map: Optional[list[str]] = None, - **kwargs, + default_fparam: Optional[list] = None, + **kwargs: Any, ) -> None: super().__init__( "energy", @@ -74,13 +76,14 @@ def __init__( mixed_types=mixed_types, seed=seed, type_map=type_map, + default_fparam=default_fparam, **kwargs, ) @classmethod def deserialize(cls, data: dict) -> "GeneralFitting": data = data.copy() - check_version_compatibility(data.pop("@version", 1), 3, 1) + check_version_compatibility(data.pop("@version", 1), 4, 1) data.pop("var_name") data.pop("dim_out") return super().deserialize(data) @@ -102,15 +105,15 @@ def serialize(self) -> dict: class 
EnergyFittingNetDirect(Fitting): def __init__( self, - ntypes, - dim_descrpt, - neuron, - bias_atom_e=None, - out_dim=1, - resnet_dt=True, - use_tebd=True, - return_energy=False, - **kwargs, + ntypes: int, + dim_descrpt: int, + neuron: list[int], + bias_atom_e: Optional[torch.Tensor] = None, + out_dim: int = 1, + resnet_dt: bool = True, + use_tebd: bool = True, + return_energy: bool = False, + **kwargs: Any, ) -> None: """Construct a fitting net for energy. @@ -160,7 +163,7 @@ def __init__( filter_layers.append(one) self.filter_layers = torch.nn.ModuleList(filter_layers) - def output_def(self): + def output_def(self) -> FittingOutputDef: return FittingOutputDef( [ OutputVariableDef( @@ -187,7 +190,7 @@ def deserialize(self) -> "EnergyFittingNetDirect": raise NotImplementedError def change_type_map( - self, type_map: list[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat: Optional[Any] = None ) -> None: raise NotImplementedError diff --git a/deepmd/pt/model/task/fitting.py b/deepmd/pt/model/task/fitting.py index 0865b61f52..4c8e90ef7c 100644 --- a/deepmd/pt/model/task/fitting.py +++ b/deepmd/pt/model/task/fitting.py @@ -4,6 +4,7 @@ abstractmethod, ) from typing import ( + Any, Callable, Optional, Union, @@ -50,12 +51,14 @@ class Fitting(torch.nn.Module, BaseFitting): # plugin moved to BaseFitting - def __new__(cls, *args, **kwargs): + def __new__(cls, *args: Any, **kwargs: Any) -> "Fitting": if cls is Fitting: return BaseFitting.__new__(BaseFitting, *args, **kwargs) return super().__new__(cls) - def share_params(self, base_class, shared_level, resume=False) -> None: + def share_params( + self, base_class: "Fitting", shared_level: int, resume: bool = False + ) -> None: """ Share the parameters of self to the base_class with shared_level during multitask training. If not start from checkpoint (resume is False), @@ -204,6 +207,9 @@ class GeneralFitting(Fitting): A list of strings. Give the name to each type of atoms. 
use_aparam_as_mask: bool If True, the aparam will not be used in fitting net for embedding. + default_fparam: list[float], optional + The default frame parameter. If set, when `fparam.npy` files are not included in the data system, + this value will be used as the default value for the frame parameter in the fitting net. """ def __init__( @@ -227,7 +233,8 @@ def __init__( remove_vaccum_contribution: Optional[list[bool]] = None, type_map: Optional[list[str]] = None, use_aparam_as_mask: bool = False, - **kwargs, + default_fparam: Optional[list[float]] = None, + **kwargs: Any, ) -> None: super().__init__() self.var_name = var_name @@ -238,6 +245,7 @@ def __init__( self.resnet_dt = resnet_dt self.numb_fparam = numb_fparam self.numb_aparam = numb_aparam + self.default_fparam = default_fparam self.dim_case_embd = dim_case_embd self.activation_function = activation_function self.precision = precision @@ -299,6 +307,20 @@ def __init__( else: self.case_embd = None + if self.default_fparam is not None: + if self.numb_fparam > 0: + assert len(self.default_fparam) == self.numb_fparam, ( + "default_fparam length mismatch!" 
+ ) + self.register_buffer( + "default_fparam_tensor", + torch.tensor( + np.array(self.default_fparam), dtype=self.prec, device=device + ), + ) + else: + self.default_fparam_tensor = None + in_dim = ( self.dim_descrpt + self.numb_fparam @@ -320,6 +342,7 @@ def __init__( self.precision, bias_out=True, seed=child_seed(self.seed, ii), + trainable=trainable, ) for ii in range(self.ntypes if not self.mixed_types else 1) ], @@ -328,6 +351,8 @@ def __init__( for param in self.parameters(): param.requires_grad = self.trainable + self.eval_return_middle_output = False + def reinit_exclude( self, exclude_types: list[int] = [], @@ -336,7 +361,9 @@ def reinit_exclude( self.emask = AtomExcludeMask(self.ntypes, self.exclude_types) def change_type_map( - self, type_map: list[str], model_with_new_type_stat=None + self, + type_map: list[str], + model_with_new_type_stat: Optional["GeneralFitting"] = None, ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. 
@@ -363,7 +390,7 @@ def serialize(self) -> dict: """Serialize the fitting to dict.""" return { "@class": "Fitting", - "@version": 3, + "@version": 4, "var_name": self.var_name, "ntypes": self.ntypes, "dim_descrpt": self.dim_descrpt, @@ -372,6 +399,7 @@ def serialize(self) -> dict: "numb_fparam": self.numb_fparam, "numb_aparam": self.numb_aparam, "dim_case_embd": self.dim_case_embd, + "default_fparam": self.default_fparam, "activation_function": self.activation_function, "precision": self.precision, "mixed_types": self.mixed_types, @@ -415,6 +443,10 @@ def get_dim_fparam(self) -> int: """Get the number (dimension) of frame parameters of this atomic model.""" return self.numb_fparam + def has_default_fparam(self) -> bool: + """Check if the fitting has default frame parameters.""" + return self.default_fparam is not None + def get_dim_aparam(self) -> int: """Get the number (dimension) of atomic parameters of this atomic model.""" return self.numb_aparam @@ -440,7 +472,7 @@ def get_type_map(self) -> list[str]: """Get the name to each type of atoms.""" return self.type_map - def set_case_embd(self, case_idx: int): + def set_case_embd(self, case_idx: int) -> None: """ Set the case embedding of this fitting net by the given case_idx, typically concatenated with the output of the descriptor and fed into the fitting net. 
@@ -449,7 +481,10 @@ def set_case_embd(self, case_idx: int): case_idx ] - def __setitem__(self, key, value) -> None: + def set_return_middle_output(self, return_middle_output: bool = True) -> None: + self.eval_return_middle_output = return_middle_output + + def __setitem__(self, key: str, value: torch.Tensor) -> None: if key in ["bias_atom_e"]: value = value.view([self.ntypes, self._net_out_dim()]) self.bias_atom_e = value @@ -465,10 +500,12 @@ def __setitem__(self, key, value) -> None: self.case_embd = value elif key in ["scale"]: self.scale = value + elif key in ["default_fparam_tensor"]: + self.default_fparam_tensor = value else: raise KeyError(key) - def __getitem__(self, key): + def __getitem__(self, key: str) -> torch.Tensor: if key in ["bias_atom_e"]: return self.bias_atom_e elif key in ["fparam_avg"]: @@ -483,11 +520,13 @@ def __getitem__(self, key): return self.case_embd elif key in ["scale"]: return self.scale + elif key in ["default_fparam_tensor"]: + return self.default_fparam_tensor else: raise KeyError(key) @abstractmethod - def _net_out_dim(self): + def _net_out_dim(self) -> int: """Set the FittingNet output dim.""" pass @@ -506,9 +545,16 @@ def _forward_common( h2: Optional[torch.Tensor] = None, fparam: Optional[torch.Tensor] = None, aparam: Optional[torch.Tensor] = None, - ): + ) -> dict[str, torch.Tensor]: # cast the input to internal precsion xx = descriptor.to(self.prec) + nf, nloc, nd = xx.shape + + if self.numb_fparam > 0 and fparam is None: + # use default fparam + assert self.default_fparam_tensor is not None + fparam = torch.tile(self.default_fparam_tensor.unsqueeze(0), [nf, 1]) + fparam = fparam.to(self.prec) if fparam is not None else None aparam = aparam.to(self.prec) if aparam is not None else None @@ -521,7 +567,6 @@ def _forward_common( xx_zeros = torch.zeros_like(xx) else: xx_zeros = None - nf, nloc, nd = xx.shape net_dim_out = self._net_out_dim() if nd != self.dim_descrpt: @@ -597,14 +642,37 @@ def _forward_common( dtype=self.prec, 
device=descriptor.device, ) # jit assertion + results = {} + if self.mixed_types: atom_property = self.filter_layers.networks[0](xx) + if self.eval_return_middle_output: + results["middle_output"] = self.filter_layers.networks[ + 0 + ].call_until_last(xx) if xx_zeros is not None: atom_property -= self.filter_layers.networks[0](xx_zeros) outs = ( outs + atom_property + self.bias_atom_e[atype].to(self.prec) ) # Shape is [nframes, natoms[0], net_dim_out] else: + if self.eval_return_middle_output: + outs_middle = torch.zeros( + (nf, nloc, self.neuron[-1]), + dtype=self.prec, + device=descriptor.device, + ) # jit assertion + for type_i, ll in enumerate(self.filter_layers.networks): + mask = (atype == type_i).unsqueeze(-1) + mask = torch.tile(mask, (1, 1, net_dim_out)) + middle_output_type = ll.call_until_last(xx) + middle_output_type = torch.where( + torch.tile(mask, (1, 1, self.neuron[-1])), + middle_output_type, + 0.0, + ) + outs_middle = outs_middle + middle_output_type + results["middle_output"] = outs_middle for type_i, ll in enumerate(self.filter_layers.networks): mask = (atype == type_i).unsqueeze(-1) mask = torch.tile(mask, (1, 1, net_dim_out)) @@ -626,4 +694,10 @@ def _forward_common( mask = self.emask(atype).to(torch.bool) # nf x nloc x nod outs = torch.where(mask[:, :, None], outs, 0.0) - return {self.var_name: outs} + results.update({self.var_name: outs}) + return results + + @torch.jit.export + def get_task_dim(self) -> int: + """Get the output dimension of the fitting net.""" + return self._net_out_dim() diff --git a/deepmd/pt/model/task/invar_fitting.py b/deepmd/pt/model/task/invar_fitting.py index b1599eac60..4ec3407901 100644 --- a/deepmd/pt/model/task/invar_fitting.py +++ b/deepmd/pt/model/task/invar_fitting.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import logging from typing import ( + Any, Optional, Union, ) @@ -80,6 +81,9 @@ class InvarFitting(GeneralFitting): A list of strings. Give the name to each type of atoms. 
use_aparam_as_mask: bool If True, the aparam will not be used in fitting net for embedding. + default_fparam: list[float], optional + The default frame parameter. If set, when `fparam.npy` files are not included in the data system, + this value will be used as the default value for the frame parameter in the fitting net. """ def __init__( @@ -103,7 +107,8 @@ def __init__( atom_ener: Optional[list[Optional[torch.Tensor]]] = None, type_map: Optional[list[str]] = None, use_aparam_as_mask: bool = False, - **kwargs, + default_fparam: Optional[list[float]] = None, + **kwargs: Any, ) -> None: self.dim_out = dim_out self.atom_ener = atom_ener @@ -128,10 +133,11 @@ def __init__( else [x is not None for x in atom_ener], type_map=type_map, use_aparam_as_mask=use_aparam_as_mask, + default_fparam=default_fparam, **kwargs, ) - def _net_out_dim(self): + def _net_out_dim(self) -> int: """Set the FittingNet output dim.""" return self.dim_out @@ -145,7 +151,7 @@ def serialize(self) -> dict: @classmethod def deserialize(cls, data: dict) -> "GeneralFitting": data = data.copy() - check_version_compatibility(data.pop("@version", 1), 3, 1) + check_version_compatibility(data.pop("@version", 1), 4, 1) return super().deserialize(data) def output_def(self) -> FittingOutputDef: @@ -170,7 +176,7 @@ def forward( h2: Optional[torch.Tensor] = None, fparam: Optional[torch.Tensor] = None, aparam: Optional[torch.Tensor] = None, - ): + ) -> dict[str, torch.Tensor]: """Based on embedding net output, alculate total energy. Args: @@ -181,10 +187,17 @@ def forward( ------- - `torch.Tensor`: Total energy with shape [nframes, natoms[0]]. 
""" - out = self._forward_common(descriptor, atype, gr, g2, h2, fparam, aparam)[ - self.var_name - ] - return {self.var_name: out.to(env.GLOBAL_PT_FLOAT_PRECISION)} + out = self._forward_common(descriptor, atype, gr, g2, h2, fparam, aparam) + result = {self.var_name: out[self.var_name].to(env.GLOBAL_PT_FLOAT_PRECISION)} + if "middle_output" in out: + result.update( + { + "middle_output": out["middle_output"].to( + env.GLOBAL_PT_FLOAT_PRECISION + ) + } + ) + return result # make jit happy with torch 2.0.0 exclude_types: list[int] diff --git a/deepmd/pt/model/task/polarizability.py b/deepmd/pt/model/task/polarizability.py index a326802918..bf63d9db4b 100644 --- a/deepmd/pt/model/task/polarizability.py +++ b/deepmd/pt/model/task/polarizability.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import logging from typing import ( + Any, Optional, Union, ) @@ -75,7 +76,9 @@ class PolarFittingNet(GeneralFitting): Whether to shift the diagonal part of the polarizability matrix. The shift operation is carried out after scale. type_map: list[str], Optional A list of strings. Give the name to each type of atoms. - + default_fparam: list[float], optional + The default frame parameter. If set, when `fparam.npy` files are not included in the data system, + this value will be used as the default value for the frame parameter in the fitting net. 
""" def __init__( @@ -98,7 +101,8 @@ def __init__( scale: Optional[Union[list[float], float]] = None, shift_diag: bool = True, type_map: Optional[list[str]] = None, - **kwargs, + default_fparam: Optional[list] = None, + **kwargs: Any, ) -> None: self.embedding_width = embedding_width self.fit_diag = fit_diag @@ -139,10 +143,11 @@ def __init__( seed=seed, exclude_types=exclude_types, type_map=type_map, + default_fparam=default_fparam, **kwargs, ) - def _net_out_dim(self): + def _net_out_dim(self) -> int: """Set the FittingNet output dim.""" return ( self.embedding_width @@ -150,20 +155,20 @@ def _net_out_dim(self): else self.embedding_width * self.embedding_width ) - def __setitem__(self, key, value) -> None: + def __setitem__(self, key: str, value: Any) -> None: if key in ["constant_matrix"]: self.constant_matrix = value else: super().__setitem__(key, value) - def __getitem__(self, key): + def __getitem__(self, key: str) -> Any: if key in ["constant_matrix"]: return self.constant_matrix else: return super().__getitem__(key) def change_type_map( - self, type_map: list[str], model_with_new_type_stat=None + self, type_map: list[str], model_with_new_type_stat: Optional[Any] = None ) -> None: """Change the type related params to new ones, according to `type_map` and the original one in the model. If there are new types in `type_map`, statistics will be updated accordingly to `model_with_new_type_stat` for these new types. 
@@ -195,7 +200,7 @@ def change_type_map( def serialize(self) -> dict: data = super().serialize() data["type"] = "polar" - data["@version"] = 4 + data["@version"] = 5 data["embedding_width"] = self.embedding_width data["fit_diag"] = self.fit_diag data["shift_diag"] = self.shift_diag @@ -206,7 +211,7 @@ def serialize(self) -> dict: @classmethod def deserialize(cls, data: dict) -> "GeneralFitting": data = data.copy() - check_version_compatibility(data.pop("@version", 1), 4, 1) + check_version_compatibility(data.pop("@version", 1), 5, 1) data.pop("var_name", None) return super().deserialize(data) @@ -232,7 +237,7 @@ def forward( h2: Optional[torch.Tensor] = None, fparam: Optional[torch.Tensor] = None, aparam: Optional[torch.Tensor] = None, - ): + ) -> dict[str, torch.Tensor]: nframes, nloc, _ = descriptor.shape assert gr is not None, ( "Must provide the rotation matrix for polarizability fitting." diff --git a/deepmd/pt/model/task/property.py b/deepmd/pt/model/task/property.py index 5ef0cd0233..c2440b7de3 100644 --- a/deepmd/pt/model/task/property.py +++ b/deepmd/pt/model/task/property.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import logging from typing import ( + Any, Optional, Union, ) @@ -91,7 +92,8 @@ def __init__( mixed_types: bool = True, trainable: Union[bool, list[bool]] = True, seed: Optional[int] = None, - **kwargs, + default_fparam: Optional[list] = None, + **kwargs: Any, ) -> None: self.task_dim = task_dim self.intensive = intensive @@ -111,6 +113,7 @@ def __init__( mixed_types=mixed_types, trainable=trainable, seed=seed, + default_fparam=default_fparam, **kwargs, ) @@ -135,7 +138,7 @@ def get_intensive(self) -> bool: @classmethod def deserialize(cls, data: dict) -> "PropertyFittingNet": data = data.copy() - check_version_compatibility(data.pop("@version", 1), 4, 1) + check_version_compatibility(data.pop("@version", 1), 5, 1) data.pop("dim_out") data["property_name"] = data.pop("var_name") obj = super().deserialize(data) @@ -150,7 
+153,7 @@ def serialize(self) -> dict: "task_dim": self.task_dim, "intensive": self.intensive, } - dd["@version"] = 4 + dd["@version"] = 5 return dd diff --git a/deepmd/pt/model/task/type_predict.py b/deepmd/pt/model/task/type_predict.py index e4a980c3ea..5c1b064d07 100644 --- a/deepmd/pt/model/task/type_predict.py +++ b/deepmd/pt/model/task/type_predict.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( + Any, Optional, ) @@ -15,7 +16,11 @@ class TypePredictNet(Fitting): def __init__( - self, feature_dim, ntypes, activation_function="gelu", **kwargs + self, + feature_dim: int, + ntypes: int, + activation_function: str = "gelu", + **kwargs: Any, ) -> None: """Construct a type predict net. @@ -34,7 +39,9 @@ def __init__( weight=None, ) - def forward(self, features, masked_tokens: Optional[torch.Tensor] = None): + def forward( + self, features: torch.Tensor, masked_tokens: Optional[torch.Tensor] = None + ) -> torch.Tensor: """Calculate the predicted logits. Args: - features: Input features with shape [nframes, nloc, feature_dim]. 
diff --git a/deepmd/pt/optimizer/LKF.py b/deepmd/pt/optimizer/LKF.py index c342960e5b..aeb1120bff 100644 --- a/deepmd/pt/optimizer/LKF.py +++ b/deepmd/pt/optimizer/LKF.py @@ -1,6 +1,10 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import logging import math +from typing import ( + Any, + Optional, +) import torch import torch.distributed as dist @@ -9,7 +13,7 @@ ) -def distribute_indices(total_length, num_workers): +def distribute_indices(total_length: int, num_workers: int) -> list[tuple[int, int]]: indices_per_worker = total_length // num_workers remainder = total_length % num_workers @@ -27,10 +31,10 @@ def distribute_indices(total_length, num_workers): class LKFOptimizer(Optimizer): def __init__( self, - params, - kalman_lambda=0.98, - kalman_nue=0.9987, - block_size=5120, + params: Any, + kalman_lambda: float = 0.98, + kalman_nue: float = 0.9987, + block_size: int = 5120, ) -> None: defaults = {"lr": 0.1, "kalman_nue": kalman_nue, "block_size": block_size} @@ -158,13 +162,13 @@ def __init_P(self) -> None: self._state.setdefault("weights_num", len(P)) self._state.setdefault("params_packed_index", params_packed_index) - def __get_blocksize(self): + def __get_blocksize(self) -> int: return self.param_groups[0]["block_size"] - def __get_nue(self): + def __get_nue(self) -> float: return self.param_groups[0]["kalman_nue"] - def __split_weights(self, weight): + def __split_weights(self, weight: torch.Tensor) -> list[torch.Tensor]: block_size = self.__get_blocksize() param_num = weight.nelement() res = [] @@ -179,7 +183,9 @@ def __split_weights(self, weight): res.append(weight[i * block_size :]) return res - def __update(self, H, error, weights) -> None: + def __update( + self, H: torch.Tensor, error: torch.Tensor, weights: torch.Tensor + ) -> None: P = self._state.get("P") kalman_lambda = self._state.get("kalman_lambda") weights_num = self._state.get("weights_num") @@ -253,10 +259,10 @@ def __update(self, H, error, weights) -> None: i += 1 param.data = 
tmp_weight.reshape(param.data.T.shape).T.contiguous() - def set_grad_prefactor(self, grad_prefactor) -> None: + def set_grad_prefactor(self, grad_prefactor: float) -> None: self.grad_prefactor = grad_prefactor - def step(self, error) -> None: + def step(self, error: torch.Tensor) -> None: params_packed_index = self._state.get("params_packed_index") weights = [] @@ -313,7 +319,7 @@ def step(self, error) -> None: self.__update(H, error, weights) - def get_device_id(self, index): + def get_device_id(self, index: int) -> Optional[int]: for i, (start, end) in enumerate(self.dindex): if start <= index < end: return i diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py index 7a6ff0ebde..52d2888081 100644 --- a/deepmd/pt/train/training.py +++ b/deepmd/pt/train/training.py @@ -3,6 +3,7 @@ import logging import time from collections.abc import ( + Generator, Iterable, ) from copy import ( @@ -13,6 +14,8 @@ ) from typing import ( Any, + Callable, + Optional, ) import numpy as np @@ -50,6 +53,7 @@ dp_random, ) from deepmd.pt.utils.dataloader import ( + DpLoaderSet, get_sampler_from_params, ) from deepmd.pt.utils.env import ( @@ -92,16 +96,16 @@ class Trainer: def __init__( self, config: dict[str, Any], - training_data, - stat_file_path=None, - validation_data=None, - init_model=None, - restart_model=None, - finetune_model=None, - force_load=False, - shared_links=None, - finetune_links=None, - init_frz_model=None, + training_data: DpLoaderSet, + stat_file_path: Optional[str] = None, + validation_data: Optional[DpLoaderSet] = None, + init_model: Optional[str] = None, + restart_model: Optional[str] = None, + finetune_model: Optional[str] = None, + force_load: bool = False, + shared_links: Optional[dict[str, str]] = None, + finetune_links: Optional[dict[str, str]] = None, + init_frz_model: Optional[str] = None, ) -> None: """Construct a DeePMD trainer. 
@@ -140,6 +144,7 @@ def __init__( self.num_steps = training_params["numb_steps"] self.disp_file = training_params.get("disp_file", "lcurve.out") self.disp_freq = training_params.get("disp_freq", 1000) + self.disp_avg = training_params.get("disp_avg", False) self.save_ckpt = training_params.get("save_ckpt", "model.ckpt") self.save_freq = training_params.get("save_freq", 1000) self.max_ckpt_keep = training_params.get("max_ckpt_keep", 5) @@ -150,7 +155,7 @@ def __init__( ) self.lcurve_should_print_header = True - def get_opt_param(params): + def get_opt_param(params: dict[str, Any]) -> tuple[str, dict[str, Any]]: opt_type = params.get("opt_type", "Adam") opt_param = { "kf_blocksize": params.get("kf_blocksize", 5120), @@ -162,7 +167,7 @@ def get_opt_param(params): } return opt_type, opt_param - def cycle_iterator(iterable: Iterable): + def cycle_iterator(iterable: Iterable) -> Generator[Any, None, None]: """ Produces an infinite iterator by repeatedly cycling through the given iterable. @@ -178,8 +183,20 @@ def cycle_iterator(iterable: Iterable): it = iter(iterable) yield from it - def get_data_loader(_training_data, _validation_data, _training_params): - def get_dataloader_and_iter(_data, _params): + def get_data_loader( + _training_data: DpLoaderSet, + _validation_data: Optional[DpLoaderSet], + _training_params: dict[str, Any], + ) -> tuple[ + DataLoader, + Generator[Any, None, None], + Optional[DataLoader], + Optional[Generator[Any, None, None]], + int, + ]: + def get_dataloader_and_iter( + _data: DpLoaderSet, _params: dict[str, Any] + ) -> tuple[DataLoader, Generator[Any, None, None]]: _sampler = get_sampler_from_params(_data, _params) if _sampler is None: log.warning( @@ -194,7 +211,7 @@ def get_dataloader_and_iter(_data, _params): else 0, # setting to 0 diverges the behavior of its iterator; should be >=1 drop_last=False, collate_fn=lambda batch: batch, # prevent extra conversion - pin_memory=True, + pin_memory=(DEVICE != "cpu"), # pin memory only if not on CPU ) 
_data_iter = cycle_iterator(_dataloader) return _dataloader, _data_iter @@ -226,21 +243,21 @@ def get_dataloader_and_iter(_data, _params): ) def single_model_stat( - _model, - _data_stat_nbatch, - _training_data, - _validation_data, - _stat_file_path, - _data_requirement, - finetune_has_new_type=False, - ): + _model: Any, + _data_stat_nbatch: int, + _training_data: DpLoaderSet, + _validation_data: Optional[DpLoaderSet], + _stat_file_path: Optional[str], + _data_requirement: list[DataRequirementItem], + finetune_has_new_type: bool = False, + ) -> Callable[[], Any]: _data_requirement += get_additional_data_requirement(_model) _training_data.add_data_requirement(_data_requirement) if _validation_data is not None: _validation_data.add_data_requirement(_data_requirement) @functools.lru_cache - def get_sample(): + def get_sample() -> Any: sampled = make_stat_input( _training_data.systems, _training_data.dataloaders, @@ -257,7 +274,7 @@ def get_sample(): _stat_file_path.root.close() return get_sample - def get_lr(lr_params): + def get_lr(lr_params: dict[str, Any]) -> LearningRateExp: assert lr_params.get("type", "exp") == "exp", ( "Only learning rate `exp` is supported!" ) @@ -495,11 +512,11 @@ def get_lr(lr_params): state_dict = pretrained_model_wrapper.state_dict() def collect_single_finetune_params( - _model_key, - _finetune_rule_single, - _new_state_dict, - _origin_state_dict, - _random_state_dict, + _model_key: str, + _finetune_rule_single: Any, + _new_state_dict: dict[str, Any], + _origin_state_dict: dict[str, Any], + _random_state_dict: dict[str, Any], ) -> None: _new_fitting = _finetune_rule_single.get_random_fitting() _model_key_from = _finetune_rule_single.get_model_branch() @@ -509,15 +526,31 @@ def collect_single_finetune_params( if i != "_extra_state" and f".{_model_key}." in i ] for item_key in target_keys: - if _new_fitting and (".descriptor." not in item_key): + new_key = item_key.replace( + f".{_model_key}.", f".{_model_key_from}." 
+ ) + use_random_initialization = _new_fitting and ( + ".descriptor." not in item_key + ) + if ( + not use_random_initialization + and new_key not in _origin_state_dict + ): + # for ZBL models finetuning from standard models + if ".models.0." in new_key: + new_key = new_key.replace(".models.0.", ".") + elif ".models.1." in new_key: + use_random_initialization = True + else: + raise KeyError( + f"Key {new_key} not found in pretrained model." + ) + if use_random_initialization: # print(f'Keep {item_key} in old model!') _new_state_dict[item_key] = ( _random_state_dict[item_key].clone().detach() ) else: - new_key = item_key.replace( - f".{_model_key}.", f".{_model_key_from}." - ) # print(f'Replace {item_key} with {new_key} in pretrained_model!') _new_state_dict[item_key] = ( _origin_state_dict[new_key].clone().detach() @@ -544,10 +577,10 @@ def collect_single_finetune_params( if finetune_model is not None: def single_model_finetune( - _model, - _finetune_rule_single, - _sample_func, - ): + _model: Any, + _finetune_rule_single: Any, + _sample_func: Callable, + ) -> Any: _model = model_change_out_bias( _model, _sample_func, @@ -602,7 +635,7 @@ def single_model_finetune( # TODO add lr warmups for multitask # author: iProzd - def warm_up_linear(step, warmup_steps): + def warm_up_linear(step: int, warmup_steps: int) -> float: if step < warmup_steps: return step / warmup_steps else: @@ -695,7 +728,7 @@ def run(self) -> None: ) prof.start() - def step(_step_id, task_key="Default") -> None: + def step(_step_id: int, task_key: str = "Default") -> None: if self.multi_task: model_index = dp_random.choice( np.arange(self.num_model, dtype=np.int_), @@ -769,7 +802,7 @@ def step(_step_id, task_key="Default") -> None: else self.wrapper ) - def fake_model(): + def fake_model() -> dict: return model_pred _, loss, more_loss = module.loss[task_key]( @@ -808,6 +841,33 @@ def fake_model(): else: raise ValueError(f"Not supported optimizer type '{self.opt_type}'") + if self.disp_avg: + # 
Accumulate loss for averaging over display interval + self.step_count_in_interval += 1 + if not self.multi_task: + # Accumulate loss for single task + if not self.train_loss_accu: + # Initialize accumulator with current loss structure + for item in more_loss: + if "l2_" not in item: + self.train_loss_accu[item] = 0.0 + for item in more_loss: + if "l2_" not in item: + self.train_loss_accu[item] += more_loss[item] + else: + # Accumulate loss for multi-task + if task_key not in self.train_loss_accu: + self.train_loss_accu[task_key] = {} + if task_key not in self.step_count_per_task: + self.step_count_per_task[task_key] = 0 + self.step_count_per_task[task_key] += 1 + + for item in more_loss: + if "l2_" not in item: + if item not in self.train_loss_accu[task_key]: + self.train_loss_accu[task_key][item] = 0.0 + self.train_loss_accu[task_key][item] += more_loss[item] + # Log and persist display_step_id = _step_id + 1 if self.display_in_training and ( @@ -815,18 +875,47 @@ def fake_model(): ): self.wrapper.eval() # Will set to train mode before fininshing validation - def log_loss_train(_loss, _more_loss, _task_key="Default"): - results = {} - rmse_val = { - item: _more_loss[item] - for item in _more_loss - if "l2_" not in item - } - for item in sorted(rmse_val.keys()): - results[item] = rmse_val[item] - return results + if self.disp_avg: + + def log_loss_train( + _loss: Any, _more_loss: Any, _task_key: str = "Default" + ) -> dict: + results = {} + if not self.multi_task: + # Use accumulated average loss for single task + for item in self.train_loss_accu: + results[item] = ( + self.train_loss_accu[item] + / self.step_count_in_interval + ) + else: + # Use accumulated average loss for multi-task + if ( + _task_key in self.train_loss_accu + and _task_key in self.step_count_per_task + ): + for item in self.train_loss_accu[_task_key]: + results[item] = ( + self.train_loss_accu[_task_key][item] + / self.step_count_per_task[_task_key] + ) + return results + else: - def 
log_loss_valid(_task_key="Default"): + def log_loss_train( + _loss: Any, _more_loss: Any, _task_key: str = "Default" + ) -> dict: + results = {} + rmse_val = { + item: _more_loss[item] + for item in _more_loss + if "l2_" not in item + } + for item in sorted(rmse_val.keys()): + results[item] = rmse_val[item] + return results + + def log_loss_valid(_task_key: str = "Default") -> dict: single_results = {} sum_natoms = 0 if not self.multi_task: @@ -882,24 +971,31 @@ def log_loss_valid(_task_key="Default"): else: train_results = {_key: {} for _key in self.model_keys} valid_results = {_key: {} for _key in self.model_keys} - train_results[task_key] = log_loss_train( - loss, more_loss, _task_key=task_key - ) - for _key in self.model_keys: - if _key != task_key: - self.optimizer.zero_grad() - input_dict, label_dict, _ = self.get_data( - is_train=True, task_key=_key - ) - _, loss, more_loss = self.wrapper( - **input_dict, - cur_lr=pref_lr, - label=label_dict, - task_key=_key, - ) + if self.disp_avg: + # For multi-task, use accumulated average loss for all tasks + for _key in self.model_keys: train_results[_key] = log_loss_train( loss, more_loss, _task_key=_key ) + else: + train_results[task_key] = log_loss_train( + loss, more_loss, _task_key=task_key + ) + for _key in self.model_keys: + if _key != task_key: + self.optimizer.zero_grad() + input_dict, label_dict, _ = self.get_data( + is_train=True, task_key=_key + ) + _, loss, more_loss = self.wrapper( + **input_dict, + cur_lr=pref_lr, + label=label_dict, + task_key=_key, + ) + train_results[_key] = log_loss_train( + loss, more_loss, _task_key=_key + ) valid_results[_key] = log_loss_valid(_task_key=_key) if self.rank == 0: log.info( @@ -921,12 +1017,29 @@ def log_loss_valid(_task_key="Default"): ) self.wrapper.train() + if self.disp_avg: + # Reset loss accumulators after display + if not self.multi_task: + for item in self.train_loss_accu: + self.train_loss_accu[item] = 0.0 + else: + for task_key in self.model_keys: + if 
task_key in self.train_loss_accu: + for item in self.train_loss_accu[task_key]: + self.train_loss_accu[task_key][item] = 0.0 + if task_key in self.step_count_per_task: + self.step_count_per_task[task_key] = 0 + self.step_count_in_interval = 0 + self.last_display_step = display_step_id + current_time = time.time() train_time = current_time - self.t0 self.t0 = current_time if self.rank == 0 and self.timing_in_training: eta = int( - (self.num_steps - display_step_id) / self.disp_freq * train_time + (self.num_steps - display_step_id) + / min(self.disp_freq, display_step_id - self.start_step) + * train_time ) log.info( format_training_message( @@ -935,12 +1048,19 @@ def log_loss_valid(_task_key="Default"): eta=eta, ) ) - # the first training time is not accurate if ( - (_step_id + 1 - self.start_step) > self.disp_freq - or self.num_steps - self.start_step < 2 * self.disp_freq + (self.num_steps - self.start_step) + <= 2 * self.disp_freq # not enough steps + or (_step_id - self.start_step) + >= self.disp_freq # skip first disp_freq steps ): self.total_train_time += train_time + if display_step_id == 1: + self.timed_steps += 1 + else: + self.timed_steps += min( + self.disp_freq, _step_id - self.start_step + ) if fout: if self.lcurve_should_print_header: @@ -951,11 +1071,14 @@ def log_loss_valid(_task_key="Default"): ) if ( - ((_step_id + 1) % self.save_freq == 0 and _step_id != self.start_step) - or (_step_id + 1) == self.num_steps + ( + (display_step_id) % self.save_freq == 0 + and _step_id != self.start_step + ) + or (display_step_id) == self.num_steps ) and (self.rank == 0 or dist.get_rank() == 0): # Handle the case if rank 0 aborted and re-assigned - self.latest_model = Path(self.save_ckpt + f"-{_step_id + 1}.pt") + self.latest_model = Path(self.save_ckpt + f"-{display_step_id}.pt") module = ( self.wrapper.module @@ -982,6 +1105,18 @@ def log_loss_valid(_task_key="Default"): self.wrapper.train() self.t0 = time.time() self.total_train_time = 0.0 + self.timed_steps = 0 + 
+ if self.disp_avg: + # Initialize loss accumulators + if not self.multi_task: + self.train_loss_accu = {} + else: + self.train_loss_accu = {key: {} for key in self.model_keys} + self.step_count_per_task = dict.fromkeys(self.model_keys, 0) + self.step_count_in_interval = 0 + self.last_display_step = 0 + for step_id in range(self.start_step, self.num_steps): step(step_id) if JIT: @@ -1021,24 +1156,12 @@ def log_loss_valid(_task_key="Default"): with open("checkpoint", "w") as f: f.write(str(self.latest_model)) - elapsed_batch = self.num_steps - self.start_step - if self.timing_in_training and elapsed_batch // self.disp_freq > 0: - if self.start_step >= 2 * self.disp_freq: - log.info( - "average training time: %.4f s/batch (exclude first %d batches)", - self.total_train_time - / ( - elapsed_batch // self.disp_freq * self.disp_freq - - self.disp_freq - ), - self.disp_freq, - ) - else: - log.info( - "average training time: %.4f s/batch", - self.total_train_time - / (elapsed_batch // self.disp_freq * self.disp_freq), - ) + if self.timing_in_training and self.timed_steps: + msg = f"average training time: {self.total_train_time / self.timed_steps:.4f} s/batch" + excluded_steps = self.num_steps - self.start_step - self.timed_steps + if excluded_steps > 0: + msg += f" ({excluded_steps} batches excluded)" + log.info(msg) if JIT: pth_model_path = ( @@ -1062,13 +1185,13 @@ def log_loss_valid(_task_key="Default"): log.info( f"The profiling trace has been saved under {self.tensorboard_log_dir}" ) - if self.profiling: + if not self.enable_profiler and self.profiling: prof.export_chrome_trace(self.profiling_file) log.info( f"The profiling trace has been saved to: {self.profiling_file}" ) - def save_model(self, save_path, lr=0.0, step=0) -> None: + def save_model(self, save_path: str, lr: float = 0.0, step: int = 0) -> None: module = ( self.wrapper.module if dist.is_available() and dist.is_initialized() @@ -1093,7 +1216,9 @@ def save_model(self, save_path, lr=0.0, step=0) -> None: 
checkpoint_files.sort(key=lambda x: x.stat().st_mtime) checkpoint_files[0].unlink() - def get_data(self, is_train=True, task_key="Default"): + def get_data( + self, is_train: bool = True, task_key: str = "Default" + ) -> tuple[dict[str, Any], dict[str, Any], dict[str, Any]]: if is_train: iterator = self.training_data else: @@ -1127,7 +1252,8 @@ def get_data(self, is_train=True, task_key="Default"): label_dict = {} for item_key in batch_data: if item_key in input_keys: - input_dict[item_key] = batch_data[item_key] + if item_key != "fparam" or batch_data["find_fparam"] != 0.0: + input_dict[item_key] = batch_data[item_key] else: if item_key not in ["sid", "fid"]: label_dict[item_key] = batch_data[item_key] @@ -1137,7 +1263,9 @@ def get_data(self, is_train=True, task_key="Default"): log_dict["sid"] = batch_data["sid"] return input_dict, label_dict, log_dict - def print_header(self, fout, train_results, valid_results) -> None: + def print_header( + self, fout: Any, train_results: dict[str, Any], valid_results: dict[str, Any] + ) -> None: train_keys = sorted(train_results.keys()) print_str = "" print_str += "# {:5s}".format("step") @@ -1169,7 +1297,12 @@ def print_header(self, fout, train_results, valid_results) -> None: fout.flush() def print_on_training( - self, fout, step_id, cur_lr, train_results, valid_results + self, + fout: Any, + step_id: int, + cur_lr: float, + train_results: dict, + valid_results: dict, ) -> None: train_keys = sorted(train_results.keys()) print_str = "" @@ -1201,12 +1334,15 @@ def print_on_training( fout.flush() -def get_additional_data_requirement(_model): +def get_additional_data_requirement(_model: Any) -> list[DataRequirementItem]: additional_data_requirement = [] if _model.get_dim_fparam() > 0: fparam_requirement_items = [ DataRequirementItem( - "fparam", _model.get_dim_fparam(), atomic=False, must=True + "fparam", + _model.get_dim_fparam(), + atomic=False, + must=not _model.has_default_fparam(), ) ] additional_data_requirement += 
fparam_requirement_items @@ -1228,12 +1364,14 @@ def get_additional_data_requirement(_model): return additional_data_requirement -def whether_hessian(loss_params): +def whether_hessian(loss_params: dict[str, Any]) -> bool: loss_type = loss_params.get("type", "ener") return loss_type == "ener" and loss_params.get("start_pref_h", 0.0) > 0.0 -def get_loss(loss_params, start_lr, _ntypes, _model): +def get_loss( + loss_params: dict[str, Any], start_lr: float, _ntypes: int, _model: Any +) -> TaskLoss: loss_type = loss_params.get("type", "ener") if whether_hessian(loss_params): loss_params["starter_learning_rate"] = start_lr @@ -1276,8 +1414,8 @@ def get_loss(loss_params, start_lr, _ntypes, _model): def get_single_model( - _model_params, -): + _model_params: dict[str, Any], +) -> Any: if "use_srtab" in _model_params: model = get_zbl_model(deepcopy(_model_params)).to(DEVICE) else: @@ -1286,10 +1424,10 @@ def get_single_model( def get_model_for_wrapper( - _model_params, - resuming=False, - _loss_params=None, -): + _model_params: dict[str, Any], + resuming: bool = False, + _loss_params: Optional[dict[str, Any]] = None, +) -> Any: if "model_dict" not in _model_params: if _loss_params is not None and whether_hessian(_loss_params): _model_params["hessian_mode"] = True @@ -1312,7 +1450,7 @@ def get_model_for_wrapper( return _model -def get_case_embd_config(_model_params): +def get_case_embd_config(_model_params: dict[str, Any]) -> tuple[bool, dict[str, int]]: assert "model_dict" in _model_params, ( "Only support setting case embedding for multi-task model!" 
) @@ -1337,10 +1475,10 @@ def get_case_embd_config(_model_params): def model_change_out_bias( - _model, - _sample_func, - _bias_adjust_mode="change-by-statistic", -): + _model: Any, + _sample_func: Callable[[], Any], + _bias_adjust_mode: str = "change-by-statistic", +) -> Any: old_bias = deepcopy(_model.get_out_bias()) _model.change_out_bias( _sample_func, diff --git a/deepmd/pt/train/wrapper.py b/deepmd/pt/train/wrapper.py index 9a2cbff295..392f928b0d 100644 --- a/deepmd/pt/train/wrapper.py +++ b/deepmd/pt/train/wrapper.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import logging from typing import ( + Any, Optional, Union, ) @@ -19,8 +20,8 @@ def __init__( self, model: Union[torch.nn.Module, dict], loss: Union[torch.nn.Module, dict] = None, - model_params=None, - shared_links=None, + model_params: Optional[dict[str, Any]] = None, + shared_links: Optional[dict[str, Any]] = None, ) -> None: """Construct a DeePMD model wrapper. @@ -59,7 +60,7 @@ def __init__( self.loss[task_key] = loss[task_key] self.inference_only = self.loss is None - def share_params(self, shared_links, resume=False) -> None: + def share_params(self, shared_links: dict[str, Any], resume: bool = False) -> None: """ Share the parameters of classes following rules defined in shared_links during multitask training. 
If not start from checkpoint (resume is False), @@ -138,18 +139,18 @@ def share_params(self, shared_links, resume=False) -> None: def forward( self, - coord, - atype, + coord: torch.Tensor, + atype: torch.Tensor, spin: Optional[torch.Tensor] = None, box: Optional[torch.Tensor] = None, cur_lr: Optional[torch.Tensor] = None, label: Optional[torch.Tensor] = None, task_key: Optional[torch.Tensor] = None, - inference_only=False, - do_atomic_virial=False, + inference_only: bool = False, + do_atomic_virial: bool = False, fparam: Optional[torch.Tensor] = None, aparam: Optional[torch.Tensor] = None, - ): + ) -> tuple[Any, Any, Any]: if not self.multi_task: task_key = "Default" else: diff --git a/deepmd/pt/utils/dataloader.py b/deepmd/pt/utils/dataloader.py index bc771b41d4..c434341ab9 100644 --- a/deepmd/pt/utils/dataloader.py +++ b/deepmd/pt/utils/dataloader.py @@ -4,6 +4,11 @@ from multiprocessing.dummy import ( Pool, ) +from typing import ( + Any, + Optional, + Union, +) import h5py import numpy as np @@ -45,7 +50,7 @@ torch.multiprocessing.set_sharing_strategy("file_system") -def setup_seed(seed) -> None: +def setup_seed(seed: Union[int, list[int], tuple[int, ...]]) -> None: if isinstance(seed, (list, tuple)): mixed_seed = mix_entropy(seed) else: @@ -75,11 +80,11 @@ class DpLoaderSet(Dataset): def __init__( self, - systems, - batch_size, - type_map, - seed=None, - shuffle=True, + systems: Union[str, list[str]], + batch_size: int, + type_map: Optional[list[str]], + seed: Optional[int] = None, + shuffle: bool = True, ) -> None: if seed is not None: setup_seed(seed) @@ -87,7 +92,7 @@ def __init__( with h5py.File(systems) as file: systems = [os.path.join(systems, item) for item in file.keys()] - def construct_dataset(system): + def construct_dataset(system: str) -> DeepmdDataSetForLoader: return DeepmdDataSetForLoader( system=system, type_map=type_map, @@ -180,7 +185,7 @@ def construct_dataset(system): for item in self.dataloaders: self.iters.append(iter(item)) - def 
set_noise(self, noise_settings) -> None: + def set_noise(self, noise_settings: dict[str, Any]) -> None: # noise_settings['noise_type'] # "trunc_normal", "normal", "uniform" # noise_settings['noise'] # float, default 1.0 # noise_settings['noise_mode'] # "prob", "fix_num" @@ -193,7 +198,7 @@ def set_noise(self, noise_settings) -> None: def __len__(self) -> int: return len(self.dataloaders) - def __getitem__(self, idx): + def __getitem__(self, idx: int) -> dict[str, torch.Tensor]: # log.warning(str(torch.distributed.get_rank())+" idx: "+str(idx)+" index: "+str(self.index[idx])) with torch.device("cpu"): try: @@ -231,7 +236,7 @@ def print_summary( ) -def collate_batch(batch): +def collate_batch(batch: list[dict[str, Any]]) -> dict[str, Any]: example = batch[0] result = {} for key in example.keys(): @@ -251,7 +256,9 @@ def collate_batch(batch): return result -def get_weighted_sampler(training_data, prob_style, sys_prob=False): +def get_weighted_sampler( + training_data: Any, prob_style: str, sys_prob: bool = False +) -> WeightedRandomSampler: if sys_prob is False: if prob_style == "prob_uniform": prob_v = 1.0 / float(training_data.__len__()) @@ -276,7 +283,7 @@ def get_weighted_sampler(training_data, prob_style, sys_prob=False): return sampler -def get_sampler_from_params(_data, _params): +def get_sampler_from_params(_data: Any, _params: dict[str, Any]) -> Any: if ( "sys_probs" in _params and _params["sys_probs"] is not None ): # use sys_probs first diff --git a/deepmd/pt/utils/dataset.py b/deepmd/pt/utils/dataset.py index 3043839308..2cbe47cc3e 100644 --- a/deepmd/pt/utils/dataset.py +++ b/deepmd/pt/utils/dataset.py @@ -2,6 +2,7 @@ from typing import ( + Any, Optional, ) @@ -34,7 +35,7 @@ def __init__(self, system: str, type_map: Optional[list[str]] = None) -> None: def __len__(self) -> int: return self._data_system.nframes - def __getitem__(self, index): + def __getitem__(self, index: int) -> dict[str, Any]: """Get a frame from the selected system.""" b_data = 
self._data_system.get_item_torch(index) b_data["natoms"] = self._natoms_vec diff --git a/deepmd/pt/utils/env_mat_stat.py b/deepmd/pt/utils/env_mat_stat.py index 23e8627bcd..1f89c09621 100644 --- a/deepmd/pt/utils/env_mat_stat.py +++ b/deepmd/pt/utils/env_mat_stat.py @@ -200,7 +200,7 @@ def get_hash(self) -> str: } ) - def __call__(self): + def __call__(self) -> tuple[np.ndarray, np.ndarray]: avgs = self.get_avg() stds = self.get_std() diff --git a/deepmd/pt/utils/exclude_mask.py b/deepmd/pt/utils/exclude_mask.py index 0a99c0777f..cf39220f1b 100644 --- a/deepmd/pt/utils/exclude_mask.py +++ b/deepmd/pt/utils/exclude_mask.py @@ -32,10 +32,10 @@ def reinit( ) self.type_mask = to_torch_tensor(self.type_mask).view([-1]) - def get_exclude_types(self): + def get_exclude_types(self) -> list[int]: return self.exclude_types - def get_type_mask(self): + def get_type_mask(self) -> torch.Tensor: return self.type_mask def forward( @@ -98,7 +98,7 @@ def reinit( self.type_mask = to_torch_tensor(self.type_mask).view([-1]) self.no_exclusion = len(self._exclude_types) == 0 - def get_exclude_types(self): + def get_exclude_types(self) -> set[tuple[int, int]]: return self._exclude_types # may have a better place for this method... 
diff --git a/deepmd/pt/utils/finetune.py b/deepmd/pt/utils/finetune.py index 96a420bf6a..0e86c9aa6c 100644 --- a/deepmd/pt/utils/finetune.py +++ b/deepmd/pt/utils/finetune.py @@ -3,6 +3,9 @@ from copy import ( deepcopy, ) +from typing import ( + Any, +) import torch @@ -12,18 +15,21 @@ from deepmd.utils.finetune import ( FinetuneRuleItem, ) +from deepmd.utils.model_branch_dict import ( + get_model_dict, +) log = logging.getLogger(__name__) def get_finetune_rule_single( - _single_param_target, - _model_param_pretrained, - from_multitask=False, - model_branch="Default", - model_branch_from="", - change_model_params=False, -): + _single_param_target: dict[str, Any], + _model_param_pretrained: dict[str, Any], + from_multitask: bool = False, + model_branch: str = "Default", + model_branch_from: str = "", + change_model_params: bool = False, +) -> tuple[dict[str, Any], FinetuneRuleItem]: single_config = deepcopy(_single_param_target) new_fitting = False model_branch_chosen = "Default" @@ -44,10 +50,13 @@ def get_finetune_rule_single( ) else: model_branch_chosen = model_branch_from - assert model_branch_chosen in model_dict_params, ( - f"No model branch named '{model_branch_chosen}'! " + model_alias_dict, model_branch_dict = get_model_dict(model_dict_params) + assert model_branch_chosen in model_alias_dict, ( + f"No model branch or alias named '{model_branch_chosen}'! " f"Available ones are {list(model_dict_params.keys())}." + f"Use `dp --pt show your_model.pt model-branch` to show detail information." 
) + model_branch_chosen = model_alias_dict[model_branch_chosen] single_config_chosen = deepcopy(model_dict_params[model_branch_chosen]) old_type_map, new_type_map = ( single_config_chosen["type_map"], @@ -80,8 +89,11 @@ def get_finetune_rule_single( def get_finetune_rules( - finetune_model, model_config, model_branch="", change_model_params=True -): + finetune_model: str, + model_config: dict[str, Any], + model_branch: str = "", + change_model_params: bool = True, +) -> tuple[dict[str, Any], dict[str, FinetuneRuleItem]]: """ Get fine-tuning rules and (optionally) change the model_params according to the pretrained one. diff --git a/deepmd/pt/utils/multi_task.py b/deepmd/pt/utils/multi_task.py index 6c397400bf..87b020c17b 100644 --- a/deepmd/pt/utils/multi_task.py +++ b/deepmd/pt/utils/multi_task.py @@ -2,6 +2,10 @@ from copy import ( deepcopy, ) +from typing import ( + Any, + Optional, +) from deepmd.pt.model.descriptor import ( BaseDescriptor, @@ -11,7 +15,9 @@ ) -def preprocess_shared_params(model_config): +def preprocess_shared_params( + model_config: dict[str, Any], +) -> tuple[dict[str, Any], dict[str, Any]]: """Preprocess the model params for multitask model, and generate the links dict for further sharing. 
Args: @@ -97,7 +103,11 @@ def preprocess_shared_params(model_config): type_map_keys = [] def replace_one_item( - params_dict, key_type, key_in_dict, suffix="", index=None + params_dict: dict[str, Any], + key_type: str, + key_in_dict: str, + suffix: str = "", + index: Optional[int] = None, ) -> None: shared_type = key_type shared_key = key_in_dict @@ -155,7 +165,7 @@ def replace_one_item( return model_config, shared_links -def get_class_name(item_key, item_params): +def get_class_name(item_key: str, item_params: dict[str, Any]) -> type: if item_key == "descriptor": return BaseDescriptor.get_class_by_type(item_params.get("type", "se_e2_a")) elif item_key == "fitting_net": diff --git a/deepmd/pt/utils/neighbor_stat.py b/deepmd/pt/utils/neighbor_stat.py index 64ad695827..b0e9eca141 100644 --- a/deepmd/pt/utils/neighbor_stat.py +++ b/deepmd/pt/utils/neighbor_stat.py @@ -171,7 +171,7 @@ def _execute( coord: np.ndarray, atype: np.ndarray, cell: Optional[np.ndarray], - ): + ) -> tuple[np.ndarray, np.ndarray]: """Execute the operation. Parameters diff --git a/deepmd/pt/utils/nlist.py b/deepmd/pt/utils/nlist.py index af84151829..8023645f8c 100644 --- a/deepmd/pt/utils/nlist.py +++ b/deepmd/pt/utils/nlist.py @@ -16,13 +16,13 @@ def extend_input_and_build_neighbor_list( - coord, - atype, + coord: torch.Tensor, + atype: torch.Tensor, rcut: float, sel: list[int], mixed_types: bool = False, box: Optional[torch.Tensor] = None, -): +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: nframes, nloc = atype.shape[:2] if box is not None: box_gpu = box.to(coord.device, non_blocking=True) @@ -292,7 +292,7 @@ def nlist_distinguish_types( nlist: torch.Tensor, atype: torch.Tensor, sel: list[int], -): +) -> torch.Tensor: """Given a nlist that does not distinguish atom types, return a nlist that distinguish atom types. 
@@ -414,7 +414,7 @@ def extend_coord_with_ghosts( cell: Optional[torch.Tensor], rcut: float, cell_cpu: Optional[torch.Tensor] = None, -): +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: """Extend the coordinates of the atoms by appending peridoc images. The number of images is large enough to ensure all the neighbors within rcut are appended. diff --git a/deepmd/pt/utils/preprocess.py b/deepmd/pt/utils/preprocess.py index 7161bac692..0cc31b5d7a 100644 --- a/deepmd/pt/utils/preprocess.py +++ b/deepmd/pt/utils/preprocess.py @@ -6,7 +6,9 @@ log = logging.getLogger(__name__) -def compute_smooth_weight(distance, rmin: float, rmax: float): +def compute_smooth_weight( + distance: torch.Tensor, rmin: float, rmax: float +) -> torch.Tensor: """Compute smooth weight for descriptor elements.""" if rmin >= rmax: raise ValueError("rmin should be less than rmax.") @@ -17,7 +19,7 @@ def compute_smooth_weight(distance, rmin: float, rmax: float): return vv -def compute_exp_sw(distance, rmin: float, rmax: float): +def compute_exp_sw(distance: torch.Tensor, rmin: float, rmax: float) -> torch.Tensor: """Compute the exponential switch function for neighbor update.""" if rmin >= rmax: raise ValueError("rmin should be less than rmax.") diff --git a/deepmd/pt/utils/region.py b/deepmd/pt/utils/region.py index 3272434995..21af694c2c 100644 --- a/deepmd/pt/utils/region.py +++ b/deepmd/pt/utils/region.py @@ -68,7 +68,7 @@ def to_face_distance( return dist.view(list(cshape[:-2]) + [3]) # noqa:RUF005 -def b_to_face_distance(cell): +def b_to_face_distance(cell: torch.Tensor) -> torch.Tensor: volume = torch.linalg.det(cell) c_yz = torch.cross(cell[:, 1], cell[:, 2], dim=-1) _h2yz = volume / torch.linalg.norm(c_yz, dim=-1) diff --git a/deepmd/pt/utils/spin.py b/deepmd/pt/utils/spin.py index 285dcaf93e..74ddb5ca13 100644 --- a/deepmd/pt/utils/spin.py +++ b/deepmd/pt/utils/spin.py @@ -4,10 +4,10 @@ def concat_switch_virtual( - extended_tensor, - extended_tensor_virtual, + extended_tensor: 
torch.Tensor, + extended_tensor_virtual: torch.Tensor, nloc: int, -): +) -> torch.Tensor: """ Concat real and virtual extended tensors, and switch all the local ones to the first nloc * 2 atoms. - [:, :nloc]: original nloc real atoms. diff --git a/deepmd/pt/utils/stat.py b/deepmd/pt/utils/stat.py index cf6892b49d..7312d95a06 100644 --- a/deepmd/pt/utils/stat.py +++ b/deepmd/pt/utils/stat.py @@ -4,6 +4,7 @@ defaultdict, ) from typing import ( + Any, Callable, Optional, Union, @@ -35,7 +36,9 @@ log = logging.getLogger(__name__) -def make_stat_input(datasets, dataloaders, nbatches): +def make_stat_input( + datasets: list[Any], dataloaders: list[Any], nbatches: int +) -> dict[str, Any]: """Pack data for statistics. Args: @@ -59,6 +62,14 @@ def make_stat_input(datasets, dataloaders, nbatches): except StopIteration: iterator = iter(dataloaders[i]) stat_data = next(iterator) + if ( + "find_fparam" in stat_data + and "fparam" in stat_data + and stat_data["find_fparam"] == 0.0 + ): + # for model using default fparam + stat_data.pop("fparam") + stat_data.pop("find_fparam") for dd in stat_data: if stat_data[dd] is None: sys_stat[dd] = None @@ -127,9 +138,9 @@ def _save_to_file( def _post_process_stat( - out_bias, - out_std, -): + out_bias: torch.Tensor, + out_std: torch.Tensor, +) -> tuple[torch.Tensor, torch.Tensor]: """Post process the statistics. 
For global statistics, we do not have the std for each type of atoms, @@ -151,7 +162,7 @@ def _compute_model_predict( sampled: Union[Callable[[], list[dict]], list[dict]], keys: list[str], model_forward: Callable[..., torch.Tensor], -): +) -> dict[str, list[torch.Tensor]]: auto_batch_size = AutoBatchSize() model_predict = {kk: [] for kk in keys} for system in sampled: @@ -165,7 +176,7 @@ def _compute_model_predict( fparam = system.get("fparam", None) aparam = system.get("aparam", None) - def model_forward_auto_batch_size(*args, **kwargs): + def model_forward_auto_batch_size(*args: Any, **kwargs: Any) -> Any: return auto_batch_size.execute_all( model_forward, nframes, @@ -214,7 +225,7 @@ def _make_preset_out_bias( def _fill_stat_with_global( atomic_stat: Union[np.ndarray, None], global_stat: np.ndarray, -): +) -> Union[np.ndarray, None]: """This function is used to fill atomic stat with global stat. Parameters @@ -247,7 +258,7 @@ def compute_output_stats( model_forward: Optional[Callable[..., torch.Tensor]] = None, stats_distinguish_types: bool = True, intensive: bool = False, -): +) -> dict[str, Any]: """ Compute the output statistics (e.g. energy bias) for the fitting net from packed data. 
@@ -414,7 +425,7 @@ def compute_output_stats_global( model_pred: Optional[dict[str, np.ndarray]] = None, stats_distinguish_types: bool = True, intensive: bool = False, -): +) -> tuple[dict[str, np.ndarray], dict[str, np.ndarray]]: """This function only handle stat computation from reduced global labels.""" # return directly if model predict is empty for global if model_pred == {}: @@ -522,7 +533,7 @@ def compute_output_stats_global( } atom_numbs = {kk: merged_natoms[kk].sum(-1) for kk in bias_atom_e.keys()} - def rmse(x): + def rmse(x: np.ndarray) -> float: return np.sqrt(np.mean(np.square(x))) for kk in bias_atom_e.keys(): @@ -541,7 +552,7 @@ def compute_output_stats_atomic( ntypes: int, keys: list[str], model_pred: Optional[dict[str, np.ndarray]] = None, -): +) -> tuple[dict[str, np.ndarray], dict[str, np.ndarray]]: # get label dict from sample; for each key, only picking the system with atomic labels. outputs = { kk: [ diff --git a/deepmd/pt/utils/tabulate.py b/deepmd/pt/utils/tabulate.py index db743ff98c..b155a897da 100644 --- a/deepmd/pt/utils/tabulate.py +++ b/deepmd/pt/utils/tabulate.py @@ -3,6 +3,9 @@ from functools import ( cached_property, ) +from typing import ( + Any, +) import numpy as np import torch @@ -48,7 +51,7 @@ class DPTabulate(BaseTabulate): def __init__( self, - descrpt, + descrpt: Any, neuron: list[int], type_one_side: bool = False, exclude_types: list[list[int]] = [], @@ -113,7 +116,7 @@ def __init__( self.data_type = self._get_data_type() self.last_layer_size = self._get_last_layer_size() - def _make_data(self, xx, idx): + def _make_data(self, xx: np.ndarray, idx: int) -> Any: """Generate tabulation data for the given input. 
Parameters @@ -282,12 +285,12 @@ def _make_data(self, xx, idx): d2 = dy2.detach().cpu().numpy().astype(self.data_type) return vv, dd, d2 - def _layer_0(self, x, w, b): + def _layer_0(self, x: torch.Tensor, w: np.ndarray, b: np.ndarray) -> torch.Tensor: w = torch.from_numpy(w).to(env.DEVICE) b = torch.from_numpy(b).to(env.DEVICE) return self.activation_fn(torch.matmul(x, w) + b) - def _layer_1(self, x, w, b): + def _layer_1(self, x: torch.Tensor, w: np.ndarray, b: np.ndarray) -> torch.Tensor: w = torch.from_numpy(w).to(env.DEVICE) b = torch.from_numpy(b).to(env.DEVICE) t = torch.cat([x, x], dim=1) @@ -310,7 +313,7 @@ def _get_descrpt_type(self) -> str: return "T" raise RuntimeError(f"Unsupported descriptor {self.descrpt}") - def _get_layer_size(self): + def _get_layer_size(self) -> int: # get the number of layers in EmbeddingNet layer_size = 0 basic_size = 0 @@ -417,10 +420,10 @@ def _get_network_variable(self, var_name: str) -> dict: raise RuntimeError("Unsupported descriptor") return result - def _get_bias(self): + def _get_bias(self) -> Any: return self._get_network_variable("b") - def _get_matrix(self): + def _get_matrix(self) -> Any: return self._get_network_variable("w") def _convert_numpy_to_tensor(self) -> None: @@ -435,7 +438,7 @@ def _n_all_excluded(self) -> int: # customized op -def grad(xbar: torch.Tensor, y: torch.Tensor, functype: int): +def grad(xbar: torch.Tensor, y: torch.Tensor, functype: int) -> torch.Tensor: if functype == 1: return 1 - y * y @@ -465,7 +468,7 @@ def grad(xbar: torch.Tensor, y: torch.Tensor, functype: int): raise ValueError(f"Unsupported function type: {functype}") -def grad_grad(xbar: torch.Tensor, y: torch.Tensor, functype: int): +def grad_grad(xbar: torch.Tensor, y: torch.Tensor, functype: int) -> torch.Tensor: if functype == 1: return -2 * y * (1 - y * y) @@ -494,7 +497,7 @@ def grad_grad(xbar: torch.Tensor, y: torch.Tensor, functype: int): def unaggregated_dy_dx_s( y: torch.Tensor, w_np: np.ndarray, xbar: torch.Tensor, 
functype: int -): +) -> torch.Tensor: w = torch.from_numpy(w_np).to(env.DEVICE) y = y.to(env.DEVICE) xbar = xbar.to(env.DEVICE) @@ -520,7 +523,7 @@ def unaggregated_dy2_dx_s( w_np: np.ndarray, xbar: torch.Tensor, functype: int, -): +) -> torch.Tensor: w = torch.from_numpy(w_np).to(env.DEVICE) y = y.to(env.DEVICE) dy = dy.to(env.DEVICE) @@ -549,7 +552,7 @@ def unaggregated_dy_dx( dy_dx: torch.Tensor, ybar: torch.Tensor, functype: int, -): +) -> torch.Tensor: w = torch.from_numpy(w_np).to(env.DEVICE) if z.dim() != 2: raise ValueError("z tensor must have 2 dimensions") @@ -587,7 +590,7 @@ def unaggregated_dy2_dx( dy2_dx: torch.Tensor, ybar: torch.Tensor, functype: int, -): +) -> torch.Tensor: w = torch.from_numpy(w_np).to(env.DEVICE) if z.dim() != 2: raise ValueError("z tensor must have 2 dimensions") diff --git a/deepmd/pt/utils/utils.py b/deepmd/pt/utils/utils.py index 054dc3c80b..d06e2c1640 100644 --- a/deepmd/pt/utils/utils.py +++ b/deepmd/pt/utils/utils.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: LGPL-3.0-or-later from typing import ( + Any, Optional, Union, overload, @@ -69,7 +70,7 @@ def silut_double_backward( class SiLUTScript(torch.nn.Module): - def __init__(self, threshold: float = 3.0): + def __init__(self, threshold: float = 3.0) -> None: super().__init__() self.threshold = threshold @@ -81,14 +82,20 @@ def __init__(self, threshold: float = 3.0): self.const_val = float(threshold * sigmoid_threshold) self.get_script_code() - def get_script_code(self): + def get_script_code(self) -> None: silut_forward_script = torch.jit.script(silut_forward) silut_backward_script = torch.jit.script(silut_backward) silut_double_backward_script = torch.jit.script(silut_double_backward) class SiLUTFunction(torch.autograd.Function): @staticmethod - def forward(ctx, x, threshold, slope, const_val): + def forward( + ctx: Any, + x: torch.Tensor, + threshold: float, + slope: float, + const_val: float, + ) -> torch.Tensor: ctx.save_for_backward(x) ctx.threshold = threshold 
ctx.slope = slope @@ -96,7 +103,9 @@ def forward(ctx, x, threshold, slope, const_val): return silut_forward_script(x, threshold, slope, const_val) @staticmethod - def backward(ctx, grad_output): + def backward( + ctx: Any, grad_output: torch.Tensor + ) -> tuple[torch.Tensor, None, None, None]: (x,) = ctx.saved_tensors threshold = ctx.threshold slope = ctx.slope @@ -106,7 +115,13 @@ def backward(ctx, grad_output): class SiLUTGradFunction(torch.autograd.Function): @staticmethod - def forward(ctx, x, grad_output, threshold, slope): + def forward( + ctx: Any, + x: torch.Tensor, + grad_output: torch.Tensor, + threshold: float, + slope: float, + ) -> torch.Tensor: ctx.threshold = threshold ctx.slope = slope grad_input = silut_backward_script(x, grad_output, threshold, slope) @@ -114,7 +129,9 @@ def forward(ctx, x, grad_output, threshold, slope): return grad_input @staticmethod - def backward(ctx, grad_grad_output): + def backward( + ctx: Any, grad_grad_output: torch.Tensor + ) -> tuple[torch.Tensor, torch.Tensor]: (x, grad_output) = ctx.saved_tensors threshold = ctx.threshold slope = ctx.slope @@ -126,21 +143,21 @@ def backward(ctx, grad_grad_output): self.SiLUTFunction = SiLUTFunction - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: return self.SiLUTFunction.apply(x, self.threshold, self.slope, self.const_val) class SiLUT(torch.nn.Module): - def __init__(self, threshold=3.0): + def __init__(self, threshold: float = 3.0) -> None: super().__init__() - def sigmoid(x): + def sigmoid(x: float) -> float: return 1 / (1 + np.exp(-x)) - def silu(x): + def silu(x: float) -> float: return x * sigmoid(x) - def silu_grad(x): + def silu_grad(x: float) -> float: sig = sigmoid(x) return sig + x * sig * (1 - sig) @@ -212,8 +229,8 @@ def to_numpy_array(xx: None) -> None: ... 
def to_numpy_array( - xx, -): + xx: Optional[torch.Tensor], +) -> Optional[np.ndarray]: if xx is None: return None assert xx is not None @@ -239,8 +256,8 @@ def to_torch_tensor(xx: None) -> None: ... def to_torch_tensor( - xx, -): + xx: Optional[np.ndarray], +) -> Optional[torch.Tensor]: if xx is None: return None assert xx is not None @@ -259,7 +276,7 @@ def to_torch_tensor( return torch.tensor(xx, dtype=prec, device=DEVICE) -def dict_to_device(sample_dict) -> None: +def dict_to_device(sample_dict: dict[str, Any]) -> None: for key in sample_dict: if isinstance(sample_dict[key], list): sample_dict[key] = [item.to(DEVICE) for item in sample_dict[key]] @@ -280,7 +297,7 @@ def dict_to_device(sample_dict) -> None: XSHIFT = 16 -def hashmix(value: int, hash_const: list[int]): +def hashmix(value: int, hash_const: list[int]) -> int: value ^= INIT_A hash_const[0] *= MULT_A value *= INIT_A @@ -291,7 +308,7 @@ def hashmix(value: int, hash_const: list[int]): return value -def mix(x: int, y: int): +def mix(x: int, y: int) -> int: result = MIX_MULT_L * x - MIX_MULT_R * y # prevent overflow result &= 0xFFFF_FFFF_FFFF_FFFF diff --git a/deepmd/tf/descriptor/se.py b/deepmd/tf/descriptor/se.py index 2863704143..5b04c5ba00 100644 --- a/deepmd/tf/descriptor/se.py +++ b/deepmd/tf/descriptor/se.py @@ -192,6 +192,7 @@ def serialize_network( resnet_dt: bool, variables: dict, excluded_types: set[tuple[int, int]] = set(), + trainable: bool = True, suffix: str = "", ) -> dict: """Serialize network. 
@@ -214,6 +215,8 @@ def serialize_network( The input variables excluded_types : set[tuple[int, int]], optional The excluded types + trainable : bool + Whether the network is trainable suffix : str, optional The suffix of the scope @@ -236,6 +239,7 @@ def serialize_network( activation_function=activation_function, resnet_dt=resnet_dt, precision=self.precision.name, + trainable=trainable, ) embeddings[(type_j, type_i)] = EmbeddingNet( in_dim=in_dim, @@ -243,6 +247,7 @@ def serialize_network( activation_function=activation_function, resnet_dt=resnet_dt, precision=self.precision.name, + trainable=trainable, ) embeddings[(type_i, type_j)].clear() embeddings[(type_j, type_i)].clear() @@ -278,6 +283,7 @@ def serialize_network( activation_function=activation_function, resnet_dt=resnet_dt, precision=self.precision.name, + trainable=trainable, ) assert embeddings[network_idx] is not None if weight_name == "idt": diff --git a/deepmd/tf/descriptor/se_a.py b/deepmd/tf/descriptor/se_a.py index e3ae7bf99c..ba21925b13 100644 --- a/deepmd/tf/descriptor/se_a.py +++ b/deepmd/tf/descriptor/se_a.py @@ -1462,6 +1462,7 @@ def serialize(self, suffix: str = "") -> dict: resnet_dt=self.filter_resnet_dt, variables=self.embedding_net_variables, excluded_types=self.exclude_types, + trainable=self.trainable, suffix=suffix, ), "env_mat": EnvMat(self.rcut_r, self.rcut_r_smth).serialize(), diff --git a/deepmd/tf/descriptor/se_atten.py b/deepmd/tf/descriptor/se_atten.py index 3a9b86a0d6..002e7bd3d3 100644 --- a/deepmd/tf/descriptor/se_atten.py +++ b/deepmd/tf/descriptor/se_atten.py @@ -1593,6 +1593,7 @@ def serialize_attention_layers( bias=bias, use_timestep=False, precision=self.precision.name, + trainable=self.trainable, ) matrix_list = [ attention_layer_params[layer_idx][key]["matrix"] @@ -1611,6 +1612,7 @@ def serialize_attention_layers( bias=bias, use_timestep=False, precision=self.precision.name, + trainable=self.trainable, ) out_proj["matrix"] = 
attention_layer_params[layer_idx]["c_out"]["matrix"] if bias: @@ -1654,6 +1656,7 @@ def serialize_network_strip( variables: dict, suffix: str = "", type_one_side: bool = False, + trainable: bool = True, ) -> dict: """Serialize network. @@ -1679,6 +1682,8 @@ def serialize_network_strip( If 'False', type embeddings of both neighbor and central atoms are considered. If 'True', only type embeddings of neighbor atoms are considered. Default is 'False'. + trainable : bool + Whether the network is trainable Returns ------- @@ -1719,6 +1724,7 @@ def serialize_network_strip( activation_function=activation_function, resnet_dt=resnet_dt, precision=self.precision.name, + trainable=trainable, ) assert embeddings[network_idx] is not None if weight_name == "idt": @@ -1983,6 +1989,7 @@ def serialize(self, suffix: str = "") -> dict: resnet_dt=self.filter_resnet_dt, variables=self.embedding_net_variables, excluded_types=self.exclude_types, + trainable=self.trainable, suffix=suffix, ), "attention_layers": self.serialize_attention_layers( @@ -2032,6 +2039,7 @@ def serialize(self, suffix: str = "") -> dict: variables=self.two_side_embeeding_net_variables, suffix=suffix, type_one_side=self.type_one_side, + trainable=self.trainable, ) } ) diff --git a/deepmd/tf/descriptor/se_r.py b/deepmd/tf/descriptor/se_r.py index ed66d6ad25..9613a9fa9b 100644 --- a/deepmd/tf/descriptor/se_r.py +++ b/deepmd/tf/descriptor/se_r.py @@ -795,6 +795,7 @@ def serialize(self, suffix: str = "") -> dict: resnet_dt=self.filter_resnet_dt, variables=self.embedding_net_variables, excluded_types=self.exclude_types, + trainable=self.trainable, suffix=suffix, ), "env_mat": EnvMat(self.rcut, self.rcut_smth).serialize(), diff --git a/deepmd/tf/descriptor/se_t.py b/deepmd/tf/descriptor/se_t.py index c5d50744af..ec6a1122d6 100644 --- a/deepmd/tf/descriptor/se_t.py +++ b/deepmd/tf/descriptor/se_t.py @@ -726,6 +726,7 @@ def serialize_network( resnet_dt: bool, variables: dict, excluded_types: set[tuple[int, int]] = set(), + 
trainable: bool = True, suffix: str = "", ) -> dict: """Serialize network. @@ -748,6 +749,8 @@ def serialize_network( The input variables excluded_types : set[tuple[int, int]], optional The excluded types + trainable : bool, optional + Whether the network is trainable suffix : str, optional The suffix of the scope @@ -771,6 +774,7 @@ def clear_ij(type_i, type_j) -> None: activation_function=activation_function, resnet_dt=resnet_dt, precision=self.precision.name, + trainable=trainable, ) embeddings[(type_i, type_j)].clear() @@ -805,6 +809,7 @@ def clear_ij(type_i, type_j) -> None: activation_function=activation_function, resnet_dt=resnet_dt, precision=self.precision.name, + trainable=trainable, ) assert embeddings[network_idx] is not None if weight_name == "idt": @@ -941,6 +946,7 @@ def serialize(self, suffix: str = "") -> dict: resnet_dt=self.filter_resnet_dt, variables=self.embedding_net_variables, excluded_types=self.exclude_types, + trainable=self.trainable, suffix=suffix, ), "env_mat": EnvMat(self.rcut_r, self.rcut_r_smth).serialize(), diff --git a/deepmd/tf/entrypoints/__init__.py b/deepmd/tf/entrypoints/__init__.py index bf8c51067e..a33dc5b983 100644 --- a/deepmd/tf/entrypoints/__init__.py +++ b/deepmd/tf/entrypoints/__init__.py @@ -4,6 +4,9 @@ from ..infer.model_devi import ( make_model_devi, ) +from .change_bias import ( + change_bias, +) from .compress import ( compress, ) @@ -34,6 +37,7 @@ ) __all__ = [ + "change_bias", "compress", "convert", "doc_train_input", diff --git a/deepmd/tf/entrypoints/change_bias.py b/deepmd/tf/entrypoints/change_bias.py new file mode 100644 index 0000000000..efb4f9ae35 --- /dev/null +++ b/deepmd/tf/entrypoints/change_bias.py @@ -0,0 +1,443 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +"""DeePMD change bias entrypoint script.""" + +import logging +import os +import shutil +import tempfile +from pathlib import ( + Path, +) +from typing import ( + Optional, +) + +import numpy as np + +from deepmd.common import ( + 
expand_sys_str, + j_loader, +) +from deepmd.tf.entrypoints.freeze import ( + freeze, +) +from deepmd.tf.env import ( + tf, +) +from deepmd.tf.infer import ( + DeepPotential, +) +from deepmd.tf.train.run_options import ( + RunOptions, +) +from deepmd.tf.train.trainer import ( + DPTrainer, +) +from deepmd.tf.utils.argcheck import ( + normalize, +) +from deepmd.tf.utils.compat import ( + update_deepmd_input, +) +from deepmd.tf.utils.sess import ( + run_sess, +) +from deepmd.utils.data_system import ( + DeepmdDataSystem, +) + +__all__ = ["change_bias"] + +log = logging.getLogger(__name__) + + +def change_bias( + INPUT: str, + mode: str = "change", + bias_value: Optional[list] = None, + datafile: Optional[str] = None, + system: str = ".", + numb_batch: int = 0, + model_branch: Optional[str] = None, + output: Optional[str] = None, + log_level: int = 0, + **kwargs, +) -> None: + """Change model out bias according to the input data. + + Parameters + ---------- + INPUT : str + The input checkpoint file or frozen model file + mode : str, optional + The mode for changing energy bias, by default "change" + bias_value : Optional[list], optional + The user defined value for each type, by default None + datafile : Optional[str], optional + The path to the datafile, by default None + system : str, optional + The system dir, by default "." 
+ numb_batch : int, optional + The number of frames for bias changing, by default 0 + model_branch : Optional[str], optional + Model branch chosen for changing bias if multi-task model, by default None + output : Optional[str], optional + The model after changing bias, by default None + log_level : int, optional + The log level for output, by default 0 + """ + # Determine input type and handle accordingly + if INPUT.endswith(".pb"): + # Frozen model (.pb) + return _change_bias_frozen_model( + INPUT, + mode, + bias_value, + datafile, + system, + numb_batch, + model_branch, + output, + log_level, + ) + elif INPUT.endswith(".pbtxt"): + # Text format frozen model (.pbtxt) - not supported + raise NotImplementedError( + "Bias changing for .pbtxt models is not supported. " + "Please convert to .pb format first using: dp convert-from pbtxt -i model.pbtxt -o model.pb" + ) + elif INPUT.endswith((".ckpt", ".meta", ".data", ".index")): + # Individual checkpoint files + checkpoint_prefix = INPUT + if INPUT.endswith((".meta", ".data", ".index")): + checkpoint_prefix = INPUT.rsplit(".", 1)[0] + return _change_bias_checkpoint_file( + checkpoint_prefix, + mode, + bias_value, + datafile, + system, + numb_batch, + model_branch, + output, + log_level, + ) + else: + raise RuntimeError( + "The model provided must be a checkpoint file or frozen model file (.pb)" + ) + + +def _change_bias_checkpoint_file( + checkpoint_prefix: str, + mode: str, + bias_value: Optional[list], + datafile: Optional[str], + system: str, + numb_batch: int, + model_branch: Optional[str], + output: Optional[str], + log_level: int, +) -> None: + """Change bias for individual checkpoint files.""" + # Reset the default graph to avoid variable conflicts + tf.reset_default_graph() + + checkpoint_path = Path(checkpoint_prefix) + checkpoint_dir = checkpoint_path.parent + + # Check for valid checkpoint and find the actual checkpoint path + checkpoint_state_file = checkpoint_dir / "checkpoint" + if not 
checkpoint_state_file.exists(): + raise RuntimeError(f"No valid checkpoint found in {checkpoint_dir}") + + # Get the latest checkpoint path from the checkpoint state file + checkpoint_state = tf.train.get_checkpoint_state(str(checkpoint_dir)) + if checkpoint_state is None or checkpoint_state.model_checkpoint_path is None: + raise RuntimeError(f"No valid checkpoint state found in {checkpoint_dir}") + + # The model_checkpoint_path from get_checkpoint_state is the full path to the checkpoint + actual_checkpoint_path = checkpoint_state.model_checkpoint_path + + bias_adjust_mode = "change-by-statistic" if mode == "change" else "set-by-statistic" + + # Read the checkpoint to get the model configuration + input_json_path = _find_input_json(checkpoint_dir) + jdata = j_loader(input_json_path) + + # Update and normalize the configuration + jdata = update_deepmd_input(jdata, warning=True, dump="input_v2_compat.json") + jdata = normalize(jdata) + + # Determine output path - should be a single model file + if output is None: + output = str(checkpoint_path.with_suffix(".pb")) + elif not output.endswith(".pb"): + output = output + ".pb" + + # Create trainer to access model methods + run_opt = RunOptions( + init_model=actual_checkpoint_path, # Use the actual checkpoint file path + restart=None, + finetune=None, + init_frz_model=None, + log_level=log_level, + ) + + trainer = DPTrainer(jdata, run_opt) + + # Load data for bias calculation using trainer data requirements + data = _load_data_systems(datafile, system, trainer) + + # Get stop_batch and origin_type_map like in train.py + stop_batch = jdata.get("training", {}).get("numb_steps", 0) + origin_type_map = jdata["model"].get("origin_type_map", None) + if origin_type_map is not None and not origin_type_map: + # get the type_map from data if not provided + origin_type_map = data.get_type_map() + + try: + # Build the model graph first with proper parameters, then initialize session + # and restore variables from checkpoint - 
following train.py pattern + trainer.build(data, stop_batch, origin_type_map=origin_type_map) + trainer._init_session() + + if bias_value is not None: + # Use user-defined bias + _apply_user_defined_bias(trainer, bias_value) + else: + # Use data-based bias calculation + type_map = data.get_type_map() + if len(type_map) == 0: + # If data doesn't have type_map, get from model + type_map = trainer.model.get_type_map() + + log.info(f"Changing bias for model with type_map: {type_map}") + log.info(f"Using bias adjustment mode: {bias_adjust_mode}") + + # Read current bias values from the session (after variables are restored) + _apply_data_based_bias(trainer, data, type_map, bias_adjust_mode) + + # Save the updated variables back to checkpoint format first + # Create a separate directory for updated checkpoint to avoid polluting original + updated_checkpoint_dir = checkpoint_dir / f"{checkpoint_path.name}_updated" + updated_checkpoint_dir.mkdir(exist_ok=True) + + # Copy the input.json file to the new directory + updated_input_json_path = updated_checkpoint_dir / "input.json" + shutil.copy2(input_json_path, updated_input_json_path) + + updated_checkpoint_prefix = str(updated_checkpoint_dir / checkpoint_path.name) + if hasattr(trainer, "saver") and trainer.saver is not None: + log.info(f"Saving updated checkpoint to {updated_checkpoint_prefix}") + trainer.saver.save(trainer.sess, updated_checkpoint_prefix) + + # Create a new checkpoint state file in the updated directory + updated_checkpoint_state_file = updated_checkpoint_dir / "checkpoint" + with open(updated_checkpoint_state_file, "w") as f: + f.write(f'model_checkpoint_path: "{checkpoint_path.name}"\n') + f.write(f'all_model_checkpoint_paths: "{checkpoint_path.name}"\n') + + # Then save the updated model as a frozen model using the updated checkpoint directory + freeze( + checkpoint_folder=str(updated_checkpoint_dir), + output=output, + ) + + log.info(f"Bias changing complete. 
Model saved to {output}") + + finally: + # Ensure session is properly closed + if hasattr(trainer, "sess") and trainer.sess is not None: + trainer.sess.close() + + +def _change_bias_frozen_model( + frozen_model_path: str, + mode: str, + bias_value: Optional[list], + datafile: Optional[str], + system: str, + numb_batch: int, + model_branch: Optional[str], + output: Optional[str], + log_level: int, +) -> None: + """Change bias for frozen model (.pb file).""" + if bias_value is None: + raise NotImplementedError( + "Data-based bias changing for frozen models is not yet implemented. " + "Please provide user-defined bias values using the -b/--bias-value option, " + "or use a checkpoint directory instead." + ) + + # For frozen models, we need to modify the graph and save a new frozen model + # This is complex and requires graph manipulation + # For now, provide a clear error message with workaround + raise NotImplementedError( + "Bias modification for frozen models (.pb) is not yet fully implemented. " + "Recommended workaround:\n" + "1. Use a checkpoint directory instead of a frozen model\n" + "2. 
Or load the model, modify bias in training, then freeze again\n" + f" dp --tf change-bias -b {' '.join(map(str, bias_value)) if bias_value else ''} -o \n" + " dp freeze -c -o modified_model.pb" + ) + + +def _load_data_systems( + datafile: Optional[str], system: str, trainer: DPTrainer +) -> DeepmdDataSystem: + """Load data systems for bias calculation.""" + if datafile is not None: + with open(datafile) as datalist: + all_sys = datalist.read().splitlines() + else: + all_sys = expand_sys_str(system) + + # Load the data systems with proper data requirements + data = DeepmdDataSystem( + systems=all_sys, + batch_size=1, + test_size=1, + rcut=None, + set_prefix="set", + ) + # Use the data requirements from the trainer model instead of hardcoding them + data.add_data_requirements(trainer.data_requirements) + return data + + +def _find_input_json(checkpoint_dir: Path) -> Path: + """Find the input.json file for the checkpoint.""" + input_json_path = checkpoint_dir / "input.json" + if not input_json_path.exists(): + # Look for input.json in parent directories or common locations + for parent in checkpoint_dir.parents: + potential_input = parent / "input.json" + if potential_input.exists(): + input_json_path = potential_input + break + else: + raise RuntimeError( + f"Cannot find input.json configuration file needed to load the model. " + f"Please ensure input.json is available in {checkpoint_dir} or its parent directories." 
+ ) + return input_json_path + + +def _apply_data_based_bias( + trainer: DPTrainer, data: DeepmdDataSystem, type_map: list, bias_adjust_mode: str +) -> None: + """Apply data-based bias calculation by reading current bias from session.""" + from deepmd.tf.env import ( + tf, + ) + from deepmd.tf.fit.ener import ( + change_energy_bias_lower, + ) + + # Get the fitting object which contains the bias tensor + fitting = trainer.model.get_fitting() + if not hasattr(fitting, "t_bias_atom_e"): + raise RuntimeError( + "Model does not have t_bias_atom_e tensor for bias modification" + ) + + # Read current bias values from the session (these are the restored values) + current_bias = run_sess(trainer.sess, fitting.t_bias_atom_e) + + log.info(f"Current bias values from session: {current_bias.flatten()}") + + # Create a temporary frozen model to use with change_energy_bias_lower + with tempfile.NamedTemporaryFile(suffix=".pb", delete=False) as temp_frozen: + freeze( + checkpoint_folder=str(Path(trainer.run_opt.init_model).parent), + output=temp_frozen.name, + ) + + try: + # Create DeepPotential object for evaluation + dp = DeepPotential(temp_frozen.name) + + # Use change_energy_bias_lower with the current bias values from session + new_bias = change_energy_bias_lower( + data, + dp, + type_map, # origin_type_map + type_map, # full_type_map + current_bias, # Use the restored bias values + bias_adjust_mode=bias_adjust_mode, + ntest=1, + ) + + # Update the bias in the session + if len(new_bias.shape) == 1: + # 1D tensor, keep bias as 1D + new_bias_tensor = new_bias.flatten() + else: + # 2D tensor, reshape to match + new_bias_tensor = new_bias.reshape(-1, 1) + + assign_op = tf.assign(fitting.t_bias_atom_e, new_bias_tensor) + run_sess(trainer.sess, assign_op) + + # Also update the numpy array in the fitting object for consistency + fitting.bias_atom_e = new_bias + + finally: + # Clean up temporary file + os.unlink(temp_frozen.name) + + +def _apply_user_defined_bias(trainer: DPTrainer, 
bias_value: list) -> None: + """Apply user-defined bias values to the model.""" + # Get the type map from the model + type_map = trainer.model.get_type_map() + + # Validate bias_value length + if len(bias_value) != len(type_map): + raise ValueError( + f"The number of elements in the bias ({len(bias_value)}) should be the same as " + f"that in the type_map ({len(type_map)}): {type_map}" + ) + + # Check model type + if trainer.model.model_type != "ener": + raise RuntimeError( + f"User-defined bias is only supported for energy models, got: {trainer.model.model_type}" + ) + + # Get current bias + fitting = trainer.model.get_fitting() + if not hasattr(fitting, "bias_atom_e"): + raise RuntimeError( + "Model does not have bias_atom_e attribute for bias modification" + ) + + # Convert user bias to numpy array with proper shape matching the tensor + new_bias = np.array(bias_value, dtype=np.float64) + + # Check the shape of the existing bias tensor to match it + if hasattr(fitting, "t_bias_atom_e"): + existing_shape = fitting.t_bias_atom_e.get_shape().as_list() + if len(existing_shape) == 1: + # 1D tensor, keep bias as 1D + new_bias = new_bias.flatten() + else: + # 2D tensor, reshape to match + new_bias = new_bias.reshape(-1, 1) + else: + # If no tensor, use the fitting.bias_atom_e shape + new_bias = new_bias.reshape(fitting.bias_atom_e.shape) + + log.info(f"Changing bias from user-defined values for type_map: {type_map}") + log.info(f"Old bias: {fitting.bias_atom_e.flatten()}") + log.info(f"New bias: {new_bias.flatten()}") + + # Update the bias in the model + fitting.bias_atom_e = new_bias + + # Update the tensor in the session if needed + if hasattr(fitting, "t_bias_atom_e"): + assign_op = tf.assign(fitting.t_bias_atom_e, new_bias) + run_sess(trainer.sess, assign_op) diff --git a/deepmd/tf/entrypoints/main.py b/deepmd/tf/entrypoints/main.py index 5058c51c17..ac2edc8ddd 100644 --- a/deepmd/tf/entrypoints/main.py +++ b/deepmd/tf/entrypoints/main.py @@ -22,6 +22,7 @@ 
clear_session, ) from deepmd.tf.entrypoints import ( + change_bias, compress, convert, freeze, @@ -86,6 +87,8 @@ def main(args: Optional[Union[list[str], argparse.Namespace]] = None) -> None: compress(**dict_args) elif args.command == "convert-from": convert(**dict_args) + elif args.command == "change-bias": + change_bias(**dict_args) elif args.command == "train-nvnmd": # nvnmd train_nvnmd(**dict_args) elif args.command is None: diff --git a/deepmd/tf/entrypoints/train.py b/deepmd/tf/entrypoints/train.py index b12e4fe1af..5bcca9a4e3 100755 --- a/deepmd/tf/entrypoints/train.py +++ b/deepmd/tf/entrypoints/train.py @@ -13,7 +13,7 @@ Optional, ) -from deepmd.tf.common import ( +from deepmd.common import ( j_loader, ) from deepmd.tf.env import ( diff --git a/deepmd/tf/fit/dipole.py b/deepmd/tf/fit/dipole.py index 4428d06536..09da354116 100644 --- a/deepmd/tf/fit/dipole.py +++ b/deepmd/tf/fit/dipole.py @@ -5,6 +5,9 @@ import numpy as np +from deepmd.env import ( + GLOBAL_NP_FLOAT_PRECISION, +) from deepmd.tf.common import ( cast_precision, get_activation_func, @@ -75,6 +78,13 @@ class DipoleFittingSeA(Fitting): different fitting nets for different atom types. type_map: list[str], Optional A list of strings. Give the name to each type of atoms. + default_fparam: list[float], optional + The default frame parameter. If set, when `fparam.npy` files are not included in the data system, + this value will be used as the default value for the frame parameter in the fitting net. + trainable : list[bool], Optional + If the weights of fitting net are trainable. + Suppose that we have :math:`N_l` hidden layers in the fitting net, + this list is of length :math:`N_l + 1`, specifying if the hidden layers and the output layer are trainable. 
""" def __init__( @@ -94,6 +104,8 @@ def __init__( uniform_seed: bool = False, mixed_types: bool = False, type_map: Optional[list[str]] = None, # to be compat with input + default_fparam: Optional[list[float]] = None, # to be compat with input + trainable: Optional[list[bool]] = None, **kwargs, ) -> None: """Constructor.""" @@ -123,18 +135,30 @@ def __init__( self.numb_fparam = numb_fparam self.numb_aparam = numb_aparam self.dim_case_embd = dim_case_embd + self.default_fparam = default_fparam if numb_fparam > 0: raise ValueError("numb_fparam is not supported in the dipole fitting") if numb_aparam > 0: raise ValueError("numb_aparam is not supported in the dipole fitting") if dim_case_embd > 0: raise ValueError("dim_case_embd is not supported in TensorFlow.") + if default_fparam is not None: + raise ValueError("default_fparam is not supported in TensorFlow.") self.fparam_avg = None self.fparam_std = None self.fparam_inv_std = None self.aparam_avg = None self.aparam_std = None self.aparam_inv_std = None + if trainable is None: + self.trainable = [True for _ in range(len(self.n_neuron) + 1)] + elif isinstance(trainable, bool): + self.trainable = [trainable] * (len(self.n_neuron) + 1) + else: + self.trainable = trainable + assert len(self.trainable) == len(self.n_neuron) + 1, ( + "length of trainable should be that of n_neuron + 1" + ) def get_sel_type(self) -> int: """Get selected type.""" @@ -166,6 +190,7 @@ def _build_lower(self, start_index, natoms, inputs, rot_mat, suffix="", reuse=No uniform_seed=self.uniform_seed, initial_variables=self.fitting_net_variables, mixed_prec=self.mixed_prec, + trainable=self.trainable[ii], ) else: layer = one_layer( @@ -179,6 +204,7 @@ def _build_lower(self, start_index, natoms, inputs, rot_mat, suffix="", reuse=No uniform_seed=self.uniform_seed, initial_variables=self.fitting_net_variables, mixed_prec=self.mixed_prec, + trainable=self.trainable[ii], ) if (not self.uniform_seed) and (self.seed is not None): self.seed += 
self.seed_shift @@ -195,6 +221,7 @@ def _build_lower(self, start_index, natoms, inputs, rot_mat, suffix="", reuse=No initial_variables=self.fitting_net_variables, mixed_prec=self.mixed_prec, final_layer=True, + trainable=self.trainable[-1], ) if (not self.uniform_seed) and (self.seed is not None): self.seed += self.seed_shift @@ -391,20 +418,22 @@ def serialize(self, suffix: str) -> dict: data = { "@class": "Fitting", "type": "dipole", - "@version": 3, + "@version": 4, "ntypes": self.ntypes, "dim_descrpt": self.dim_descrpt, "embedding_width": self.dim_rot_mat_1, "mixed_types": self.mixed_types, - "dim_out": 3, "neuron": self.n_neuron, "resnet_dt": self.resnet_dt, "numb_fparam": self.numb_fparam, "numb_aparam": self.numb_aparam, "dim_case_embd": self.dim_case_embd, + "default_fparam": self.default_fparam, "activation_function": self.activation_function_name, "precision": self.fitting_precision.name, - "exclude_types": [], + "exclude_types": [] + if self.sel_type is None + else [ii for ii in range(self.ntypes) if ii not in self.sel_type], "nets": self.serialize_network( ntypes=self.ntypes, ndim=0 if self.mixed_types else 1, @@ -414,9 +443,29 @@ def serialize(self, suffix: str) -> dict: activation_function=self.activation_function_name, resnet_dt=self.resnet_dt, variables=self.fitting_net_variables, + trainable=self.trainable, suffix=suffix, ), + "@variables": { + "fparam_avg": self.fparam_avg, + "fparam_inv_std": self.fparam_inv_std, + "aparam_avg": self.aparam_avg, + "aparam_inv_std": self.aparam_inv_std, + "case_embd": None, + "bias_atom_e": np.zeros( + (self.ntypes, self.dim_rot_mat_1), dtype=GLOBAL_NP_FLOAT_PRECISION + ), + }, "type_map": self.type_map, + "var_name": "dipole", + "rcond": None, + "tot_ener_zero": False, + "trainable": self.trainable, + "layer_name": None, + "use_aparam_as_mask": False, + "spin": None, + "r_differentiable": True, + "c_differentiable": True, } return data @@ -435,7 +484,12 @@ def deserialize(cls, data: dict, suffix: str): The 
deserialized model """ data = data.copy() - check_version_compatibility(data.pop("@version", 1), 3, 1) + check_version_compatibility(data.pop("@version", 1), 4, 1) + exclude_types = data.pop("exclude_types", []) + if len(exclude_types) > 0: + data["sel_type"] = [ + ii for ii in range(data["ntypes"]) if ii not in exclude_types + ] fitting = cls(**data) fitting.fitting_net_variables = cls.deserialize_network( data["nets"], diff --git a/deepmd/tf/fit/dos.py b/deepmd/tf/fit/dos.py index 8fa3167bfc..81d80a4f4d 100644 --- a/deepmd/tf/fit/dos.py +++ b/deepmd/tf/fit/dos.py @@ -101,6 +101,9 @@ class DOSFitting(Fitting): mixed_types : bool If true, use a uniform fitting net for all atom types, otherwise use different fitting nets for different atom types. + default_fparam: list[float], optional + The default frame parameter. If set, when `fparam.npy` files are not included in the data system, + this value will be used as the default value for the frame parameter in the fitting net. type_map: list[str], Optional A list of strings. Give the name to each type of atoms. 
""" @@ -125,6 +128,7 @@ def __init__( use_aparam_as_mask: bool = False, mixed_types: bool = False, type_map: Optional[list[str]] = None, # to be compat with input + default_fparam: Optional[list[float]] = None, # to be compat with input **kwargs, ) -> None: """Constructor.""" @@ -136,8 +140,11 @@ def __init__( self.numb_fparam = numb_fparam self.numb_aparam = numb_aparam self.dim_case_embd = dim_case_embd + self.default_fparam = default_fparam if dim_case_embd > 0: raise ValueError("dim_case_embd is not supported in TensorFlow.") + if default_fparam is not None: + raise ValueError("default_fparam is not supported in TensorFlow.") self.numb_dos = numb_dos @@ -678,7 +685,7 @@ def deserialize(cls, data: dict, suffix: str = ""): The deserialized model """ data = data.copy() - check_version_compatibility(data.pop("@version", 1), 3, 1) + check_version_compatibility(data.pop("@version", 1), 4, 1) data["numb_dos"] = data.pop("dim_out") fitting = cls(**data) fitting.fitting_net_variables = cls.deserialize_network( @@ -705,7 +712,7 @@ def serialize(self, suffix: str = "") -> dict: data = { "@class": "Fitting", "type": "dos", - "@version": 3, + "@version": 4, "var_name": "dos", "ntypes": self.ntypes, "dim_descrpt": self.dim_descrpt, @@ -716,6 +723,7 @@ def serialize(self, suffix: str = "") -> dict: "numb_fparam": self.numb_fparam, "numb_aparam": self.numb_aparam, "dim_case_embd": self.dim_case_embd, + "default_fparam": self.default_fparam, "rcond": self.rcond, "trainable": self.trainable, "activation_function": self.activation_function, @@ -730,6 +738,7 @@ def serialize(self, suffix: str = "") -> dict: activation_function=self.activation_function, resnet_dt=self.resnet_dt, variables=self.fitting_net_variables, + trainable=self.trainable, suffix=suffix, ), "@variables": { @@ -741,6 +750,11 @@ def serialize(self, suffix: str = "") -> dict: "case_embd": None, }, "type_map": self.type_map, + "tot_ener_zero": False, + "layer_name": None, + "use_aparam_as_mask": False, + "spin": 
None, + "atom_ener": None, } return data diff --git a/deepmd/tf/fit/ener.py b/deepmd/tf/fit/ener.py index e10468df32..547c0eefb1 100644 --- a/deepmd/tf/fit/ener.py +++ b/deepmd/tf/fit/ener.py @@ -119,6 +119,8 @@ class EnerFitting(Fitting): Number of atomic parameter dim_case_embd Dimension of case specific embedding. + default_fparam + The default frame parameter. This parameter is not supported in TensorFlow. rcond The condition number for the regression of atomic energy. tot_ener_zero @@ -146,6 +148,9 @@ class EnerFitting(Fitting): mixed_types : bool If true, use a uniform fitting net for all atom types, otherwise use different fitting nets for different atom types. + default_fparam: list[float], optional + The default frame parameter. If set, when `fparam.npy` files are not included in the data system, + this value will be used as the default value for the frame parameter in the fitting net. type_map: list[str], Optional A list of strings. Give the name to each type of atoms. """ @@ -172,6 +177,7 @@ def __init__( spin: Optional[Spin] = None, mixed_types: bool = False, type_map: Optional[list[str]] = None, # to be compat with input + default_fparam: Optional[list[float]] = None, # to be compat with input **kwargs, ) -> None: """Constructor.""" @@ -196,6 +202,9 @@ def __init__( self.dim_case_embd = dim_case_embd if dim_case_embd > 0: raise ValueError("dim_case_embd is not supported in TensorFlow.") + self.default_fparam = default_fparam + if self.default_fparam is not None: + raise ValueError("default_fparam is not supported in TensorFlow.") self.n_neuron = neuron self.resnet_dt = resnet_dt self.rcond = rcond @@ -884,7 +893,7 @@ def deserialize(cls, data: dict, suffix: str = ""): The deserialized model """ data = data.copy() - check_version_compatibility(data.pop("@version", 1), 3, 1) + check_version_compatibility(data.pop("@version", 1), 4, 1) fitting = cls(**data) fitting.fitting_net_variables = cls.deserialize_network( data["nets"], @@ -910,7 +919,7 @@ def 
serialize(self, suffix: str = "") -> dict: data = { "@class": "Fitting", "type": "ener", - "@version": 3, + "@version": 4, "var_name": "energy", "ntypes": self.ntypes, "dim_descrpt": self.dim_descrpt + self.tebd_dim, @@ -921,6 +930,7 @@ def serialize(self, suffix: str = "") -> dict: "numb_fparam": self.numb_fparam, "numb_aparam": self.numb_aparam, "dim_case_embd": self.dim_case_embd, + "default_fparam": self.default_fparam, "rcond": self.rcond, "tot_ener_zero": self.tot_ener_zero, "trainable": self.trainable, @@ -944,6 +954,7 @@ def serialize(self, suffix: str = "") -> dict: activation_function=self.activation_function_name, resnet_dt=self.resnet_dt, variables=self.fitting_net_variables, + trainable=self.trainable, suffix=suffix, ), "@variables": { diff --git a/deepmd/tf/fit/fitting.py b/deepmd/tf/fit/fitting.py index f159de1628..0e109fea60 100644 --- a/deepmd/tf/fit/fitting.py +++ b/deepmd/tf/fit/fitting.py @@ -135,6 +135,7 @@ def serialize_network( resnet_dt: bool, variables: dict, out_dim: Optional[int] = 1, + trainable: Optional[list[bool]] = None, suffix: str = "", ) -> dict: """Serialize network. 
@@ -155,6 +156,8 @@ def serialize_network( Whether to use resnet variables : dict The input variables + trainable : list[bool] + Whether the network is trainable suffix : str, optional The suffix of the scope out_dim : int, optional @@ -191,6 +194,8 @@ def serialize_network( raise ValueError(f"Invalid ndim: {ndim}") if fittings[network_idx] is None: # initialize the network if it is not initialized + if trainable is None: + trainable = [True for _ in range(len(neuron) + 1)] fittings[network_idx] = FittingNet( in_dim=in_dim, out_dim=out_dim, @@ -199,6 +204,7 @@ def serialize_network( resnet_dt=resnet_dt, precision=self.precision.name, bias_out=True, + trainable=trainable, ) assert fittings[network_idx] is not None if weight_name == "idt": @@ -238,7 +244,9 @@ def deserialize_network(cls, data: dict, suffix: str = "") -> dict: else: raise ValueError(f"Invalid ndim: {fittings.ndim}") network = fittings[net_idx] - assert network is not None + if network is None: + # Skip types that are not selected (when sel_type is used) + continue for layer_idx, layer in enumerate(network.layers): if layer_idx == len(network.layers) - 1: layer_name = "final_layer" diff --git a/deepmd/tf/fit/polar.py b/deepmd/tf/fit/polar.py index c8fd4e86e8..31ccd18302 100644 --- a/deepmd/tf/fit/polar.py +++ b/deepmd/tf/fit/polar.py @@ -6,6 +6,9 @@ import numpy as np +from deepmd.env import ( + GLOBAL_NP_FLOAT_PRECISION, +) from deepmd.tf.common import ( cast_precision, get_activation_func, @@ -90,6 +93,13 @@ class PolarFittingSeA(Fitting): different fitting nets for different atom types. type_map: list[str], Optional A list of strings. Give the name to each type of atoms. + default_fparam: list[float], optional + The default frame parameter. If set, when `fparam.npy` files are not included in the data system, + this value will be used as the default value for the frame parameter in the fitting net. + trainable : list[bool], Optional + If the weights of fitting net are trainable. 
+ Suppose that we have :math:`N_l` hidden layers in the fitting net, + this list is of length :math:`N_l + 1`, specifying if the hidden layers and the output layer are trainable. """ def __init__( @@ -113,6 +123,8 @@ def __init__( uniform_seed: bool = False, mixed_types: bool = False, type_map: Optional[list[str]] = None, # to be compat with input + default_fparam: Optional[list[float]] = None, # to be compat with input + trainable: Optional[list[bool]] = None, **kwargs, ) -> None: """Constructor.""" @@ -170,18 +182,30 @@ def __init__( self.numb_fparam = numb_fparam self.numb_aparam = numb_aparam self.dim_case_embd = dim_case_embd + self.default_fparam = default_fparam if numb_fparam > 0: raise ValueError("numb_fparam is not supported in the dipole fitting") if numb_aparam > 0: raise ValueError("numb_aparam is not supported in the dipole fitting") if dim_case_embd > 0: raise ValueError("dim_case_embd is not supported in TensorFlow.") + if default_fparam is not None: + raise ValueError("default_fparam is not supported in TensorFlow.") self.fparam_avg = None self.fparam_std = None self.fparam_inv_std = None self.aparam_avg = None self.aparam_std = None self.aparam_inv_std = None + if trainable is None: + self.trainable = [True for _ in range(len(self.n_neuron) + 1)] + elif isinstance(trainable, bool): + self.trainable = [trainable] * (len(self.n_neuron) + 1) + else: + self.trainable = trainable + assert len(self.trainable) == len(self.n_neuron) + 1, ( + "length of trainable should be that of n_neuron + 1" + ) def get_sel_type(self) -> list[int]: """Get selected atom types.""" @@ -312,6 +336,7 @@ def _build_lower(self, start_index, natoms, inputs, rot_mat, suffix="", reuse=No uniform_seed=self.uniform_seed, initial_variables=self.fitting_net_variables, mixed_prec=self.mixed_prec, + trainable=self.trainable[ii], ) else: layer = one_layer( @@ -325,6 +350,7 @@ def _build_lower(self, start_index, natoms, inputs, rot_mat, suffix="", reuse=No uniform_seed=self.uniform_seed, 
initial_variables=self.fitting_net_variables, mixed_prec=self.mixed_prec, + trainable=self.trainable[ii], ) if (not self.uniform_seed) and (self.seed is not None): self.seed += self.seed_shift @@ -347,6 +373,7 @@ def _build_lower(self, start_index, natoms, inputs, rot_mat, suffix="", reuse=No initial_variables=self.fitting_net_variables, mixed_prec=self.mixed_prec, final_layer=True, + trainable=self.trainable[-1], ) if (not self.uniform_seed) and (self.seed is not None): self.seed += self.seed_shift @@ -612,22 +639,21 @@ def serialize(self, suffix: str) -> dict: data = { "@class": "Fitting", "type": "polar", - "@version": 4, + "@version": 5, "ntypes": self.ntypes, "dim_descrpt": self.dim_descrpt, "embedding_width": self.dim_rot_mat_1, "mixed_types": self.mixed_types, - "dim_out": 3, "neuron": self.n_neuron, "resnet_dt": self.resnet_dt, "numb_fparam": self.numb_fparam, "numb_aparam": self.numb_aparam, "dim_case_embd": self.dim_case_embd, + "default_fparam": self.default_fparam, "activation_function": self.activation_function_name, "precision": self.fitting_precision.name, "exclude_types": [], "fit_diag": self.fit_diag, - "scale": list(self.scale), "shift_diag": self.shift_diag, "nets": self.serialize_network( ntypes=self.ntypes, @@ -638,6 +664,7 @@ def serialize(self, suffix: str) -> dict: activation_function=self.activation_function_name, resnet_dt=self.resnet_dt, variables=self.fitting_net_variables, + trainable=self.trainable, suffix=suffix, ), "@variables": { @@ -648,8 +675,18 @@ def serialize(self, suffix: str) -> dict: "case_embd": None, "scale": self.scale.reshape(-1, 1), "constant_matrix": self.constant_matrix.reshape(-1), + "bias_atom_e": np.zeros( + (self.ntypes, self.dim_rot_mat_1), dtype=GLOBAL_NP_FLOAT_PRECISION + ), }, "type_map": self.type_map, + "var_name": "polar", + "rcond": None, + "tot_ener_zero": False, + "trainable": self.trainable, + "layer_name": None, + "use_aparam_as_mask": False, + "spin": None, } return data @@ -669,7 +706,7 @@ def 
deserialize(cls, data: dict, suffix: str): """ data = data.copy() check_version_compatibility( - data.pop("@version", 1), 4, 1 + data.pop("@version", 1), 5, 1 ) # to allow PT version. fitting = cls(**data) fitting.fitting_net_variables = cls.deserialize_network( diff --git a/deepmd/tf/infer/deep_eval.py b/deepmd/tf/infer/deep_eval.py index a7682d2e58..75440accb9 100644 --- a/deepmd/tf/infer/deep_eval.py +++ b/deepmd/tf/infer/deep_eval.py @@ -1126,6 +1126,16 @@ def get_model_def_script(self) -> dict: model_def_script = script.decode("utf-8") return json.loads(model_def_script)["model"] + def get_model(self) -> "tf.Graph": + """Get the TensorFlow graph. + + Returns + ------- + tf.Graph + The TensorFlow graph. + """ + return self.graph + class DeepEvalOld: # old class for DipoleChargeModifier only diff --git a/deepmd/tf/model/dos.py b/deepmd/tf/model/dos.py index 1bebb4b971..264c77d045 100644 --- a/deepmd/tf/model/dos.py +++ b/deepmd/tf/model/dos.py @@ -149,6 +149,9 @@ def build( t_ver = tf.constant(MODEL_VERSION, name="model_version", dtype=tf.string) t_od = tf.constant(self.numb_dos, name="output_dim", dtype=tf.int32) + # Initialize out_bias and out_std for DOS models + self.init_out_stat(suffix=suffix) + coord = tf.reshape(coord_, [-1, natoms[1] * 3]) atype = tf.reshape(atype_, [-1, natoms[1]]) input_dict["nframes"] = tf.shape(coord)[0] @@ -181,6 +184,10 @@ def build( atom_dos = self.fitting.build( dout, natoms, input_dict, reuse=reuse, suffix=suffix ) + + # Apply out_bias and out_std directly to DOS output + atom_dos = self._apply_out_bias_std(atom_dos, atype, natoms, coord) + self.atom_dos = atom_dos dos_raw = atom_dos diff --git a/deepmd/tf/model/ener.py b/deepmd/tf/model/ener.py index 6d2ff4615f..5b665511ef 100644 --- a/deepmd/tf/model/ener.py +++ b/deepmd/tf/model/ener.py @@ -193,6 +193,9 @@ def build( t_mt = tf.constant(self.model_type, name="model_type", dtype=tf.string) t_ver = tf.constant(MODEL_VERSION, name="model_version", dtype=tf.string) + # Initialize 
out_bias and out_std for energy models + self.init_out_stat(suffix=suffix) + if self.srtab is not None: tab_info, tab_data = self.srtab.get() self.tab_info = tf.get_variable( @@ -253,6 +256,10 @@ def build( atom_ener = self.fitting.build( dout, natoms, input_dict, reuse=reuse, suffix=suffix ) + + # Apply out_bias and out_std directly to atom energy + atom_ener = self._apply_out_bias_std(atom_ener, atype, natoms, coord) + self.atom_ener = atom_ener if self.srtab is not None: diff --git a/deepmd/tf/model/model.py b/deepmd/tf/model/model.py index 3377ed2d51..95c9840cde 100644 --- a/deepmd/tf/model/model.py +++ b/deepmd/tf/model/model.py @@ -1,4 +1,5 @@ # SPDX-License-Identifier: LGPL-3.0-or-later +import logging from abc import ( ABC, abstractmethod, @@ -70,6 +71,8 @@ check_version_compatibility, ) +log = logging.getLogger(__name__) + class Model(ABC, make_plugin_registry("model")): """Abstract base model. @@ -708,6 +711,63 @@ def __init__( else: self.typeebd = None + # Initialize out_bias and out_std storage + self.out_bias = None + self.out_std = None + + def init_variables( + self, + graph: tf.Graph, + graph_def: tf.GraphDef, + model_type: str = "original_model", + suffix: str = "", + ) -> None: + """Init the model variables with the given frozen model. 
+ + Parameters + ---------- + graph : tf.Graph + The input frozen model graph + graph_def : tf.GraphDef + The input frozen model graph_def + model_type : str + the type of the model + suffix : str + suffix to name scope + """ + from deepmd.tf.utils.errors import ( + GraphWithoutTensorError, + ) + from deepmd.tf.utils.graph import ( + get_tensor_by_name_from_graph, + ) + + # Initialize descriptor and fitting variables + self.descrpt.init_variables(graph, graph_def, suffix=suffix) + self.fitting.init_variables(graph, graph_def, suffix=suffix) + if ( + self.typeebd is not None + and self.typeebd.type_embedding_net_variables is None + ): + self.typeebd.init_variables(graph, graph_def, suffix=suffix) + + # Try to load out_bias and out_std from the graph + try: + self.out_bias = get_tensor_by_name_from_graph( + graph, f"model_attr{suffix}/t_out_bias" + ) + except GraphWithoutTensorError: + # For compatibility, create default out_bias if not found + log.debug("out_bias not found in graph, falling back to default value") + + try: + self.out_std = get_tensor_by_name_from_graph( + graph, f"model_attr{suffix}/t_out_std" + ) + except GraphWithoutTensorError: + # For compatibility, create default out_std if not found + log.debug("out_std not found in graph, falling back to default value") + def enable_mixed_precision(self, mixed_prec: dict) -> None: """Enable mixed precision for the model. @@ -762,6 +822,130 @@ def get_ntypes(self) -> int: """Get the number of types.""" return self.ntypes + def _get_dim_out(self): + """Get output dimension based on model type. 
+ + Returns + ------- + int + Output dimension + """ + if self.model_type == "ener": + return 1 + elif self.model_type == "dipole": + return 3 + elif self.model_type == "polar": + return 9 + elif self.model_type == "dos": + return self.numb_dos + else: + raise ValueError(f"Unknown model type '{self.model_type}' in _get_dim_out") + + def init_out_stat(self, suffix: str = "") -> None: + """Initialize the output bias and std variables.""" + ntypes = self.get_ntypes() + dim_out = self._get_dim_out() + + # Initialize out_bias and out_std as numpy arrays, preserving existing values if set + if self.out_bias is not None: + out_bias_data = self.out_bias.copy() + else: + out_bias_data = np.zeros( + [1, ntypes, dim_out], dtype=GLOBAL_NP_FLOAT_PRECISION + ) + + if self.out_std is not None: + out_std_data = self.out_std.copy() + else: + out_std_data = np.ones( + [1, ntypes, dim_out], dtype=GLOBAL_NP_FLOAT_PRECISION + ) + + # Create TensorFlow variables + with tf.variable_scope("model_attr" + suffix, reuse=tf.AUTO_REUSE): + self.t_out_bias = tf.get_variable( + "t_out_bias", + out_bias_data.shape, + dtype=GLOBAL_TF_FLOAT_PRECISION, + trainable=False, + initializer=tf.constant_initializer(out_bias_data), + ) + self.t_out_std = tf.get_variable( + "t_out_std", + out_std_data.shape, + dtype=GLOBAL_TF_FLOAT_PRECISION, + trainable=False, + initializer=tf.constant_initializer(out_std_data), + ) + + # Store as instance variables for access + self.out_bias = out_bias_data + self.out_std = out_std_data + + def _apply_out_bias_std(self, output, atype, natoms, coord, selected_atype=None): + """Apply output bias and standard deviation to the model output. + + Parameters + ---------- + output : tf.Tensor + The model output tensor + atype : tf.Tensor + Atom types with shape [nframes, nloc] + natoms : list[int] + Number of atoms [nloc, ntypes, ...] + coord : tf.Tensor + Coordinates for getting nframes + selected_atype : tf.Tensor, optional + Selected atom types for tensor models. 
If None, uses all atoms. + + Returns + ------- + tf.Tensor + Output with bias and std applied + """ + if self.spin is not None: + # spin is not supported yet; also, it's incompatible with dpmodel + return output + nframes = tf.shape(coord)[0] + + # Get output dimension consistently + nout = self._get_dim_out() + + if selected_atype is not None: + natomsel = tf.shape(selected_atype)[1] + output_reshaped = tf.reshape(output, [nframes, natomsel, nout]) + atype_for_gather = selected_atype + else: + nloc = natoms[0] + nall = natoms[1] + output_reshaped = tf.reshape(output, [nframes, nloc, nout]) + atype_for_gather = tf.reshape(atype, [nframes, nall]) + # slice to local atoms + atype_for_gather = atype_for_gather[:, :nloc] + + # Handle invalid atom types (e.g., -1 for padding/invalid atoms) + # Create a mask for valid atom types (>= 0) + valid_mask = tf.greater_equal(atype_for_gather, 0) + # Replace invalid types with 0 for gathering (will be masked out later) + safe_atype = tf.where( + valid_mask, atype_for_gather, tf.zeros_like(atype_for_gather) + ) + + # Get bias and std for each atom type + bias_per_atom = tf.gather(self.t_out_bias[0], safe_atype) + std_per_atom = tf.gather(self.t_out_std[0], safe_atype) + + # Apply bias and std: output = output * std + bias + adjusted_output = output_reshaped * std_per_atom + bias_per_atom + + # expand axis 2 of valid_mask to nout + valid_mask = tf.tile(tf.expand_dims(valid_mask, -1), [1, 1, nout]) + + # Only apply bias/std to valid atoms, keep original values for invalid atoms + output_reshaped = tf.where(valid_mask, adjusted_output, output_reshaped) + + return tf.reshape(output_reshaped, tf.shape(output)) + @classmethod def update_sel( cls, @@ -820,25 +1004,7 @@ def deserialize(cls, data: dict, suffix: str = "") -> "Descriptor": data = data.copy() check_version_compatibility(data.pop("@version", 2), 2, 1) descriptor = Descriptor.deserialize(data.pop("descriptor"), suffix=suffix) - if data["fitting"].get("@variables", 
{}).get("bias_atom_e") is not None: - # careful: copy each level and don't modify the input array, - # otherwise it will affect the original data - # deepcopy is not used for performance reasons - data["fitting"] = data["fitting"].copy() - data["fitting"]["@variables"] = data["fitting"]["@variables"].copy() - if ( - int(np.any(data["fitting"]["@variables"]["bias_atom_e"])) - + int(np.any(data["@variables"]["out_bias"])) - > 1 - ): - raise ValueError( - "fitting/@variables/bias_atom_e and @variables/out_bias should not be both non-zero" - ) - data["fitting"]["@variables"]["bias_atom_e"] = data["fitting"][ - "@variables" - ]["bias_atom_e"] + data["@variables"]["out_bias"].reshape( - data["fitting"]["@variables"]["bias_atom_e"].shape - ) + # bias_atom_e and out_bias are now completely independent - no conversion needed fitting = Fitting.deserialize(data.pop("fitting"), suffix=suffix) # pass descriptor type embedding to model if descriptor.explicit_ntypes: @@ -853,14 +1019,23 @@ def deserialize(cls, data: dict, suffix: str = "") -> "Descriptor": raise NotImplementedError("pair_exclude_types is not supported") data.pop("rcond", None) data.pop("preset_out_bias", None) - data.pop("@variables", None) + # Extract out_bias and out_std from variables before removing them + variables = data.pop("@variables", {}) + out_bias = variables.get("out_bias", None) + out_std = variables.get("out_std", None) # END not supported keys - return cls( + model = cls( descriptor=descriptor, fitting_net=fitting, type_embedding=type_embedding, **data, ) + # Restore out_bias and out_std if they exist + if out_bias is not None: + model.out_bias = out_bias + if out_std is not None: + model.out_std = out_std + return model def serialize(self, suffix: str = "") -> dict: """Serialize the model. 
@@ -886,18 +1061,23 @@ def serialize(self, suffix: str = "") -> dict: raise NotImplementedError("spin is not supported") ntypes = len(self.get_type_map()) + + # Get output dimension + dim_out = self._get_dim_out() + + # Serialize fitting dict_fit = self.fitting.serialize(suffix=suffix) - if dict_fit.get("@variables", {}).get("bias_atom_e") is not None: - out_bias = dict_fit["@variables"]["bias_atom_e"].reshape( - [1, ntypes, dict_fit["dim_out"]] - ) - dict_fit["@variables"]["bias_atom_e"] = np.zeros_like( - dict_fit["@variables"]["bias_atom_e"] - ) + + # Use the actual out_bias and out_std if they exist, otherwise create defaults + if self.out_bias is not None: + out_bias = self.out_bias.copy() else: - out_bias = np.zeros( - [1, ntypes, dict_fit["dim_out"]], dtype=GLOBAL_NP_FLOAT_PRECISION - ) + out_bias = np.zeros([1, ntypes, dim_out], dtype=GLOBAL_NP_FLOAT_PRECISION) + + if self.out_std is not None: + out_std = self.out_std.copy() + else: + out_std = np.ones([1, ntypes, dim_out], dtype=GLOBAL_NP_FLOAT_PRECISION) return { "@class": "Model", "type": "standard", @@ -912,7 +1092,7 @@ def serialize(self, suffix: str = "") -> dict: "preset_out_bias": None, "@variables": { "out_bias": out_bias, - "out_std": np.ones([1, ntypes, dict_fit["dim_out"]]), # pylint: disable=no-explicit-dtype + "out_std": out_std, }, } diff --git a/deepmd/tf/model/tensor.py b/deepmd/tf/model/tensor.py index 1e960907ef..58af997464 100644 --- a/deepmd/tf/model/tensor.py +++ b/deepmd/tf/model/tensor.py @@ -4,7 +4,13 @@ Union, ) +import numpy as np + +from deepmd.env import ( + GLOBAL_NP_FLOAT_PRECISION, +) from deepmd.tf.env import ( + GLOBAL_TF_FLOAT_PRECISION, MODEL_VERSION, global_cvt_2_ener_float, tf, @@ -126,6 +132,9 @@ def build( t_ver = tf.constant(MODEL_VERSION, name="model_version", dtype=tf.string) t_od = tf.constant(self.get_out_size(), name="output_dim", dtype=tf.int32) + # Initialize out_bias and out_std for tensor models (dipole/polar) + self.init_out_stat(suffix=suffix) + natomsel = 
sum(natoms[2 + type_i] for type_i in self.get_sel_type()) nout = self.get_out_size() @@ -164,6 +173,38 @@ def build( output = self.fitting.build( dout, rot_mat, natoms, input_dict, reuse=reuse, suffix=suffix ) + + # Apply out_bias and out_std directly to tensor output + # dipole not applying bias but polar does, per dpmodel + if self.model_type == "polar" and self.fitting.shift_diag: + v_constant_matrix = np.zeros( + self.ntypes, + dtype=GLOBAL_NP_FLOAT_PRECISION, + ) + sel_type = self.get_sel_type() + for itype in range(len(sel_type)): + v_constant_matrix[sel_type[itype]] = np.mean( + np.diagonal(self.out_bias[0, itype].reshape((3, 3))) + ) + nframes = input_dict["nframes"] + nloc_mask = tf.reshape( + tf.tile(tf.repeat(self.fitting.sel_mask, natoms[2:]), [nframes]), + [nframes, -1], + ) + constant_matrix = tf.reshape( + tf.reshape( + tf.tile(tf.repeat(v_constant_matrix, natoms[2:]), [nframes]), + [nframes, -1], + )[nloc_mask], + [nframes, -1], + ) + + # nf x nloc x odims, out_bias: ntypes x odims + output = output + tf.reshape( + tf.expand_dims(tf.expand_dims(constant_matrix, -1), -1) + * tf.eye(3, batch_shape=[1, 1], dtype=GLOBAL_TF_FLOAT_PRECISION), + tf.shape(output), + ) framesize = nout if "global" in self.model_type else natomsel * nout output = tf.reshape( output, [-1, framesize], name="o_" + self.model_type + suffix diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py index fb911550dd..308d39b0a3 100644 --- a/deepmd/utils/argcheck.py +++ b/deepmd/utils/argcheck.py @@ -3,6 +3,7 @@ import logging import warnings from typing import ( + Any, Callable, Optional, Union, @@ -40,6 +41,7 @@ doc_only_tf_supported = "(Supported Backend: TensorFlow) " doc_only_pt_supported = "(Supported Backend: PyTorch) " +doc_only_pd_supported = "(Supported Backend: Paddle) " # descriptors doc_loc_frame = "Defines a local frame at each atom, and the compute the descriptor as local coordinates under this frame." doc_se_e2_a = "Used by the smooth edition of Deep Potential. 
The full relative coordinates are used to construct the descriptor." @@ -59,7 +61,7 @@ doc_dipole_charge = "Use WFCC to model the electronic structure of the system. Correct the long-range interaction." -def list_to_doc(xx): +def list_to_doc(xx: list[Any]) -> str: items = [] for ii in xx: if len(items) == 0: @@ -70,7 +72,7 @@ def list_to_doc(xx): return "".join(items) -def make_link(content, ref_key) -> str: +def make_link(content: str, ref_key: str) -> str: return ( f"`{content} <{ref_key}_>`_" if not dargs.RAW_ANCHOR @@ -96,7 +98,7 @@ def deprecate_something(data: Optional[dict]) -> bool: return deprecate_something -def type_embedding_args(): +def type_embedding_args() -> list[Argument]: doc_neuron = "Number of neurons in each hidden layers of the embedding net. When two layers are of the same size or one layer is twice as large as the previous layer, a skip connection is built." doc_resnet_dt = 'Whether to use a "Timestep" in the skip connection' doc_seed = "Random seed for parameter initialization" @@ -132,7 +134,7 @@ def type_embedding_args(): ] -def spin_args(): +def spin_args() -> list[Argument]: doc_use_spin = ( "Whether to use atomic spin model for each atom type. " "List of boolean values with the shape of [ntypes] to specify which types use spin, " @@ -247,7 +249,7 @@ def get_all_argument(self, exclude_hybrid: bool = False) -> list[Argument]: @descrpt_args_plugin.register("loc_frame", doc=doc_only_tf_supported + doc_loc_frame) -def descrpt_local_frame_args(): +def descrpt_local_frame_args() -> list[Argument]: doc_sel_a = "A list of integers. The length of the list should be the same as the number of atom types in the system. `sel_a[i]` gives the selected number of type-i neighbors. The full relative coordinates of the neighbors are used by the descriptor." doc_sel_r = "A list of integers. The length of the list should be the same as the number of atom types in the system. `sel_r[i]` gives the selected number of type-i neighbors. 
Only relative distance of the neighbors are used by the descriptor. sel_a[i] + sel_r[i] is recommended to be larger than the maximally possible number of type-i neighbors in the cut-off radius." doc_rcut = "The cut-off radius. The default value is 6.0" @@ -268,7 +270,7 @@ def descrpt_local_frame_args(): @descrpt_args_plugin.register("se_e2_a", alias=["se_a"], doc=doc_se_e2_a) -def descrpt_se_a_args(): +def descrpt_se_a_args() -> list[Argument]: doc_sel = 'This parameter set the number of selected neighbors for each type of atom. It can be:\n\n\ - `list[int]`. The length of the list should be the same as the number of atom types in the system. `sel[i]` gives the selected number of type-i neighbors. `sel[i]` is recommended to be larger than the maximally possible number of type-i neighbors in the cut-off radius. It is noted that the total sel value must be less than 4096 in a GPU environment.\n\n\ - `str`. Can be "auto:factor" or "auto". "factor" is a float number larger than 1. This option will automatically determine the `sel`. In detail it counts the maximal number of neighbors with in the cutoff radius for each type of neighbor, then multiply the maximum by the "factor". Finally the number is wrapped up to 4 divisible. The option "auto" is equivalent to "auto:1.1".' @@ -338,7 +340,7 @@ def descrpt_se_a_args(): @descrpt_args_plugin.register( "se_e3", alias=["se_at", "se_a_3be", "se_t"], doc=doc_se_e3 ) -def descrpt_se_t_args(): +def descrpt_se_t_args() -> list[Argument]: doc_sel = 'This parameter set the number of selected neighbors for each type of atom. It can be:\n\n\ - `list[int]`. The length of the list should be the same as the number of atom types in the system. `sel[i]` gives the selected number of type-i neighbors. `sel[i]` is recommended to be larger than the maximally possible number of type-i neighbors in the cut-off radius. It is noted that the total sel value must be less than 4096 in a GPU environment.\n\n\ - `str`. Can be "auto:factor" or "auto". 
"factor" is a float number larger than 1. This option will automatically determine the `sel`. In detail it counts the maximal number of neighbors with in the cutoff radius for each type of neighbor, then multiply the maximum by the "factor". Finally the number is wrapped up to 4 divisible. The option "auto" is equivalent to "auto:1.1".' @@ -395,7 +397,7 @@ def descrpt_se_t_args(): @descrpt_args_plugin.register( "se_a_tpe", alias=["se_a_ebd"], doc=doc_only_tf_supported + doc_se_a_tpe ) -def descrpt_se_a_tpe_args(): +def descrpt_se_a_tpe_args() -> list[Argument]: doc_type_nchanl = "number of channels for type embedding" doc_type_nlayer = "number of hidden layers of type embedding net" doc_numb_aparam = "dimension of atomic parameter. if set to a value > 0, the atomic parameters are embedded." @@ -409,7 +411,7 @@ def descrpt_se_a_tpe_args(): @descrpt_args_plugin.register("se_e2_r", alias=["se_r"], doc=doc_se_e2_r) -def descrpt_se_r_args(): +def descrpt_se_r_args() -> list[Argument]: doc_sel = 'This parameter set the number of selected neighbors for each type of atom. It can be:\n\n\ - `list[int]`. The length of the list should be the same as the number of atom types in the system. `sel[i]` gives the selected number of type-i neighbors. `sel[i]` is recommended to be larger than the maximally possible number of type-i neighbors in the cut-off radius. It is noted that the total sel value must be less than 4096 in a GPU environment.\n\n\ - `str`. Can be "auto:factor" or "auto". "factor" is a float number larger than 1. This option will automatically determine the `sel`. In detail it counts the maximal number of neighbors with in the cutoff radius for each type of neighbor, then multiply the maximum by the "factor". Finally the number is wrapped up to 4 divisible. The option "auto" is equivalent to "auto:1.1".' 
@@ -468,7 +470,7 @@ def descrpt_se_r_args(): @descrpt_args_plugin.register("hybrid", doc=doc_hybrid) -def descrpt_hybrid_args(): +def descrpt_hybrid_args() -> list[Argument]: doc_list = "A list of descriptor definitions" return [ @@ -485,7 +487,7 @@ def descrpt_hybrid_args(): ] -def descrpt_se_atten_common_args(): +def descrpt_se_atten_common_args() -> list[Argument]: doc_sel = 'This parameter set the number of selected neighbors. Note that this parameter is a little different from that in other descriptors. Instead of separating each type of atoms, only the summation matters. And this number is highly related with the efficiency, thus one should not make it too large. Usually 200 or less is enough, far away from the GPU limitation 4096. It can be:\n\n\ - `int`. The maximum number of neighbor atoms to be considered. We recommend it to be less than 200. \n\n\ - `list[int]`. The length of the list should be the same as the number of atom types in the system. `sel[i]` gives the selected number of type-i neighbors. Only the summation of `sel[i]` matters, and it is recommended to be less than 200.\ @@ -560,7 +562,7 @@ def descrpt_se_atten_common_args(): @descrpt_args_plugin.register("se_atten", alias=["dpa1"], doc=doc_se_atten) -def descrpt_se_atten_args(): +def descrpt_se_atten_args() -> list[Argument]: doc_smooth_type_embedding = f"Whether to use smooth process in attention weights calculation. {doc_only_tf_supported} When using stripped type embedding, whether to dot smooth factor on the network output of type embedding to keep the network smooth, instead of setting `set_davg_zero` to be True." doc_set_davg_zero = "Set the normalization average to zero. 
This option should be set when `se_atten` descriptor or `atom_ener` in the energy fitting is used" doc_trainable_ln = ( @@ -682,7 +684,7 @@ def descrpt_se_atten_args(): @descrpt_args_plugin.register("se_e3_tebd", doc=doc_only_pt_supported) -def descrpt_se_e3_tebd_args(): +def descrpt_se_e3_tebd_args() -> list[Argument]: doc_sel = 'This parameter set the number of selected neighbors. Note that this parameter is a little different from that in other descriptors. Instead of separating each type of atoms, only the summation matters. And this number is highly related with the efficiency, thus one should not make it too large. Usually 200 or less is enough, far away from the GPU limitation 4096. It can be:\n\n\ - `int`. The maximum number of neighbor atoms to be considered. We recommend it to be less than 200. \n\n\ - `list[int]`. The length of the list should be the same as the number of atom types in the system. `sel[i]` gives the selected number of type-i neighbors. Only the summation of `sel[i]` matters, and it is recommended to be less than 200.\ @@ -796,7 +798,7 @@ def descrpt_se_e3_tebd_args(): @descrpt_args_plugin.register("se_atten_v2", doc=doc_se_atten_v2) -def descrpt_se_atten_v2_args(): +def descrpt_se_atten_v2_args() -> list[Argument]: doc_set_davg_zero = "Set the normalization average to zero. This option should be set when `se_atten` descriptor or `atom_ener` in the energy fitting is used" doc_trainable_ln = ( "Whether to use trainable shift and scale weights in layer normalization." @@ -880,7 +882,7 @@ def descrpt_se_atten_v2_args(): @descrpt_args_plugin.register("dpa2", doc=doc_only_pt_supported) -def descrpt_dpa2_args(): +def descrpt_dpa2_args() -> list[Argument]: # repinit args doc_repinit = "The arguments used to initialize the repinit block." # repformer args @@ -957,7 +959,7 @@ def descrpt_dpa2_args(): # repinit for dpa2 -def dpa2_repinit_args(): +def dpa2_repinit_args() -> list[Argument]: # repinit args doc_rcut = "The cut-off radius." 
doc_rcut_smth = "Where to start smoothing. For example the 1/r term is smoothed from `rcut` to `rcut_smth`." @@ -1101,7 +1103,7 @@ def dpa2_repinit_args(): # repformer for dpa2 -def dpa2_repformer_args(): +def dpa2_repformer_args() -> list[Argument]: # repformer args doc_rcut = "The cut-off radius." doc_rcut_smth = "Where to start smoothing. For example the 1/r term is smoothed from `rcut` to `rcut_smth`." @@ -1357,7 +1359,7 @@ def dpa2_repformer_args(): @descrpt_args_plugin.register("dpa3", doc=doc_only_pt_supported) -def descrpt_dpa3_args(): +def descrpt_dpa3_args() -> list[Argument]: # repflow args doc_repflow = "The arguments used to initialize the repflow block." # descriptor args @@ -1436,7 +1438,7 @@ def descrpt_dpa3_args(): # repflow for dpa3 -def dpa3_repflow_args(): +def dpa3_repflow_args() -> list[Argument]: # repflow args doc_n_dim = "The dimension of node representation." doc_e_dim = "The dimension of edge representation." @@ -1671,12 +1673,12 @@ def dpa3_repflow_args(): @descrpt_args_plugin.register( "se_a_ebd_v2", alias=["se_a_tpe_v2"], doc=doc_only_tf_supported ) -def descrpt_se_a_ebd_v2_args(): +def descrpt_se_a_ebd_v2_args() -> list[Argument]: return descrpt_se_a_args() @descrpt_args_plugin.register("se_a_mask", doc=doc_only_tf_supported + doc_se_a_mask) -def descrpt_se_a_mask_args(): +def descrpt_se_a_mask_args() -> list[Argument]: doc_sel = 'This parameter sets the number of selected neighbors for each type of atom. It can be:\n\n\ - `list[int]`. The length of the list should be the same as the number of atom types in the system. `sel[i]` gives the selected number of type-i neighbors. `sel[i]` is recommended to be larger than the maximally possible number of type-i neighbors in the cut-off radius. It is noted that the total sel value must be less than 4096 in a GPU environment.\n\n\ - `str`. Can be "auto:factor" or "auto". "factor" is a float number larger than 1. This option will automatically determine the `sel`. 
In detail it counts the maximal number of neighbors with in the cutoff radius for each type of neighbor, then multiply the maximum by the "factor". Finally the number is wrapped up to 4 divisible. The option "auto" is equivalent to "auto:1.1".' @@ -1743,9 +1745,10 @@ def descrpt_variant_type_args(exclude_hybrid: bool = False) -> Variant: @fitting_args_plugin.register("ener", doc=doc_ener) -def fitting_ener(): +def fitting_ener() -> list[Argument]: doc_numb_fparam = "The dimension of the frame parameter. If set to >0, file `fparam.npy` should be included to provided the input fparams." doc_numb_aparam = "The dimension of the atomic parameter. If set to >0, file `aparam.npy` should be included to provided the input aparams." + doc_default_fparam = "The default frame parameter. If set, when `fparam.npy` files are not included in the data system, this value will be used as the default value for the frame parameter in the fitting net." doc_dim_case_embd = "The dimension of the case embedding embedding. When training or fine-tuning a multitask model with case embedding embeddings, this number should be set to the number of model branches." doc_neuron = "The number of neurons in each hidden layers of the fitting net. When two hidden layers are of the same size, a skip connection is built." doc_activation_function = f'The activation function in the fitting net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())} Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version. If you set "None" or "none" here, no activation function will be used.' 
@@ -1773,6 +1776,13 @@ def fitting_ener(): return [ Argument("numb_fparam", int, optional=True, default=0, doc=doc_numb_fparam), Argument("numb_aparam", int, optional=True, default=0, doc=doc_numb_aparam), + Argument( + "default_fparam", + list[float], + optional=True, + default=None, + doc=doc_only_pt_supported + doc_default_fparam, + ), Argument( "dim_case_embd", int, @@ -1827,9 +1837,10 @@ def fitting_ener(): @fitting_args_plugin.register("dos", doc=doc_dos) -def fitting_dos(): +def fitting_dos() -> list[Argument]: doc_numb_fparam = "The dimension of the frame parameter. If set to >0, file `fparam.npy` should be included to provided the input fparams." doc_numb_aparam = "The dimension of the atomic parameter. If set to >0, file `aparam.npy` should be included to provided the input aparams." + doc_default_fparam = "The default frame parameter. If set, when `fparam.npy` files are not included in the data system, this value will be used as the default value for the frame parameter in the fitting net." doc_dim_case_embd = "The dimension of the case embedding embedding. When training or fine-tuning a multitask model with case embedding embeddings, this number should be set to the number of model branches." doc_neuron = "The number of neurons in each hidden layers of the fitting net. When two hidden layers are of the same size, a skip connection is built." doc_activation_function = f'The activation function in the fitting net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())} Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version. If you set "None" or "none" here, no activation function will be used.' 
@@ -1847,6 +1858,13 @@ def fitting_dos(): return [ Argument("numb_fparam", int, optional=True, default=0, doc=doc_numb_fparam), Argument("numb_aparam", int, optional=True, default=0, doc=doc_numb_aparam), + Argument( + "default_fparam", + list[float], + optional=True, + default=None, + doc=doc_only_pt_supported + doc_default_fparam, + ), Argument( "dim_case_embd", int, @@ -1882,9 +1900,10 @@ def fitting_dos(): @fitting_args_plugin.register("property", doc=doc_only_pt_supported) -def fitting_property(): +def fitting_property() -> list[Argument]: doc_numb_fparam = "The dimension of the frame parameter. If set to >0, file `fparam.npy` should be included to provided the input fparams." doc_numb_aparam = "The dimension of the atomic parameter. If set to >0, file `aparam.npy` should be included to provided the input aparams." + doc_default_fparam = "The default frame parameter. If set, when `fparam.npy` files are not included in the data system, this value will be used as the default value for the frame parameter in the fitting net." doc_dim_case_embd = "The dimension of the case embedding embedding. When training or fine-tuning a multitask model with case embedding embeddings, this number should be set to the number of model branches." doc_neuron = "The number of neurons in each hidden layers of the fitting net. When two hidden layers are of the same size, a skip connection is built" doc_activation_function = f'The activation function in the fitting net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())} Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version. If you set "None" or "none" here, no activation function will be used.' 
@@ -1900,6 +1919,13 @@ def fitting_property(): return [ Argument("numb_fparam", int, optional=True, default=0, doc=doc_numb_fparam), Argument("numb_aparam", int, optional=True, default=0, doc=doc_numb_aparam), + Argument( + "default_fparam", + list[float], + optional=True, + default=None, + doc=doc_only_pt_supported + doc_default_fparam, + ), Argument( "dim_case_embd", int, @@ -1944,9 +1970,10 @@ def fitting_property(): @fitting_args_plugin.register("polar", doc=doc_polar) -def fitting_polar(): +def fitting_polar() -> list[Argument]: doc_numb_fparam = "The dimension of the frame parameter. If set to >0, file `fparam.npy` should be included to provided the input fparams." doc_numb_aparam = "The dimension of the atomic parameter. If set to >0, file `aparam.npy` should be included to provided the input aparams." + doc_default_fparam = "The default frame parameter. If set, when `fparam.npy` files are not included in the data system, this value will be used as the default value for the frame parameter in the fitting net." doc_dim_case_embd = "The dimension of the case embedding embedding. When training or fine-tuning a multitask model with case embedding embeddings, this number should be set to the number of model branches." doc_neuron = "The number of neurons in each hidden layers of the fitting net. When two hidden layers are of the same size, a skip connection is built." doc_activation_function = f'The activation function in the fitting net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())} Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version. If you set "None" or "none" here, no activation function will be used.' 
@@ -1976,6 +2003,13 @@ def fitting_polar(): default=0, doc=doc_only_pt_supported + doc_numb_aparam, ), + Argument( + "default_fparam", + list[float], + optional=True, + default=None, + doc=doc_only_pt_supported + doc_default_fparam, + ), Argument( "dim_case_embd", int, @@ -2022,9 +2056,10 @@ def fitting_polar(): @fitting_args_plugin.register("dipole", doc=doc_dipole) -def fitting_dipole(): +def fitting_dipole() -> list[Argument]: doc_numb_fparam = "The dimension of the frame parameter. If set to >0, file `fparam.npy` should be included to provided the input fparams." doc_numb_aparam = "The dimension of the atomic parameter. If set to >0, file `aparam.npy` should be included to provided the input aparams." + doc_default_fparam = "The default frame parameter. If set, when `fparam.npy` files are not included in the data system, this value will be used as the default value for the frame parameter in the fitting net." doc_dim_case_embd = "The dimension of the case embedding embedding. When training or fine-tuning a multitask model with case embedding embeddings, this number should be set to the number of model branches." doc_neuron = "The number of neurons in each hidden layers of the fitting net. When two hidden layers are of the same size, a skip connection is built." doc_activation_function = f'The activation function in the fitting net. Supported activation functions are {list_to_doc(ACTIVATION_FN_DICT.keys())} Note that "gelu" denotes the custom operator version, and "gelu_tf" denotes the TF standard version. If you set "None" or "none" here, no activation function will be used.' 
@@ -2047,6 +2082,13 @@ def fitting_dipole(): default=0, doc=doc_only_pt_supported + doc_numb_aparam, ), + Argument( + "default_fparam", + list[float], + optional=True, + default=None, + doc=doc_only_pt_supported + doc_default_fparam, + ), Argument( "dim_case_embd", int, @@ -2083,7 +2125,7 @@ def fitting_dipole(): # YWolfeee: Delete global polar mode, merge it into polar mode and use loss setting to support. -def fitting_variant_type_args(): +def fitting_variant_type_args() -> Variant: doc_descrpt_type = "The type of the fitting." return Variant( @@ -2100,7 +2142,7 @@ def fitting_variant_type_args(): @modifier_args_plugin.register("dipole_charge", doc=doc_dipole_charge) -def modifier_dipole_charge(): +def modifier_dipole_charge() -> list[Argument]: doc_model_name = "The name of the frozen dipole model file." doc_model_charge_map = f"The charge of the WFCC. The list length should be the same as the {make_link('sel_type', 'model[standard]/fitting_net[dipole]/sel_type')}. " doc_sys_charge_map = f"The charge of real atoms. The list length should be the same as the {make_link('type_map', 'model/type_map')}" @@ -2118,7 +2160,7 @@ def modifier_dipole_charge(): ] -def modifier_variant_type_args(): +def modifier_variant_type_args() -> Variant: doc_modifier_type = "The type of modifier." return Variant( "type", @@ -2129,7 +2171,7 @@ def modifier_variant_type_args(): # --- model compression configurations: --- # -def model_compression(): +def model_compression() -> list[Argument]: doc_model_file = "The input model file, which will be compressed by the DeePMD-kit." doc_table_config = "The arguments of model compression, including extrapolate(scale of model extrapolation), stride(uniform stride of tabulation's first and second table), and frequency(frequency of tabulation overflow check)." 
doc_min_nbor_dist = ( @@ -2144,7 +2186,7 @@ def model_compression(): # --- model compression configurations: --- # -def model_compression_type_args(): +def model_compression_type_args() -> Variant: doc_compress_type = "The type of model compression, which should be consistent with the descriptor type." return Variant( @@ -2161,7 +2203,7 @@ def model_compression_type_args(): hybrid_model_args_plugin = ArgsPlugin() -def model_args(exclude_hybrid=False): +def model_args(exclude_hybrid: bool = False) -> list[Argument]: doc_type_map = "A list of strings. Give the name to each type of atoms. It is noted that the number of atom type of training system must be less than 128 in a GPU environment. If not given, type.raw in each system should use the same type indexes, and type_map.raw will take no effect." doc_data_stat_nbatch = "The model determines the normalization from the statistics of the data. This key specifies the number of `frames` in each `system` used for statistics." doc_data_stat_protect = "Protect parameter for atomic energy regression." @@ -2305,6 +2347,16 @@ def model_args(exclude_hybrid=False): def standard_model_args() -> Argument: doc_descrpt = "The descriptor of atomic environment." doc_fitting = "The fitting of physical properties." + doc_model_branch_alias = ( + "List of aliases for this model branch. " + "Multiple aliases can be defined, and any alias can reference this branch throughout the model usage. " + "Used only in multitask models." + ) + doc_info = ( + "Dictionary of metadata for this model branch. " + "Store arbitrary key-value pairs with branch-specific information. " + "Used only in multitask models." 
+ ) ca = Argument( "standard", @@ -2320,6 +2372,20 @@ def standard_model_args() -> Argument: [fitting_variant_type_args()], doc=doc_fitting, ), + Argument( + "model_branch_alias", + list[str], + optional=True, + default=[], + doc=doc_only_pt_supported + doc_model_branch_alias, + ), + Argument( + "info", + dict, + optional=True, + default={}, + doc=doc_only_pt_supported + doc_info, + ), ], doc="Standard model, which contains a descriptor and a fitting.", ) @@ -2411,7 +2477,7 @@ def linear_ener_model_args() -> Argument: # --- Learning rate configurations: --- # -def learning_rate_exp(): +def learning_rate_exp() -> list[Argument]: doc_start_lr = "The learning rate at the start of the training." doc_stop_lr = ( "The desired learning rate at the end of the training. " @@ -2443,7 +2509,7 @@ def learning_rate_exp(): return args -def learning_rate_variant_type_args(): +def learning_rate_variant_type_args() -> Variant: doc_lr = "The type of the learning rate." return Variant( @@ -2478,7 +2544,9 @@ def learning_rate_args(fold_subdoc: bool = False) -> Argument: # --- Loss configurations: --- # -def start_pref(item, label=None, abbr=None) -> str: +def start_pref( + item: str, label: Optional[str] = None, abbr: Optional[str] = None +) -> str: if label is None: label = item if abbr is None: @@ -2486,7 +2554,7 @@ def start_pref(item, label=None, abbr=None) -> str: return f"The prefactor of {item} loss at the start of the training. Should be larger than or equal to 0. If set to none-zero value, the {label} label should be provided by file {label}.npy in each data system. If both start_pref_{abbr} and limit_pref_{abbr} are set to 0, then the {item} will be ignored." -def limit_pref(item) -> str: +def limit_pref(item: str) -> str: return f"The prefactor of {item} loss at the limit of the training, Should be larger than or equal to 0. i.e. the training step goes to infinity." 
@@ -2494,7 +2562,7 @@ def limit_pref(item) -> str: @loss_args_plugin.register("ener") -def loss_ener(): +def loss_ener() -> list[Argument]: doc_start_pref_e = start_pref("energy", abbr="e") doc_limit_pref_e = limit_pref("energy") doc_start_pref_f = start_pref("force", abbr="f") @@ -2654,7 +2722,7 @@ def loss_ener(): @loss_args_plugin.register("ener_spin") -def loss_ener_spin(): +def loss_ener_spin() -> list[Argument]: doc_start_pref_e = start_pref("energy") doc_limit_pref_e = limit_pref("energy") doc_start_pref_fr = start_pref("force_real_atom") @@ -2766,7 +2834,7 @@ def loss_ener_spin(): @loss_args_plugin.register("dos") -def loss_dos(): +def loss_dos() -> list[Argument]: doc_start_pref_dos = start_pref("Density of State (DOS)") doc_limit_pref_dos = limit_pref("Density of State (DOS)") doc_start_pref_cdf = start_pref( @@ -2840,7 +2908,7 @@ def loss_dos(): @loss_args_plugin.register("property") -def loss_property(): +def loss_property() -> list[Argument]: doc_loss_func = "The loss function to minimize, such as 'mae','smooth_mae'." doc_metric = "The metric for display. This list can include 'smooth_mae', 'mae', 'mse' and 'rmse'." doc_beta = "The 'beta' parameter in 'smooth_mae' loss." @@ -2871,7 +2939,7 @@ def loss_property(): # YWolfeee: Modified to support tensor type of loss args. @loss_args_plugin.register("tensor") -def loss_tensor(): +def loss_tensor() -> list[Argument]: # doc_global_weight = "The prefactor of the weight of global loss. It should be larger than or equal to 0. If only `pref` is provided or both are not provided, training will be global mode, i.e. the shape of 'polarizability.npy` or `dipole.npy` should be #frams x [9 or 3]." # doc_local_weight = "The prefactor of the weight of atomic loss. It should be larger than or equal to 0. If only `pref_atomic` is provided, training will be atomic mode, i.e. the shape of `polarizability.npy` or `dipole.npy` should be #frames x ([9 or 3] x #selected atoms). 
If both `pref` and `pref_atomic` are provided, training will be combined mode, and atomic label should be provided as well." doc_global_weight = "The prefactor of the weight of global loss. It should be larger than or equal to 0. It controls the weight of loss corresponding to global label, i.e. 'polarizability.npy` or `dipole.npy`, whose shape should be #frames x [9 or 3]. If it's larger than 0.0, this npy should be included." @@ -2898,7 +2966,7 @@ def loss_tensor(): ] -def loss_variant_type_args(): +def loss_variant_type_args() -> Variant: doc_loss = "The type of the loss. When the fitting type is `ener`, the loss type should be set to `ener` or left unset. When the fitting type is `dipole` or `polar`, the loss type should be set to `tensor`." return Variant( @@ -2910,7 +2978,7 @@ def loss_variant_type_args(): ) -def loss_args(): +def loss_args() -> list[Argument]: doc_loss = "The definition of loss function. The loss type should be set to `tensor`, `ener` or left unset." ca = Argument( "loss", dict, [], [loss_variant_type_args()], optional=True, doc=doc_loss @@ -2919,7 +2987,9 @@ def loss_args(): # --- Training configurations: --- # -def training_data_args(): # ! added by Ziyao: new specification style for data systems. +def training_data_args() -> list[ + Argument +]: # ! added by Ziyao: new specification style for data systems. link_sys = make_link("systems", "training/training_data/systems") doc_systems = ( "The data systems for training. " @@ -2998,7 +3068,9 @@ def training_data_args(): # ! added by Ziyao: new specification style for data ) -def validation_data_args(): # ! added by Ziyao: new specification style for data systems. +def validation_data_args() -> list[ + Argument +]: # ! added by Ziyao: new specification style for data systems. link_sys = make_link("systems", "training/validation_data/systems") doc_systems = ( "The data systems for validation. " @@ -3088,7 +3160,7 @@ def validation_data_args(): # ! 
added by Ziyao: new specification style for dat ) -def mixed_precision_args(): # ! added by Denghui. +def mixed_precision_args() -> list[Argument]: # ! added by Denghui. doc_output_prec = 'The precision for mixed precision params. " \ "The trainable variables precision during the mixed precision training process, " \ "supported options are float32 only currently.' @@ -3117,8 +3189,8 @@ def mixed_precision_args(): # ! added by Denghui. def training_args( - multi_task=False, -): # ! modified by Ziyao: data configuration isolated. + multi_task: bool = False, +) -> list[Argument]: # ! modified by Ziyao: data configuration isolated. doc_numb_steps = "Number of training batch. Each training uses one batch of data." doc_seed = "The random seed for getting frames from the training data set." doc_disp_file = "The file for printing learning curve." @@ -3137,7 +3209,10 @@ def training_args( ) doc_disp_training = "Displaying verbose information during training." doc_time_training = "Timing during training." - doc_profiling = "Export the profiling results to the Chrome JSON file for performance analysis, driven by the legacy TensorFlow profiling API or PyTorch Profiler. The output file will be saved to `profiling_file`." + doc_disp_avg = ( + "Display the average loss over the display interval for training sets." + ) + doc_profiling = "Export the profiling results to the Chrome JSON file for performance analysis, driven by the legacy TensorFlow profiling API or PyTorch Profiler. The output file will be saved to `profiling_file`. In the PyTorch backend, when enable_profiler is True, this option is ignored, since the profiling results will be saved to the TensorBoard log." doc_profiling_file = "Output file for profiling." doc_enable_profiler = "Export the profiling results to the TensorBoard log for performance analysis, driven by TensorFlow Profiler (available in TensorFlow 2.3) or PyTorch Profiler. The log will be saved to `tensorboard_log_dir`." 
doc_tensorboard = "Enable tensorboard" @@ -3164,6 +3239,7 @@ def training_args( doc_kf_blocksize = "The blocksize for the Kalman filter." doc_model_prob = "The visiting probability of each model for each training step in the multi-task mode." doc_data_dict = "The multiple definition of the data, used in the multi-task mode." + doc_acc_freq = "Gradient accumulation steps (number of steps to accumulate gradients before performing an update)." arg_training_data = training_data_args() arg_validation_data = validation_data_args() @@ -3213,6 +3289,13 @@ def training_args( Argument( "time_training", bool, optional=True, default=True, doc=doc_time_training ), + Argument( + "disp_avg", + bool, + optional=True, + default=False, + doc=doc_only_pt_supported + doc_disp_avg, + ), Argument( "profiling", bool, @@ -3259,6 +3342,13 @@ def training_args( optional=True, doc=doc_only_pt_supported + doc_gradient_max_norm, ), + Argument( + "acc_freq", + int, + optional=True, + default=1, + doc=doc_only_pd_supported + doc_acc_freq, + ), ] variants = [ Variant( @@ -3291,7 +3381,7 @@ def training_args( return Argument("training", dict, args, variants, doc=doc_training) -def multi_model_args(): +def multi_model_args() -> list[Argument]: model_dict = model_args() model_dict.name = "model_dict" model_dict.repeat = True @@ -3312,7 +3402,7 @@ def multi_model_args(): ) -def multi_loss_args(): +def multi_loss_args() -> list[Argument]: loss_dict = loss_args() loss_dict.name = "loss_dict" loss_dict.repeat = True @@ -3321,14 +3411,20 @@ def multi_loss_args(): return loss_dict -def make_index(keys): +def make_index(keys: list[str]) -> str: ret = [] for ii in keys: ret.append(make_link(ii, ii)) return ", ".join(ret) -def gen_doc(*, make_anchor=True, make_link=True, multi_task=False, **kwargs) -> str: +def gen_doc( + *, + make_anchor: bool = True, + make_link: bool = True, + multi_task: bool = False, + **kwargs: Any, +) -> str: if make_link: make_anchor = True ptr = [] @@ -3344,7 +3440,7 @@ def 
gen_doc(*, make_anchor=True, make_link=True, multi_task=False, **kwargs) -> return "\n\n".join(ptr) -def gen_json(multi_task: bool = False, **kwargs) -> str: +def gen_json(multi_task: bool = False, **kwargs: Any) -> str: return json.dumps( tuple(gen_args(multi_task=multi_task)), cls=ArgumentEncoder, @@ -3397,7 +3493,7 @@ def gen_json_schema(multi_task: bool = False) -> str: return json.dumps(generate_json_schema(arg)) -def normalize(data, multi_task: bool = False): +def normalize(data: dict[str, Any], multi_task: bool = False) -> dict[str, Any]: base = Argument("base", dict, gen_args(multi_task=multi_task)) data = base.normalize_value(data, trim_pattern="_*") base.check_value(data, strict=True) diff --git a/deepmd/utils/batch_size.py b/deepmd/utils/batch_size.py index 516c4d2ead..860da030ba 100644 --- a/deepmd/utils/batch_size.py +++ b/deepmd/utils/batch_size.py @@ -6,6 +6,7 @@ abstractmethod, ) from typing import ( + Any, Callable, ) @@ -145,7 +146,12 @@ def _adjust_batch_size(self, factor: float) -> None: ) def execute_all( - self, callable: Callable, total_size: int, natoms: int, *args, **kwargs + self, + callable: Callable, + total_size: int, + natoms: int, + *args: Any, + **kwargs: Any, ) -> tuple[np.ndarray]: """Excuate a method with all given data. 
@@ -209,7 +215,7 @@ def execute_with_batch_size( result = (result,) if not isinstance(result, tuple) else result index += n_batch - def append_to_list(res_list, res): + def append_to_list(res_list: list[Any], res: Any) -> list[Any]: if n_batch: res_list.append(res) return res_list @@ -223,7 +229,7 @@ def append_to_list(res_list, res): assert results is not None assert returned_dict is not None - def concate_result(r): + def concate_result(r: list[Any]) -> Any: if array_api_compat.is_array_api_obj(r[0]): xp = array_api_compat.array_namespace(r[0]) ret = xp.concat(r, axis=0) diff --git a/deepmd/utils/compat.py b/deepmd/utils/compat.py index 72948c96f4..f93c30e047 100644 --- a/deepmd/utils/compat.py +++ b/deepmd/utils/compat.py @@ -375,10 +375,10 @@ def deprecate_numb_test( def update_deepmd_input( jdata: dict[str, Any], warning: bool = True, dump: Optional[Union[str, Path]] = None ) -> dict[str, Any]: - def is_deepmd_v0_input(jdata): + def is_deepmd_v0_input(jdata: dict[str, Any]) -> bool: return "model" not in jdata.keys() - def is_deepmd_v1_input(jdata): + def is_deepmd_v1_input(jdata: dict[str, Any]) -> bool: return "systems" in jdata["training"].keys() if is_deepmd_v0_input(jdata): diff --git a/deepmd/utils/data.py b/deepmd/utils/data.py index ccf65d3243..9b93c64507 100644 --- a/deepmd/utils/data.py +++ b/deepmd/utils/data.py @@ -4,6 +4,7 @@ import bisect import logging from typing import ( + Any, Optional, ) @@ -54,7 +55,7 @@ def __init__( shuffle_test: bool = True, type_map: Optional[list[str]] = None, optional_type_map: bool = True, - modifier=None, + modifier: Optional[Any] = None, trn_all_set: bool = False, sort_atoms: bool = True, ) -> None: @@ -145,7 +146,7 @@ def add( default: float = 0.0, dtype: Optional[np.dtype] = None, output_natoms_for_type_sel: bool = False, - ): + ) -> "DeepmdData": """Add a data item that to be loaded. 
Parameters @@ -188,7 +189,7 @@ def add( } return self - def reduce(self, key_out: str, key_in: str): + def reduce(self, key_out: str, key_in: str) -> "DeepmdData": """Generate a new item from the reduction of another atom. Parameters @@ -220,7 +221,7 @@ def get_data_dict(self) -> dict: """Get the `data_dict`.""" return self.data_dict - def check_batch_size(self, batch_size): + def check_batch_size(self, batch_size: int) -> bool: """Check if the system can get a batch of data with `batch_size` frames.""" for ii in self.dirs: if self.data_dict["coord"]["high_prec"]: @@ -235,7 +236,7 @@ def check_batch_size(self, batch_size): return ii, tmpe.shape[0] return None - def check_test_size(self, test_size): + def check_test_size(self, test_size: int) -> bool: """Check if the system can get a test dataset with `test_size` frames.""" return self.check_batch_size(test_size) @@ -352,11 +353,11 @@ def get_sys_numb_batch(self, batch_size: int) -> int: ret += self.get_numb_batch(batch_size, ii) return ret - def get_natoms(self): + def get_natoms(self) -> int: """Get number of atoms.""" return len(self.atom_type) - def get_natoms_vec(self, ntypes: int): + def get_natoms_vec(self, ntypes: int) -> np.ndarray: """Get number of atoms and number of atoms in different types. 
Parameters @@ -376,7 +377,7 @@ def get_natoms_vec(self, ntypes: int): tmp = np.append(tmp, natoms_vec) return tmp.astype(np.int32) - def avg(self, key): + def avg(self, key: str) -> float: """Return the average value of an item.""" if key not in self.data_dict.keys(): raise RuntimeError(f"key {key} has not been added") @@ -393,7 +394,7 @@ def avg(self, key): else: return np.average(eners, axis=0) - def _idx_map_sel(self, atom_type, type_sel): + def _idx_map_sel(self, atom_type: np.ndarray, type_sel: list[int]) -> np.ndarray: new_types = [] for ii in atom_type: if ii in type_sel: @@ -404,7 +405,7 @@ def _idx_map_sel(self, atom_type, type_sel): idx_map = np.lexsort((idx, new_types)) return idx_map - def _get_natoms_2(self, ntypes): + def _get_natoms_2(self, ntypes: int) -> tuple[int, np.ndarray]: sample_type = self.atom_type natoms = len(sample_type) natoms_vec = np.zeros(ntypes, dtype=np.int64) @@ -412,7 +413,9 @@ def _get_natoms_2(self, ntypes): natoms_vec[ii] = np.count_nonzero(sample_type == ii) return natoms, natoms_vec - def _get_subdata(self, data, idx=None): + def _get_subdata( + self, data: dict[str, Any], idx: Optional[np.ndarray] = None + ) -> dict[str, Any]: new_data = {} for ii in data: dd = data[ii] @@ -454,7 +457,7 @@ def _load_test_set(self, shuffle_test: bool) -> None: if shuffle_test: self.test_set, _ = self._shuffle_data(self.test_set) - def _shuffle_data(self, data): + def _shuffle_data(self, data: dict[str, Any]) -> dict[str, Any]: ret = {} nframes = data["coord"].shape[0] idx = np.arange(nframes, dtype=np.int64) @@ -473,7 +476,7 @@ def _shuffle_data(self, data): ret[kk] = data[kk] return ret, idx - def _get_nframes(self, set_name: DPPath): + def _get_nframes(self, set_name: DPPath) -> int: # get nframes if not isinstance(set_name, DPPath): set_name = DPPath(set_name) @@ -487,7 +490,7 @@ def _get_nframes(self, set_name: DPPath): nframes = coord.shape[0] return nframes - def reformat_data_torch(self, data): + def reformat_data_torch(self, data: 
dict[str, Any]) -> dict[str, Any]: """Modify the data format for the requirements of Torch backend. Parameters @@ -506,7 +509,7 @@ def reformat_data_torch(self, data): data["box"] = None return data - def _load_set(self, set_name: DPPath): + def _load_set(self, set_name: DPPath) -> dict[str, Any]: # get nframes if not isinstance(set_name, DPPath): set_name = DPPath(set_name) @@ -593,19 +596,19 @@ def _load_set(self, set_name: DPPath): def _load_data( self, - set_name, - key, - nframes, - ndof_, - atomic=False, - must=True, - repeat=1, - high_prec=False, - type_sel=None, + set_name: str, + key: str, + nframes: int, + ndof_: int, + atomic: bool = False, + must: bool = True, + repeat: int = 1, + high_prec: bool = False, + type_sel: Optional[list[int]] = None, default: float = 0.0, dtype: Optional[np.dtype] = None, output_natoms_for_type_sel: bool = False, - ): + ) -> np.ndarray: if atomic: natoms = self.natoms idx_map = self.idx_map @@ -704,16 +707,16 @@ def _load_data( data = np.repeat(data, repeat).reshape([nframes, -1]) return np.float32(0.0), data - def _load_type(self, sys_path: DPPath): + def _load_type(self, sys_path: DPPath) -> np.ndarray: atom_type = (sys_path / "type.raw").load_txt(ndmin=1).astype(np.int32) return atom_type - def _load_type_mix(self, set_name: DPPath): + def _load_type_mix(self, set_name: DPPath) -> np.ndarray: type_path = set_name / "real_atom_types.npy" real_type = type_path.load_numpy().astype(np.int32).reshape([-1, self.natoms]) return real_type - def _make_idx_map(self, atom_type): + def _make_idx_map(self, atom_type: np.ndarray) -> np.ndarray: natoms = atom_type.shape[0] idx = np.arange(natoms, dtype=np.int64) if self.sort_atoms: @@ -722,20 +725,20 @@ def _make_idx_map(self, atom_type): idx_map = idx return idx_map - def _load_type_map(self, sys_path: DPPath): + def _load_type_map(self, sys_path: DPPath) -> Optional[list[str]]: fname = sys_path / "type_map.raw" if fname.is_file(): return fname.load_txt(dtype=str, ndmin=1).tolist() 
else: return None - def _check_pbc(self, sys_path: DPPath): + def _check_pbc(self, sys_path: DPPath) -> bool: pbc = True if (sys_path / "nopbc").is_file(): pbc = False return pbc - def _check_mode(self, set_path: DPPath): + def _check_mode(self, set_path: DPPath) -> bool: return (set_path / "real_atom_types.npy").is_file() @@ -808,7 +811,7 @@ def to_dict(self) -> dict: "output_natoms_for_type_sel": self.output_natoms_for_type_sel, } - def __getitem__(self, key: str): + def __getitem__(self, key: str) -> np.ndarray: if key not in self.dict: raise KeyError(key) return self.dict[key] diff --git a/deepmd/utils/data_system.py b/deepmd/utils/data_system.py index 07dab35a90..cf6e81aad1 100644 --- a/deepmd/utils/data_system.py +++ b/deepmd/utils/data_system.py @@ -49,10 +49,10 @@ def __init__( shuffle_test: bool = True, type_map: Optional[list[str]] = None, optional_type_map: bool = True, - modifier=None, - trn_all_set=False, - sys_probs=None, - auto_prob_style="prob_sys_size", + modifier: Optional[Any] = None, + trn_all_set: bool = False, + sys_probs: Optional[list[float]] = None, + auto_prob_style: str = "prob_sys_size", sort_atoms: bool = True, ) -> None: """Constructor. 
@@ -152,6 +152,51 @@ def __init__( else: raise RuntimeError("batch size must be specified for mixed systems") self.batch_size = rule * np.ones(self.nsystems, dtype=int) + elif "max" == words[0]: + # Determine batch size so that batch_size * natoms <= rule, at least 1 + if len(words) != 2: + raise RuntimeError("batch size must be specified for max systems") + rule = int(words[1]) + bs = [] + for ii in self.data_systems: + ni = ii.get_natoms() + bsi = rule // ni + if bsi == 0: + bsi = 1 + bs.append(bsi) + self.batch_size = bs + elif "filter" == words[0]: + # Remove systems with natoms > rule, then set batch size like "max:rule" + if len(words) != 2: + raise RuntimeError( + "batch size must be specified for filter systems" + ) + rule = int(words[1]) + filtered_data_systems = [] + filtered_system_dirs = [] + for sys_dir, data_sys in zip(self.system_dirs, self.data_systems): + if data_sys.get_natoms() <= rule: + filtered_data_systems.append(data_sys) + filtered_system_dirs.append(sys_dir) + if len(filtered_data_systems) == 0: + raise RuntimeError( + f"No system left after removing systems with more than {rule} atoms" + ) + if len(filtered_data_systems) != len(self.data_systems): + warnings.warn( + f"Remove {len(self.data_systems) - len(filtered_data_systems)} systems with more than {rule} atoms" + ) + self.data_systems = filtered_data_systems + self.system_dirs = filtered_system_dirs + self.nsystems = len(self.data_systems) + bs = [] + for ii in self.data_systems: + ni = ii.get_natoms() + bsi = rule // ni + if bsi == 0: + bsi = 1 + bs.append(bsi) + self.batch_size = bs else: raise RuntimeError("unknown batch_size rule " + words[0]) elif isinstance(self.batch_size, list): @@ -224,7 +269,7 @@ def __init__( f"system {self.system_dirs[ii]} required test size is larger than the size of the dataset {chk_ret[0]} ({self.test_size[ii]} > {chk_ret[1]})" ) - def _load_test(self, ntests=-1) -> None: + def _load_test(self, ntests: int = -1) -> None: self.test_data = 
collections.defaultdict(list) for ii in range(self.nsystems): test_system_data = self.data_systems[ii].get_test(ntests=ntests) @@ -241,7 +286,9 @@ def default_mesh(self) -> list[np.ndarray]: for ii in range(self.nsystems) ] - def compute_energy_shift(self, rcond=None, key="energy"): + def compute_energy_shift( + self, rcond: Optional[float] = None, key: str = "energy" + ) -> tuple[np.ndarray, np.ndarray]: sys_ener = [] for ss in self.data_systems: sys_ener.append(ss.avg(key)) @@ -349,7 +396,7 @@ def add( output_natoms_for_type_sel=output_natoms_for_type_sel, ) - def reduce(self, key_out, key_in) -> None: + def reduce(self, key_out: str, key_in: str) -> None: """Generate a new item from the reduction of another atom. Parameters @@ -366,7 +413,9 @@ def get_data_dict(self, ii: int = 0) -> dict: return self.data_systems[ii].get_data_dict() def set_sys_probs( - self, sys_probs=None, auto_prob_style: str = "prob_sys_size" + self, + sys_probs: Optional[list[float]] = None, + auto_prob_style: str = "prob_sys_size", ) -> None: if sys_probs is None: if auto_prob_style == "prob_uniform": @@ -512,7 +561,9 @@ def _merge_batch_data(self, batch_data: list[dict]) -> dict: return b_data # ! altered by Marián Rynik - def get_test(self, sys_idx: Optional[int] = None, n_test: int = -1): # depreciated + def get_test( + self, sys_idx: Optional[int] = None, n_test: int = -1 + ) -> dict[str, np.ndarray]: # depreciated """Get test data from the the data systems. Parameters @@ -537,7 +588,7 @@ def get_test(self, sys_idx: Optional[int] = None, n_test: int = -1): # deprecia test_system_data["default_mesh"] = self.default_mesh[idx] return test_system_data - def get_sys_ntest(self, sys_idx=None): + def get_sys_ntest(self, sys_idx: Optional[int] = None) -> int: """Get number of tests for the currently selected system, or one defined by sys_idx. 
""" @@ -582,7 +633,7 @@ def print_summary(self, name: str) -> None: [ii.pbc for ii in self.data_systems], ) - def _make_auto_bs(self, rule): + def _make_auto_bs(self, rule: int) -> list[int]: bs = [] for ii in self.data_systems: ni = ii.get_natoms() @@ -593,7 +644,7 @@ def _make_auto_bs(self, rule): return bs # ! added by Marián Rynik - def _make_auto_ts(self, percent): + def _make_auto_ts(self, percent: float) -> list[int]: ts = [] for ii in range(self.nsystems): ni = self.batch_size[ii] * self.nbatches[ii] @@ -602,7 +653,9 @@ def _make_auto_ts(self, percent): return ts - def _check_type_map_consistency(self, type_map_list): + def _check_type_map_consistency( + self, type_map_list: list[Optional[list[str]]] + ) -> list[str]: ret = [] for ii in type_map_list: if ii is not None: @@ -619,7 +672,7 @@ def _check_type_map_consistency(self, type_map_list): return ret -def _format_name_length(name, width): +def _format_name_length(name: str, width: int) -> str: if len(name) <= width: return "{: >{}}".format(name, width) else: @@ -689,7 +742,7 @@ def print_summary( ) -def process_sys_probs(sys_probs, nbatch): +def process_sys_probs(sys_probs: list[float], nbatch: int) -> np.ndarray: sys_probs = np.array(sys_probs) type_filter = sys_probs >= 0 assigned_sum_prob = np.sum(type_filter * sys_probs) @@ -708,7 +761,7 @@ def process_sys_probs(sys_probs, nbatch): return ret_prob -def prob_sys_size_ext(keywords, nsystems, nbatch): +def prob_sys_size_ext(keywords: str, nsystems: int, nbatch: int) -> list[float]: block_str = keywords.split(";")[1:] block_stt = [] block_end = [] @@ -762,7 +815,11 @@ def process_systems( def get_data( - jdata: dict[str, Any], rcut, type_map, modifier, multi_task_mode=False + jdata: dict[str, Any], + rcut: float, + type_map: Optional[list[str]], + modifier: Optional[Any], + multi_task_mode: bool = False, ) -> DeepmdDataSystem: """Get the data system. 
diff --git a/deepmd/utils/econf_embd.py b/deepmd/utils/econf_embd.py index 5ff136b373..ce7e6a1aaf 100644 --- a/deepmd/utils/econf_embd.py +++ b/deepmd/utils/econf_embd.py @@ -1,5 +1,9 @@ # SPDX-License-Identifier: LGPL-3.0-or-later +from typing import ( + Union, +) + import numpy as np from mendeleev import ( element, @@ -9,6 +13,7 @@ "electronic_configuration_embedding", "make_econf_embedding", "normalized_electronic_configuration_embedding", + "sort_element_type", "transform_to_spin_rep", ] @@ -181,7 +186,7 @@ ECONF_DIM = electronic_configuration_embedding[type_map[0]].shape[0] -def normalize_vec_length(res): +def normalize_vec_length(res: dict[str, np.ndarray]) -> dict[str, np.ndarray]: scale = 1.0 / np.sqrt(ECONF_DIM) return {kk: scale * vv for kk, vv in res.items()} @@ -191,7 +196,7 @@ def normalize_vec_length(res): ) -def make_empty_list_vec(): +def make_empty_list_vec() -> dict[str, np.ndarray]: ret = {} for kk in conf_keys: ll = lett_to_ln[kk[1]] @@ -199,7 +204,7 @@ def make_empty_list_vec(): return ret -def flatten_list_vec(lv): +def flatten_list_vec(lv: dict[str, np.ndarray]) -> np.ndarray: ret = np.array([], dtype=np.int32) for kk in conf_keys: ret = np.append(ret, lv[kk]) @@ -240,7 +245,7 @@ def transform_to_spin_rep(res: dict[str, np.ndarray]) -> dict[str, np.ndarray]: """Transform electron occupation of 0/1/2 to -1,-1/-1,1/1,1.""" ret = {} - def transform(ii): + def transform(ii: int) -> list[int]: if ii == 0: return [-1, -1] elif ii == 1: @@ -263,3 +268,16 @@ def print_econf_embedding(res: dict[str, np.ndarray]) -> None: vvstr = ",".join([str(ii) for ii in vv]) space = " " * (2 - len(kk)) print(f'"{kk}"{space} : [{vvstr}],') # noqa: T201 + + +def sort_element_type(elements: list[str]) -> list[str]: + """Sort element types based on their atomic number.""" + + def get_atomic_number(symbol: str) -> Union[int, float]: + try: + return element(symbol).atomic_number + except ValueError: + return float("inf") + + sorted_elements = sorted(elements, key=lambda 
x: get_atomic_number(x)) + return sorted_elements diff --git a/deepmd/utils/finetune.py b/deepmd/utils/finetune.py index 644da3649d..c019cc68ab 100644 --- a/deepmd/utils/finetune.py +++ b/deepmd/utils/finetune.py @@ -36,35 +36,35 @@ def __init__( self.resuming = resuming self.update_type = self.p_type_map != self.type_map - def get_index_mapping(self): + def get_index_mapping(self) -> list[int]: """Returns the mapping index of newly defined types to those in the pretrained model.""" return get_index_between_two_maps(self.p_type_map, self.type_map)[0] - def get_has_new_type(self): + def get_has_new_type(self) -> bool: """Returns whether there are unseen types in the new type_map.""" return get_index_between_two_maps(self.p_type_map, self.type_map)[1] - def get_model_branch(self): + def get_model_branch(self) -> str: """Returns the chosen model branch.""" return self.model_branch - def get_random_fitting(self): + def get_random_fitting(self) -> bool: """Returns whether to use random fitting.""" return self.random_fitting - def get_resuming(self): + def get_resuming(self) -> bool: """Returns whether to only do resuming.""" return self.resuming - def get_update_type(self): + def get_update_type(self) -> bool: """Returns whether to update the type related params when loading from pretrained model with redundant types.""" return self.update_type - def get_pretrained_tmap(self): + def get_pretrained_tmap(self) -> list[str]: """Returns the type map in the pretrained model.""" return self.p_type_map - def get_finetune_tmap(self): + def get_finetune_tmap(self) -> list[str]: """Returns the type map in the fine-tuned model.""" return self.type_map @@ -72,7 +72,7 @@ def get_finetune_tmap(self): def get_index_between_two_maps( old_map: list[str], new_map: list[str], -): +) -> tuple[list[int], bool]: """Returns the mapping index of types in new_map to those in the old_map. 
Parameters @@ -110,7 +110,7 @@ def get_index_between_two_maps( def map_atom_exclude_types( atom_exclude_types: list[int], remap_index: list[int], -): +) -> list[int]: """Return the remapped atom_exclude_types according to remap_index. Parameters @@ -135,7 +135,7 @@ def map_atom_exclude_types( def map_pair_exclude_types( pair_exclude_types: list[tuple[int, int]], remap_index: list[int], -): +) -> list[tuple[int, int]]: """Return the remapped atom_exclude_types according to remap_index. Parameters diff --git a/deepmd/utils/model_branch_dict.py b/deepmd/utils/model_branch_dict.py new file mode 100644 index 0000000000..2b3390a85b --- /dev/null +++ b/deepmd/utils/model_branch_dict.py @@ -0,0 +1,321 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import textwrap +from collections import ( + OrderedDict, +) +from typing import ( + Any, + Optional, +) + + +def get_model_dict(model_dict: dict[str, Any]) -> tuple[dict[str, str], dict[str, str]]: + """ + Get the model branch alias dictionary from the model_dict. + + Returns + ------- + model_alias_dict: dict + A dictionary where the keys are the aliases and the values are the original model branch names. + model_branch_dict: dict + A dictionary where the keys are the original model branch names, and the values are dictionaries with: + - alias + the list of aliases of this model branch. + - info + the info dictionary of this model branch. + """ + model_alias_dict = {} + model_branch_dict = {} + for key in model_dict: + model_branch_dict[key] = {} + model_alias_dict[key] = key + alias_list = model_dict[key].get("model_branch_alias", []) + model_branch_dict[key]["alias"] = alias_list + branch_info = model_dict[key].get("info", {}) + model_branch_dict[key]["info"] = branch_info + for alias in alias_list: + assert alias not in model_alias_dict, ( + f"Alias {alias} for model_branch {key} already exists in model_branch {model_alias_dict[alias]}!" 
+ ) + model_alias_dict[alias] = key + + return model_alias_dict, model_branch_dict + + +# generated with GPT for formatted print +class OrderedDictTableWrapper: + """ + A wrapper for pretty-printing an OrderedDict that has a specific structure. + + Expected structure: + OrderedDict({ + "BranchName1": { + "alias": ["A", "B"], # Required key: alias (list of strings) + "info": { # Optional key: info (dict of arbitrary key-value pairs) + "description": "Some text", + "description2": "Some long text..." + } + }, + "BranchName2": { + "alias": ["C"], + "info": { "owner": "Alice" } + }, + ... + }) + + Features: + - Prints the data as an ASCII table with borders and aligned columns. + - The first two columns are fixed: "Model Branch Name" and "Alias". + - The remaining columns are all unique keys found in `info` across all branches (order preserved by first occurrence). + - Long text in cells is automatically wrapped to fit the column width, except column 1 & 2 auto-expanding to the **maximum content length** in that column. + - Missing info values are shown as empty strings. + """ + + def __init__( + self, data: "OrderedDict[str, dict[str, Any]]", col_width: int = 30 + ) -> None: + """ + Initialize the table wrapper. + + Args: + data: OrderedDict containing the branch data. + col_width: Maximum width of each column (characters). Longer text will wrap. 
+ """ + # Ensure we are working with an OrderedDict to preserve branch order + if not isinstance(data, OrderedDict): + data = OrderedDict(data) + self.data = data + self.col_width = col_width + + # Collect all unique keys from "info" across all branches in order of first appearance + seen = set() + self.info_keys: list[str] = [] + for _, payload in self.data.items(): + info = payload.get("info") or {} + for k in info.keys(): + if k not in seen: + seen.add(k) + self.info_keys.append(k) + + # Construct table header: fixed columns + dynamic info keys + self.headers: list[str] = ["Model Branch", "Alias", *self.info_keys] + + def _wrap_cell(self, text: Any, width: Optional[int] = None) -> list[str]: + """ + Convert a cell value into a list of wrapped text lines. + + Args: + text: Any value that will be converted to a string. + width: Optional custom wrap width. If None, defaults to `self.col_width`. + + Returns + ------- + A list of strings, each representing one wrapped line of the cell. + """ + text = "" if text is None else str(text) + eff_width = self.col_width if width is None else width + # If eff_width is very large, this effectively disables wrapping for that cell. + return textwrap.wrap(text, eff_width) or [""] + + def as_table(self) -> str: + """ + Generate a formatted ASCII table with borders and aligned columns. + + Returns + ------- + A string representation of the table. + """ + # Step 0: Precompute dynamic widths for the first two columns. 
+ # Column 0 (branch): width = max length over header + all branch names + branch_col_width = len(self.headers[0]) # "Model Branch Name" + for branch in self.data.keys(): + branch_col_width = max(branch_col_width, len(str(branch))) + + # Column 1 (alias): width = max length over header + all alias strings (joined by ", \n") + alias_col_width = len(self.headers[1]) # "Alias" + for payload in self.data.values(): + alias_list = payload.get("alias", []) + for alias in alias_list: + alias_col_width = max(alias_col_width, len(str(alias))) + + # Step 1: Create raw rows (without wrapping) + raw_rows: list[list[str]] = [] + # First row: header + raw_rows.append(self.headers) + + # Data rows + for branch, payload in self.data.items(): + alias_str = ", ".join(map(str, payload.get("alias", []))) + info = payload.get("info") or {} + row = [branch, alias_str] + [info.get(k, "") for k in self.info_keys] + raw_rows.append(row) + + # Step 2: Wrap each cell, using dynamic widths for the first two columns, + # and fixed `self.col_width` for info columns. + wrapped_rows: list[list[list[str]]] = [] + for row in raw_rows: + wrapped_row: list[list[str]] = [] + for j, cell in enumerate(row): + if j == 0: + # First column: branch name -> no wrap by using its max width + wrapped_row.append(self._wrap_cell(cell, width=branch_col_width)) + elif j == 1: + # Second column: alias -> no wrap by using its max width + wrapped_row.append(self._wrap_cell(cell, width=alias_col_width)) + else: + # Info columns: keep using fixed col_width (wrapping allowed) + wrapped_row.append(self._wrap_cell(cell)) + wrapped_rows.append(wrapped_row) + + # Step 3: Determine actual width for each column + # For the first two columns, we already decided the exact widths above. 
+ col_widths: list[int] = [] + for idx, col in enumerate(zip(*wrapped_rows)): + if idx == 0: + col_widths.append(branch_col_width) + elif idx == 1: + col_widths.append(alias_col_width) + else: + # Info columns: width is the maximum wrapped line length (<= self.col_width) + col_widths.append(max(len(line) for cell in col for line in cell)) + + # Helper: Draw a horizontal separator line + def draw_separator() -> str: + return "+" + "+".join("-" * (w + 2) for w in col_widths) + "+" + + # Helper: Draw one row of text parts (single lines per cell) + def draw_row_line(cells_parts: list[list[str]]) -> str: + return ( + "| " + + " | ".join( + part.ljust(width) for part, width in zip(cells_parts, col_widths) + ) + + " |" + ) + + # Step 4: Build the table string + table_lines = [] + table_lines.append(draw_separator()) + + for i, row_cells in enumerate(wrapped_rows): + # Determine the maximum number of wrapped lines in this row + max_lines = max(len(cell) for cell in row_cells) + # Draw each wrapped line + for line_idx in range(max_lines): + line_parts = [ + cell[line_idx] if line_idx < len(cell) else "" for cell in row_cells + ] + table_lines.append(draw_row_line(line_parts)) + table_lines.append(draw_separator()) + + return "\n".join(table_lines) + + +# Example usage +if __name__ == "__main__": + data = OrderedDict( + { + "Omat": { + "alias": ["Default", "Materials"], + "info": { + "observed-type": [ + "H", + "He", + "Li", + "Be", + "B", + "C", + "N", + "O", + "F", + "Ne", + "Na", + "Mg", + "Al", + "Si", + "P", + "S", + "Cl", + "Ar", + "K", + "Ca", + "Sc", + "Ti", + "V", + "Cr", + "Mn", + "Fe", + "Co", + "Ni", + "Cu", + "Zn", + "Ga", + "Ge", + "As", + "Se", + "Br", + "Kr", + "Rb", + "Sr", + "Y", + "Zr", + "Nb", + "Mo", + "Tc", + "Ru", + "Rh", + "Pd", + "Ag", + "Cd", + "In", + "Sn", + "Sb", + "Te", + "I", + "Xe", + "Cs", + "Ba", + "La", + "Ce", + "Pr", + "Nd", + "Pm", + "Sm", + "Eu", + "Gd", + "Tb", + "Dy", + "Ho", + "Er", + "Tm", + "Yb", + "Lu", + "Hf", + "Ta", + "W", + 
"Re", + "Os", + "Ir", + "Pt", + "Au", + "Hg", + "Tl", + "Pb", + "Bi", + "Th", + "Pa", + "U", + "Np", + "Pu", + "Ac", + ], + "description": "OMat24 is a large-scale open dataset containing over 110 million DFT calculations " + "spanning diverse structures and compositions. It is designed to support AI-driven " + "materials discovery by providing broad and deep coverage of chemical space.", + }, + }, + } + ) + + wrapper = OrderedDictTableWrapper(data, col_width=20) + print(wrapper.as_table()) # noqa:T201 diff --git a/deepmd/utils/model_stat.py b/deepmd/utils/model_stat.py index d2cc918b64..8061c7aa9c 100644 --- a/deepmd/utils/model_stat.py +++ b/deepmd/utils/model_stat.py @@ -2,11 +2,14 @@ from collections import ( defaultdict, ) +from typing import ( + Any, +) import numpy as np -def _make_all_stat_ref(data, nbatches): +def _make_all_stat_ref(data: Any, nbatches: int) -> dict[str, list[Any]]: all_stat = defaultdict(list) for ii in range(data.get_nsystems()): for jj in range(nbatches): @@ -18,7 +21,9 @@ def _make_all_stat_ref(data, nbatches): return all_stat -def make_stat_input(data, nbatches, merge_sys=True): +def make_stat_input( + data: Any, nbatches: int, merge_sys: bool = True +) -> dict[str, list[Any]]: """Pack data for statistics. 
Parameters @@ -57,7 +62,7 @@ def make_stat_input(data, nbatches, merge_sys=True): return all_stat -def merge_sys_stat(all_stat): +def merge_sys_stat(all_stat: dict[str, list[Any]]) -> dict[str, list[Any]]: first_key = next(iter(all_stat.keys())) nsys = len(all_stat[first_key]) ret = defaultdict(list) diff --git a/deepmd/utils/pair_tab.py b/deepmd/utils/pair_tab.py index 93c8b7a1f9..b261f62a40 100644 --- a/deepmd/utils/pair_tab.py +++ b/deepmd/utils/pair_tab.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import logging from typing import ( + Any, Optional, ) @@ -95,7 +96,7 @@ def serialize(self) -> dict: } @classmethod - def deserialize(cls, data) -> "PairTab": + def deserialize(cls, data: dict[str, Any]) -> "PairTab": data = data.copy() check_version_compatibility(data.pop("@version", 1), 1, 1) data.pop("@class") @@ -257,7 +258,7 @@ def _extrapolate_table(self, pad_extrapolation: np.array) -> np.array: ) return pad_extrapolation - def _make_data(self): + def _make_data(self) -> np.ndarray: data = np.zeros( [self.ntypes * self.ntypes * 4 * self.nspline], dtype=self.data_type ) diff --git a/deepmd/utils/path.py b/deepmd/utils/path.py index 87a44aa70d..e6b00cdf80 100644 --- a/deepmd/utils/path.py +++ b/deepmd/utils/path.py @@ -12,6 +12,7 @@ Path, ) from typing import ( + Any, ClassVar, Optional, Union, @@ -35,7 +36,7 @@ class DPPath(ABC): mode, by default "r" """ - def __new__(cls, path: str, mode: str = "r"): + def __new__(cls, path: str, mode: str = "r") -> "DPPath": if cls is DPPath: if os.path.isdir(path): return super().__new__(DPOSPath) @@ -56,7 +57,7 @@ def load_numpy(self) -> np.ndarray: """ @abstractmethod - def load_txt(self, **kwargs) -> np.ndarray: + def load_txt(self, **kwargs: Any) -> np.ndarray: """Load NumPy array from text. 
Returns @@ -115,7 +116,7 @@ def is_dir(self) -> bool: """Check if self is directory.""" @abstractmethod - def __getnewargs__(self): + def __getnewargs__(self) -> tuple[str, str]: """Return the arguments to be passed to __new__ when unpickling an instance.""" @abstractmethod @@ -133,10 +134,10 @@ def __str__(self) -> str: def __repr__(self) -> str: return f"{type(self)} ({self!s})" - def __eq__(self, other) -> bool: + def __eq__(self, other: object) -> bool: return str(self) == str(other) - def __hash__(self): + def __hash__(self) -> int: return hash(str(self)) @property @@ -173,7 +174,7 @@ def __init__(self, path: Union[str, Path], mode: str = "r") -> None: self.mode = mode self.path = Path(path) - def __getnewargs__(self): + def __getnewargs__(self) -> tuple[str, str]: return (self.path, self.mode) def load_numpy(self) -> np.ndarray: @@ -186,7 +187,7 @@ def load_numpy(self) -> np.ndarray: """ return np.load(str(self.path)) - def load_txt(self, **kwargs) -> np.ndarray: + def load_txt(self, **kwargs: Any) -> np.ndarray: """Load NumPy array from text. Returns @@ -311,7 +312,7 @@ def __init__(self, path: str, mode: str = "r") -> None: # h5 path: default is the root path self._name = s[1] if len(s) > 1 else "/" - def __getnewargs__(self): + def __getnewargs__(self) -> tuple[str, str]: return (self.root_path, self.mode) @classmethod @@ -341,7 +342,7 @@ def load_numpy(self) -> np.ndarray: """ return self.root[self._name][:] - def load_txt(self, dtype: Optional[np.dtype] = None, **kwargs) -> np.ndarray: + def load_txt(self, dtype: Optional[np.dtype] = None, **kwargs: Any) -> np.ndarray: """Load NumPy array from text. 
Returns @@ -416,7 +417,7 @@ def _keys(self) -> list[str]: __file_new_keys: ClassVar[dict[h5py.File, list[str]]] = {} @property - def _new_keys(self): + def _new_keys(self) -> list[str]: """New keys that haven't been cached.""" self.__file_new_keys.setdefault(self.root, []) return self.__file_new_keys[self.root] diff --git a/deepmd/utils/plugin.py b/deepmd/utils/plugin.py index 37ff784d61..3c27750bbb 100644 --- a/deepmd/utils/plugin.py +++ b/deepmd/utils/plugin.py @@ -7,6 +7,7 @@ ABCMeta, ) from typing import ( + Any, Callable, Optional, ) @@ -32,7 +33,7 @@ def xxx(): def __init__(self) -> None: self.plugins = {} - def __add__(self, other) -> "Plugin": + def __add__(self, other: "Plugin") -> "Plugin": self.plugins.update(other.plugins) return self @@ -56,7 +57,7 @@ def decorator(object: object) -> object: return decorator - def get_plugin(self, key) -> object: + def get_plugin(self, key: str) -> object: """Visit a plugin by key. Parameters @@ -73,7 +74,7 @@ def get_plugin(self, key) -> object: class VariantMeta: - def __call__(self, *args, **kwargs): + def __call__(self, *args: Any, **kwargs: Any) -> Any: """Remove `type` and keys that starts with underline.""" obj = self.__new__(self, *args, **kwargs) kwargs.pop("type", None) diff --git a/deepmd/utils/random.py b/deepmd/utils/random.py index 10ebdf0790..d6cc327034 100644 --- a/deepmd/utils/random.py +++ b/deepmd/utils/random.py @@ -14,7 +14,7 @@ def choice( size: Optional[Union[int, tuple[int, ...]]] = None, replace: bool = True, p: Optional[np.ndarray] = None, -): +) -> Union[np.ndarray, int]: """Generates a random sample from a given 1-D array. Parameters @@ -40,7 +40,9 @@ def choice( return _RANDOM_GENERATOR.choice(a, size=size, replace=replace, p=p) -def random(size=None): +def random( + size: Optional[Union[int, tuple[int, ...]]] = None, +) -> Union[float, np.ndarray]: """Return random floats in the half-open interval [0.0, 1.0). 
Parameters diff --git a/deepmd/utils/spin.py b/deepmd/utils/spin.py index c6cbb32f52..040de6b95b 100644 --- a/deepmd/utils/spin.py +++ b/deepmd/utils/spin.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: LGPL-3.0-or-later import copy from typing import ( + Optional, Union, ) @@ -135,7 +136,9 @@ def init_atom_exclude_types_placeholder(self) -> None: """ self.atom_exclude_types_p = self.placeholder_type.tolist() - def get_pair_exclude_types(self, exclude_types=None) -> list[tuple[int, int]]: + def get_pair_exclude_types( + self, exclude_types: Optional[list[tuple[int, int]]] = None + ) -> list[tuple[int, int]]: """ Return the pair-wise exclusion types for descriptor. The placeholder types for those without spin are excluded. @@ -151,7 +154,9 @@ def get_pair_exclude_types(self, exclude_types=None) -> list[tuple[int, int]]: _exclude_types.append((tt[0], tt[1])) return _exclude_types - def get_atom_exclude_types(self, exclude_types=None) -> list[int]: + def get_atom_exclude_types( + self, exclude_types: Optional[list[int]] = None + ) -> list[int]: """ Return the atom-wise exclusion types for fitting before out_def. Both the placeholder types and spin types are excluded. @@ -164,7 +169,9 @@ def get_atom_exclude_types(self, exclude_types=None) -> list[int]: _exclude_types = list(set(_exclude_types)) return _exclude_types - def get_atom_exclude_types_placeholder(self, exclude_types=None) -> list[int]: + def get_atom_exclude_types_placeholder( + self, exclude_types: Optional[list[int]] = None + ) -> list[int]: """ Return the atom-wise exclusion types for fitting after out_def. The placeholder types for those without spin are excluded. @@ -177,14 +184,14 @@ def get_atom_exclude_types_placeholder(self, exclude_types=None) -> list[int]: _exclude_types = list(set(_exclude_types)) return _exclude_types - def get_spin_mask(self): + def get_spin_mask(self) -> np.ndarray: """ Return the spin mask of shape [ntypes], with spin types being 1, and non-spin types being 0. 
""" return self.spin_mask - def get_virtual_scale_mask(self): + def get_virtual_scale_mask(self) -> np.ndarray: """ Return the virtual scale mask of shape [ntypes], with spin types being its virtual scale, and non-spin types being 0. diff --git a/deepmd/utils/summary.py b/deepmd/utils/summary.py index 51171ccc19..c00e6deb9e 100644 --- a/deepmd/utils/summary.py +++ b/deepmd/utils/summary.py @@ -56,7 +56,7 @@ class SummaryPrinter(ABC): "build variant": GLOBAL_CONFIG["dp_variant"], } - def __call__(self): + def __call__(self) -> None: """Print build and current running cluster configuration summary.""" nodename, nodelist = get_host_names() build_info = self.BUILD.copy() diff --git a/deepmd/utils/tabulate.py b/deepmd/utils/tabulate.py index 6e5e988b4a..d15794a9e5 100644 --- a/deepmd/utils/tabulate.py +++ b/deepmd/utils/tabulate.py @@ -7,6 +7,10 @@ from functools import ( lru_cache, ) +from typing import ( + Any, + Optional, +) import numpy as np from scipy.special import ( @@ -21,11 +25,11 @@ class BaseTabulate(ABC): def __init__( self, - descrpt, - neuron, - type_one_side, - exclude_types, - is_pt, + descrpt: Any, + neuron: list[int], + type_one_side: bool, + exclude_types: list[list[int]], + is_pt: bool, ) -> None: """Constructor.""" super().__init__() @@ -239,7 +243,16 @@ def build( return self.lower, self.upper def _build_lower( - self, net, xx, idx, upper, lower, stride0, stride1, extrapolate, nspline + self, + net: int, + xx: np.ndarray, + idx: int, + upper: float, + lower: float, + stride0: int, + stride1: int, + extrapolate: bool, + nspline: int, ) -> None: vv, dd, d2 = self._make_data(xx, idx) self.data[net] = np.zeros( @@ -334,7 +347,9 @@ def _build_lower( self.lower[net] = lower @abstractmethod - def _make_data(self, xx, idx) -> tuple[np.ndarray, np.ndarray, np.ndarray]: + def _make_data( + self, xx: np.ndarray, idx: int + ) -> tuple[np.ndarray, np.ndarray, np.ndarray]: """Generate tabulation data for the given input. 
Parameters @@ -368,16 +383,16 @@ def _all_excluded(self, ii: int) -> bool: return all((ii, type_i) in self.exclude_types for type_i in range(self.ntypes)) @abstractmethod - def _get_descrpt_type(self): + def _get_descrpt_type(self) -> str: """Get the descrpt type.""" pass @abstractmethod - def _get_layer_size(self): + def _get_layer_size(self) -> int: """Get the number of embedding layer.""" pass - def _get_table_size(self): + def _get_table_size(self) -> int: table_size = 0 if self.descrpt_type in ("Atten", "AEbdV2"): table_size = 1 @@ -395,30 +410,30 @@ def _get_table_size(self): raise RuntimeError("Unsupported descriptor") return table_size - def _get_data_type(self): + def _get_data_type(self) -> Optional[type]: for item in self.matrix["layer_" + str(self.layer_size)]: if len(item) != 0: return type(item[0][0]) return None - def _get_last_layer_size(self): + def _get_last_layer_size(self) -> int: for item in self.matrix["layer_" + str(self.layer_size)]: if len(item) != 0: return item.shape[1] return 0 @abstractmethod - def _get_bias(self): + def _get_bias(self) -> dict[str, Any]: """Get bias of embedding net.""" pass @abstractmethod - def _get_matrix(self): + def _get_matrix(self) -> dict[str, Any]: """Get weight matrx of embedding net.""" pass @abstractmethod - def _convert_numpy_to_tensor(self): + def _convert_numpy_to_tensor(self) -> None: """Convert self.data from np.ndarray to torch.Tensor.""" pass @@ -427,7 +442,7 @@ def _convert_numpy_float_to_int(self) -> None: self.lower = {k: int(v) for k, v in self.lower.items()} self.upper = {k: int(v) for k, v in self.upper.items()} - def _get_env_mat_range(self, min_nbor_dist): + def _get_env_mat_range(self, min_nbor_dist: float) -> tuple[np.ndarray, np.ndarray]: """Change the embedding net range to sw / min_nbor_dist.""" sw = self._spline5_switch(min_nbor_dist, self.rcut_smth, self.rcut) if self.descrpt_type in ("Atten", "A", "AEbdV2"): @@ -447,7 +462,7 @@ def _get_env_mat_range(self, min_nbor_dist): # returns 
element-wise lower and upper return np.floor(lower), np.ceil(upper) - def _spline5_switch(self, xx, rmin, rmax): + def _spline5_switch(self, xx: float, rmin: float, rmax: float) -> float: if xx < rmin: vv = 1 elif xx < rmax: diff --git a/deepmd/utils/update_sel.py b/deepmd/utils/update_sel.py index c9213de699..1d5d9bef01 100644 --- a/deepmd/utils/update_sel.py +++ b/deepmd/utils/update_sel.py @@ -5,6 +5,7 @@ abstractmethod, ) from typing import ( + Any, Optional, Union, ) @@ -55,7 +56,7 @@ def update_one_sel( ) return min_nbor_dist, sel - def parse_auto_sel(self, sel) -> bool: + def parse_auto_sel(self, sel: Any) -> bool: if not isinstance(sel, str): return False words = sel.split(":") @@ -64,7 +65,7 @@ def parse_auto_sel(self, sel) -> bool: else: return False - def parse_auto_sel_ratio(self, sel): + def parse_auto_sel_ratio(self, sel: Any) -> float: if not self.parse_auto_sel(sel): raise RuntimeError(f"invalid auto sel format {sel}") else: @@ -77,7 +78,7 @@ def parse_auto_sel_ratio(self, sel): raise RuntimeError(f"invalid auto sel format {sel}") return ratio - def wrap_up_4(self, xx): + def wrap_up_4(self, xx: int) -> int: return 4 * ((int(xx) + 3) // 4) def get_nbor_stat( @@ -131,7 +132,7 @@ def neighbor_stat(self) -> type[NeighborStat]: def get_min_nbor_dist( self, train_data: DeepmdDataSystem, - ): + ) -> float: min_nbor_dist, _ = self.get_nbor_stat( train_data, None, # type_map doesn't affect min_nbor_dist diff --git a/doc/env.md b/doc/env.md index 4ca7101236..1688e0af9c 100644 --- a/doc/env.md +++ b/doc/env.md @@ -88,5 +88,37 @@ These environment variables also apply to third-party programs using the C++ int **Type**: List of paths, split by `:` on Unix and `;` on Windows List of customized OP plugin libraries to load, such as `/path/to/plugin1.so:/path/to/plugin2.so` on Linux and `/path/to/plugin1.dll;/path/to/plugin2.dll` on Windows. 
+::: + +:::{envvar} DP_PROFILER + +{{ pytorch_icon }} Enable the built-in PyTorch Kineto profiler for the PyTorch C++ (inference) backend. + +**Type**: string (output file stem) + +**Default**: unset (disabled) + +When set to a non-empty value, profiling is enabled for the lifetime of the loaded PyTorch model (e.g. during LAMMPS runs). A JSON trace file is created on finish. The final file name is constructed as: + +- `_gpu.json` if running on GPU +- `.json` if running on CPU + +The trace can be examined with [Chrome trace viewer](https://ui.perfetto.dev/) (alternatively chrome://tracing). It includes: + +- CPU operator activities +- CUDA activities (if available) + +Example: + +```bash +export DP_PROFILER=result +mpirun -np 4 lmp -in in.lammps +# Produces result_gpuX.json, where X is the GPU id used by each MPI rank. +``` + +Tips: + +- Large runs can generate sizable JSON files; consider limiting the number of MD steps (e.g. to 20). +- Currently this feature only supports a single process, or multi-process runs where each process uses a distinct GPU on the same node. ::: diff --git a/doc/index.rst b/doc/index.rst index 38ff2fe97e..238dc0d25d 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -16,6 +16,7 @@ DeePMD-kit is a package written in Python/C++, designed to minimize the effort r - Han Wang, Linfeng Zhang, Jiequn Han, and Weinan E. "DeePMD-kit: A deep learning package for many-body potential energy representation and molecular dynamics." Computer Physics Communications 228 (2018): 178-184. - Jinzhe Zeng, Duo Zhang, Denghui Lu, Pinghui Mo, Zeyu Li, Yixiao Chen, Marián Rynik, Li'ang Huang, Ziyao Li, Shaochen Shi, Yingze Wang, Haotian Ye, Ping Tuo, Jiabin Yang, Ye Ding, Yifan Li, Davide Tisi, Qiyu Zeng, Han Bao, Yu Xia, Jiameng Huang, Koki Muraoka, Yibo Wang, Junhan Chang, Fengbo Yuan, Sigbjørn Løland Bore, Chun Cai, Yinnian Lin, Bo Wang, Jiayan Xu, Jia-Xin Zhu, Chenxing Luo, Yuzhi Zhang, Rhys E. A.
Goodall, Wenshuo Liang, Anurag Kumar Singh, Sikai Yao, Jingchao Zhang, Renata Wentzcovitch, Jiequn Han, Jie Liu, Weile Jia, Darrin M. York, Weinan E, Roberto Car, Linfeng Zhang, Han Wang. "DeePMD-kit v2: A software package for Deep Potential models." J. Chem. Phys., 159, 054801 (2023). + - Jinzhe Zeng, Duo Zhang, Anyang Peng, Xiangyu Zhang, Sensen He, Yan Wang, Xinzijian Liu, Hangrui Bi, Yifan Li, Chun Cai, Chengqian Zhang, Yiming Du, Jia-Xin Zhu, Pinghui Mo, Zhengtao Huang, Qiyu Zeng, Shaochen Shi, Xuejian Qin, Zhaoxi Yu, Chenxing Luo, Ye Ding, Yun-Pei Liu, Ruosong Shi, Zhenyu Wang, Sigbjørn Løland Bore, Junhan Chang, Zhe Deng, Zhaohan Ding, Siyuan Han, Wanrun Jiang, Guolin Ke, Zhaoqing Liu, Denghui Lu, Koki Muraoka, Hananeh Oliaei, Anurag Kumar Singh, Haohui Que, Weihong Xu, Zhangmancang Xu, Yong-Bin Zhuang, Jiayu Dai, Timothy J. Giese, Weile Jia, Ben Xu, Darrin M. York, Linfeng Zhang, Han Wang. "DeePMD-kit v3: A Multiple-Backend Framework for Machine Learning Potentials." J. Chem. Theory Comput. 21 (2025): 4375-4385. In addition, please follow :ref:`this page ` to cite the methods you used. diff --git a/doc/inference/python.md b/doc/inference/python.md index b2603c85f8..361db7b64f 100644 --- a/doc/inference/python.md +++ b/doc/inference/python.md @@ -19,6 +19,21 @@ e, f, v = dp.eval(coord, cell, atype) where `e`, `f` and `v` are predicted energy, force and virial of the system, respectively. +One can also evaluate the descriptors of the model: + +```python +from deepmd.infer import DeepPot +import numpy as np + +dp = DeepPot("graph.pb") +coord = np.array([[1, 0, 0], [0, 0, 1.5], [1, 0, 3]]).reshape([1, -1]) +cell = np.diag(10 * np.ones(3)).reshape([1, -1]) +atype = [1, 0, 1] +descriptors = dp.eval_descriptor(coord, cell, atype) +``` + +where `descriptors` is the descriptor matrix of the system. This can also be done using the command line interface `dp eval-desc` as described in the [test documentation](../test/test.md). 
+ Furthermore, one can use the python interface to calculate model deviation. ```python diff --git a/doc/install/easy-install.md b/doc/install/easy-install.md index d28c0d0773..9f4d6d8861 100644 --- a/doc/install/easy-install.md +++ b/doc/install/easy-install.md @@ -185,7 +185,10 @@ Switch to the TensorFlow {{ tensorflow_icon }} tab for more information. ::::{tab-item} CUDA 12.6 ```bash -pip install paddlepaddle-gpu==3.0.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ +# release version +pip install paddlepaddle-gpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ +# nightly-build version +# pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/ pip install deepmd-kit ``` @@ -194,7 +197,10 @@ pip install deepmd-kit ::::{tab-item} CUDA 11.8 ```bash -pip install paddlepaddle-gpu==3.0.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/ +# release version +pip install paddlepaddle-gpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/ +# nightly-build version +# pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu118/ pip install deepmd-kit ``` @@ -203,7 +209,10 @@ pip install deepmd-kit ::::{tab-item} CPU ```bash -pip install paddlepaddle==3.0.0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/ +# release version +pip install paddlepaddle==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/ +# nightly-build version +# pip install --pre paddlepaddle -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/ pip install deepmd-kit ``` @@ -227,4 +236,4 @@ If your platform is not supported, or you want to build against the installed ba pip install deepmd-kit[gpu,cu12,lmp,ipi] ``` -MPICH is required for parallel running. +MPICH will be installed automatically - you do not need to install an MPI library yourself.
diff --git a/doc/install/install-from-source.md b/doc/install/install-from-source.md index 1dc72c51fa..a21b8913db 100644 --- a/doc/install/install-from-source.md +++ b/doc/install/install-from-source.md @@ -99,11 +99,22 @@ To install Paddle, run ```sh # cu126 -pip install paddlepaddle-gpu==3.0.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ +# release version +pip install paddlepaddle-gpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ +# nightly-build version +# pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu126/ + # cu118 -pip install paddlepaddle-gpu==3.0.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/ +# release version +pip install paddlepaddle-gpu==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/ +# nightly-build version +# pip install --pre paddlepaddle-gpu -i https://www.paddlepaddle.org.cn/packages/nightly/cu118/ + # cpu -pip install paddlepaddle==3.0.0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/ +# release version +pip install paddlepaddle==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/ +# nightly-build version +# pip install --pre paddlepaddle -i https://www.paddlepaddle.org.cn/packages/nightly/cpu/ ``` ::: diff --git a/doc/install/install-lammps.md b/doc/install/install-lammps.md index 00b887e9c3..b2a88db240 100644 --- a/doc/install/install-lammps.md +++ b/doc/install/install-lammps.md @@ -17,11 +17,11 @@ DeePMD-kit will generate a module called `USER-DEEPMD` in the `build` directory, ```bash cd /some/workspace -wget https://github.com/lammps/lammps/archive/stable_29Aug2024_update1.tar.gz -tar xf stable_29Aug2024_update1.tar.gz +wget https://github.com/lammps/lammps/archive/stable_22Jul2025_update1.tar.gz +tar xf stable_22Jul2025_update1.tar.gz ``` -The source code of LAMMPS is stored in the directory `lammps-stable_29Aug2024_update1`. +The source code of LAMMPS is stored in the directory `lammps-stable_22Jul2025_update1`. 
Then, you can [build LAMMPS](https://docs.lammps.org/Build.html) with either make or CMake. @@ -30,7 +30,7 @@ Then, you can [build LAMMPS](https://docs.lammps.org/Build.html) with either mak Now go into the LAMMPS code and copy the DeePMD-kit module like this ```bash -cd lammps-stable_29Aug2024_update1/src/ +cd lammps-stable_22Jul2025_update1/src/ cp -r $deepmd_source_dir/source/build/USER-DEEPMD . make yes-kspace make yes-extra-fix @@ -60,8 +60,8 @@ make no-user-deepmd Now go into the LAMMPS directory and create a directory called `build`: ```bash -mkdir -p lammps-stable_29Aug2024_update1/build/ -cd lammps-stable_29Aug2024_update1/build/ +mkdir -p lammps-stable_22Jul2025_update1/build/ +cd lammps-stable_22Jul2025_update1/build/ ``` Patch the LAMMPS `CMakeLists.txt` file: @@ -94,15 +94,15 @@ Now download the LAMMPS code (`8Apr2021` or later), and uncompress it: ```bash cd /some/workspace -wget https://github.com/lammps/lammps/archive/stable_29Aug2024_update1.tar.gz -tar xf stable_29Aug2024_update1.tar.gz +wget https://github.com/lammps/lammps/archive/stable_22Jul2025_update1.tar.gz +tar xf stable_22Jul2025_update1.tar.gz ``` -The source code of LAMMPS is stored in the directory `lammps-stable_29Aug2024_update1`. The directory of the source code should be specified as the CMAKE argument `LAMMPS_SOURCE_ROOT` during installation of the DeePMD-kit C++ interface. Now go into the LAMMPS directory and create a directory called `build` +The source code of LAMMPS is stored in the directory `lammps-stable_22Jul2025_update1`. The directory of the source code should be specified as the CMAKE argument `LAMMPS_SOURCE_ROOT` during installation of the DeePMD-kit C++ interface. Now go into the LAMMPS directory and create a directory called `build` ```bash -mkdir -p lammps-stable_29Aug2024_update1/build/ -cd lammps-stable_29Aug2024_update1/build/ +mkdir -p lammps-stable_22Jul2025_update1/build/ +cd lammps-stable_22Jul2025_update1/build/ ``` Now build LAMMPS. 
Note that `PLUGIN` must be enabled, and `BUILD_SHARED_LIBS` must be set to `yes`. You can install any other package you want. diff --git a/doc/model/change-bias.md b/doc/model/change-bias.md index ac28201cb6..2a9b098606 100644 --- a/doc/model/change-bias.md +++ b/doc/model/change-bias.md @@ -1,7 +1,7 @@ -# Change the model output bias for trained model {{ pytorch_icon }} +# Change the model output bias for trained model {{ tensorflow_icon }} {{ pytorch_icon }} :::{note} -**Supported backends**: PyTorch {{ pytorch_icon }} +**Supported backends**: TensorFlow {{ tensorflow_icon }}, PyTorch {{ pytorch_icon }} ::: The output bias of a trained model typically originates from the statistical results of the training dataset. @@ -10,32 +10,45 @@ There are several scenarios where one might want to adjust the output bias after such as zero-shot testing (similar to the procedure before the first step in fine-tuning) or manually setting the output bias. -The `dp --pt change-bias` command supports the following methods for adjusting the bias: +The `dp change-bias` command supports the following methods for adjusting the bias: ::::{tab-set} -:::{tab-item} Changing bias using provided systems for trained `.pt`/`.pth` models: +:::{tab-item} TensorFlow Backend {{ tensorflow_icon }} + +**Changing bias using provided systems for trained checkpoint:** ```sh -dp --pt change-bias model.pt -s data_dir -o model_updated.pt +dp --tf change-bias model.ckpt -s data_dir -o model_updated.pb ``` -For multitask models, where `--model-branch` must be specified: +**Changing bias using user input for energy model:** ```sh -dp --pt change-bias multi_model.pt -s data_dir -o model_updated.pt --model-branch model_1 +dp --tf change-bias model.ckpt -b -92.523 -187.66 -o model_updated.pb ``` ::: -:::{tab-item} Changing bias using user input for **energy model**: +:::{tab-item} PyTorch Backend {{ pytorch_icon }} + +**Changing bias using provided systems for trained `.pt`/`.pth` models:** + +```sh +dp --pt 
change-bias model.pt -s data_dir -o model_updated.pt +``` + +**Changing bias using user input for energy model:** ```sh dp --pt change-bias model.pt -b -92.523 -187.66 -o model_updated.pt ``` -Here, `-b` specifies user-defined energy bias for each type, separated by space, -in an order consistent with the `type_map` in the model. +For multitask models, where `--model-branch` must be specified: + +```sh +dp --pt change-bias multi_model.pt -s data_dir -o model_updated.pt --model-branch model_1 +``` ::: diff --git a/doc/model/dpa3.md b/doc/model/dpa3.md index c63b26f90a..0ff46c438f 100644 --- a/doc/model/dpa3.md +++ b/doc/model/dpa3.md @@ -1,4 +1,4 @@ -# Descriptor DPA3 {{ pytorch_icon }} {{ jax_icon }} {{ dpmodel_icon }} +# Descriptor DPA3 {{ pytorch_icon }} {{ jax_icon }} {{ paddle_icon }} {{ dpmodel_icon }} :::{note} **Supported backends**: PyTorch {{ pytorch_icon }}, JAX {{ jax_icon }}, DP {{ dpmodel_icon }} @@ -40,7 +40,11 @@ Virial RMSEs were averaged exclusively for systems containing virial labels (`Al Note that we set `float32` in all DPA3 models, while `float64` in other models by default. -## Requirements of installation from source code {{ pytorch_icon }} +## Requirements of installation from source code {{ pytorch_icon }} {{ paddle_icon }} + +::::{tab-set} + +:::{tab-item} PyTorch {{ pytorch_icon }} To run the DPA3 model on LAMMPS via source code installation (users can skip this step if using [easy installation](../install/easy-install.md)), @@ -53,6 +57,25 @@ If one runs LAMMPS with MPI, the customized OP library for the C++ interface sho If one runs LAMMPS with MPI and CUDA devices, it is recommended to compile the customized OP library for the C++ interface with a [CUDA-Aware MPI](https://developer.nvidia.com/mpi-solutions-gpus) library and CUDA, otherwise the communication between GPU cards falls back to the slower CPU implementation. 
+::: + +:::{tab-item} Paddle {{ paddle_icon }} + +The customized OP library for the Python interface can be installed by + +```sh +cd deepmd-kit/source/op/pd +python setup.py install +``` + +If one runs LAMMPS with MPI, the customized OP library for the C++ interface should be compiled against the same MPI library as the runtime MPI. +If one runs LAMMPS with MPI and CUDA devices, it is recommended to compile the customized OP library for the C++ interface with a [CUDA-Aware MPI](https://developer.nvidia.com/mpi-solutions-gpus) library and CUDA, +otherwise the communication between GPU cards falls back to the slower CPU implementation. + +::: + +:::: + ## Limitations of the JAX backend with LAMMPS {{ jax_icon }} When using the JAX backend, 2 or more MPI ranks are not supported. One must set `map` to `yes` using the [`atom_modify`](https://docs.lammps.org/atom_modify.html) command. diff --git a/doc/model/dplr.md b/doc/model/dplr.md index cf071d4029..61327bb55e 100644 --- a/doc/model/dplr.md +++ b/doc/model/dplr.md @@ -69,7 +69,7 @@ The loss section is provided as follows }, ``` -so that the atomic dipole is trained as labels. Note that the NumPy compressed file `atomic_dipole.npy` should be provided in each dataset. +so that the atomic dipole is trained as labels. Note that the NumPy compressed file `atomic_dipole.npy` should be provided in each dataset. In the context of DPLR models, the atomic dipole data represents the displacement vector from each atom to its associated Wannier centroid (WC), which can be calculated as `atomic_dipole = wannier_centroid_position - atom_position` from DFT calculations using tools such as VASP with Wannier90. 
The training and freezing can be started from the example directory by diff --git a/doc/model/pairtab.md b/doc/model/pairtab.md index f52ad5dae7..57fe23f5e9 100644 --- a/doc/model/pairtab.md +++ b/doc/model/pairtab.md @@ -103,3 +103,10 @@ To combine with a pairwise potential, use the [linear model](./linear.md): ``` The {ref}`rcut ` can be larger than that of the DP model. + +:::{note} +The above shows an example of combining D3 dispersion. +However, it is more efficient to train a model using plain DFT calculations without the dispersion correction, and add the dispersion correction during the simulation via the LAMMPS [`pair_style dispersion/d3` command](https://docs.lammps.org/pair_dispersion_d3.html#pair-style-dispersion-d3-command). +Training against data with dispersion directly is discouraged. +See the [D3 dispersion section](../third-party/lammps-command.md#d3-dispersion) for details. +::: diff --git a/doc/model/show-model-info.md b/doc/model/show-model-info.md index 48aa41385d..67d82610de 100644 --- a/doc/model/show-model-info.md +++ b/doc/model/show-model-info.md @@ -11,12 +11,12 @@ dp --pt show - ``: Path to the model checkpoint file or frozen model file. - ``: One or more information categories to display. Supported values are: - - `model-branch`: Shows available branches for multi-task models. - `type-map`: Shows the type mapping used by the model. - `descriptor`: Displays the model descriptor parameters. - `fitting-net`: Displays parameters of the fitting network. - `size`: (Supported Backends: PyTorch and PaddlePaddle) Shows the parameter counts for various components. + - `observed-type`: (Supported Backends: PyTorch) Shows the observed types (elements) of the model during data statistics. Only energy models are supported now.
## Example Usage @@ -33,33 +33,32 @@ dp show frozen_model.pth type-map descriptor fitting-net size Depending on the provided attributes and the model type, the output includes: - **Model Type** - - Logs whether the loaded model is a _singletask_ or _multitask_ model. - **model-branch** - - _Only available for multitask models._ - Lists all available model branches and the special `"RANDOM"` branch, which refers to a randomly initialized fitting net. - **type-map** - - For multitask models: Shows the type map for each branch. - For singletask models: Shows the model's type map. - **descriptor** - - For multitask models: Displays the descriptor parameter for each branch. - For singletask models: Displays the descriptor parameter. - **fitting-net** - - For multitask models: Shows the fitting network parameters for each branch. - For singletask models: Shows the fitting network parameters. - **size** - - Prints the number of parameters for each component (`descriptor`, `fitting-net`, etc.), as well as the total parameter count. +- **observed-type** + - Displays the count and list of observed element types of the model during data statistics. + - For multitask models, it shows the observed types for each branch. + - Note: This info shows the types observed during training data statistics, which may differ from the type map. 
+ ## Example Output For a singletask model, the output might look like: @@ -73,6 +72,9 @@ Parameter counts: Parameters in descriptor: 19,350 Parameters in fitting-net: 119,091 Parameters in total: 138,441 +The observed types for this model: +Number of observed types: 2 +Observed types: ['H', 'O'] ``` For a multitask model, if `model-branch` is selected, it will additionally display available branches: diff --git a/doc/sphinx_contrib_exhale_multiproject.py b/doc/sphinx_contrib_exhale_multiproject.py index 23b91f9f8d..afa2344452 100644 --- a/doc/sphinx_contrib_exhale_multiproject.py +++ b/doc/sphinx_contrib_exhale_multiproject.py @@ -78,14 +78,22 @@ from pprint import ( pprint, ) +from typing import ( + TYPE_CHECKING, +) import exhale import exhale.configs import exhale.deploy import exhale.utils +if TYPE_CHECKING: + from sphinx.application import ( + Sphinx, + ) + -def exhale_environment_ready(app) -> None: +def exhale_environment_ready(app: "Sphinx") -> None: default_project = app.config.breathe_default_project default_exhale_args = dict(app.config.exhale_args) diff --git a/doc/test/test.md b/doc/test/test.md index dfd59d8f1f..9d399cb1ed 100644 --- a/doc/test/test.md +++ b/doc/test/test.md @@ -17,3 +17,25 @@ An explanation will be provided ```{program-output} dp test -h ``` + +## Evaluate descriptors + +The descriptors of a model can be evaluated and saved using `dp eval-desc`. A typical usage of `dp eval-desc` is + +```bash +dp eval-desc -m graph.pb -s /path/to/system -o desc +``` + +where `-m` gives the model file, `-s` the path to the system directory (or `-f` for a datafile containing paths to systems), and `-o` the output directory where descriptor files will be saved. The descriptors for each system will be saved as `.npy` files with the format `desc/(system_name).npy`. Each descriptor file contains a 3D array with shape (nframes, natoms, ndesc). 
+ +Several other command line options can be passed to `dp eval-desc`, which can be checked with + +```bash +$ dp eval-desc --help +``` + +An explanation will be provided + +```{program-output} dp eval-desc -h + +``` diff --git a/doc/third-party/ase.md b/doc/third-party/ase.md index 6ede63e2f9..183efa7cbb 100644 --- a/doc/third-party/ase.md +++ b/doc/third-party/ase.md @@ -6,6 +6,10 @@ See [Environment variables](../env.md) for the runtime environment variables. Deep potential can be set up as a calculator with ASE to obtain potential energies and forces. +::::{tab-set} + +:::{tab-item} TensorFlow {{ tensorflow_icon }} + ```python from ase import Atoms from deepmd.calculator import DP @@ -20,6 +24,46 @@ print(water.get_potential_energy()) print(water.get_forces()) ``` +::: + +:::{tab-item} PyTorch {{ pytorch_icon }} + +```python +from ase import Atoms +from deepmd.calculator import DP + +water = Atoms( + "H2O", + positions=[(0.7601, 1.9270, 1), (1.9575, 1, 1), (1.0, 1.0, 1.0)], + cell=[100, 100, 100], + calculator=DP(model="frozen_model.pth"), +) +print(water.get_potential_energy()) +print(water.get_forces()) +``` + +::: + +:::{tab-item} Paddle {{ paddle_icon }} + +```python +from ase import Atoms +from deepmd.calculator import DP + +water = Atoms( + "H2O", + positions=[(0.7601, 1.9270, 1), (1.9575, 1, 1), (1.0, 1.0, 1.0)], + cell=[100, 100, 100], + calculator=DP(model="frozen_model.json"), +) +print(water.get_potential_energy()) +print(water.get_forces()) +``` + +::: + +:::: + Optimization is also available: ```python diff --git a/doc/third-party/lammps-command.md b/doc/third-party/lammps-command.md index 5d0a90c18b..25a77f8670 100644 --- a/doc/third-party/lammps-command.md +++ b/doc/third-party/lammps-command.md @@ -229,7 +229,9 @@ dump 1 all custom 100 water.dump id type c_dipole[1] c_dipole[2] c_di - The `deeptensor/atom` compute is provided in the USER-DEEPMD package, which is compiled from the DeePMD-kit, visit the [DeePMD-kit 
website](https://github.com/deepmodeling/deepmd-kit) for more information. - For the issue of using a unit style for `compute deeptensor/atom`, refer to the discussions in [units](#units) of this page. -## Long-range interaction +## Combine with other commands + +### Long-range interaction The reciprocal space part of the long-range interaction can be calculated by LAMMPS command `kspace_style`. To use it with DeePMD-kit, one writes @@ -242,7 +244,7 @@ kspace_modify gewald 0.45 Please notice that the DeePMD does nothing to the direct space part of the electrostatic interaction, because this part is assumed to be fitted in the DeePMD model (the direct space cut-off is thus the cut-off of the DeePMD model). The splitting parameter `gewald` is modified by the `kspace_modify` command. -## Use of the centroid/stress/atom to get the full 3x3 "atomic-virial" +### Use of the centroid/stress/atom to get the full 3x3 "atomic-virial" The [DeePMD-kit](https://github.com/deepmodeling/deepmd-kit) also allows the computation of per-atom stress tensor defined as: @@ -262,7 +264,7 @@ see [LAMMPS doc page](https://docs.lammps.org/compute_stress_atom.html#thompson2 v2.2.2 or previous versions passed per-atom stress (`cvatom`) with the per-atom pressure tensor, which is inconsistent with [LAMMPS's definition](https://docs.lammps.org/compute_stress_atom.html). LAMMPS defines per-atom stress as the negative of the per-atom pressure tensor. Such behavior is corrected in v2.2.3. ::: -### Examples +#### Examples In order of computing the 9-component per-atom stress @@ -274,7 +276,7 @@ Thus `c_stress` is an array with 9 components in the order `xx,yy,zz,xy,xz,yz,yx If you use this feature please cite [D. Tisi, L. Zhang, R. Bertossa, H. Wang, R. Car, S. 
Baroni - arXiv preprint arXiv:2108.10850, 2021](https://arxiv.org/abs/2108.10850) -## Computation of heat flux +### Computation of heat flux Using a per-atom stress tensor one can, for example, compute the heat flux defined as: @@ -289,7 +291,7 @@ compute stress_ID group-ID centroid/stress/atom NULL virial compute flux_ID all heat/flux ke_ID pe_ID stress_ID ``` -### Examples +#### Examples ```lammps compute ke all ke/atom @@ -305,3 +307,18 @@ If you use these features please cite [D. Tisi, L. Zhang, R. Bertossa, H. Wang, [DP]: https://journals.aps.org/prl/abstract/10.1103/PhysRevLett.120.143001 [DP-SE]: https://dl.acm.org/doi/10.5555/3327345.3327356 [DPSPIN]: https://doi.org/10.1103/PhysRevB.110.064427 + +### D3 dispersion + +:::{note} +Requires LAMMPS version 4Feb2025 or newer. +::: + +The DP model can be trained using plain DFT calculations without the dispersion correction, and the dispersion correction can be added during the simulation via the [`pair_style dispersion/d3` command](https://docs.lammps.org/pair_dispersion_d3.html#pair-style-dispersion-d3-command). +For example, when `water.pb` is trained against the PBE0 functional, the simulation can be performed under the PBE0-D3 level with the following commands: + +```lammps +pair_style hybrid/overlay deepmd water.pb dispersion/d3 original pbe0 30.0 20.0 +pair_coeff * * deepmd O H +pair_coeff * * dispersion/d3 O H +``` diff --git a/doc/train/multi-task-training.md b/doc/train/multi-task-training.md index 16f6c0e05c..115c463cc2 100644 --- a/doc/train/multi-task-training.md +++ b/doc/train/multi-task-training.md @@ -48,7 +48,6 @@ Specifically, there are several parts that need to be modified: - {ref}`model/model_dict `: The core definition of the model part and the explanation of sharing rules, starting with user-defined model name keys `model_key`, such as `my_model_1`. 
Each model part needs to align with the components of the single-task training {ref}`model `, but with the following sharing rules: - - If you want to share the current model component with other tasks, which should be part of the {ref}`model/shared_dict `, you can directly fill in the corresponding `part_key`, such as `"descriptor": "my_descriptor", ` diff --git a/doc/train/parallel-training.md index 96cfadf4bb..998f1c3bec 100644 --- a/doc/train/parallel-training.md +++ b/doc/train/parallel-training.md @@ -218,6 +218,21 @@ NUM_WORKERS=0 HDF5_USE_FILE_LOCKING=0 python -m paddle.distributed.launch \ dp --pd train input.json ``` +or you can wrap the training script with `mpirun`: + +```bash +# ----- train_pp.sh ------- +unset CUDA_DEVICE_MAX_CONNECTIONS +python -m paddle.distributed.launch --gpus="0,1,2,3,4,5,6,7" --log_dir logs dp --pd train input_torch.json -l train_pp.log +# ------------------------- +``` + +Then, run the script on the first node with: + +```bash +mpirun bash train_pp.sh +``` + :::{note} If `NUM_WORKERS` is too large, it may cause the program to be terminated by the system; diff --git a/doc/train/training.md index 6c8b7a5549..6ccb43bbd7 100644 --- a/doc/train/training.md +++ b/doc/train/training.md @@ -29,12 +29,15 @@ $ dp --pt train input.json :::{tab-item} Paddle {{ paddle_icon }} ```bash -# training model in eager mode +# training model $ dp --pd train input.json -# [experimental] training model with CINN compiler for better performance, +# [experimental] training models with the CINN compiler (~40%+ speedup) # see: https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/guides/paddle_v3_features/cinn_cn.html +## If the shape(s) of batch input data are dynamic during training (default). $ CINN=1 dp --pd train input.json +## If the shape(s) of batch input data are fixed during training, e.g., examples/water.
+$ CINN=1 CINN_ALLOW_DYNAMIC_SHAPE=0 dp --pd train input.json ``` ::: diff --git a/examples/water/d3/README.md b/examples/water/d3/README.md index bd75960010..fdbeea1c1f 100644 --- a/examples/water/d3/README.md +++ b/examples/water/d3/README.md @@ -1,5 +1,9 @@ # DPD3 +> [!IMPORTANT] +> It is more efficient to train a model using plain DFT calculations without the dispersion correction, and add the dispersion correction during the simulation via the LAMMPS [`pair_style dispersion/d3` command](https://docs.lammps.org/pair_dispersion_d3.html#pair-style-dispersion-d3-command). +> Training against data with dispersion directly is discouraged. + `dftd3.txt` tabulates D3 dispersion for each pair of types (O-O, O-H, H-H). It can be generated by [simple-dftd3](https://github.com/dftd3/simple-dftd3). diff --git a/examples/water_multi_task/pytorch_example/input_torch_with_alias.json b/examples/water_multi_task/pytorch_example/input_torch_with_alias.json new file mode 100644 index 0000000000..92a0ea14a9 --- /dev/null +++ b/examples/water_multi_task/pytorch_example/input_torch_with_alias.json @@ -0,0 +1,148 @@ +{ + "_comment": "that's all", + "model": { + "shared_dict": { + "type_map_all": [ + "O", + "H" + ], + "dpa3_descriptor": { + "type": "dpa3", + "repflow": { + "n_dim": 128, + "e_dim": 64, + "a_dim": 32, + "nlayers": 6, + "e_rcut": 6.0, + "e_rcut_smth": 5.3, + "e_sel": 1200, + "a_rcut": 4.0, + "a_rcut_smth": 3.5, + "a_sel": 300, + "axis_neuron": 4, + "fix_stat_std": 0.3, + "a_compress_rate": 1, + "a_compress_e_rate": 2, + "a_compress_use_split": true, + "update_angle": true, + "smooth_edge_update": true, + "edge_init_use_dist": true, + "use_dynamic_sel": true, + "sel_reduce_factor": 10.0, + "use_exp_switch": true, + "update_style": "res_residual", + "update_residual": 0.1, + "update_residual_init": "const" + }, + "activation_function": "silut:10.0", + "use_tebd_bias": false, + "precision": "float32", + "concat_output_tebd": false + }, + "shared_fit_with_id": { + "neuron": 
[ + 240, + 240, + 240 + ], + "resnet_dt": true, + "seed": 1, + "dim_case_embd": 2, + "_comment": " that's all" + }, + "_comment": "that's all" + }, + "model_dict": { + "water_1": { + "type_map": "type_map_all", + "descriptor": "dpa3_descriptor", + "fitting_net": "shared_fit_with_id", + "model_branch_alias": ["Default","Water"], + "info": { + "description": "Water model with DPA3 descriptor and shared fitting net", + "observed_type": ["H", "O"] + } + }, + "water_2": { + "type_map": "type_map_all", + "descriptor": "dpa3_descriptor", + "fitting_net": "shared_fit_with_id", + "model_branch_alias": ["Water2"], + "info": { + "description": "Water duplicated model with DPA3 descriptor and shared fitting net", + "observed_type": ["H", "O"] + } + } + } + }, + "learning_rate": { + "type": "exp", + "decay_steps": 5000, + "start_lr": 0.001, + "stop_lr": 3.51e-08, + "_comment": "that's all" + }, + "loss_dict": { + "water_1": { + "type": "ener", + "start_pref_e": 0.02, + "limit_pref_e": 1, + "start_pref_f": 1000, + "limit_pref_f": 1, + "start_pref_v": 0, + "limit_pref_v": 0 + }, + "water_2": { + "type": "ener", + "start_pref_e": 0.02, + "limit_pref_e": 1, + "start_pref_f": 1000, + "limit_pref_f": 1, + "start_pref_v": 0, + "limit_pref_v": 0 + } + }, + "training": { + "model_prob": { + "water_1": 0.5, + "water_2": 0.5 + }, + "data_dict": { + "water_1": { + "training_data": { + "systems": [ + "../../water/data/data_0/", + "../../water/data/data_1/", + "../../water/data/data_2/" + ], + "batch_size": 1, + "_comment": "that's all" + }, + "validation_data": { + "systems": [ + "../../water/data/data_3/" + ], + "batch_size": 1, + "_comment": "that's all" + } + }, + "water_2": { + "training_data": { + "systems": [ + "../../water/data/data_0/", + "../../water/data/data_1/", + "../../water/data/data_2/" + ], + "batch_size": 1, + "_comment": "that's all" + } + } + }, + "numb_steps": 100000, + "seed": 10, + "disp_file": "lcurve.out", + "disp_freq": 100, + "save_freq": 100, + "_comment": 
"that's all" + } +} diff --git a/pyproject.toml b/pyproject.toml index 362bcacd4b..cb11d0258d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,7 +58,7 @@ requires-python = ">=3.9" keywords = ["deepmd"] [project.entry-points."lammps.plugins"] -deepmd = "deepmd.lmp:get_op_dir" +deepmd = "deepmd.lmp_check_build:get_op_dir" [project.entry-points."dpgui"] "DeePMD-kit" = "deepmd.utils.argcheck:gen_args" @@ -90,7 +90,7 @@ test = [ docs = [ "sphinx>=3.1.1", "sphinx-book-theme", - "myst-nb>=1.0.0rc0", + "myst-nb>=1.0.0", "myst-parser>=0.19.2", "sphinx-design", "breathe", @@ -108,7 +108,7 @@ docs = [ "sphinx-remove-toctrees", ] lmp = [ - "lammps~=2024.8.29.1.0", + "lammps[mpi]~=2025.7.22.1.0", ] ipi = [ "ipi", @@ -238,14 +238,11 @@ manylinux-x86_64-image = "quay.io/pypa/manylinux_2_28_x86_64:2022-11-19-1b19e81" manylinux-aarch64-image = "manylinux_2_28" [tool.cibuildwheel.macos] -before-all = [ - '''pip install -i https://pypi.anaconda.org/mpi4py/simple mpich''', -] repair-wheel-command = """delocate-wheel --require-archs {delocate_archs} -w {dest_dir} -v {wheel} --ignore-missing-dependencies""" [tool.cibuildwheel.macos.environment] PIP_PREFER_BINARY = "1" -DP_LAMMPS_VERSION = "stable_29Aug2024_update1" +DP_LAMMPS_VERSION = "stable_22Jul2025_update1" DP_ENABLE_IPI = "1" DP_ENABLE_PYTORCH = "1" DP_ENABLE_PADDLE = "1" @@ -259,7 +256,7 @@ inherit.environment = "append" environment.MACOSX_DEPLOYMENT_TARGET = "11.0" [tool.cibuildwheel.linux] -repair-wheel-command = "auditwheel repair --exclude libtensorflow_framework.so.2 --exclude libtensorflow_framework.so.1 --exclude libtensorflow_framework.so --exclude _pywrap_tensorflow_internal.so --exclude libtensorflow_cc.so.2 --exclude libc10.so --exclude libtorch.so --exclude libtorch_cpu.so -w {dest_dir} {wheel}" +repair-wheel-command = "auditwheel repair --exclude libtensorflow_framework.so.2 --exclude libtensorflow_framework.so.1 --exclude libtensorflow_framework.so --exclude _pywrap_tensorflow_internal.so --exclude 
libtensorflow_cc.so.2 --exclude libc10.so --exclude libtorch.so --exclude libtorch_cpu.so --exclude libmpi.so.12 -w {dest_dir} {wheel}" environment-pass = [ "CIBW_BUILD", "DP_VARIANT", @@ -272,7 +269,6 @@ before-all = [ # https://almalinux.org/blog/2023-12-20-almalinux-8-key-update/ """rpm --import https://repo.almalinux.org/almalinux/RPM-GPG-KEY-AlmaLinux""", """{ if [ "$(uname -m)" = "x86_64" ] ; then yum config-manager --add-repo http://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo && yum install -y cuda-nvcc-${CUDA_VERSION/./-} cuda-cudart-devel-${CUDA_VERSION/./-}; fi }""", - '''/opt/python/cp311-cp311/bin/python -m pip install -i https://pypi.anaconda.org/mpi4py/simple mpich''', # uv is not available in the old manylinux image """{ if [ "$(uname -m)" = "x86_64" ] ; then pipx install uv; fi }""", ] @@ -282,18 +278,14 @@ before-build = [ ] [tool.cibuildwheel.linux.environment] PIP_PREFER_BINARY = "1" -DP_LAMMPS_VERSION = "stable_29Aug2024_update1" +DP_LAMMPS_VERSION = "stable_22Jul2025_update1" DP_ENABLE_IPI = "1" DP_ENABLE_PYTORCH = "1" DP_ENABLE_PADDLE = "1" -MPI_HOME = "/usr/lib64/mpich" -PATH = "/usr/lib64/mpich/bin:$PATH" # use CPU version of torch for building, which should also work for GPU # note: uv has different behavior from pip on extra index url # https://github.com/astral-sh/uv/blob/main/PIP_COMPATIBILITY.md#packages-that-exist-on-multiple-indexes UV_EXTRA_INDEX_URL = "https://download.pytorch.org/whl/cpu" -# trick to find the correction version of mpich -CMAKE_PREFIX_PATH="/opt/python/cp311-cp311/" [tool.cibuildwheel.windows] test-extras = ["cpu", "torch", "paddle"] @@ -380,11 +372,14 @@ select = [ "DTZ", # datetime "TCH", # flake8-type-checking "PYI", # flake8-pyi + "ANN", # type annotations ] ignore = [ + "ANN401", # Allow Any due to too many violations "E501", # line too long "F841", # local variable is assigned to but never used + "RUF059", # unused-unpacked-variable "E741", # ambiguous variable name "E402", # 
module level import not at top of file "D100", # TODO: missing docstring in public module @@ -397,7 +392,6 @@ ignore = [ "D401", # TODO: first line should be in imperative mood "D404", # TODO: first word of the docstring should not be This ] -ignore-init-module-imports = true exclude = [ "source/3rdparty/**", @@ -426,19 +420,27 @@ runtime-evaluated-base-classes = ["torch.nn.Module"] [tool.ruff.lint.extend-per-file-ignores] # Also ignore `E402` in all `__init__.py` files. -"deepmd/tf/**" = ["TID253"] +"source/3rdparty/**" = ["ALL"] +"backend/**" = ["ANN"] +"data/**" = ["ANN"] +"deepmd/tf/**" = ["TID253", "ANN"] "deepmd/pt/**" = ["TID253"] "deepmd/jax/**" = ["TID253"] -"deepmd/pd/**" = ["TID253"] -"source/tests/tf/**" = ["TID253"] -"source/tests/pt/**" = ["TID253"] -"source/tests/jax/**" = ["TID253"] -"source/tests/pd/**" = ["TID253"] -"source/tests/universal/pt/**" = ["TID253"] -"source/tests/universal/pd/**" = ["TID253"] -"source/jax2tf_tests/**" = ["TID253"] -"source/ipi/tests/**" = ["TID253"] -"source/lmp/tests/**" = ["TID253"] +"deepmd/pd/**" = ["TID253", "ANN"] + +"source/**" = ["ANN"] +"source/tests/tf/**" = ["TID253", "ANN"] +"source/tests/pt/**" = ["TID253", "ANN"] +"source/tests/jax/**" = ["TID253", "ANN"] +"source/tests/pd/**" = ["TID253", "ANN"] +"source/tests/universal/pt/**" = ["TID253", "ANN"] +"source/tests/universal/pd/**" = ["TID253", "ANN"] +"source/tests/**" = ["ANN"] +"source/jax2tf_tests/**" = ["TID253", "ANN"] +"source/ipi/tests/**" = ["TID253", "ANN"] +"source/lmp/tests/**" = ["TID253", "ANN"] +"**/tests/**/test_*.py" = ["ANN"] +"**/tests/**/*_test.py" = ["ANN"] "**/*.ipynb" = ["T20"] # printing in a nb file is expected [tool.pytest.ini_options] @@ -459,15 +461,6 @@ select = [ "TOR2", ] -[tool.uv.sources] -mpich = { index = "mpi4py" } -openmpi = { index = "mpi4py" } - -[[tool.uv.index]] -name = "mpi4py" -url = "https://pypi.anaconda.org/mpi4py/simple" -explicit = true - [[tool.uv.dependency-metadata]] # Fix 
https://github.com/deepmodeling/deepmd-kit/issues/4679 name = "tensorflow" diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index a678802a35..2c13a5a367 100644 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -373,7 +373,15 @@ if(ENABLE_PYTORCH AND NOT DEEPMD_C_ROOT) else() # Maybe in macos/windows if(NOT DEFINED OP_CXX_ABI) - set(OP_CXX_ABI 0) + if(UNIX + AND NOT APPLE + AND Torch_VERSION VERSION_GREATER_EQUAL "2.8.0") + # https://github.com/deepmodeling/deepmd-kit/issues/4877 + # torch.compiled_with_cxx11_abi in PyTorch 2.8 always return True + set(OP_CXX_ABI 1) + else() + set(OP_CXX_ABI 0) + endif() endif() set(OP_CXX_ABI_PT "${OP_CXX_ABI}") endif() @@ -520,13 +528,16 @@ if(DEEPMD_C_ROOT) endif() if(NOT DEEPMD_C_ROOT) + add_subdirectory(lib/) if(ENABLE_TENSORFLOW) add_subdirectory(op/tf/) endif() if(ENABLE_PYTORCH) add_subdirectory(op/pt/) endif() - add_subdirectory(lib/) + if(ENABLE_PADDLE) + add_subdirectory(op/pd/) + endif() endif() if(BUILD_PY_IF) add_subdirectory(config/) diff --git a/source/api_c/include/deepmd.hpp b/source/api_c/include/deepmd.hpp index 8a3656bfc2..afa62403e7 100644 --- a/source/api_c/include/deepmd.hpp +++ b/source/api_c/include/deepmd.hpp @@ -26,7 +26,7 @@ namespace hpp { struct deepmd_exception : public std::runtime_error { public: deepmd_exception() : runtime_error("DeePMD-kit C API Error!") {}; - deepmd_exception(const std::string &msg) + deepmd_exception(const std::string& msg) : runtime_error(std::string("DeePMD-kit C API Error: ") + msg) {}; }; } // namespace hpp @@ -36,7 +36,7 @@ struct deepmd_exception : public std::runtime_error { * @brief Check if any exceptions throw in the C++ API. Throw if possible. 
*/ #define DP_CHECK_OK(check_func, dp) \ - const char *err_msg = check_func(dp); \ + const char* err_msg = check_func(dp); \ if (std::strlen(err_msg)) { \ std::string err_msg_str = std::string(err_msg); \ DP_DeleteChar(err_msg); \ @@ -45,173 +45,173 @@ struct deepmd_exception : public std::runtime_error { DP_DeleteChar(err_msg); template -inline void _DP_DeepPotCompute(DP_DeepPot *dp, +inline void _DP_DeepPotCompute(DP_DeepPot* dp, const int nframes, const int natom, - const FPTYPE *coord, - const int *atype, - const FPTYPE *cell, - const FPTYPE *fparam, - const FPTYPE *aparam, - double *energy, - FPTYPE *force, - FPTYPE *virial, - FPTYPE *atomic_energy, - FPTYPE *atomic_virial); + const FPTYPE* coord, + const int* atype, + const FPTYPE* cell, + const FPTYPE* fparam, + const FPTYPE* aparam, + double* energy, + FPTYPE* force, + FPTYPE* virial, + FPTYPE* atomic_energy, + FPTYPE* atomic_virial); template <> -inline void _DP_DeepPotCompute(DP_DeepPot *dp, +inline void _DP_DeepPotCompute(DP_DeepPot* dp, const int nframes, const int natom, - const double *coord, - const int *atype, - const double *cell, - const double *fparam, - const double *aparam, - double *energy, - double *force, - double *virial, - double *atomic_energy, - double *atomic_virial) { + const double* coord, + const int* atype, + const double* cell, + const double* fparam, + const double* aparam, + double* energy, + double* force, + double* virial, + double* atomic_energy, + double* atomic_virial) { DP_DeepPotCompute2(dp, nframes, natom, coord, atype, cell, fparam, aparam, energy, force, virial, atomic_energy, atomic_virial); } template <> -inline void _DP_DeepPotCompute(DP_DeepPot *dp, +inline void _DP_DeepPotCompute(DP_DeepPot* dp, const int nframes, const int natom, - const float *coord, - const int *atype, - const float *cell, - const float *fparam, - const float *aparam, - double *energy, - float *force, - float *virial, - float *atomic_energy, - float *atomic_virial) { + const float* coord, + 
const int* atype, + const float* cell, + const float* fparam, + const float* aparam, + double* energy, + float* force, + float* virial, + float* atomic_energy, + float* atomic_virial) { DP_DeepPotComputef2(dp, nframes, natom, coord, atype, cell, fparam, aparam, energy, force, virial, atomic_energy, atomic_virial); } // support spin template -inline void _DP_DeepSpinCompute(DP_DeepSpin *dp, +inline void _DP_DeepSpinCompute(DP_DeepSpin* dp, const int nframes, const int natom, - const FPTYPE *coord, - const FPTYPE *spin, - const int *atype, - const FPTYPE *cell, - const FPTYPE *fparam, - const FPTYPE *aparam, - double *energy, - FPTYPE *force, - FPTYPE *force_mag, - FPTYPE *virial, - FPTYPE *atomic_energy, - FPTYPE *atomic_virial); + const FPTYPE* coord, + const FPTYPE* spin, + const int* atype, + const FPTYPE* cell, + const FPTYPE* fparam, + const FPTYPE* aparam, + double* energy, + FPTYPE* force, + FPTYPE* force_mag, + FPTYPE* virial, + FPTYPE* atomic_energy, + FPTYPE* atomic_virial); template <> -inline void _DP_DeepSpinCompute(DP_DeepSpin *dp, +inline void _DP_DeepSpinCompute(DP_DeepSpin* dp, const int nframes, const int natom, - const double *coord, - const double *spin, - const int *atype, - const double *cell, - const double *fparam, - const double *aparam, - double *energy, - double *force, - double *force_mag, - double *virial, - double *atomic_energy, - double *atomic_virial) { + const double* coord, + const double* spin, + const int* atype, + const double* cell, + const double* fparam, + const double* aparam, + double* energy, + double* force, + double* force_mag, + double* virial, + double* atomic_energy, + double* atomic_virial) { DP_DeepSpinCompute2(dp, nframes, natom, coord, spin, atype, cell, fparam, aparam, energy, force, force_mag, virial, atomic_energy, atomic_virial); } template <> -inline void _DP_DeepSpinCompute(DP_DeepSpin *dp, +inline void _DP_DeepSpinCompute(DP_DeepSpin* dp, const int nframes, const int natom, - const float *coord, - const 
float *spin, - const int *atype, - const float *cell, - const float *fparam, - const float *aparam, - double *energy, - float *force, - float *force_mag, - float *virial, - float *atomic_energy, - float *atomic_virial) { + const float* coord, + const float* spin, + const int* atype, + const float* cell, + const float* fparam, + const float* aparam, + double* energy, + float* force, + float* force_mag, + float* virial, + float* atomic_energy, + float* atomic_virial) { DP_DeepSpinComputef2(dp, nframes, natom, coord, spin, atype, cell, fparam, aparam, energy, force, force_mag, virial, atomic_energy, atomic_virial); } template -inline void _DP_DeepPotComputeNList(DP_DeepPot *dp, +inline void _DP_DeepPotComputeNList(DP_DeepPot* dp, const int nframes, const int natom, - const FPTYPE *coord, - const int *atype, - const FPTYPE *cell, + const FPTYPE* coord, + const int* atype, + const FPTYPE* cell, const int nghost, - const DP_Nlist *nlist, + const DP_Nlist* nlist, const int ago, - const FPTYPE *fparam, - const FPTYPE *aparam, - double *energy, - FPTYPE *force, - FPTYPE *virial, - FPTYPE *atomic_energy, - FPTYPE *atomic_virial); + const FPTYPE* fparam, + const FPTYPE* aparam, + double* energy, + FPTYPE* force, + FPTYPE* virial, + FPTYPE* atomic_energy, + FPTYPE* atomic_virial); template <> -inline void _DP_DeepPotComputeNList(DP_DeepPot *dp, +inline void _DP_DeepPotComputeNList(DP_DeepPot* dp, const int nframes, const int natom, - const double *coord, - const int *atype, - const double *cell, + const double* coord, + const int* atype, + const double* cell, const int nghost, - const DP_Nlist *nlist, + const DP_Nlist* nlist, const int ago, - const double *fparam, - const double *aparam, - double *energy, - double *force, - double *virial, - double *atomic_energy, - double *atomic_virial) { + const double* fparam, + const double* aparam, + double* energy, + double* force, + double* virial, + double* atomic_energy, + double* atomic_virial) { DP_DeepPotComputeNList2(dp, nframes, 
natom, coord, atype, cell, nghost, nlist, ago, fparam, aparam, energy, force, virial, atomic_energy, atomic_virial); } template <> -inline void _DP_DeepPotComputeNList(DP_DeepPot *dp, +inline void _DP_DeepPotComputeNList(DP_DeepPot* dp, const int nframes, const int natom, - const float *coord, - const int *atype, - const float *cell, + const float* coord, + const int* atype, + const float* cell, const int nghost, - const DP_Nlist *nlist, + const DP_Nlist* nlist, const int ago, - const float *fparam, - const float *aparam, - double *energy, - float *force, - float *virial, - float *atomic_energy, - float *atomic_virial) { + const float* fparam, + const float* aparam, + double* energy, + float* force, + float* virial, + float* atomic_energy, + float* atomic_virial) { DP_DeepPotComputeNListf2(dp, nframes, natom, coord, atype, cell, nghost, nlist, ago, fparam, aparam, energy, force, virial, atomic_energy, atomic_virial); @@ -219,550 +219,550 @@ inline void _DP_DeepPotComputeNList(DP_DeepPot *dp, // support spin template -inline void _DP_DeepSpinComputeNList(DP_DeepSpin *dp, +inline void _DP_DeepSpinComputeNList(DP_DeepSpin* dp, const int nframes, const int natom, - const FPTYPE *coord, - const FPTYPE *spin, - const int *atype, - const FPTYPE *cell, + const FPTYPE* coord, + const FPTYPE* spin, + const int* atype, + const FPTYPE* cell, const int nghost, - const DP_Nlist *nlist, + const DP_Nlist* nlist, const int ago, - const FPTYPE *fparam, - const FPTYPE *aparam, - double *energy, - FPTYPE *force, - FPTYPE *force_mag, - FPTYPE *virial, - FPTYPE *atomic_energy, - FPTYPE *atomic_virial); + const FPTYPE* fparam, + const FPTYPE* aparam, + double* energy, + FPTYPE* force, + FPTYPE* force_mag, + FPTYPE* virial, + FPTYPE* atomic_energy, + FPTYPE* atomic_virial); template <> -inline void _DP_DeepSpinComputeNList(DP_DeepSpin *dp, +inline void _DP_DeepSpinComputeNList(DP_DeepSpin* dp, const int nframes, const int natom, - const double *coord, - const double *spin, - const int 
*atype, - const double *cell, + const double* coord, + const double* spin, + const int* atype, + const double* cell, const int nghost, - const DP_Nlist *nlist, + const DP_Nlist* nlist, const int ago, - const double *fparam, - const double *aparam, - double *energy, - double *force, - double *force_mag, - double *virial, - double *atomic_energy, - double *atomic_virial) { + const double* fparam, + const double* aparam, + double* energy, + double* force, + double* force_mag, + double* virial, + double* atomic_energy, + double* atomic_virial) { DP_DeepSpinComputeNList2(dp, nframes, natom, coord, spin, atype, cell, nghost, nlist, ago, fparam, aparam, energy, force, force_mag, virial, atomic_energy, atomic_virial); } template <> -inline void _DP_DeepSpinComputeNList(DP_DeepSpin *dp, +inline void _DP_DeepSpinComputeNList(DP_DeepSpin* dp, const int nframes, const int natom, - const float *coord, - const float *spin, - const int *atype, - const float *cell, + const float* coord, + const float* spin, + const int* atype, + const float* cell, const int nghost, - const DP_Nlist *nlist, + const DP_Nlist* nlist, const int ago, - const float *fparam, - const float *aparam, - double *energy, - float *force, - float *force_mag, - float *virial, - float *atomic_energy, - float *atomic_virial) { + const float* fparam, + const float* aparam, + double* energy, + float* force, + float* force_mag, + float* virial, + float* atomic_energy, + float* atomic_virial) { DP_DeepSpinComputeNListf2(dp, nframes, natom, coord, spin, atype, cell, nghost, nlist, ago, fparam, aparam, energy, force, force_mag, virial, atomic_energy, atomic_virial); } template -inline void _DP_DeepPotComputeMixedType(DP_DeepPot *dp, +inline void _DP_DeepPotComputeMixedType(DP_DeepPot* dp, const int nframes, const int natom, - const FPTYPE *coord, - const int *atype, - const FPTYPE *cell, - const FPTYPE *fparam, - const FPTYPE *aparam, - double *energy, - FPTYPE *force, - FPTYPE *virial, - FPTYPE *atomic_energy, - FPTYPE 
*atomic_virial); + const FPTYPE* coord, + const int* atype, + const FPTYPE* cell, + const FPTYPE* fparam, + const FPTYPE* aparam, + double* energy, + FPTYPE* force, + FPTYPE* virial, + FPTYPE* atomic_energy, + FPTYPE* atomic_virial); template <> -inline void _DP_DeepPotComputeMixedType(DP_DeepPot *dp, +inline void _DP_DeepPotComputeMixedType(DP_DeepPot* dp, const int nframes, const int natom, - const double *coord, - const int *atype, - const double *cell, - const double *fparam, - const double *aparam, - double *energy, - double *force, - double *virial, - double *atomic_energy, - double *atomic_virial) { + const double* coord, + const int* atype, + const double* cell, + const double* fparam, + const double* aparam, + double* energy, + double* force, + double* virial, + double* atomic_energy, + double* atomic_virial) { DP_DeepPotComputeMixedType(dp, nframes, natom, coord, atype, cell, fparam, aparam, energy, force, virial, atomic_energy, atomic_virial); } template <> -inline void _DP_DeepPotComputeMixedType(DP_DeepPot *dp, +inline void _DP_DeepPotComputeMixedType(DP_DeepPot* dp, const int nframes, const int natom, - const float *coord, - const int *atype, - const float *cell, - const float *fparam, - const float *aparam, - double *energy, - float *force, - float *virial, - float *atomic_energy, - float *atomic_virial) { + const float* coord, + const int* atype, + const float* cell, + const float* fparam, + const float* aparam, + double* energy, + float* force, + float* virial, + float* atomic_energy, + float* atomic_virial) { DP_DeepPotComputeMixedTypef(dp, nframes, natom, coord, atype, cell, fparam, aparam, energy, force, virial, atomic_energy, atomic_virial); } template -inline void _DP_DeepPotModelDeviCompute(DP_DeepPotModelDevi *dp, +inline void _DP_DeepPotModelDeviCompute(DP_DeepPotModelDevi* dp, const int natom, - const FPTYPE *coord, - const int *atype, - const FPTYPE *cell, - const FPTYPE *fparam, - const FPTYPE *aparam, - double *energy, - FPTYPE *force, 
- FPTYPE *virial, - FPTYPE *atomic_energy, - FPTYPE *atomic_virial); + const FPTYPE* coord, + const int* atype, + const FPTYPE* cell, + const FPTYPE* fparam, + const FPTYPE* aparam, + double* energy, + FPTYPE* force, + FPTYPE* virial, + FPTYPE* atomic_energy, + FPTYPE* atomic_virial); template <> -inline void _DP_DeepPotModelDeviCompute(DP_DeepPotModelDevi *dp, +inline void _DP_DeepPotModelDeviCompute(DP_DeepPotModelDevi* dp, const int natom, - const double *coord, - const int *atype, - const double *cell, - const double *fparam, - const double *aparam, - double *energy, - double *force, - double *virial, - double *atomic_energy, - double *atomic_virial) { + const double* coord, + const int* atype, + const double* cell, + const double* fparam, + const double* aparam, + double* energy, + double* force, + double* virial, + double* atomic_energy, + double* atomic_virial) { DP_DeepPotModelDeviCompute2(dp, 1, natom, coord, atype, cell, fparam, aparam, energy, force, virial, atomic_energy, atomic_virial); } template <> -inline void _DP_DeepPotModelDeviCompute(DP_DeepPotModelDevi *dp, +inline void _DP_DeepPotModelDeviCompute(DP_DeepPotModelDevi* dp, const int natom, - const float *coord, - const int *atype, - const float *cell, - const float *fparam, - const float *aparam, - double *energy, - float *force, - float *virial, - float *atomic_energy, - float *atomic_virial) { + const float* coord, + const int* atype, + const float* cell, + const float* fparam, + const float* aparam, + double* energy, + float* force, + float* virial, + float* atomic_energy, + float* atomic_virial) { DP_DeepPotModelDeviComputef2(dp, 1, natom, coord, atype, cell, fparam, aparam, energy, force, virial, atomic_energy, atomic_virial); } template -inline void _DP_DeepSpinModelDeviCompute(DP_DeepSpinModelDevi *dp, +inline void _DP_DeepSpinModelDeviCompute(DP_DeepSpinModelDevi* dp, const int natom, - const FPTYPE *coord, - const FPTYPE *spin, - const int *atype, - const FPTYPE *cell, - const FPTYPE 
*fparam, - const FPTYPE *aparam, - double *energy, - FPTYPE *force, - FPTYPE *force_mag, - FPTYPE *virial, - FPTYPE *atomic_energy, - FPTYPE *atomic_virial); + const FPTYPE* coord, + const FPTYPE* spin, + const int* atype, + const FPTYPE* cell, + const FPTYPE* fparam, + const FPTYPE* aparam, + double* energy, + FPTYPE* force, + FPTYPE* force_mag, + FPTYPE* virial, + FPTYPE* atomic_energy, + FPTYPE* atomic_virial); template <> -inline void _DP_DeepSpinModelDeviCompute(DP_DeepSpinModelDevi *dp, +inline void _DP_DeepSpinModelDeviCompute(DP_DeepSpinModelDevi* dp, const int natom, - const double *coord, - const double *spin, - const int *atype, - const double *cell, - const double *fparam, - const double *aparam, - double *energy, - double *force, - double *force_mag, - double *virial, - double *atomic_energy, - double *atomic_virial) { + const double* coord, + const double* spin, + const int* atype, + const double* cell, + const double* fparam, + const double* aparam, + double* energy, + double* force, + double* force_mag, + double* virial, + double* atomic_energy, + double* atomic_virial) { DP_DeepSpinModelDeviCompute2(dp, 1, natom, coord, spin, atype, cell, fparam, aparam, energy, force, force_mag, virial, atomic_energy, atomic_virial); } template <> -inline void _DP_DeepSpinModelDeviCompute(DP_DeepSpinModelDevi *dp, +inline void _DP_DeepSpinModelDeviCompute(DP_DeepSpinModelDevi* dp, const int natom, - const float *coord, - const float *spin, - const int *atype, - const float *cell, - const float *fparam, - const float *aparam, - double *energy, - float *force, - float *force_mag, - float *virial, - float *atomic_energy, - float *atomic_virial) { + const float* coord, + const float* spin, + const int* atype, + const float* cell, + const float* fparam, + const float* aparam, + double* energy, + float* force, + float* force_mag, + float* virial, + float* atomic_energy, + float* atomic_virial) { DP_DeepSpinModelDeviComputef2(dp, 1, natom, coord, spin, atype, cell, 
fparam, aparam, energy, force, force_mag, virial, atomic_energy, atomic_virial); } template -inline void _DP_DeepPotModelDeviComputeNList(DP_DeepPotModelDevi *dp, +inline void _DP_DeepPotModelDeviComputeNList(DP_DeepPotModelDevi* dp, const int natom, - const FPTYPE *coord, - const int *atype, - const FPTYPE *cell, + const FPTYPE* coord, + const int* atype, + const FPTYPE* cell, const int nghost, - const DP_Nlist *nlist, + const DP_Nlist* nlist, const int ago, - const FPTYPE *fparam, - const FPTYPE *aparam, - double *energy, - FPTYPE *force, - FPTYPE *virial, - FPTYPE *atomic_energy, - FPTYPE *atomic_virial); + const FPTYPE* fparam, + const FPTYPE* aparam, + double* energy, + FPTYPE* force, + FPTYPE* virial, + FPTYPE* atomic_energy, + FPTYPE* atomic_virial); template <> -inline void _DP_DeepPotModelDeviComputeNList(DP_DeepPotModelDevi *dp, +inline void _DP_DeepPotModelDeviComputeNList(DP_DeepPotModelDevi* dp, const int natom, - const double *coord, - const int *atype, - const double *cell, + const double* coord, + const int* atype, + const double* cell, const int nghost, - const DP_Nlist *nlist, + const DP_Nlist* nlist, const int ago, - const double *fparam, - const double *aparam, - double *energy, - double *force, - double *virial, - double *atomic_energy, - double *atomic_virial) { + const double* fparam, + const double* aparam, + double* energy, + double* force, + double* virial, + double* atomic_energy, + double* atomic_virial) { DP_DeepPotModelDeviComputeNList2(dp, 1, natom, coord, atype, cell, nghost, nlist, ago, fparam, aparam, energy, force, virial, atomic_energy, atomic_virial); } template <> -inline void _DP_DeepPotModelDeviComputeNList(DP_DeepPotModelDevi *dp, +inline void _DP_DeepPotModelDeviComputeNList(DP_DeepPotModelDevi* dp, const int natom, - const float *coord, - const int *atype, - const float *cell, + const float* coord, + const int* atype, + const float* cell, const int nghost, - const DP_Nlist *nlist, + const DP_Nlist* nlist, const int ago, - 
const float *fparam, - const float *aparam, - double *energy, - float *force, - float *virial, - float *atomic_energy, - float *atomic_virial) { + const float* fparam, + const float* aparam, + double* energy, + float* force, + float* virial, + float* atomic_energy, + float* atomic_virial) { DP_DeepPotModelDeviComputeNListf2(dp, 1, natom, coord, atype, cell, nghost, nlist, ago, fparam, aparam, energy, force, virial, atomic_energy, atomic_virial); } template -inline void _DP_DeepSpinModelDeviComputeNList(DP_DeepSpinModelDevi *dp, +inline void _DP_DeepSpinModelDeviComputeNList(DP_DeepSpinModelDevi* dp, const int natom, - const FPTYPE *coord, - const FPTYPE *spin, - const int *atype, - const FPTYPE *cell, + const FPTYPE* coord, + const FPTYPE* spin, + const int* atype, + const FPTYPE* cell, const int nghost, - const DP_Nlist *nlist, + const DP_Nlist* nlist, const int ago, - const FPTYPE *fparam, - const FPTYPE *aparam, - double *energy, - FPTYPE *force, - FPTYPE *force_mag, - FPTYPE *virial, - FPTYPE *atomic_energy, - FPTYPE *atomic_virial); + const FPTYPE* fparam, + const FPTYPE* aparam, + double* energy, + FPTYPE* force, + FPTYPE* force_mag, + FPTYPE* virial, + FPTYPE* atomic_energy, + FPTYPE* atomic_virial); template <> -inline void _DP_DeepSpinModelDeviComputeNList(DP_DeepSpinModelDevi *dp, +inline void _DP_DeepSpinModelDeviComputeNList(DP_DeepSpinModelDevi* dp, const int natom, - const double *coord, - const double *spin, - const int *atype, - const double *cell, + const double* coord, + const double* spin, + const int* atype, + const double* cell, const int nghost, - const DP_Nlist *nlist, + const DP_Nlist* nlist, const int ago, - const double *fparam, - const double *aparam, - double *energy, - double *force, - double *force_mag, - double *virial, - double *atomic_energy, - double *atomic_virial) { + const double* fparam, + const double* aparam, + double* energy, + double* force, + double* force_mag, + double* virial, + double* atomic_energy, + double* 
atomic_virial) { DP_DeepSpinModelDeviComputeNList2( dp, 1, natom, coord, spin, atype, cell, nghost, nlist, ago, fparam, aparam, energy, force, force_mag, virial, atomic_energy, atomic_virial); } template <> -inline void _DP_DeepSpinModelDeviComputeNList(DP_DeepSpinModelDevi *dp, +inline void _DP_DeepSpinModelDeviComputeNList(DP_DeepSpinModelDevi* dp, const int natom, - const float *coord, - const float *spin, - const int *atype, - const float *cell, + const float* coord, + const float* spin, + const int* atype, + const float* cell, const int nghost, - const DP_Nlist *nlist, + const DP_Nlist* nlist, const int ago, - const float *fparam, - const float *aparam, - double *energy, - float *force, - float *force_mag, - float *virial, - float *atomic_energy, - float *atomic_virial) { + const float* fparam, + const float* aparam, + double* energy, + float* force, + float* force_mag, + float* virial, + float* atomic_energy, + float* atomic_virial) { DP_DeepSpinModelDeviComputeNListf2( dp, 1, natom, coord, spin, atype, cell, nghost, nlist, ago, fparam, aparam, energy, force, force_mag, virial, atomic_energy, atomic_virial); } template -inline void _DP_DeepTensorComputeTensor(DP_DeepTensor *dt, +inline void _DP_DeepTensorComputeTensor(DP_DeepTensor* dt, const int natom, - const FPTYPE *coord, - const int *atype, - const FPTYPE *cell, - FPTYPE **tensor, - int *size); + const FPTYPE* coord, + const int* atype, + const FPTYPE* cell, + FPTYPE** tensor, + int* size); template <> -inline void _DP_DeepTensorComputeTensor(DP_DeepTensor *dt, +inline void _DP_DeepTensorComputeTensor(DP_DeepTensor* dt, const int natom, - const double *coord, - const int *atype, - const double *cell, - double **tensor, - int *size) { + const double* coord, + const int* atype, + const double* cell, + double** tensor, + int* size) { DP_DeepTensorComputeTensor(dt, natom, coord, atype, cell, tensor, size); } template <> -inline void _DP_DeepTensorComputeTensor(DP_DeepTensor *dt, +inline void 
_DP_DeepTensorComputeTensor(DP_DeepTensor* dt, const int natom, - const float *coord, - const int *atype, - const float *cell, - float **tensor, - int *size) { + const float* coord, + const int* atype, + const float* cell, + float** tensor, + int* size) { DP_DeepTensorComputeTensorf(dt, natom, coord, atype, cell, tensor, size); } template -inline void _DP_DeepTensorComputeTensorNList(DP_DeepTensor *dt, +inline void _DP_DeepTensorComputeTensorNList(DP_DeepTensor* dt, const int natom, - const FPTYPE *coord, - const int *atype, - const FPTYPE *cell, + const FPTYPE* coord, + const int* atype, + const FPTYPE* cell, const int nghost, - const DP_Nlist *nlist, - FPTYPE **tensor, - int *size); + const DP_Nlist* nlist, + FPTYPE** tensor, + int* size); template <> -inline void _DP_DeepTensorComputeTensorNList(DP_DeepTensor *dt, +inline void _DP_DeepTensorComputeTensorNList(DP_DeepTensor* dt, const int natom, - const double *coord, - const int *atype, - const double *cell, + const double* coord, + const int* atype, + const double* cell, const int nghost, - const DP_Nlist *nlist, - double **tensor, - int *size) { + const DP_Nlist* nlist, + double** tensor, + int* size) { DP_DeepTensorComputeTensorNList(dt, natom, coord, atype, cell, nghost, nlist, tensor, size); } template <> -inline void _DP_DeepTensorComputeTensorNList(DP_DeepTensor *dt, +inline void _DP_DeepTensorComputeTensorNList(DP_DeepTensor* dt, const int natom, - const float *coord, - const int *atype, - const float *cell, + const float* coord, + const int* atype, + const float* cell, const int nghost, - const DP_Nlist *nlist, - float **tensor, - int *size) { + const DP_Nlist* nlist, + float** tensor, + int* size) { DP_DeepTensorComputeTensorNListf(dt, natom, coord, atype, cell, nghost, nlist, tensor, size); } template -inline void _DP_DeepTensorCompute(DP_DeepTensor *dt, +inline void _DP_DeepTensorCompute(DP_DeepTensor* dt, const int natom, - const FPTYPE *coord, - const int *atype, - const FPTYPE *cell, - FPTYPE 
*global_tensor, - FPTYPE *force, - FPTYPE *virial, - FPTYPE **atomic_energy, - FPTYPE *atomic_virial, - int *size_at); + const FPTYPE* coord, + const int* atype, + const FPTYPE* cell, + FPTYPE* global_tensor, + FPTYPE* force, + FPTYPE* virial, + FPTYPE** atomic_energy, + FPTYPE* atomic_virial, + int* size_at); template <> -inline void _DP_DeepTensorCompute(DP_DeepTensor *dt, +inline void _DP_DeepTensorCompute(DP_DeepTensor* dt, const int natom, - const double *coord, - const int *atype, - const double *cell, - double *global_tensor, - double *force, - double *virial, - double **atomic_tensor, - double *atomic_virial, - int *size_at) { + const double* coord, + const int* atype, + const double* cell, + double* global_tensor, + double* force, + double* virial, + double** atomic_tensor, + double* atomic_virial, + int* size_at) { DP_DeepTensorCompute(dt, natom, coord, atype, cell, global_tensor, force, virial, atomic_tensor, atomic_virial, size_at); } template <> -inline void _DP_DeepTensorCompute(DP_DeepTensor *dt, +inline void _DP_DeepTensorCompute(DP_DeepTensor* dt, const int natom, - const float *coord, - const int *atype, - const float *cell, - float *global_tensor, - float *force, - float *virial, - float **atomic_tensor, - float *atomic_virial, - int *size_at) { + const float* coord, + const int* atype, + const float* cell, + float* global_tensor, + float* force, + float* virial, + float** atomic_tensor, + float* atomic_virial, + int* size_at) { DP_DeepTensorComputef(dt, natom, coord, atype, cell, global_tensor, force, virial, atomic_tensor, atomic_virial, size_at); } template -inline void _DP_DeepTensorComputeNList(DP_DeepTensor *dt, +inline void _DP_DeepTensorComputeNList(DP_DeepTensor* dt, const int natom, - const FPTYPE *coord, - const int *atype, - const FPTYPE *cell, + const FPTYPE* coord, + const int* atype, + const FPTYPE* cell, const int nghost, - const DP_Nlist *nlist, - FPTYPE *global_tensor, - FPTYPE *force, - FPTYPE *virial, - FPTYPE **atomic_energy, 
- FPTYPE *atomic_virial, - int *size_at); + const DP_Nlist* nlist, + FPTYPE* global_tensor, + FPTYPE* force, + FPTYPE* virial, + FPTYPE** atomic_energy, + FPTYPE* atomic_virial, + int* size_at); template <> -inline void _DP_DeepTensorComputeNList(DP_DeepTensor *dt, +inline void _DP_DeepTensorComputeNList(DP_DeepTensor* dt, const int natom, - const double *coord, - const int *atype, - const double *cell, + const double* coord, + const int* atype, + const double* cell, const int nghost, - const DP_Nlist *nlist, - double *global_tensor, - double *force, - double *virial, - double **atomic_tensor, - double *atomic_virial, - int *size_at) { + const DP_Nlist* nlist, + double* global_tensor, + double* force, + double* virial, + double** atomic_tensor, + double* atomic_virial, + int* size_at) { DP_DeepTensorComputeNList(dt, natom, coord, atype, cell, nghost, nlist, global_tensor, force, virial, atomic_tensor, atomic_virial, size_at); } template <> -inline void _DP_DeepTensorComputeNList(DP_DeepTensor *dt, +inline void _DP_DeepTensorComputeNList(DP_DeepTensor* dt, const int natom, - const float *coord, - const int *atype, - const float *cell, + const float* coord, + const int* atype, + const float* cell, const int nghost, - const DP_Nlist *nlist, - float *global_tensor, - float *force, - float *virial, - float **atomic_tensor, - float *atomic_virial, - int *size_at) { + const DP_Nlist* nlist, + float* global_tensor, + float* force, + float* virial, + float** atomic_tensor, + float* atomic_virial, + int* size_at) { DP_DeepTensorComputeNListf(dt, natom, coord, atype, cell, nghost, nlist, global_tensor, force, virial, atomic_tensor, atomic_virial, size_at); } template -inline void _DP_DipoleChargeModifierComputeNList(DP_DipoleChargeModifier *dcm, +inline void _DP_DipoleChargeModifierComputeNList(DP_DipoleChargeModifier* dcm, const int natom, - const FPTYPE *coord, - const int *atype, - const FPTYPE *cell, - const int *pairs, + const FPTYPE* coord, + const int* atype, + const 
FPTYPE* cell, + const int* pairs, const int npairs, - const FPTYPE *delef_, + const FPTYPE* delef_, const int nghost, - const DP_Nlist *nlist, - FPTYPE *dfcorr_, - FPTYPE *dvcorr_); + const DP_Nlist* nlist, + FPTYPE* dfcorr_, + FPTYPE* dvcorr_); template <> inline void _DP_DipoleChargeModifierComputeNList( - DP_DipoleChargeModifier *dcm, + DP_DipoleChargeModifier* dcm, const int natom, - const double *coord, - const int *atype, - const double *cell, - const int *pairs, + const double* coord, + const int* atype, + const double* cell, + const int* pairs, const int npairs, - const double *delef_, + const double* delef_, const int nghost, - const DP_Nlist *nlist, - double *dfcorr_, - double *dvcorr_) { + const DP_Nlist* nlist, + double* dfcorr_, + double* dvcorr_) { DP_DipoleChargeModifierComputeNList(dcm, natom, coord, atype, cell, pairs, npairs, delef_, nghost, nlist, dfcorr_, dvcorr_); @@ -770,30 +770,30 @@ inline void _DP_DipoleChargeModifierComputeNList( template <> inline void _DP_DipoleChargeModifierComputeNList( - DP_DipoleChargeModifier *dcm, + DP_DipoleChargeModifier* dcm, const int natom, - const float *coord, - const int *atype, - const float *cell, - const int *pairs, + const float* coord, + const int* atype, + const float* cell, + const int* pairs, const int npairs, - const float *delef_, + const float* delef_, const int nghost, - const DP_Nlist *nlist, - float *dfcorr_, - float *dvcorr_) { + const DP_Nlist* nlist, + float* dfcorr_, + float* dvcorr_) { DP_DipoleChargeModifierComputeNListf(dcm, natom, coord, atype, cell, pairs, npairs, delef_, nghost, nlist, dfcorr_, dvcorr_); } -inline double *_DP_Get_Energy_Pointer(std::vector &vec, +inline double* _DP_Get_Energy_Pointer(std::vector& vec, const int nframes) { vec.resize(nframes); return &vec[0]; } -inline double *_DP_Get_Energy_Pointer(double &vec, const int nframes) { +inline double* _DP_Get_Energy_Pointer(double& vec, const int nframes) { assert(nframes == 1); return &vec; } @@ -812,7 +812,7 @@ struct 
InputNlist { nl(DP_NewNlist(0, nullptr, nullptr, nullptr)) { DP_CHECK_OK(DP_NlistCheckOK, nl); }; - InputNlist(int inum_, int *ilist_, int *numneigh_, int **firstneigh_) + InputNlist(int inum_, int* ilist_, int* numneigh_, int** firstneigh_) : inum(inum_), ilist(ilist_), numneigh(numneigh_), @@ -821,17 +821,17 @@ struct InputNlist { DP_CHECK_OK(DP_NlistCheckOK, nl); }; InputNlist(int inum_, - int *ilist_, - int *numneigh_, - int **firstneigh_, + int* ilist_, + int* numneigh_, + int** firstneigh_, int nswap, - int *sendnum, - int *recvnum, - int *firstrecv, - int **sendlist, - int *sendproc, - int *recvproc, - void *world) + int* sendnum, + int* recvnum, + int* firstrecv, + int** sendlist, + int* sendproc, + int* recvproc, + void* world) : inum(inum_), ilist(ilist_), numneigh(numneigh_), @@ -850,15 +850,15 @@ struct InputNlist { world)) {}; ~InputNlist() { DP_DeleteNlist(nl); }; /// @brief C API neighbor list. - DP_Nlist *nl; + DP_Nlist* nl; /// @brief Number of core region atoms int inum; /// @brief Array stores the core region atom's index - int *ilist; + int* ilist; /// @brief Array stores the core region atom's neighbor atom number - int *numneigh; + int* numneigh; /// @brief Array stores the core region atom's neighbor index - int **firstneigh; + int** firstneigh; /** * @brief Set mask for this neighbor list. */ @@ -867,7 +867,7 @@ struct InputNlist { * @brief Set mapping for this neighbor list. * @param mapping mapping from all atoms to real atoms, in size nall. */ - void set_mapping(int *mapping) { DP_NlistSetMapping(nl, mapping); }; + void set_mapping(int* mapping) { DP_NlistSetMapping(nl, mapping); }; }; /** @@ -884,8 +884,8 @@ void inline convert_pbtxt_to_pb(std::string fn_pb_txt, std::string fn_pb) { * @param[in] from_nlist 2D int vector. The first axis represents the centeral * atoms and the second axis represents the neighbor atoms. 
*/ -void inline convert_nlist(InputNlist &to_nlist, - std::vector> &from_nlist) { +void inline convert_nlist(InputNlist& to_nlist, + std::vector>& from_nlist) { to_nlist.inum = from_nlist.size(); for (int ii = 0; ii < to_nlist.inum; ++ii) { to_nlist.ilist[ii] = ii; @@ -936,8 +936,8 @@ class DeepBaseModel { * @brief Get the type map (element name of the atom types) of this model. * @param[out] type_map The type map of this model. **/ - void get_type_map(std::string &type_map) { - const char *type_map_c = DP_DeepBaseModelGetTypeMap(dpbase); + void get_type_map(std::string& type_map) { + const char* type_map_c = DP_DeepBaseModelGetTypeMap(dpbase); type_map.assign(type_map_c); DP_DeleteChar(type_map_c); }; @@ -946,7 +946,7 @@ class DeepBaseModel { * information. * @param[in] pre The prefix to each line. */ - void print_summary(const std::string &pre) const { + void print_summary(const std::string& pre) const { DP_PrintSummary(pre.c_str()); } /** @@ -967,15 +967,15 @@ class DeepBaseModel { } protected: - DP_DeepBaseModel *dpbase; + DP_DeepBaseModel* dpbase; int dfparam; int daparam; bool aparam_nall; template - void validate_fparam_aparam(const int &nframes, - const int &nloc, - const std::vector &fparam, - const std::vector &aparam) const { + void validate_fparam_aparam(const int& nframes, + const int& nloc, + const std::vector& fparam, + const std::vector& aparam) const { if (fparam.size() != dfparam && fparam.size() != static_cast(nframes) * dfparam) { throw deepmd::hpp::deepmd_exception( @@ -991,10 +991,10 @@ class DeepBaseModel { } } template - void tile_fparam_aparam(std::vector &out_param, - const int &nframes, - const int &dparam, - const std::vector ¶m) const { + void tile_fparam_aparam(std::vector& out_param, + const int& nframes, + const int& dparam, + const std::vector& param) const { if (param.size() == dparam) { out_param.resize(static_cast(nframes) * dparam); for (int ii = 0; ii < nframes; ++ii) { @@ -1023,9 +1023,9 @@ class DeepPot : public DeepBaseModel 
{ * @param[in] gpu_rank The GPU rank. * @param[in] file_content The content of the frozen model file. **/ - DeepPot(const std::string &model, - const int &gpu_rank = 0, - const std::string &file_content = "") + DeepPot(const std::string& model, + const int& gpu_rank = 0, + const std::string& file_content = "") : dp(nullptr) { try { init(model, gpu_rank, file_content); @@ -1043,9 +1043,9 @@ class DeepPot : public DeepBaseModel { * @param[in] gpu_rank The GPU rank. * @param[in] file_content The content of the frozen model file. **/ - void init(const std::string &model, - const int &gpu_rank = 0, - const std::string &file_content = "") { + void init(const std::string& model, + const int& gpu_rank = 0, + const std::string& file_content = "") { if (dp) { std::cerr << "WARNING: deepmd-kit should not be initialized twice, do " "nothing at the second call of initializer" @@ -1058,7 +1058,7 @@ class DeepPot : public DeepBaseModel { dfparam = DP_DeepPotGetDimFParam(dp); daparam = DP_DeepPotGetDimAParam(dp); aparam_nall = DP_DeepPotIsAParamNAll(dp); - dpbase = (DP_DeepBaseModel *)dp; + dpbase = (DP_DeepBaseModel*)dp; }; /** @@ -1083,34 +1083,34 @@ class DeepPot : public DeepBaseModel { **/ template void compute( - ENERGYVTYPE &ener, - std::vector &force, - std::vector &virial, - const std::vector &coord, - const std::vector &atype, - const std::vector &box, - const std::vector &fparam = std::vector(), - const std::vector &aparam = std::vector()) { + ENERGYVTYPE& ener, + std::vector& force, + std::vector& virial, + const std::vector& coord, + const std::vector& atype, + const std::vector& box, + const std::vector& fparam = std::vector(), + const std::vector& aparam = std::vector()) { unsigned int natoms = atype.size(); unsigned int nframes = natoms > 0 ? coord.size() / natoms / 3 : 1; assert(nframes * natoms * 3 == coord.size()); if (!box.empty()) { assert(box.size() == nframes * 9); } - const VALUETYPE *coord_ = &coord[0]; - const VALUETYPE *box_ = !box.empty() ? 
&box[0] : nullptr; - const int *atype_ = &atype[0]; - double *ener_ = _DP_Get_Energy_Pointer(ener, nframes); + const VALUETYPE* coord_ = &coord[0]; + const VALUETYPE* box_ = !box.empty() ? &box[0] : nullptr; + const int* atype_ = &atype[0]; + double* ener_ = _DP_Get_Energy_Pointer(ener, nframes); force.resize(static_cast(nframes) * natoms * 3); virial.resize(static_cast(nframes) * 9); - VALUETYPE *force_ = &force[0]; - VALUETYPE *virial_ = &virial[0]; + VALUETYPE* force_ = &force[0]; + VALUETYPE* virial_ = &virial[0]; std::vector fparam_, aparam_; validate_fparam_aparam(nframes, natoms, fparam, aparam); tile_fparam_aparam(fparam_, nframes, dfparam, fparam); tile_fparam_aparam(aparam_, nframes, natoms * daparam, aparam); - const VALUETYPE *fparam__ = !fparam_.empty() ? &fparam_[0] : nullptr; - const VALUETYPE *aparam__ = !aparam_.empty() ? &aparam_[0] : nullptr; + const VALUETYPE* fparam__ = !fparam_.empty() ? &fparam_[0] : nullptr; + const VALUETYPE* aparam__ = !aparam_.empty() ? &aparam_[0] : nullptr; _DP_DeepPotCompute(dp, nframes, natoms, coord_, atype_, box_, fparam__, aparam__, ener_, force_, virial_, @@ -1142,41 +1142,41 @@ class DeepPot : public DeepBaseModel { **/ template void compute( - ENERGYVTYPE &ener, - std::vector &force, - std::vector &virial, - std::vector &atom_energy, - std::vector &atom_virial, - const std::vector &coord, - const std::vector &atype, - const std::vector &box, - const std::vector &fparam = std::vector(), - const std::vector &aparam = std::vector()) { + ENERGYVTYPE& ener, + std::vector& force, + std::vector& virial, + std::vector& atom_energy, + std::vector& atom_virial, + const std::vector& coord, + const std::vector& atype, + const std::vector& box, + const std::vector& fparam = std::vector(), + const std::vector& aparam = std::vector()) { unsigned int natoms = atype.size(); unsigned int nframes = natoms > 0 ? 
coord.size() / natoms / 3 : 1; assert(nframes * natoms * 3 == coord.size()); if (!box.empty()) { assert(box.size() == nframes * 9); } - const VALUETYPE *coord_ = &coord[0]; - const VALUETYPE *box_ = !box.empty() ? &box[0] : nullptr; - const int *atype_ = &atype[0]; + const VALUETYPE* coord_ = &coord[0]; + const VALUETYPE* box_ = !box.empty() ? &box[0] : nullptr; + const int* atype_ = &atype[0]; - double *ener_ = _DP_Get_Energy_Pointer(ener, nframes); + double* ener_ = _DP_Get_Energy_Pointer(ener, nframes); force.resize(static_cast(nframes) * natoms * 3); virial.resize(static_cast(nframes) * 9); atom_energy.resize(static_cast(nframes) * natoms); atom_virial.resize(static_cast(nframes) * natoms * 9); - VALUETYPE *force_ = &force[0]; - VALUETYPE *virial_ = &virial[0]; - VALUETYPE *atomic_ener_ = &atom_energy[0]; - VALUETYPE *atomic_virial_ = &atom_virial[0]; + VALUETYPE* force_ = &force[0]; + VALUETYPE* virial_ = &virial[0]; + VALUETYPE* atomic_ener_ = &atom_energy[0]; + VALUETYPE* atomic_virial_ = &atom_virial[0]; std::vector fparam_, aparam_; validate_fparam_aparam(nframes, natoms, fparam, aparam); tile_fparam_aparam(fparam_, nframes, dfparam, fparam); tile_fparam_aparam(aparam_, nframes, natoms * daparam, aparam); - const VALUETYPE *fparam__ = !fparam_.empty() ? &fparam_[0] : nullptr; - const VALUETYPE *aparam__ = !aparam_.empty() ? &aparam_[0] : nullptr; + const VALUETYPE* fparam__ = !fparam_.empty() ? &fparam_[0] : nullptr; + const VALUETYPE* aparam__ = !aparam_.empty() ? 
&aparam_[0] : nullptr; _DP_DeepPotCompute(dp, nframes, natoms, coord_, atype_, box_, fparam__, aparam__, ener_, force_, virial_, @@ -1210,31 +1210,31 @@ class DeepPot : public DeepBaseModel { **/ template void compute( - ENERGYVTYPE &ener, - std::vector &force, - std::vector &virial, - const std::vector &coord, - const std::vector &atype, - const std::vector &box, + ENERGYVTYPE& ener, + std::vector& force, + std::vector& virial, + const std::vector& coord, + const std::vector& atype, + const std::vector& box, const int nghost, - const InputNlist &lmp_list, - const int &ago, - const std::vector &fparam = std::vector(), - const std::vector &aparam = std::vector()) { + const InputNlist& lmp_list, + const int& ago, + const std::vector& fparam = std::vector(), + const std::vector& aparam = std::vector()) { unsigned int natoms = atype.size(); unsigned int nframes = natoms > 0 ? coord.size() / natoms / 3 : 1; assert(nframes * natoms * 3 == coord.size()); if (!box.empty()) { assert(box.size() == nframes * 9); } - const VALUETYPE *coord_ = &coord[0]; - const VALUETYPE *box_ = !box.empty() ? &box[0] : nullptr; - const int *atype_ = &atype[0]; - double *ener_ = _DP_Get_Energy_Pointer(ener, nframes); + const VALUETYPE* coord_ = &coord[0]; + const VALUETYPE* box_ = !box.empty() ? &box[0] : nullptr; + const int* atype_ = &atype[0]; + double* ener_ = _DP_Get_Energy_Pointer(ener, nframes); force.resize(static_cast(nframes) * natoms * 3); virial.resize(static_cast(nframes) * 9); - VALUETYPE *force_ = &force[0]; - VALUETYPE *virial_ = &virial[0]; + VALUETYPE* force_ = &force[0]; + VALUETYPE* virial_ = &virial[0]; std::vector fparam_, aparam_; validate_fparam_aparam(nframes, (aparam_nall ? natoms : (natoms - nghost)), fparam, aparam); @@ -1242,8 +1242,8 @@ class DeepPot : public DeepBaseModel { tile_fparam_aparam(aparam_, nframes, (aparam_nall ? natoms : (natoms - nghost)) * daparam, aparam); - const VALUETYPE *fparam__ = !fparam_.empty() ? 
&fparam_[0] : nullptr; - const VALUETYPE *aparam__ = !aparam_.empty() ? &aparam_[0] : nullptr; + const VALUETYPE* fparam__ = !fparam_.empty() ? &fparam_[0] : nullptr; + const VALUETYPE* aparam__ = !aparam_.empty() ? &aparam_[0] : nullptr; _DP_DeepPotComputeNList( dp, nframes, natoms, coord_, atype_, box_, nghost, lmp_list.nl, ago, @@ -1278,38 +1278,38 @@ class DeepPot : public DeepBaseModel { **/ template void compute( - ENERGYVTYPE &ener, - std::vector &force, - std::vector &virial, - std::vector &atom_energy, - std::vector &atom_virial, - const std::vector &coord, - const std::vector &atype, - const std::vector &box, + ENERGYVTYPE& ener, + std::vector& force, + std::vector& virial, + std::vector& atom_energy, + std::vector& atom_virial, + const std::vector& coord, + const std::vector& atype, + const std::vector& box, const int nghost, - const InputNlist &lmp_list, - const int &ago, - const std::vector &fparam = std::vector(), - const std::vector &aparam = std::vector()) { + const InputNlist& lmp_list, + const int& ago, + const std::vector& fparam = std::vector(), + const std::vector& aparam = std::vector()) { unsigned int natoms = atype.size(); unsigned int nframes = natoms > 0 ? coord.size() / natoms / 3 : 1; assert(nframes * natoms * 3 == coord.size()); if (!box.empty()) { assert(box.size() == nframes * 9); } - const VALUETYPE *coord_ = &coord[0]; - const VALUETYPE *box_ = !box.empty() ? &box[0] : nullptr; - const int *atype_ = &atype[0]; + const VALUETYPE* coord_ = &coord[0]; + const VALUETYPE* box_ = !box.empty() ? 
&box[0] : nullptr; + const int* atype_ = &atype[0]; - double *ener_ = _DP_Get_Energy_Pointer(ener, nframes); + double* ener_ = _DP_Get_Energy_Pointer(ener, nframes); force.resize(static_cast(nframes) * natoms * 3); virial.resize(static_cast(nframes) * 9); atom_energy.resize(static_cast(nframes) * natoms); atom_virial.resize(static_cast(nframes) * natoms * 9); - VALUETYPE *force_ = &force[0]; - VALUETYPE *virial_ = &virial[0]; - VALUETYPE *atomic_ener_ = &atom_energy[0]; - VALUETYPE *atomic_virial_ = &atom_virial[0]; + VALUETYPE* force_ = &force[0]; + VALUETYPE* virial_ = &virial[0]; + VALUETYPE* atomic_ener_ = &atom_energy[0]; + VALUETYPE* atomic_virial_ = &atom_virial[0]; std::vector fparam_, aparam_; validate_fparam_aparam(nframes, (aparam_nall ? natoms : (natoms - nghost)), fparam, aparam); @@ -1317,8 +1317,8 @@ class DeepPot : public DeepBaseModel { tile_fparam_aparam(aparam_, nframes, (aparam_nall ? natoms : (natoms - nghost)) * daparam, aparam); - const VALUETYPE *fparam__ = !fparam_.empty() ? &fparam_[0] : nullptr; - const VALUETYPE *aparam__ = !aparam_.empty() ? &aparam_[0] : nullptr; + const VALUETYPE* fparam__ = !fparam_.empty() ? &fparam_[0] : nullptr; + const VALUETYPE* aparam__ = !aparam_.empty() ? 
&aparam_[0] : nullptr; _DP_DeepPotComputeNList(dp, nframes, natoms, coord_, atype_, box_, nghost, lmp_list.nl, ago, fparam__, @@ -1349,34 +1349,34 @@ class DeepPot : public DeepBaseModel { **/ template void compute_mixed_type( - ENERGYVTYPE &ener, - std::vector &force, - std::vector &virial, - const int &nframes, - const std::vector &coord, - const std::vector &atype, - const std::vector &box, - const std::vector &fparam = std::vector(), - const std::vector &aparam = std::vector()) { + ENERGYVTYPE& ener, + std::vector& force, + std::vector& virial, + const int& nframes, + const std::vector& coord, + const std::vector& atype, + const std::vector& box, + const std::vector& fparam = std::vector(), + const std::vector& aparam = std::vector()) { unsigned int natoms = atype.size() / nframes; assert(nframes * natoms * 3 == coord.size()); if (!box.empty()) { assert(box.size() == nframes * 9); } - const VALUETYPE *coord_ = &coord[0]; - const VALUETYPE *box_ = !box.empty() ? &box[0] : nullptr; - const int *atype_ = &atype[0]; - double *ener_ = _DP_Get_Energy_Pointer(ener, nframes); + const VALUETYPE* coord_ = &coord[0]; + const VALUETYPE* box_ = !box.empty() ? &box[0] : nullptr; + const int* atype_ = &atype[0]; + double* ener_ = _DP_Get_Energy_Pointer(ener, nframes); force.resize(static_cast(nframes) * natoms * 3); virial.resize(static_cast(nframes) * 9); - VALUETYPE *force_ = &force[0]; - VALUETYPE *virial_ = &virial[0]; + VALUETYPE* force_ = &force[0]; + VALUETYPE* virial_ = &virial[0]; std::vector fparam_, aparam_; validate_fparam_aparam(nframes, natoms, fparam, aparam); tile_fparam_aparam(fparam_, nframes, dfparam, fparam); tile_fparam_aparam(aparam_, nframes, natoms * daparam, aparam); - const VALUETYPE *fparam__ = !fparam_.empty() ? &fparam_[0] : nullptr; - const VALUETYPE *aparam__ = !aparam_.empty() ? &aparam_[0] : nullptr; + const VALUETYPE* fparam__ = !fparam_.empty() ? &fparam_[0] : nullptr; + const VALUETYPE* aparam__ = !aparam_.empty() ? 
&aparam_[0] : nullptr; _DP_DeepPotComputeMixedType(dp, nframes, natoms, coord_, atype_, box_, fparam__, aparam__, ener_, @@ -1408,41 +1408,41 @@ class DeepPot : public DeepBaseModel { **/ template void compute_mixed_type( - ENERGYVTYPE &ener, - std::vector &force, - std::vector &virial, - std::vector &atom_energy, - std::vector &atom_virial, - const int &nframes, - const std::vector &coord, - const std::vector &atype, - const std::vector &box, - const std::vector &fparam = std::vector(), - const std::vector &aparam = std::vector()) { + ENERGYVTYPE& ener, + std::vector& force, + std::vector& virial, + std::vector& atom_energy, + std::vector& atom_virial, + const int& nframes, + const std::vector& coord, + const std::vector& atype, + const std::vector& box, + const std::vector& fparam = std::vector(), + const std::vector& aparam = std::vector()) { unsigned int natoms = atype.size() / nframes; assert(nframes * natoms * 3 == coord.size()); if (!box.empty()) { assert(box.size() == nframes * 9); } - const VALUETYPE *coord_ = &coord[0]; - const VALUETYPE *box_ = !box.empty() ? &box[0] : nullptr; - const int *atype_ = &atype[0]; + const VALUETYPE* coord_ = &coord[0]; + const VALUETYPE* box_ = !box.empty() ? 
&box[0] : nullptr; + const int* atype_ = &atype[0]; - double *ener_ = _DP_Get_Energy_Pointer(ener, nframes); + double* ener_ = _DP_Get_Energy_Pointer(ener, nframes); force.resize(static_cast(nframes) * natoms * 3); virial.resize(static_cast(nframes) * 9); atom_energy.resize(static_cast(nframes) * natoms); atom_virial.resize(static_cast(nframes) * natoms * 9); - VALUETYPE *force_ = &force[0]; - VALUETYPE *virial_ = &virial[0]; - VALUETYPE *atomic_ener_ = &atom_energy[0]; - VALUETYPE *atomic_virial_ = &atom_virial[0]; + VALUETYPE* force_ = &force[0]; + VALUETYPE* virial_ = &virial[0]; + VALUETYPE* atomic_ener_ = &atom_energy[0]; + VALUETYPE* atomic_virial_ = &atom_virial[0]; std::vector fparam_, aparam_; validate_fparam_aparam(nframes, natoms, fparam, aparam); tile_fparam_aparam(fparam_, nframes, dfparam, fparam); tile_fparam_aparam(aparam_, nframes, natoms * daparam, aparam); - const VALUETYPE *fparam__ = !fparam_.empty() ? &fparam_[0] : nullptr; - const VALUETYPE *aparam__ = !aparam_.empty() ? &aparam_[0] : nullptr; + const VALUETYPE* fparam__ = !fparam_.empty() ? &fparam_[0] : nullptr; + const VALUETYPE* aparam__ = !aparam_.empty() ? &aparam_[0] : nullptr; _DP_DeepPotComputeMixedType( dp, nframes, natoms, coord_, atype_, box_, fparam__, aparam__, ener_, @@ -1451,7 +1451,7 @@ class DeepPot : public DeepBaseModel { }; private: - DP_DeepPot *dp; + DP_DeepPot* dp; }; class DeepSpin : public DeepBaseModel { @@ -1467,9 +1467,9 @@ class DeepSpin : public DeepBaseModel { * @param[in] gpu_rank The GPU rank. * @param[in] file_content The content of the frozen model file. **/ - DeepSpin(const std::string &model, - const int &gpu_rank = 0, - const std::string &file_content = "") + DeepSpin(const std::string& model, + const int& gpu_rank = 0, + const std::string& file_content = "") : dp(nullptr) { try { init(model, gpu_rank, file_content); @@ -1487,9 +1487,9 @@ class DeepSpin : public DeepBaseModel { * @param[in] gpu_rank The GPU rank. 
* @param[in] file_content The content of the frozen model file. **/ - void init(const std::string &model, - const int &gpu_rank = 0, - const std::string &file_content = "") { + void init(const std::string& model, + const int& gpu_rank = 0, + const std::string& file_content = "") { if (dp) { std::cerr << "WARNING: deepmd-kit should not be initialized twice, do " "nothing at the second call of initializer" @@ -1502,7 +1502,7 @@ class DeepSpin : public DeepBaseModel { dfparam = DP_DeepSpinGetDimFParam(dp); daparam = DP_DeepSpinGetDimAParam(dp); aparam_nall = DP_DeepSpinIsAParamNAll(dp); - dpbase = (DP_DeepBaseModel *)dp; + dpbase = (DP_DeepBaseModel*)dp; }; /** @@ -1531,39 +1531,39 @@ class DeepSpin : public DeepBaseModel { **/ template void compute( - ENERGYVTYPE &ener, - std::vector &force, - std::vector &force_mag, - std::vector &virial, - const std::vector &coord, - const std::vector &spin, - const std::vector &atype, - const std::vector &box, - const std::vector &fparam = std::vector(), - const std::vector &aparam = std::vector()) { + ENERGYVTYPE& ener, + std::vector& force, + std::vector& force_mag, + std::vector& virial, + const std::vector& coord, + const std::vector& spin, + const std::vector& atype, + const std::vector& box, + const std::vector& fparam = std::vector(), + const std::vector& aparam = std::vector()) { unsigned int natoms = atype.size(); unsigned int nframes = natoms > 0 ? coord.size() / natoms / 3 : 1; assert(nframes * natoms * 3 == coord.size()); if (!box.empty()) { assert(box.size() == nframes * 9); } - const VALUETYPE *coord_ = &coord[0]; - const VALUETYPE *spin_ = &spin[0]; - const VALUETYPE *box_ = !box.empty() ? &box[0] : nullptr; - const int *atype_ = &atype[0]; - double *ener_ = _DP_Get_Energy_Pointer(ener, nframes); + const VALUETYPE* coord_ = &coord[0]; + const VALUETYPE* spin_ = &spin[0]; + const VALUETYPE* box_ = !box.empty() ? 
&box[0] : nullptr; + const int* atype_ = &atype[0]; + double* ener_ = _DP_Get_Energy_Pointer(ener, nframes); force.resize(static_cast(nframes) * natoms * 3); force_mag.resize(static_cast(nframes) * natoms * 3); virial.resize(static_cast(nframes) * 9); - VALUETYPE *force_ = &force[0]; - VALUETYPE *force_mag_ = &force_mag[0]; - VALUETYPE *virial_ = &virial[0]; + VALUETYPE* force_ = &force[0]; + VALUETYPE* force_mag_ = &force_mag[0]; + VALUETYPE* virial_ = &virial[0]; std::vector fparam_, aparam_; validate_fparam_aparam(nframes, natoms, fparam, aparam); tile_fparam_aparam(fparam_, nframes, dfparam, fparam); tile_fparam_aparam(aparam_, nframes, natoms * daparam, aparam); - const VALUETYPE *fparam__ = !fparam_.empty() ? &fparam_[0] : nullptr; - const VALUETYPE *aparam__ = !aparam_.empty() ? &aparam_[0] : nullptr; + const VALUETYPE* fparam__ = !fparam_.empty() ? &fparam_[0] : nullptr; + const VALUETYPE* aparam__ = !aparam_.empty() ? &aparam_[0] : nullptr; _DP_DeepSpinCompute(dp, nframes, natoms, coord_, spin_, atype_, box_, fparam__, aparam__, ener_, force_, @@ -1599,46 +1599,46 @@ class DeepSpin : public DeepBaseModel { **/ template void compute( - ENERGYVTYPE &ener, - std::vector &force, - std::vector &force_mag, - std::vector &virial, - std::vector &atom_energy, - std::vector &atom_virial, - const std::vector &coord, - const std::vector &spin, - const std::vector &atype, - const std::vector &box, - const std::vector &fparam = std::vector(), - const std::vector &aparam = std::vector()) { + ENERGYVTYPE& ener, + std::vector& force, + std::vector& force_mag, + std::vector& virial, + std::vector& atom_energy, + std::vector& atom_virial, + const std::vector& coord, + const std::vector& spin, + const std::vector& atype, + const std::vector& box, + const std::vector& fparam = std::vector(), + const std::vector& aparam = std::vector()) { unsigned int natoms = atype.size(); unsigned int nframes = natoms > 0 ? 
coord.size() / natoms / 3 : 1; assert(nframes * natoms * 3 == coord.size()); if (!box.empty()) { assert(box.size() == nframes * 9); } - const VALUETYPE *coord_ = &coord[0]; - const VALUETYPE *spin_ = &spin[0]; - const VALUETYPE *box_ = !box.empty() ? &box[0] : nullptr; - const int *atype_ = &atype[0]; + const VALUETYPE* coord_ = &coord[0]; + const VALUETYPE* spin_ = &spin[0]; + const VALUETYPE* box_ = !box.empty() ? &box[0] : nullptr; + const int* atype_ = &atype[0]; - double *ener_ = _DP_Get_Energy_Pointer(ener, nframes); + double* ener_ = _DP_Get_Energy_Pointer(ener, nframes); force.resize(static_cast(nframes) * natoms * 3); force_mag.resize(static_cast(nframes) * natoms * 3); virial.resize(static_cast(nframes) * 9); atom_energy.resize(static_cast(nframes) * natoms); atom_virial.resize(static_cast(nframes) * natoms * 9); - VALUETYPE *force_ = &force[0]; - VALUETYPE *force_mag_ = &force_mag[0]; - VALUETYPE *virial_ = &virial[0]; - VALUETYPE *atomic_ener_ = &atom_energy[0]; - VALUETYPE *atomic_virial_ = &atom_virial[0]; + VALUETYPE* force_ = &force[0]; + VALUETYPE* force_mag_ = &force_mag[0]; + VALUETYPE* virial_ = &virial[0]; + VALUETYPE* atomic_ener_ = &atom_energy[0]; + VALUETYPE* atomic_virial_ = &atom_virial[0]; std::vector fparam_, aparam_; validate_fparam_aparam(nframes, natoms, fparam, aparam); tile_fparam_aparam(fparam_, nframes, dfparam, fparam); tile_fparam_aparam(aparam_, nframes, natoms * daparam, aparam); - const VALUETYPE *fparam__ = !fparam_.empty() ? &fparam_[0] : nullptr; - const VALUETYPE *aparam__ = !aparam_.empty() ? &aparam_[0] : nullptr; + const VALUETYPE* fparam__ = !fparam_.empty() ? &fparam_[0] : nullptr; + const VALUETYPE* aparam__ = !aparam_.empty() ? 
&aparam_[0] : nullptr; _DP_DeepSpinCompute( dp, nframes, natoms, coord_, spin_, atype_, box_, fparam__, aparam__, @@ -1675,36 +1675,36 @@ class DeepSpin : public DeepBaseModel { **/ template void compute( - ENERGYVTYPE &ener, - std::vector &force, - std::vector &force_mag, - std::vector &virial, - const std::vector &coord, - const std::vector &spin, - const std::vector &atype, - const std::vector &box, + ENERGYVTYPE& ener, + std::vector& force, + std::vector& force_mag, + std::vector& virial, + const std::vector& coord, + const std::vector& spin, + const std::vector& atype, + const std::vector& box, const int nghost, - const InputNlist &lmp_list, - const int &ago, - const std::vector &fparam = std::vector(), - const std::vector &aparam = std::vector()) { + const InputNlist& lmp_list, + const int& ago, + const std::vector& fparam = std::vector(), + const std::vector& aparam = std::vector()) { unsigned int natoms = atype.size(); unsigned int nframes = natoms > 0 ? coord.size() / natoms / 3 : 1; assert(nframes * natoms * 3 == coord.size()); if (!box.empty()) { assert(box.size() == nframes * 9); } - const VALUETYPE *coord_ = &coord[0]; - const VALUETYPE *spin_ = &spin[0]; - const VALUETYPE *box_ = !box.empty() ? &box[0] : nullptr; - const int *atype_ = &atype[0]; - double *ener_ = _DP_Get_Energy_Pointer(ener, nframes); + const VALUETYPE* coord_ = &coord[0]; + const VALUETYPE* spin_ = &spin[0]; + const VALUETYPE* box_ = !box.empty() ? 
&box[0] : nullptr; + const int* atype_ = &atype[0]; + double* ener_ = _DP_Get_Energy_Pointer(ener, nframes); force.resize(static_cast(nframes) * natoms * 3); force_mag.resize(static_cast(nframes) * natoms * 3); virial.resize(static_cast(nframes) * 9); - VALUETYPE *force_ = &force[0]; - VALUETYPE *force_mag_ = &force_mag[0]; - VALUETYPE *virial_ = &virial[0]; + VALUETYPE* force_ = &force[0]; + VALUETYPE* force_mag_ = &force_mag[0]; + VALUETYPE* virial_ = &virial[0]; std::vector fparam_, aparam_; validate_fparam_aparam(nframes, (aparam_nall ? natoms : (natoms - nghost)), fparam, aparam); @@ -1712,8 +1712,8 @@ class DeepSpin : public DeepBaseModel { tile_fparam_aparam(aparam_, nframes, (aparam_nall ? natoms : (natoms - nghost)) * daparam, aparam); - const VALUETYPE *fparam__ = !fparam_.empty() ? &fparam_[0] : nullptr; - const VALUETYPE *aparam__ = !aparam_.empty() ? &aparam_[0] : nullptr; + const VALUETYPE* fparam__ = !fparam_.empty() ? &fparam_[0] : nullptr; + const VALUETYPE* aparam__ = !aparam_.empty() ? 
&aparam_[0] : nullptr; _DP_DeepSpinComputeNList(dp, nframes, natoms, coord_, spin_, atype_, box_, nghost, lmp_list.nl, ago, fparam__, aparam__, ener_, force_, @@ -1752,42 +1752,42 @@ class DeepSpin : public DeepBaseModel { **/ template void compute( - ENERGYVTYPE &ener, - std::vector &force, - std::vector &force_mag, - std::vector &virial, - std::vector &atom_energy, - std::vector &atom_virial, - const std::vector &coord, - const std::vector &spin, - const std::vector &atype, - const std::vector &box, + ENERGYVTYPE& ener, + std::vector& force, + std::vector& force_mag, + std::vector& virial, + std::vector& atom_energy, + std::vector& atom_virial, + const std::vector& coord, + const std::vector& spin, + const std::vector& atype, + const std::vector& box, const int nghost, - const InputNlist &lmp_list, - const int &ago, - const std::vector &fparam = std::vector(), - const std::vector &aparam = std::vector()) { + const InputNlist& lmp_list, + const int& ago, + const std::vector& fparam = std::vector(), + const std::vector& aparam = std::vector()) { unsigned int natoms = atype.size(); unsigned int nframes = natoms > 0 ? coord.size() / natoms / 3 : 1; assert(nframes * natoms * 3 == coord.size()); if (!box.empty()) { assert(box.size() == nframes * 9); } - const VALUETYPE *coord_ = &coord[0]; - const VALUETYPE *spin_ = &spin[0]; - const VALUETYPE *box_ = !box.empty() ? &box[0] : nullptr; - const int *atype_ = &atype[0]; - double *ener_ = _DP_Get_Energy_Pointer(ener, nframes); + const VALUETYPE* coord_ = &coord[0]; + const VALUETYPE* spin_ = &spin[0]; + const VALUETYPE* box_ = !box.empty() ? 
&box[0] : nullptr; + const int* atype_ = &atype[0]; + double* ener_ = _DP_Get_Energy_Pointer(ener, nframes); force.resize(static_cast(nframes) * natoms * 3); force_mag.resize(static_cast(nframes) * natoms * 3); virial.resize(static_cast(nframes) * 9); atom_energy.resize(static_cast(nframes) * natoms); atom_virial.resize(static_cast(nframes) * natoms * 9); - VALUETYPE *force_ = &force[0]; - VALUETYPE *force_mag_ = &force_mag[0]; - VALUETYPE *virial_ = &virial[0]; - VALUETYPE *atomic_ener_ = &atom_energy[0]; - VALUETYPE *atomic_virial_ = &atom_virial[0]; + VALUETYPE* force_ = &force[0]; + VALUETYPE* force_mag_ = &force_mag[0]; + VALUETYPE* virial_ = &virial[0]; + VALUETYPE* atomic_ener_ = &atom_energy[0]; + VALUETYPE* atomic_virial_ = &atom_virial[0]; std::vector fparam_, aparam_; validate_fparam_aparam(nframes, (aparam_nall ? natoms : (natoms - nghost)), fparam, aparam); @@ -1795,8 +1795,8 @@ class DeepSpin : public DeepBaseModel { tile_fparam_aparam(aparam_, nframes, (aparam_nall ? natoms : (natoms - nghost)) * daparam, aparam); - const VALUETYPE *fparam__ = !fparam_.empty() ? &fparam_[0] : nullptr; - const VALUETYPE *aparam__ = !aparam_.empty() ? &aparam_[0] : nullptr; + const VALUETYPE* fparam__ = !fparam_.empty() ? &fparam_[0] : nullptr; + const VALUETYPE* aparam__ = !aparam_.empty() ? &aparam_[0] : nullptr; _DP_DeepSpinComputeNList( dp, nframes, natoms, coord_, spin_, atype_, box_, nghost, lmp_list.nl, ago, fparam__, aparam__, ener_, force_, force_mag_, virial_, @@ -1805,7 +1805,7 @@ class DeepSpin : public DeepBaseModel { }; private: - DP_DeepSpin *dp; + DP_DeepSpin* dp; }; /** @@ -1865,8 +1865,8 @@ class DeepBaseModelDevi { * @param[in] xx The vectors of all models. 
**/ template - void compute_avg(std::vector &avg, - const std::vector> &xx) { + void compute_avg(std::vector& avg, + const std::vector>& xx) { assert(xx.size() == numb_models); if (numb_models == 0) { return; @@ -1893,10 +1893,10 @@ class DeepBaseModelDevi { * @param[in] stride The stride to compute the deviation. **/ template - void compute_std(std::vector &std, - const std::vector &avg, - const std::vector> &xx, - const int &stride) { + void compute_std(std::vector& std, + const std::vector& avg, + const std::vector>& xx, + const int& stride) { assert(xx.size() == numb_models); if (numb_models == 0) { return; @@ -1911,8 +1911,8 @@ class DeepBaseModelDevi { for (unsigned ii = 0; ii < numb_models; ++ii) { for (unsigned jj = 0; jj < nloc; ++jj) { - const VALUETYPE *tmp_f = &(xx[ii][static_cast(jj) * stride]); - const VALUETYPE *tmp_avg = &(avg[static_cast(jj) * stride]); + const VALUETYPE* tmp_f = &(xx[ii][static_cast(jj) * stride]); + const VALUETYPE* tmp_avg = &(avg[static_cast(jj) * stride]); for (unsigned dd = 0; dd < stride; ++dd) { VALUETYPE vdiff = tmp_f[dd] - tmp_avg[dd]; std[jj] += vdiff * vdiff; @@ -1932,16 +1932,16 @@ class DeepBaseModelDevi { * @param[in] stride The stride to compute the deviation. **/ template - void compute_relative_std(std::vector &std, - const std::vector &avg, + void compute_relative_std(std::vector& std, + const std::vector& avg, const VALUETYPE eps, - const int &stride) { + const int& stride) { unsigned ndof = avg.size(); unsigned nloc = std.size(); assert(nloc * stride == ndof); for (unsigned ii = 0; ii < nloc; ++ii) { - const VALUETYPE *tmp_avg = &(avg[static_cast(ii) * stride]); + const VALUETYPE* tmp_avg = &(avg[static_cast(ii) * stride]); VALUETYPE f_norm = 0.0; for (unsigned dd = 0; dd < stride; ++dd) { f_norm += tmp_avg[dd] * tmp_avg[dd]; @@ -1957,9 +1957,9 @@ class DeepBaseModelDevi { * @param[in] xx The vectors of all forces. 
**/ template - void compute_std_f(std::vector &std, - const std::vector &avg, - const std::vector> &xx) { + void compute_std_f(std::vector& std, + const std::vector& avg, + const std::vector>& xx) { compute_std(std, avg, xx, 3); }; /** @@ -1969,23 +1969,23 @@ class DeepBaseModelDevi { * @param[in] eps The level parameter for computing the deviation. **/ template - void compute_relative_std_f(std::vector &std, - const std::vector &avg, + void compute_relative_std_f(std::vector& std, + const std::vector& avg, const VALUETYPE eps) { compute_relative_std(std, avg, eps, 3); }; protected: - DP_DeepBaseModelDevi *dpbase; + DP_DeepBaseModelDevi* dpbase; int numb_models; int dfparam; int daparam; bool aparam_nall; template - void validate_fparam_aparam(const int &nframes, - const int &nloc, - const std::vector &fparam, - const std::vector &aparam) const { + void validate_fparam_aparam(const int& nframes, + const int& nloc, + const std::vector& fparam, + const std::vector& aparam) const { if (fparam.size() != dfparam && fparam.size() != static_cast(nframes) * dfparam) { throw deepmd::hpp::deepmd_exception( @@ -2001,10 +2001,10 @@ class DeepBaseModelDevi { } } template - void tile_fparam_aparam(std::vector &out_param, - const int &nframes, - const int &dparam, - const std::vector ¶m) const { + void tile_fparam_aparam(std::vector& out_param, + const int& nframes, + const int& dparam, + const std::vector& param) const { if (param.size() == dparam) { out_param.resize(static_cast(nframes) * dparam); for (int ii = 0; ii < nframes; ++ii) { @@ -2031,7 +2031,7 @@ class DeepPotModelDevi : public DeepBaseModelDevi { * @brief DP model deviation constructor with initialization. * @param[in] models The names of the frozen model file. **/ - DeepPotModelDevi(const std::vector &models) : dp(nullptr) { + DeepPotModelDevi(const std::vector& models) : dp(nullptr) { try { init(models); } catch (...) 
{ @@ -2048,9 +2048,9 @@ class DeepPotModelDevi : public DeepBaseModelDevi { * @param[in] gpu_rank The GPU rank. * @param[in] file_content The content of the frozen model file. **/ - void init(const std::vector &models, - const int &gpu_rank = 0, - const std::vector &file_content = + void init(const std::vector& models, + const int& gpu_rank = 0, + const std::vector& file_content = std::vector()) { if (dp) { std::cerr << "WARNING: deepmd-kit should not be initialized twice, do " @@ -2058,17 +2058,17 @@ class DeepPotModelDevi : public DeepBaseModelDevi { << std::endl; return; } - std::vector cstrings; + std::vector cstrings; cstrings.reserve(models.size()); - for (std::string const &str : models) { + for (std::string const& str : models) { cstrings.push_back(str.data()); } - std::vector c_file_contents; + std::vector c_file_contents; std::vector size_file_contents; c_file_contents.reserve(file_content.size()); size_file_contents.reserve(file_content.size()); - for (std::string const &str : file_content) { + for (std::string const& str : file_content) { c_file_contents.push_back(str.data()); size_file_contents.push_back(str.size()); } @@ -2081,7 +2081,7 @@ class DeepPotModelDevi : public DeepBaseModelDevi { dfparam = DP_DeepPotModelDeviGetDimFParam(dp); daparam = DP_DeepPotModelDeviGetDimAParam(dp); aparam_nall = DP_DeepPotModelDeviIsAParamNAll(dp); - dpbase = (DP_DeepBaseModelDevi *)dp; + dpbase = (DP_DeepBaseModelDevi*)dp; }; /** @@ -2106,23 +2106,23 @@ class DeepPotModelDevi : public DeepBaseModelDevi { **/ template void compute( - std::vector &ener, - std::vector> &force, - std::vector> &virial, - const std::vector &coord, - const std::vector &atype, - const std::vector &box, - const std::vector &fparam = std::vector(), - const std::vector &aparam = std::vector()) { + std::vector& ener, + std::vector>& force, + std::vector>& virial, + const std::vector& coord, + const std::vector& atype, + const std::vector& box, + const std::vector& fparam = std::vector(), + 
const std::vector& aparam = std::vector()) { unsigned int natoms = atype.size(); unsigned int nframes = 1; assert(natoms * 3 == coord.size()); if (!box.empty()) { assert(box.size() == 9); } - const VALUETYPE *coord_ = &coord[0]; - const VALUETYPE *box_ = !box.empty() ? &box[0] : nullptr; - const int *atype_ = &atype[0]; + const VALUETYPE* coord_ = &coord[0]; + const VALUETYPE* box_ = !box.empty() ? &box[0] : nullptr; + const int* atype_ = &atype[0]; // memory will be continuous for std::vector but not // std::vector @@ -2130,15 +2130,15 @@ class DeepPotModelDevi : public DeepBaseModelDevi { std::vector force_flat(static_cast(numb_models) * natoms * 3); std::vector virial_flat(numb_models * 9); - double *ener_ = &energy_flat[0]; - VALUETYPE *force_ = &force_flat[0]; - VALUETYPE *virial_ = &virial_flat[0]; + double* ener_ = &energy_flat[0]; + VALUETYPE* force_ = &force_flat[0]; + VALUETYPE* virial_ = &virial_flat[0]; std::vector fparam_, aparam_; validate_fparam_aparam(nframes, natoms, fparam, aparam); tile_fparam_aparam(fparam_, nframes, dfparam, fparam); tile_fparam_aparam(aparam_, nframes, natoms * daparam, aparam); - const VALUETYPE *fparam__ = !fparam_.empty() ? &fparam_[0] : nullptr; - const VALUETYPE *aparam__ = !aparam_.empty() ? &aparam_[0] : nullptr; + const VALUETYPE* fparam__ = !fparam_.empty() ? &fparam_[0] : nullptr; + const VALUETYPE* aparam__ = !aparam_.empty() ? 
&aparam_[0] : nullptr; _DP_DeepPotModelDeviCompute(dp, natoms, coord_, atype_, box_, fparam__, aparam__, ener_, force_, @@ -2185,25 +2185,25 @@ class DeepPotModelDevi : public DeepBaseModelDevi { **/ template void compute( - std::vector &ener, - std::vector> &force, - std::vector> &virial, - std::vector> &atom_energy, - std::vector> &atom_virial, - const std::vector &coord, - const std::vector &atype, - const std::vector &box, - const std::vector &fparam = std::vector(), - const std::vector &aparam = std::vector()) { + std::vector& ener, + std::vector>& force, + std::vector>& virial, + std::vector>& atom_energy, + std::vector>& atom_virial, + const std::vector& coord, + const std::vector& atype, + const std::vector& box, + const std::vector& fparam = std::vector(), + const std::vector& aparam = std::vector()) { unsigned int natoms = atype.size(); unsigned int nframes = 1; assert(natoms * 3 == coord.size()); if (!box.empty()) { assert(box.size() == 9); } - const VALUETYPE *coord_ = &coord[0]; - const VALUETYPE *box_ = !box.empty() ? &box[0] : nullptr; - const int *atype_ = &atype[0]; + const VALUETYPE* coord_ = &coord[0]; + const VALUETYPE* box_ = !box.empty() ? 
&box[0] : nullptr; + const int* atype_ = &atype[0]; std::vector energy_flat(numb_models); std::vector force_flat(static_cast(numb_models) * @@ -2213,17 +2213,17 @@ class DeepPotModelDevi : public DeepBaseModelDevi { natoms); std::vector atom_virial_flat(static_cast(numb_models) * natoms * 9); - double *ener_ = &energy_flat[0]; - VALUETYPE *force_ = &force_flat[0]; - VALUETYPE *virial_ = &virial_flat[0]; - VALUETYPE *atomic_ener_ = &atom_energy_flat[0]; - VALUETYPE *atomic_virial_ = &atom_virial_flat[0]; + double* ener_ = &energy_flat[0]; + VALUETYPE* force_ = &force_flat[0]; + VALUETYPE* virial_ = &virial_flat[0]; + VALUETYPE* atomic_ener_ = &atom_energy_flat[0]; + VALUETYPE* atomic_virial_ = &atom_virial_flat[0]; std::vector fparam_, aparam_; validate_fparam_aparam(nframes, natoms, fparam, aparam); tile_fparam_aparam(fparam_, nframes, dfparam, fparam); tile_fparam_aparam(aparam_, nframes, natoms * daparam, aparam); - const VALUETYPE *fparam__ = !fparam_.empty() ? &fparam_[0] : nullptr; - const VALUETYPE *aparam__ = !aparam_.empty() ? &aparam_[0] : nullptr; + const VALUETYPE* fparam__ = !fparam_.empty() ? &fparam_[0] : nullptr; + const VALUETYPE* aparam__ = !aparam_.empty() ? 
&aparam_[0] : nullptr; _DP_DeepPotModelDeviCompute( dp, natoms, coord_, atype_, box_, fparam__, aparam__, ener_, force_, @@ -2282,26 +2282,26 @@ class DeepPotModelDevi : public DeepBaseModelDevi { **/ template void compute( - std::vector &ener, - std::vector> &force, - std::vector> &virial, - const std::vector &coord, - const std::vector &atype, - const std::vector &box, + std::vector& ener, + std::vector>& force, + std::vector>& virial, + const std::vector& coord, + const std::vector& atype, + const std::vector& box, const int nghost, - const InputNlist &lmp_list, - const int &ago, - const std::vector &fparam = std::vector(), - const std::vector &aparam = std::vector()) { + const InputNlist& lmp_list, + const int& ago, + const std::vector& fparam = std::vector(), + const std::vector& aparam = std::vector()) { unsigned int natoms = atype.size(); unsigned int nframes = 1; assert(natoms * 3 == coord.size()); if (!box.empty()) { assert(box.size() == 9); } - const VALUETYPE *coord_ = &coord[0]; - const VALUETYPE *box_ = !box.empty() ? &box[0] : nullptr; - const int *atype_ = &atype[0]; + const VALUETYPE* coord_ = &coord[0]; + const VALUETYPE* box_ = !box.empty() ? &box[0] : nullptr; + const int* atype_ = &atype[0]; // memory will be continuous for std::vector but not // std::vector @@ -2309,9 +2309,9 @@ class DeepPotModelDevi : public DeepBaseModelDevi { std::vector force_flat(static_cast(numb_models) * natoms * 3); std::vector virial_flat(numb_models * 9); - double *ener_ = &energy_flat[0]; - VALUETYPE *force_ = &force_flat[0]; - VALUETYPE *virial_ = &virial_flat[0]; + double* ener_ = &energy_flat[0]; + VALUETYPE* force_ = &force_flat[0]; + VALUETYPE* virial_ = &virial_flat[0]; std::vector fparam_, aparam_; validate_fparam_aparam(nframes, (aparam_nall ? natoms : (natoms - nghost)), fparam, aparam); @@ -2319,8 +2319,8 @@ class DeepPotModelDevi : public DeepBaseModelDevi { tile_fparam_aparam(aparam_, nframes, (aparam_nall ? 
natoms : (natoms - nghost)) * daparam, aparam); - const VALUETYPE *fparam__ = !fparam_.empty() ? &fparam_[0] : nullptr; - const VALUETYPE *aparam__ = !aparam_.empty() ? &aparam_[0] : nullptr; + const VALUETYPE* fparam__ = !fparam_.empty() ? &fparam_[0] : nullptr; + const VALUETYPE* aparam__ = !aparam_.empty() ? &aparam_[0] : nullptr; _DP_DeepPotModelDeviComputeNList( dp, natoms, coord_, atype_, box_, nghost, lmp_list.nl, ago, fparam__, @@ -2370,28 +2370,28 @@ class DeepPotModelDevi : public DeepBaseModelDevi { **/ template void compute( - std::vector &ener, - std::vector> &force, - std::vector> &virial, - std::vector> &atom_energy, - std::vector> &atom_virial, - const std::vector &coord, - const std::vector &atype, - const std::vector &box, + std::vector& ener, + std::vector>& force, + std::vector>& virial, + std::vector>& atom_energy, + std::vector>& atom_virial, + const std::vector& coord, + const std::vector& atype, + const std::vector& box, const int nghost, - const InputNlist &lmp_list, - const int &ago, - const std::vector &fparam = std::vector(), - const std::vector &aparam = std::vector()) { + const InputNlist& lmp_list, + const int& ago, + const std::vector& fparam = std::vector(), + const std::vector& aparam = std::vector()) { unsigned int natoms = atype.size(); unsigned int nframes = 1; assert(natoms * 3 == coord.size()); if (!box.empty()) { assert(box.size() == 9); } - const VALUETYPE *coord_ = &coord[0]; - const VALUETYPE *box_ = !box.empty() ? &box[0] : nullptr; - const int *atype_ = &atype[0]; + const VALUETYPE* coord_ = &coord[0]; + const VALUETYPE* box_ = !box.empty() ? 
&box[0] : nullptr; + const int* atype_ = &atype[0]; std::vector energy_flat(numb_models); std::vector force_flat(static_cast(numb_models) * @@ -2401,11 +2401,11 @@ class DeepPotModelDevi : public DeepBaseModelDevi { natoms); std::vector atom_virial_flat(static_cast(numb_models) * natoms * 9); - double *ener_ = &energy_flat[0]; - VALUETYPE *force_ = &force_flat[0]; - VALUETYPE *virial_ = &virial_flat[0]; - VALUETYPE *atomic_ener_ = &atom_energy_flat[0]; - VALUETYPE *atomic_virial_ = &atom_virial_flat[0]; + double* ener_ = &energy_flat[0]; + VALUETYPE* force_ = &force_flat[0]; + VALUETYPE* virial_ = &virial_flat[0]; + VALUETYPE* atomic_ener_ = &atom_energy_flat[0]; + VALUETYPE* atomic_virial_ = &atom_virial_flat[0]; std::vector fparam_, aparam_; validate_fparam_aparam(nframes, (aparam_nall ? natoms : (natoms - nghost)), fparam, aparam); @@ -2413,8 +2413,8 @@ class DeepPotModelDevi : public DeepBaseModelDevi { tile_fparam_aparam(aparam_, nframes, (aparam_nall ? natoms : (natoms - nghost)) * daparam, aparam); - const VALUETYPE *fparam__ = !fparam_.empty() ? &fparam_[0] : nullptr; - const VALUETYPE *aparam__ = !aparam_.empty() ? &aparam_[0] : nullptr; + const VALUETYPE* fparam__ = !fparam_.empty() ? &fparam_[0] : nullptr; + const VALUETYPE* aparam__ = !aparam_.empty() ? &aparam_[0] : nullptr; _DP_DeepPotModelDeviComputeNList( dp, natoms, coord_, atype_, box_, nghost, lmp_list.nl, ago, fparam__, @@ -2449,7 +2449,7 @@ class DeepPotModelDevi : public DeepBaseModelDevi { }; private: - DP_DeepPotModelDevi *dp; + DP_DeepPotModelDevi* dp; }; class DeepSpinModelDevi : public DeepBaseModelDevi { @@ -2463,7 +2463,7 @@ class DeepSpinModelDevi : public DeepBaseModelDevi { * @brief DP model deviation constructor with initialization. * @param[in] models The names of the frozen model file. **/ - DeepSpinModelDevi(const std::vector &models) : dp(nullptr) { + DeepSpinModelDevi(const std::vector& models) : dp(nullptr) { try { init(models); } catch (...) 
{ @@ -2480,9 +2480,9 @@ class DeepSpinModelDevi : public DeepBaseModelDevi { * @param[in] gpu_rank The GPU rank. * @param[in] file_content The content of the frozen model file. **/ - void init(const std::vector &models, - const int &gpu_rank = 0, - const std::vector &file_content = + void init(const std::vector& models, + const int& gpu_rank = 0, + const std::vector& file_content = std::vector()) { if (dp) { std::cerr << "WARNING: deepmd-kit should not be initialized twice, do " @@ -2490,17 +2490,17 @@ class DeepSpinModelDevi : public DeepBaseModelDevi { << std::endl; return; } - std::vector cstrings; + std::vector cstrings; cstrings.reserve(models.size()); - for (std::string const &str : models) { + for (std::string const& str : models) { cstrings.push_back(str.data()); } - std::vector c_file_contents; + std::vector c_file_contents; std::vector size_file_contents; c_file_contents.reserve(file_content.size()); size_file_contents.reserve(file_content.size()); - for (std::string const &str : file_content) { + for (std::string const& str : file_content) { c_file_contents.push_back(str.data()); size_file_contents.push_back(str.size()); } @@ -2513,7 +2513,7 @@ class DeepSpinModelDevi : public DeepBaseModelDevi { dfparam = DP_DeepSpinModelDeviGetDimFParam(dp); daparam = DP_DeepSpinModelDeviGetDimAParam(dp); aparam_nall = DP_DeepSpinModelDeviIsAParamNAll(dp); - dpbase = (DP_DeepBaseModelDevi *)dp; + dpbase = (DP_DeepBaseModelDevi*)dp; }; /** @@ -2541,26 +2541,26 @@ class DeepSpinModelDevi : public DeepBaseModelDevi { **/ template void compute( - std::vector &ener, - std::vector> &force, - std::vector> &force_mag, - std::vector> &virial, - const std::vector &coord, - const std::vector &spin, - const std::vector &atype, - const std::vector &box, - const std::vector &fparam = std::vector(), - const std::vector &aparam = std::vector()) { + std::vector& ener, + std::vector>& force, + std::vector>& force_mag, + std::vector>& virial, + const std::vector& coord, + const 
std::vector& spin, + const std::vector& atype, + const std::vector& box, + const std::vector& fparam = std::vector(), + const std::vector& aparam = std::vector()) { unsigned int natoms = atype.size(); unsigned int nframes = 1; assert(natoms * 3 == coord.size()); if (!box.empty()) { assert(box.size() == 9); } - const VALUETYPE *coord_ = &coord[0]; - const VALUETYPE *spin_ = &spin[0]; - const VALUETYPE *box_ = !box.empty() ? &box[0] : nullptr; - const int *atype_ = &atype[0]; + const VALUETYPE* coord_ = &coord[0]; + const VALUETYPE* spin_ = &spin[0]; + const VALUETYPE* box_ = !box.empty() ? &box[0] : nullptr; + const int* atype_ = &atype[0]; // memory will be continuous for std::vector but not // std::vector @@ -2570,16 +2570,16 @@ class DeepSpinModelDevi : public DeepBaseModelDevi { std::vector force_mag_flat(static_cast(numb_models) * natoms * 3); std::vector virial_flat(numb_models * 9); - double *ener_ = &energy_flat[0]; - VALUETYPE *force_ = &force_flat[0]; - VALUETYPE *force_mag_ = &force_mag_flat[0]; - VALUETYPE *virial_ = &virial_flat[0]; + double* ener_ = &energy_flat[0]; + VALUETYPE* force_ = &force_flat[0]; + VALUETYPE* force_mag_ = &force_mag_flat[0]; + VALUETYPE* virial_ = &virial_flat[0]; std::vector fparam_, aparam_; validate_fparam_aparam(nframes, natoms, fparam, aparam); tile_fparam_aparam(fparam_, nframes, dfparam, fparam); tile_fparam_aparam(aparam_, nframes, natoms * daparam, aparam); - const VALUETYPE *fparam__ = !fparam_.empty() ? &fparam_[0] : nullptr; - const VALUETYPE *aparam__ = !aparam_.empty() ? &aparam_[0] : nullptr; + const VALUETYPE* fparam__ = !fparam_.empty() ? &fparam_[0] : nullptr; + const VALUETYPE* aparam__ = !aparam_.empty() ? 
&aparam_[0] : nullptr; _DP_DeepSpinModelDeviCompute( dp, natoms, coord_, spin_, atype_, box_, fparam__, aparam__, ener_, @@ -2634,28 +2634,28 @@ class DeepSpinModelDevi : public DeepBaseModelDevi { **/ template void compute( - std::vector &ener, - std::vector> &force, - std::vector> &force_mag, - std::vector> &virial, - std::vector> &atom_energy, - std::vector> &atom_virial, - const std::vector &coord, - const std::vector &spin, - const std::vector &atype, - const std::vector &box, - const std::vector &fparam = std::vector(), - const std::vector &aparam = std::vector()) { + std::vector& ener, + std::vector>& force, + std::vector>& force_mag, + std::vector>& virial, + std::vector>& atom_energy, + std::vector>& atom_virial, + const std::vector& coord, + const std::vector& spin, + const std::vector& atype, + const std::vector& box, + const std::vector& fparam = std::vector(), + const std::vector& aparam = std::vector()) { unsigned int natoms = atype.size(); unsigned int nframes = 1; assert(natoms * 3 == coord.size()); if (!box.empty()) { assert(box.size() == 9); } - const VALUETYPE *coord_ = &coord[0]; - const VALUETYPE *spin_ = &spin[0]; - const VALUETYPE *box_ = !box.empty() ? &box[0] : nullptr; - const int *atype_ = &atype[0]; + const VALUETYPE* coord_ = &coord[0]; + const VALUETYPE* spin_ = &spin[0]; + const VALUETYPE* box_ = !box.empty() ? 
&box[0] : nullptr; + const int* atype_ = &atype[0]; std::vector energy_flat(numb_models); std::vector force_flat(static_cast(numb_models) * @@ -2667,18 +2667,18 @@ class DeepSpinModelDevi : public DeepBaseModelDevi { natoms); std::vector atom_virial_flat(static_cast(numb_models) * natoms * 9); - double *ener_ = &energy_flat[0]; - VALUETYPE *force_ = &force_flat[0]; - VALUETYPE *force_mag_ = &force_mag_flat[0]; - VALUETYPE *virial_ = &virial_flat[0]; - VALUETYPE *atomic_ener_ = &atom_energy_flat[0]; - VALUETYPE *atomic_virial_ = &atom_virial_flat[0]; + double* ener_ = &energy_flat[0]; + VALUETYPE* force_ = &force_flat[0]; + VALUETYPE* force_mag_ = &force_mag_flat[0]; + VALUETYPE* virial_ = &virial_flat[0]; + VALUETYPE* atomic_ener_ = &atom_energy_flat[0]; + VALUETYPE* atomic_virial_ = &atom_virial_flat[0]; std::vector fparam_, aparam_; validate_fparam_aparam(nframes, natoms, fparam, aparam); tile_fparam_aparam(fparam_, nframes, dfparam, fparam); tile_fparam_aparam(aparam_, nframes, natoms * daparam, aparam); - const VALUETYPE *fparam__ = !fparam_.empty() ? &fparam_[0] : nullptr; - const VALUETYPE *aparam__ = !aparam_.empty() ? &aparam_[0] : nullptr; + const VALUETYPE* fparam__ = !fparam_.empty() ? &fparam_[0] : nullptr; + const VALUETYPE* aparam__ = !aparam_.empty() ? 
&aparam_[0] : nullptr; _DP_DeepSpinModelDeviCompute( dp, natoms, coord_, spin_, atype_, box_, fparam__, aparam__, ener_, @@ -2745,29 +2745,29 @@ class DeepSpinModelDevi : public DeepBaseModelDevi { **/ template void compute( - std::vector &ener, - std::vector> &force, - std::vector> &force_mag, - std::vector> &virial, - const std::vector &coord, - const std::vector &spin, - const std::vector &atype, - const std::vector &box, + std::vector& ener, + std::vector>& force, + std::vector>& force_mag, + std::vector>& virial, + const std::vector& coord, + const std::vector& spin, + const std::vector& atype, + const std::vector& box, const int nghost, - const InputNlist &lmp_list, - const int &ago, - const std::vector &fparam = std::vector(), - const std::vector &aparam = std::vector()) { + const InputNlist& lmp_list, + const int& ago, + const std::vector& fparam = std::vector(), + const std::vector& aparam = std::vector()) { unsigned int natoms = atype.size(); unsigned int nframes = 1; assert(natoms * 3 == coord.size()); if (!box.empty()) { assert(box.size() == 9); } - const VALUETYPE *coord_ = &coord[0]; - const VALUETYPE *spin_ = &spin[0]; - const VALUETYPE *box_ = !box.empty() ? &box[0] : nullptr; - const int *atype_ = &atype[0]; + const VALUETYPE* coord_ = &coord[0]; + const VALUETYPE* spin_ = &spin[0]; + const VALUETYPE* box_ = !box.empty() ? 
&box[0] : nullptr; + const int* atype_ = &atype[0]; // memory will be continous for std::vector but not std::vector std::vector energy_flat(numb_models); std::vector force_flat(static_cast(numb_models) * @@ -2775,10 +2775,10 @@ class DeepSpinModelDevi : public DeepBaseModelDevi { std::vector force_mag_flat(static_cast(numb_models) * natoms * 3); std::vector virial_flat(numb_models * 9); - double *ener_ = &energy_flat[0]; - VALUETYPE *force_ = &force_flat[0]; - VALUETYPE *force_mag_ = &force_mag_flat[0]; - VALUETYPE *virial_ = &virial_flat[0]; + double* ener_ = &energy_flat[0]; + VALUETYPE* force_ = &force_flat[0]; + VALUETYPE* force_mag_ = &force_mag_flat[0]; + VALUETYPE* virial_ = &virial_flat[0]; std::vector fparam_, aparam_; validate_fparam_aparam(nframes, (aparam_nall ? natoms : (natoms - nghost)), fparam, aparam); @@ -2786,8 +2786,8 @@ class DeepSpinModelDevi : public DeepBaseModelDevi { tile_fparam_aparam(aparam_, nframes, (aparam_nall ? natoms : (natoms - nghost)) * daparam, aparam); - const VALUETYPE *fparam__ = !fparam_.empty() ? &fparam_[0] : nullptr; - const VALUETYPE *aparam__ = !aparam_.empty() ? &aparam_[0] : nullptr; + const VALUETYPE* fparam__ = !fparam_.empty() ? &fparam_[0] : nullptr; + const VALUETYPE* aparam__ = !aparam_.empty() ? 
&aparam_[0] : nullptr; _DP_DeepSpinModelDeviComputeNList( dp, natoms, coord_, spin_, atype_, box_, nghost, lmp_list.nl, ago, fparam__, aparam__, ener_, force_, force_mag_, virial_, nullptr, @@ -2845,31 +2845,31 @@ class DeepSpinModelDevi : public DeepBaseModelDevi { **/ template void compute( - std::vector &ener, - std::vector> &force, - std::vector> &force_mag, - std::vector> &virial, - std::vector> &atom_energy, - std::vector> &atom_virial, - const std::vector &coord, - const std::vector &spin, - const std::vector &atype, - const std::vector &box, + std::vector& ener, + std::vector>& force, + std::vector>& force_mag, + std::vector>& virial, + std::vector>& atom_energy, + std::vector>& atom_virial, + const std::vector& coord, + const std::vector& spin, + const std::vector& atype, + const std::vector& box, const int nghost, - const InputNlist &lmp_list, - const int &ago, - const std::vector &fparam = std::vector(), - const std::vector &aparam = std::vector()) { + const InputNlist& lmp_list, + const int& ago, + const std::vector& fparam = std::vector(), + const std::vector& aparam = std::vector()) { unsigned int natoms = atype.size(); unsigned int nframes = 1; assert(natoms * 3 == coord.size()); if (!box.empty()) { assert(box.size() == 9); } - const VALUETYPE *coord_ = &coord[0]; - const VALUETYPE *spin_ = &spin[0]; - const VALUETYPE *box_ = !box.empty() ? &box[0] : nullptr; - const int *atype_ = &atype[0]; + const VALUETYPE* coord_ = &coord[0]; + const VALUETYPE* spin_ = &spin[0]; + const VALUETYPE* box_ = !box.empty() ? 
&box[0] : nullptr; + const int* atype_ = &atype[0]; std::vector energy_flat(numb_models); std::vector force_flat(static_cast(numb_models) * natoms * 3); @@ -2880,12 +2880,12 @@ class DeepSpinModelDevi : public DeepBaseModelDevi { natoms); std::vector atom_virial_flat(static_cast(numb_models) * natoms * 9); - double *ener_ = &energy_flat[0]; - VALUETYPE *force_ = &force_flat[0]; - VALUETYPE *force_mag_ = &force_mag_flat[0]; - VALUETYPE *virial_ = &virial_flat[0]; - VALUETYPE *atomic_ener_ = &atom_energy_flat[0]; - VALUETYPE *atomic_virial_ = &atom_virial_flat[0]; + double* ener_ = &energy_flat[0]; + VALUETYPE* force_ = &force_flat[0]; + VALUETYPE* force_mag_ = &force_mag_flat[0]; + VALUETYPE* virial_ = &virial_flat[0]; + VALUETYPE* atomic_ener_ = &atom_energy_flat[0]; + VALUETYPE* atomic_virial_ = &atom_virial_flat[0]; std::vector fparam_, aparam_; validate_fparam_aparam(nframes, (aparam_nall ? natoms : (natoms - nghost)), fparam, aparam); @@ -2893,8 +2893,8 @@ class DeepSpinModelDevi : public DeepBaseModelDevi { tile_fparam_aparam(aparam_, nframes, (aparam_nall ? natoms : (natoms - nghost)) * daparam, aparam); - const VALUETYPE *fparam__ = !fparam_.empty() ? &fparam_[0] : nullptr; - const VALUETYPE *aparam__ = !aparam_.empty() ? &aparam_[0] : nullptr; + const VALUETYPE* fparam__ = !fparam_.empty() ? &fparam_[0] : nullptr; + const VALUETYPE* aparam__ = !aparam_.empty() ? &aparam_[0] : nullptr; _DP_DeepSpinModelDeviComputeNList( dp, natoms, coord_, spin_, atype_, box_, nghost, lmp_list.nl, ago, fparam__, aparam__, ener_, force_, force_mag_, virial_, atomic_ener_, @@ -2933,7 +2933,7 @@ class DeepSpinModelDevi : public DeepBaseModelDevi { }; private: - DP_DeepSpinModelDevi *dp; + DP_DeepSpinModelDevi* dp; }; /** @@ -2950,9 +2950,9 @@ class DeepTensor { * @brief DeepTensor constructor with initialization. * @param[in] model The name of the frozen model file. 
**/ - DeepTensor(const std::string &model, - const int &gpu_rank = 0, - const std::string &name_scope = "") + DeepTensor(const std::string& model, + const int& gpu_rank = 0, + const std::string& name_scope = "") : dt(nullptr) { try { init(model, gpu_rank, name_scope); @@ -2968,9 +2968,9 @@ class DeepTensor { * @brief Initialize the DeepTensor. * @param[in] model The name of the frozen model file. **/ - void init(const std::string &model, - const int &gpu_rank = 0, - const std::string &name_scope = "") { + void init(const std::string& model, + const int& gpu_rank = 0, + const std::string& name_scope = "") { if (dt) { std::cerr << "WARNING: deepmd-kit should not be initialized twice, do " "nothing at the second call of initializer" @@ -2993,23 +2993,23 @@ class DeepTensor { *x 9 (PBC) or empty (no PBC). **/ template - void compute(std::vector &tensor, - const std::vector &coord, - const std::vector &atype, - const std::vector &box) { + void compute(std::vector& tensor, + const std::vector& coord, + const std::vector& atype, + const std::vector& box) { unsigned int natoms = atype.size(); assert(natoms * 3 == coord.size()); if (!box.empty()) { assert(box.size() == 9); } - const VALUETYPE *coord_ = &coord[0]; - const VALUETYPE *box_ = !box.empty() ? &box[0] : nullptr; - const int *atype_ = &atype[0]; + const VALUETYPE* coord_ = &coord[0]; + const VALUETYPE* box_ = !box.empty() ? &box[0] : nullptr; + const int* atype_ = &atype[0]; - VALUETYPE *tensor_; - VALUETYPE **p_tensor = &tensor_; + VALUETYPE* tensor_; + VALUETYPE** p_tensor = &tensor_; int size; - int *p_size = &size; + int* p_size = &size; _DP_DeepTensorComputeTensor(dt, natoms, coord_, atype_, box_, p_tensor, p_size); @@ -3033,25 +3033,25 @@ class DeepTensor { * @param[in] nlist The neighbor list. 
**/ template - void compute(std::vector &tensor, - const std::vector &coord, - const std::vector &atype, - const std::vector &box, + void compute(std::vector& tensor, + const std::vector& coord, + const std::vector& atype, + const std::vector& box, const int nghost, - const InputNlist &lmp_list) { + const InputNlist& lmp_list) { unsigned int natoms = atype.size(); assert(natoms * 3 == coord.size()); if (!box.empty()) { assert(box.size() == 9); } - const VALUETYPE *coord_ = &coord[0]; - const VALUETYPE *box_ = !box.empty() ? &box[0] : nullptr; - const int *atype_ = &atype[0]; + const VALUETYPE* coord_ = &coord[0]; + const VALUETYPE* box_ = !box.empty() ? &box[0] : nullptr; + const int* atype_ = &atype[0]; - VALUETYPE *tensor_; - VALUETYPE **p_tensor = &tensor_; + VALUETYPE* tensor_; + VALUETYPE** p_tensor = &tensor_; int size; - int *p_size = &size; + int* p_size = &size; _DP_DeepTensorComputeTensorNList(dt, natoms, coord_, atype_, box_, nghost, lmp_list.nl, @@ -3076,26 +3076,26 @@ class DeepTensor { *x 9 (PBC) or empty (no PBC). **/ template - void compute(std::vector &global_tensor, - std::vector &force, - std::vector &virial, - const std::vector &coord, - const std::vector &atype, - const std::vector &box) { + void compute(std::vector& global_tensor, + std::vector& force, + std::vector& virial, + const std::vector& coord, + const std::vector& atype, + const std::vector& box) { unsigned int natoms = atype.size(); assert(natoms * 3 == coord.size()); if (!box.empty()) { assert(box.size() == 9); } - const VALUETYPE *coord_ = &coord[0]; - const VALUETYPE *box_ = !box.empty() ? &box[0] : nullptr; - const int *atype_ = &atype[0]; + const VALUETYPE* coord_ = &coord[0]; + const VALUETYPE* box_ = !box.empty() ? 
&box[0] : nullptr; + const int* atype_ = &atype[0]; global_tensor.resize(odim); force.resize(static_cast(odim) * natoms * 3); virial.resize(static_cast(odim) * 9); - VALUETYPE *global_tensor_ = &global_tensor[0]; - VALUETYPE *force_ = &force[0]; - VALUETYPE *virial_ = &virial[0]; + VALUETYPE* global_tensor_ = &global_tensor[0]; + VALUETYPE* force_ = &force[0]; + VALUETYPE* virial_ = &virial[0]; _DP_DeepTensorCompute(dt, natoms, coord_, atype_, box_, global_tensor_, force_, virial_, nullptr, @@ -3117,36 +3117,36 @@ class DeepTensor { *x 9 (PBC) or empty (no PBC). **/ template - void compute(std::vector &global_tensor, - std::vector &force, - std::vector &virial, - std::vector &atom_tensor, - std::vector &atom_virial, - const std::vector &coord, - const std::vector &atype, - const std::vector &box) { + void compute(std::vector& global_tensor, + std::vector& force, + std::vector& virial, + std::vector& atom_tensor, + std::vector& atom_virial, + const std::vector& coord, + const std::vector& atype, + const std::vector& box) { unsigned int natoms = atype.size(); assert(natoms * 3 == coord.size()); if (!box.empty()) { assert(box.size() == 9); } - const VALUETYPE *coord_ = &coord[0]; - const VALUETYPE *box_ = !box.empty() ? &box[0] : nullptr; - const int *atype_ = &atype[0]; + const VALUETYPE* coord_ = &coord[0]; + const VALUETYPE* box_ = !box.empty() ? 
&box[0] : nullptr; + const int* atype_ = &atype[0]; global_tensor.resize(odim); force.resize(static_cast(odim) * natoms * 3); virial.resize(static_cast(odim) * 9); atom_virial.resize(static_cast(odim) * natoms * 9); - VALUETYPE *global_tensor_ = &global_tensor[0]; - VALUETYPE *force_ = &force[0]; - VALUETYPE *virial_ = &virial[0]; - VALUETYPE *atomic_virial_ = &atom_virial[0]; + VALUETYPE* global_tensor_ = &global_tensor[0]; + VALUETYPE* force_ = &force[0]; + VALUETYPE* virial_ = &virial[0]; + VALUETYPE* atomic_virial_ = &atom_virial[0]; - VALUETYPE *atomic_tensor_; - VALUETYPE **p_atomic_tensor = &atomic_tensor_; + VALUETYPE* atomic_tensor_; + VALUETYPE** p_atomic_tensor = &atomic_tensor_; int size_at; - int *p_size_at = &size_at; + int* p_size_at = &size_at; _DP_DeepTensorCompute( dt, natoms, coord_, atype_, box_, global_tensor_, force_, virial_, @@ -3173,28 +3173,28 @@ class DeepTensor { * @param[in] nlist The neighbor list. **/ template - void compute(std::vector &global_tensor, - std::vector &force, - std::vector &virial, - const std::vector &coord, - const std::vector &atype, - const std::vector &box, + void compute(std::vector& global_tensor, + std::vector& force, + std::vector& virial, + const std::vector& coord, + const std::vector& atype, + const std::vector& box, const int nghost, - const InputNlist &lmp_list) { + const InputNlist& lmp_list) { unsigned int natoms = atype.size(); assert(natoms * 3 == coord.size()); if (!box.empty()) { assert(box.size() == 9); } - const VALUETYPE *coord_ = &coord[0]; - const VALUETYPE *box_ = !box.empty() ? &box[0] : nullptr; - const int *atype_ = &atype[0]; + const VALUETYPE* coord_ = &coord[0]; + const VALUETYPE* box_ = !box.empty() ? 
&box[0] : nullptr; + const int* atype_ = &atype[0]; global_tensor.resize(odim); force.resize(static_cast(odim) * natoms * 3); virial.resize(static_cast(odim) * 9); - VALUETYPE *global_tensor_ = &global_tensor[0]; - VALUETYPE *force_ = &force[0]; - VALUETYPE *virial_ = &virial[0]; + VALUETYPE* global_tensor_ = &global_tensor[0]; + VALUETYPE* force_ = &force[0]; + VALUETYPE* virial_ = &virial[0]; _DP_DeepTensorComputeNList( dt, natoms, coord_, atype_, box_, nghost, lmp_list.nl, global_tensor_, @@ -3218,38 +3218,38 @@ class DeepTensor { * @param[in] nlist The neighbor list. **/ template - void compute(std::vector &global_tensor, - std::vector &force, - std::vector &virial, - std::vector &atom_tensor, - std::vector &atom_virial, - const std::vector &coord, - const std::vector &atype, - const std::vector &box, + void compute(std::vector& global_tensor, + std::vector& force, + std::vector& virial, + std::vector& atom_tensor, + std::vector& atom_virial, + const std::vector& coord, + const std::vector& atype, + const std::vector& box, const int nghost, - const InputNlist &lmp_list) { + const InputNlist& lmp_list) { unsigned int natoms = atype.size(); assert(natoms * 3 == coord.size()); if (!box.empty()) { assert(box.size() == 9); } - const VALUETYPE *coord_ = &coord[0]; - const VALUETYPE *box_ = !box.empty() ? &box[0] : nullptr; - const int *atype_ = &atype[0]; + const VALUETYPE* coord_ = &coord[0]; + const VALUETYPE* box_ = !box.empty() ? 
&box[0] : nullptr; + const int* atype_ = &atype[0]; global_tensor.resize(odim); force.resize(static_cast(odim) * natoms * 3); virial.resize(static_cast(odim) * 9); atom_virial.resize(static_cast(odim) * natoms * 9); - VALUETYPE *global_tensor_ = &global_tensor[0]; - VALUETYPE *force_ = &force[0]; - VALUETYPE *virial_ = &virial[0]; - VALUETYPE *atomic_virial_ = &atom_virial[0]; + VALUETYPE* global_tensor_ = &global_tensor[0]; + VALUETYPE* force_ = &force[0]; + VALUETYPE* virial_ = &virial[0]; + VALUETYPE* atomic_virial_ = &atom_virial[0]; - VALUETYPE *atomic_tensor_; - VALUETYPE **p_atomic_tensor = &atomic_tensor_; + VALUETYPE* atomic_tensor_; + VALUETYPE** p_atomic_tensor = &atomic_tensor_; int size_at; - int *p_size_at = &size_at; + int* p_size_at = &size_at; _DP_DeepTensorComputeNList( dt, natoms, coord_, atype_, box_, nghost, lmp_list.nl, global_tensor_, @@ -3286,7 +3286,7 @@ class DeepTensor { } std::vector sel_types() const { - int *sel_types_arr = DP_DeepTensorGetSelTypes(dt); + int* sel_types_arr = DP_DeepTensorGetSelTypes(dt); std::vector sel_types_vec = std::vector(sel_types_arr, sel_types_arr + nsel_types); return sel_types_vec; @@ -3296,21 +3296,21 @@ class DeepTensor { * information. * @param[in] pre The prefix to each line. */ - void print_summary(const std::string &pre) const { + void print_summary(const std::string& pre) const { DP_PrintSummary(pre.c_str()); } /** * @brief Get the type map (element name of the atom types) of this model. * @param[out] type_map The type map of this model. **/ - void get_type_map(std::string &type_map) { - const char *type_map_c = DP_DeepTensorGetTypeMap(dt); + void get_type_map(std::string& type_map) { + const char* type_map_c = DP_DeepTensorGetTypeMap(dt); type_map.assign(type_map_c); DP_DeleteChar(type_map_c); }; private: - DP_DeepTensor *dt; + DP_DeepTensor* dt; int odim; int nsel_types; }; @@ -3328,9 +3328,9 @@ class DipoleChargeModifier { * @param[in] gpu_rank The rank of the GPU to be used. 
* @param[in] name_scope The name scope of the model. **/ - DipoleChargeModifier(const std::string &model, - const int &gpu_rank = 0, - const std::string &name_scope = "") + DipoleChargeModifier(const std::string& model, + const int& gpu_rank = 0, + const std::string& name_scope = "") : dcm(nullptr) { try { init(model, gpu_rank, name_scope); @@ -3348,9 +3348,9 @@ class DipoleChargeModifier { * @param[in] gpu_rank The rank of the GPU to be used. * @param[in] name_scope The name scope of the model. **/ - void init(const std::string &model, - const int &gpu_rank = 0, - const std::string &name_scope = "") { + void init(const std::string& model, + const int& gpu_rank = 0, + const std::string& name_scope = "") { if (dcm) { std::cerr << "WARNING: deepmd-kit should not be initialized twice, do " "nothing at the second call of initializer" @@ -3379,31 +3379,31 @@ class DipoleChargeModifier { * @param[in] lmp_list The neighbor list. **/ template - void compute(std::vector &dfcorr_, - std::vector &dvcorr_, - const std::vector &dcoord_, - const std::vector &datype_, - const std::vector &dbox, - const std::vector> &pairs, - const std::vector &delef_, + void compute(std::vector& dfcorr_, + std::vector& dvcorr_, + const std::vector& dcoord_, + const std::vector& datype_, + const std::vector& dbox, + const std::vector>& pairs, + const std::vector& delef_, const int nghost, - const InputNlist &lmp_list) { + const InputNlist& lmp_list) { unsigned int natoms = datype_.size(); assert(natoms * 3 == dcoord_.size()); if (!dbox.empty()) { assert(dbox.size() == 9); } - const VALUETYPE *dcoord = &dcoord_[0]; - const VALUETYPE *dbox_ = !dbox.empty() ? &dbox[0] : nullptr; - const int *datype = &datype_[0]; + const VALUETYPE* dcoord = &dcoord_[0]; + const VALUETYPE* dbox_ = !dbox.empty() ? 
&dbox[0] : nullptr; + const int* datype = &datype_[0]; const int npairs = pairs.size(); - const int *dpairs = reinterpret_cast(&pairs[0]); - const VALUETYPE *delef = &delef_[0]; + const int* dpairs = reinterpret_cast(&pairs[0]); + const VALUETYPE* delef = &delef_[0]; dfcorr_.resize(static_cast(natoms) * 3); dvcorr_.resize(9); - VALUETYPE *dfcorr = &dfcorr_[0]; - VALUETYPE *dvcorr = &dvcorr_[0]; + VALUETYPE* dfcorr = &dfcorr_[0]; + VALUETYPE* dvcorr = &dvcorr_[0]; _DP_DipoleChargeModifierComputeNList( dcm, natoms, dcoord, datype, dbox_, dpairs, npairs, delef, nghost, @@ -3428,7 +3428,7 @@ class DipoleChargeModifier { }; std::vector sel_types() const { - int *sel_types_arr = DP_DipoleChargeModifierGetSelTypes(dcm); + int* sel_types_arr = DP_DipoleChargeModifierGetSelTypes(dcm); std::vector sel_types_vec = std::vector(sel_types_arr, sel_types_arr + nsel_types); return sel_types_vec; @@ -3439,12 +3439,12 @@ class DipoleChargeModifier { * information. * @param[in] pre The prefix to each line. */ - void print_summary(const std::string &pre) const { + void print_summary(const std::string& pre) const { DP_PrintSummary(pre.c_str()); } private: - DP_DipoleChargeModifier *dcm; + DP_DipoleChargeModifier* dcm; int nsel_types; }; @@ -3453,9 +3453,9 @@ class DipoleChargeModifier { * @param[in] model Path to the model. * @param[out] file_content Content of the model file. **/ -void inline read_file_to_string(std::string model, std::string &file_content) { +void inline read_file_to_string(std::string model, std::string& file_content) { int size; - const char *c_file_content = DP_ReadFileToChar2(model.c_str(), &size); + const char* c_file_content = DP_ReadFileToChar2(model.c_str(), &size); if (size < 0) { // negative size indicates error std::string error_message = std::string(c_file_content, -size); @@ -3478,13 +3478,13 @@ void inline read_file_to_string(std::string model, std::string &file_content) { * @param[in] sel_type_ The selected atom types. 
*/ template -void select_by_type(std::vector &fwd_map, - std::vector &bkw_map, - int &nghost_real, - const std::vector &dcoord_, - const std::vector &datype_, - const int &nghost, - const std::vector &sel_type_) { +void select_by_type(std::vector& fwd_map, + std::vector& bkw_map, + int& nghost_real, + const std::vector& dcoord_, + const std::vector& datype_, + const int& nghost, + const std::vector& sel_type_) { const int natoms = datype_.size(); const int nsel_type = sel_type_.size(); fwd_map.resize(natoms); @@ -3505,10 +3505,10 @@ void select_by_type(std::vector &fwd_map, * @param[in] stride The stride of the input vector. */ template -void select_map(std::vector &out, - const std::vector &in, - const std::vector &fwd_map, - const int &stride) { +void select_map(std::vector& out, + const std::vector& in, + const std::vector& fwd_map, + const int& stride) { static_assert(std::is_same(), "only support int"); const int nall1 = in.size() / stride; int nall2 = 0; diff --git a/source/api_c/tests/test_deepmd_exception.cc b/source/api_c/tests/test_deepmd_exception.cc index f9f2984588..96f6942a65 100644 --- a/source/api_c/tests/test_deepmd_exception.cc +++ b/source/api_c/tests/test_deepmd_exception.cc @@ -16,7 +16,7 @@ TEST(TestDeepmdException, deepmdexception) { std::string expected_error_message = "DeePMD-kit C API Error: unittest"; try { throw deepmd::hpp::deepmd_exception("unittest"); - } catch (deepmd::hpp::deepmd_exception &ex) { + } catch (deepmd::hpp::deepmd_exception& ex) { EXPECT_STREQ(expected_error_message.c_str(), ex.what()); } } diff --git a/source/api_c/tests/test_utils.h b/source/api_c/tests/test_utils.h index 5167732bc8..59c764409a 100644 --- a/source/api_c/tests/test_utils.h +++ b/source/api_c/tests/test_utils.h @@ -14,7 +14,7 @@ typedef testing::Types ValueTypes; template inline void _fold_back(typename std::vector::iterator out, const typename std::vector::const_iterator in, - const std::vector &mapping, + const std::vector& mapping, const int nloc, 
const int nall, const int ndim, @@ -35,9 +35,9 @@ inline void _fold_back(typename std::vector::iterator out, } template -inline void _fold_back(std::vector &out, - const std::vector &in, - const std::vector &mapping, +inline void _fold_back(std::vector& out, + const std::vector& in, + const std::vector& mapping, const int nloc, const int nall, const int ndim, @@ -48,14 +48,14 @@ inline void _fold_back(std::vector &out, } template -inline void _build_nlist(std::vector> &nlist_data, - std::vector &coord_cpy, - std::vector &atype_cpy, - std::vector &mapping, - const std::vector &coord, - const std::vector &atype, - const std::vector &box, - const float &rc) { +inline void _build_nlist(std::vector>& nlist_data, + std::vector& coord_cpy, + std::vector& atype_cpy, + std::vector& mapping, + const std::vector& coord, + const std::vector& atype, + const std::vector& box, + const float& rc) { // convert VALUETYPE to double, it looks like copy_coord only accepts double std::vector coord_cpy_; std::vector coord_(coord.begin(), coord.end()); @@ -90,13 +90,13 @@ class EnergyModelTest { double level = std::is_same::value ? 1e-6 : 1e-2; // expected? 
public: - virtual void compute(double &ener, - std::vector &force, - std::vector &virial, - const std::vector &coord, - const std::vector &box) = 0; - void test_f(const std::vector &coord, - const std::vector &box) { + virtual void compute(double& ener, + std::vector& force, + std::vector& virial, + const std::vector& coord, + const std::vector& box) = 0; + void test_f(const std::vector& coord, + const std::vector& box) { int ndof = coord.size(); double ener; std::vector force, virial; @@ -114,8 +114,8 @@ class EnergyModelTest { EXPECT_LT(fabs(num - ana), level); } } - void test_v(const std::vector &coord, - const std::vector &box) { + void test_v(const std::vector& coord, + const std::vector& box) { std::vector num_diff(9); double ener; std::vector force, virial; diff --git a/source/api_cc/CMakeLists.txt b/source/api_cc/CMakeLists.txt index 0d4bba1047..90b7c08449 100644 --- a/source/api_cc/CMakeLists.txt +++ b/source/api_cc/CMakeLists.txt @@ -6,7 +6,6 @@ file(GLOB LIB_SRC src/*.cc src/*.cpp) file(GLOB INC_SRC include/*.h ${CMAKE_CURRENT_BINARY_DIR}/version.h) set(libname "${LIB_DEEPMD_CC}") - add_library(${libname} SHARED ${LIB_SRC}) # link: libdeepmd libdeepmd_op libtensorflow_cc libtensorflow_framework @@ -48,7 +47,7 @@ set_target_properties( ${libname} PROPERTIES INSTALL_RPATH "$ORIGIN;${BACKEND_LIBRARY_PATH}" INSTALL_RPATH_USE_LINK_PATH TRUE - BUILD_RPATH "$ORIGIN/../op/tf;$ORIGIN/../op/pt") + BUILD_RPATH "$ORIGIN/../op/tf;$ORIGIN/../op/pt;$ORIGIN/../op/pd") target_compile_definitions(${libname} PRIVATE TF_PRIVATE) if(CMAKE_TESTING_ENABLED) target_link_libraries(${libname} PRIVATE coverage_config) diff --git a/source/api_cc/include/DeepPotPD.h b/source/api_cc/include/DeepPotPD.h index ec43300ca0..6fceb19352 100644 --- a/source/api_cc/include/DeepPotPD.h +++ b/source/api_cc/include/DeepPotPD.h @@ -282,12 +282,11 @@ class DeepPotPD : public DeepPotBackend { * @brief Compute the number of elements in a tensor. * @param[in] x Tensor x. 
**/ - int numel(const paddle_infer::Tensor& x) const { - // TODO: There might be a overflow problem here for multiply int numbers. - int ret = 1; + size_t numel(const paddle_infer::Tensor& x) const { + size_t ret = 1; std::vector x_shape = x.shape(); for (std::size_t i = 0, n = x_shape.size(); i < n; ++i) { - ret *= x_shape[i]; + ret *= static_cast(x_shape[i]); } return ret; }; @@ -392,7 +391,7 @@ class DeepPotPD : public DeepPotBackend { int do_message_passing; // 1:dpa2 model 0:others bool gpu_enabled; std::unique_ptr firstneigh_tensor; - // std::unordered_map comm_dict; # Not used yet + std::unique_ptr mapping_tensor; }; } // namespace deepmd diff --git a/source/api_cc/include/DeepPotPT.h b/source/api_cc/include/DeepPotPT.h index 207a13286c..4a06bf012c 100644 --- a/source/api_cc/include/DeepPotPT.h +++ b/source/api_cc/include/DeepPotPT.h @@ -340,6 +340,8 @@ class DeepPotPT : public DeepPotBackend { at::Tensor firstneigh_tensor; c10::optional mapping_tensor; torch::Dict comm_dict; + bool profiler_enabled{false}; + std::string profiler_file; /** * @brief Translate PyTorch exceptions to the DeePMD-kit exception. * @param[in] f The function to run. diff --git a/source/api_cc/include/DeepTensorPT.h b/source/api_cc/include/DeepTensorPT.h new file mode 100644 index 0000000000..c602fc53e0 --- /dev/null +++ b/source/api_cc/include/DeepTensorPT.h @@ -0,0 +1,247 @@ +// SPDX-License-Identifier: LGPL-3.0-or-later +#pragma once + +#include +#include + +#include "DeepTensor.h" + +namespace deepmd { +/** + * @brief PyTorch implementation for Deep Tensor. + **/ +class DeepTensorPT : public DeepTensorBase { + public: + /** + * @brief Deep Tensor constructor without initialization. + **/ + DeepTensorPT(); + virtual ~DeepTensorPT(); + /** + * @brief Deep Tensor constructor with initialization. + * @param[in] model The name of the frozen model file. + * @param[in] gpu_rank The GPU rank. Default is 0. + * @param[in] name_scope Name scopes of operations. 
+ **/ + DeepTensorPT(const std::string& model, + const int& gpu_rank = 0, + const std::string& name_scope = ""); + /** + * @brief Initialize the Deep Tensor. + * @param[in] model The name of the frozen model file. + * @param[in] gpu_rank The GPU rank. Default is 0. + * @param[in] name_scope Name scopes of operations. + **/ + void init(const std::string& model, + const int& gpu_rank = 0, + const std::string& name_scope = ""); + + private: + /** + * @brief Evaluate the global tensor and component-wise force and virial. + * @param[out] global_tensor The global tensor to evaluate. + * @param[out] force The component-wise force of the global tensor, size odim + *x natoms x 3. + * @param[out] virial The component-wise virial of the global tensor, size + *odim x 9. + * @param[out] atom_tensor The atomic tensor value of the model, size natoms x + *odim. + * @param[out] atom_virial The component-wise atomic virial of the global + *tensor, size odim x natoms x 9. + * @param[in] coord The coordinates of atoms. The array should be of size + *natoms x 3. + * @param[in] atype The atom types. The list should contain natoms ints. + * @param[in] box The cell of the region. The array should be of size 9. + * @param[in] request_deriv Whether to request the derivative of the global + * tensor, including force and virial. + **/ + template + void compute(std::vector& global_tensor, + std::vector& force, + std::vector& virial, + std::vector& atom_tensor, + std::vector& atom_virial, + const std::vector& coord, + const std::vector& atype, + const std::vector& box, + const bool request_deriv); + /** + * @brief Evaluate the global tensor and component-wise force and virial. + * @param[out] global_tensor The global tensor to evaluate. + * @param[out] force The component-wise force of the global tensor, size odim + *x natoms x 3. + * @param[out] virial The component-wise virial of the global tensor, size + *odim x 9. 
+ * @param[out] atom_tensor The atomic tensor value of the model, size natoms x + *odim. + * @param[out] atom_virial The component-wise atomic virial of the global + *tensor, size odim x natoms x 9. + * @param[in] coord The coordinates of atoms. The array should be of size + *natoms x 3. + * @param[in] atype The atom types. The list should contain natoms ints. + * @param[in] box The cell of the region. The array should be of size 9. + * @param[in] nghost The number of ghost atoms. + * @param[in] inlist The input neighbour list. + * @param[in] request_deriv Whether to request the derivative of the global + * tensor, including force and virial. + **/ + template + void compute(std::vector& global_tensor, + std::vector& force, + std::vector& virial, + std::vector& atom_tensor, + std::vector& atom_virial, + const std::vector& coord, + const std::vector& atype, + const std::vector& box, + const int nghost, + const InputNlist& inlist, + const bool request_deriv); + + public: + /** + * @brief Get the cutoff radius. + * @return The cutoff radius. + **/ + double cutoff() const { + assert(inited); + return rcut; + }; + /** + * @brief Get the number of types. + * @return The number of types. + **/ + int numb_types() const { + assert(inited); + return ntypes; + }; + /** + * @brief Get the output dimension. + * @return The output dimension. + **/ + int output_dim() const { + assert(inited); + return odim; + }; + /** + * @brief Get the list of sel types. + * @return The list of sel types. + */ + const std::vector& sel_types() const { + assert(inited); + return sel_type; + }; + /** + * @brief Get the type map (element name of the atom types) of this model. + * @param[out] type_map The type map of this model. + **/ + void get_type_map(std::string& type_map); + + /** + * @brief Evaluate the global tensor and component-wise force and virial. + * @param[out] global_tensor The global tensor to evaluate. 
+ * @param[out] force The component-wise force of the global tensor, size odim + *x natoms x 3. + * @param[out] virial The component-wise virial of the global tensor, size + *odim x 9. + * @param[out] atom_tensor The atomic tensor value of the model, size natoms x + *odim. + * @param[out] atom_virial The component-wise atomic virial of the global + *tensor, size odim x natoms x 9. + * @param[in] coord The coordinates of atoms. The array should be of size + *natoms x 3. + * @param[in] atype The atom types. The list should contain natoms ints. + * @param[in] box The cell of the region. The array should be of size 9. + * @param[in] request_deriv Whether to request the derivative of the global + * tensor, including force and virial. + * @{ + **/ + void computew(std::vector& global_tensor, + std::vector& force, + std::vector& virial, + std::vector& atom_tensor, + std::vector& atom_virial, + const std::vector& coord, + const std::vector& atype, + const std::vector& box, + const bool request_deriv); + void computew(std::vector& global_tensor, + std::vector& force, + std::vector& virial, + std::vector& atom_tensor, + std::vector& atom_virial, + const std::vector& coord, + const std::vector& atype, + const std::vector& box, + const bool request_deriv); + /** @} */ + /** + * @brief Evaluate the global tensor and component-wise force and virial. + * @param[out] global_tensor The global tensor to evaluate. + * @param[out] force The component-wise force of the global tensor, size odim + *x natoms x 3. + * @param[out] virial The component-wise virial of the global tensor, size + *odim x 9. + * @param[out] atom_tensor The atomic tensor value of the model, size natoms x + *odim. + * @param[out] atom_virial The component-wise atomic virial of the global + *tensor, size odim x natoms x 9. + * @param[in] coord The coordinates of atoms. The array should be of size + *natoms x 3. + * @param[in] atype The atom types. The list should contain natoms ints. 
+ * @param[in] box The cell of the region. The array should be of size 9. + * @param[in] nghost The number of ghost atoms. + * @param[in] inlist The input neighbour list. + * @param[in] request_deriv Whether to request the derivative of the global + * tensor, including force and virial. + * @{ + **/ + void computew(std::vector& global_tensor, + std::vector& force, + std::vector& virial, + std::vector& atom_tensor, + std::vector& atom_virial, + const std::vector& coord, + const std::vector& atype, + const std::vector& box, + const int nghost, + const InputNlist& inlist, + const bool request_deriv); + void computew(std::vector& global_tensor, + std::vector& force, + std::vector& virial, + std::vector& atom_tensor, + std::vector& atom_virial, + const std::vector& coord, + const std::vector& atype, + const std::vector& box, + const int nghost, + const InputNlist& inlist, + const bool request_deriv); + /** @} */ + + private: + int num_intra_nthreads, num_inter_nthreads; + bool inited; + double rcut; + int ntypes; + mutable int odim; + std::vector sel_type; + std::string name_scope; + // PyTorch module and device management + mutable torch::jit::script::Module module; + int gpu_id; + bool gpu_enabled; + NeighborListData nlist_data; + // Neighbor list tensors for efficient computation + at::Tensor firstneigh_tensor; + + /** + * @brief Translate PyTorch exceptions to the DeePMD-kit exception. + * @param[in] f The function to run. 
+ * @example translate_error([&](){...}); + */ + void translate_error(std::function f); +}; + +} // namespace deepmd diff --git a/source/api_cc/src/DeepPotPD.cc b/source/api_cc/src/DeepPotPD.cc index 3a3d880c4b..94931a8415 100644 --- a/source/api_cc/src/DeepPotPD.cc +++ b/source/api_cc/src/DeepPotPD.cc @@ -11,15 +11,176 @@ using namespace deepmd; -std::vector createNlistTensorPD( - const std::vector>& data) { - std::vector ret; - for (const auto& row : data) { - ret.insert(ret.end(), row.begin(), row.end()); +#include +#include +#include +#include +#include + +class Logger { + public: + enum Level { DEBUG = 0, INFO = 1, WARNING = 2, ERROR = 3 }; + + private: + static Level minLevel; + static bool colorEnabled; + static bool showTimestamp; + + static const char* getColorCode(Level level) { + if (!colorEnabled) { + return ""; + } + switch (level) { + case DEBUG: + return "\033[1;36m"; + case INFO: + return "\033[1;32m"; + case WARNING: + return "\033[1;33m"; + case ERROR: + return "\033[1;31m"; + default: + return ""; + } + } + + static const char* getResetCode() { return colorEnabled ? 
"\033[0m" : ""; } + + static const char* getLevelName(Level level) { + switch (level) { + case DEBUG: + return "DEBUG"; + case INFO: + return "INFO"; + case WARNING: + return "WARNING"; + case ERROR: + return "ERROR"; + default: + return "UNKNOWN"; + } } - return ret; + + static std::string getCurrentTime() { + if (!showTimestamp) { + return ""; + } + + std::time_t now = std::time(0); + std::tm* ltm = std::localtime(&now); + + std::ostringstream oss; + oss << std::setfill('0') << std::setw(4) << (1900 + ltm->tm_year) << "-" + << std::setw(2) << (1 + ltm->tm_mon) << "-" << std::setw(2) + << ltm->tm_mday << " " << std::setw(2) << ltm->tm_hour << ":" + << std::setw(2) << ltm->tm_min << ":" << std::setw(2) << ltm->tm_sec; + return oss.str(); + } + + public: + class LogStream { + private: + std::ostringstream oss; + Level level; + bool shouldLog; + + public: + LogStream(Level lvl) : level(lvl), shouldLog(lvl >= minLevel) { + if (shouldLog) { + std::string timestamp = getCurrentTime(); + if (!timestamp.empty()) { + oss << "[" << timestamp << "] "; + } + oss << getColorCode(level) << "[" << getLevelName(level) << "]" + << getResetCode() << " "; + } + } + + ~LogStream() { + if (shouldLog) { + std::cout << oss.str() << std::flush; + } + } + + template + LogStream& operator<<(const T& value) { + if (shouldLog) { + oss << value; + } + return *this; + } + + LogStream& operator<<(std::ostream& (*manip)(std::ostream&)) { + if (shouldLog) { + oss << manip; + } + return *this; + } + + LogStream(const LogStream&) = delete; + LogStream& operator=(const LogStream&) = delete; + LogStream(LogStream&& other) noexcept + : oss(std::move(other.oss)), + level(other.level), + shouldLog(other.shouldLog) {} + + LogStream& operator=(LogStream&& other) noexcept { + if (this != &other) { + oss = std::move(other.oss); + level = other.level; + shouldLog = other.shouldLog; + } + return *this; + } + }; + + static void setLevel(Level level) { minLevel = level; } + static void enableColor(bool enable = 
true) { colorEnabled = enable; } + static void enableTimestamp(bool enable = true) { showTimestamp = enable; } + static Level getLevel() { return minLevel; } + static bool isColorEnabled() { return colorEnabled; } + static bool isTimestampEnabled() { return showTimestamp; } + + static LogStream debug() { return LogStream(DEBUG); } + static LogStream info() { return LogStream(INFO); } + static LogStream warning() { return LogStream(WARNING); } + static LogStream error() { return LogStream(ERROR); } +}; + +Logger::Level Logger::minLevel = Logger::INFO; +bool Logger::colorEnabled = true; +bool Logger::showTimestamp = true; + +namespace logg { +inline Logger::LogStream debug() { return Logger::debug(); } +inline Logger::LogStream info() { return Logger::info(); } +inline Logger::LogStream warning() { return Logger::warning(); } +inline Logger::LogStream error() { return Logger::error(); } + +inline void setLevel(Logger::Level level) { Logger::setLevel(level); } +inline void enableColor(bool enable = true) { Logger::enableColor(enable); } +inline void enableTimestamp(bool enable = true) { + Logger::enableTimestamp(enable); } +} // namespace logg + +void fillNlistTensor(const std::vector>& data, + std::unique_ptr& flat_tensor) { + size_t total_size = 0; + for (const auto& row : data) { + total_size += row.size(); + } + std::vector flat_data; + flat_data.reserve(total_size); + for (const auto& row : data) { + flat_data.insert(flat_data.end(), row.begin(), row.end()); + } + int nloc = data.size(); + int nnei = nloc > 0 ? total_size / nloc : 0; + flat_tensor->Reshape({1, nloc, nnei}); + flat_tensor->CopyFromCpu(flat_data.data()); +} DeepPotPD::DeepPotPD() : inited(false) {} DeepPotPD::DeepPotPD(const std::string& model, const int& gpu_rank, @@ -41,9 +202,7 @@ void DeepPotPD::init(const std::string& model, << std::endl; return; } - // NOTE: There is no custom operators need to be loaded now. 
- // deepmd::load_op_library(); - + deepmd::load_op_library(); // NOTE: Only support 1 GPU now. int gpu_num = 1; if (gpu_num > 0) { @@ -59,6 +218,7 @@ void DeepPotPD::init(const std::string& model, config->EnableNewIR(true); config->EnableCustomPasses({"add_shadow_output_after_dead_parameter_pass"}, true); + // config->SwitchIrOptim(false); // initialize inference config_fl config_fl = std::make_shared(); @@ -67,6 +227,7 @@ void DeepPotPD::init(const std::string& model, config_fl->EnableNewIR(true); config_fl->EnableCustomPasses({"add_shadow_output_after_dead_parameter_pass"}, true); + // config_fl->SwitchIrOptim(false); // loading inference model std::string pdmodel_path, fl_pdmodel_path; @@ -113,30 +274,31 @@ void DeepPotPD::init(const std::string& model, if (!gpu_enabled) { config->DisableGpu(); config_fl->DisableGpu(); - std::cout << "load model from: " << model << " to cpu " << std::endl; + logg::info() << "load model from: " << model << " to cpu " << std::endl; } else { config->EnableUseGpu(4096, 0); config_fl->EnableUseGpu(4096, 0); - std::cout << "load model from: " << model << " to gpu:" << gpu_id - << std::endl; + logg::info() << "load model from: " << model << " to gpu:" << gpu_id + << std::endl; } if (config->cinn_enabled()) { - std::cout << "model.forward will be compiled with cinn." << std::endl; + logg::info() << "model.forward will be compiled with cinn." << std::endl; } else { - std::cout << "NOTE: You can try: \n'export FLAGS_prim_all=true" - " FLAGS_enable_pir_in_executor=1" - " FLAGS_prim_enable_dynamic=true FLAGS_use_cinn=true'\n" - "to speed up C++ inference with paddle backend" - << std::endl; + logg::info() << "NOTE: You can try: \n'export FLAGS_prim_all=true" + " FLAGS_enable_pir_in_executor=1" + " FLAGS_prim_enable_dynamic=true FLAGS_use_cinn=true' " + "to speed up C++ inference with paddle backend" + << std::endl; } if (config_fl->cinn_enabled()) { - std::cout << "model.forward_lower will be compiled with cinn." 
<< std::endl; + logg::info() << "model.forward_lower will be compiled with cinn." + << std::endl; } else { - std::cout << "NOTE: You can try: \n'export FLAGS_prim_all=true" - " FLAGS_enable_pir_in_executor=1" - " FLAGS_prim_enable_dynamic=true FLAGS_use_cinn=true'\n" - "to speed up C++ inference with paddle backend" - << std::endl; + logg::info() << "NOTE: You can try: \n'export FLAGS_prim_all=true" + " FLAGS_enable_pir_in_executor=1" + " FLAGS_prim_enable_dynamic=true FLAGS_use_cinn=true' " + "to speed up C++ inference with paddle backend" + << std::endl; } // NOTE: Both set to 1 now. @@ -151,16 +313,42 @@ void DeepPotPD::init(const std::string& model, } predictor = paddle_infer::CreatePredictor(*config); + logg::info() << "Setup model.forward model" << std::endl; predictor_fl = paddle_infer::CreatePredictor(*config_fl); + logg::info() << "Setup model.forward_lower" << std::endl; + auto print_handle_names = [](const std::vector& name_vec) { + int n = name_vec.size(); + std::string ret; + for (int i = 0; i < n; ++i) { + ret += "[" + std::to_string(i) + "]" + name_vec[i] + " \n"[i == n - 1]; + } + logg::debug() << ret; + }; + logg::debug() << "Input names of model.forward below:" << std::endl; + print_handle_names(predictor->GetInputNames()); + logg::debug() << "Output names of model.forward below:" << std::endl; + print_handle_names(predictor->GetOutputNames()); + std::cout << std::endl; + logg::debug() << "Input names of model.forward_lower below:" << std::endl; + print_handle_names(predictor_fl->GetInputNames()); + logg::debug() << "Output names of model.forward_lower below:" << std::endl; + print_handle_names(predictor_fl->GetOutputNames()); // initialize hyper params from model buffers ntypes_spin = 0; DeepPotPD::get_buffer("buffer_has_message_passing", do_message_passing); + logg::debug() << "buffer_has_message_passing = " << this->do_message_passing + << std::endl; DeepPotPD::get_buffer("buffer_rcut", rcut); + logg::debug() << "buffer_rcut = " << this->rcut 
<< std::endl; DeepPotPD::get_buffer("buffer_ntypes", ntypes); + logg::debug() << "buffer_ntypes = " << this->ntypes << std::endl; DeepPotPD::get_buffer("buffer_dfparam", dfparam); + logg::debug() << "buffer_dfparam = " << this->dfparam << std::endl; DeepPotPD::get_buffer("buffer_daparam", daparam); + logg::debug() << "buffer_daparam = " << this->daparam << std::endl; DeepPotPD::get_buffer("buffer_aparam_nall", aparam_nall); + logg::debug() << "buffer_aparam_nall = " << this->aparam_nall << std::endl; inited = true; } DeepPotPD::~DeepPotPD() {} @@ -195,61 +383,86 @@ void DeepPotPD::compute(ENERGYVTYPE& ener, auto coord_wrapped_Tensor = predictor_fl->GetInputHandle("coord"); coord_wrapped_Tensor->Reshape({1, nall_real, 3}); coord_wrapped_Tensor->CopyFromCpu(coord_wrapped.data()); - auto atype_Tensor = predictor_fl->GetInputHandle("atype"); atype_Tensor->Reshape({1, nall_real}); atype_Tensor->CopyFromCpu(datype.data()); - if (ago == 0) { - nlist_data.copy_from_nlist(lmp_list); + nlist_data.copy_from_nlist(lmp_list, nall - nghost); nlist_data.shuffle_exclude_empty(fwd_map); nlist_data.padding(); - if (do_message_passing == 1 && nghost > 0) { - throw deepmd::deepmd_exception( - "(do_message_passing == 1 && nghost > 0) is not supported yet."); - // int nswap = lmp_list.nswap; - // auto sendproc_tensor = predictor_fl->GetInputHandle("sendproc"); - // sendproc_tensor->Reshape({nswap}); - // sendproc_tensor->CopyFromCpu(lmp_list.sendproc); - // auto recvproc_tensor = predictor_fl->GetInputHandle("recvproc"); - // recvproc_tensor->Reshape({nswap}); - // recvproc_tensor->CopyFromCpu(lmp_list.recvproc); - // auto firstrecv_tensor = predictor_fl->GetInputHandle("firstrecv"); - // firstrecv_tensor->Reshape({nswap}); - // firstrecv_tensor->CopyFromCpu(lmp_list.firstrecv); - // auto recvnum_tensor = predictor_fl->GetInputHandle("recvnum"); - // recvnum_tensor->Reshape({nswap}); - // recvnum_tensor->CopyFromCpu(lmp_list.recvnum); - // auto sendnum_tensor = 
predictor_fl->GetInputHandle("sendnum"); - // sendnum_tensor->Reshape({nswap}); - // sendnum_tensor->CopyFromCpu(lmp_list.sendnum); - // auto communicator_tensor = - // predictor_fl->GetInputHandle("communicator"); - // communicator_tensor->Reshape({1}); - // communicator_tensor->CopyFromCpu(static_cast(lmp_list.world)); - // auto sendlist_tensor = predictor_fl->GetInputHandle("sendlist"); - - // int total_send = - // std::accumulate(lmp_list.sendnum, lmp_list.sendnum + nswap, 0); + if (do_message_passing) { + auto sendproc_tensor = predictor_fl->GetInputHandle("send_proc"); + auto recvproc_tensor = predictor_fl->GetInputHandle("recv_proc"); + auto recvnum_tensor = predictor_fl->GetInputHandle("recv_num"); + auto sendnum_tensor = predictor_fl->GetInputHandle("send_num"); + auto communicator_tensor = predictor_fl->GetInputHandle("communicator"); + auto sendlist_tensor = predictor_fl->GetInputHandle("send_list"); + + int nswap = lmp_list.nswap; + sendproc_tensor->Reshape({nswap}); + sendproc_tensor->CopyFromCpu(lmp_list.sendproc); + + recvproc_tensor->Reshape({nswap}); + recvproc_tensor->CopyFromCpu(lmp_list.recvproc); + + recvnum_tensor->Reshape({nswap}); + recvnum_tensor->CopyFromCpu(lmp_list.recvnum); + + sendnum_tensor->Reshape({nswap}); + if (sizeof(lmp_list.sendnum[0]) != sizeof(int32_t)) { + std::vector temp_data(nswap); + for (int i = 0; i < nswap; i++) { + temp_data[i] = static_cast(lmp_list.sendnum[i]); + } + sendnum_tensor->CopyFromCpu(temp_data.data()); + } else { + sendnum_tensor->CopyFromCpu(lmp_list.sendnum); + } + communicator_tensor->Reshape({1}); + if (lmp_list.world) { + communicator_tensor->CopyFromCpu(static_cast(lmp_list.world)); + } + + assert(sizeof(std::intptr_t) == 8); + int total_send = + std::accumulate(lmp_list.sendnum, lmp_list.sendnum + nswap, 0); + sendlist_tensor->Reshape({total_send}); + + /** + ** NOTE: paddle do not support construct a Tensor with from_blob(T**, ...) 
+ ** from a double pointer, so we convert int* pointer to indptr_t for each + ** entry and wrap it into int64 Tensor as a workaround. + */ + std::vector pointer_addresses; + pointer_addresses.reserve(nswap); + for (int iswap = 0; iswap < nswap; ++iswap) { + std::intptr_t addr = + reinterpret_cast(lmp_list.sendlist[iswap]); + pointer_addresses.push_back(addr); + } + sendlist_tensor->CopyFromCpu(pointer_addresses.data()); } - if (do_message_passing == 1 && nghost == 0) { - throw deepmd::deepmd_exception( - "(do_message_passing == 1 && nghost == 0) is not supported yet."); + if (lmp_list.mapping) { + std::vector mapping(nall_real); + for (size_t ii = 0; ii < nall_real; ii++) { + mapping[ii] = lmp_list.mapping[fwd_map[ii]]; + } + this->mapping_tensor = predictor_fl->GetInputHandle("mapping"); + this->mapping_tensor->Reshape({1, nall_real}); + this->mapping_tensor->CopyFromCpu(mapping.data()); } } - std::vector firstneigh = createNlistTensorPD(nlist_data.jlist); - firstneigh_tensor = predictor_fl->GetInputHandle("nlist"); - firstneigh_tensor->Reshape({1, nloc, (int)firstneigh.size() / (int)nloc}); - firstneigh_tensor->CopyFromCpu(firstneigh.data()); + this->firstneigh_tensor = predictor_fl->GetInputHandle("nlist"); + fillNlistTensor(nlist_data.jlist, this->firstneigh_tensor); bool do_atom_virial_tensor = atomic; - std::unique_ptr fparam_tensor; if (!fparam.empty()) { + std::unique_ptr fparam_tensor; fparam_tensor = predictor_fl->GetInputHandle("fparam"); fparam_tensor->Reshape({1, static_cast(fparam.size())}); - fparam_tensor->CopyFromCpu((fparam.data())); + fparam_tensor->CopyFromCpu(fparam.data()); } - std::unique_ptr aparam_tensor; if (!aparam_.empty()) { + std::unique_ptr aparam_tensor; aparam_tensor = predictor_fl->GetInputHandle("aparam"); aparam_tensor->Reshape( {1, lmp_list.inum, static_cast(aparam_.size()) / lmp_list.inum}); @@ -264,9 +477,9 @@ void DeepPotPD::compute(ENERGYVTYPE& ener, auto energy_ = predictor_fl->GetOutputHandle(output_names.at(1)); auto 
force_ = predictor_fl->GetOutputHandle(output_names.at(2)); auto virial_ = predictor_fl->GetOutputHandle(output_names.at(4)); - int output_energy_size = numel(*energy_); - int output_force_size = numel(*force_); - int output_virial_size = numel(*virial_); + size_t output_energy_size = numel(*energy_); + size_t output_force_size = numel(*force_); + size_t output_virial_size = numel(*virial_); // output energy ener.resize(output_energy_size); energy_->CopyToCpu(ener.data()); @@ -300,7 +513,7 @@ void DeepPotPD::compute(ENERGYVTYPE& ener, } } template void DeepPotPD::compute>( - std::vector& dener, + std::vector& ener, std::vector& force, std::vector& virial, std::vector& atom_energy, @@ -312,11 +525,10 @@ template void DeepPotPD::compute>( const InputNlist& lmp_list, const int& ago, const std::vector& fparam, - const std::vector& aparam_, + const std::vector& aparam, const bool atomic); - template void DeepPotPD::compute>( - std::vector& dener, + std::vector& ener, std::vector& force, std::vector& virial, std::vector& atom_energy, @@ -328,9 +540,8 @@ template void DeepPotPD::compute>( const InputNlist& lmp_list, const int& ago, const std::vector& fparam, - const std::vector& aparam_, + const std::vector& aparam, const bool atomic); - // ENERGYVTYPE: std::vector or ENERGYTYPE template void DeepPotPD::compute(ENERGYVTYPE& ener, @@ -352,9 +563,9 @@ void DeepPotPD::compute(ENERGYVTYPE& ener, coord_wrapped_Tensor->Reshape({1, natoms, 3}); coord_wrapped_Tensor->CopyFromCpu(coord_wrapped.data()); - std::vector atype_64(atype.begin(), atype.end()); auto atype_Tensor = predictor->GetInputHandle("atype"); atype_Tensor->Reshape({1, natoms}); + std::vector atype_64(atype.begin(), atype.end()); atype_Tensor->CopyFromCpu(atype_64.data()); std::unique_ptr box_Tensor; @@ -363,15 +574,15 @@ void DeepPotPD::compute(ENERGYVTYPE& ener, box_Tensor->Reshape({1, 9}); box_Tensor->CopyFromCpu((box.data())); } - std::unique_ptr fparam_tensor; if (!fparam.empty()) { - fparam_tensor = 
predictor->GetInputHandle("box"); + std::unique_ptr fparam_tensor; + fparam_tensor = predictor->GetInputHandle("fparam"); fparam_tensor->Reshape({1, static_cast(fparam.size())}); fparam_tensor->CopyFromCpu((fparam.data())); } - std::unique_ptr aparam_tensor; if (!aparam.empty()) { - aparam_tensor = predictor->GetInputHandle("box"); + std::unique_ptr aparam_tensor; + aparam_tensor = predictor->GetInputHandle("aparam"); aparam_tensor->Reshape( {1, natoms, static_cast(aparam.size()) / natoms}); aparam_tensor->CopyFromCpu((aparam.data())); @@ -387,17 +598,17 @@ void DeepPotPD::compute(ENERGYVTYPE& ener, auto force_ = predictor->GetOutputHandle(output_names.at(3)); auto virial_ = predictor->GetOutputHandle(output_names.at(5)); - int enery_numel = numel(*energy_); + size_t enery_numel = numel(*energy_); assert(enery_numel > 0); ener.resize(enery_numel); energy_->CopyToCpu(ener.data()); - int force_numel = numel(*force_); + size_t force_numel = numel(*force_); assert(force_numel > 0); force.resize(force_numel); force_->CopyToCpu(force.data()); - int virial_numel = numel(*virial_); + size_t virial_numel = numel(*virial_); assert(virial_numel > 0); virial.resize(virial_numel); virial_->CopyToCpu(virial.data()); @@ -405,8 +616,8 @@ void DeepPotPD::compute(ENERGYVTYPE& ener, if (atomic) { auto atom_energy_ = predictor->GetOutputHandle(output_names.at(0)); auto atom_virial_ = predictor->GetOutputHandle(output_names.at(1)); - int atom_energy_numel = numel(*atom_energy_); - int atom_virial_numel = numel(*atom_virial_); + size_t atom_energy_numel = numel(*atom_energy_); + size_t atom_virial_numel = numel(*atom_virial_); assert(atom_energy_numel > 0); assert(atom_virial_numel > 0); atom_energy.resize(atom_energy_numel); @@ -418,11 +629,11 @@ void DeepPotPD::compute(ENERGYVTYPE& ener, template void DeepPotPD::compute>( std::vector& ener, - std::vector& dforce, + std::vector& force, std::vector& virial, std::vector& atom_energy, std::vector& atom_virial, - const std::vector& dcoord, 
+ const std::vector& coord, const std::vector& atype, const std::vector& box, const std::vector& fparam, @@ -435,7 +646,7 @@ template void DeepPotPD::compute>( std::vector& virial, std::vector& atom_energy, std::vector& atom_virial, - const std::vector& dcoord, + const std::vector& coord, const std::vector& atype, const std::vector& box, const std::vector& fparam, @@ -446,7 +657,7 @@ template void DeepPotPD::compute>( that need to be postprocessed */ void DeepPotPD::get_type_map(std::string& type_map) { auto type_map_tensor = predictor->GetOutputHandle("buffer_type_map"); - int type_map_size = numel(*type_map_tensor); + size_t type_map_size = numel(*type_map_tensor); std::vector type_map_arr(type_map_size, 0); type_map_tensor->CopyToCpu(type_map_arr.data()); @@ -460,7 +671,7 @@ template void DeepPotPD::get_buffer(const std::string& buffer_name, std::vector& buffer_array) { auto buffer_tensor = predictor->GetOutputHandle(buffer_name); - int buffer_size = numel(*buffer_tensor); + size_t buffer_size = numel(*buffer_tensor); buffer_array.resize(buffer_size); buffer_tensor->CopyToCpu(buffer_array.data()); } diff --git a/source/api_cc/src/DeepPotPT.cc b/source/api_cc/src/DeepPotPT.cc index 5f03f7c5cb..3fdfeeae27 100644 --- a/source/api_cc/src/DeepPotPT.cc +++ b/source/api_cc/src/DeepPotPT.cc @@ -2,6 +2,7 @@ #ifdef BUILD_PYTORCH #include "DeepPotPT.h" +#include #include #include @@ -69,13 +70,9 @@ void DeepPotPT::init(const std::string& model, } deepmd::load_op_library(); int gpu_num = torch::cuda::device_count(); - if (gpu_num > 0) { - gpu_id = gpu_rank % gpu_num; - } else { - gpu_id = 0; - } - torch::Device device(torch::kCUDA, gpu_id); + gpu_id = (gpu_num > 0) ? 
(gpu_rank % gpu_num) : 0; gpu_enabled = torch::cuda::is_available(); + torch::Device device(torch::kCUDA, gpu_id); if (!gpu_enabled) { device = torch::Device(torch::kCPU); std::cout << "load model from: " << model << " to cpu " << std::endl; @@ -86,6 +83,37 @@ void DeepPotPT::init(const std::string& model, std::cout << "load model from: " << model << " to gpu " << gpu_id << std::endl; } + + // Configure PyTorch profiler + const char* env_profiler = std::getenv("DP_PROFILER"); + if (env_profiler && *env_profiler) { + using torch::profiler::impl::ActivityType; + using torch::profiler::impl::ExperimentalConfig; + using torch::profiler::impl::ProfilerConfig; + using torch::profiler::impl::ProfilerState; + std::set activities{ActivityType::CPU}; + if (gpu_enabled) { + activities.insert(ActivityType::CUDA); + } + profiler_file = std::string(env_profiler); + if (gpu_enabled) { + profiler_file += "_gpu" + std::to_string(gpu_id); + } + profiler_file += ".json"; + ExperimentalConfig exp_cfg; + ProfilerConfig cfg(ProfilerState::KINETO, + false, // report_input_shapes + false, // profile_memory + true, // with_stack + false, // with_flops + true, // with_modules + exp_cfg); + torch::autograd::profiler::prepareProfiler(cfg, activities); + torch::autograd::profiler::enableProfiler(cfg, activities); + std::cout << "PyTorch profiler enabled, output file: " << profiler_file + << std::endl; + profiler_enabled = true; + } std::unordered_map metadata = {{"type", ""}}; module = torch::jit::load(model, device, metadata); module.eval(); @@ -119,7 +147,17 @@ void DeepPotPT::init(const std::string& model, aparam_nall = module.run_method("is_aparam_nall").toBool(); inited = true; } -DeepPotPT::~DeepPotPT() {} + +DeepPotPT::~DeepPotPT() { + if (profiler_enabled) { + auto result = torch::autograd::profiler::disableProfiler(); + if (result) { + result->save(profiler_file); + } + std::cout << "PyTorch profiler result saved to " << profiler_file + << std::endl; + } +} template void 
DeepPotPT::compute(ENERGYVTYPE& ener, @@ -197,12 +235,12 @@ void DeepPotPT::compute(ENERGYVTYPE& ener, std::accumulate(lmp_list.sendnum, lmp_list.sendnum + nswap, 0); torch::Tensor sendlist_tensor = torch::from_blob(lmp_list.sendlist, {total_send}, int32_option); - comm_dict.insert("send_list", sendlist_tensor); - comm_dict.insert("send_proc", sendproc_tensor); - comm_dict.insert("recv_proc", recvproc_tensor); - comm_dict.insert("send_num", sendnum_tensor); - comm_dict.insert("recv_num", recvnum_tensor); - comm_dict.insert("communicator", communicator_tensor); + comm_dict.insert_or_assign("send_list", sendlist_tensor); + comm_dict.insert_or_assign("send_proc", sendproc_tensor); + comm_dict.insert_or_assign("recv_proc", recvproc_tensor); + comm_dict.insert_or_assign("send_num", sendnum_tensor); + comm_dict.insert_or_assign("recv_num", recvnum_tensor); + comm_dict.insert_or_assign("communicator", communicator_tensor); } if (lmp_list.mapping) { std::vector mapping(nall_real); diff --git a/source/api_cc/src/DeepSpinPT.cc b/source/api_cc/src/DeepSpinPT.cc index 19d5368213..8ccf2fd383 100644 --- a/source/api_cc/src/DeepSpinPT.cc +++ b/source/api_cc/src/DeepSpinPT.cc @@ -205,13 +205,13 @@ void DeepSpinPT::compute(ENERGYVTYPE& ener, torch::Tensor sendlist_tensor = torch::from_blob(lmp_list.sendlist, {total_send}, int32_option); torch::Tensor has_spin = torch::tensor({1}, int32_option); - comm_dict.insert("send_list", sendlist_tensor); - comm_dict.insert("send_proc", sendproc_tensor); - comm_dict.insert("recv_proc", recvproc_tensor); - comm_dict.insert("send_num", sendnum_tensor); - comm_dict.insert("recv_num", recvnum_tensor); - comm_dict.insert("communicator", communicator_tensor); - comm_dict.insert("has_spin", has_spin); + comm_dict.insert_or_assign("send_list", sendlist_tensor); + comm_dict.insert_or_assign("send_proc", sendproc_tensor); + comm_dict.insert_or_assign("recv_proc", recvproc_tensor); + comm_dict.insert_or_assign("send_num", sendnum_tensor); + 
comm_dict.insert_or_assign("recv_num", recvnum_tensor); + comm_dict.insert_or_assign("communicator", communicator_tensor); + comm_dict.insert_or_assign("has_spin", has_spin); } } at::Tensor firstneigh = createNlistTensor2(nlist_data.jlist); diff --git a/source/api_cc/src/DeepTensor.cc b/source/api_cc/src/DeepTensor.cc index a9031472e6..ce9ca9dea2 100644 --- a/source/api_cc/src/DeepTensor.cc +++ b/source/api_cc/src/DeepTensor.cc @@ -6,24 +6,27 @@ #ifdef BUILD_TENSORFLOW #include "DeepTensorTF.h" #endif +#ifdef BUILD_PYTORCH +#include "DeepTensorPT.h" +#endif #include "common.h" using namespace deepmd; DeepTensor::DeepTensor() : inited(false) {} -DeepTensor::DeepTensor(const std::string &model, - const int &gpu_rank, - const std::string &name_scope_) +DeepTensor::DeepTensor(const std::string& model, + const int& gpu_rank, + const std::string& name_scope_) : inited(false) { init(model, gpu_rank, name_scope_); } DeepTensor::~DeepTensor() {} -void DeepTensor::init(const std::string &model, - const int &gpu_rank, - const std::string &name_scope_) { +void DeepTensor::init(const std::string& model, + const int& gpu_rank, + const std::string& name_scope_) { if (inited) { std::cerr << "WARNING: deepmd-kit should not be initialized twice, do " "nothing at the second call of initializer" @@ -38,7 +41,11 @@ void DeepTensor::init(const std::string &model, throw deepmd::deepmd_exception("TensorFlow backend is not built."); #endif } else if (deepmd::DPBackend::PyTorch == backend) { - throw deepmd::deepmd_exception("PyTorch backend is not supported yet"); +#ifdef BUILD_PYTORCH + dt = std::make_shared(model, gpu_rank, name_scope_); +#else + throw deepmd::deepmd_exception("PyTorch backend is not built."); +#endif } else if (deepmd::DPBackend::Paddle == backend) { throw deepmd::deepmd_exception("PaddlePaddle backend is not supported yet"); } else { @@ -47,183 +54,183 @@ void DeepTensor::init(const std::string &model, inited = true; } -void DeepTensor::print_summary(const std::string 
&pre) const { +void DeepTensor::print_summary(const std::string& pre) const { deepmd::print_summary(pre); } template -void DeepTensor::compute(std::vector &dtensor_, - const std::vector &dcoord_, - const std::vector &datype_, - const std::vector &dbox) { +void DeepTensor::compute(std::vector& dtensor_, + const std::vector& dcoord_, + const std::vector& datype_, + const std::vector& dbox) { std::vector force_, virial_, datom_tensor_, datom_virial_; dt->computew(dtensor_, force_, virial_, datom_tensor_, datom_virial_, dcoord_, datype_, dbox, false); } -template void DeepTensor::compute(std::vector &dtensor_, - const std::vector &dcoord_, - const std::vector &datype_, - const std::vector &dbox); +template void DeepTensor::compute(std::vector& dtensor_, + const std::vector& dcoord_, + const std::vector& datype_, + const std::vector& dbox); -template void DeepTensor::compute(std::vector &dtensor_, - const std::vector &dcoord_, - const std::vector &datype_, - const std::vector &dbox); +template void DeepTensor::compute(std::vector& dtensor_, + const std::vector& dcoord_, + const std::vector& datype_, + const std::vector& dbox); template -void DeepTensor::compute(std::vector &dtensor_, - const std::vector &dcoord_, - const std::vector &datype_, - const std::vector &dbox, +void DeepTensor::compute(std::vector& dtensor_, + const std::vector& dcoord_, + const std::vector& datype_, + const std::vector& dbox, const int nghost, - const InputNlist &lmp_list) { + const InputNlist& lmp_list) { std::vector force_, virial_, datom_tensor_, datom_virial_; dt->computew(dtensor_, force_, virial_, datom_tensor_, datom_virial_, dcoord_, datype_, dbox, nghost, lmp_list, false); } -template void DeepTensor::compute(std::vector &dtensor_, - const std::vector &dcoord_, - const std::vector &datype_, - const std::vector &dbox, +template void DeepTensor::compute(std::vector& dtensor_, + const std::vector& dcoord_, + const std::vector& datype_, + const std::vector& dbox, const int nghost, - const 
InputNlist &lmp_list); + const InputNlist& lmp_list); -template void DeepTensor::compute(std::vector &dtensor_, - const std::vector &dcoord_, - const std::vector &datype_, - const std::vector &dbox, +template void DeepTensor::compute(std::vector& dtensor_, + const std::vector& dcoord_, + const std::vector& datype_, + const std::vector& dbox, const int nghost, - const InputNlist &lmp_list); + const InputNlist& lmp_list); template -void DeepTensor::compute(std::vector &dglobal_tensor_, - std::vector &dforce_, - std::vector &dvirial_, - const std::vector &dcoord_, - const std::vector &datype_, - const std::vector &dbox) { +void DeepTensor::compute(std::vector& dglobal_tensor_, + std::vector& dforce_, + std::vector& dvirial_, + const std::vector& dcoord_, + const std::vector& datype_, + const std::vector& dbox) { std::vector datom_tensor_, datom_virial_; dt->computew(dglobal_tensor_, dforce_, dvirial_, datom_tensor_, datom_virial_, dcoord_, datype_, dbox, true); } -template void DeepTensor::compute(std::vector &dglobal_tensor_, - std::vector &dforce_, - std::vector &dvirial_, - const std::vector &dcoord_, - const std::vector &datype_, - const std::vector &dbox); +template void DeepTensor::compute(std::vector& dglobal_tensor_, + std::vector& dforce_, + std::vector& dvirial_, + const std::vector& dcoord_, + const std::vector& datype_, + const std::vector& dbox); -template void DeepTensor::compute(std::vector &dglobal_tensor_, - std::vector &dforce_, - std::vector &dvirial_, - const std::vector &dcoord_, - const std::vector &datype_, - const std::vector &dbox); +template void DeepTensor::compute(std::vector& dglobal_tensor_, + std::vector& dforce_, + std::vector& dvirial_, + const std::vector& dcoord_, + const std::vector& datype_, + const std::vector& dbox); template -void DeepTensor::compute(std::vector &dglobal_tensor_, - std::vector &dforce_, - std::vector &dvirial_, - const std::vector &dcoord_, - const std::vector &datype_, - const std::vector &dbox, +void 
DeepTensor::compute(std::vector& dglobal_tensor_, + std::vector& dforce_, + std::vector& dvirial_, + const std::vector& dcoord_, + const std::vector& datype_, + const std::vector& dbox, const int nghost, - const InputNlist &lmp_list) { + const InputNlist& lmp_list) { std::vector datom_tensor_, datom_virial_; dt->computew(dglobal_tensor_, dforce_, dvirial_, datom_tensor_, datom_virial_, dcoord_, datype_, dbox, nghost, lmp_list, true); } -template void DeepTensor::compute(std::vector &dglobal_tensor_, - std::vector &dforce_, - std::vector &dvirial_, - const std::vector &dcoord_, - const std::vector &datype_, - const std::vector &dbox, +template void DeepTensor::compute(std::vector& dglobal_tensor_, + std::vector& dforce_, + std::vector& dvirial_, + const std::vector& dcoord_, + const std::vector& datype_, + const std::vector& dbox, const int nghost, - const InputNlist &lmp_list); - -template void DeepTensor::compute(std::vector &dglobal_tensor_, - std::vector &dforce_, - std::vector &dvirial_, - const std::vector &dcoord_, - const std::vector &datype_, - const std::vector &dbox, + const InputNlist& lmp_list); + +template void DeepTensor::compute(std::vector& dglobal_tensor_, + std::vector& dforce_, + std::vector& dvirial_, + const std::vector& dcoord_, + const std::vector& datype_, + const std::vector& dbox, const int nghost, - const InputNlist &lmp_list); + const InputNlist& lmp_list); template -void DeepTensor::compute(std::vector &dglobal_tensor_, - std::vector &dforce_, - std::vector &dvirial_, - std::vector &datom_tensor_, - std::vector &datom_virial_, - const std::vector &dcoord_, - const std::vector &datype_, - const std::vector &dbox) { +void DeepTensor::compute(std::vector& dglobal_tensor_, + std::vector& dforce_, + std::vector& dvirial_, + std::vector& datom_tensor_, + std::vector& datom_virial_, + const std::vector& dcoord_, + const std::vector& datype_, + const std::vector& dbox) { dt->computew(dglobal_tensor_, dforce_, dvirial_, datom_tensor_, 
datom_virial_, dcoord_, datype_, dbox, true); } -template void DeepTensor::compute(std::vector &dglobal_tensor_, - std::vector &dforce_, - std::vector &dvirial_, - std::vector &datom_tensor_, - std::vector &datom_virial_, - const std::vector &dcoord_, - const std::vector &datype_, - const std::vector &dbox); - -template void DeepTensor::compute(std::vector &dglobal_tensor_, - std::vector &dforce_, - std::vector &dvirial_, - std::vector &datom_tensor_, - std::vector &datom_virial_, - const std::vector &dcoord_, - const std::vector &datype_, - const std::vector &dbox); +template void DeepTensor::compute(std::vector& dglobal_tensor_, + std::vector& dforce_, + std::vector& dvirial_, + std::vector& datom_tensor_, + std::vector& datom_virial_, + const std::vector& dcoord_, + const std::vector& datype_, + const std::vector& dbox); + +template void DeepTensor::compute(std::vector& dglobal_tensor_, + std::vector& dforce_, + std::vector& dvirial_, + std::vector& datom_tensor_, + std::vector& datom_virial_, + const std::vector& dcoord_, + const std::vector& datype_, + const std::vector& dbox); template -void DeepTensor::compute(std::vector &dglobal_tensor_, - std::vector &dforce_, - std::vector &dvirial_, - std::vector &datom_tensor_, - std::vector &datom_virial_, - const std::vector &dcoord_, - const std::vector &datype_, - const std::vector &dbox, +void DeepTensor::compute(std::vector& dglobal_tensor_, + std::vector& dforce_, + std::vector& dvirial_, + std::vector& datom_tensor_, + std::vector& datom_virial_, + const std::vector& dcoord_, + const std::vector& datype_, + const std::vector& dbox, const int nghost, - const InputNlist &lmp_list) { + const InputNlist& lmp_list) { dt->computew(dglobal_tensor_, dforce_, dvirial_, datom_tensor_, datom_virial_, dcoord_, datype_, dbox, nghost, lmp_list, true); } -template void DeepTensor::compute(std::vector &dglobal_tensor_, - std::vector &dforce_, - std::vector &dvirial_, - std::vector &datom_tensor_, - std::vector &datom_virial_, 
- const std::vector &dcoord_, - const std::vector &datype_, - const std::vector &dbox, +template void DeepTensor::compute(std::vector& dglobal_tensor_, + std::vector& dforce_, + std::vector& dvirial_, + std::vector& datom_tensor_, + std::vector& datom_virial_, + const std::vector& dcoord_, + const std::vector& datype_, + const std::vector& dbox, const int nghost, - const InputNlist &lmp_list); - -template void DeepTensor::compute(std::vector &dglobal_tensor_, - std::vector &dforce_, - std::vector &dvirial_, - std::vector &datom_tensor_, - std::vector &datom_virial_, - const std::vector &dcoord_, - const std::vector &datype_, - const std::vector &dbox, + const InputNlist& lmp_list); + +template void DeepTensor::compute(std::vector& dglobal_tensor_, + std::vector& dforce_, + std::vector& dvirial_, + std::vector& datom_tensor_, + std::vector& datom_virial_, + const std::vector& dcoord_, + const std::vector& datype_, + const std::vector& dbox, const int nghost, - const InputNlist &lmp_list); + const InputNlist& lmp_list); -void DeepTensor::get_type_map(std::string &type_map) { +void DeepTensor::get_type_map(std::string& type_map) { dt->get_type_map(type_map); } @@ -231,7 +238,7 @@ double DeepTensor::cutoff() const { return dt->cutoff(); } int DeepTensor::output_dim() const { return dt->output_dim(); } -const std::vector &DeepTensor::sel_types() const { +const std::vector& DeepTensor::sel_types() const { return dt->sel_types(); } diff --git a/source/api_cc/src/DeepTensorPT.cc b/source/api_cc/src/DeepTensorPT.cc new file mode 100644 index 0000000000..1636f3af95 --- /dev/null +++ b/source/api_cc/src/DeepTensorPT.cc @@ -0,0 +1,482 @@ +// SPDX-License-Identifier: LGPL-3.0-or-later +#ifdef BUILD_PYTORCH +#include "DeepTensorPT.h" + +#include + +#include +#include // for std::iota +#include + +#include "common.h" +#include "device.h" +#include "errors.h" + +using namespace deepmd; + +static torch::Tensor createNlistTensor( + const std::vector>& data) { + size_t total_size = 
0; + for (const auto& row : data) { + total_size += row.size(); + } + std::vector flat_data; + flat_data.reserve(total_size); + for (const auto& row : data) { + flat_data.insert(flat_data.end(), row.begin(), row.end()); + } + + torch::Tensor flat_tensor = torch::tensor(flat_data, torch::kInt32); + int nloc = data.size(); + int nnei = nloc > 0 ? total_size / nloc : 0; + return flat_tensor.view({1, nloc, nnei}); +} + +void DeepTensorPT::translate_error(std::function f) { + try { + f(); + // it seems that libtorch may throw different types of exceptions which are + // inherbited from different base classes + // https://github.com/pytorch/pytorch/blob/13316a8d4642454012d34da0d742f1ba93fc0667/torch/csrc/jit/runtime/interpreter.cpp#L924-L939 + } catch (const c10::Error& e) { + throw deepmd::deepmd_exception("DeePMD-kit PyTorch backend error: " + + std::string(e.what())); + } catch (const torch::jit::JITException& e) { + throw deepmd::deepmd_exception("DeePMD-kit PyTorch backend JIT error: " + + std::string(e.what())); + } catch (const std::runtime_error& e) { + throw deepmd::deepmd_exception("DeePMD-kit PyTorch backend error: " + + std::string(e.what())); + } +} + +DeepTensorPT::DeepTensorPT() : inited(false) {} + +DeepTensorPT::DeepTensorPT(const std::string& model, + const int& gpu_rank, + const std::string& name_scope_) + : inited(false), name_scope(name_scope_) { + try { + translate_error([&] { init(model, gpu_rank, name_scope_); }); + } catch (...) 
{ + // Clean up and rethrow, as the destructor will not be called + throw; + } +} + +void DeepTensorPT::init(const std::string& model, + const int& gpu_rank, + const std::string& name_scope_) { + if (inited) { + std::cerr << "WARNING: deepmd-kit should not be initialized twice, do " + "nothing at the second call of initializer" + << std::endl; + return; + } + name_scope = name_scope_; + deepmd::load_op_library(); + int gpu_num = torch::cuda::device_count(); + if (gpu_num > 0) { + gpu_id = gpu_rank % gpu_num; + } else { + gpu_id = 0; + } + torch::Device device(torch::kCUDA, gpu_id); + gpu_enabled = torch::cuda::is_available(); + if (!gpu_enabled) { + device = torch::Device(torch::kCPU); + std::cout << "load model from: " << model << " to cpu " << std::endl; + } else { +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM + DPErrcheck(DPSetDevice(gpu_id)); +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM + std::cout << "load model from: " << model << " to gpu " << gpu_id + << std::endl; + } + std::unordered_map metadata = {{"type", ""}}; + module = torch::jit::load(model, device, metadata); + module.eval(); + + get_env_nthreads(num_intra_nthreads, num_inter_nthreads); + if (num_inter_nthreads) { + try { + at::set_num_interop_threads(num_inter_nthreads); + } catch (...) { + } + } + if (num_intra_nthreads) { + try { + at::set_num_threads(num_intra_nthreads); + } catch (...) 
{ + } + } + + // Get model properties using run_method for C++ interface + auto rcut_result = module.run_method("get_rcut"); + rcut = rcut_result.toDouble(); + + auto ntypes_result = module.run_method("get_ntypes"); + ntypes = ntypes_result.toInt(); + + // Get task dimension from model method + auto task_dim_result = module.run_method("get_task_dim"); + odim = task_dim_result.toInt(); + + // Get type map and set up sel_type + auto type_map_result = module.run_method("get_type_map"); + auto type_map_list = type_map_result.toList(); + sel_type.clear(); + + // For PyTorch models, all types are included (the backend handles exclusions + // internally) The model always outputs all types, but some results may be + // zero + for (size_t i = 0; i < type_map_list.size(); ++i) { + sel_type.push_back(i); + } + inited = true; +} + +DeepTensorPT::~DeepTensorPT() {} + +void DeepTensorPT::get_type_map(std::string& type_map) { + auto type_map_result = module.run_method("get_type_map"); + auto type_map_list = type_map_result.toList(); + type_map.clear(); + for (const torch::IValue& element : type_map_list) { + if (!type_map.empty()) { + type_map += " "; + } + type_map += torch::str(element); + } +} + +template +void DeepTensorPT::compute(std::vector& global_tensor, + std::vector& force, + std::vector& virial, + std::vector& atom_tensor, + std::vector& atom_virial, + const std::vector& coord, + const std::vector& atype, + const std::vector& box, + const bool request_deriv) { + torch::Device device(torch::kCUDA, gpu_id); + if (!gpu_enabled) { + device = torch::Device(torch::kCPU); + } + + int natoms = atype.size(); + auto options = torch::TensorOptions().dtype(torch::kFloat64); + torch::ScalarType floatType = torch::kFloat64; + if (std::is_same::value) { + options = torch::TensorOptions().dtype(torch::kFloat32); + floatType = torch::kFloat32; + } + auto int_options = torch::TensorOptions().dtype(torch::kInt64); + + // Convert inputs to tensors + std::vector coord_wrapped = coord; + 
at::Tensor coord_tensor = + torch::from_blob(coord_wrapped.data(), {1, natoms, 3}, options) + .to(device); + + std::vector atype_64(atype.begin(), atype.end()); + at::Tensor atype_tensor = + torch::from_blob(atype_64.data(), {1, natoms}, int_options).to(device); + + c10::optional box_tensor; + if (!box.empty()) { + box_tensor = + torch::from_blob(const_cast(box.data()), {1, 9}, options) + .to(device); + } + + // Create input vector + std::vector inputs; + inputs.push_back(coord_tensor); + inputs.push_back(atype_tensor); + inputs.push_back(box_tensor); + + // Add None for fparam and aparam (not used by tensor models) + inputs.push_back(torch::jit::IValue()); // fparam = None + inputs.push_back(torch::jit::IValue()); // aparam = None + inputs.push_back(request_deriv); // do_atomic_virial + + // Forward pass through model + c10::Dict outputs = + module.forward(inputs).toGenericDict(); + + // Extract global dipole/polar results + c10::IValue global_out; + if (outputs.contains("global_dipole")) { + global_out = outputs.at("global_dipole"); + } else if (outputs.contains("global_polar")) { + global_out = outputs.at("global_polar"); + } else { + throw deepmd::deepmd_exception( + "Cannot find global tensor output in model results"); + } + torch::Tensor flat_global_ = global_out.toTensor().view({-1}).to(floatType); + torch::Tensor cpu_global_ = flat_global_.to(torch::kCPU); + global_tensor.assign(cpu_global_.data_ptr(), + cpu_global_.data_ptr() + cpu_global_.numel()); + + // Extract atomic dipole/polar results + c10::IValue atom_out; + if (outputs.contains("dipole")) { + atom_out = outputs.at("dipole"); + } else if (outputs.contains("polar")) { + atom_out = outputs.at("polar"); + } else { + throw deepmd::deepmd_exception( + "Cannot find atomic tensor output in model results"); + } + torch::Tensor flat_atom_ = atom_out.toTensor().view({-1}).to(floatType); + torch::Tensor cpu_atom_ = flat_atom_.to(torch::kCPU); + atom_tensor.assign(cpu_atom_.data_ptr(), + cpu_atom_.data_ptr() 
+ cpu_atom_.numel()); + + // Extract force results + c10::IValue force_ = outputs.at("force"); + torch::Tensor flat_force_ = force_.toTensor().view({-1}).to(floatType); + torch::Tensor cpu_force_ = flat_force_.to(torch::kCPU); + force.assign(cpu_force_.data_ptr(), + cpu_force_.data_ptr() + cpu_force_.numel()); + + // Extract virial results + c10::IValue virial_ = outputs.at("virial"); + torch::Tensor flat_virial_ = virial_.toTensor().view({-1}).to(floatType); + torch::Tensor cpu_virial_ = flat_virial_.to(torch::kCPU); + virial.assign(cpu_virial_.data_ptr(), + cpu_virial_.data_ptr() + cpu_virial_.numel()); + // Extract atomic virial results if requested + if (request_deriv) { + c10::IValue atom_virial_ = outputs.at("atom_virial"); + torch::Tensor flat_atom_virial_ = + atom_virial_.toTensor().view({-1}).to(floatType); + torch::Tensor cpu_atom_virial_ = flat_atom_virial_.to(torch::kCPU); + atom_virial.assign( + cpu_atom_virial_.data_ptr(), + cpu_atom_virial_.data_ptr() + cpu_atom_virial_.numel()); + } else { + atom_virial.clear(); + } +} + +template +void DeepTensorPT::compute(std::vector& global_tensor, + std::vector& force, + std::vector& virial, + std::vector& atom_tensor, + std::vector& atom_virial, + const std::vector& coord, + const std::vector& atype, + const std::vector& box, + const int nghost, + const InputNlist& lmp_list, + const bool request_deriv) { + torch::Device device(torch::kCUDA, gpu_id); + if (!gpu_enabled) { + device = torch::Device(torch::kCPU); + } + + int natoms = atype.size(); + auto options = torch::TensorOptions().dtype(torch::kFloat64); + torch::ScalarType floatType = torch::kFloat64; + if (std::is_same::value) { + options = torch::TensorOptions().dtype(torch::kFloat32); + floatType = torch::kFloat32; + } + auto int32_option = + torch::TensorOptions().device(torch::kCPU).dtype(torch::kInt32); + auto int_option = + torch::TensorOptions().device(torch::kCPU).dtype(torch::kInt64); + + // Select real atoms following DeepPotPT pattern + 
std::vector dcoord, aparam_; + std::vector datype, fwd_map, bkw_map; + int nghost_real, nall_real, nloc_real; + int nall = natoms; + int nframes = 1; + std::vector aparam; // Empty for tensor models + select_real_atoms_coord(dcoord, datype, aparam_, nghost_real, fwd_map, + bkw_map, nall_real, nloc_real, coord, atype, aparam, + nghost, ntypes, nframes, 0, nall, false); + + std::vector coord_wrapped = dcoord; + at::Tensor coord_wrapped_Tensor = + torch::from_blob(coord_wrapped.data(), {1, nall_real, 3}, options) + .to(device); + std::vector atype_64(datype.begin(), datype.end()); + at::Tensor atype_Tensor = + torch::from_blob(atype_64.data(), {1, nall_real}, int_option).to(device); + + // Process neighbor list following DeepPotPT pattern + nlist_data.copy_from_nlist(lmp_list, nall - nghost); + nlist_data.shuffle_exclude_empty(fwd_map); + nlist_data.padding(); + + at::Tensor firstneigh = createNlistTensor(nlist_data.jlist); + firstneigh_tensor = firstneigh.to(torch::kInt64).to(device); + + bool do_atom_virial_tensor = request_deriv; + c10::optional fparam_tensor; + c10::optional aparam_tensor; + c10::optional mapping_tensor; + + // Use forward_lower method following DeepPotPT pattern + c10::Dict outputs = + module + .run_method("forward_lower", coord_wrapped_Tensor, atype_Tensor, + firstneigh_tensor, mapping_tensor, fparam_tensor, + aparam_tensor, do_atom_virial_tensor) + .toGenericDict(); + + // Extract outputs following DeepPotPT pattern + c10::IValue global_dipole_; + if (outputs.contains("global_dipole")) { + global_dipole_ = outputs.at("global_dipole"); + } else if (outputs.contains("global_polar")) { + global_dipole_ = outputs.at("global_polar"); + } else { + throw deepmd::deepmd_exception( + "Cannot find global tensor output in model results"); + } + // in Python, here used double; however, in TF C++, float is used + // for consistency, we use float + torch::Tensor flat_global_ = + global_dipole_.toTensor().view({-1}).to(floatType); + torch::Tensor cpu_global_ 
= flat_global_.to(torch::kCPU); + global_tensor.assign(cpu_global_.data_ptr(), + cpu_global_.data_ptr() + cpu_global_.numel()); + + c10::IValue force_ = outputs.at("extended_force"); + torch::Tensor flat_force_ = force_.toTensor().view({-1}).to(floatType); + torch::Tensor cpu_force_ = flat_force_.to(torch::kCPU); + std::vector dforce; + dforce.assign(cpu_force_.data_ptr(), + cpu_force_.data_ptr() + cpu_force_.numel()); + + c10::IValue virial_ = outputs.at("virial"); + torch::Tensor flat_virial_ = virial_.toTensor().view({-1}).to(floatType); + torch::Tensor cpu_virial_ = flat_virial_.to(torch::kCPU); + virial.assign(cpu_virial_.data_ptr(), + cpu_virial_.data_ptr() + cpu_virial_.numel()); + + // bkw map for forces + force.resize(static_cast(nframes) * odim * fwd_map.size() * 3); + for (int kk = 0; kk < odim; ++kk) { + select_map(force.begin() + kk * fwd_map.size() * 3, + dforce.begin() + kk * bkw_map.size() * 3, bkw_map, 3); + } + + // Extract atomic dipoles/polars if available + c10::IValue atom_tensor_output; + int task_dim; + if (outputs.contains("dipole")) { + atom_tensor_output = outputs.at("dipole"); + task_dim = 3; // dipole has 3 components + } else if (outputs.contains("polar")) { + atom_tensor_output = outputs.at("polar"); + task_dim = 9; // polarizability has 9 components typically + } else { + throw deepmd::deepmd_exception( + "Cannot find atomic tensor output in model results"); + } + + torch::Tensor flat_atom_tensor_ = + atom_tensor_output.toTensor().view({-1}).to(floatType); + torch::Tensor cpu_atom_tensor_ = flat_atom_tensor_.to(torch::kCPU); + std::vector datom_tensor; + datom_tensor.assign( + cpu_atom_tensor_.data_ptr(), + cpu_atom_tensor_.data_ptr() + cpu_atom_tensor_.numel()); + atom_tensor.resize(static_cast(nframes) * fwd_map.size() * task_dim); + select_map(atom_tensor, datom_tensor, bkw_map, task_dim, nframes, + fwd_map.size(), nall_real); + + if (request_deriv) { + c10::IValue atom_virial_ = outputs.at("extended_virial"); + torch::Tensor 
flat_atom_virial_ = + atom_virial_.toTensor().view({-1}).to(floatType); + torch::Tensor cpu_atom_virial_ = flat_atom_virial_.to(torch::kCPU); + std::vector datom_virial; + datom_virial.assign( + cpu_atom_virial_.data_ptr(), + cpu_atom_virial_.data_ptr() + cpu_atom_virial_.numel()); + atom_virial.resize(static_cast(nframes) * odim * fwd_map.size() * + 9); + for (int kk = 0; kk < odim; ++kk) { + select_map(atom_virial.begin() + kk * fwd_map.size() * 9, + datom_virial.begin() + kk * bkw_map.size() * 9, + bkw_map, 9); + } + } +} + +// Public wrapper functions +void DeepTensorPT::computew(std::vector& global_tensor, + std::vector& force, + std::vector& virial, + std::vector& atom_tensor, + std::vector& atom_virial, + const std::vector& coord, + const std::vector& atype, + const std::vector& box, + const bool request_deriv) { + translate_error([&] { + compute(global_tensor, force, virial, atom_tensor, atom_virial, coord, + atype, box, request_deriv); + }); +} + +void DeepTensorPT::computew(std::vector& global_tensor, + std::vector& force, + std::vector& virial, + std::vector& atom_tensor, + std::vector& atom_virial, + const std::vector& coord, + const std::vector& atype, + const std::vector& box, + const bool request_deriv) { + translate_error([&] { + compute(global_tensor, force, virial, atom_tensor, atom_virial, coord, + atype, box, request_deriv); + }); +} + +void DeepTensorPT::computew(std::vector& global_tensor, + std::vector& force, + std::vector& virial, + std::vector& atom_tensor, + std::vector& atom_virial, + const std::vector& coord, + const std::vector& atype, + const std::vector& box, + const int nghost, + const InputNlist& inlist, + const bool request_deriv) { + translate_error([&] { + compute(global_tensor, force, virial, atom_tensor, atom_virial, coord, + atype, box, nghost, inlist, request_deriv); + }); +} + +void DeepTensorPT::computew(std::vector& global_tensor, + std::vector& force, + std::vector& virial, + std::vector& atom_tensor, + std::vector& 
atom_virial, + const std::vector& coord, + const std::vector& atype, + const std::vector& box, + const int nghost, + const InputNlist& inlist, + const bool request_deriv) { + translate_error([&] { + compute(global_tensor, force, virial, atom_tensor, atom_virial, coord, + atype, box, nghost, inlist, request_deriv); + }); +} + +#endif // BUILD_PYTORCH diff --git a/source/api_cc/src/DeepTensorTF.cc b/source/api_cc/src/DeepTensorTF.cc index 1081473f25..d17c248f7e 100644 --- a/source/api_cc/src/DeepTensorTF.cc +++ b/source/api_cc/src/DeepTensorTF.cc @@ -7,9 +7,9 @@ using namespace tensorflow; DeepTensorTF::DeepTensorTF() : inited(false), graph_def(new GraphDef()) {} -DeepTensorTF::DeepTensorTF(const std::string &model, - const int &gpu_rank, - const std::string &name_scope_) +DeepTensorTF::DeepTensorTF(const std::string& model, + const int& gpu_rank, + const std::string& name_scope_) : inited(false), name_scope(name_scope_), graph_def(new GraphDef()) { try { init(model, gpu_rank, name_scope_); @@ -22,9 +22,9 @@ DeepTensorTF::DeepTensorTF(const std::string &model, DeepTensorTF::~DeepTensorTF() { delete graph_def; } -void DeepTensorTF::init(const std::string &model, - const int &gpu_rank, - const std::string &name_scope_) { +void DeepTensorTF::init(const std::string& model, + const int& gpu_rank, + const std::string& name_scope_) { if (inited) { std::cerr << "WARNING: deepmd-kit should not be initialized twice, do " "nothing at the second call of initializer" @@ -59,7 +59,7 @@ void DeepTensorTF::init(const std::string &model, deepmd::check_status(session->Create(*graph_def)); try { model_version = get_scalar("model_attr/model_version"); - } catch (deepmd::tf_exception &e) { + } catch (deepmd::tf_exception& e) { // no model version defined in old models model_version = "0.0"; } @@ -85,23 +85,23 @@ void DeepTensorTF::init(const std::string &model, } template -VT DeepTensorTF::get_scalar(const std::string &name) const { +VT DeepTensorTF::get_scalar(const std::string& name) 
const { return session_get_scalar(session, name, name_scope); } template -void DeepTensorTF::get_vector(std::vector &vec, - const std::string &name) const { +void DeepTensorTF::get_vector(std::vector& vec, + const std::string& name) const { session_get_vector(vec, session, name, name_scope); } template void DeepTensorTF::run_model( - std::vector &d_tensor_, - Session *session, - const std::vector> &input_tensors, - const AtomMap &atommap, - const std::vector &sel_fwd, + std::vector& d_tensor_, + Session* session, + const std::vector>& input_tensors, + const AtomMap& atommap, + const std::vector& sel_fwd, const int nghost) { unsigned nloc = atommap.get_type().size(); unsigned nall = nloc + nghost; @@ -139,46 +139,46 @@ void DeepTensorTF::run_model( } template void DeepTensorTF::run_model( - std::vector &d_tensor_, - Session *session, - const std::vector> &input_tensors, - const AtomMap &atommap, - const std::vector &sel_fwd, + std::vector& d_tensor_, + Session* session, + const std::vector>& input_tensors, + const AtomMap& atommap, + const std::vector& sel_fwd, const int nghost); template void DeepTensorTF::run_model( - std::vector &d_tensor_, - Session *session, - const std::vector> &input_tensors, - const AtomMap &atommap, - const std::vector &sel_fwd, + std::vector& d_tensor_, + Session* session, + const std::vector>& input_tensors, + const AtomMap& atommap, + const std::vector& sel_fwd, const int nghost); template void DeepTensorTF::run_model( - std::vector &d_tensor_, - Session *session, - const std::vector> &input_tensors, - const AtomMap &atommap, - const std::vector &sel_fwd, + std::vector& d_tensor_, + Session* session, + const std::vector>& input_tensors, + const AtomMap& atommap, + const std::vector& sel_fwd, const int nghost); template void DeepTensorTF::run_model( - std::vector &d_tensor_, - Session *session, - const std::vector> &input_tensors, - const AtomMap &atommap, - const std::vector &sel_fwd, + std::vector& d_tensor_, + Session* session, + const 
std::vector>& input_tensors, + const AtomMap& atommap, + const std::vector& sel_fwd, const int nghost); template void DeepTensorTF::run_model( - std::vector &dglobal_tensor_, - std::vector &dforce_, - std::vector &dvirial_, - std::vector &datom_tensor_, - std::vector &datom_virial_, - tensorflow::Session *session, - const std::vector> - &input_tensors, - const AtomMap &atommap, - const std::vector &sel_fwd, + std::vector& dglobal_tensor_, + std::vector& dforce_, + std::vector& dvirial_, + std::vector& datom_tensor_, + std::vector& datom_virial_, + tensorflow::Session* session, + const std::vector>& + input_tensors, + const AtomMap& atommap, + const std::vector& sel_fwd, const int nghost) { unsigned nloc = atommap.get_type().size(); unsigned nall = nloc + nghost; @@ -282,61 +282,61 @@ void DeepTensorTF::run_model( } template void DeepTensorTF::run_model( - std::vector &dglobal_tensor_, - std::vector &dforce_, - std::vector &dvirial_, - std::vector &datom_tensor_, - std::vector &datom_virial_, - tensorflow::Session *session, - const std::vector> - &input_tensors, - const AtomMap &atommap, - const std::vector &sel_fwd, + std::vector& dglobal_tensor_, + std::vector& dforce_, + std::vector& dvirial_, + std::vector& datom_tensor_, + std::vector& datom_virial_, + tensorflow::Session* session, + const std::vector>& + input_tensors, + const AtomMap& atommap, + const std::vector& sel_fwd, const int nghost); template void DeepTensorTF::run_model( - std::vector &dglobal_tensor_, - std::vector &dforce_, - std::vector &dvirial_, - std::vector &datom_tensor_, - std::vector &datom_virial_, - tensorflow::Session *session, - const std::vector> - &input_tensors, - const AtomMap &atommap, - const std::vector &sel_fwd, + std::vector& dglobal_tensor_, + std::vector& dforce_, + std::vector& dvirial_, + std::vector& datom_tensor_, + std::vector& datom_virial_, + tensorflow::Session* session, + const std::vector>& + input_tensors, + const AtomMap& atommap, + const std::vector& sel_fwd, 
const int nghost); template void DeepTensorTF::run_model( - std::vector &dglobal_tensor_, - std::vector &dforce_, - std::vector &dvirial_, - std::vector &datom_tensor_, - std::vector &datom_virial_, - tensorflow::Session *session, - const std::vector> - &input_tensors, - const AtomMap &atommap, - const std::vector &sel_fwd, + std::vector& dglobal_tensor_, + std::vector& dforce_, + std::vector& dvirial_, + std::vector& datom_tensor_, + std::vector& datom_virial_, + tensorflow::Session* session, + const std::vector>& + input_tensors, + const AtomMap& atommap, + const std::vector& sel_fwd, const int nghost); template void DeepTensorTF::run_model( - std::vector &dglobal_tensor_, - std::vector &dforce_, - std::vector &dvirial_, - std::vector &datom_tensor_, - std::vector &datom_virial_, - tensorflow::Session *session, - const std::vector> - &input_tensors, - const AtomMap &atommap, - const std::vector &sel_fwd, + std::vector& dglobal_tensor_, + std::vector& dforce_, + std::vector& dvirial_, + std::vector& datom_tensor_, + std::vector& datom_virial_, + tensorflow::Session* session, + const std::vector>& + input_tensors, + const AtomMap& atommap, + const std::vector& sel_fwd, const int nghost); template -void DeepTensorTF::compute(std::vector &dtensor_, - const std::vector &dcoord_, - const std::vector &datype_, - const std::vector &dbox) { +void DeepTensorTF::compute(std::vector& dtensor_, + const std::vector& dcoord_, + const std::vector& datype_, + const std::vector& dbox) { int nall = datype_.size(); std::vector dcoord, aparam, aparam_; std::vector datype, fwd_map, bkw_map; @@ -347,23 +347,23 @@ void DeepTensorTF::compute(std::vector &dtensor_, compute_inner(dtensor_, dcoord, datype, dbox); } -template void DeepTensorTF::compute(std::vector &dtensor_, - const std::vector &dcoord_, - const std::vector &datype_, - const std::vector &dbox); +template void DeepTensorTF::compute(std::vector& dtensor_, + const std::vector& dcoord_, + const std::vector& datype_, + const 
std::vector& dbox); -template void DeepTensorTF::compute(std::vector &dtensor_, - const std::vector &dcoord_, - const std::vector &datype_, - const std::vector &dbox); +template void DeepTensorTF::compute(std::vector& dtensor_, + const std::vector& dcoord_, + const std::vector& datype_, + const std::vector& dbox); template -void DeepTensorTF::compute(std::vector &dtensor_, - const std::vector &dcoord_, - const std::vector &datype_, - const std::vector &dbox, +void DeepTensorTF::compute(std::vector& dtensor_, + const std::vector& dcoord_, + const std::vector& datype_, + const std::vector& dbox, const int nghost, - const InputNlist &lmp_list) { + const InputNlist& lmp_list) { int nall = datype_.size(); std::vector dcoord, dforce, datom_virial, aparam, aparam_; std::vector datype, fwd_map, bkw_map; @@ -380,29 +380,29 @@ void DeepTensorTF::compute(std::vector &dtensor_, compute_inner(dtensor_, dcoord, datype, dbox, nghost_real, nlist); } -template void DeepTensorTF::compute(std::vector &dtensor_, - const std::vector &dcoord_, - const std::vector &datype_, - const std::vector &dbox, +template void DeepTensorTF::compute(std::vector& dtensor_, + const std::vector& dcoord_, + const std::vector& datype_, + const std::vector& dbox, const int nghost, - const InputNlist &lmp_list); + const InputNlist& lmp_list); -template void DeepTensorTF::compute(std::vector &dtensor_, - const std::vector &dcoord_, - const std::vector &datype_, - const std::vector &dbox, +template void DeepTensorTF::compute(std::vector& dtensor_, + const std::vector& dcoord_, + const std::vector& datype_, + const std::vector& dbox, const int nghost, - const InputNlist &lmp_list); + const InputNlist& lmp_list); template -void DeepTensorTF::compute(std::vector &dglobal_tensor_, - std::vector &dforce_, - std::vector &dvirial_, - std::vector &datom_tensor_, - std::vector &datom_virial_, - const std::vector &dcoord_, - const std::vector &datype_, - const std::vector &dbox) { +void 
DeepTensorTF::compute(std::vector& dglobal_tensor_, + std::vector& dforce_, + std::vector& dvirial_, + std::vector& datom_tensor_, + std::vector& datom_virial_, + const std::vector& dcoord_, + const std::vector& datype_, + const std::vector& dbox) { int nall = datype_.size(); std::vector dcoord, dforce, datom_virial, aparam, aparam_; std::vector datype, fwd_map, bkw_map; @@ -434,35 +434,35 @@ void DeepTensorTF::compute(std::vector &dglobal_tensor_, } template void DeepTensorTF::compute( - std::vector &dglobal_tensor_, - std::vector &dforce_, - std::vector &dvirial_, - std::vector &datom_tensor_, - std::vector &datom_virial_, - const std::vector &dcoord_, - const std::vector &datype_, - const std::vector &dbox); - -template void DeepTensorTF::compute(std::vector &dglobal_tensor_, - std::vector &dforce_, - std::vector &dvirial_, - std::vector &datom_tensor_, - std::vector &datom_virial_, - const std::vector &dcoord_, - const std::vector &datype_, - const std::vector &dbox); + std::vector& dglobal_tensor_, + std::vector& dforce_, + std::vector& dvirial_, + std::vector& datom_tensor_, + std::vector& datom_virial_, + const std::vector& dcoord_, + const std::vector& datype_, + const std::vector& dbox); + +template void DeepTensorTF::compute(std::vector& dglobal_tensor_, + std::vector& dforce_, + std::vector& dvirial_, + std::vector& datom_tensor_, + std::vector& datom_virial_, + const std::vector& dcoord_, + const std::vector& datype_, + const std::vector& dbox); template -void DeepTensorTF::compute(std::vector &dglobal_tensor_, - std::vector &dforce_, - std::vector &dvirial_, - std::vector &datom_tensor_, - std::vector &datom_virial_, - const std::vector &dcoord_, - const std::vector &datype_, - const std::vector &dbox, +void DeepTensorTF::compute(std::vector& dglobal_tensor_, + std::vector& dforce_, + std::vector& dvirial_, + std::vector& datom_tensor_, + std::vector& datom_virial_, + const std::vector& dcoord_, + const std::vector& datype_, + const std::vector& dbox, 
const int nghost, - const InputNlist &lmp_list) { + const InputNlist& lmp_list) { int nall = datype_.size(); std::vector dcoord, dforce, datom_virial, aparam, aparam_; std::vector datype, fwd_map, bkw_map; @@ -493,33 +493,33 @@ void DeepTensorTF::compute(std::vector &dglobal_tensor_, } template void DeepTensorTF::compute( - std::vector &dglobal_tensor_, - std::vector &dforce_, - std::vector &dvirial_, - std::vector &datom_tensor_, - std::vector &datom_virial_, - const std::vector &dcoord_, - const std::vector &datype_, - const std::vector &dbox, + std::vector& dglobal_tensor_, + std::vector& dforce_, + std::vector& dvirial_, + std::vector& datom_tensor_, + std::vector& datom_virial_, + const std::vector& dcoord_, + const std::vector& datype_, + const std::vector& dbox, const int nghost, - const InputNlist &lmp_list); - -template void DeepTensorTF::compute(std::vector &dglobal_tensor_, - std::vector &dforce_, - std::vector &dvirial_, - std::vector &datom_tensor_, - std::vector &datom_virial_, - const std::vector &dcoord_, - const std::vector &datype_, - const std::vector &dbox, + const InputNlist& lmp_list); + +template void DeepTensorTF::compute(std::vector& dglobal_tensor_, + std::vector& dforce_, + std::vector& dvirial_, + std::vector& datom_tensor_, + std::vector& datom_virial_, + const std::vector& dcoord_, + const std::vector& datype_, + const std::vector& dbox, const int nghost, - const InputNlist &lmp_list); + const InputNlist& lmp_list); template -void DeepTensorTF::compute_inner(std::vector &dtensor_, - const std::vector &dcoord_, - const std::vector &datype_, - const std::vector &dbox) { +void DeepTensorTF::compute_inner(std::vector& dtensor_, + const std::vector& dcoord_, + const std::vector& datype_, + const std::vector& dbox) { int nall = dcoord_.size() / 3; int nloc = nall; AtomMap atommap(datype_.begin(), datype_.begin() + nloc); @@ -550,24 +550,24 @@ void DeepTensorTF::compute_inner(std::vector &dtensor_, } template void DeepTensorTF::compute_inner( 
- std::vector &dtensor_, - const std::vector &dcoord_, - const std::vector &datype_, - const std::vector &dbox); + std::vector& dtensor_, + const std::vector& dcoord_, + const std::vector& datype_, + const std::vector& dbox); template void DeepTensorTF::compute_inner( - std::vector &dtensor_, - const std::vector &dcoord_, - const std::vector &datype_, - const std::vector &dbox); + std::vector& dtensor_, + const std::vector& dcoord_, + const std::vector& datype_, + const std::vector& dbox); template -void DeepTensorTF::compute_inner(std::vector &dtensor_, - const std::vector &dcoord_, - const std::vector &datype_, - const std::vector &dbox, +void DeepTensorTF::compute_inner(std::vector& dtensor_, + const std::vector& dcoord_, + const std::vector& datype_, + const std::vector& dbox, const int nghost, - const InputNlist &nlist_) { + const InputNlist& nlist_) { int nall = dcoord_.size() / 3; int nloc = nall - nghost; AtomMap atommap(datype_.begin(), datype_.begin() + nloc); @@ -608,30 +608,30 @@ void DeepTensorTF::compute_inner(std::vector &dtensor_, } template void DeepTensorTF::compute_inner( - std::vector &dtensor_, - const std::vector &dcoord_, - const std::vector &datype_, - const std::vector &dbox, + std::vector& dtensor_, + const std::vector& dcoord_, + const std::vector& datype_, + const std::vector& dbox, const int nghost, - const InputNlist &nlist_); + const InputNlist& nlist_); template void DeepTensorTF::compute_inner( - std::vector &dtensor_, - const std::vector &dcoord_, - const std::vector &datype_, - const std::vector &dbox, + std::vector& dtensor_, + const std::vector& dcoord_, + const std::vector& datype_, + const std::vector& dbox, const int nghost, - const InputNlist &nlist_); + const InputNlist& nlist_); template -void DeepTensorTF::compute_inner(std::vector &dglobal_tensor_, - std::vector &dforce_, - std::vector &dvirial_, - std::vector &datom_tensor_, - std::vector &datom_virial_, - const std::vector &dcoord_, - const std::vector &datype_, - 
const std::vector &dbox) { +void DeepTensorTF::compute_inner(std::vector& dglobal_tensor_, + std::vector& dforce_, + std::vector& dvirial_, + std::vector& datom_tensor_, + std::vector& datom_virial_, + const std::vector& dcoord_, + const std::vector& datype_, + const std::vector& dbox) { int nall = dcoord_.size() / 3; int nloc = nall; AtomMap atommap(datype_.begin(), datype_.begin() + nloc); @@ -664,36 +664,36 @@ void DeepTensorTF::compute_inner(std::vector &dglobal_tensor_, } template void DeepTensorTF::compute_inner( - std::vector &dglobal_tensor_, - std::vector &dforce_, - std::vector &dvirial_, - std::vector &datom_tensor_, - std::vector &datom_virial_, - const std::vector &dcoord_, - const std::vector &datype_, - const std::vector &dbox); + std::vector& dglobal_tensor_, + std::vector& dforce_, + std::vector& dvirial_, + std::vector& datom_tensor_, + std::vector& datom_virial_, + const std::vector& dcoord_, + const std::vector& datype_, + const std::vector& dbox); template void DeepTensorTF::compute_inner( - std::vector &dglobal_tensor_, - std::vector &dforce_, - std::vector &dvirial_, - std::vector &datom_tensor_, - std::vector &datom_virial_, - const std::vector &dcoord_, - const std::vector &datype_, - const std::vector &dbox); + std::vector& dglobal_tensor_, + std::vector& dforce_, + std::vector& dvirial_, + std::vector& datom_tensor_, + std::vector& datom_virial_, + const std::vector& dcoord_, + const std::vector& datype_, + const std::vector& dbox); template -void DeepTensorTF::compute_inner(std::vector &dglobal_tensor_, - std::vector &dforce_, - std::vector &dvirial_, - std::vector &datom_tensor_, - std::vector &datom_virial_, - const std::vector &dcoord_, - const std::vector &datype_, - const std::vector &dbox, +void DeepTensorTF::compute_inner(std::vector& dglobal_tensor_, + std::vector& dforce_, + std::vector& dvirial_, + std::vector& datom_tensor_, + std::vector& datom_virial_, + const std::vector& dcoord_, + const std::vector& datype_, + const 
std::vector& dbox, const int nghost, - const InputNlist &nlist_) { + const InputNlist& nlist_) { int nall = dcoord_.size() / 3; int nloc = nall - nghost; AtomMap atommap(datype_.begin(), datype_.begin() + nloc); @@ -736,41 +736,41 @@ void DeepTensorTF::compute_inner(std::vector &dglobal_tensor_, } template void DeepTensorTF::compute_inner( - std::vector &dglobal_tensor_, - std::vector &dforce_, - std::vector &dvirial_, - std::vector &datom_tensor_, - std::vector &datom_virial_, - const std::vector &dcoord_, - const std::vector &datype_, - const std::vector &dbox, + std::vector& dglobal_tensor_, + std::vector& dforce_, + std::vector& dvirial_, + std::vector& datom_tensor_, + std::vector& datom_virial_, + const std::vector& dcoord_, + const std::vector& datype_, + const std::vector& dbox, const int nghost, - const InputNlist &nlist_); + const InputNlist& nlist_); template void DeepTensorTF::compute_inner( - std::vector &dglobal_tensor_, - std::vector &dforce_, - std::vector &dvirial_, - std::vector &datom_tensor_, - std::vector &datom_virial_, - const std::vector &dcoord_, - const std::vector &datype_, - const std::vector &dbox, + std::vector& dglobal_tensor_, + std::vector& dforce_, + std::vector& dvirial_, + std::vector& datom_tensor_, + std::vector& datom_virial_, + const std::vector& dcoord_, + const std::vector& datype_, + const std::vector& dbox, const int nghost, - const InputNlist &nlist_); + const InputNlist& nlist_); -void DeepTensorTF::get_type_map(std::string &type_map) { +void DeepTensorTF::get_type_map(std::string& type_map) { type_map = get_scalar("model_attr/tmap"); } -void DeepTensorTF::computew(std::vector &global_tensor, - std::vector &force, - std::vector &virial, - std::vector &atom_tensor, - std::vector &atom_virial, - const std::vector &coord, - const std::vector &atype, - const std::vector &box, +void DeepTensorTF::computew(std::vector& global_tensor, + std::vector& force, + std::vector& virial, + std::vector& atom_tensor, + std::vector& 
atom_virial, + const std::vector& coord, + const std::vector& atype, + const std::vector& box, const bool request_deriv) { if (request_deriv) { compute(global_tensor, force, virial, atom_tensor, atom_virial, coord, @@ -783,14 +783,14 @@ void DeepTensorTF::computew(std::vector &global_tensor, atom_virial.clear(); } } -void DeepTensorTF::computew(std::vector &global_tensor, - std::vector &force, - std::vector &virial, - std::vector &atom_tensor, - std::vector &atom_virial, - const std::vector &coord, - const std::vector &atype, - const std::vector &box, +void DeepTensorTF::computew(std::vector& global_tensor, + std::vector& force, + std::vector& virial, + std::vector& atom_tensor, + std::vector& atom_virial, + const std::vector& coord, + const std::vector& atype, + const std::vector& box, const bool request_deriv) { if (request_deriv) { compute(global_tensor, force, virial, atom_tensor, atom_virial, coord, @@ -804,16 +804,16 @@ void DeepTensorTF::computew(std::vector &global_tensor, } } -void DeepTensorTF::computew(std::vector &global_tensor, - std::vector &force, - std::vector &virial, - std::vector &atom_tensor, - std::vector &atom_virial, - const std::vector &coord, - const std::vector &atype, - const std::vector &box, +void DeepTensorTF::computew(std::vector& global_tensor, + std::vector& force, + std::vector& virial, + std::vector& atom_tensor, + std::vector& atom_virial, + const std::vector& coord, + const std::vector& atype, + const std::vector& box, const int nghost, - const InputNlist &inlist, + const InputNlist& inlist, const bool request_deriv) { if (request_deriv) { compute(global_tensor, force, virial, atom_tensor, atom_virial, coord, @@ -826,16 +826,16 @@ void DeepTensorTF::computew(std::vector &global_tensor, atom_virial.clear(); } } -void DeepTensorTF::computew(std::vector &global_tensor, - std::vector &force, - std::vector &virial, - std::vector &atom_tensor, - std::vector &atom_virial, - const std::vector &coord, - const std::vector &atype, - const 
std::vector &box, +void DeepTensorTF::computew(std::vector& global_tensor, + std::vector& force, + std::vector& virial, + std::vector& atom_tensor, + std::vector& atom_virial, + const std::vector& coord, + const std::vector& atype, + const std::vector& box, const int nghost, - const InputNlist &inlist, + const InputNlist& inlist, const bool request_deriv) { if (request_deriv) { compute(global_tensor, force, virial, atom_tensor, atom_virial, coord, diff --git a/source/api_cc/src/common.cc b/source/api_cc/src/common.cc index 70755c901a..eace577f89 100644 --- a/source/api_cc/src/common.cc +++ b/source/api_cc/src/common.cc @@ -415,6 +415,9 @@ void deepmd::load_op_library() { #endif #ifdef BUILD_PYTORCH _load_single_op_library("deepmd_op_pt"); +#endif +#ifdef BUILD_PADDLE + _load_single_op_library("deepmd_op_pd"); #endif // load customized plugins const char* env_customized_plugins = std::getenv("DP_PLUGIN_PATH"); @@ -1419,7 +1422,9 @@ deepmd::DPBackend deepmd::get_backend(const std::string& model) { model.substr(model.length() - 11) == ".savedmodel") { return deepmd::DPBackend::JAX; } else if ((model.length() >= 5 && - model.substr(model.length() - 5) == ".json")) { + model.substr(model.length() - 5) == ".json") || + (model.length() >= 8 && + model.substr(model.length() - 8) == ".pdmodel")) { return deepmd::DPBackend::Paddle; } throw deepmd::deepmd_exception("Unsupported model file format"); diff --git a/source/api_cc/tests/test_deepdipole_pt.cc b/source/api_cc/tests/test_deepdipole_pt.cc new file mode 100644 index 0000000000..70e46dd9e9 --- /dev/null +++ b/source/api_cc/tests/test_deepdipole_pt.cc @@ -0,0 +1,173 @@ +// SPDX-License-Identifier: LGPL-3.0-or-later +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "DeepTensor.h" +#include "neighbor_list.h" +#include "test_utils.h" + +template +class TestInferDeepTensorPt : public ::testing::Test { + protected: + std::vector coord = {12.83, 2.56, 2.18, 12.09, 2.87, 
2.74, + 00.25, 3.32, 1.68, 3.36, 3.00, 1.81, + 3.51, 2.51, 2.60, 4.27, 3.22, 1.56}; + std::vector atype = {0, 1, 1, 0, 1, 1}; + std::vector box = {13., 0., 0., 0., 13., 0., 0., 0., 13.}; + + // Expected global tensor values from Python inference + std::vector expected_global_tensor = {0.2338104, 0.23701073, + 0.2334505}; + + // Expected atomic tensor values from Python inference (flattened) + std::vector expected_atom_tensor = {-0.1808925408386811, + 0.3190798607195795, + 0.04760079958216837, + -0.0, + -0.0, + 0.0, + 0.0, + 0.0, + -0.0, + 0.4147029447879755, + -0.08206913353381971, + 0.1858497008385067, + 0.0, + -0.0, + 0.0, + 0.0, + 0.0, + -0.0}; + + int natoms = 6; + int output_dim = 3; + + deepmd::DeepTensor dt; + + void SetUp() override { + std::string file_name = "../../tests/infer/deepdipole_pt.pth"; + dt.init(file_name); + }; + + void TearDown() override {}; +}; + +TYPED_TEST_SUITE(TestInferDeepTensorPt, ValueTypes); + +TYPED_TEST(TestInferDeepTensorPt, cpu_build_nlist) { + using VALUETYPE = TypeParam; + std::vector& coord = this->coord; + std::vector& atype = this->atype; + std::vector& box = this->box; + std::vector& expected_global_tensor = this->expected_global_tensor; + std::vector& expected_atom_tensor = this->expected_atom_tensor; + int& natoms = this->natoms; + int& output_dim = this->output_dim; + deepmd::DeepTensor& dt = this->dt; + // Use reasonable tolerance for minimal trained model + double tensor_tol = 1e-6; + + std::vector global_tensor, force, virial, atom_tensor, atom_virial; + + dt.compute(global_tensor, force, virial, atom_tensor, atom_virial, coord, + atype, box); + + EXPECT_EQ(global_tensor.size(), output_dim); + EXPECT_EQ(atom_tensor.size(), natoms * output_dim); + EXPECT_EQ(force.size(), natoms * output_dim * 3); + EXPECT_EQ(virial.size(), output_dim * 9); + EXPECT_EQ(atom_virial.size(), natoms * output_dim * 9); + + for (int ii = 0; ii < output_dim; ++ii) { + EXPECT_LT(fabs(global_tensor[ii] - expected_global_tensor[ii]), 
tensor_tol); + } + + for (int ii = 0; ii < natoms * output_dim; ++ii) { + EXPECT_LT(fabs(atom_tensor[ii] - expected_atom_tensor[ii]), tensor_tol); + } +} + +TYPED_TEST(TestInferDeepTensorPt, cpu_lmp_nlist) { + using VALUETYPE = TypeParam; + std::vector& coord = this->coord; + std::vector& atype = this->atype; + std::vector& box = this->box; + std::vector& expected_global_tensor = this->expected_global_tensor; + std::vector& expected_atom_tensor = this->expected_atom_tensor; + int& natoms = this->natoms; + int& output_dim = this->output_dim; + deepmd::DeepTensor& dt = this->dt; + double ener_tol = 1e-6; + + float rc = dt.cutoff(); + int nloc = coord.size() / 3; + std::vector coord_cpy; + std::vector atype_cpy, mapping; + std::vector > nlist_data; + _build_nlist(nlist_data, coord_cpy, atype_cpy, mapping, coord, + atype, box, rc); + int nall = coord_cpy.size() / 3; + std::vector ilist(nloc), numneigh(nloc); + std::vector firstneigh(nloc); + deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]); + convert_nlist(inlist, nlist_data); + + std::vector global_tensor, force, virial, atom_tensor, atom_virial; + + dt.compute(global_tensor, force, virial, atom_tensor, atom_virial, coord_cpy, + atype_cpy, box, nall - nloc, inlist); + + EXPECT_EQ(global_tensor.size(), output_dim); + EXPECT_EQ(atom_tensor.size(), nall * output_dim); + + for (int ii = 0; ii < output_dim; ++ii) { + EXPECT_LT(fabs(global_tensor[ii] - expected_global_tensor[ii]), ener_tol); + } + + for (int ii = 0; ii < natoms * output_dim; ++ii) { + EXPECT_LT(fabs(atom_tensor[ii] - expected_atom_tensor[ii]), ener_tol); + } +} + +TYPED_TEST(TestInferDeepTensorPt, print_summary) { + deepmd::DeepTensor& dt = this->dt; + dt.print_summary(""); +} + +TYPED_TEST(TestInferDeepTensorPt, get_type_map) { + deepmd::DeepTensor& dt = this->dt; + std::string type_map_str; + dt.get_type_map(type_map_str); + // Parse the type map string manually + std::vector type_map; + std::istringstream iss(type_map_str); + 
std::string token; + while (iss >> token) { + type_map.push_back(token); + } + EXPECT_EQ(type_map.size(), 2); + EXPECT_EQ(type_map[0], "O"); + EXPECT_EQ(type_map[1], "H"); +} + +TYPED_TEST(TestInferDeepTensorPt, get_properties) { + deepmd::DeepTensor& dt = this->dt; + + EXPECT_EQ(dt.numb_types(), 2); + EXPECT_EQ(dt.output_dim(), 3); + EXPECT_DOUBLE_EQ(dt.cutoff(), 4.0); + + std::vector sel_types = dt.sel_types(); + EXPECT_EQ(sel_types.size(), 2); // PyTorch models always return all types + EXPECT_EQ(sel_types[0], 0); // Type 0 (O) + EXPECT_EQ(sel_types[1], + 1); // Type 1 (H) - included but may have zero results +} diff --git a/source/api_cc/tests/test_deepmd_exception.cc b/source/api_cc/tests/test_deepmd_exception.cc index 77e399d722..c28c0f0069 100644 --- a/source/api_cc/tests/test_deepmd_exception.cc +++ b/source/api_cc/tests/test_deepmd_exception.cc @@ -18,7 +18,7 @@ TEST(TestDeepmdException, deepmdexception) { std::string expected_error_message = "DeePMD-kit Error: unittest"; try { throw deepmd::deepmd_exception("unittest"); - } catch (deepmd::deepmd_exception &ex) { + } catch (deepmd::deepmd_exception& ex) { EXPECT_STREQ(expected_error_message.c_str(), ex.what()); } } diff --git a/source/api_cc/tests/test_utils.h b/source/api_cc/tests/test_utils.h index d06823b4e0..64d8a37ef5 100644 --- a/source/api_cc/tests/test_utils.h +++ b/source/api_cc/tests/test_utils.h @@ -14,7 +14,7 @@ typedef testing::Types ValueTypes; template inline void _fold_back(typename std::vector::iterator out, const typename std::vector::const_iterator in, - const std::vector &mapping, + const std::vector& mapping, const int nloc, const int nall, const int ndim, @@ -35,9 +35,9 @@ inline void _fold_back(typename std::vector::iterator out, } template -inline void _fold_back(std::vector &out, - const std::vector &in, - const std::vector &mapping, +inline void _fold_back(std::vector& out, + const std::vector& in, + const std::vector& mapping, const int nloc, const int nall, const int ndim, @@ 
-48,14 +48,14 @@ inline void _fold_back(std::vector &out, } template -inline void _build_nlist(std::vector> &nlist_data, - std::vector &coord_cpy, - std::vector &atype_cpy, - std::vector &mapping, - const std::vector &coord, - const std::vector &atype, - const std::vector &box, - const float &rc) { +inline void _build_nlist(std::vector>& nlist_data, + std::vector& coord_cpy, + std::vector& atype_cpy, + std::vector& mapping, + const std::vector& coord, + const std::vector& atype, + const std::vector& box, + const float& rc) { // convert VALUETYPE to double, it looks like copy_coord only accepts double std::vector coord_cpy_; std::vector coord_(coord.begin(), coord.end()); @@ -90,13 +90,13 @@ class EnergyModelTest { double level = std::is_same::value ? 1e-6 : 1e-2; // expected? public: - virtual void compute(double &ener, - std::vector &force, - std::vector &virial, - const std::vector &coord, - const std::vector &box) = 0; - void test_f(const std::vector &coord, - const std::vector &box) { + virtual void compute(double& ener, + std::vector& force, + std::vector& virial, + const std::vector& coord, + const std::vector& box) = 0; + void test_f(const std::vector& coord, + const std::vector& box) { int ndof = coord.size(); double ener; std::vector force, virial; @@ -114,8 +114,8 @@ class EnergyModelTest { EXPECT_LT(fabs(num - ana), level); } } - void test_v(const std::vector &coord, - const std::vector &box) { + void test_v(const std::vector& coord, + const std::vector& box) { std::vector num_diff(9); double ener; std::vector force, virial; diff --git a/source/cmake/Findtensorflow.cmake b/source/cmake/Findtensorflow.cmake index d579af7679..b5b8c92f3d 100644 --- a/source/cmake/Findtensorflow.cmake +++ b/source/cmake/Findtensorflow.cmake @@ -291,8 +291,10 @@ if(NOT DEFINED TENSORFLOW_VERSION) TENSORFLOW_VERSION_RUN_RESULT_VAR TENSORFLOW_VERSION_COMPILE_RESULT_VAR ${CMAKE_CURRENT_BINARY_DIR}/tf_version "${CMAKE_CURRENT_LIST_DIR}/tf_version.cpp" - CMAKE_FLAGS 
"-DINCLUDE_DIRECTORIES:STRING=${TensorFlow_INCLUDE_DIRS}" - RUN_OUTPUT_VARIABLE TENSORFLOW_VERSION + CMAKE_FLAGS + "-DINCLUDE_DIRECTORIES:STRING=${TensorFlow_INCLUDE_DIRS}" LINK_LIBRARIES + ${TensorFlowFramework_LIBRARY} ${TensorFlow_LIBRARY} + RUN_OUTPUT_STDOUT_VARIABLE TENSORFLOW_VERSION COMPILE_OUTPUT_VARIABLE TENSORFLOW_VERSION_COMPILE_OUTPUT_VAR) if(NOT ${TENSORFLOW_VERSION_COMPILE_RESULT_VAR}) message( @@ -304,6 +306,23 @@ if(NOT DEFINED TENSORFLOW_VERSION) endif() endif() +if(TENSORFLOW_VERSION VERSION_GREATER_EQUAL 2.20) + # since TF 2.20, macros like TF_MAJOR_VERSION, TF_MINOR_VERSION, and + # TF_PATCH_VERSION are not defined We manuanlly define them in our CMake files + # first, split TENSORFLOW_VERSION (e.g. 2.20.0rc0) to 2 20 0 rc0 + string(REGEX MATCH "^([0-9]+)\\.([0-9]+)\\.([0-9]+)(.*)$" _match + ${TENSORFLOW_VERSION}) + if(_match) + set(TF_MAJOR_VERSION ${CMAKE_MATCH_1}) + set(TF_MINOR_VERSION ${CMAKE_MATCH_2}) + set(TF_PATCH_VERSION ${CMAKE_MATCH_3}) + # add defines + add_definitions(-DTF_MAJOR_VERSION=${TF_MAJOR_VERSION}) + add_definitions(-DTF_MINOR_VERSION=${TF_MINOR_VERSION}) + add_definitions(-DTF_PATCH_VERSION=${TF_PATCH_VERSION}) + endif() +endif() + # print message if(NOT TensorFlow_FIND_QUIETLY) message( diff --git a/source/cmake/tf_version.cpp b/source/cmake/tf_version.cpp index 6d09e33493..2ad2125291 100644 --- a/source/cmake/tf_version.cpp +++ b/source/cmake/tf_version.cpp @@ -1,12 +1,14 @@ // SPDX-License-Identifier: LGPL-3.0-or-later #include -#include "tensorflow/core/public/version.h" +#include "tensorflow/c/c_api.h" int main(int argc, char* argv[]) { // See // https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/public/version.h // TF_VERSION_STRING has been available since TensorFlow v0.6 - std::cout << TF_VERSION_STRING; + // Aug 2025: since TF 2.20, TF_VERSION_STRING is no more available; + // try to use the C API TF_Version + std::cout << TF_Version(); return 0; } diff --git a/source/config/run_config.ini 
b/source/config/run_config.ini index 7bb6041af9..596be911db 100644 --- a/source/config/run_config.ini +++ b/source/config/run_config.ini @@ -18,3 +18,5 @@ PD_VERSION = @PADDLE_VERSION@ PD_INFERENCE_DIR = @PADDLE_INFERENCE_DIR@ MODEL_VERSION=@MODEL_VERSION@ DP_VARIANT=@DP_VARIANT@ +LAMMPS_VERSION = @LAMMPS_VERSION@ +CIBUILDWHEEL = @CIBUILDWHEEL@ diff --git a/source/install/build_cc.sh b/source/install/build_cc.sh index dc66343cb2..7f21b83eee 100755 --- a/source/install/build_cc.sh +++ b/source/install/build_cc.sh @@ -26,7 +26,7 @@ cmake -D ENABLE_TENSORFLOW=ON \ -D USE_TF_PYTHON_LIBS=TRUE \ -D USE_PT_PYTHON_LIBS=TRUE \ ${CUDA_ARGS} \ - -D LAMMPS_VERSION=stable_29Aug2024_update1 \ + -D LAMMPS_VERSION=stable_22Jul2025_update1 \ .. cmake --build . -j${NPROC} cmake --install . diff --git a/source/install/build_from_c.sh b/source/install/build_from_c.sh index 22739ec531..7c73b8543b 100755 --- a/source/install/build_from_c.sh +++ b/source/install/build_from_c.sh @@ -13,7 +13,7 @@ NPROC=$(nproc --all) BUILD_TMP_DIR=${SCRIPT_PATH}/../build mkdir -p ${BUILD_TMP_DIR} cd ${BUILD_TMP_DIR} -cmake -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} -DDEEPMD_C_ROOT=${DEEPMD_C_ROOT} -DLAMMPS_VERSION=stable_29Aug2024_update1 .. +cmake -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} -DDEEPMD_C_ROOT=${DEEPMD_C_ROOT} -DLAMMPS_VERSION=stable_22Jul2025_update1 .. cmake --build . -j${NPROC} cmake --install . cmake --build . --target=lammps diff --git a/source/install/build_lammps.sh b/source/install/build_lammps.sh index d101714739..57af2f261a 100755 --- a/source/install/build_lammps.sh +++ b/source/install/build_lammps.sh @@ -14,7 +14,7 @@ BUILD_TMP_DIR=${SCRIPT_PATH}/../build_lammps mkdir -p ${BUILD_TMP_DIR} cd ${BUILD_TMP_DIR} # download LAMMMPS -LAMMPS_VERSION=stable_29Aug2024_update1 +LAMMPS_VERSION=stable_22Jul2025_update1 if [ ! 
-d "lammps-${LAMMPS_VERSION}" ]; then curl -L -o lammps.tar.gz https://github.com/lammps/lammps/archive/refs/tags/${LAMMPS_VERSION}.tar.gz tar vxzf lammps.tar.gz diff --git a/source/install/docker/Dockerfile b/source/install/docker/Dockerfile index 1e0f46eec6..a8cf698102 100644 --- a/source/install/docker/Dockerfile +++ b/source/install/docker/Dockerfile @@ -8,7 +8,7 @@ ENV PATH="/opt/deepmd-kit/bin:$PATH" ENV VIRTUAL_ENV="/opt/deepmd-kit" # Install package COPY dist /dist -RUN if [ "${CUDA_VERSION}" = 11 ]; then uv pip install torch --index-url https://download.pytorch.org/whl/cu118; fi \ +RUN if [ "${CUDA_VERSION}" = 11 ]; then export UV_TORCH_BACKEND=cu118; fi \ && uv pip install "$(ls /dist/deepmd_kit${VARIANT}-*manylinux*_x86_64.whl)[gpu,cu${CUDA_VERSION},lmp,ipi,torch]" \ && dp -h \ && lmp -h \ diff --git a/source/install/test_cc.sh b/source/install/test_cc.sh index 1626f36193..f45b936d3e 100755 --- a/source/install/test_cc.sh +++ b/source/install/test_cc.sh @@ -17,7 +17,7 @@ INSTALL_PREFIX=${SCRIPT_PATH}/../../dp_test BUILD_TMP_DIR=${SCRIPT_PATH}/../build_tests mkdir -p ${BUILD_TMP_DIR} cd ${BUILD_TMP_DIR} -cmake -DINSTALL_TENSORFLOW=TRUE -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} -DTENSORFLOW_ROOT=${INSTALL_PREFIX} -DBUILD_TESTING:BOOL=TRUE -DLAMMPS_VERSION=stable_29Aug2024_update1 ${CUDA_ARGS} .. +cmake -DINSTALL_TENSORFLOW=TRUE -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} -DTENSORFLOW_ROOT=${INSTALL_PREFIX} -DBUILD_TESTING:BOOL=TRUE -DLAMMPS_VERSION=stable_22Jul2025_update1 ${CUDA_ARGS} .. cmake --build . -j${NPROC} cmake --install . 
ctest --output-on-failure diff --git a/source/install/test_cc_local.sh b/source/install/test_cc_local.sh index 8152b6f1a4..c34c27fa64 100755 --- a/source/install/test_cc_local.sh +++ b/source/install/test_cc_local.sh @@ -28,7 +28,7 @@ cmake \ -D USE_PT_PYTHON_LIBS=TRUE \ -D CMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ -D BUILD_TESTING:BOOL=TRUE \ - -D LAMMPS_VERSION=stable_29Aug2024_update1 \ + -D LAMMPS_VERSION=stable_22Jul2025_update1 \ ${CUDA_ARGS} .. cmake --build . -j${NPROC} cmake --install . diff --git a/source/ipi/driver.cc b/source/ipi/driver.cc index 9a91a27ad3..879e19c46f 100644 --- a/source/ipi/driver.cc +++ b/source/ipi/driver.cc @@ -29,8 +29,8 @@ const double icvt_ener = 1. / cvt_ener; const double cvt_f = cvt_ener / cvt_len; const double icvt_f = 1. / cvt_f; -char *trimwhitespace(char *str) { - char *end; +char* trimwhitespace(char* str) { + char* end; // Trim leading space while (isspace((unsigned char)*str)) { str++; @@ -48,7 +48,7 @@ char *trimwhitespace(char *str) { return str; } -int main(int argc, char *argv[]) { +int main(int argc, char* argv[]) { if (argc == 1) { std::cerr << "usage " << std::endl; std::cerr << argv[0] << " input_script " << std::endl; @@ -68,7 +68,7 @@ int main(int argc, char *argv[]) { } int port = jdata["port"]; std::string host_str = jdata["host"]; - const char *host = host_str.c_str(); + const char* host = host_str.c_str(); std::string graph_file = jdata["graph_file"]; std::string coord_file = jdata["coord_file"]; std::map name_type_map = jdata["atom_type"]; @@ -102,7 +102,7 @@ int main(int argc, char *argv[]) { std::vector dcoord_tmp; std::vector dtype = cvt.get_type(); std::vector dbox(9, 0); - double *msg_buff = NULL; + double* msg_buff = NULL; double ener; double virial[9]; char msg_needinit[] = "NEEDINIT "; @@ -144,7 +144,7 @@ int main(int argc, char *argv[]) { } } else if (header_str == "INIT") { assert(4 == sizeof(int32_t)); - readbuffer_(&socket, (char *)(&cbuf), sizeof(int32_t)); + readbuffer_(&socket, 
(char*)(&cbuf), sizeof(int32_t)); readbuffer_(&socket, initbuffer, cbuf); if (b_verb) { std::cout << "Init sys from wrapper, using " << initbuffer << std::endl; @@ -153,14 +153,14 @@ int main(int argc, char *argv[]) { assert(8 == sizeof(double)); // get box - readbuffer_(&socket, (char *)(cell_h), 9 * sizeof(double)); - readbuffer_(&socket, (char *)(cell_ih), 9 * sizeof(double)); + readbuffer_(&socket, (char*)(cell_h), 9 * sizeof(double)); + readbuffer_(&socket, (char*)(cell_ih), 9 * sizeof(double)); for (int dd = 0; dd < 9; ++dd) { dbox[dd] = cell_h[(dd % 3) * 3 + (dd / 3)] * cvt_len; } // get number of atoms - readbuffer_(&socket, (char *)(&cbuf), sizeof(int32_t)); + readbuffer_(&socket, (char*)(&cbuf), sizeof(int32_t)); if (natoms < 0) { natoms = cbuf; if (b_verb) { @@ -176,7 +176,7 @@ int main(int argc, char *argv[]) { } // get coord - readbuffer_(&socket, (char *)(msg_buff), natoms * 3 * sizeof(double)); + readbuffer_(&socket, (char*)(msg_buff), natoms * 3 * sizeof(double)); for (int ii = 0; ii < natoms * 3; ++ii) { dcoord_tmp[ii] = msg_buff[ii] * cvt_len; } @@ -199,12 +199,12 @@ int main(int argc, char *argv[]) { << std::setprecision(10) << dener << std::endl; } writebuffer_(&socket, msg_forceready, MSGLEN); - writebuffer_(&socket, (char *)(&ener), sizeof(double)); - writebuffer_(&socket, (char *)(&natoms), sizeof(int32_t)); - writebuffer_(&socket, (char *)(msg_buff), 3 * natoms * sizeof(double)); - writebuffer_(&socket, (char *)(virial), 9 * sizeof(double)); + writebuffer_(&socket, (char*)(&ener), sizeof(double)); + writebuffer_(&socket, (char*)(&natoms), sizeof(int32_t)); + writebuffer_(&socket, (char*)(msg_buff), 3 * natoms * sizeof(double)); + writebuffer_(&socket, (char*)(virial), 9 * sizeof(double)); cbuf = 7; - writebuffer_(&socket, (char *)(&cbuf), sizeof(int32_t)); + writebuffer_(&socket, (char*)(&cbuf), sizeof(int32_t)); writebuffer_(&socket, msg_nothing, 7); hasdata = false; } else { diff --git a/source/ipi/include/sockets.h 
b/source/ipi/include/sockets.h index 08f24c68ed..150b7c1a69 100644 --- a/source/ipi/include/sockets.h +++ b/source/ipi/include/sockets.h @@ -15,7 +15,7 @@ extern "C" { #endif -void error(const char *msg); +void error(const char* msg); /* Opens a socket. Note that fortran passes an extra argument for the string length, but this is @@ -29,7 +29,7 @@ void error(const char *msg); recommended. host: The name of the host server. */ -void open_socket_(int *psockfd, int *inet, int *port, const char *host); +void open_socket_(int* psockfd, int* inet, int* port, const char* host); /* Writes to a socket. Args: @@ -37,7 +37,7 @@ void open_socket_(int *psockfd, int *inet, int *port, const char *host); data: The data to be written to the socket. plen: The length of the data in bytes. */ -void writebuffer_(int *psockfd, char *data, int len); +void writebuffer_(int* psockfd, char* data, int len); /* Reads from a socket. Args: @@ -45,7 +45,7 @@ void writebuffer_(int *psockfd, char *data, int len); data: The storage array for data read from the socket. plen: The length of the data in bytes. */ -void readbuffer_(int *psockfd, char *data, int len); +void readbuffer_(int* psockfd, char* data, int len); #ifdef __cplusplus } diff --git a/source/ipi/src/sockets.c b/source/ipi/src/sockets.c index d9a2b8a865..1d45849f1a 100644 --- a/source/ipi/src/sockets.c +++ b/source/ipi/src/sockets.c @@ -45,14 +45,14 @@ Can be linked to a FORTRAN code that does not support sockets natively. #include #include -void error(const char *msg) +void error(const char* msg) // Prints an error message and then exits. { perror(msg); exit(-1); } -void open_socket_(int *psockfd, int *inet, int *port, const char *host) +void open_socket_(int* psockfd, int* inet, int* port, const char* host) /* Opens a socket. Note that fortran passes an extra argument for the string length, but this is @@ -70,14 +70,14 @@ ignored here for C compatibility. 
{ int sockfd, portno, n; - struct hostent *server; + struct hostent* server; - struct sockaddr *psock; + struct sockaddr* psock; int ssock; if (*inet > 0) { // creates an internet socket struct sockaddr_in serv_addr; - psock = (struct sockaddr *)&serv_addr; + psock = (struct sockaddr*)&serv_addr; ssock = sizeof(serv_addr); sockfd = socket(AF_INET, SOCK_STREAM, 0); if (sockfd < 0) { @@ -90,9 +90,9 @@ ignored here for C compatibility. exit(-1); } - bzero((char *)&serv_addr, sizeof(serv_addr)); + bzero((char*)&serv_addr, sizeof(serv_addr)); serv_addr.sin_family = AF_INET; - bcopy((char *)server->h_addr, (char *)&serv_addr.sin_addr.s_addr, + bcopy((char*)server->h_addr, (char*)&serv_addr.sin_addr.s_addr, server->h_length); serv_addr.sin_port = htons(*port); if (connect(sockfd, psock, ssock) < 0) { @@ -100,10 +100,10 @@ ignored here for C compatibility. } } else { // creates a unix socket struct sockaddr_un serv_addr; - psock = (struct sockaddr *)&serv_addr; + psock = (struct sockaddr*)&serv_addr; ssock = sizeof(serv_addr); sockfd = socket(AF_UNIX, SOCK_STREAM, 0); - bzero((char *)&serv_addr, sizeof(serv_addr)); + bzero((char*)&serv_addr, sizeof(serv_addr)); serv_addr.sun_family = AF_UNIX; strcpy(serv_addr.sun_path, "/tmp/ipi_"); strcpy(serv_addr.sun_path + 9, host); @@ -115,7 +115,7 @@ ignored here for C compatibility. *psockfd = sockfd; } -void writebuffer_(int *psockfd, char *data, int len) +void writebuffer_(int* psockfd, char* data, int len) /* Writes to a socket. Args: @@ -134,7 +134,7 @@ void writebuffer_(int *psockfd, char *data, int len) } } -void readbuffer_(int *psockfd, char *data, int len) +void readbuffer_(int* psockfd, char* data, int len) /* Reads from a socket. 
Args: diff --git a/source/lib/include/ComputeDescriptor.h b/source/lib/include/ComputeDescriptor.h index 733cb1ee0c..edede310b6 100644 --- a/source/lib/include/ComputeDescriptor.h +++ b/source/lib/include/ComputeDescriptor.h @@ -9,100 +9,100 @@ #include "switcher.h" #include "utilities.h" -inline void compute_descriptor(std::vector &descrpt_a, - std::vector &descrpt_r, - std::vector &rot_mat, - const std::vector &posi, - const int &ntypes, - const std::vector &type, - const SimulationRegion ®ion, - const bool &b_pbc, - const int &i_idx, - const std::vector &fmt_nlist_a, - const std::vector &fmt_nlist_r, - const std::vector &sec_a, - const std::vector &sec_r, +inline void compute_descriptor(std::vector& descrpt_a, + std::vector& descrpt_r, + std::vector& rot_mat, + const std::vector& posi, + const int& ntypes, + const std::vector& type, + const SimulationRegion& region, + const bool& b_pbc, + const int& i_idx, + const std::vector& fmt_nlist_a, + const std::vector& fmt_nlist_r, + const std::vector& sec_a, + const std::vector& sec_r, const int axis0_type, const int axis0_idx, const int axis1_type, const int axis1_idx); -inline void compute_descriptor(std::vector &descrpt_a, - std::vector &descrpt_a_deriv, - std::vector &descrpt_r, - std::vector &descrpt_r_deriv, - std::vector &rij_a, - std::vector &rij_r, - std::vector &rot_mat, - const std::vector &posi, - const int &ntypes, - const std::vector &type, - const SimulationRegion ®ion, - const bool &b_pbc, - const int &i_idx, - const std::vector &fmt_nlist_a, - const std::vector &fmt_nlist_r, - const std::vector &sec_a, - const std::vector &sec_r, +inline void compute_descriptor(std::vector& descrpt_a, + std::vector& descrpt_a_deriv, + std::vector& descrpt_r, + std::vector& descrpt_r_deriv, + std::vector& rij_a, + std::vector& rij_r, + std::vector& rot_mat, + const std::vector& posi, + const int& ntypes, + const std::vector& type, + const SimulationRegion& region, + const bool& b_pbc, + const int& i_idx, + const 
std::vector& fmt_nlist_a, + const std::vector& fmt_nlist_r, + const std::vector& sec_a, + const std::vector& sec_r, const int axis0_type, const int axis0_idx, const int axis1_type, const int axis1_idx); -inline void compute_descriptor_se_a_extf(std::vector &descrpt_a, - std::vector &descrpt_a_deriv, - std::vector &rij_a, - const std::vector &posi, - const int &ntypes, - const std::vector &type, - const SimulationRegion ®ion, - const bool &b_pbc, - const std::vector &efield, - const int &i_idx, - const std::vector &fmt_nlist_a, - const std::vector &sec_a, - const double &rmin, - const double &rmax); +inline void compute_descriptor_se_a_extf(std::vector& descrpt_a, + std::vector& descrpt_a_deriv, + std::vector& rij_a, + const std::vector& posi, + const int& ntypes, + const std::vector& type, + const SimulationRegion& region, + const bool& b_pbc, + const std::vector& efield, + const int& i_idx, + const std::vector& fmt_nlist_a, + const std::vector& sec_a, + const double& rmin, + const double& rmax); inline void compute_descriptor_se_a_ef_para( - std::vector &descrpt_a, - std::vector &descrpt_a_deriv, - std::vector &rij_a, - const std::vector &posi, - const int &ntypes, - const std::vector &type, - const SimulationRegion ®ion, - const bool &b_pbc, - const std::vector &efield, - const int &i_idx, - const std::vector &fmt_nlist_a, - const std::vector &sec_a, - const double &rmin, - const double &rmax); + std::vector& descrpt_a, + std::vector& descrpt_a_deriv, + std::vector& rij_a, + const std::vector& posi, + const int& ntypes, + const std::vector& type, + const SimulationRegion& region, + const bool& b_pbc, + const std::vector& efield, + const int& i_idx, + const std::vector& fmt_nlist_a, + const std::vector& sec_a, + const double& rmin, + const double& rmax); inline void compute_descriptor_se_a_ef_vert( - std::vector &descrpt_a, - std::vector &descrpt_a_deriv, - std::vector &rij_a, - const std::vector &posi, - const int &ntypes, - const std::vector &type, - const 
SimulationRegion ®ion, - const bool &b_pbc, - const std::vector &efield, - const int &i_idx, - const std::vector &fmt_nlist_a, - const std::vector &sec_a, - const double &rmin, - const double &rmax); + std::vector& descrpt_a, + std::vector& descrpt_a_deriv, + std::vector& rij_a, + const std::vector& posi, + const int& ntypes, + const std::vector& type, + const SimulationRegion& region, + const bool& b_pbc, + const std::vector& efield, + const int& i_idx, + const std::vector& fmt_nlist_a, + const std::vector& sec_a, + const double& rmin, + const double& rmax); static void compute_dRdT(double (*dRdT)[9], - const double *r1, - const double *r2, - const double *rot) { - double *dRdT0 = dRdT[0]; - double *dRdT1 = dRdT[1]; - double *dRdT2 = dRdT[2]; - const double *xx = rot; - const double *yy = rot + 3; + const double* r1, + const double* r2, + const double* rot) { + double* dRdT0 = dRdT[0]; + double* dRdT1 = dRdT[1]; + double* dRdT2 = dRdT[2]; + const double* xx = rot; + const double* yy = rot + 3; double nr1 = sqrt(deepmd::dot3(r1, r1)); double nr12 = nr1 * nr1; @@ -160,14 +160,14 @@ static void compute_dRdT(double (*dRdT)[9], } static void compute_dRdT_1(double (*dRdT)[9], - const double *r1, - const double *r2, - const double *rot) { - double *dRdT0 = dRdT[0]; - double *dRdT1 = dRdT[1]; - double *dRdT2 = dRdT[2]; - const double *xx = rot; - const double *yy = rot + 3; + const double* r1, + const double* r2, + const double* rot) { + double* dRdT0 = dRdT[0]; + double* dRdT1 = dRdT[1]; + double* dRdT2 = dRdT[2]; + const double* xx = rot; + const double* yy = rot + 3; double nr1 = sqrt(deepmd::dot3(r1, r1)); double nr12 = nr1 * nr1; @@ -225,14 +225,14 @@ static void compute_dRdT_1(double (*dRdT)[9], } static void compute_dRdT_2(double (*dRdT)[9], - const double *r1, - const double *r2, - const double *rot) { - double *dRdT0 = dRdT[0]; - double *dRdT1 = dRdT[1]; - double *dRdT2 = dRdT[2]; - const double *xx = rot; - const double *yy = rot + 3; + const double* r1, + const 
double* r2, + const double* rot) { + double* dRdT0 = dRdT[0]; + double* dRdT1 = dRdT[1]; + double* dRdT2 = dRdT[2]; + const double* xx = rot; + const double* yy = rot + 3; double nr1 = sqrt(deepmd::dot3(r1, r1)); double nr12 = nr1 * nr1; @@ -287,23 +287,23 @@ static void compute_dRdT_2(double (*dRdT)[9], // n_sel_r_nei x 12 // (1./rr, cos_theta, cos_phi, sin_phi) x 4 x (x, y, z) + //(1./rr) x 4 x (x, y, z) -void compute_descriptor(std::vector &descrpt_a, - std::vector &descrpt_a_deriv, - std::vector &descrpt_r, - std::vector &descrpt_r_deriv, - std::vector &rij_a, - std::vector &rij_r, - std::vector &rot_mat, - const std::vector &posi, - const int &ntypes, - const std::vector &type, - const SimulationRegion ®ion, - const bool &b_pbc, - const int &i_idx, - const std::vector &fmt_nlist_a, - const std::vector &fmt_nlist_r, - const std::vector &sec_a, - const std::vector &sec_r, +void compute_descriptor(std::vector& descrpt_a, + std::vector& descrpt_a_deriv, + std::vector& descrpt_r, + std::vector& descrpt_r_deriv, + std::vector& rij_a, + std::vector& rij_r, + std::vector& rot_mat, + const std::vector& posi, + const int& ntypes, + const std::vector& type, + const SimulationRegion& region, + const bool& b_pbc, + const int& i_idx, + const std::vector& fmt_nlist_a, + const std::vector& fmt_nlist_r, + const std::vector& sec_a, + const std::vector& sec_r, const int axis0_type, const int axis0_idx, const int axis1_type, @@ -318,7 +318,7 @@ void compute_descriptor(std::vector &descrpt_a, break; } sel_a_diff[jj].resize(3); - const int &j_idx = fmt_nlist_a[jj]; + const int& j_idx = fmt_nlist_a[jj]; if (b_pbc) { region.diffNearestNeighbor( posi[j_idx * 3 + 0], posi[j_idx * 3 + 1], posi[j_idx * 3 + 2], @@ -344,7 +344,7 @@ void compute_descriptor(std::vector &descrpt_a, break; } sel_r_diff[jj].resize(3); - const int &j_idx = fmt_nlist_r[jj]; + const int& j_idx = fmt_nlist_r[jj]; if (b_pbc) { region.diffNearestNeighbor( posi[j_idx * 3 + 0], posi[j_idx * 3 + 1], posi[j_idx * 3 + 2], 
@@ -411,9 +411,9 @@ void compute_descriptor(std::vector &descrpt_a, // rotation matrix double rot[9]; - double *xx = rot; - double *yy = rot + 3; - double *zz = rot + 6; + double* xx = rot; + double* yy = rot + 3; + double* zz = rot + 6; for (unsigned dd = 0; dd < 3; ++dd) { xx[dd] = r1[dd]; yy[dd] = r2[dd]; @@ -472,7 +472,7 @@ void compute_descriptor(std::vector &descrpt_a, if (fmt_nlist_r[jj] < 0) { break; } - const double *rdiff = &sel_r_diff[jj][0]; + const double* rdiff = &sel_r_diff[jj][0]; double rr = sqrt(deepmd::dot3(rdiff, rdiff)); descrpt_r[jj] = 1. / rr; } @@ -503,7 +503,7 @@ void compute_descriptor(std::vector &descrpt_a, } // drdS, stored in transposed form double dtrdST[4][3]; - double *rr = &sel_a_diff[nei_iter][0]; + double* rr = &sel_a_diff[nei_iter][0]; double tr[3]; deepmd::dotmv3(tr, rot, rr); double nr2 = deepmd::dot3(tr, tr); @@ -638,7 +638,7 @@ void compute_descriptor(std::vector &descrpt_a, break; } - const double *rr = &sel_r_diff[nei_iter][0]; + const double* rr = &sel_r_diff[nei_iter][0]; double nr = sqrt(deepmd::dot3(rr, rr)); double nr3 = nr * nr * nr; int idx = nei_iter * 12; @@ -658,19 +658,19 @@ void compute_descriptor(std::vector &descrpt_a, } } -void compute_descriptor(std::vector &descrpt_a, - std::vector &descrpt_r, - std::vector &rot_mat, - const std::vector &posi, - const int &ntypes, - const std::vector &type, - const SimulationRegion ®ion, - const bool &b_pbc, - const int &i_idx, - const std::vector &fmt_nlist_a, - const std::vector &fmt_nlist_r, - const std::vector &sec_a, - const std::vector &sec_r, +void compute_descriptor(std::vector& descrpt_a, + std::vector& descrpt_r, + std::vector& rot_mat, + const std::vector& posi, + const int& ntypes, + const std::vector& type, + const SimulationRegion& region, + const bool& b_pbc, + const int& i_idx, + const std::vector& fmt_nlist_a, + const std::vector& fmt_nlist_r, + const std::vector& sec_a, + const std::vector& sec_r, const int axis0_type, const int axis0_idx, const int 
axis1_type, @@ -683,7 +683,7 @@ void compute_descriptor(std::vector &descrpt_a, break; } sel_a_diff[jj].resize(3); - const int &j_idx = fmt_nlist_a[jj]; + const int& j_idx = fmt_nlist_a[jj]; if (b_pbc) { region.diffNearestNeighbor( posi[j_idx * 3 + 0], posi[j_idx * 3 + 1], posi[j_idx * 3 + 2], @@ -703,7 +703,7 @@ void compute_descriptor(std::vector &descrpt_a, break; } sel_r_diff[jj].resize(3); - const int &j_idx = fmt_nlist_r[jj]; + const int& j_idx = fmt_nlist_r[jj]; if (b_pbc) { region.diffNearestNeighbor( posi[j_idx * 3 + 0], posi[j_idx * 3 + 1], posi[j_idx * 3 + 2], @@ -734,9 +734,9 @@ void compute_descriptor(std::vector &descrpt_a, // rotation matrix double rot[9]; - double *xx = rot; - double *yy = rot + 3; - double *zz = rot + 6; + double* xx = rot; + double* yy = rot + 3; + double* zz = rot + 6; for (unsigned dd = 0; dd < 3; ++dd) { xx[dd] = r1[dd]; yy[dd] = r2[dd]; @@ -805,21 +805,21 @@ void compute_descriptor(std::vector &descrpt_a, // output deriv size: n_sel_a_nei x 4 x 12 // (1./rr, cos_theta, cos_phi, sin_phi) x 4 x (x, y, z) -void compute_descriptor_se_a_extf(std::vector &descrpt_a, - std::vector &descrpt_a_deriv, - std::vector &rij_a, - const std::vector &posi, - const int &ntypes, - const std::vector &type, - const SimulationRegion ®ion, - const bool &b_pbc, - const std::vector &efield, - const int &i_idx, - const std::vector &fmt_nlist_a, - const std::vector &sec_a, - const double &rmin, - const double &rmax) { - const double *ef_ = &efield[i_idx * 3 + 0]; +void compute_descriptor_se_a_extf(std::vector& descrpt_a, + std::vector& descrpt_a_deriv, + std::vector& rij_a, + const std::vector& posi, + const int& ntypes, + const std::vector& type, + const SimulationRegion& region, + const bool& b_pbc, + const std::vector& efield, + const int& i_idx, + const std::vector& fmt_nlist_a, + const std::vector& sec_a, + const double& rmin, + const double& rmax) { + const double* ef_ = &efield[i_idx * 3 + 0]; double ef[3] = {0.}; if (std::isnan(ef_[0]) || 
std::isnan(ef_[1]) || std::isnan(ef_[2])) { ef[0] = 1.; @@ -842,7 +842,7 @@ void compute_descriptor_se_a_extf(std::vector &descrpt_a, break; } sel_a_diff[jj].resize(3); - const int &j_idx = fmt_nlist_a[jj]; + const int& j_idx = fmt_nlist_a[jj]; if (b_pbc) { region.diffNearestNeighbor( posi[j_idx * 3 + 0], posi[j_idx * 3 + 1], posi[j_idx * 3 + 2], @@ -872,7 +872,7 @@ void compute_descriptor_se_a_extf(std::vector &descrpt_a, if (fmt_nlist_a[nei_iter] < 0) { break; } - const double *rr = &sel_a_diff[nei_iter][0]; + const double* rr = &sel_a_diff[nei_iter][0]; // check validity of ef double nr2 = deepmd::dot3(rr, rr); double inr = 1. / sqrt(nr2); @@ -946,21 +946,21 @@ void compute_descriptor_se_a_extf(std::vector &descrpt_a, // output deriv size: n_sel_a_nei x 4 x 12 // (1./rr, cos_theta, cos_phi, sin_phi) x 4 x (x, y, z) -void compute_descriptor_se_a_ef_para(std::vector &descrpt_a, - std::vector &descrpt_a_deriv, - std::vector &rij_a, - const std::vector &posi, - const int &ntypes, - const std::vector &type, - const SimulationRegion ®ion, - const bool &b_pbc, - const std::vector &efield, - const int &i_idx, - const std::vector &fmt_nlist_a, - const std::vector &sec_a, - const double &rmin, - const double &rmax) { - const double *ef_ = &efield[i_idx * 3 + 0]; +void compute_descriptor_se_a_ef_para(std::vector& descrpt_a, + std::vector& descrpt_a_deriv, + std::vector& rij_a, + const std::vector& posi, + const int& ntypes, + const std::vector& type, + const SimulationRegion& region, + const bool& b_pbc, + const std::vector& efield, + const int& i_idx, + const std::vector& fmt_nlist_a, + const std::vector& sec_a, + const double& rmin, + const double& rmax) { + const double* ef_ = &efield[i_idx * 3 + 0]; double ef[3] = {0.}; if (std::isnan(ef_[0]) || std::isnan(ef_[1]) || std::isnan(ef_[2])) { ef[0] = 1.; @@ -983,7 +983,7 @@ void compute_descriptor_se_a_ef_para(std::vector &descrpt_a, break; } sel_a_diff[jj].resize(3); - const int &j_idx = fmt_nlist_a[jj]; + const int& 
j_idx = fmt_nlist_a[jj]; if (b_pbc) { region.diffNearestNeighbor( posi[j_idx * 3 + 0], posi[j_idx * 3 + 1], posi[j_idx * 3 + 2], @@ -1013,7 +1013,7 @@ void compute_descriptor_se_a_ef_para(std::vector &descrpt_a, if (fmt_nlist_a[nei_iter] < 0) { break; } - const double *rr = &sel_a_diff[nei_iter][0]; + const double* rr = &sel_a_diff[nei_iter][0]; // check validity of ef double nr2 = deepmd::dot3(rr, rr); double inr = 1. / sqrt(nr2); @@ -1083,21 +1083,21 @@ void compute_descriptor_se_a_ef_para(std::vector &descrpt_a, // output deriv size: n_sel_a_nei x 4 x 12 // (1./rr, cos_theta, cos_phi, sin_phi) x 4 x (x, y, z) -void compute_descriptor_se_a_ef_vert(std::vector &descrpt_a, - std::vector &descrpt_a_deriv, - std::vector &rij_a, - const std::vector &posi, - const int &ntypes, - const std::vector &type, - const SimulationRegion ®ion, - const bool &b_pbc, - const std::vector &efield, - const int &i_idx, - const std::vector &fmt_nlist_a, - const std::vector &sec_a, - const double &rmin, - const double &rmax) { - const double *ef_ = &efield[i_idx * 3 + 0]; +void compute_descriptor_se_a_ef_vert(std::vector& descrpt_a, + std::vector& descrpt_a_deriv, + std::vector& rij_a, + const std::vector& posi, + const int& ntypes, + const std::vector& type, + const SimulationRegion& region, + const bool& b_pbc, + const std::vector& efield, + const int& i_idx, + const std::vector& fmt_nlist_a, + const std::vector& sec_a, + const double& rmin, + const double& rmax) { + const double* ef_ = &efield[i_idx * 3 + 0]; double ef[3] = {0.}; if (std::isnan(ef_[0]) || std::isnan(ef_[1]) || std::isnan(ef_[2])) { ef[0] = 1.; @@ -1120,7 +1120,7 @@ void compute_descriptor_se_a_ef_vert(std::vector &descrpt_a, break; } sel_a_diff[jj].resize(3); - const int &j_idx = fmt_nlist_a[jj]; + const int& j_idx = fmt_nlist_a[jj]; if (b_pbc) { region.diffNearestNeighbor( posi[j_idx * 3 + 0], posi[j_idx * 3 + 1], posi[j_idx * 3 + 2], @@ -1150,7 +1150,7 @@ void compute_descriptor_se_a_ef_vert(std::vector &descrpt_a, 
if (fmt_nlist_a[nei_iter] < 0) { break; } - const double *rr = &sel_a_diff[nei_iter][0]; + const double* rr = &sel_a_diff[nei_iter][0]; // check validity of ef double nr2 = deepmd::dot3(rr, rr); double inr = 1. / sqrt(nr2); diff --git a/source/lib/include/SimulationRegion.h b/source/lib/include/SimulationRegion.h index 7cc853d25b..377a115dc0 100644 --- a/source/lib/include/SimulationRegion.h +++ b/source/lib/include/SimulationRegion.h @@ -13,82 +13,82 @@ class SimulationRegion { const static int SPACENDIM = MOASPNDIM; public: - void reinitBox(const double *boxv); - void affineTransform(const double *affine_map); - void reinitOrigin(const double *orig); - void reinitOrigin(const std::vector &orig); + void reinitBox(const double* boxv); + void affineTransform(const double* affine_map); + void reinitOrigin(const double* orig); + void reinitOrigin(const std::vector& orig); void backup(); void recover(); public: SimulationRegion(); ~SimulationRegion(); - double *getBoxTensor() { return boxt; }; - const double *getBoxTensor() const { return boxt; }; - double *getRecBoxTensor() { return rec_boxt; } - const double *getRecBoxTensor() const { return rec_boxt; } - double *getBoxOrigin() { return origin; } - const double *getBoxOrigin() const { return origin; } + double* getBoxTensor() { return boxt; }; + const double* getBoxTensor() const { return boxt; }; + double* getRecBoxTensor() { return rec_boxt; } + const double* getRecBoxTensor() const { return rec_boxt; } + double* getBoxOrigin() { return origin; } + const double* getBoxOrigin() const { return origin; } double getVolume() const { return volume; } public: - void toFaceDistance(double *dd) const; + void toFaceDistance(double* dd) const; public: - void phys2Inter(double *i_v, const VALUETYPE *p_v) const; - void inter2Phys(VALUETYPE *p_v, const double *i_v) const; + void phys2Inter(double* i_v, const VALUETYPE* p_v) const; + void inter2Phys(VALUETYPE* p_v, const double* i_v) const; public: bool isPeriodic(const int dim) 
const { return is_periodic[dim]; } - static int compactIndex(const int *idx); - double *getShiftVec(const int index = 0); - const double *getShiftVec(const int index = 0) const; - int getShiftIndex(const int *idx) const; + static int compactIndex(const int* idx); + double* getShiftVec(const int index = 0); + const double* getShiftVec(const int index = 0) const; + int getShiftIndex(const int* idx) const; int getNullShiftIndex() const; - void shiftCoord(const int *idx, - VALUETYPE &x, - VALUETYPE &y, - VALUETYPE &z) const; + void shiftCoord(const int* idx, + VALUETYPE& x, + VALUETYPE& y, + VALUETYPE& z) const; static int getNumbShiftVec() { return shift_info_size; } static int getShiftVecTotalSize() { return shift_vec_size; } public: - void diffNearestNeighbor(const VALUETYPE *r0, - const VALUETYPE *r1, - VALUETYPE *phys) const; + void diffNearestNeighbor(const VALUETYPE* r0, + const VALUETYPE* r1, + VALUETYPE* phys) const; virtual void diffNearestNeighbor(const VALUETYPE x0, const VALUETYPE y0, const VALUETYPE z0, const VALUETYPE x1, const VALUETYPE y1, const VALUETYPE z1, - VALUETYPE &dx, - VALUETYPE &dy, - VALUETYPE &dz) const; + VALUETYPE& dx, + VALUETYPE& dy, + VALUETYPE& dz) const; virtual void diffNearestNeighbor(const VALUETYPE x0, const VALUETYPE y0, const VALUETYPE z0, const VALUETYPE x1, const VALUETYPE y1, const VALUETYPE z1, - VALUETYPE &dx, - VALUETYPE &dy, - VALUETYPE &dz, - int &shift_x, - int &shift_y, - int &shift_z) const; + VALUETYPE& dx, + VALUETYPE& dy, + VALUETYPE& dz, + int& shift_x, + int& shift_y, + int& shift_z) const; virtual void diffNearestNeighbor(const VALUETYPE x0, const VALUETYPE y0, const VALUETYPE z0, const VALUETYPE x1, const VALUETYPE y1, const VALUETYPE z1, - VALUETYPE &dx, - VALUETYPE &dy, - VALUETYPE &dz, - VALUETYPE &shift_x, - VALUETYPE &shift_y, - VALUETYPE &shift_z) const; + VALUETYPE& dx, + VALUETYPE& dy, + VALUETYPE& dz, + VALUETYPE& shift_x, + VALUETYPE& shift_y, + VALUETYPE& shift_z) const; private: void 
computeVolume(); @@ -118,25 +118,25 @@ class SimulationRegion { static int index3to1(const int tx, const int ty, const int tz) { return (NBOX_ZZ * (NBOX_YY * (tx + DBOX_XX) + ty + DBOX_YY) + tz + DBOX_ZZ); } - double *getInterShiftVec(const int index = 0); - const double *getInterShiftVec(const int index = 0) const; + double* getInterShiftVec(const int index = 0); + const double* getInterShiftVec(const int index = 0) const; private: - void copy(double *o_v, const double *i_v) const; - void naiveTensorDotVector(double *out, - const double *i_t, - const double *i_v) const; - void naiveTensorTransDotVector(double *out, - const double *i_t, - const double *i_v) const; - void tensorDotVector(double *out, const double *i_t, const double *i_v) const; - void tensorTransDotVector(double *out, - const double *i_t, - const double *i_v) const; - void getFromRestart(double *my_boxv, double *my_orig, bool *period) const; - void defaultInitBox(double *my_boxv, double *my_orig, bool *period) const; - void apply_periodic(int dim, double *dd) const; - void apply_periodic(int dim, double *dd, int &shift) const; + void copy(double* o_v, const double* i_v) const; + void naiveTensorDotVector(double* out, + const double* i_t, + const double* i_v) const; + void naiveTensorTransDotVector(double* out, + const double* i_t, + const double* i_v) const; + void tensorDotVector(double* out, const double* i_t, const double* i_v) const; + void tensorTransDotVector(double* out, + const double* i_t, + const double* i_v) const; + void getFromRestart(double* my_boxv, double* my_orig, bool* period) const; + void defaultInitBox(double* my_boxv, double* my_orig, bool* period) const; + void apply_periodic(int dim, double* dd) const; + void apply_periodic(int dim, double* dd, int& shift) const; private: std::fstream fp; diff --git a/source/lib/include/SimulationRegion_Impl.h b/source/lib/include/SimulationRegion_Impl.h index cab06087e3..7b4c3dbb4d 100644 --- a/source/lib/include/SimulationRegion_Impl.h +++ 
b/source/lib/include/SimulationRegion_Impl.h @@ -23,9 +23,9 @@ SimulationRegion::SimulationRegion() { } template -void SimulationRegion::defaultInitBox(double *my_boxv, - double *my_orig, - bool *period) const { +void SimulationRegion::defaultInitBox(double* my_boxv, + double* my_orig, + bool* period) const { // by default is a 1,1,1 logical box for (int ii = 0; ii < SPACENDIM; ++ii) { for (int jj = 0; jj < SPACENDIM; ++jj) { @@ -55,7 +55,7 @@ void SimulationRegion::recover() { } template -inline void SimulationRegion::reinitBox(const double *boxv_) { +inline void SimulationRegion::reinitBox(const double* boxv_) { for (int ii = 0; ii < SPACENDIM * SPACENDIM; ++ii) { boxt[ii] = boxv_[ii]; } @@ -66,7 +66,7 @@ inline void SimulationRegion::reinitBox(const double *boxv_) { template inline void SimulationRegion::affineTransform( - const double *affine_map) { + const double* affine_map) { tensorDotVector(boxt + SPACENDIM * 0, affine_map, boxt + SPACENDIM * 0); tensorDotVector(boxt + SPACENDIM * 1, affine_map, boxt + SPACENDIM * 1); tensorDotVector(boxt + SPACENDIM * 2, affine_map, boxt + SPACENDIM * 2); @@ -76,7 +76,7 @@ inline void SimulationRegion::affineTransform( } template -inline void SimulationRegion::reinitOrigin(const double *orig) { +inline void SimulationRegion::reinitOrigin(const double* orig) { for (int ii = 0; ii < SPACENDIM; ++ii) { origin[ii] = orig[ii]; } @@ -84,7 +84,7 @@ inline void SimulationRegion::reinitOrigin(const double *orig) { template inline void SimulationRegion::reinitOrigin( - const std::vector &orig) { + const std::vector& orig) { for (int ii = 0; ii < SPACENDIM; ++ii) { origin[ii] = orig[ii]; } @@ -93,14 +93,14 @@ inline void SimulationRegion::reinitOrigin( template void SimulationRegion::computeShiftVec() { int tmp_idx[3]; - int &ii(tmp_idx[0]); - int &jj(tmp_idx[1]); - int &kk(tmp_idx[2]); + int& ii(tmp_idx[0]); + int& jj(tmp_idx[1]); + int& kk(tmp_idx[2]); for (ii = -DBOX_XX; ii <= DBOX_XX; ++ii) { for (jj = -DBOX_YY; jj <= DBOX_YY; 
++jj) { for (kk = -DBOX_ZZ; kk <= DBOX_ZZ; ++kk) { - double *posi = getShiftVec(getShiftIndex(tmp_idx)); - double *inter_posi = getInterShiftVec(getShiftIndex(tmp_idx)); + double* posi = getShiftVec(getShiftIndex(tmp_idx)); + double* inter_posi = getInterShiftVec(getShiftIndex(tmp_idx)); inter_posi[0] = ii; inter_posi[1] = jj; inter_posi[2] = kk; @@ -112,29 +112,29 @@ void SimulationRegion::computeShiftVec() { } template -inline double *SimulationRegion::getShiftVec(const int index) { +inline double* SimulationRegion::getShiftVec(const int index) { return shift_vec + SPACENDIM * index; } template -inline const double *SimulationRegion::getShiftVec( +inline const double* SimulationRegion::getShiftVec( const int index) const { return shift_vec + SPACENDIM * index; } template -inline double *SimulationRegion::getInterShiftVec(const int index) { +inline double* SimulationRegion::getInterShiftVec(const int index) { return inter_shift_vec + SPACENDIM * index; } template -inline const double *SimulationRegion::getInterShiftVec( +inline const double* SimulationRegion::getInterShiftVec( const int index) const { return inter_shift_vec + SPACENDIM * index; } template -inline int SimulationRegion::getShiftIndex(const int *idx) const { +inline int SimulationRegion::getShiftIndex(const int* idx) const { return index3to1(idx[0], idx[1], idx[2]); } @@ -144,16 +144,16 @@ inline int SimulationRegion::getNullShiftIndex() const { } template -inline int SimulationRegion::compactIndex(const int *idx) { +inline int SimulationRegion::compactIndex(const int* idx) { return index3to1(idx[0], idx[1], idx[2]); } template -inline void SimulationRegion::shiftCoord(const int *idx, - VALUETYPE &x, - VALUETYPE &y, - VALUETYPE &z) const { - const double *shift = getShiftVec(getShiftIndex(idx)); +inline void SimulationRegion::shiftCoord(const int* idx, + VALUETYPE& x, + VALUETYPE& y, + VALUETYPE& z) const { + const double* shift = getShiftVec(getShiftIndex(idx)); x += shift[0]; y += shift[1]; z += 
shift[2]; @@ -199,7 +199,7 @@ inline void SimulationRegion::shiftCoord(const int *idx, template inline void SimulationRegion::apply_periodic(int dim, - double *dd) const { + double* dd) const { if (!is_periodic[dim]) { return; } @@ -212,8 +212,8 @@ inline void SimulationRegion::apply_periodic(int dim, template inline void SimulationRegion::apply_periodic(int dim, - double *dd, - int &shift) const { + double* dd, + int& shift) const { shift = 0; if (!is_periodic[dim]) { return; @@ -229,7 +229,7 @@ inline void SimulationRegion::apply_periodic(int dim, template inline void SimulationRegion::diffNearestNeighbor( - const VALUETYPE *r0, const VALUETYPE *r1, VALUETYPE *phys) const { + const VALUETYPE* r0, const VALUETYPE* r1, VALUETYPE* phys) const { double inter[3]; for (int dd = 0; dd < 3; ++dd) { phys[dd] = r0[dd] - r1[dd]; @@ -249,9 +249,9 @@ inline void SimulationRegion::diffNearestNeighbor( const VALUETYPE x1, const VALUETYPE y1, const VALUETYPE z1, - VALUETYPE &dx, - VALUETYPE &dy, - VALUETYPE &dz) const { + VALUETYPE& dx, + VALUETYPE& dy, + VALUETYPE& dz) const { // diffNearestNeighbor (0, x0, x1, dx); // diffNearestNeighbor (1, y0, y1, dy); // diffNearestNeighbor (2, z0, z1, dz); @@ -278,12 +278,12 @@ inline void SimulationRegion::diffNearestNeighbor( const VALUETYPE x1, const VALUETYPE y1, const VALUETYPE z1, - VALUETYPE &dx, - VALUETYPE &dy, - VALUETYPE &dz, - int &shift_x, - int &shift_y, - int &shift_z) const { + VALUETYPE& dx, + VALUETYPE& dy, + VALUETYPE& dz, + int& shift_x, + int& shift_y, + int& shift_z) const { // diffNearestNeighbor (0, x0, x1, dx, shift_x); // diffNearestNeighbor (1, y0, y1, dy, shift_y); // diffNearestNeighbor (2, z0, z1, dz, shift_z); @@ -310,12 +310,12 @@ inline void SimulationRegion::diffNearestNeighbor( const VALUETYPE x1, const VALUETYPE y1, const VALUETYPE z1, - VALUETYPE &dx, - VALUETYPE &dy, - VALUETYPE &dz, - VALUETYPE &shift_x, - VALUETYPE &shift_y, - VALUETYPE &shift_z) const { + VALUETYPE& dx, + VALUETYPE& dy, + VALUETYPE& 
dz, + VALUETYPE& shift_x, + VALUETYPE& shift_y, + VALUETYPE& shift_z) const { // diffNearestNeighbor (0, x0, x1, dx, shift_x); // diffNearestNeighbor (1, y0, y1, dy, shift_y); // diffNearestNeighbor (2, z0, z1, dz, shift_z); @@ -333,7 +333,7 @@ inline void SimulationRegion::diffNearestNeighbor( dx = phys[0]; dy = phys[1]; dz = phys[2]; - const double *tmp_shift( + const double* tmp_shift( getShiftVec(index3to1(i_shift_x, i_shift_y, i_shift_z))); shift_x = tmp_shift[0]; shift_y = tmp_shift[1]; @@ -342,7 +342,7 @@ inline void SimulationRegion::diffNearestNeighbor( template inline void SimulationRegion::phys2Inter( - double *i_v, const VALUETYPE *p_v_) const { + double* i_v, const VALUETYPE* p_v_) const { double p_v[3]; for (int dd = 0; dd < 3; ++dd) { p_v[dd] = p_v_[dd]; @@ -351,8 +351,8 @@ inline void SimulationRegion::phys2Inter( } template -inline void SimulationRegion::inter2Phys(VALUETYPE *p_v_, - const double *i_v) const { +inline void SimulationRegion::inter2Phys(VALUETYPE* p_v_, + const double* i_v) const { double p_v[3]; tensorTransDotVector(p_v, boxt, i_v); for (int dd = 0; dd < 3; ++dd) { @@ -361,7 +361,7 @@ inline void SimulationRegion::inter2Phys(VALUETYPE *p_v_, } template -inline void SimulationRegion::toFaceDistance(double *dd) const { +inline void SimulationRegion::toFaceDistance(double* dd) const { double tmp[3]; deepmd::cprod(boxt + 3, boxt + 6, tmp); dd[0] = volume * deepmd::invsqrt(deepmd::dot3(tmp, tmp)); @@ -374,8 +374,8 @@ inline void SimulationRegion::toFaceDistance(double *dd) const { // static int tmp_count = 0; template -inline void SimulationRegion::copy(double *o_v, - const double *i_v) const { +inline void SimulationRegion::copy(double* o_v, + const double* i_v) const { #ifdef DEBUG_CHECK_ASSERTIONS assert(o_v != i_v); #endif @@ -386,7 +386,7 @@ inline void SimulationRegion::copy(double *o_v, template inline void SimulationRegion::naiveTensorDotVector( - double *o_v, const double *i_t, const double *i_v) const { + double* o_v, const 
double* i_t, const double* i_v) const { o_v[0] = i_v[0] * i_t[0 * 3 + 0] + i_v[1] * i_t[0 * 3 + 1] + i_v[2] * i_t[0 * 3 + 2]; o_v[1] = i_v[0] * i_t[1 * 3 + 0] + i_v[1] * i_t[1 * 3 + 1] + @@ -397,7 +397,7 @@ inline void SimulationRegion::naiveTensorDotVector( template inline void SimulationRegion::naiveTensorTransDotVector( - double *o_v, const double *i_t, const double *i_v) const { + double* o_v, const double* i_t, const double* i_v) const { o_v[0] = i_v[0] * i_t[0 * 3 + 0] + i_v[1] * i_t[1 * 3 + 0] + i_v[2] * i_t[2 * 3 + 0]; o_v[1] = i_v[0] * i_t[0 * 3 + 1] + i_v[1] * i_t[1 * 3 + 1] + @@ -408,7 +408,7 @@ inline void SimulationRegion::naiveTensorTransDotVector( template inline void SimulationRegion::tensorDotVector( - double *o_v, const double *i_t, const double *i_v) const { + double* o_v, const double* i_t, const double* i_v) const { // the compiler will auto-matically optimize the following code away... // const double * tmp_v (i_v); // if (o_v == i_v){ @@ -421,7 +421,7 @@ inline void SimulationRegion::tensorDotVector( template inline void SimulationRegion::tensorTransDotVector( - double *o_v, const double *i_t, const double *i_v) const { + double* o_v, const double* i_t, const double* i_v) const { naiveTensorTransDotVector(o_v, i_t, i_v); } diff --git a/source/lib/include/env_mat_nvnmd.h b/source/lib/include/env_mat_nvnmd.h index d3c18270cf..ce391a9563 100644 --- a/source/lib/include/env_mat_nvnmd.h +++ b/source/lib/include/env_mat_nvnmd.h @@ -28,16 +28,16 @@ date: 2021-12-6 namespace deepmd { template -void env_mat_a_nvnmd_quantize_cpu(std::vector &descrpt_a, - std::vector &descrpt_a_deriv, - std::vector &rij_a, - const std::vector &posi, - const std::vector &type, - const int &i_idx, - const std::vector &fmt_nlist, - const std::vector &sec, - const float &rmin, - const float &rmax); +void env_mat_a_nvnmd_quantize_cpu(std::vector& descrpt_a, + std::vector& descrpt_a_deriv, + std::vector& rij_a, + const std::vector& posi, + const std::vector& type, + const 
int& i_idx, + const std::vector& fmt_nlist, + const std::vector& sec, + const float& rmin, + const float& rmax); } union U_Flt64_Int64 { @@ -59,7 +59,7 @@ union U_Flt64_Int64 { split double into sign, expo, and frac */ template // float and double -void split_flt(T x, int64_t &sign, int64_t &expo, int64_t &mant) { +void split_flt(T x, int64_t& sign, int64_t& expo, int64_t& mant) { U_Flt64_Int64 ufi; ufi.nflt = x; sign = (ufi.nint >> 63) & 0x01; @@ -71,7 +71,7 @@ void split_flt(T x, int64_t &sign, int64_t &expo, int64_t &mant) { find the max exponent for float array x */ template // float and double -void find_max_expo(int64_t &max_expo, T *x, int64_t M) { +void find_max_expo(int64_t& max_expo, T* x, int64_t M) { int ii, jj, kk; U_Flt64_Int64 ufi; int64_t expo; @@ -87,7 +87,7 @@ void find_max_expo(int64_t &max_expo, T *x, int64_t M) { find the max exponent for float array x */ template // float and double -void find_max_expo(int64_t &max_expo, T *x, int64_t N, int64_t M) { +void find_max_expo(int64_t& max_expo, T* x, int64_t N, int64_t M) { int ii, jj, kk; U_Flt64_Int64 ufi; int64_t expo; @@ -103,7 +103,7 @@ void find_max_expo(int64_t &max_expo, T *x, int64_t N, int64_t M) { dot multiply */ template // float and double -void dotmul_flt_nvnmd(T &y, T *x1, T *x2, int64_t M) { +void dotmul_flt_nvnmd(T& y, T* x1, T* x2, int64_t M) { int ii, jj, kk; U_Flt64_Int64 ufi; // @@ -146,7 +146,7 @@ void dotmul_flt_nvnmd(T &y, T *x1, T *x2, int64_t M) { multiply */ template // float and double -void mul_flt_nvnmd(T &y, T x1, T x2) { +void mul_flt_nvnmd(T& y, T x1, T x2) { U_Flt64_Int64 ufi1, ufi2, ufi3; ufi1.nflt = x1; ufi1.nint &= FLT_MASK; @@ -161,7 +161,7 @@ void mul_flt_nvnmd(T &y, T x1, T x2) { add */ template // float and double -void add_flt_nvnmd(T &y, T x1, T x2) { +void add_flt_nvnmd(T& y, T x1, T x2) { U_Flt64_Int64 ufi1, ufi2, ufi3; int64_t sign1, sign2, sign3; int64_t expo1, expo2, expo3; diff --git a/source/lib/include/gpu_cuda.h b/source/lib/include/gpu_cuda.h 
index 9504a95b7a..8fc7781f4c 100644 --- a/source/lib/include/gpu_cuda.h +++ b/source/lib/include/gpu_cuda.h @@ -23,7 +23,7 @@ DPAssert((res), __FILE__, __LINE__); \ } inline void DPAssert(cudaError_t code, - const char *file, + const char* file, int line, bool abort = true) { if (code != cudaSuccess) { @@ -61,21 +61,21 @@ inline void DPAssert(cudaError_t code, nborAssert((res), __FILE__, __LINE__); \ } inline void nborAssert(cudaError_t code, - const char *file, + const char* file, int line, bool abort = true) { if (code != cudaSuccess) { std::string error_msg = "DeePMD-kit: Illegal nbor list sorting: "; try { DPAssert(code, file, line, true); - } catch (deepmd::deepmd_exception_oom &e) { + } catch (deepmd::deepmd_exception_oom& e) { error_msg += e.what(); if (abort) { throw deepmd::deepmd_exception_oom(error_msg); } else { fprintf(stderr, "%s\n", error_msg.c_str()); } - } catch (deepmd::deepmd_exception &e) { + } catch (deepmd::deepmd_exception& e) { error_msg += e.what(); if (abort) { throw deepmd::deepmd_exception(error_msg); @@ -87,8 +87,8 @@ inline void nborAssert(cudaError_t code, } #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600 -static __inline__ __device__ double atomicAdd(double *address, double val) { - unsigned long long int *address_as_ull = (unsigned long long int *)address; +static __inline__ __device__ double atomicAdd(double* address, double val) { + unsigned long long int* address_as_ull = (unsigned long long int*)address; unsigned long long int old = *address_as_ull, assumed; do { assumed = old; @@ -103,68 +103,68 @@ static __inline__ __device__ double atomicAdd(double *address, double val) { namespace deepmd { -inline void DPGetDeviceCount(int &gpu_num) { cudaGetDeviceCount(&gpu_num); } +inline void DPGetDeviceCount(int& gpu_num) { cudaGetDeviceCount(&gpu_num); } inline cudaError_t DPSetDevice(int rank) { return cudaSetDevice(rank); } template -void memcpy_host_to_device(FPTYPE *device, const std::vector &host) { +void 
memcpy_host_to_device(FPTYPE* device, const std::vector& host) { DPErrcheck(cudaMemcpy(device, &host[0], sizeof(FPTYPE) * host.size(), cudaMemcpyHostToDevice)); } template -void memcpy_host_to_device(FPTYPE *device, const FPTYPE *host, const int size) { +void memcpy_host_to_device(FPTYPE* device, const FPTYPE* host, const int size) { DPErrcheck( cudaMemcpy(device, host, sizeof(FPTYPE) * size, cudaMemcpyHostToDevice)); } template -void memcpy_device_to_host(const FPTYPE *device, std::vector &host) { +void memcpy_device_to_host(const FPTYPE* device, std::vector& host) { DPErrcheck(cudaMemcpy(&host[0], device, sizeof(FPTYPE) * host.size(), cudaMemcpyDeviceToHost)); } template -void memcpy_device_to_host(const FPTYPE *device, FPTYPE *host, const int size) { +void memcpy_device_to_host(const FPTYPE* device, FPTYPE* host, const int size) { DPErrcheck( cudaMemcpy(host, device, sizeof(FPTYPE) * size, cudaMemcpyDeviceToHost)); } template -void malloc_device_memory(FPTYPE *&device, const std::vector &host) { - DPErrcheck(cudaMalloc((void **)&device, sizeof(FPTYPE) * host.size())); +void malloc_device_memory(FPTYPE*& device, const std::vector& host) { + DPErrcheck(cudaMalloc((void**)&device, sizeof(FPTYPE) * host.size())); } template -void malloc_device_memory(FPTYPE *&device, const int size) { - DPErrcheck(cudaMalloc((void **)&device, sizeof(FPTYPE) * size)); +void malloc_device_memory(FPTYPE*& device, const int size) { + DPErrcheck(cudaMalloc((void**)&device, sizeof(FPTYPE) * size)); } template -void malloc_device_memory_sync(FPTYPE *&device, - const std::vector &host) { - DPErrcheck(cudaMalloc((void **)&device, sizeof(FPTYPE) * host.size())); +void malloc_device_memory_sync(FPTYPE*& device, + const std::vector& host) { + DPErrcheck(cudaMalloc((void**)&device, sizeof(FPTYPE) * host.size())); memcpy_host_to_device(device, host); } template -void malloc_device_memory_sync(FPTYPE *&device, - const FPTYPE *host, +void malloc_device_memory_sync(FPTYPE*& device, + const FPTYPE* 
host, const int size) { - DPErrcheck(cudaMalloc((void **)&device, sizeof(FPTYPE) * size)); + DPErrcheck(cudaMalloc((void**)&device, sizeof(FPTYPE) * size)); memcpy_host_to_device(device, host, size); } template -void delete_device_memory(FPTYPE *&device) { +void delete_device_memory(FPTYPE*& device) { if (device != NULL) { DPErrcheck(cudaFree(device)); } } template -void memset_device_memory(FPTYPE *device, const int var, const int size) { +void memset_device_memory(FPTYPE* device, const int var, const int size) { DPErrcheck(cudaMemset(device, var, sizeof(FPTYPE) * size)); } } // end of namespace deepmd diff --git a/source/lib/include/gpu_rocm.h b/source/lib/include/gpu_rocm.h index abb7ddfa62..c522c6aed4 100644 --- a/source/lib/include/gpu_rocm.h +++ b/source/lib/include/gpu_rocm.h @@ -25,7 +25,7 @@ DPAssert((res), __FILE__, __LINE__); \ } inline void DPAssert(hipError_t code, - const char *file, + const char* file, int line, bool abort = true) { if (code != hipSuccess) { @@ -46,14 +46,14 @@ inline void DPAssert(hipError_t code, nborAssert((res), __FILE__, __LINE__); \ } inline void nborAssert(hipError_t code, - const char *file, + const char* file, int line, bool abort = true) { if (code != hipSuccess) { std::string error_msg = "DeePMD-kit: Illegal nbor list sorting: "; try { DPAssert(code, file, line, true); - } catch (deepmd::deepmd_exception &e) { + } catch (deepmd::deepmd_exception& e) { error_msg += e.what(); if (abort) { throw deepmd::deepmd_exception(error_msg); @@ -65,65 +65,65 @@ inline void nborAssert(hipError_t code, } namespace deepmd { -inline void DPGetDeviceCount(int &gpu_num) { hipGetDeviceCount(&gpu_num); } +inline void DPGetDeviceCount(int& gpu_num) { hipGetDeviceCount(&gpu_num); } inline hipError_t DPSetDevice(int rank) { return hipSetDevice(rank); } template -void memcpy_host_to_device(FPTYPE *device, std::vector &host) { +void memcpy_host_to_device(FPTYPE* device, std::vector& host) { DPErrcheck(hipMemcpy(device, &host[0], sizeof(FPTYPE) * 
host.size(), hipMemcpyHostToDevice)); } template -void memcpy_host_to_device(FPTYPE *device, const FPTYPE *host, const int size) { +void memcpy_host_to_device(FPTYPE* device, const FPTYPE* host, const int size) { DPErrcheck( hipMemcpy(device, host, sizeof(FPTYPE) * size, hipMemcpyHostToDevice)); } template -void memcpy_device_to_host(const FPTYPE *device, std::vector &host) { +void memcpy_device_to_host(const FPTYPE* device, std::vector& host) { DPErrcheck(hipMemcpy(&host[0], device, sizeof(FPTYPE) * host.size(), hipMemcpyDeviceToHost)); } template -void memcpy_device_to_host(const FPTYPE *device, FPTYPE *host, const int size) { +void memcpy_device_to_host(const FPTYPE* device, FPTYPE* host, const int size) { DPErrcheck( hipMemcpy(host, device, sizeof(FPTYPE) * size, hipMemcpyDeviceToHost)); } template -void malloc_device_memory(FPTYPE *&device, std::vector &host) { - DPErrcheck(hipMalloc((void **)&device, sizeof(FPTYPE) * host.size())); +void malloc_device_memory(FPTYPE*& device, std::vector& host) { + DPErrcheck(hipMalloc((void**)&device, sizeof(FPTYPE) * host.size())); } template -void malloc_device_memory(FPTYPE *&device, const int size) { - DPErrcheck(hipMalloc((void **)&device, sizeof(FPTYPE) * size)); +void malloc_device_memory(FPTYPE*& device, const int size) { + DPErrcheck(hipMalloc((void**)&device, sizeof(FPTYPE) * size)); } template -void malloc_device_memory_sync(FPTYPE *&device, std::vector &host) { - DPErrcheck(hipMalloc((void **)&device, sizeof(FPTYPE) * host.size())); +void malloc_device_memory_sync(FPTYPE*& device, std::vector& host) { + DPErrcheck(hipMalloc((void**)&device, sizeof(FPTYPE) * host.size())); memcpy_host_to_device(device, host); } template -void malloc_device_memory_sync(FPTYPE *&device, - const FPTYPE *host, +void malloc_device_memory_sync(FPTYPE*& device, + const FPTYPE* host, const int size) { - DPErrcheck(hipMalloc((void **)&device, sizeof(FPTYPE) * size)); + DPErrcheck(hipMalloc((void**)&device, sizeof(FPTYPE) * size)); 
memcpy_host_to_device(device, host, size); } template -void delete_device_memory(FPTYPE *&device) { +void delete_device_memory(FPTYPE*& device) { if (device != NULL) { DPErrcheck(hipFree(device)); } } template -void memset_device_memory(FPTYPE *device, const int var, const int size) { +void memset_device_memory(FPTYPE* device, const int var, const int size) { DPErrcheck(hipMemset(device, var, sizeof(FPTYPE) * size)); } } // namespace deepmd diff --git a/source/lib/include/pairwise.h b/source/lib/include/pairwise.h index bbb4119e59..f711bd6f88 100644 --- a/source/lib/include/pairwise.h +++ b/source/lib/include/pairwise.h @@ -10,8 +10,8 @@ namespace deepmd { * @param[in] idxs The indexes of the fragment that each atom belongs to. -1 * will be ignored. */ -void group_atoms_cpu(std::vector> &fragments, - const std::vector &idxs); +void group_atoms_cpu(std::vector>& fragments, + const std::vector& idxs); /** * DPRc pairwise map. * @@ -30,15 +30,15 @@ void group_atoms_cpu(std::vector> &fragments, * @param[in] nloc The number of local atoms. * @param[in] nall The number of all atoms, including local and ghost atoms. 
*/ -void dprc_pairwise_map_cpu(std::vector &forward_qm_map, - std::vector &backward_qm_map, - std::vector &forward_qmmm_map, - std::vector &backward_qmmm_map, - int &nloc_qm, - int &nloc_qmmm, - int &nall_qm, - int &nall_qmmm, - const std::vector> &fragments, +void dprc_pairwise_map_cpu(std::vector& forward_qm_map, + std::vector& backward_qm_map, + std::vector& forward_qmmm_map, + std::vector& backward_qmmm_map, + int& nloc_qm, + int& nloc_qmmm, + int& nall_qm, + int& nall_qmmm, + const std::vector>& fragments, const int nloc, const int nall); } // namespace deepmd diff --git a/source/lib/include/prod_env_mat.h b/source/lib/include/prod_env_mat.h index 60da638d68..d8ca4d1861 100644 --- a/source/lib/include/prod_env_mat.h +++ b/source/lib/include/prod_env_mat.h @@ -8,34 +8,34 @@ namespace deepmd { template -void prod_env_mat_a_cpu(FPTYPE *em, - FPTYPE *em_deriv, - FPTYPE *rij, - int *nlist, - const FPTYPE *coord, - const int *type, - const InputNlist &inlist, +void prod_env_mat_a_cpu(FPTYPE* em, + FPTYPE* em_deriv, + FPTYPE* rij, + int* nlist, + const FPTYPE* coord, + const int* type, + const InputNlist& inlist, const int max_nbor_size, - const FPTYPE *avg, - const FPTYPE *std, + const FPTYPE* avg, + const FPTYPE* std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector sec, - const int *f_type = NULL); + const int* f_type = NULL); template -void prod_env_mat_r_cpu(FPTYPE *em, - FPTYPE *em_deriv, - FPTYPE *rij, - int *nlist, - const FPTYPE *coord, - const int *type, - const InputNlist &inlist, +void prod_env_mat_r_cpu(FPTYPE* em, + FPTYPE* em_deriv, + FPTYPE* rij, + int* nlist, + const FPTYPE* coord, + const int* type, + const InputNlist& inlist, const int max_nbor_size, - const FPTYPE *avg, - const FPTYPE *std, + const FPTYPE* avg, + const FPTYPE* std, const int nloc, const int nall, const float rcut, @@ -44,49 +44,49 @@ void prod_env_mat_r_cpu(FPTYPE *em, #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM template -void 
prod_env_mat_a_gpu(FPTYPE *em, - FPTYPE *em_deriv, - FPTYPE *rij, - int *nlist, - const FPTYPE *coord, - const int *type, - const InputNlist &gpu_inlist, - int *array_int, - unsigned long long *array_longlong, +void prod_env_mat_a_gpu(FPTYPE* em, + FPTYPE* em_deriv, + FPTYPE* rij, + int* nlist, + const FPTYPE* coord, + const int* type, + const InputNlist& gpu_inlist, + int* array_int, + unsigned long long* array_longlong, const int max_nbor_size, - const FPTYPE *avg, - const FPTYPE *std, + const FPTYPE* avg, + const FPTYPE* std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector sec, - const int *f_type = NULL); + const int* f_type = NULL); template -void prod_env_mat_r_gpu(FPTYPE *em, - FPTYPE *em_deriv, - FPTYPE *rij, - int *nlist, - const FPTYPE *coord, - const int *type, - const InputNlist &gpu_inlist, - int *array_int, - unsigned long long *array_longlong, +void prod_env_mat_r_gpu(FPTYPE* em, + FPTYPE* em_deriv, + FPTYPE* rij, + int* nlist, + const FPTYPE* coord, + const int* type, + const InputNlist& gpu_inlist, + int* array_int, + unsigned long long* array_longlong, const int max_nbor_size, - const FPTYPE *avg, - const FPTYPE *std, + const FPTYPE* avg, + const FPTYPE* std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector sec); -void env_mat_nbor_update(InputNlist &inlist, - InputNlist &gpu_inlist, - int &max_nbor_size, - int *&nbor_list_dev, - const int *mesh, +void env_mat_nbor_update(InputNlist& inlist, + InputNlist& gpu_inlist, + int& max_nbor_size, + int*& nbor_list_dev, + const int* mesh, const int size); #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/source/lib/include/region.cuh b/source/lib/include/region.cuh index 0feafad49e..6dc71861f1 100644 --- a/source/lib/include/region.cuh +++ b/source/lib/include/region.cuh @@ -1,9 +1,9 @@ #pragma once template -__device__ inline void tensorDotVector(FPTYPE *o_v, - const FPTYPE *i_v, - const FPTYPE *i_t) { +__device__ 
inline void tensorDotVector(FPTYPE* o_v, + const FPTYPE* i_v, + const FPTYPE* i_t) { o_v[0] = i_v[0] * i_t[0 * 3 + 0] + i_v[1] * i_t[0 * 3 + 1] + i_v[2] * i_t[0 * 3 + 2]; o_v[1] = i_v[0] * i_t[1 * 3 + 0] + i_v[1] * i_t[1 * 3 + 1] + @@ -12,9 +12,9 @@ __device__ inline void tensorDotVector(FPTYPE *o_v, i_v[2] * i_t[2 * 3 + 2]; } template -__device__ inline void tensorTransDotVector(FPTYPE *o_v, - const FPTYPE *i_v, - const FPTYPE *i_t) { +__device__ inline void tensorTransDotVector(FPTYPE* o_v, + const FPTYPE* i_v, + const FPTYPE* i_t) { o_v[0] = i_v[0] * i_t[0 * 3 + 0] + i_v[1] * i_t[1 * 3 + 0] + i_v[2] * i_t[2 * 3 + 0]; o_v[1] = i_v[0] * i_t[0 * 3 + 1] + i_v[1] * i_t[1 * 3 + 1] + @@ -23,19 +23,19 @@ __device__ inline void tensorTransDotVector(FPTYPE *o_v, i_v[2] * i_t[2 * 3 + 2]; } template -__device__ inline void phys2Inter(FPTYPE *inter, - const FPTYPE *phys, - const FPTYPE *rec_boxt) { +__device__ inline void phys2Inter(FPTYPE* inter, + const FPTYPE* phys, + const FPTYPE* rec_boxt) { tensorDotVector(inter, phys, rec_boxt); } template -__device__ inline void inter2Phys(FPTYPE *phys, - const FPTYPE *inter, - const FPTYPE *boxt) { +__device__ inline void inter2Phys(FPTYPE* phys, + const FPTYPE* inter, + const FPTYPE* boxt) { tensorTransDotVector(phys, inter, boxt); } template -__device__ inline FPTYPE compute_volume(const FPTYPE *boxt) { +__device__ inline FPTYPE compute_volume(const FPTYPE* boxt) { FPTYPE volume = boxt[0 * 3 + 0] * (boxt[1 * 3 + 1] * boxt[2 * 3 + 2] - boxt[2 * 3 + 1] * boxt[1 * 3 + 2]) - boxt[0 * 3 + 1] * (boxt[1 * 3 + 0] * boxt[2 * 3 + 2] - diff --git a/source/lib/src/fmt_nlist.cc b/source/lib/src/fmt_nlist.cc index 2bf3e78e99..3965585cf8 100644 --- a/source/lib/src/fmt_nlist.cc +++ b/source/lib/src/fmt_nlist.cc @@ -18,26 +18,26 @@ struct NeighborInfo { int index; NeighborInfo() : type(0), dist(0), index(0) {} NeighborInfo(int tt, FPTYPE dd, int ii) : type(tt), dist(dd), index(ii) {} - bool operator<(const NeighborInfo &b) const { + bool 
operator<(const NeighborInfo& b) const { return (type < b.type || (type == b.type && (dist < b.dist || (dist == b.dist && index < b.index)))); } }; -int format_nlist_i_fill_a(std::vector &fmt_nei_idx_a, - std::vector &fmt_nei_idx_r, - const std::vector &posi, - const int &ntypes, - const std::vector &type, - const SimulationRegion ®ion, - const bool &b_pbc, - const int &i_idx, - const std::vector &nei_idx_a, - const std::vector &nei_idx_r, - const double &rcut, - const std::vector &sec_a, - const std::vector &sec_r) { +int format_nlist_i_fill_a(std::vector& fmt_nei_idx_a, + std::vector& fmt_nei_idx_r, + const std::vector& posi, + const int& ntypes, + const std::vector& type, + const SimulationRegion& region, + const bool& b_pbc, + const int& i_idx, + const std::vector& nei_idx_a, + const std::vector& nei_idx_r, + const double& rcut, + const std::vector& sec_a, + const std::vector& sec_r) { #ifdef DEBUG assert(sec_a.size() == ntypes + 1); assert(sec_r.size() == ntypes + 1); @@ -57,7 +57,7 @@ int format_nlist_i_fill_a(std::vector &fmt_nei_idx_a, sel_nei.reserve(nei_idx_a.size() + nei_idx_r.size()); for (unsigned kk = 0; kk < nei_idx.size(); ++kk) { double diff[3]; - const int &j_idx = nei_idx[kk]; + const int& j_idx = nei_idx[kk]; if (b_pbc) { region.diffNearestNeighbor(posi[j_idx * 3 + 0], posi[j_idx * 3 + 1], posi[j_idx * 3 + 2], posi[i_idx * 3 + 0], @@ -78,7 +78,7 @@ int format_nlist_i_fill_a(std::vector &fmt_nei_idx_a, std::vector nei_iter = sec_a; int overflowed = -1; for (unsigned kk = 0; kk < sel_nei.size(); ++kk) { - const int &nei_type = sel_nei[kk].type; + const int& nei_type = sel_nei[kk].type; if (nei_iter[nei_type] >= sec_a[nei_type + 1]) { int r_idx_iter = (nei_iter[nei_type]++) - sec_a[nei_type + 1] + sec_r[nei_type]; @@ -96,13 +96,13 @@ int format_nlist_i_fill_a(std::vector &fmt_nei_idx_a, } template -int format_nlist_i_cpu(std::vector &fmt_nei_idx_a, - const std::vector &posi, - const std::vector &type, - const int &i_idx, - const std::vector 
&nei_idx_a, - const float &rcut, - const std::vector &sec_a) { +int format_nlist_i_cpu(std::vector& fmt_nei_idx_a, + const std::vector& posi, + const std::vector& type, + const int& i_idx, + const std::vector& nei_idx_a, + const float& rcut, + const std::vector& sec_a) { fmt_nei_idx_a.resize(sec_a.back()); fill(fmt_nei_idx_a.begin(), fmt_nei_idx_a.end(), -1); @@ -115,7 +115,7 @@ int format_nlist_i_cpu(std::vector &fmt_nei_idx_a, for (unsigned kk = 0; kk < nei_idx.size(); ++kk) { // rcut is float in this function, so float rr is enough float diff[3]; - const int &j_idx = nei_idx[kk]; + const int& j_idx = nei_idx[kk]; if (type[j_idx] < 0) { continue; } @@ -132,7 +132,7 @@ int format_nlist_i_cpu(std::vector &fmt_nei_idx_a, std::vector nei_iter = sec_a; int overflowed = -1; for (unsigned kk = 0; kk < sel_nei.size(); ++kk) { - const int &nei_type = sel_nei[kk].type; + const int& nei_type = sel_nei[kk].type; if (nei_iter[nei_type] < sec_a[nei_type + 1]) { fmt_nei_idx_a[nei_iter[nei_type]++] = sel_nei[kk].index; } else { @@ -143,10 +143,10 @@ int format_nlist_i_cpu(std::vector &fmt_nei_idx_a, } template -void deepmd::format_nlist_cpu(int *nlist, - const InputNlist &in_nlist, - const FPTYPE *coord, - const int *type, +void deepmd::format_nlist_cpu(int* nlist, + const InputNlist& in_nlist, + const FPTYPE* coord, + const int* type, const int nloc, const int nall, const float rcut, @@ -165,7 +165,7 @@ void deepmd::format_nlist_cpu(int *nlist, std::copy(in_nlist.firstneigh[ii], in_nlist.firstneigh[ii] + i_num, ilist.begin()); format_nlist_i_cpu(fmt_ilist, posi_, type_, i_idx, ilist, rcut, sec); - int *cur_nlist = nlist + i_idx * nnei; + int* cur_nlist = nlist + i_idx * nnei; if (fmt_ilist.size() != nnei) { std::cerr << "FATAL: formatted nlist of i have length " << fmt_ilist.size() << " which does not match " << nnei @@ -176,37 +176,37 @@ void deepmd::format_nlist_cpu(int *nlist, } } -template int format_nlist_i_cpu(std::vector &fmt_nei_idx_a, - const std::vector &posi, - const 
std::vector &type, - const int &i_idx, - const std::vector &nei_idx_a, - const float &rcut, - const std::vector &sec_a); - -template int format_nlist_i_cpu(std::vector &fmt_nei_idx_a, - const std::vector &posi, - const std::vector &type, - const int &i_idx, - const std::vector &nei_idx_a, - const float &rcut, - const std::vector &sec_a); +template int format_nlist_i_cpu(std::vector& fmt_nei_idx_a, + const std::vector& posi, + const std::vector& type, + const int& i_idx, + const std::vector& nei_idx_a, + const float& rcut, + const std::vector& sec_a); + +template int format_nlist_i_cpu(std::vector& fmt_nei_idx_a, + const std::vector& posi, + const std::vector& type, + const int& i_idx, + const std::vector& nei_idx_a, + const float& rcut, + const std::vector& sec_a); template void deepmd::format_nlist_cpu( - int *nlist, - const deepmd::InputNlist &in_nlist, - const double *coord, - const int *type, + int* nlist, + const deepmd::InputNlist& in_nlist, + const double* coord, + const int* type, const int nloc, const int nall, const float rcut, const std::vector sec); template void deepmd::format_nlist_cpu( - int *nlist, - const deepmd::InputNlist &in_nlist, - const float *coord, - const int *type, + int* nlist, + const deepmd::InputNlist& in_nlist, + const float* coord, + const int* type, const int nloc, const int nall, const float rcut, diff --git a/source/lib/src/gpu/coord.cu b/source/lib/src/gpu/coord.cu index 52ec9ff09d..5030f67caf 100644 --- a/source/lib/src/gpu/coord.cu +++ b/source/lib/src/gpu/coord.cu @@ -2,22 +2,22 @@ #include "device.h" #include "region.cuh" -__device__ inline int collapse_index(const int *idx, const int *size) { +__device__ inline int collapse_index(const int* idx, const int* size) { return (idx[0] * size[1] + idx[1]) * size[2] + idx[2]; } __device__ inline void index_recover(const int in_idx, - const int *size, - int *idx) { + const int* size, + int* idx) { idx[2] = in_idx % size[2]; idx[1] = int(in_idx / size[2]) % size[1]; idx[0] = 
int(int(in_idx / size[2]) / size[1]); } -__device__ inline void idx_addshift(int *idx, const int *shift) { +__device__ inline void idx_addshift(int* idx, const int* shift) { for (int dd = 0; dd < 3; dd++) { idx[dd] += shift[dd]; } } -__device__ inline void idx_unshift(int *idx, const int *shift) { +__device__ inline void idx_unshift(int* idx, const int* shift) { for (int dd = 0; dd < 3; dd++) { idx[dd] -= shift[dd]; } @@ -42,9 +42,9 @@ __device__ inline double _fmod(double x, double y) { return fmod(x, y); } __device__ inline float _fmod(float x, float y) { return fmodf(x, y); } template -__global__ void normalize_one(FPTYPE *out_c, - const FPTYPE *boxt, - const FPTYPE *rec_boxt, +__global__ void normalize_one(FPTYPE* out_c, + const FPTYPE* boxt, + const FPTYPE* rec_boxt, const int nall) { // <<>> int idy = blockIdx.x * blockDim.x + threadIdx.x; @@ -63,14 +63,14 @@ __global__ void normalize_one(FPTYPE *out_c, } template -__global__ void _fill_idx_cellmap(int *idx_cellmap, - int *idx_cellmap_noshift, - const FPTYPE *in_c, - const FPTYPE *rec_boxt, - const int *nat_stt, - const int *nat_end, - const int *ext_stt, - const int *ext_end, +__global__ void _fill_idx_cellmap(int* idx_cellmap, + int* idx_cellmap_noshift, + const FPTYPE* in_c, + const FPTYPE* rec_boxt, + const int* nat_stt, + const int* nat_end, + const int* ext_stt, + const int* ext_end, const int nloc) { int idy = blockIdx.x * blockDim.x + threadIdx.x; int ext_ncell[3]; @@ -107,9 +107,9 @@ __global__ void _fill_idx_cellmap(int *idx_cellmap, } } -__global__ void _fill_loc_cellnum_map(int *temp_idx_order, - int *loc_cellnum_map, - const int *idx_cellmap_noshift, +__global__ void _fill_loc_cellnum_map(int* temp_idx_order, + int* loc_cellnum_map, + const int* idx_cellmap_noshift, const int nloc, const int loc_cellnum) { int idy = blockIdx.x * blockDim.x + threadIdx.x; @@ -125,15 +125,15 @@ __global__ void _fill_loc_cellnum_map(int *temp_idx_order, } } -__global__ void _fill_total_cellnum_map(int 
*total_cellnum_map, - int *mask_cellnum_map, - int *cell_map, - int *cell_shift_map, - const int *nat_stt, - const int *nat_end, - const int *ext_stt, - const int *ext_end, - const int *loc_cellnum_map, +__global__ void _fill_total_cellnum_map(int* total_cellnum_map, + int* mask_cellnum_map, + int* cell_map, + int* cell_shift_map, + const int* nat_stt, + const int* nat_end, + const int* ext_stt, + const int* ext_end, + const int* loc_cellnum_map, const int total_cellnum) { int idy = blockIdx.x * blockDim.x + threadIdx.x; int ext_ncell[3]; @@ -145,7 +145,7 @@ __global__ void _fill_total_cellnum_map(int *total_cellnum_map, idx_orig_shift[dd] = nat_stt[dd] - ext_stt[dd]; } if (idy < total_cellnum) { - int *shift = cell_shift_map + idy * 3; + int* shift = cell_shift_map + idy * 3; int idx[3]; index_recover(idy, ext_ncell, idx); idx_unshift(idx, idx_orig_shift); @@ -169,36 +169,36 @@ __global__ void _fill_total_cellnum_map(int *total_cellnum_map, } } -__global__ void _build_loc_clist(int *clist, - const int *idx_cellmap, - const int *idx_order, - const int *sec_num_map, +__global__ void _build_loc_clist(int* clist, + const int* idx_cellmap, + const int* idx_order, + const int* sec_num_map, const int nloc) { int idy = blockIdx.x * blockDim.x + threadIdx.x; if (idy >= nloc) { return; } int cell_idx = idx_cellmap[idy]; - int *clist_row = clist + sec_num_map[cell_idx]; + int* clist_row = clist + sec_num_map[cell_idx]; clist_row[idx_order[idy]] = idy; } template -__global__ void _copy_coord(FPTYPE *out_c, - int *out_t, - int *mapping, - const FPTYPE *in_c, - const int *in_t, - const int *cell_map, - const int *cell_shift_map, - const int *sec_loc_cellnum_map, - const int *sec_total_cellnum_map, - const int *loc_clist, +__global__ void _copy_coord(FPTYPE* out_c, + int* out_t, + int* mapping, + const FPTYPE* in_c, + const int* in_t, + const int* cell_map, + const int* cell_shift_map, + const int* sec_loc_cellnum_map, + const int* sec_total_cellnum_map, + const int* loc_clist, 
const int nloc, const int nall, const int total_cellnum, - const FPTYPE *boxt, - const FPTYPE *rec_boxt) { + const FPTYPE* boxt, + const FPTYPE* rec_boxt) { int idy = blockIdx.x * blockDim.x + threadIdx.x; if (idy >= nall) { return; @@ -241,26 +241,26 @@ __global__ void _copy_coord(FPTYPE *out_c, } template -void compute_int_data(int *int_data, - const FPTYPE *in_c, - const int *cell_info, - const deepmd::Region ®ion, +void compute_int_data(int* int_data, + const FPTYPE* in_c, + const int* cell_info, + const deepmd::Region& region, const int nloc, const int loc_cellnum, const int total_cellnum) { - int *idx_cellmap = int_data; - int *idx_cellmap_noshift = idx_cellmap + nloc; - int *temp_idx_order = idx_cellmap_noshift + nloc; - int *loc_cellnum_map = temp_idx_order + nloc; - int *total_cellnum_map = loc_cellnum_map + loc_cellnum; - int *mask_cellnum_map = total_cellnum_map + total_cellnum; - int *cell_map = mask_cellnum_map + total_cellnum; - int *cell_shift_map = cell_map + total_cellnum; - const int *nat_stt = cell_info; - const int *nat_end = cell_info + 3; - const int *ext_stt = cell_info + 6; - const int *ext_end = cell_info + 9; - const FPTYPE *rec_boxt = region.rec_boxt; + int* idx_cellmap = int_data; + int* idx_cellmap_noshift = idx_cellmap + nloc; + int* temp_idx_order = idx_cellmap_noshift + nloc; + int* loc_cellnum_map = temp_idx_order + nloc; + int* total_cellnum_map = loc_cellnum_map + loc_cellnum; + int* mask_cellnum_map = total_cellnum_map + total_cellnum; + int* cell_map = mask_cellnum_map + total_cellnum; + int* cell_shift_map = cell_map + total_cellnum; + const int* nat_stt = cell_info; + const int* nat_end = cell_info + 3; + const int* ext_stt = cell_info + 6; + const int* ext_end = cell_info + 9; + const FPTYPE* rec_boxt = region.rec_boxt; const int nblock_loc = (nloc + TPB - 1) / TPB; _fill_idx_cellmap<<>>(idx_cellmap, idx_cellmap_noshift, in_c, @@ -283,17 +283,17 @@ void compute_int_data(int *int_data, DPErrcheck(gpuDeviceSynchronize()); } 
-void build_loc_clist(int *int_data, +void build_loc_clist(int* int_data, const int nloc, const int loc_cellnum, const int total_cellnum) { const int nblock = (nloc + TPB - 1) / TPB; - const int *idx_cellmap_noshift = int_data + nloc; - const int *temp_idx_order = idx_cellmap_noshift + nloc; - const int *sec_loc_cellnum_map = temp_idx_order + nloc + loc_cellnum + + const int* idx_cellmap_noshift = int_data + nloc; + const int* temp_idx_order = idx_cellmap_noshift + nloc; + const int* sec_loc_cellnum_map = temp_idx_order + nloc + loc_cellnum + 2 * total_cellnum + total_cellnum + 3 * total_cellnum; - int *loc_clist = int_data + nloc * 3 + loc_cellnum + total_cellnum * 3 + + int* loc_clist = int_data + nloc * 3 + loc_cellnum + total_cellnum * 3 + total_cellnum * 3 + loc_cellnum + 1 + total_cellnum + 1; _build_loc_clist<<>>(loc_clist, idx_cellmap_noshift, temp_idx_order, sec_loc_cellnum_map, nloc); @@ -302,26 +302,26 @@ void build_loc_clist(int *int_data, } template -void copy_coord(FPTYPE *out_c, - int *out_t, - int *mapping, - const int *int_data, - const FPTYPE *in_c, - const int *in_t, +void copy_coord(FPTYPE* out_c, + int* out_t, + int* mapping, + const int* int_data, + const FPTYPE* in_c, + const int* in_t, const int nloc, const int nall, const int loc_cellnum, const int total_cellnum, - const deepmd::Region ®ion) { + const deepmd::Region& region) { const int nblock = (nall + TPB - 1) / TPB; - const int *cell_map = int_data + 3 * nloc + loc_cellnum + 2 * total_cellnum; - const int *cell_shift_map = cell_map + total_cellnum; - const int *sec_loc_cellnum_map = cell_shift_map + 3 * total_cellnum; - const int *sec_total_cellnum_map = sec_loc_cellnum_map + loc_cellnum + 1; - const int *loc_clist = sec_total_cellnum_map + total_cellnum + 1; + const int* cell_map = int_data + 3 * nloc + loc_cellnum + 2 * total_cellnum; + const int* cell_shift_map = cell_map + total_cellnum; + const int* sec_loc_cellnum_map = cell_shift_map + 3 * total_cellnum; + const int* 
sec_total_cellnum_map = sec_loc_cellnum_map + loc_cellnum + 1; + const int* loc_clist = sec_total_cellnum_map + total_cellnum + 1; - const FPTYPE *boxt = region.boxt; - const FPTYPE *rec_boxt = region.rec_boxt; + const FPTYPE* boxt = region.boxt; + const FPTYPE* rec_boxt = region.rec_boxt; _copy_coord<<>>(out_c, out_t, mapping, in_c, in_t, cell_map, cell_shift_map, sec_loc_cellnum_map, sec_total_cellnum_map, loc_clist, nloc, nall, @@ -332,13 +332,13 @@ void copy_coord(FPTYPE *out_c, namespace deepmd { template -void normalize_coord_gpu(FPTYPE *coord, +void normalize_coord_gpu(FPTYPE* coord, const int natom, - const Region ®ion) { + const Region& region) { DPErrcheck(gpuGetLastError()); DPErrcheck(gpuDeviceSynchronize()); - const FPTYPE *boxt = region.boxt; - const FPTYPE *rec_boxt = region.rec_boxt; + const FPTYPE* boxt = region.boxt; + const FPTYPE* rec_boxt = region.rec_boxt; const int nblock = (natom + TPB - 1) / TPB; normalize_one<<>>(coord, boxt, rec_boxt, natom); DPErrcheck(gpuGetLastError()); @@ -349,35 +349,35 @@ void normalize_coord_gpu(FPTYPE *coord, // memory):idx_map,idx_map_noshift,temp_idx_order,loc_cellnum_map,total_cellnum_map,mask_cellnum_map, // cell_map,cell_shift_map,sec_loc_cellnum_map,sec_total_cellnum_map,loc_clist template -int copy_coord_gpu(FPTYPE *out_c, - int *out_t, - int *mapping, - int *nall, - int *int_data, - const FPTYPE *in_c, - const int *in_t, - const int &nloc, - const int &mem_nall, - const int &loc_cellnum, - const int &total_cellnum, - const int *cell_info, - const Region ®ion) { +int copy_coord_gpu(FPTYPE* out_c, + int* out_t, + int* mapping, + int* nall, + int* int_data, + const FPTYPE* in_c, + const int* in_t, + const int& nloc, + const int& mem_nall, + const int& loc_cellnum, + const int& total_cellnum, + const int* cell_info, + const Region& region) { DPErrcheck(gpuGetLastError()); DPErrcheck(gpuDeviceSynchronize()); compute_int_data(int_data, in_c, cell_info, region, nloc, loc_cellnum, total_cellnum); - int 
*int_data_cpu = new int + int* int_data_cpu = new int [loc_cellnum + 2 * total_cellnum + loc_cellnum + 1 + total_cellnum + 1]; // loc_cellnum_map,total_cellnum_map,mask_cellnum_map,sec_loc_cellnum_map,sec_total_cellnum_map DPErrcheck(gpuMemcpy(int_data_cpu, int_data + 3 * nloc, sizeof(int) * (loc_cellnum + 2 * total_cellnum), gpuMemcpyDeviceToHost)); DPErrcheck(gpuGetLastError()); - int *loc_cellnum_map = int_data_cpu; - int *total_cellnum_map = loc_cellnum_map + loc_cellnum; - int *mask_cellnum_map = total_cellnum_map + total_cellnum; - int *sec_loc_cellnum_map = mask_cellnum_map + total_cellnum; - int *sec_total_cellnum_map = sec_loc_cellnum_map + loc_cellnum + 1; + int* loc_cellnum_map = int_data_cpu; + int* total_cellnum_map = loc_cellnum_map + loc_cellnum; + int* mask_cellnum_map = total_cellnum_map + total_cellnum; + int* sec_loc_cellnum_map = mask_cellnum_map + total_cellnum; + int* sec_total_cellnum_map = sec_loc_cellnum_map + loc_cellnum + 1; sec_loc_cellnum_map[0] = 0; sec_total_cellnum_map[0] = nloc; int max_cell = 0; @@ -412,36 +412,36 @@ int copy_coord_gpu(FPTYPE *out_c, return 0; } -template void normalize_coord_gpu(float *coord, +template void normalize_coord_gpu(float* coord, const int natom, - const Region ®ion); -template void normalize_coord_gpu(double *coord, + const Region& region); +template void normalize_coord_gpu(double* coord, const int natom, - const Region ®ion); -template int copy_coord_gpu(float *out_c, - int *out_t, - int *mapping, - int *nall, - int *int_data, - const float *in_c, - const int *in_t, - const int &nloc, - const int &mem_nall, - const int &loc_cellnum, - const int &total_cellnum, - const int *cell_info, - const Region ®ion); -template int copy_coord_gpu(double *out_c, - int *out_t, - int *mapping, - int *nall, - int *int_data, - const double *in_c, - const int *in_t, - const int &nloc, - const int &mem_nall, - const int &loc_cellnum, - const int &total_cellnum, - const int *cell_info, - const Region ®ion); + const 
Region& region); +template int copy_coord_gpu(float* out_c, + int* out_t, + int* mapping, + int* nall, + int* int_data, + const float* in_c, + const int* in_t, + const int& nloc, + const int& mem_nall, + const int& loc_cellnum, + const int& total_cellnum, + const int* cell_info, + const Region& region); +template int copy_coord_gpu(double* out_c, + int* out_t, + int* mapping, + int* nall, + int* int_data, + const double* in_c, + const int* in_t, + const int& nloc, + const int& mem_nall, + const int& loc_cellnum, + const int& total_cellnum, + const int* cell_info, + const Region& region); } // namespace deepmd diff --git a/source/lib/src/gpu/cudart/cudart_stub.cc b/source/lib/src/gpu/cudart/cudart_stub.cc index cfbabd6f5e..222cdeb942 100644 --- a/source/lib/src/gpu/cudart/cudart_stub.cc +++ b/source/lib/src/gpu/cudart/cudart_stub.cc @@ -16,12 +16,12 @@ static cudaError_t DP_CudartGetSymbolNotFoundError() { return cudaErrorSharedObjectSymbolNotFound; } -void *DP_cudart_dlopen(char *libname) { - static auto handle = [](std::string libname) -> void * { +void* DP_cudart_dlopen(char* libname) { + static auto handle = [](std::string libname) -> void* { #if defined(_WIN32) - void *dso_handle = LoadLibrary(libname.c_str()); + void* dso_handle = LoadLibrary(libname.c_str()); #else - void *dso_handle = dlopen(libname.c_str(), RTLD_NOW | RTLD_LOCAL); + void* dso_handle = dlopen(libname.c_str(), RTLD_NOW | RTLD_LOCAL); #endif if (!dso_handle) { std::cerr << "DeePMD-kit: Cannot find " << libname << std::endl; @@ -37,15 +37,15 @@ void *DP_cudart_dlopen(char *libname) { return handle; } -void *DP_cudart_dlsym(void *handle, const char *sym_name) { +void* DP_cudart_dlsym(void* handle, const char* sym_name) { // check if the handle is nullptr, if so, return a function that // returns cudaErrorSharedObjectSymbolNotFound if (!handle) { - return reinterpret_cast(&DP_CudartGetSymbolNotFoundError); + return reinterpret_cast(&DP_CudartGetSymbolNotFoundError); } - void *symbol = 
dlsym(handle, sym_name); + void* symbol = dlsym(handle, sym_name); if (!symbol) { - return reinterpret_cast(&DP_CudartGetSymbolNotFoundError); + return reinterpret_cast(&DP_CudartGetSymbolNotFoundError); } return symbol; }; diff --git a/source/lib/src/gpu/neighbor_list.cu b/source/lib/src/gpu/neighbor_list.cu index fc4e784915..70bc406f5a 100644 --- a/source/lib/src/gpu/neighbor_list.cu +++ b/source/lib/src/gpu/neighbor_list.cu @@ -28,9 +28,9 @@ struct parallel_prefix_scan_op { }; template -__global__ void parallel_prefix_scan(int *numneigh, - int *nei_order, - const int *temp_nlist, +__global__ void parallel_prefix_scan(int* numneigh, + int* nei_order, + const int* temp_nlist, const int mem_size, const int nloc, const int nall) { @@ -67,14 +67,14 @@ __global__ void parallel_prefix_scan(int *numneigh, } template -__device__ inline FPTYPE dev_dot(FPTYPE *arr1, FPTYPE *arr2) { +__device__ inline FPTYPE dev_dot(FPTYPE* arr1, FPTYPE* arr2) { return arr1[0] * arr2[0] + arr1[1] * arr2[1] + arr1[2] * arr2[2]; } template -__global__ void build_nlist(int *ilist, - int *temp_nlist, - const FPTYPE *c_cpy, +__global__ void build_nlist(int* ilist, + int* temp_nlist, + const FPTYPE* c_cpy, const FPTYPE rcut2, const int nloc, const int nall, @@ -82,12 +82,12 @@ __global__ void build_nlist(int *ilist, const unsigned int atom_idx = blockIdx.x; const unsigned int neighbor_idx = blockIdx.y * blockDim.y + threadIdx.y; if (neighbor_idx < nall) { - int *neighbor_row = temp_nlist + atom_idx * mem_size; + int* neighbor_row = temp_nlist + atom_idx * mem_size; if (neighbor_idx == atom_idx) { ilist[atom_idx] = atom_idx; } else { - const FPTYPE *ccoord = c_cpy + atom_idx * 3; - const FPTYPE *ncoord = c_cpy + neighbor_idx * 3; + const FPTYPE* ccoord = c_cpy + atom_idx * 3; + const FPTYPE* ncoord = c_cpy + neighbor_idx * 3; FPTYPE diff[3]; for (int kk = 0; kk < 3; kk++) { diff[kk] = ccoord[kk] - ncoord[kk]; @@ -100,16 +100,16 @@ __global__ void build_nlist(int *ilist, } } -__global__ void 
fill_nlist(int **firstneigh, - const int *temp_nlist, - const int *nei_order, +__global__ void fill_nlist(int** firstneigh, + const int* temp_nlist, + const int* nei_order, const int mem_size, const int nall) { const unsigned int atom_idx = blockIdx.x; const unsigned int neighbor_idx = blockIdx.y * blockDim.y + threadIdx.y; if (neighbor_idx < nall) { - const int *in_row = temp_nlist + atom_idx * mem_size; - int *out_row = firstneigh[atom_idx]; + const int* in_row = temp_nlist + atom_idx * mem_size; + int* out_row = firstneigh[atom_idx]; int nei = in_row[neighbor_idx]; if (nei != -1) { out_row[nei_order[atom_idx * mem_size + neighbor_idx]] = nei; @@ -117,8 +117,8 @@ __global__ void fill_nlist(int **firstneigh, } } -__global__ void map_nlist(int *nlist, - const int *nlist_map, +__global__ void map_nlist(int* nlist, + const int* nlist_map, const int nloc, const int nnei) { int atom_idx = blockIdx.x; @@ -133,11 +133,11 @@ __global__ void map_nlist(int *nlist, } } -__global__ void map_nei_info(int *nlist, - int *ntype, - bool *nmask, - const int *type, - const int *nlist_map, +__global__ void map_nei_info(int* nlist, + int* ntype, + bool* nmask, + const int* type, + const int* nlist_map, const int nloc, const int nnei, const int ntypes) { @@ -159,10 +159,10 @@ __global__ void map_nei_info(int *nlist, } } -__global__ void map_nei_info_noconvert(int *nlist, - int *ntype, - bool *nmask, - const int *type, +__global__ void map_nei_info_noconvert(int* nlist, + int* ntype, + bool* nmask, + const int* type, const int nloc, const int nnei, const int ntypes) { @@ -183,26 +183,26 @@ __global__ void map_nei_info_noconvert(int *nlist, namespace deepmd { template -int build_nlist_gpu(InputNlist &nlist, - int *max_list_size, - int *nlist_data, - const FPTYPE *c_cpy, - const int &nloc, - const int &nall, - const int &mem_size, - const float &rcut) { +int build_nlist_gpu(InputNlist& nlist, + int* max_list_size, + int* nlist_data, + const FPTYPE* c_cpy, + const int& nloc, + const int& 
nall, + const int& mem_size, + const float& rcut) { if (mem_size < nall) { return 1; } DPErrcheck(gpuGetLastError()); DPErrcheck(gpuDeviceSynchronize()); const int nblock = (nall + TPB - 1) / TPB; - int *ilist = nlist.ilist; - int *numneigh = nlist.numneigh; - int **firstneigh = nlist.firstneigh; + int* ilist = nlist.ilist; + int* numneigh = nlist.numneigh; + int** firstneigh = nlist.firstneigh; DPErrcheck(gpuMemset(nlist_data, -1, sizeof(int) * 2 * nloc * mem_size)); - int *temp_nlist = nlist_data; // nloc*mem_size - int *nei_order = temp_nlist + nloc * mem_size; + int* temp_nlist = nlist_data; // nloc*mem_size + int* nei_order = temp_nlist + nloc * mem_size; nlist.inum = nloc; FPTYPE rcut2 = rcut * rcut; @@ -220,7 +220,7 @@ int build_nlist_gpu(InputNlist &nlist, mem_size, nall); DPErrcheck(gpuGetLastError()); DPErrcheck(gpuDeviceSynchronize()); - int *numneigh_host = new int[nloc]; + int* numneigh_host = new int[nloc]; DPErrcheck(gpuMemcpy(numneigh_host, numneigh, sizeof(int) * nloc, gpuMemcpyDeviceToHost)); int max_nei = 0; @@ -234,8 +234,8 @@ int build_nlist_gpu(InputNlist &nlist, return 0; } -void use_nlist_map(int *nlist, - const int *nlist_map, +void use_nlist_map(int* nlist, + const int* nlist_map, const int nloc, const int nnei) { DPErrcheck(gpuGetLastError()); @@ -248,11 +248,11 @@ void use_nlist_map(int *nlist, DPErrcheck(gpuDeviceSynchronize()); } -void use_nei_info_gpu(int *nlist, - int *ntype, - bool *nmask, - const int *type, - const int *nlist_map, +void use_nei_info_gpu(int* nlist, + int* ntype, + bool* nmask, + const int* type, + const int* nlist_map, const int nloc, const int nnei, const int ntypes, @@ -275,25 +275,25 @@ void use_nei_info_gpu(int *nlist, DPErrcheck(gpuDeviceSynchronize()); } -template int build_nlist_gpu(InputNlist &nlist, - int *max_list_size, - int *nlist_data, - const float *c_cpy, - const int &nloc, - const int &nall, - const int &mem_size, - const float &rcut); -template int build_nlist_gpu(InputNlist &nlist, - int 
*max_list_size, - int *nlist_data, - const double *c_cpy, - const int &nloc, - const int &nall, - const int &mem_size, - const float &rcut); +template int build_nlist_gpu(InputNlist& nlist, + int* max_list_size, + int* nlist_data, + const float* c_cpy, + const int& nloc, + const int& nall, + const int& mem_size, + const float& rcut); +template int build_nlist_gpu(InputNlist& nlist, + int* max_list_size, + int* nlist_data, + const double* c_cpy, + const int& nloc, + const int& nall, + const int& mem_size, + const float& rcut); -__global__ void map_filter_ftype(int *ftype_out, - const int *ftype_in, +__global__ void map_filter_ftype(int* ftype_out, + const int* ftype_in, const int nloc) { int ii = blockIdx.x * blockDim.x + threadIdx.x; if (ii < nloc) { @@ -301,7 +301,7 @@ __global__ void map_filter_ftype(int *ftype_out, } } -void filter_ftype_gpu(int *ftype_out, const int *ftype_in, const int nloc) { +void filter_ftype_gpu(int* ftype_out, const int* ftype_in, const int nloc) { DPErrcheck(gpuGetLastError()); DPErrcheck(gpuDeviceSynchronize()); int nblock = (nloc + TPB - 1) / TPB; diff --git a/source/lib/src/gpu/region.cu b/source/lib/src/gpu/region.cu index 849eecfc3e..45fb8a2802 100644 --- a/source/lib/src/gpu/region.cu +++ b/source/lib/src/gpu/region.cu @@ -3,30 +3,30 @@ #include "region.h" template -__global__ void _phys2Inter(FPTYPE *inter, - const FPTYPE *phys, - const FPTYPE *rec_boxt) { +__global__ void _phys2Inter(FPTYPE* inter, + const FPTYPE* phys, + const FPTYPE* rec_boxt) { phys2Inter(inter, phys, rec_boxt); } template -__global__ void _inter2Phys(FPTYPE *phys, - const FPTYPE *inter, - const FPTYPE *boxt) { +__global__ void _inter2Phys(FPTYPE* phys, + const FPTYPE* inter, + const FPTYPE* boxt) { inter2Phys(phys, inter, boxt); } template -__global__ void _compute_volume(FPTYPE *volume, const FPTYPE *boxt) { +__global__ void _compute_volume(FPTYPE* volume, const FPTYPE* boxt) { volume[0] = compute_volume(boxt); } namespace deepmd { // only for unittest 
template -void convert_to_inter_gpu(FPTYPE *ri, - const Region ®ion, - const FPTYPE *rp) { +void convert_to_inter_gpu(FPTYPE* ri, + const Region& region, + const FPTYPE* rp) { DPErrcheck(gpuGetLastError()); DPErrcheck(gpuDeviceSynchronize()); _phys2Inter<<<1, 1>>>(ri, rp, region.rec_boxt); @@ -35,9 +35,9 @@ void convert_to_inter_gpu(FPTYPE *ri, } template -void convert_to_phys_gpu(FPTYPE *rp, - const Region ®ion, - const FPTYPE *ri) { +void convert_to_phys_gpu(FPTYPE* rp, + const Region& region, + const FPTYPE* ri) { DPErrcheck(gpuGetLastError()); DPErrcheck(gpuDeviceSynchronize()); _inter2Phys<<<1, 1>>>(rp, ri, region.boxt); @@ -46,7 +46,7 @@ void convert_to_phys_gpu(FPTYPE *rp, } template -void volume_gpu(FPTYPE *volume, const Region ®ion) { +void volume_gpu(FPTYPE* volume, const Region& region) { DPErrcheck(gpuGetLastError()); DPErrcheck(gpuDeviceSynchronize()); _compute_volume<<<1, 1>>>(volume, region.boxt); @@ -54,18 +54,18 @@ void volume_gpu(FPTYPE *volume, const Region ®ion) { DPErrcheck(gpuDeviceSynchronize()); } -template void convert_to_inter_gpu(float *ri, - const Region ®ion, - const float *rp); -template void convert_to_inter_gpu(double *ri, - const Region ®ion, - const double *rp); -template void convert_to_phys_gpu(float *rp, - const Region ®ion, - const float *ri); -template void convert_to_phys_gpu(double *rp, - const Region ®ion, - const double *ri); -template void volume_gpu(float *volume, const Region ®ion); -template void volume_gpu(double *volume, const Region ®ion); +template void convert_to_inter_gpu(float* ri, + const Region& region, + const float* rp); +template void convert_to_inter_gpu(double* ri, + const Region& region, + const double* rp); +template void convert_to_phys_gpu(float* rp, + const Region& region, + const float* ri); +template void convert_to_phys_gpu(double* rp, + const Region& region, + const double* ri); +template void volume_gpu(float* volume, const Region& region); +template void volume_gpu(double* volume, const Region& 
region); } // namespace deepmd diff --git a/source/lib/src/pairwise.cc b/source/lib/src/pairwise.cc index f5b21d9856..b4a68b00b7 100644 --- a/source/lib/src/pairwise.cc +++ b/source/lib/src/pairwise.cc @@ -8,7 +8,7 @@ #include "errors.h" template -std::vector sort_indexes(const std::vector &v) { +std::vector sort_indexes(const std::vector& v) { // https://stackoverflow.com/a/12399290/9567349 // by Lukasz Wiklendt under CC BY-SA 4.0 std::vector idx(v.size()); @@ -18,8 +18,8 @@ std::vector sort_indexes(const std::vector &v) { return idx; } -void deepmd::group_atoms_cpu(std::vector> &fragments, - const std::vector &idxs) { +void deepmd::group_atoms_cpu(std::vector>& fragments, + const std::vector& idxs) { int natoms = idxs.size(); // sort idxs std::vector idxs_idx = sort_indexes(idxs); @@ -41,15 +41,15 @@ void deepmd::group_atoms_cpu(std::vector> &fragments, } void deepmd::dprc_pairwise_map_cpu( - std::vector &forward_qm_map, - std::vector &backward_qm_map, - std::vector &forward_qmmm_map, - std::vector &backward_qmmm_map, - int &nloc_qm, - int &nloc_qmmm, - int &nall_qm, - int &nall_qmmm, - const std::vector> &fragments, + std::vector& forward_qm_map, + std::vector& backward_qm_map, + std::vector& forward_qmmm_map, + std::vector& backward_qmmm_map, + int& nloc_qm, + int& nloc_qmmm, + int& nall_qm, + int& nall_qmmm, + const std::vector>& fragments, const int nloc, const int nall) { int nfragments = fragments.size(); diff --git a/source/lib/src/prod_env_mat.cc b/source/lib/src/prod_env_mat.cc index 81984c78e4..302fac4bc9 100644 --- a/source/lib/src/prod_env_mat.cc +++ b/source/lib/src/prod_env_mat.cc @@ -12,22 +12,22 @@ using namespace deepmd; template -void deepmd::prod_env_mat_a_cpu(FPTYPE *em, - FPTYPE *em_deriv, - FPTYPE *rij, - int *nlist, - const FPTYPE *coord, - const int *type, - const InputNlist &inlist, +void deepmd::prod_env_mat_a_cpu(FPTYPE* em, + FPTYPE* em_deriv, + FPTYPE* rij, + int* nlist, + const FPTYPE* coord, + const int* type, + const InputNlist& 
inlist, const int max_nbor_size, - const FPTYPE *avg, - const FPTYPE *std, + const FPTYPE* avg, + const FPTYPE* std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector sec, - const int *f_type) { + const int* f_type) { if (f_type == NULL) { f_type = type; } @@ -108,16 +108,16 @@ void deepmd::prod_env_mat_a_cpu(FPTYPE *em, } template -void deepmd::prod_env_mat_r_cpu(FPTYPE *em, - FPTYPE *em_deriv, - FPTYPE *rij, - int *nlist, - const FPTYPE *coord, - const int *type, - const InputNlist &inlist, +void deepmd::prod_env_mat_r_cpu(FPTYPE* em, + FPTYPE* em_deriv, + FPTYPE* rij, + int* nlist, + const FPTYPE* coord, + const int* type, + const InputNlist& inlist, const int max_nbor_size, - const FPTYPE *avg, - const FPTYPE *std, + const FPTYPE* avg, + const FPTYPE* std, const int nloc, const int nall, const float rcut, @@ -191,66 +191,66 @@ void deepmd::prod_env_mat_r_cpu(FPTYPE *em, } } -template void deepmd::prod_env_mat_a_cpu(double *em, - double *em_deriv, - double *rij, - int *nlist, - const double *coord, - const int *type, - const InputNlist &inlist, +template void deepmd::prod_env_mat_a_cpu(double* em, + double* em_deriv, + double* rij, + int* nlist, + const double* coord, + const int* type, + const InputNlist& inlist, const int max_nbor_size, - const double *avg, - const double *std, + const double* avg, + const double* std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector sec, - const int *f_type); + const int* f_type); -template void deepmd::prod_env_mat_a_cpu(float *em, - float *em_deriv, - float *rij, - int *nlist, - const float *coord, - const int *type, - const InputNlist &inlist, +template void deepmd::prod_env_mat_a_cpu(float* em, + float* em_deriv, + float* rij, + int* nlist, + const float* coord, + const int* type, + const InputNlist& inlist, const int max_nbor_size, - const float *avg, - const float *std, + const float* avg, + const float* std, const int nloc, const int 
nall, const float rcut, const float rcut_smth, const std::vector sec, - const int *f_type); + const int* f_type); -template void deepmd::prod_env_mat_r_cpu(double *em, - double *em_deriv, - double *rij, - int *nlist, - const double *coord, - const int *type, - const InputNlist &inlist, +template void deepmd::prod_env_mat_r_cpu(double* em, + double* em_deriv, + double* rij, + int* nlist, + const double* coord, + const int* type, + const InputNlist& inlist, const int max_nbor_size, - const double *avg, - const double *std, + const double* avg, + const double* std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector sec); -template void deepmd::prod_env_mat_r_cpu(float *em, - float *em_deriv, - float *rij, - int *nlist, - const float *coord, - const int *type, - const InputNlist &inlist, +template void deepmd::prod_env_mat_r_cpu(float* em, + float* em_deriv, + float* rij, + int* nlist, + const float* coord, + const int* type, + const InputNlist& inlist, const int max_nbor_size, - const float *avg, - const float *std, + const float* avg, + const float* std, const int nloc, const int nall, const float rcut, @@ -258,17 +258,17 @@ template void deepmd::prod_env_mat_r_cpu(float *em, const std::vector sec); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM -void deepmd::env_mat_nbor_update(InputNlist &inlist, - InputNlist &gpu_inlist, - int &max_nbor_size, - int *&nbor_list_dev, - const int *mesh, +void deepmd::env_mat_nbor_update(InputNlist& inlist, + InputNlist& gpu_inlist, + int& max_nbor_size, + int*& nbor_list_dev, + const int* mesh, const int size) { - int *mesh_host = new int[size]; + int* mesh_host = new int[size]; memcpy_device_to_host(mesh, mesh_host, size); - memcpy(&inlist.ilist, 4 + mesh_host, sizeof(int *)); - memcpy(&inlist.numneigh, 8 + mesh_host, sizeof(int *)); - memcpy(&inlist.firstneigh, 12 + mesh_host, sizeof(int **)); + memcpy(&inlist.ilist, 4 + mesh_host, sizeof(int*)); + memcpy(&inlist.numneigh, 8 + mesh_host, 
sizeof(int*)); + memcpy(&inlist.firstneigh, 12 + mesh_host, sizeof(int**)); const int ago = mesh_host[0]; if (ago == 0 || gpu_inlist.inum < inlist.inum) { const int inum = inlist.inum; @@ -306,7 +306,7 @@ void deepmd::env_mat_nbor_update(InputNlist &inlist, // copy nbor list from host to the device std::vector nbor_list_host(static_cast(inum) * max_nbor_size, 0); - int **_firstneigh = (int **)malloc(sizeof(int *) * inum); + int** _firstneigh = (int**)malloc(sizeof(int*) * inum); for (int ii = 0; ii < inum; ii++) { _firstneigh[ii] = nbor_list_dev + ii * max_nbor_size; for (int jj = 0; jj < inlist.numneigh[ii]; jj++) { diff --git a/source/lib/src/prod_env_mat_nvnmd.cc b/source/lib/src/prod_env_mat_nvnmd.cc index d7d98b71d5..a8bf5ce29e 100644 --- a/source/lib/src/prod_env_mat_nvnmd.cc +++ b/source/lib/src/prod_env_mat_nvnmd.cc @@ -43,22 +43,22 @@ using namespace deepmd; */ template -void deepmd::prod_env_mat_a_nvnmd_quantize_cpu(FPTYPE *em, - FPTYPE *em_deriv, - FPTYPE *rij, - int *nlist, - const FPTYPE *coord, - const int *type, - const InputNlist &inlist, +void deepmd::prod_env_mat_a_nvnmd_quantize_cpu(FPTYPE* em, + FPTYPE* em_deriv, + FPTYPE* rij, + int* nlist, + const FPTYPE* coord, + const int* type, + const InputNlist& inlist, const int max_nbor_size, - const FPTYPE *avg, - const FPTYPE *std, + const FPTYPE* avg, + const FPTYPE* std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector sec, - const int *f_type) { + const int* f_type) { if (f_type == NULL) { f_type = type; } @@ -143,40 +143,40 @@ void deepmd::prod_env_mat_a_nvnmd_quantize_cpu(FPTYPE *em, } template void deepmd::prod_env_mat_a_nvnmd_quantize_cpu( - double *em, - double *em_deriv, - double *rij, - int *nlist, - const double *coord, - const int *type, - const InputNlist &inlist, + double* em, + double* em_deriv, + double* rij, + int* nlist, + const double* coord, + const int* type, + const InputNlist& inlist, const int max_nbor_size, - const double *avg, - const 
double *std, + const double* avg, + const double* std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector sec, - const int *f_type); + const int* f_type); template void deepmd::prod_env_mat_a_nvnmd_quantize_cpu( - float *em, - float *em_deriv, - float *rij, - int *nlist, - const float *coord, - const int *type, - const InputNlist &inlist, + float* em, + float* em_deriv, + float* rij, + int* nlist, + const float* coord, + const int* type, + const InputNlist& inlist, const int max_nbor_size, - const float *avg, - const float *std, + const float* avg, + const float* std, const int nloc, const int nall, const float rcut, const float rcut_smth, const std::vector sec, - const int *f_type); + const int* f_type); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM // UNDEFINE diff --git a/source/lib/tests/test_env_mat_a.cc b/source/lib/tests/test_env_mat_a.cc index d041d1a0a1..3c309ca9ae 100644 --- a/source/lib/tests/test_env_mat_a.cc +++ b/source/lib/tests/test_env_mat_a.cc @@ -500,7 +500,7 @@ TEST_F(TestEnvMatA, prod_cpu) { } } std::vector ilist(nloc), numneigh(nloc); - std::vector firstneigh(nloc); + std::vector firstneigh(nloc); deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]); deepmd::convert_nlist(inlist, nlist_a_cpy); @@ -536,7 +536,7 @@ TEST_F(TestEnvMatA, prod_cpu_equal_cpu) { } } std::vector ilist(nloc), numneigh(nloc); - std::vector firstneigh(nloc); + std::vector firstneigh(nloc); deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]); convert_nlist(inlist, nlist_a_cpy); std::vector em(static_cast(nloc) * ndescrpt), @@ -612,7 +612,7 @@ TEST_F(TestEnvMatA, prod_gpu) { max_nbor_size = 4096; } std::vector ilist(nloc), numneigh(nloc); - std::vector firstneigh(nloc); + std::vector firstneigh(nloc); deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]), gpu_inlist; convert_nlist(inlist, nlist_a_cpy); @@ -626,7 +626,7 @@ TEST_F(TestEnvMatA, prod_gpu) { double *posi_cpy_dev = NULL, 
*avg_dev = NULL, *std_dev = NULL; int *atype_cpy_dev = NULL, *nlist_dev = NULL, *array_int_dev = NULL, *memory_dev = NULL; - uint_64 *array_longlong_dev = NULL; + uint_64* array_longlong_dev = NULL; deepmd::malloc_device_memory_sync(em_dev, em); deepmd::malloc_device_memory_sync(em_deriv_dev, em_deriv); deepmd::malloc_device_memory_sync(rij_dev, rij); @@ -690,7 +690,7 @@ TEST_F(TestEnvMatA, prod_gpu_equal_cpu) { max_nbor_size = 4096; } std::vector ilist(nloc), numneigh(nloc); - std::vector firstneigh(nloc); + std::vector firstneigh(nloc); deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]), gpu_inlist; convert_nlist(inlist, nlist_a_cpy); @@ -704,7 +704,7 @@ TEST_F(TestEnvMatA, prod_gpu_equal_cpu) { double *posi_cpy_dev = NULL, *avg_dev = NULL, *std_dev = NULL; int *atype_cpy_dev = NULL, *nlist_dev = NULL, *array_int_dev = NULL, *memory_dev = NULL; - uint_64 *array_longlong_dev = NULL; + uint_64* array_longlong_dev = NULL; deepmd::malloc_device_memory_sync(em_dev, em); deepmd::malloc_device_memory_sync(em_deriv_dev, em_deriv); deepmd::malloc_device_memory_sync(rij_dev, rij); diff --git a/source/lib/tests/test_env_mat_a_mix.cc b/source/lib/tests/test_env_mat_a_mix.cc index d7e6cc88eb..e96311dafd 100644 --- a/source/lib/tests/test_env_mat_a_mix.cc +++ b/source/lib/tests/test_env_mat_a_mix.cc @@ -528,7 +528,7 @@ TEST_F(TestEnvMatAMix, prod_cpu) { } } std::vector ilist(nloc), numneigh(nloc); - std::vector firstneigh(nloc); + std::vector firstneigh(nloc); deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]); deepmd::convert_nlist(inlist, nlist_a_cpy); @@ -537,7 +537,7 @@ TEST_F(TestEnvMatAMix, prod_cpu) { rij(static_cast(nloc) * nnei * 3); std::vector nlist(static_cast(nloc) * nnei); std::vector ntype(static_cast(nloc) * nnei); - bool *nmask = new bool[static_cast(nloc) * nnei]; + bool* nmask = new bool[static_cast(nloc) * nnei]; memset(nmask, 0, sizeof(bool) * nloc * nnei); std::vector avg(ntypes * ndescrpt, 0); std::vector 
std(ntypes * ndescrpt, 1); @@ -573,7 +573,7 @@ TEST_F(TestEnvMatAMix, prod_cpu_equal_cpu) { } } std::vector ilist(nloc), numneigh(nloc); - std::vector firstneigh(nloc); + std::vector firstneigh(nloc); deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]); convert_nlist(inlist, nlist_a_cpy); std::vector em(static_cast(nloc) * ndescrpt), @@ -650,7 +650,7 @@ TEST_F(TestEnvMatAMix, prod_gpu) { max_nbor_size = 4096; } std::vector ilist(nloc), numneigh(nloc); - std::vector firstneigh(nloc); + std::vector firstneigh(nloc); deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]), gpu_inlist; convert_nlist(inlist, nlist_a_cpy); @@ -659,18 +659,18 @@ TEST_F(TestEnvMatAMix, prod_gpu) { rij(static_cast(nloc) * nnei * 3, 0.0); std::vector nlist(static_cast(nloc) * nnei, 0); std::vector ntype(static_cast(nloc) * nnei, 0); - bool *nmask = new bool[static_cast(nloc) * nnei]; + bool* nmask = new bool[static_cast(nloc) * nnei]; memset(nmask, 0, sizeof(bool) * nloc * nnei); std::vector avg(ntypes * ndescrpt, 0); std::vector std(ntypes * ndescrpt, 1); double *em_dev = NULL, *em_deriv_dev = NULL, *rij_dev = NULL; - bool *nmask_dev = NULL; + bool* nmask_dev = NULL; double *posi_cpy_dev = NULL, *avg_dev = NULL, *std_dev = NULL; int *f_atype_cpy_dev = NULL, *atype_dev = NULL, *nlist_dev = NULL, *ntype_dev = NULL, *mapping_dev = NULL, *array_int_dev = NULL, *memory_dev = NULL; - uint_64 *array_longlong_dev = NULL; + uint_64* array_longlong_dev = NULL; deepmd::malloc_device_memory_sync(em_dev, em); deepmd::malloc_device_memory_sync(em_deriv_dev, em_deriv); deepmd::malloc_device_memory_sync(rij_dev, rij); @@ -751,7 +751,7 @@ TEST_F(TestEnvMatAMix, prod_gpu_equal_cpu) { max_nbor_size = 4096; } std::vector ilist(nloc), numneigh(nloc); - std::vector firstneigh(nloc); + std::vector firstneigh(nloc); deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]), gpu_inlist; convert_nlist(inlist, nlist_a_cpy); @@ -765,7 +765,7 @@ 
TEST_F(TestEnvMatAMix, prod_gpu_equal_cpu) { double *posi_cpy_dev = NULL, *avg_dev = NULL, *std_dev = NULL; int *f_atype_cpy_dev = NULL, *atype_dev = NULL, *nlist_dev = NULL, *array_int_dev = NULL, *memory_dev = NULL; - uint_64 *array_longlong_dev = NULL; + uint_64* array_longlong_dev = NULL; deepmd::malloc_device_memory_sync(em_dev, em); deepmd::malloc_device_memory_sync(em_deriv_dev, em_deriv); deepmd::malloc_device_memory_sync(rij_dev, rij); diff --git a/source/lib/tests/test_env_mat_r.cc b/source/lib/tests/test_env_mat_r.cc index 3024e651d9..96da7e6963 100644 --- a/source/lib/tests/test_env_mat_r.cc +++ b/source/lib/tests/test_env_mat_r.cc @@ -278,7 +278,7 @@ TEST_F(TestEnvMatR, prod_cpu) { } } std::vector ilist(nloc), numneigh(nloc); - std::vector firstneigh(nloc); + std::vector firstneigh(nloc); deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]); convert_nlist(inlist, nlist_a_cpy); @@ -313,7 +313,7 @@ TEST_F(TestEnvMatR, prod_cpu_equal_cpu) { } } std::vector ilist(nloc), numneigh(nloc); - std::vector firstneigh(nloc); + std::vector firstneigh(nloc); deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]); convert_nlist(inlist, nlist_a_cpy); std::vector em(nloc * ndescrpt), em_deriv(nloc * ndescrpt * 3), @@ -378,7 +378,7 @@ TEST_F(TestEnvMatR, prod_gpu) { max_nbor_size = 4096; } std::vector ilist(nloc), numneigh(nloc); - std::vector firstneigh(nloc); + std::vector firstneigh(nloc); deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]), gpu_inlist; convert_nlist(inlist, nlist_a_cpy); @@ -392,7 +392,7 @@ TEST_F(TestEnvMatR, prod_gpu) { double *posi_cpy_dev = NULL, *avg_dev = NULL, *std_dev = NULL; int *atype_cpy_dev = NULL, *nlist_dev = NULL, *array_int_dev = NULL, *memory_dev = NULL; - uint_64 *array_longlong_dev = NULL; + uint_64* array_longlong_dev = NULL; deepmd::malloc_device_memory_sync(em_dev, em); deepmd::malloc_device_memory_sync(em_deriv_dev, em_deriv); deepmd::malloc_device_memory_sync(rij_dev, 
rij); @@ -457,7 +457,7 @@ TEST_F(TestEnvMatR, prod_gpu_equal_cpu) { max_nbor_size = 4096; } std::vector ilist(nloc), numneigh(nloc); - std::vector firstneigh(nloc); + std::vector firstneigh(nloc); deepmd::InputNlist inlist(nloc, &ilist[0], &numneigh[0], &firstneigh[0]), gpu_inlist; convert_nlist(inlist, nlist_a_cpy); @@ -471,7 +471,7 @@ TEST_F(TestEnvMatR, prod_gpu_equal_cpu) { double *posi_cpy_dev = NULL, *avg_dev = NULL, *std_dev = NULL; int *atype_cpy_dev = NULL, *nlist_dev = NULL, *array_int_dev = NULL, *memory_dev = NULL; - uint_64 *array_longlong_dev = NULL; + uint_64* array_longlong_dev = NULL; deepmd::malloc_device_memory_sync(em_dev, em); deepmd::malloc_device_memory_sync(em_deriv_dev, em_deriv); deepmd::malloc_device_memory_sync(rij_dev, rij); diff --git a/source/lib/tests/test_main.cc b/source/lib/tests/test_main.cc index df7815b694..2ce083b175 100644 --- a/source/lib/tests/test_main.cc +++ b/source/lib/tests/test_main.cc @@ -1,7 +1,7 @@ // SPDX-License-Identifier: LGPL-3.0-or-later #include -int main(int argc, char **argv) { +int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } diff --git a/source/lib/tests/test_tabulate_se_a.cc b/source/lib/tests/test_tabulate_se_a.cc index ce2defb22c..66a77f41fd 100644 --- a/source/lib/tests/test_tabulate_se_a.cc +++ b/source/lib/tests/test_tabulate_se_a.cc @@ -777,7 +777,7 @@ TEST_F(TestTabulateSeA, tabulate_fusion_se_a_gpu) { EXPECT_LT(fabs(xyz_scatter[jj] - expected_xyz_scatter[jj]), 1e-5); } - double *two_embed_dev = nullptr; + double* two_embed_dev = nullptr; deepmd::malloc_device_memory_sync(two_embed_dev, two_embed); deepmd::malloc_device_memory_sync(xyz_scatter_dev, xyz_scatter); deepmd::tabulate_fusion_se_a_gpu(xyz_scatter_dev, table_dev, &info[0], @@ -831,7 +831,7 @@ TEST_F(TestTabulateSeA, tabulate_fusion_se_a_grad_gpu) { EXPECT_LT(fabs(dy_dem[jj] - expected_dy_dem[jj]), 1e-5); } - double *two_embed_dev = nullptr; + double* two_embed_dev = nullptr; 
deepmd::malloc_device_memory_sync(two_embed_dev, two_embed); deepmd::malloc_device_memory_sync(dy_dem_x_dev, dy_dem_x); deepmd::malloc_device_memory_sync(dy_dem_dev, dy_dem); diff --git a/source/lmp/builtin.cmake b/source/lmp/builtin.cmake index f29e9d3319..e051e5c24a 100644 --- a/source/lmp/builtin.cmake +++ b/source/lmp/builtin.cmake @@ -5,7 +5,52 @@ # assume LAMMPS CMake file has been executed, so these target/variables exist: # lammps LAMMPS_SOURCE_DIR get_lammps_version -get_lammps_version(${LAMMPS_SOURCE_DIR}/version.h LAMMPS_VERSION_NUMBER) +# Since May 15, 2025, the output of get_lammps_version is changed. We vendor the +# old get_lammps_version +# https://github.com/lammps/lammps/commit/b3e7121535863df3db487cd3e6a68c080bf2a6b4#diff-1214db0d1c015a50103f61f8ff7896053dec7ebc1edb930d6ef8bb07282f52abR75 +function(_get_lammps_version version_header variable) + file(STRINGS ${version_header} line REGEX LAMMPS_VERSION) + set(MONTHS + x + Jan + Feb + Mar + Apr + May + Jun + Jul + Aug + Sep + Oct + Nov + Dec) + string(REGEX + REPLACE "#define LAMMPS_VERSION \"([0-9]+) ([A-Za-z]+) ([0-9]+)\"" + "\\1" day "${line}") + string(REGEX + REPLACE "#define LAMMPS_VERSION \"([0-9]+) ([A-Za-z]+) ([0-9]+)\"" + "\\2" month "${line}") + string(REGEX + REPLACE "#define LAMMPS_VERSION \"([0-9]+) ([A-Za-z]+) ([0-9]+)\"" + "\\3" year "${line}") + string(STRIP ${day} day) + string(STRIP ${month} month) + string(STRIP ${year} year) + list(FIND MONTHS "${month}" month) + string(LENGTH ${day} day_length) + string(LENGTH ${month} month_length) + if(day_length EQUAL 1) + set(day "0${day}") + endif() + if(month_length EQUAL 1) + set(month "0${month}") + endif() + set(${variable} + "${year}${month}${day}" + PARENT_SCOPE) +endfunction() + +_get_lammps_version(${LAMMPS_SOURCE_DIR}/version.h LAMMPS_VERSION_NUMBER) configure_file("${CMAKE_CURRENT_LIST_DIR}/deepmd_version.h.in" "${CMAKE_CURRENT_BINARY_DIR}/deepmd_version.h" @ONLY) diff --git a/source/lmp/compute_deeptensor_atom.cpp 
b/source/lmp/compute_deeptensor_atom.cpp index 68c97a629e..f38279d936 100644 --- a/source/lmp/compute_deeptensor_atom.cpp +++ b/source/lmp/compute_deeptensor_atom.cpp @@ -24,7 +24,7 @@ using namespace LAMMPS_NS; /* ---------------------------------------------------------------------- */ -ComputeDeeptensorAtom::ComputeDeeptensorAtom(LAMMPS *lmp, int narg, char **arg) +ComputeDeeptensorAtom::ComputeDeeptensorAtom(LAMMPS* lmp, int narg, char** arg) : Compute(lmp, narg, arg), dp(lmp), tensor(nullptr) { if (strcmp(update->unit_style, "lj") == 0) { error->all(FLERR, @@ -45,7 +45,7 @@ ComputeDeeptensorAtom::ComputeDeeptensorAtom(LAMMPS *lmp, int narg, char **arg) int gpu_rank = dp.get_node_rank(); try { dt.init(model_file, gpu_rank); - } catch (deepmd_compat::deepmd_exception &e) { + } catch (deepmd_compat::deepmd_exception& e) { error->one(FLERR, e.what()); } sel_types = dt.sel_types(); @@ -83,7 +83,7 @@ void ComputeDeeptensorAtom::init() { #endif } -void ComputeDeeptensorAtom::init_list(int /*id*/, NeighList *ptr) { +void ComputeDeeptensorAtom::init_list(int /*id*/, NeighList* ptr) { list = ptr; } @@ -101,10 +101,10 @@ void ComputeDeeptensorAtom::compute_peratom() { array_atom = tensor; } - double **x = atom->x; - double **f = atom->f; - int *type = atom->type; - int *mask = atom->mask; + double** x = atom->x; + double** f = atom->f; + int* type = atom->type; + int* mask = atom->mask; int nlocal = atom->nlocal; int nghost = atom->nghost; int nall = nlocal + nghost; @@ -145,7 +145,7 @@ void ComputeDeeptensorAtom::compute_peratom() { try { dt.compute(gtensor, force, virial, atensor, avirial, dcoord, dtype, dbox, nghost, lmp_list); - } catch (deepmd_compat::deepmd_exception &e) { + } catch (deepmd_compat::deepmd_exception& e) { error->one(FLERR, e.what()); } diff --git a/source/lmp/compute_deeptensor_atom.h b/source/lmp/compute_deeptensor_atom.h index a90283aa9e..aeba8c11f4 100644 --- a/source/lmp/compute_deeptensor_atom.h +++ b/source/lmp/compute_deeptensor_atom.h @@ 
-30,19 +30,19 @@ namespace LAMMPS_NS { class ComputeDeeptensorAtom : public Compute { public: - ComputeDeeptensorAtom(class LAMMPS *, int, char **); + ComputeDeeptensorAtom(class LAMMPS*, int, char**); ~ComputeDeeptensorAtom() override; void init() override; void compute_peratom() override; double memory_usage() override; - void init_list(int, class NeighList *) override; + void init_list(int, class NeighList*) override; double dist_unit_cvt_factor; private: int nmax; - double **tensor; + double** tensor; PairDeepMD dp; - class NeighList *list; + class NeighList* list; deepmd_compat::DeepTensor dt; std::vector sel_types; }; diff --git a/source/lmp/fix_dplr.cpp b/source/lmp/fix_dplr.cpp index ac161730db..90cb4f4bba 100644 --- a/source/lmp/fix_dplr.cpp +++ b/source/lmp/fix_dplr.cpp @@ -24,7 +24,7 @@ using namespace LAMMPS_NS; using namespace FixConst; using namespace std; -static bool is_key(const string &input) { +static bool is_key(const string& input) { vector keys; keys.push_back("model"); keys.push_back("type_associate"); @@ -39,7 +39,7 @@ static bool is_key(const string &input) { return false; } -FixDPLR::FixDPLR(LAMMPS *lmp, int narg, char **arg) +FixDPLR::FixDPLR(LAMMPS* lmp, int narg, char** arg) : Fix(lmp, narg, arg), xstr(nullptr), ystr(nullptr), @@ -145,11 +145,11 @@ FixDPLR::FixDPLR(LAMMPS *lmp, int narg, char **arg) try { dpt.init(model, 0, "dipole_charge"); dtm.init(model, 0, "dipole_charge"); - } catch (deepmd_compat::deepmd_exception &e) { + } catch (deepmd_compat::deepmd_exception& e) { error->one(FLERR, e.what()); } - pair_deepmd = (PairDeepMD *)force->pair_match("deepmd", 1, pair_deepmd_index); + pair_deepmd = (PairDeepMD*)force->pair_match("deepmd", 1, pair_deepmd_index); if (!pair_deepmd) { error->all(FLERR, "pair_style deepmd should be set before this fix\n"); } @@ -305,7 +305,7 @@ void FixDPLR::init() { /* ---------------------------------------------------------------------- */ void FixDPLR::setup_post_neighbor() { - double **x = atom->x; + 
double** x = atom->x; vector > valid_pairs; get_valid_pairs(valid_pairs, true); @@ -358,7 +358,7 @@ void FixDPLR::min_setup(int vflag) { setup(vflag); } /* ---------------------------------------------------------------------- */ -void FixDPLR::get_valid_pairs(vector > &pairs, bool is_setup) { +void FixDPLR::get_valid_pairs(vector >& pairs, bool is_setup) { pairs.clear(); int nlocal = atom->nlocal; @@ -366,12 +366,12 @@ void FixDPLR::get_valid_pairs(vector > &pairs, bool is_setup) { int nall = nlocal + nghost; vector dtype(nall); // get type - int *type = atom->type; + int* type = atom->type; for (int ii = 0; ii < nall; ++ii) { dtype[ii] = type_idx_map[type[ii] - 1]; } - int **bondlist = neighbor->bondlist; + int** bondlist = neighbor->bondlist; int nbondlist = neighbor->nbondlist; for (int ii = 0; ii < nbondlist; ++ii) { int idx0 = -1, idx1 = -1; @@ -437,9 +437,9 @@ void FixDPLR::get_valid_pairs(vector > &pairs, bool is_setup) { /* ---------------------------------------------------------------------- */ void FixDPLR::pre_exchange() { - double **x = atom->x; - double **v = atom->v; - int *type = atom->type; + double** x = atom->x; + double** v = atom->v; + int* type = atom->type; int nlocal = atom->nlocal; int nghost = atom->nghost; int nall = nlocal + nghost; @@ -461,8 +461,8 @@ void FixDPLR::pre_exchange() { /* ---------------------------------------------------------------------- */ void FixDPLR::pre_force(int vflag) { - double **x = atom->x; - int *type = atom->type; + double** x = atom->x; + int* type = atom->type; int nlocal = atom->nlocal; int nghost = atom->nghost; int nall = nlocal + nghost; @@ -503,7 +503,7 @@ void FixDPLR::pre_force(int vflag) { } } // get lammps nlist - NeighList *list = pair_deepmd->list; + NeighList* list = pair_deepmd->list; deepmd_compat::InputNlist lmp_list(list->inum, list->ilist, list->numneigh, list->firstneigh); lmp_list.set_mask(NEIGHMASK); @@ -515,7 +515,7 @@ void FixDPLR::pre_force(int vflag) { // compute try { 
dpt.compute(tensor, dcoord, dtype, dbox, nghost, lmp_list); - } catch (deepmd_compat::deepmd_exception &e) { + } catch (deepmd_compat::deepmd_exception& e) { error->one(FLERR, e.what()); } // cout << "tensor of size " << tensor.size() << endl; @@ -607,7 +607,7 @@ void FixDPLR::post_force(int vflag) { update_efield_variables(); } - PPPMDPLR *pppm_dplr = (PPPMDPLR *)force->kspace_match("pppm/dplr", 1); + PPPMDPLR* pppm_dplr = (PPPMDPLR*)force->kspace_match("pppm/dplr", 1); int nlocal = atom->nlocal; int nghost = atom->nghost; int nall = nlocal + nghost; @@ -616,7 +616,7 @@ void FixDPLR::post_force(int vflag) { vector dtype(nall, 0); // set values for dcoord, dbox, dfele { - int *type = atom->type; + int* type = atom->type; for (int ii = 0; ii < nall; ++ii) { dtype[ii] = type_idx_map[type[ii] - 1]; } @@ -627,7 +627,7 @@ void FixDPLR::post_force(int vflag) { dbox[6] = domain->h[4] / dist_unit_cvt_factor; // zx dbox[3] = domain->h[5] / dist_unit_cvt_factor; // yx // get coord - double **x = atom->x; + double** x = atom->x; for (int ii = 0; ii < nall; ++ii) { for (int dd = 0; dd < 3; ++dd) { dcoord[ii * 3 + dd] = @@ -636,15 +636,15 @@ void FixDPLR::post_force(int vflag) { } // revise force according to efield if (pppm_dplr) { - const vector &dfele_(pppm_dplr->get_fele()); + const vector& dfele_(pppm_dplr->get_fele()); assert(dfele_.size() == nlocal * 3); for (int ii = 0; ii < nlocal * 3; ++ii) { dfele[ii] += dfele_[ii]; } } // revise force and virial according to efield - double *q = atom->q; - imageint *image = atom->image; + double* q = atom->q; + imageint* image = atom->image; double unwrap[3]; double v[6]; efield_fsum[0] = efield_fsum[1] = efield_fsum[2] = efield_fsum[3] = 0.0; @@ -675,7 +675,7 @@ void FixDPLR::post_force(int vflag) { } } // lmp nlist - NeighList *list = pair_deepmd->list; + NeighList* list = pair_deepmd->list; deepmd_compat::InputNlist lmp_list(list->inum, list->ilist, list->numneigh, list->firstneigh); // bonded pairs @@ -696,7 +696,7 @@ void 
FixDPLR::post_force(int vflag) { for (int ii = 0; ii < 9; ++ii) { dvcorr[ii] *= ener_unit_cvt_factor; } - } catch (deepmd_compat::deepmd_exception &e) { + } catch (deepmd_compat::deepmd_exception& e) { error->one(FLERR, e.what()); } assert(dfcorr.size() == dcoord.size()); @@ -726,7 +726,7 @@ void FixDPLR::post_force(int vflag) { // cout << endl; // } // apply the force correction - double **f = atom->f; + double** f = atom->f; for (int ii = 0; ii < nlocal; ++ii) { for (int dd = 0; dd < 3; ++dd) { f[ii][dd] += dfcorr[ii * 3 + dd]; @@ -778,7 +778,7 @@ void FixDPLR::min_post_force(int vflag) { post_force(vflag); } /* ---------------------------------------------------------------------- */ -int FixDPLR::pack_reverse_comm(int n, int first, double *buf) { +int FixDPLR::pack_reverse_comm(int n, int first, double* buf) { int m = 0; int last = first + n; for (int i = first; i < last; i++) { @@ -791,7 +791,7 @@ int FixDPLR::pack_reverse_comm(int n, int first, double *buf) { /* ---------------------------------------------------------------------- */ -void FixDPLR::unpack_reverse_comm(int n, int *list, double *buf) { +void FixDPLR::unpack_reverse_comm(int n, int* list, double* buf) { int m = 0; for (int i = 0; i < n; i++) { int j = list[i]; diff --git a/source/lmp/fix_dplr.h b/source/lmp/fix_dplr.h index 5f1161fda6..cd2c54f9d9 100644 --- a/source/lmp/fix_dplr.h +++ b/source/lmp/fix_dplr.h @@ -37,7 +37,7 @@ namespace deepmd_compat = deepmd::hpp; namespace LAMMPS_NS { class FixDPLR : public Fix { public: - FixDPLR(class LAMMPS *, int, char **); + FixDPLR(class LAMMPS*, int, char**); ~FixDPLR() override; int setmask() override; void init() override; @@ -52,14 +52,14 @@ class FixDPLR : public Fix { void min_pre_exchange() override; void min_pre_force(int) override; void min_post_force(int) override; - int pack_reverse_comm(int, int, double *) override; - void unpack_reverse_comm(int, int *, double *) override; + int pack_reverse_comm(int, int, double*) override; + void 
unpack_reverse_comm(int, int*, double*) override; double compute_scalar(void) override; double compute_vector(int) override; double ener_unit_cvt_factor, dist_unit_cvt_factor, force_unit_cvt_factor; private: - PairDeepMD *pair_deepmd; + PairDeepMD* pair_deepmd; deepmd_compat::DeepTensor dpt; deepmd_compat::DipoleChargeModifier dtm; std::string model; @@ -74,7 +74,7 @@ class FixDPLR : public Fix { std::vector efield; std::vector efield_fsum, efield_fsum_all; int efield_force_flag; - void get_valid_pairs(std::vector > &pairs, bool is_setup); + void get_valid_pairs(std::vector >& pairs, bool is_setup); int varflag; char *xstr, *ystr, *zstr; int xvar, yvar, zvar, xstyle, ystyle, zstyle; diff --git a/source/lmp/fix_ttm_dp.h b/source/lmp/fix_ttm_dp.h index 168f880226..3eb4ccd533 100644 --- a/source/lmp/fix_ttm_dp.h +++ b/source/lmp/fix_ttm_dp.h @@ -13,6 +13,6 @@ class FixTTMDP : public FixTTM { tmp[2] = nzgrid; return tmp; }; - double ***const get_T_electron() const { return T_electron; }; + double*** const get_T_electron() const { return T_electron; }; }; } // namespace LAMMPS_NS diff --git a/source/lmp/pair_base.cpp b/source/lmp/pair_base.cpp index a62956bbe4..ab60ccc780 100644 --- a/source/lmp/pair_base.cpp +++ b/source/lmp/pair_base.cpp @@ -35,9 +35,9 @@ using namespace LAMMPS_NS; using namespace std; -static int stringCmp(const void *a, const void *b) { - char *m = (char *)a; - char *n = (char *)b; +static int stringCmp(const void* a, const void* b) { + char* m = (char*)a; + char* n = (char*)b; int i, sum = 0; for (i = 0; i < MPI_MAX_PROCESSOR_NAME; i++) { @@ -98,7 +98,7 @@ int PairDeepBaseModel::get_node_rank() { return looprank; } -std::string PairDeepBaseModel::get_file_content(const std::string &model) { +std::string PairDeepBaseModel::get_file_content(const std::string& model) { int myrank = 0, root = 0; MPI_Comm_rank(MPI_COMM_WORLD, &myrank); int nchar = 0; @@ -108,7 +108,7 @@ std::string PairDeepBaseModel::get_file_content(const std::string &model) { nchar = 
file_content.size(); } MPI_Bcast(&nchar, 1, MPI_INT, root, MPI_COMM_WORLD); - char *buff = (char *)malloc(sizeof(char) * nchar); + char* buff = (char*)malloc(sizeof(char) * nchar); if (myrank == root) { memcpy(buff, file_content.c_str(), sizeof(char) * nchar); } @@ -122,7 +122,7 @@ std::string PairDeepBaseModel::get_file_content(const std::string &model) { } std::vector PairDeepBaseModel::get_file_content( - const std::vector &models) { + const std::vector& models) { std::vector file_contents(models.size()); for (unsigned ii = 0; ii < models.size(); ++ii) { file_contents[ii] = get_file_content(models[ii]); @@ -130,11 +130,11 @@ std::vector PairDeepBaseModel::get_file_content( return file_contents; } -void PairDeepBaseModel::make_fparam_from_compute(vector &fparam) { +void PairDeepBaseModel::make_fparam_from_compute(vector& fparam) { assert(do_compute_fparam); int icompute = modify->find_compute(compute_fparam_id); - Compute *compute = modify->compute[icompute]; + Compute* compute = modify->compute[icompute]; if (!compute) { error->all(FLERR, "compute id is not found: " + compute_fparam_id); @@ -152,18 +152,18 @@ void PairDeepBaseModel::make_fparam_from_compute(vector &fparam) { compute->compute_vector(); compute->invoked_flag |= Compute::INVOKED_VECTOR; } - double *cvector = compute->vector; + double* cvector = compute->vector; for (int jj = 0; jj < dim_fparam; ++jj) { fparam[jj] = cvector[jj]; } } } -void PairDeepBaseModel::make_aparam_from_compute(vector &aparam) { +void PairDeepBaseModel::make_aparam_from_compute(vector& aparam) { assert(do_compute_aparam); int icompute = modify->find_compute(compute_aparam_id); - Compute *compute = modify->compute[icompute]; + Compute* compute = modify->compute[icompute]; if (!compute) { error->all(FLERR, "compute id is not found: " + compute_aparam_id); @@ -176,10 +176,10 @@ void PairDeepBaseModel::make_aparam_from_compute(vector &aparam) { compute->invoked_flag |= Compute::INVOKED_PERATOM; } if (dim_aparam == 1) { - double 
*cvector = compute->vector_atom; + double* cvector = compute->vector_atom; aparam.assign(cvector, cvector + nlocal); } else if (dim_aparam > 1) { - double **carray = compute->array_atom; + double** carray = compute->array_atom; for (int ii = 0; ii < nlocal; ++ii) { for (int jj = 0; jj < dim_aparam; ++jj) { aparam[ii * dim_aparam + jj] = carray[ii][jj]; @@ -189,13 +189,13 @@ void PairDeepBaseModel::make_aparam_from_compute(vector &aparam) { } #ifdef USE_TTM -void PairDeepBaseModel::make_ttm_fparam(vector &fparam) { +void PairDeepBaseModel::make_ttm_fparam(vector& fparam) { assert(do_ttm); // get ttm_fix - const FixTTMDP *ttm_fix = NULL; + const FixTTMDP* ttm_fix = NULL; for (int ii = 0; ii < modify->nfix; ii++) { if (string(modify->fix[ii]->id) == ttm_fix_id) { - ttm_fix = dynamic_cast(modify->fix[ii]); + ttm_fix = dynamic_cast(modify->fix[ii]); } } if (!ttm_fix) { @@ -208,7 +208,7 @@ void PairDeepBaseModel::make_ttm_fparam(vector &fparam) { int nxnodes = nnodes[0]; int nynodes = nnodes[1]; int nznodes = nnodes[2]; - double ***const T_electron = ttm_fix->get_T_electron(); + double*** const T_electron = ttm_fix->get_T_electron(); int numb_effective_nodes = 0; double total_Te = 0; @@ -230,27 +230,27 @@ void PairDeepBaseModel::make_ttm_fparam(vector &fparam) { #endif #ifdef USE_TTM -void PairDeepBaseModel::make_ttm_aparam(vector &daparam) { +void PairDeepBaseModel::make_ttm_aparam(vector& daparam) { assert(do_ttm); // get ttm_fix - const FixTTMDP *ttm_fix = NULL; + const FixTTMDP* ttm_fix = NULL; for (int ii = 0; ii < modify->nfix; ii++) { if (string(modify->fix[ii]->id) == ttm_fix_id) { - ttm_fix = dynamic_cast(modify->fix[ii]); + ttm_fix = dynamic_cast(modify->fix[ii]); } } if (!ttm_fix) { error->all(FLERR, "fix ttm id is not found: " + ttm_fix_id); } // modify - double **x = atom->x; - int *mask = atom->mask; + double** x = atom->x; + int* mask = atom->mask; int nlocal = atom->nlocal; vector nnodes = ttm_fix->get_nodes(); int nxnodes = nnodes[0]; int nynodes = 
nnodes[1]; int nznodes = nnodes[2]; - double ***const T_electron = ttm_fix->get_T_electron(); + double*** const T_electron = ttm_fix->get_T_electron(); double dx = domain->xprd / nxnodes; double dy = domain->yprd / nynodes; double dz = domain->zprd / nynodes; @@ -275,8 +275,8 @@ void PairDeepBaseModel::make_ttm_aparam(vector &daparam) { } #endif -void PairDeepBaseModel::cum_sum(std::map &sum, - std::map &vec) { +void PairDeepBaseModel::cum_sum(std::map& sum, + std::map& vec) { sum[0] = 0; for (int ii = 1; ii < vec.size(); ++ii) { sum[ii] = sum[ii - 1] + vec[ii - 1]; @@ -284,10 +284,10 @@ void PairDeepBaseModel::cum_sum(std::map &sum, } PairDeepBaseModel::PairDeepBaseModel( - LAMMPS *lmp, - const char *cite_user_package, - deepmd_compat::DeepBaseModel &deep_model, - deepmd_compat::DeepBaseModelDevi &deep_model_devi) + LAMMPS* lmp, + const char* cite_user_package, + deepmd_compat::DeepBaseModel& deep_model, + deepmd_compat::DeepBaseModelDevi& deep_model_devi) : Pair(lmp), deep_base(deep_model), deep_base_model_devi(deep_model_devi) @@ -349,7 +349,7 @@ void PairDeepBaseModel::print_summary(const string pre) const { // capture cout to a string, then call LAMMPS's utils::logmesg // https://stackoverflow.com/a/4043813/9567349 std::stringstream buffer; - std::streambuf *sbuf = std::cout.rdbuf(); + std::streambuf* sbuf = std::cout.rdbuf(); std::cout.rdbuf(buffer.rdbuf()); cout << "Summary of lammps deepmd module ..." 
<< endl; @@ -405,9 +405,9 @@ void PairDeepBaseModel::allocate() { } } -void PairDeepBaseModel::read_restart(FILE *) { is_restart = true; } +void PairDeepBaseModel::read_restart(FILE*) { is_restart = true; } -void PairDeepBaseModel::write_restart(FILE *) { +void PairDeepBaseModel::write_restart(FILE*) { // pass } @@ -454,23 +454,23 @@ double PairDeepBaseModel::init_one(int i, int j) { return cutoff; } -void *PairDeepBaseModel::extract(const char *str, int &dim) { +void* PairDeepBaseModel::extract(const char* str, int& dim) { if (strcmp(str, "cut_coul") == 0) { dim = 0; - return (void *)&cutoff; + return (void*)&cutoff; } if (strcmp(str, "scale") == 0) { dim = 2; - return (void *)scale; + return (void*)scale; } return NULL; } -void ana_st(double &max, - double &min, - double &sum, - const vector &vec, - const int &nloc) { +void ana_st(double& max, + double& min, + double& sum, + const vector& vec, + const int& nloc) { if (nloc == 0) { return; } @@ -488,9 +488,9 @@ void ana_st(double &max, } } -void make_uniform_aparam(vector &daparam, - const vector &aparam, - const int &nlocal) { +void make_uniform_aparam(vector& daparam, + const vector& aparam, + const int& nlocal) { unsigned dim_aparam = aparam.size(); daparam.resize(static_cast(dim_aparam) * nlocal); for (int ii = 0; ii < nlocal; ++ii) { diff --git a/source/lmp/pair_base.h b/source/lmp/pair_base.h index 055b45d20e..1dd4b84041 100644 --- a/source/lmp/pair_base.h +++ b/source/lmp/pair_base.h @@ -30,23 +30,23 @@ namespace deepmd_compat = deepmd::hpp; namespace LAMMPS_NS { class PairDeepBaseModel : public Pair { public: - PairDeepBaseModel(class LAMMPS *, - const char *, - deepmd_compat::DeepBaseModel &, - deepmd_compat::DeepBaseModelDevi &); + PairDeepBaseModel(class LAMMPS*, + const char*, + deepmd_compat::DeepBaseModel&, + deepmd_compat::DeepBaseModelDevi&); virtual ~PairDeepBaseModel() override; - void *extract(const char *, int &) override; + void* extract(const char*, int&) override; void init_style() override; 
- void write_restart(FILE *) override; - void read_restart(FILE *) override; + void write_restart(FILE*) override; + void read_restart(FILE*) override; double init_one(int i, int j) override; void print_summary(const std::string pre) const; int get_node_rank(); - void cum_sum(std::map &, std::map &); + void cum_sum(std::map&, std::map&); - std::string get_file_content(const std::string &model); + std::string get_file_content(const std::string& model); std::vector get_file_content( - const std::vector &models); + const std::vector& models); std::vector type_names; double ener_unit_cvt_factor, dist_unit_cvt_factor, force_unit_cvt_factor; @@ -54,7 +54,7 @@ class PairDeepBaseModel : public Pair { deepmd_compat::DeepBaseModel deep_base; deepmd_compat::DeepBaseModelDevi deep_base_model_devi; virtual void allocate(); - double **scale; + double** scale; unsigned numb_models; double cutoff; int numb_types; @@ -83,16 +83,16 @@ class PairDeepBaseModel : public Pair { double eps; double eps_v; - void make_fparam_from_compute(std::vector &fparam); + void make_fparam_from_compute(std::vector& fparam); bool do_compute_fparam; std::string compute_fparam_id; - void make_aparam_from_compute(std::vector &aparam); + void make_aparam_from_compute(std::vector& aparam); bool do_compute_aparam; std::string compute_aparam_id; - void make_ttm_fparam(std::vector &fparam); + void make_ttm_fparam(std::vector& fparam); - void make_ttm_aparam(std::vector &dparam); + void make_ttm_aparam(std::vector& dparam); bool do_ttm; std::string ttm_fix_id; int *counts, *displacements; @@ -103,13 +103,13 @@ class PairDeepBaseModel : public Pair { } // namespace LAMMPS_NS -void make_uniform_aparam(std::vector &daparam, - const std::vector &aparam, - const int &nlocal); -void ana_st(double &max, - double &min, - double &sum, - const std::vector &vec, - const int &nloc); +void make_uniform_aparam(std::vector& daparam, + const std::vector& aparam, + const int& nlocal); +void ana_st(double& max, + double& min, + 
double& sum, + const std::vector& vec, + const int& nloc); #endif diff --git a/source/lmp/pair_deepmd.cpp b/source/lmp/pair_deepmd.cpp index a11ad7f99c..3684c38dd9 100644 --- a/source/lmp/pair_deepmd.cpp +++ b/source/lmp/pair_deepmd.cpp @@ -117,7 +117,7 @@ static const char cite_user_deepmd_package[] = " doi = {10.1021/acs.jctc.5c00340},\n" "}\n\n"; -PairDeepMD::PairDeepMD(LAMMPS *lmp) +PairDeepMD::PairDeepMD(LAMMPS* lmp) : PairDeepBaseModel( lmp, cite_user_deepmd_package, deep_pot, deep_pot_model_devi) { // Constructor body can be empty @@ -141,10 +141,10 @@ void PairDeepMD::compute(int eflag, int vflag) { } bool do_ghost = true; // dpa2 communication - commdata_ = (CommBrickDeepMD *)comm; - double **x = atom->x; - double **f = atom->f; - int *type = atom->type; + commdata_ = (CommBrickDeepMD*)comm; + double** x = atom->x; + double** f = atom->f; + int* type = atom->type; int nlocal = atom->nlocal; int nghost = 0; if (do_ghost) { @@ -249,7 +249,7 @@ void PairDeepMD::compute(int eflag, int vflag) { try { deep_pot.compute(dener, dforce, dvirial, dcoord, dtype, dbox, nghost, lmp_list, ago, fparam, daparam); - } catch (deepmd_compat::deepmd_exception &e) { + } catch (deepmd_compat::deepmd_exception& e) { error->one(FLERR, e.what()); } } @@ -260,7 +260,7 @@ void PairDeepMD::compute(int eflag, int vflag) { try { deep_pot.compute(dener, dforce, dvirial, deatom, dvatom, dcoord, dtype, dbox, nghost, lmp_list, ago, fparam, daparam); - } catch (deepmd_compat::deepmd_exception &e) { + } catch (deepmd_compat::deepmd_exception& e) { error->one(FLERR, e.what()); } if (eflag_atom) { @@ -312,7 +312,7 @@ void PairDeepMD::compute(int eflag, int vflag) { deep_pot_model_devi.compute(all_energy, all_force, all_virial, dcoord, dtype, dbox, nghost, lmp_list, ago, fparam, daparam); - } catch (deepmd_compat::deepmd_exception &e) { + } catch (deepmd_compat::deepmd_exception& e) { error->one(FLERR, e.what()); } } else { @@ -321,7 +321,7 @@ void PairDeepMD::compute(int eflag, int vflag) { 
all_atom_energy, all_atom_virial, dcoord, dtype, dbox, nghost, lmp_list, ago, fparam, daparam); - } catch (deepmd_compat::deepmd_exception &e) { + } catch (deepmd_compat::deepmd_exception& e) { error->one(FLERR, e.what()); } } @@ -449,7 +449,7 @@ void PairDeepMD::compute(int eflag, int vflag) { if (out_each == 1) { vector std_f_all(atom->natoms); // Gather std_f and tags - tagint *tag = atom->tag; + tagint* tag = atom->tag; int nprocs = comm->nprocs; // Grow arrays if necessary if (atom->natoms > stdf_comm_buff_size) { @@ -496,7 +496,7 @@ void PairDeepMD::compute(int eflag, int vflag) { if (numb_models == 1) { try { deep_pot.compute(dener, dforce, dvirial, dcoord, dtype, dbox); - } catch (deepmd_compat::deepmd_exception &e) { + } catch (deepmd_compat::deepmd_exception& e) { error->one(FLERR, e.what()); } } else { @@ -525,7 +525,7 @@ void PairDeepMD::compute(int eflag, int vflag) { } } -static bool is_key(const string &input) { +static bool is_key(const string& input) { vector keys; keys.push_back("out_freq"); keys.push_back("out_file"); @@ -548,7 +548,7 @@ static bool is_key(const string &input) { return false; } -void PairDeepMD::settings(int narg, char **arg) { +void PairDeepMD::settings(int narg, char** arg) { if (narg <= 0) { error->all(FLERR, "Illegal pair_style command"); } @@ -568,7 +568,7 @@ void PairDeepMD::settings(int narg, char **arg) { if (numb_models == 1) { try { deep_pot.init(arg[0], get_node_rank(), get_file_content(arg[0])); - } catch (deepmd_compat::deepmd_exception &e) { + } catch (deepmd_compat::deepmd_exception& e) { error->one(FLERR, e.what()); } cutoff = deep_pot.cutoff() * dist_unit_cvt_factor; @@ -581,7 +581,7 @@ void PairDeepMD::settings(int narg, char **arg) { deep_pot.init(arg[0], get_node_rank(), get_file_content(arg[0])); deep_pot_model_devi.init(models, get_node_rank(), get_file_content(models)); - } catch (deepmd_compat::deepmd_exception &e) { + } catch (deepmd_compat::deepmd_exception& e) { error->one(FLERR, e.what()); } cutoff = 
deep_pot_model_devi.cutoff() * dist_unit_cvt_factor; @@ -798,7 +798,7 @@ void PairDeepMD::settings(int narg, char **arg) { set coeffs for one or more type pairs ------------------------------------------------------------------------- */ -void PairDeepMD::coeff(int narg, char **arg) { +void PairDeepMD::coeff(int narg, char** arg) { if (!allocated) { allocate(); } @@ -889,7 +889,7 @@ void PairDeepMD::coeff(int narg, char **arg) { /* ---------------------------------------------------------------------- */ -int PairDeepMD::pack_reverse_comm(int n, int first, double *buf) { +int PairDeepMD::pack_reverse_comm(int n, int first, double* buf) { int i, m, last; m = 0; @@ -913,7 +913,7 @@ int PairDeepMD::pack_reverse_comm(int n, int first, double *buf) { /* ---------------------------------------------------------------------- */ -void PairDeepMD::unpack_reverse_comm(int n, int *list, double *buf) { +void PairDeepMD::unpack_reverse_comm(int n, int* list, double* buf) { int i, j, m; m = 0; diff --git a/source/lmp/pair_deepmd.h b/source/lmp/pair_deepmd.h index a8b3c13f4c..6d54a69fe6 100644 --- a/source/lmp/pair_deepmd.h +++ b/source/lmp/pair_deepmd.h @@ -42,20 +42,20 @@ class CommBrickDeepMD : public CommBrick { }; class PairDeepMD : public PairDeepBaseModel { public: - PairDeepMD(class LAMMPS *); + PairDeepMD(class LAMMPS*); ~PairDeepMD() override; - void settings(int, char **) override; - void coeff(int, char **) override; + void settings(int, char**) override; + void coeff(int, char**) override; void compute(int, int) override; - int pack_reverse_comm(int, int, double *) override; - void unpack_reverse_comm(int, int *, double *) override; + int pack_reverse_comm(int, int, double*) override; + void unpack_reverse_comm(int, int*, double*) override; protected: deepmd_compat::DeepPot deep_pot; deepmd_compat::DeepPotModelDevi deep_pot_model_devi; private: - CommBrickDeepMD *commdata_; + CommBrickDeepMD* commdata_; }; } // namespace LAMMPS_NS diff --git 
a/source/lmp/pair_deepspin.cpp b/source/lmp/pair_deepspin.cpp index accdce4c79..494ddcfb68 100644 --- a/source/lmp/pair_deepspin.cpp +++ b/source/lmp/pair_deepspin.cpp @@ -117,7 +117,7 @@ static const char cite_user_deepmd_package[] = " doi = {10.1021/acs.jctc.5c00340},\n" "}\n\n"; -PairDeepSpin::PairDeepSpin(LAMMPS *lmp) +PairDeepSpin::PairDeepSpin(LAMMPS* lmp) : PairDeepBaseModel( lmp, cite_user_deepmd_package, deep_spin, deep_spin_model_devi) { // Constructor body can be empty @@ -141,10 +141,10 @@ void PairDeepSpin::compute(int eflag, int vflag) { } bool do_ghost = true; // dpa2 communication - commdata_ = (CommBrickDeepSpin *)comm; - double **x = atom->x; - double **f = atom->f; - int *type = atom->type; + commdata_ = (CommBrickDeepSpin*)comm; + double** x = atom->x; + double** f = atom->f; + int* type = atom->type; int nlocal = atom->nlocal; int nghost = 0; if (do_ghost) { @@ -155,8 +155,8 @@ void PairDeepSpin::compute(int eflag, int vflag) { vector dspin(nall * 3, 0.); vector dfm(nall * 3, 0.); - double **sp = atom->sp; - double **fm = atom->fm; + double** sp = atom->sp; + double** fm = atom->fm; // spin initialize if (atom->sp_flag) { // get spin @@ -251,7 +251,7 @@ void PairDeepSpin::compute(int eflag, int vflag) { deep_spin.compute(dener, dforce, dforce_mag, dvirial, dcoord, dspin, dtype, dbox, nghost, lmp_list, ago, fparam, daparam); - } catch (deepmd_compat::deepmd_exception &e) { + } catch (deepmd_compat::deepmd_exception& e) { error->one(FLERR, e.what()); } } @@ -263,7 +263,7 @@ void PairDeepSpin::compute(int eflag, int vflag) { deep_spin.compute(dener, dforce, dforce_mag, dvirial, deatom, dvatom, dcoord, dspin, dtype, dbox, nghost, lmp_list, ago, fparam, daparam); - } catch (deepmd_compat::deepmd_exception &e) { + } catch (deepmd_compat::deepmd_exception& e) { error->one(FLERR, e.what()); } if (eflag_atom) { @@ -315,7 +315,7 @@ void PairDeepSpin::compute(int eflag, int vflag) { deep_spin_model_devi.compute(all_energy, all_force, all_force_mag, 
all_virial, dcoord, dspin, dtype, dbox, nghost, lmp_list, ago, fparam, daparam); - } catch (deepmd_compat::deepmd_exception &e) { + } catch (deepmd_compat::deepmd_exception& e) { error->one(FLERR, e.what()); } } else { @@ -324,7 +324,7 @@ void PairDeepSpin::compute(int eflag, int vflag) { all_energy, all_force, all_force_mag, all_virial, all_atom_energy, all_atom_virial, dcoord, dspin, dtype, dbox, nghost, lmp_list, ago, fparam, daparam); - } catch (deepmd_compat::deepmd_exception &e) { + } catch (deepmd_compat::deepmd_exception& e) { error->one(FLERR, e.what()); } } @@ -473,7 +473,7 @@ void PairDeepSpin::compute(int eflag, int vflag) { // need support for spin atomic force. vector std_f_all(atom->natoms); // Gather std_f and tags - tagint *tag = atom->tag; + tagint* tag = atom->tag; int nprocs = comm->nprocs; // Grow arrays if necessary if (atom->natoms > stdf_comm_buff_size) { @@ -521,7 +521,7 @@ void PairDeepSpin::compute(int eflag, int vflag) { try { deep_spin.compute(dener, dforce, dforce_mag, dvirial, dcoord, dspin, dtype, dbox); - } catch (deepmd_compat::deepmd_exception &e) { + } catch (deepmd_compat::deepmd_exception& e) { error->one(FLERR, e.what()); } } else { @@ -558,7 +558,7 @@ void PairDeepSpin::compute(int eflag, int vflag) { } } -static bool is_key(const string &input) { +static bool is_key(const string& input) { vector keys; keys.push_back("out_freq"); keys.push_back("out_file"); @@ -581,7 +581,7 @@ static bool is_key(const string &input) { return false; } -void PairDeepSpin::settings(int narg, char **arg) { +void PairDeepSpin::settings(int narg, char** arg) { if (narg <= 0) { error->all(FLERR, "Illegal pair_style command"); } @@ -601,7 +601,7 @@ void PairDeepSpin::settings(int narg, char **arg) { if (numb_models == 1) { try { deep_spin.init(arg[0], get_node_rank(), get_file_content(arg[0])); - } catch (deepmd_compat::deepmd_exception &e) { + } catch (deepmd_compat::deepmd_exception& e) { error->one(FLERR, e.what()); } cutoff = deep_spin.cutoff() * 
dist_unit_cvt_factor; @@ -614,7 +614,7 @@ void PairDeepSpin::settings(int narg, char **arg) { deep_spin.init(arg[0], get_node_rank(), get_file_content(arg[0])); deep_spin_model_devi.init(models, get_node_rank(), get_file_content(models)); - } catch (deepmd_compat::deepmd_exception &e) { + } catch (deepmd_compat::deepmd_exception& e) { error->one(FLERR, e.what()); } cutoff = deep_spin_model_devi.cutoff() * dist_unit_cvt_factor; @@ -828,7 +828,7 @@ void PairDeepSpin::settings(int narg, char **arg) { set coeffs for one or more type pairs ------------------------------------------------------------------------- */ -void PairDeepSpin::coeff(int narg, char **arg) { +void PairDeepSpin::coeff(int narg, char** arg) { if (!allocated) { allocate(); } @@ -919,7 +919,7 @@ void PairDeepSpin::coeff(int narg, char **arg) { /* ---------------------------------------------------------------------- */ -int PairDeepSpin::pack_reverse_comm(int n, int first, double *buf) { +int PairDeepSpin::pack_reverse_comm(int n, int first, double* buf) { int i, m, last; m = 0; @@ -946,7 +946,7 @@ int PairDeepSpin::pack_reverse_comm(int n, int first, double *buf) { /* ---------------------------------------------------------------------- */ -void PairDeepSpin::unpack_reverse_comm(int n, int *list, double *buf) { +void PairDeepSpin::unpack_reverse_comm(int n, int* list, double* buf) { int i, j, m; m = 0; diff --git a/source/lmp/pair_deepspin.h b/source/lmp/pair_deepspin.h index 47d6678441..cc31db8bf5 100644 --- a/source/lmp/pair_deepspin.h +++ b/source/lmp/pair_deepspin.h @@ -42,13 +42,13 @@ class CommBrickDeepSpin : public CommBrick { }; class PairDeepSpin : public PairDeepBaseModel { public: - PairDeepSpin(class LAMMPS *); + PairDeepSpin(class LAMMPS*); ~PairDeepSpin() override; - void settings(int, char **) override; - void coeff(int, char **) override; + void settings(int, char**) override; + void coeff(int, char**) override; void compute(int, int) override; - int pack_reverse_comm(int, int, 
double *) override; - void unpack_reverse_comm(int, int *, double *) override; + int pack_reverse_comm(int, int, double*) override; + void unpack_reverse_comm(int, int*, double*) override; protected: deepmd_compat::DeepSpin deep_spin; @@ -56,7 +56,7 @@ class PairDeepSpin : public PairDeepBaseModel { std::vector > all_force_mag; private: - CommBrickDeepSpin *commdata_; + CommBrickDeepSpin* commdata_; }; } // namespace LAMMPS_NS diff --git a/source/lmp/plugin/CMakeLists.txt b/source/lmp/plugin/CMakeLists.txt index a0998b3ce9..a4e7d9e430 100644 --- a/source/lmp/plugin/CMakeLists.txt +++ b/source/lmp/plugin/CMakeLists.txt @@ -38,8 +38,54 @@ if(DEFINED LAMMPS_SOURCE_ROOT OR DEFINED LAMMPS_VERSION) # get_lammps_version # https://github.com/lammps/lammps/blob/c2a12f97c5f665852fb38fdd4922f7dd2e77a0a1/cmake/Modules/LAMMPSUtils.cmake#L27-L46 - include(${LAMMPS_SOURCE_ROOT}/cmake/Modules/LAMMPSUtils.cmake) - get_lammps_version(${LAMMPS_HEADER_DIR}/version.h LAMMPS_VERSION_NUMBER) + # include(${LAMMPS_SOURCE_ROOT}/cmake/Modules/LAMMPSUtils.cmake) Since May 15, + # 2025, the output of get_lammps_version is changed. 
We vendor the old + # get_lammps_version + # https://github.com/lammps/lammps/commit/b3e7121535863df3db487cd3e6a68c080bf2a6b4#diff-1214db0d1c015a50103f61f8ff7896053dec7ebc1edb930d6ef8bb07282f52abR75 + + function(_get_lammps_version version_header variable) + file(STRINGS ${version_header} line REGEX LAMMPS_VERSION) + set(MONTHS + x + Jan + Feb + Mar + Apr + May + Jun + Jul + Aug + Sep + Oct + Nov + Dec) + string(REGEX + REPLACE "#define LAMMPS_VERSION \"([0-9]+) ([A-Za-z]+) ([0-9]+)\"" + "\\1" day "${line}") + string(REGEX + REPLACE "#define LAMMPS_VERSION \"([0-9]+) ([A-Za-z]+) ([0-9]+)\"" + "\\2" month "${line}") + string(REGEX + REPLACE "#define LAMMPS_VERSION \"([0-9]+) ([A-Za-z]+) ([0-9]+)\"" + "\\3" year "${line}") + string(STRIP ${day} day) + string(STRIP ${month} month) + string(STRIP ${year} year) + list(FIND MONTHS "${month}" month) + string(LENGTH ${day} day_length) + string(LENGTH ${month} month_length) + if(day_length EQUAL 1) + set(day "0${day}") + endif() + if(month_length EQUAL 1) + set(month "0${month}") + endif() + set(${variable} + "${year}${month}${day}" + PARENT_SCOPE) + endfunction() + + _get_lammps_version(${LAMMPS_HEADER_DIR}/version.h LAMMPS_VERSION_NUMBER) set(LAMMPS_VERSION_NUMBER ${LAMMPS_VERSION_NUMBER} PARENT_SCOPE) @@ -75,8 +121,7 @@ if(DEFINED LAMMPS_SOURCE_ROOT OR DEFINED LAMMPS_VERSION) target_link_libraries(${libname} PUBLIC ${LIB_DEEPMD_C}) target_precompile_headers(${libname} PUBLIC [["deepmd.hpp"]]) remove_definitions(-D_GLIBCXX_USE_CXX11_ABI=${OP_CXX_ABI}) - if("$ENV{CIBUILDWHEEL}" STREQUAL "1" OR "$ENV{LMP_CXX11_ABI_0}" STREQUAL - "1") + if("$ENV{LMP_CXX11_ABI_0}" STREQUAL "1") add_definitions(-D_GLIBCXX_USE_CXX11_ABI=0) endif() else() diff --git a/source/lmp/plugin/deepmdplugin.cpp b/source/lmp/plugin/deepmdplugin.cpp index 4f62cb3944..d3b54f8e41 100644 --- a/source/lmp/plugin/deepmdplugin.cpp +++ b/source/lmp/plugin/deepmdplugin.cpp @@ -15,22 +15,22 @@ using namespace LAMMPS_NS; -static Pair *pairdeepmd(LAMMPS *lmp) { 
return new PairDeepMD(lmp); } -static Pair *pairdeepspin(LAMMPS *lmp) { return new PairDeepSpin(lmp); } +static Pair* pairdeepmd(LAMMPS* lmp) { return new PairDeepMD(lmp); } +static Pair* pairdeepspin(LAMMPS* lmp) { return new PairDeepSpin(lmp); } -static Compute *computedeepmdtensoratom(LAMMPS *lmp, int narg, char **arg) { +static Compute* computedeepmdtensoratom(LAMMPS* lmp, int narg, char** arg) { return new ComputeDeeptensorAtom(lmp, narg, arg); } -static Fix *fixdplr(LAMMPS *lmp, int narg, char **arg) { +static Fix* fixdplr(LAMMPS* lmp, int narg, char** arg) { return new FixDPLR(lmp, narg, arg); } #if LAMMPS_VERSION_NUMBER >= 20220328 -static KSpace *pppmdplr(LAMMPS *lmp) { return new PPPMDPLR(lmp); } +static KSpace* pppmdplr(LAMMPS* lmp) { return new PPPMDPLR(lmp); } #endif -extern "C" void lammpsplugin_init(void *lmp, void *handle, void *regfunc) { +extern "C" void lammpsplugin_init(void* lmp, void* handle, void* regfunc) { lammpsplugin_t plugin; lammpsplugin_regfunc register_plugin = (lammpsplugin_regfunc)regfunc; @@ -39,7 +39,7 @@ extern "C" void lammpsplugin_init(void *lmp, void *handle, void *regfunc) { plugin.name = "deepmd"; plugin.info = "deepmd pair style " STR_GIT_SUMM; plugin.author = "Han Wang"; - plugin.creator.v1 = (lammpsplugin_factory1 *)&pairdeepmd; + plugin.creator.v1 = (lammpsplugin_factory1*)&pairdeepmd; plugin.handle = handle; (*register_plugin)(&plugin, lmp); @@ -48,7 +48,7 @@ extern "C" void lammpsplugin_init(void *lmp, void *handle, void *regfunc) { plugin.name = "deepspin"; plugin.info = "deepspin pair style " STR_GIT_SUMM; plugin.author = "Duo Zhang"; - plugin.creator.v1 = (lammpsplugin_factory1 *)&pairdeepspin; + plugin.creator.v1 = (lammpsplugin_factory1*)&pairdeepspin; plugin.handle = handle; (*register_plugin)(&plugin, lmp); @@ -56,14 +56,14 @@ extern "C" void lammpsplugin_init(void *lmp, void *handle, void *regfunc) { plugin.name = "deeptensor/atom"; plugin.info = "compute deeptensor/atom " STR_GIT_SUMM; plugin.author = "Han 
Wang"; - plugin.creator.v2 = (lammpsplugin_factory2 *)&computedeepmdtensoratom; + plugin.creator.v2 = (lammpsplugin_factory2*)&computedeepmdtensoratom; (*register_plugin)(&plugin, lmp); plugin.style = "fix"; plugin.name = "dplr"; plugin.info = "fix dplr " STR_GIT_SUMM; plugin.author = "Han Wang"; - plugin.creator.v2 = (lammpsplugin_factory2 *)&fixdplr; + plugin.creator.v2 = (lammpsplugin_factory2*)&fixdplr; (*register_plugin)(&plugin, lmp); #if LAMMPS_VERSION_NUMBER >= 20220328 @@ -72,7 +72,7 @@ extern "C" void lammpsplugin_init(void *lmp, void *handle, void *regfunc) { plugin.name = "pppm/dplr"; plugin.info = "kspace pppm/dplr " STR_GIT_SUMM; plugin.author = "Han Wang"; - plugin.creator.v1 = (lammpsplugin_factory1 *)&pppmdplr; + plugin.creator.v1 = (lammpsplugin_factory1*)&pppmdplr; (*register_plugin)(&plugin, lmp); #endif } diff --git a/source/lmp/pppm_dplr.cpp b/source/lmp/pppm_dplr.cpp index e1bdb828af..3597a31548 100644 --- a/source/lmp/pppm_dplr.cpp +++ b/source/lmp/pppm_dplr.cpp @@ -36,10 +36,10 @@ enum { FORWARD_IK, FORWARD_AD, FORWARD_IK_PERATOM, FORWARD_AD_PERATOM }; #if LAMMPS_VERSION_NUMBER < 20181109 // See lammps/lammps#1165 -PPPMDPLR::PPPMDPLR(LAMMPS *lmp, int narg, char **arg) +PPPMDPLR::PPPMDPLR(LAMMPS* lmp, int narg, char** arg) : PPPM(lmp, narg, arg) #else -PPPMDPLR::PPPMDPLR(LAMMPS *lmp) +PPPMDPLR::PPPMDPLR(LAMMPS* lmp) : PPPM(lmp) #endif { @@ -232,7 +232,7 @@ void PPPMDPLR::compute(int eflag, int vflag) { // ntotal accounts for TIP4P tallying eatom/vatom for ghost atoms if (evflag_atom) { - double *q = atom->q; + double* q = atom->q; int nlocal = atom->nlocal; int ntotal = nlocal; if (tip4pflag) { @@ -288,8 +288,8 @@ void PPPMDPLR::fieldforce_ik() { // (mx,my,mz) = global coords of moving stencil pt // ek = 3 components of E-field on particle - double *q = atom->q; - double **x = atom->x; + double* q = atom->q; + double** x = atom->x; // double **f = atom->f; int nlocal = atom->nlocal; @@ -347,7 +347,7 @@ void PPPMDPLR::fieldforce_ad() { 
FFT_SCALAR ekx, eky, ekz; double s1, s2, s3; double sf = 0.0; - double *prd; + double* prd; prd = domain->prd; double xprd = prd[0]; @@ -364,8 +364,8 @@ void PPPMDPLR::fieldforce_ad() { // (mx,my,mz) = global coords of moving stencil pt // ek = 3 components of E-field on particle - double *q = atom->q; - double **x = atom->x; + double* q = atom->q; + double** x = atom->x; // double **f = atom->f; int nlocal = atom->nlocal; diff --git a/source/lmp/pppm_dplr.h b/source/lmp/pppm_dplr.h index b7e221c686..79a9a9ce37 100644 --- a/source/lmp/pppm_dplr.h +++ b/source/lmp/pppm_dplr.h @@ -21,14 +21,14 @@ class PPPMDPLR : public PPPM { public: #if LAMMPS_VERSION_NUMBER < 20181109 // See lammps/lammps#1165 - PPPMDPLR(class LAMMPS *, int, char **); + PPPMDPLR(class LAMMPS*, int, char**); #else - PPPMDPLR(class LAMMPS *); + PPPMDPLR(class LAMMPS*); #endif ~PPPMDPLR() override {}; void init() override; - const std::vector &get_fele() const { return fele; }; - std::vector &get_fele() { return fele; } + const std::vector& get_fele() const { return fele; }; + std::vector& get_fele() { return fele; } protected: void compute(int, int) override; diff --git a/source/lmp/tests/test_deeptensor.py b/source/lmp/tests/test_deeptensor.py index 6fb7cde746..41d1c10ed6 100644 --- a/source/lmp/tests/test_deeptensor.py +++ b/source/lmp/tests/test_deeptensor.py @@ -142,7 +142,7 @@ def test_compute_deeptensor_atom(lammps) -> None: lammps.variable("tensor atom c_tensor[1]") lammps.dump("1 all custom 1 dump id c_tensor[1]") lammps.run(0) - idx_map = lammps.lmp.numpy.extract_atom("id") - 1 + idx_map = lammps.lmp.numpy.extract_atom("id")[: coord.shape[0]] - 1 assert np.array(lammps.variables["tensor"].value) == pytest.approx( expected_d[idx_map] ) @@ -155,7 +155,7 @@ def test_compute_deeptensor_atom_si(lammps_si) -> None: lammps_si.variable("tensor atom c_tensor[1]") lammps_si.dump("1 all custom 1 dump id c_tensor[1]") lammps_si.run(0) - idx_map = lammps_si.lmp.numpy.extract_atom("id") - 1 + idx_map = 
lammps_si.lmp.numpy.extract_atom("id")[: coord.shape[0]] - 1 assert np.array(lammps_si.variables["tensor"].value) == pytest.approx( expected_d[idx_map] * constants.dist_metal2si ) diff --git a/source/lmp/tests/test_dplr.py b/source/lmp/tests/test_dplr.py index 21d1f18658..bf8783f233 100644 --- a/source/lmp/tests/test_dplr.py +++ b/source/lmp/tests/test_dplr.py @@ -357,7 +357,7 @@ def test_pair_deepmd_sr(lammps) -> None: lammps.pair_coeff("* *") lammps.run(0) assert lammps.eval("pe") == pytest.approx(expected_e_sr) - id_list = lammps.lmp.numpy.extract_atom("id") + id_list = lammps.lmp.numpy.extract_atom("id")[: coord.shape[0]] for ii in range(6): assert lammps.atoms[np.where(id_list == (ii + 1))[0][0]].force == pytest.approx( expected_f_sr[ii] @@ -378,7 +378,7 @@ def test_pair_deepmd_sr_virial(lammps) -> None: ) lammps.dump_modify("1 sort id") lammps.run(0) - id_list = lammps.lmp.numpy.extract_atom("id") + id_list = lammps.lmp.numpy.extract_atom("id")[: coord.shape[0]] idx_list = [np.where(id_list == i)[0][0] for i in range(1, 7)] assert lammps.eval("pe") == pytest.approx(expected_e_sr) for ii in range(6): @@ -445,7 +445,7 @@ def test_pair_deepmd_lr_efield_constant(lammps) -> None: ) lammps.fix_modify("0 energy yes virial yes") lammps.run(0) - id_list = lammps.lmp.numpy.extract_atom("id") + id_list = lammps.lmp.numpy.extract_atom("id")[: coord.shape[0]] assert lammps.eval("evdwl") == pytest.approx(expected_evdwl_lr_efield_constant) assert lammps.eval("f_0") == pytest.approx(expected_e_efield_constant) assert lammps.eval("pe") == pytest.approx(expected_e_lr_efield_constant) @@ -481,7 +481,7 @@ def test_pair_deepmd_lr_efield_variable(lammps) -> None: ) lammps.fix_modify("0 energy yes virial yes") lammps.run(0) - id_list = lammps.lmp.numpy.extract_atom("id") + id_list = lammps.lmp.numpy.extract_atom("id")[: coord.shape[0]] assert lammps.eval("evdwl") == pytest.approx(expected_evdwl_lr_efield_variable) assert lammps.eval("f_0") == 
pytest.approx(expected_e_efield_variable) assert lammps.eval("pe") == pytest.approx(expected_e_lr_efield_variable) diff --git a/source/lmp/tests/test_lammps.py b/source/lmp/tests/test_lammps.py index ad8f8cdaac..c24f032cf6 100644 --- a/source/lmp/tests/test_lammps.py +++ b/source/lmp/tests/test_lammps.py @@ -340,7 +340,7 @@ def test_pair_deepmd_virial(lammps) -> None: assert lammps.atoms[ii].force == pytest.approx( expected_f[lammps.atoms[ii].id - 1] ) - idx_map = lammps.lmp.numpy.extract_atom("id") - 1 + idx_map = lammps.lmp.numpy.extract_atom("id")[: coord.shape[0]] - 1 assert np.array(lammps.variables["eatom"].value) == pytest.approx( expected_ae[idx_map] ) @@ -408,7 +408,7 @@ def test_pair_deepmd_model_devi_virial(lammps) -> None: assert lammps.atoms[ii].force == pytest.approx( expected_f[lammps.atoms[ii].id - 1] ) - idx_map = lammps.lmp.numpy.extract_atom("id") - 1 + idx_map = lammps.lmp.numpy.extract_atom("id")[: coord.shape[0]] - 1 assert np.array(lammps.variables["eatom"].value) == pytest.approx( expected_ae[idx_map] ) @@ -545,7 +545,7 @@ def test_pair_deepmd_virial_real(lammps_real) -> None: assert lammps_real.atoms[ii].force == pytest.approx( expected_f[lammps_real.atoms[ii].id - 1] * constants.force_metal2real ) - idx_map = lammps_real.lmp.numpy.extract_atom("id") - 1 + idx_map = lammps_real.lmp.numpy.extract_atom("id")[: coord.shape[0]] - 1 for ii in range(9): assert np.array( lammps_real.variables[f"virial{ii}"].value @@ -604,7 +604,7 @@ def test_pair_deepmd_model_devi_virial_real(lammps_real) -> None: assert lammps_real.atoms[ii].force == pytest.approx( expected_f[lammps_real.atoms[ii].id - 1] * constants.force_metal2real ) - idx_map = lammps_real.lmp.numpy.extract_atom("id") - 1 + idx_map = lammps_real.lmp.numpy.extract_atom("id")[: coord.shape[0]] - 1 for ii in range(9): assert np.array( lammps_real.variables[f"virial{ii}"].value diff --git a/source/lmp/tests/test_lammps_3types.py b/source/lmp/tests/test_lammps_3types.py index f0cbe19ddf..a99a83b758 
100644 --- a/source/lmp/tests/test_lammps_3types.py +++ b/source/lmp/tests/test_lammps_3types.py @@ -320,7 +320,7 @@ def test_pair_deepmd_virial(lammps) -> None: assert lammps.atoms[ii].force == pytest.approx( expected_f[lammps.atoms[ii].id - 1] ) - idx_map = lammps.lmp.numpy.extract_atom("id") - 1 + idx_map = lammps.lmp.numpy.extract_atom("id")[: coord.shape[0]] - 1 for ii in range(9): assert np.array( lammps.variables[f"virial{ii}"].value @@ -371,7 +371,7 @@ def test_pair_deepmd_model_devi_virial(lammps) -> None: assert lammps.atoms[ii].force == pytest.approx( expected_f[lammps.atoms[ii].id - 1] ) - idx_map = lammps.lmp.numpy.extract_atom("id") - 1 + idx_map = lammps.lmp.numpy.extract_atom("id")[: coord.shape[0]] - 1 for ii in range(9): assert np.array( lammps.variables[f"virial{ii}"].value diff --git a/source/lmp/tests/test_lammps_dpa_jax.py b/source/lmp/tests/test_lammps_dpa_jax.py index 4867b5f84e..65991b9732 100644 --- a/source/lmp/tests/test_lammps_dpa_jax.py +++ b/source/lmp/tests/test_lammps_dpa_jax.py @@ -334,7 +334,7 @@ def test_pair_deepmd_virial(lammps): assert lammps.atoms[ii].force == pytest.approx( expected_f[lammps.atoms[ii].id - 1] ) - idx_map = lammps.lmp.numpy.extract_atom("id") - 1 + idx_map = lammps.lmp.numpy.extract_atom("id")[: coord.shape[0]] - 1 for ii in range(9): assert np.array( lammps.variables[f"virial{ii}"].value @@ -385,7 +385,7 @@ def test_pair_deepmd_model_devi_virial(lammps): assert lammps.atoms[ii].force == pytest.approx( expected_f[lammps.atoms[ii].id - 1] ) - idx_map = lammps.lmp.numpy.extract_atom("id") - 1 + idx_map = lammps.lmp.numpy.extract_atom("id")[: coord.shape[0]] - 1 for ii in range(9): assert np.array( lammps.variables[f"virial{ii}"].value @@ -511,7 +511,7 @@ def test_pair_deepmd_virial_real(lammps_real): assert lammps_real.atoms[ii].force == pytest.approx( expected_f[lammps_real.atoms[ii].id - 1] * constants.force_metal2real ) - idx_map = lammps_real.lmp.numpy.extract_atom("id") - 1 + idx_map = 
lammps_real.lmp.numpy.extract_atom("id")[: coord.shape[0]] - 1 for ii in range(9): assert np.array( lammps_real.variables[f"virial{ii}"].value @@ -570,7 +570,7 @@ def test_pair_deepmd_model_devi_virial_real(lammps_real): assert lammps_real.atoms[ii].force == pytest.approx( expected_f[lammps_real.atoms[ii].id - 1] * constants.force_metal2real ) - idx_map = lammps_real.lmp.numpy.extract_atom("id") - 1 + idx_map = lammps_real.lmp.numpy.extract_atom("id")[: coord.shape[0]] - 1 for ii in range(9): assert np.array( lammps_real.variables[f"virial{ii}"].value diff --git a/source/lmp/tests/test_lammps_dpa_pt.py b/source/lmp/tests/test_lammps_dpa_pt.py index e66b93e09e..2768332c71 100644 --- a/source/lmp/tests/test_lammps_dpa_pt.py +++ b/source/lmp/tests/test_lammps_dpa_pt.py @@ -330,7 +330,7 @@ def test_pair_deepmd_virial(lammps) -> None: assert lammps.atoms[ii].force == pytest.approx( expected_f[lammps.atoms[ii].id - 1] ) - idx_map = lammps.lmp.numpy.extract_atom("id") - 1 + idx_map = lammps.lmp.numpy.extract_atom("id")[: coord.shape[0]] - 1 for ii in range(9): assert np.array( lammps.variables[f"virial{ii}"].value @@ -381,7 +381,7 @@ def test_pair_deepmd_model_devi_virial(lammps) -> None: assert lammps.atoms[ii].force == pytest.approx( expected_f[lammps.atoms[ii].id - 1] ) - idx_map = lammps.lmp.numpy.extract_atom("id") - 1 + idx_map = lammps.lmp.numpy.extract_atom("id")[: coord.shape[0]] - 1 for ii in range(9): assert np.array( lammps.variables[f"virial{ii}"].value @@ -507,7 +507,7 @@ def test_pair_deepmd_virial_real(lammps_real) -> None: assert lammps_real.atoms[ii].force == pytest.approx( expected_f[lammps_real.atoms[ii].id - 1] * constants.force_metal2real ) - idx_map = lammps_real.lmp.numpy.extract_atom("id") - 1 + idx_map = lammps_real.lmp.numpy.extract_atom("id")[: coord.shape[0]] - 1 for ii in range(9): assert np.array( lammps_real.variables[f"virial{ii}"].value @@ -566,7 +566,7 @@ def test_pair_deepmd_model_devi_virial_real(lammps_real) -> None: assert 
lammps_real.atoms[ii].force == pytest.approx( expected_f[lammps_real.atoms[ii].id - 1] * constants.force_metal2real ) - idx_map = lammps_real.lmp.numpy.extract_atom("id") - 1 + idx_map = lammps_real.lmp.numpy.extract_atom("id")[: coord.shape[0]] - 1 for ii in range(9): assert np.array( lammps_real.variables[f"virial{ii}"].value diff --git a/source/lmp/tests/test_lammps_dpa_pt_nopbc.py b/source/lmp/tests/test_lammps_dpa_pt_nopbc.py index 563650c714..1c2e145c84 100644 --- a/source/lmp/tests/test_lammps_dpa_pt_nopbc.py +++ b/source/lmp/tests/test_lammps_dpa_pt_nopbc.py @@ -328,7 +328,7 @@ def test_pair_deepmd_virial(lammps) -> None: assert lammps.atoms[ii].force == pytest.approx( expected_f[lammps.atoms[ii].id - 1] ) - idx_map = lammps.lmp.numpy.extract_atom("id") - 1 + idx_map = lammps.lmp.numpy.extract_atom("id")[: coord.shape[0]] - 1 for ii in range(9): assert np.array( lammps.variables[f"virial{ii}"].value @@ -379,7 +379,7 @@ def test_pair_deepmd_model_devi_virial(lammps) -> None: assert lammps.atoms[ii].force == pytest.approx( expected_f[lammps.atoms[ii].id - 1] ) - idx_map = lammps.lmp.numpy.extract_atom("id") - 1 + idx_map = lammps.lmp.numpy.extract_atom("id")[: coord.shape[0]] - 1 for ii in range(9): assert np.array( lammps.variables[f"virial{ii}"].value @@ -505,7 +505,7 @@ def test_pair_deepmd_virial_real(lammps_real) -> None: assert lammps_real.atoms[ii].force == pytest.approx( expected_f[lammps_real.atoms[ii].id - 1] * constants.force_metal2real ) - idx_map = lammps_real.lmp.numpy.extract_atom("id") - 1 + idx_map = lammps_real.lmp.numpy.extract_atom("id")[: coord.shape[0]] - 1 for ii in range(9): assert np.array( lammps_real.variables[f"virial{ii}"].value @@ -564,7 +564,7 @@ def test_pair_deepmd_model_devi_virial_real(lammps_real) -> None: assert lammps_real.atoms[ii].force == pytest.approx( expected_f[lammps_real.atoms[ii].id - 1] * constants.force_metal2real ) - idx_map = lammps_real.lmp.numpy.extract_atom("id") - 1 + idx_map = 
lammps_real.lmp.numpy.extract_atom("id")[: coord.shape[0]] - 1 for ii in range(9): assert np.array( lammps_real.variables[f"virial{ii}"].value diff --git a/source/lmp/tests/test_lammps_dpa_sel_pt.py b/source/lmp/tests/test_lammps_dpa_sel_pt.py index 9ff2883fc1..e758251f18 100644 --- a/source/lmp/tests/test_lammps_dpa_sel_pt.py +++ b/source/lmp/tests/test_lammps_dpa_sel_pt.py @@ -333,7 +333,7 @@ def test_pair_deepmd_virial(lammps) -> None: assert lammps.atoms[ii].force == pytest.approx( expected_f[lammps.atoms[ii].id - 1] ) - idx_map = lammps.lmp.numpy.extract_atom("id") - 1 + idx_map = lammps.lmp.numpy.extract_atom("id")[: coord.shape[0]] - 1 for ii in range(9): assert np.array( lammps.variables[f"virial{ii}"].value @@ -384,7 +384,7 @@ def test_pair_deepmd_model_devi_virial(lammps) -> None: assert lammps.atoms[ii].force == pytest.approx( expected_f[lammps.atoms[ii].id - 1] ) - idx_map = lammps.lmp.numpy.extract_atom("id") - 1 + idx_map = lammps.lmp.numpy.extract_atom("id")[: coord.shape[0]] - 1 for ii in range(9): assert np.array( lammps.variables[f"virial{ii}"].value @@ -510,7 +510,7 @@ def test_pair_deepmd_virial_real(lammps_real) -> None: assert lammps_real.atoms[ii].force == pytest.approx( expected_f[lammps_real.atoms[ii].id - 1] * constants.force_metal2real ) - idx_map = lammps_real.lmp.numpy.extract_atom("id") - 1 + idx_map = lammps_real.lmp.numpy.extract_atom("id")[: coord.shape[0]] - 1 for ii in range(9): assert np.array( lammps_real.variables[f"virial{ii}"].value @@ -569,7 +569,7 @@ def test_pair_deepmd_model_devi_virial_real(lammps_real) -> None: assert lammps_real.atoms[ii].force == pytest.approx( expected_f[lammps_real.atoms[ii].id - 1] * constants.force_metal2real ) - idx_map = lammps_real.lmp.numpy.extract_atom("id") - 1 + idx_map = lammps_real.lmp.numpy.extract_atom("id")[: coord.shape[0]] - 1 for ii in range(9): assert np.array( lammps_real.variables[f"virial{ii}"].value diff --git a/source/lmp/tests/test_lammps_faparam.py 
b/source/lmp/tests/test_lammps_faparam.py index 4206aa68fb..4f744119b6 100644 --- a/source/lmp/tests/test_lammps_faparam.py +++ b/source/lmp/tests/test_lammps_faparam.py @@ -213,7 +213,7 @@ def test_pair_deepmd_virial(lammps) -> None: assert lammps.atoms[ii].force == pytest.approx( expected_f[lammps.atoms[ii].id - 1] ) - idx_map = lammps.lmp.numpy.extract_atom("id") - 1 + idx_map = lammps.lmp.numpy.extract_atom("id")[: coord.shape[0]] - 1 for ii in range(9): assert np.array( lammps.variables[f"virial{ii}"].value diff --git a/source/lmp/tests/test_lammps_jax.py b/source/lmp/tests/test_lammps_jax.py index 5d88cfca12..0c488cd1bc 100644 --- a/source/lmp/tests/test_lammps_jax.py +++ b/source/lmp/tests/test_lammps_jax.py @@ -332,7 +332,7 @@ def test_pair_deepmd_virial(lammps): assert lammps.atoms[ii].force == pytest.approx( expected_f[lammps.atoms[ii].id - 1] ) - idx_map = lammps.lmp.numpy.extract_atom("id") - 1 + idx_map = lammps.lmp.numpy.extract_atom("id")[: coord.shape[0]] - 1 for ii in range(9): assert np.array( lammps.variables[f"virial{ii}"].value @@ -383,7 +383,7 @@ def test_pair_deepmd_model_devi_virial(lammps): assert lammps.atoms[ii].force == pytest.approx( expected_f[lammps.atoms[ii].id - 1] ) - idx_map = lammps.lmp.numpy.extract_atom("id") - 1 + idx_map = lammps.lmp.numpy.extract_atom("id")[: coord.shape[0]] - 1 for ii in range(9): assert np.array( lammps.variables[f"virial{ii}"].value @@ -509,7 +509,7 @@ def test_pair_deepmd_virial_real(lammps_real): assert lammps_real.atoms[ii].force == pytest.approx( expected_f[lammps_real.atoms[ii].id - 1] * constants.force_metal2real ) - idx_map = lammps_real.lmp.numpy.extract_atom("id") - 1 + idx_map = lammps_real.lmp.numpy.extract_atom("id")[: coord.shape[0]] - 1 for ii in range(9): assert np.array( lammps_real.variables[f"virial{ii}"].value @@ -568,7 +568,7 @@ def test_pair_deepmd_model_devi_virial_real(lammps_real): assert lammps_real.atoms[ii].force == pytest.approx( expected_f[lammps_real.atoms[ii].id - 1] * 
constants.force_metal2real ) - idx_map = lammps_real.lmp.numpy.extract_atom("id") - 1 + idx_map = lammps_real.lmp.numpy.extract_atom("id")[: coord.shape[0]] - 1 for ii in range(9): assert np.array( lammps_real.variables[f"virial{ii}"].value diff --git a/source/lmp/tests/test_lammps_pd.py b/source/lmp/tests/test_lammps_pd.py index 31ee2e482a..92b00aba29 100644 --- a/source/lmp/tests/test_lammps_pd.py +++ b/source/lmp/tests/test_lammps_pd.py @@ -333,7 +333,7 @@ def test_pair_deepmd_virial(lammps): assert lammps.atoms[ii].force == pytest.approx( expected_f[lammps.atoms[ii].id - 1] ) - idx_map = lammps.lmp.numpy.extract_atom("id") - 1 + idx_map = lammps.lmp.numpy.extract_atom("id")[: coord.shape[0]] - 1 for ii in range(9): assert np.array( lammps.variables[f"virial{ii}"].value @@ -384,7 +384,7 @@ def test_pair_deepmd_model_devi_virial(lammps): assert lammps.atoms[ii].force == pytest.approx( expected_f[lammps.atoms[ii].id - 1], RTOL, ATOL ) - idx_map = lammps.lmp.numpy.extract_atom("id") - 1 + idx_map = lammps.lmp.numpy.extract_atom("id")[: coord.shape[0]] - 1 for ii in range(9): assert np.array( lammps.variables[f"virial{ii}"].value @@ -516,7 +516,7 @@ def test_pair_deepmd_virial_real(lammps_real): RTOL, ATOL, ) - idx_map = lammps_real.lmp.numpy.extract_atom("id") - 1 + idx_map = lammps_real.lmp.numpy.extract_atom("id")[: coord.shape[0]] - 1 for ii in range(9): assert np.array( lammps_real.variables[f"virial{ii}"].value @@ -581,7 +581,7 @@ def test_pair_deepmd_model_devi_virial_real(lammps_real): RTOL, ATOL, ) - idx_map = lammps_real.lmp.numpy.extract_atom("id") - 1 + idx_map = lammps_real.lmp.numpy.extract_atom("id")[: coord.shape[0]] - 1 for ii in range(9): assert np.array( lammps_real.variables[f"virial{ii}"].value diff --git a/source/lmp/tests/test_lammps_pt.py b/source/lmp/tests/test_lammps_pt.py index 9aed014b62..f675b2b671 100644 --- a/source/lmp/tests/test_lammps_pt.py +++ b/source/lmp/tests/test_lammps_pt.py @@ -330,7 +330,7 @@ def 
test_pair_deepmd_virial(lammps) -> None: assert lammps.atoms[ii].force == pytest.approx( expected_f[lammps.atoms[ii].id - 1] ) - idx_map = lammps.lmp.numpy.extract_atom("id") - 1 + idx_map = lammps.lmp.numpy.extract_atom("id")[: coord.shape[0]] - 1 for ii in range(9): assert np.array( lammps.variables[f"virial{ii}"].value @@ -381,7 +381,7 @@ def test_pair_deepmd_model_devi_virial(lammps) -> None: assert lammps.atoms[ii].force == pytest.approx( expected_f[lammps.atoms[ii].id - 1] ) - idx_map = lammps.lmp.numpy.extract_atom("id") - 1 + idx_map = lammps.lmp.numpy.extract_atom("id")[: coord.shape[0]] - 1 for ii in range(9): assert np.array( lammps.variables[f"virial{ii}"].value @@ -507,7 +507,7 @@ def test_pair_deepmd_virial_real(lammps_real) -> None: assert lammps_real.atoms[ii].force == pytest.approx( expected_f[lammps_real.atoms[ii].id - 1] * constants.force_metal2real ) - idx_map = lammps_real.lmp.numpy.extract_atom("id") - 1 + idx_map = lammps_real.lmp.numpy.extract_atom("id")[: coord.shape[0]] - 1 for ii in range(9): assert np.array( lammps_real.variables[f"virial{ii}"].value @@ -566,7 +566,7 @@ def test_pair_deepmd_model_devi_virial_real(lammps_real) -> None: assert lammps_real.atoms[ii].force == pytest.approx( expected_f[lammps_real.atoms[ii].id - 1] * constants.force_metal2real ) - idx_map = lammps_real.lmp.numpy.extract_atom("id") - 1 + idx_map = lammps_real.lmp.numpy.extract_atom("id")[: coord.shape[0]] - 1 for ii in range(9): assert np.array( lammps_real.variables[f"virial{ii}"].value diff --git a/source/lmp/tests/test_lammps_spin.py b/source/lmp/tests/test_lammps_spin.py index 39e12b03fc..9ab7271f5f 100644 --- a/source/lmp/tests/test_lammps_spin.py +++ b/source/lmp/tests/test_lammps_spin.py @@ -172,7 +172,7 @@ def test_pair_deepmd_virial(lammps) -> None: assert lammps.atoms[ii].force == pytest.approx( expected_f[lammps.atoms[ii].id - 1] ) - idx_map = lammps.lmp.numpy.extract_atom("id") - 1 + idx_map = lammps.lmp.numpy.extract_atom("id")[: coord.shape[0]] - 
1 assert np.array(lammps.variables["eatom"].value) == pytest.approx( expected_ae[idx_map] ) diff --git a/source/lmp/tests/test_lammps_spin_pt.py b/source/lmp/tests/test_lammps_spin_pt.py index e15a10ee72..9a0771d047 100644 --- a/source/lmp/tests/test_lammps_spin_pt.py +++ b/source/lmp/tests/test_lammps_spin_pt.py @@ -168,7 +168,7 @@ def test_pair_deepmd_virial(lammps) -> None: assert lammps.atoms[ii].force == pytest.approx( expected_f[lammps.atoms[ii].id - 1] ) - idx_map = lammps.lmp.numpy.extract_atom("id") - 1 + idx_map = lammps.lmp.numpy.extract_atom("id")[: coord.shape[0]] - 1 assert np.array(lammps.variables["eatom"].value) == pytest.approx( expected_ae[idx_map] ) diff --git a/source/op/pd/CMakeLists.txt b/source/op/pd/CMakeLists.txt new file mode 100644 index 0000000000..194ceb4061 --- /dev/null +++ b/source/op/pd/CMakeLists.txt @@ -0,0 +1,72 @@ +file(GLOB OP_SRC comm.cc) + +add_library(deepmd_op_pd SHARED ${OP_SRC}) + +if(NOT DEFINED PADDLE_INFERENCE_DIR) + message( + FATAL_ERROR + "please set PADDLE_INFERENCE_DIR with -DPADDLE_INFERENCE_DIR=/path/to/paddle_inference" + ) +endif() + +set(PADDLE_INFERENCE_DIR_THIRD_PARTY_PATH + ${PADDLE_INFERENCE_DIR}/third_party/install) +include_directories(${PADDLE_INFERENCE_DIR}) +include_directories(${PADDLE_INFERENCE_DIR}/paddle/include) +include_directories(${PADDLE_INFERENCE_DIR_THIRD_PARTY_PATH}/protobuf/include) +include_directories(${PADDLE_INFERENCE_DIR_THIRD_PARTY_PATH}/glog/include) +include_directories(${PADDLE_INFERENCE_DIR_THIRD_PARTY_PATH}/gflags/include) +include_directories(${PADDLE_INFERENCE_DIR_THIRD_PARTY_PATH}/xxhash/include) + +set(PADDLE_INFER_LIB ${PADDLE_INFERENCE_DIR}/paddle/lib/libpaddle_inference.so) +set(MKLML_LIB + ${PADDLE_INFERENCE_DIR_THIRD_PARTY_PATH}/mklml/lib/libmklml_intel.so + ${PADDLE_INFERENCE_DIR_THIRD_PARTY_PATH}/mklml/lib/libiomp5.so) + +target_link_libraries(deepmd_op_pd PRIVATE ${LIB_DEEPMD}) + +if(APPLE) + set_target_properties(deepmd_op_pd PROPERTIES INSTALL_RPATH 
"@loader_path") +else() + set_target_properties(deepmd_op_pd PROPERTIES INSTALL_RPATH "$ORIGIN") +endif() + +find_package(MPI) +if(MPI_FOUND) + include(CheckCXXSymbolExists) + set(CMAKE_REQUIRED_INCLUDES ${MPI_CXX_INCLUDE_DIRS}) + set(CMAKE_REQUIRED_LIBRARIES ${MPI_CXX_LIBRARIES}) + check_cxx_symbol_exists(MPIX_Query_cuda_support "mpi.h" CUDA_AWARE) + if(NOT CUDA_AWARE) + check_cxx_symbol_exists(MPIX_Query_cuda_support "mpi.h;mpi-ext.h" OMP_CUDA) + if(NOT OMP_CUDA) + target_compile_definitions(deepmd_op_pd PRIVATE NO_CUDA_AWARE) + endif() + endif() + target_link_libraries(deepmd_op_pd PRIVATE MPI::MPI_CXX) + target_compile_definitions(deepmd_op_pd PRIVATE USE_MPI) +endif() + +if(CMAKE_TESTING_ENABLED) + target_link_libraries(deepmd_op_pd PRIVATE coverage_config) +endif() + +target_link_libraries( + deepmd_op_pd + PRIVATE ${PADDLE_INFERENCE_DIR_THIRD_PARTY_PATH}/glog/lib/libglog.a + ${PADDLE_INFERENCE_DIR_THIRD_PARTY_PATH}/gflags/lib/libgflags.a + ${PADDLE_INFERENCE_DIR_THIRD_PARTY_PATH}/xxhash/lib/libxxhash.a + ${PADDLE_INFERENCE_DIR_THIRD_PARTY_PATH}/protobuf/lib/libprotobuf.a + ${PADDLE_INFER_LIB} + ${MKLML_LIB} + dl + pthread) + +if(BUILD_PY_IF) + install(TARGETS deepmd_op_pd DESTINATION deepmd/lib/) +else(BUILD_PY_IF) + install( + TARGETS deepmd_op_pd + EXPORT DeePMDTargets + DESTINATION lib/) +endif(BUILD_PY_IF) diff --git a/source/op/pd/comm.cc b/source/op/pd/comm.cc new file mode 100644 index 0000000000..548e5db83a --- /dev/null +++ b/source/op/pd/comm.cc @@ -0,0 +1,479 @@ +// SPDX-License-Identifier: LGPL-3.0-or-later + +#ifdef USE_MPI +#include +#ifdef OMPI_MPI_H +#include +#endif +#endif +#include + +#include "paddle/extension.h" + +#if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM) +#include "device.h" +#endif + +#ifdef USE_MPI +template +static MPI_Datatype get_mpi_type(); + +template <> +MPI_Datatype get_mpi_type() { + return MPI_FLOAT; +} + +template <> +MPI_Datatype get_mpi_type() { + return MPI_DOUBLE; +} +#endif + +#ifdef USE_MPI +static void 
unpack_communicator(const paddle::Tensor& communicator_tensor, + MPI_Comm& mpi_comm) { +#ifdef OMPI_MPI_H + const int64_t* communicator = communicator_tensor.data(); +#else + const int64_t* ptr = communicator_tensor.data(); + const int* communicator = reinterpret_cast(ptr); +#endif + mpi_comm = reinterpret_cast(*communicator); +} +#endif + +template +void Border_forward_t(const paddle::Tensor& sendlist_tensor, + const paddle::Tensor& sendproc_tensor, + const paddle::Tensor& recvproc_tensor, + const paddle::Tensor& sendnum_tensor, + const paddle::Tensor& recvnum_tensor, + paddle::Tensor& g1, + const paddle::Tensor& communicator_tensor, + const paddle::Tensor& nlocal_tensor, + const paddle::Tensor& nghost_tensor) { + int64_t send_list_len = sendlist_tensor.numel(); + + paddle::Tensor cpu_sendlist = paddle::empty( + {send_list_len}, paddle::DataType::INT64, paddle::CPUPlace()); + cpu_sendlist.copy_(sendlist_tensor, paddle::CPUPlace(), true); + int64_t* sendlist = cpu_sendlist.data(); + + int nswap = sendproc_tensor.dims()[0]; + + paddle::Tensor cpu_sendproc = + paddle::empty({nswap}, paddle::DataType::INT32, paddle::CPUPlace()); + cpu_sendproc.copy_(sendproc_tensor, paddle::CPUPlace(), true); + int* sendproc = cpu_sendproc.data(); + + paddle::Tensor cpu_recvproc = + paddle::empty({nswap}, paddle::DataType::INT32, paddle::CPUPlace()); + cpu_recvproc.copy_(recvproc_tensor, paddle::CPUPlace(), true); + int* recvproc = cpu_recvproc.data(); + + paddle::Tensor cpu_sendnum = + paddle::empty({nswap}, paddle::DataType::INT32, paddle::CPUPlace()); + cpu_sendnum.copy_(sendnum_tensor, paddle::CPUPlace(), true); + int* sendnum = cpu_sendnum.data(); + + paddle::Tensor cpu_recvnum = + paddle::empty({nswap}, paddle::DataType::INT32, paddle::CPUPlace()); + cpu_recvnum.copy_(recvnum_tensor, paddle::CPUPlace(), true); + int* recvnum = cpu_recvnum.data(); + + int tensor_size = g1.dims()[1]; + + paddle::Tensor cpu_nlocal = + paddle::empty({nswap}, paddle::DataType::INT32, 
paddle::CPUPlace()); + cpu_nlocal.copy_(nlocal_tensor, paddle::CPUPlace(), true); + int nlocal = *(cpu_nlocal.data()); + + paddle::Tensor cpu_nghost = + paddle::empty({nswap}, paddle::DataType::INT32, paddle::CPUPlace()); + cpu_nghost.copy_(nghost_tensor, paddle::CPUPlace(), true); + int nghost = *(cpu_nghost.data()); + + int ntotal = nlocal + nghost; + + paddle::Tensor recv_g1_tensor = g1; + +#ifdef USE_MPI + // MPI initialization check + int mpi_init = 0; + MPI_Initialized(&mpi_init); + int cuda_aware = 1; + int me = 0; + MPI_Comm world; + int world_size = 0; + + if (mpi_init) { + unpack_communicator(communicator_tensor, world); + MPI_Comm_rank(world, &me); + MPI_Comm_size(world, &world_size); + } + + MPI_Datatype mpi_type = get_mpi_type(); + MPI_Request request; + +#if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM) + if (world_size >= 1) { + int version, subversion; + MPI_Get_version(&version, &subversion); + if (version >= 4) { +#ifdef NO_CUDA_AWARE + cuda_aware = 0; +#else + cuda_aware = MPIX_Query_cuda_support(); +#endif + } else { + cuda_aware = 0; + } + + if (cuda_aware == 0) { + recv_g1_tensor = paddle::empty_like(g1, g1.dtype(), paddle::CPUPlace()); + recv_g1_tensor.copy_(g1, recv_g1_tensor.place(), true); + } + } +#endif + +#endif // USE_MPI + FPTYPE* recv_g1 = recv_g1_tensor.data() + nlocal * tensor_size; + + for (int iswap = 0; iswap < nswap; ++iswap) { + int nrecv = recvnum[iswap]; + int nsend = sendnum[iswap]; + paddle::Tensor isendlist; + paddle::Tensor send_g1_tensor; + FPTYPE* send_g1 = nullptr; + + if (nsend != 0) { + std::intptr_t addr = static_cast(sendlist[iswap]); + int* isendlist_ptr = reinterpret_cast(addr); + isendlist = + paddle::from_blob(isendlist_ptr, {nsend}, paddle::DataType::INT32, + phi::DataLayout::NCHW, paddle::CPUPlace()) + .copy_to(recv_g1_tensor.place(), true); + send_g1_tensor = + paddle::experimental::index_select(recv_g1_tensor, isendlist, 0); + send_g1 = send_g1_tensor.data(); + } + +#ifdef USE_MPI + if 
(sendproc[iswap] != me) { + if (nrecv) { + MPI_Irecv(recv_g1, nrecv * tensor_size, mpi_type, recvproc[iswap], 0, + world, &request); + } + if (nsend) { + MPI_Send(send_g1, nsend * tensor_size, mpi_type, sendproc[iswap], 0, + world); + } + if (nrecv) { + MPI_Wait(&request, MPI_STATUS_IGNORE); + } + } else { +#endif + +#if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM) +#ifdef USE_MPI + if (cuda_aware == 0) { + memcpy(recv_g1, send_g1, + (unsigned long)nsend * tensor_size * sizeof(FPTYPE)); + } else { + gpuMemcpy(recv_g1, send_g1, + (unsigned long)nsend * tensor_size * sizeof(FPTYPE), + gpuMemcpyDeviceToDevice); + } +#else + gpuMemcpy(recv_g1, send_g1, + (unsigned long)nsend * tensor_size * sizeof(FPTYPE), + gpuMemcpyDeviceToDevice); +#endif + +#else + memcpy(recv_g1, send_g1, + (unsigned long)nsend * tensor_size * sizeof(FPTYPE)); +#endif + +#ifdef USE_MPI + } +#endif + recv_g1 += nrecv * tensor_size; + } + +#ifdef USE_MPI +#if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM) + if (cuda_aware == 0) { + g1.copy_(recv_g1_tensor, g1.place(), true); + } +#endif +#endif +} + +void Border_forward(const paddle::Tensor& sendlist_tensor, + const paddle::Tensor& sendproc_tensor, + const paddle::Tensor& recvproc_tensor, + const paddle::Tensor& sendnum_tensor, + const paddle::Tensor& recvnum_tensor, + paddle::Tensor& g1_tensor, + const paddle::Tensor& communicator_tensor, + const paddle::Tensor& nlocal_tensor, + const paddle::Tensor& nghost_tensor) { + bool type_flag = (g1_tensor.dtype() == phi::DataType::FLOAT64) ? 
true : false; + if (type_flag) { + Border_forward_t(sendlist_tensor, sendproc_tensor, recvproc_tensor, + sendnum_tensor, recvnum_tensor, g1_tensor, + communicator_tensor, nlocal_tensor, nghost_tensor); + } else { + Border_forward_t(sendlist_tensor, sendproc_tensor, recvproc_tensor, + sendnum_tensor, recvnum_tensor, g1_tensor, + communicator_tensor, nlocal_tensor, nghost_tensor); + } +} + +template +void Border_backward_t(const paddle::Tensor& sendlist_tensor, + const paddle::Tensor& sendproc_tensor, + const paddle::Tensor& recvproc_tensor, + const paddle::Tensor& sendnum_tensor, + const paddle::Tensor& recvnum_tensor, + const paddle::Tensor& g1_tensor, + const paddle::Tensor& communicator_tensor, + const paddle::Tensor& nlocal_tensor, + const paddle::Tensor& nghost_tensor, + paddle::Tensor& recv_g1_tensor_grad) { +#if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM) + gpuDeviceSynchronize(); +#endif + paddle::Tensor d_local_g1_tensor = + paddle::empty(recv_g1_tensor_grad.shape(), recv_g1_tensor_grad.dtype(), + recv_g1_tensor_grad.place()); + d_local_g1_tensor.copy_(recv_g1_tensor_grad.contiguous(), + d_local_g1_tensor.place(), true); + +#ifdef USE_MPI + int mpi_init = 0, world_size = 0, me = 0, cuda_aware = 1; + MPI_Initialized(&mpi_init); + + MPI_Comm world; + if (mpi_init) { + unpack_communicator(communicator_tensor, world); + MPI_Comm_rank(world, &me); + MPI_Comm_size(world, &world_size); + } + + auto mpi_type = get_mpi_type(); + MPI_Request request; + +#if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM) + if (world_size >= 1) { + int version, subversion; + MPI_Get_version(&version, &subversion); + + if (version >= 4) { +#ifdef NO_CUDA_AWARE + cuda_aware = 0; +#else + cuda_aware = MPIX_Query_cuda_support(); +#endif + } else { + cuda_aware = 0; + } + + if (cuda_aware == 0) { + d_local_g1_tensor = paddle::empty_like( + recv_g1_tensor_grad, recv_g1_tensor_grad.dtype(), paddle::CPUPlace()); + d_local_g1_tensor.copy_(recv_g1_tensor_grad, 
d_local_g1_tensor.place(), + true); + } + } +#endif +#endif // USE_MPI + int64_t send_list_len = sendlist_tensor.numel(); + paddle::Tensor cpu_sendlist = paddle::empty( + {send_list_len}, paddle::DataType::INT64, paddle::CPUPlace()); + cpu_sendlist.copy_(sendlist_tensor, paddle::CPUPlace(), true); + int64_t* recvlist = cpu_sendlist.data(); + + int nswap = sendproc_tensor.dims()[0]; + // swap send and recv here + paddle::Tensor cpu_recvproc = + paddle::empty({nswap}, paddle::DataType::INT32, paddle::CPUPlace()); + cpu_recvproc.copy_(recvproc_tensor, paddle::CPUPlace(), true); + int* recvproc = cpu_recvproc.data(); + + paddle::Tensor cpu_sendproc = + paddle::empty({nswap}, paddle::DataType::INT32, paddle::CPUPlace()); + cpu_sendproc.copy_(sendproc_tensor, paddle::CPUPlace(), true); + int* sendproc = cpu_sendproc.data(); + + paddle::Tensor cpu_sendnum = + paddle::empty({nswap}, paddle::DataType::INT32, paddle::CPUPlace()); + cpu_sendnum.copy_(sendnum_tensor, paddle::CPUPlace(), true); + int* recvnum = cpu_sendnum.data(); + + paddle::Tensor cpu_recvnum = + paddle::empty({nswap}, paddle::DataType::INT32, paddle::CPUPlace()); + cpu_recvnum.copy_(recvnum_tensor, paddle::CPUPlace(), true); + int* sendnum = cpu_recvnum.data(); + + FPTYPE* local_g1 = d_local_g1_tensor.data(); + int tensor_size = d_local_g1_tensor.dims()[1]; + + paddle::Tensor cpu_nlocal = + paddle::empty({1}, paddle::DataType::INT32, paddle::CPUPlace()); + cpu_nlocal.copy_(nlocal_tensor, paddle::CPUPlace(), true); + int nlocal = *cpu_nlocal.data(); + + paddle::Tensor cpu_nghost = + paddle::empty({1}, paddle::DataType::INT32, paddle::CPUPlace()); + cpu_nghost.copy_(nghost_tensor, paddle::CPUPlace(), true); + int nghost = *cpu_nghost.data(); + int ntotal = nlocal + nghost; + + paddle::Tensor send_g1_tensor, recv_g1_tensor; + FPTYPE *recv_g1 = nullptr, *send_g1 = nullptr; + + if (nswap != 0) { + send_g1_tensor = d_local_g1_tensor; + + int max_recvnum = + *(paddle::experimental::max(cpu_sendnum, {}, 
false).data()); + recv_g1_tensor = + paddle::empty({max_recvnum, tensor_size}, d_local_g1_tensor.dtype(), + d_local_g1_tensor.place()); + recv_g1 = recv_g1_tensor.data(); + send_g1 = send_g1_tensor.data() + ntotal * tensor_size; + } + + for (int iswap = nswap - 1; iswap >= 0; --iswap) { + int nrecv = recvnum[iswap]; + int nsend = sendnum[iswap]; + + paddle::Tensor irecvlist; + if (nrecv) { + std::intptr_t addr = static_cast(recvlist[iswap]); + int* irecvlist_ptr = reinterpret_cast(addr); + irecvlist = + paddle::from_blob(irecvlist_ptr, {nrecv}, paddle::DataType::INT32, + paddle::DataLayout::NCHW, paddle::CPUPlace()) + .copy_to(d_local_g1_tensor.place(), true); + } + + if (nsend) { + send_g1 -= nsend * tensor_size; + } + +#ifdef USE_MPI + if (sendproc[iswap] != me) { + if (nrecv) { + MPI_Irecv(recv_g1, nrecv * tensor_size, mpi_type, recvproc[iswap], 0, + world, &request); + } + if (nsend) { + MPI_Send(send_g1, nsend * tensor_size, mpi_type, sendproc[iswap], 0, + world); + } + if (nrecv) { + MPI_Wait(&request, MPI_STATUS_IGNORE); + } + } else { +#endif + if (nrecv) { +#if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM) +#ifdef USE_MPI + if (cuda_aware == 0) { + memcpy(recv_g1, send_g1, + (unsigned long)nrecv * tensor_size * sizeof(FPTYPE)); + } else { + gpuMemcpy(recv_g1, send_g1, + (unsigned long)nrecv * tensor_size * sizeof(FPTYPE), + gpuMemcpyDeviceToDevice); + } +#else + gpuMemcpy(recv_g1, send_g1, + (unsigned long)nrecv * tensor_size * sizeof(FPTYPE), + gpuMemcpyDeviceToDevice); +#endif +#else + memcpy(recv_g1, send_g1, + (unsigned long)nrecv * tensor_size * sizeof(FPTYPE)); +#endif + } +#ifdef USE_MPI + } +#endif + if (nrecv) { + d_local_g1_tensor = paddle::experimental::index_add_( + d_local_g1_tensor, irecvlist, recv_g1_tensor.slice(0, nrecv), 0); + } + } +#if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM) + gpuDeviceSynchronize(); +#endif + +#ifdef USE_MPI +#if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM) + if (cuda_aware == 0) { + 
recv_g1_tensor_grad.copy_(d_local_g1_tensor, recv_g1_tensor_grad.place(), + true); + } +#endif +#endif +} + +void Border_backward(const paddle::Tensor& sendlist_tensor, + const paddle::Tensor& sendproc_tensor, + const paddle::Tensor& recvproc_tensor, + const paddle::Tensor& sendnum_tensor, + const paddle::Tensor& recvnum_tensor, + const paddle::Tensor& g1_tensor, + const paddle::Tensor& communicator_tensor, + const paddle::Tensor& nlocal_tensor, + const paddle::Tensor& nghost_tensor, + paddle::Tensor& recv_g1_tensor_grad) { + bool type_flag = + (recv_g1_tensor_grad.dtype() == paddle::DataType::FLOAT64) ? true : false; + if (type_flag) { + Border_backward_t(sendlist_tensor, sendproc_tensor, recvproc_tensor, + sendnum_tensor, recvnum_tensor, g1_tensor, + communicator_tensor, nlocal_tensor, nghost_tensor, + recv_g1_tensor_grad); + } else { + Border_backward_t(sendlist_tensor, sendproc_tensor, recvproc_tensor, + sendnum_tensor, recvnum_tensor, g1_tensor, + communicator_tensor, nlocal_tensor, nghost_tensor, + recv_g1_tensor_grad); + } +} + +/** + * @brief communicate the latest g1_tensor info to other lmp proc + * @param[in] sendlist_tensor list of atoms to send in each swap + * @param[in] sendproc_tensor proc to send to at each swap + * @param[in] recvproc_tensor proc to recv from at each swap + * @param[in] sendnum_tensor # of atoms to send in each swap + * @param[in] recvnum_tensor # of atoms to recv in each swap + * @param[in] g1_tensor tensor to store g1_tensor info + * @param[in] communicator_tensor MPI_comm data in lmp + * @param[in] nlocal_tensor # of local atoms + * @param[in] nghost_tensor # of nghost atoms + * @param[out] recv_g1_tensor g1_tensor after communication + **/ +PD_BUILD_OP(border_op) + .Inputs({"sendlist_tensor", "sendproc_tensor", "recvproc_tensor", + "sendnum_tensor", "recvnum_tensor", "g1_tensor", + "communicator_tensor", "nlocal_tensor", "nghost_tensor"}) + .Outputs({"recv_g1_tensor"}) + .SetKernelFn(PD_KERNEL(Border_forward)) + 
.SetInplaceMap({{"g1_tensor", "recv_g1_tensor"}}); + +PD_BUILD_GRAD_OP(border_op) + .Inputs({"sendlist_tensor", "sendproc_tensor", "recvproc_tensor", + "sendnum_tensor", "recvnum_tensor", "g1_tensor", + "communicator_tensor", "nlocal_tensor", "nghost_tensor", + paddle::Grad("recv_g1_tensor")}) + .Outputs({paddle::Grad("g1_tensor")}) + .SetInplaceMap({{paddle::Grad("recv_g1_tensor"), + paddle::Grad("g1_tensor")}}) + .SetKernelFn(PD_KERNEL(Border_backward)); diff --git a/source/op/pd/setup.py b/source/op/pd/setup.py new file mode 100644 index 0000000000..951b50de9c --- /dev/null +++ b/source/op/pd/setup.py @@ -0,0 +1,23 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import os + + +def main(): + current_dir = os.path.abspath(os.getcwd()) + script_dir = os.path.abspath(os.path.dirname(__file__)) + + if current_dir != script_dir: + raise RuntimeError( + f"[ERROR] Please run this script under directory: `{script_dir}`" + ) + + from paddle.utils.cpp_extension import ( + CppExtension, + setup, + ) + + setup(name="deepmd_op_pd", ext_modules=CppExtension(sources=["comm.cc"])) + + +if __name__ == "__main__": + main() diff --git a/source/op/pt/comm.cc b/source/op/pt/comm.cc index 71a2b0e118..97466a4833 100644 --- a/source/op/pt/comm.cc +++ b/source/op/pt/comm.cc @@ -86,7 +86,7 @@ class Border : public torch::autograd::Function { #ifdef USE_MPI int mpi_init = 0; MPI_Initialized(&mpi_init); - int cuda_aware = 1; + int cuda_aware = 0; int me = 0; MPI_Comm world; int world_size = 0; @@ -99,17 +99,9 @@ class Border : public torch::autograd::Function { MPI_Request request; #if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM) if (world_size >= 1) { - int version, subversion; - MPI_Get_version(&version, &subversion); - if (version >= 4) { -#ifdef NO_CUDA_AWARE - cuda_aware = 0; -#else - cuda_aware = MPIX_Query_cuda_support(); +#ifndef NO_CUDA_AWARE + cuda_aware = MPIX_Query_cuda_support(); #endif - } else { - cuda_aware = 0; - } if (cuda_aware == 0) { recv_g1_tensor = 
torch::empty_like(g1).to(torch::kCPU); recv_g1_tensor.copy_(g1); @@ -193,10 +185,6 @@ class Border : public torch::autograd::Function { static torch::autograd::variable_list backward_t( torch::autograd::AutogradContext* ctx, torch::autograd::variable_list grad_output) { -#if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM) - gpuDeviceSynchronize(); -#endif - torch::autograd::variable_list saved_variables = ctx->get_saved_variables(); torch::Tensor sendlist_tensor = saved_variables[0]; torch::Tensor sendproc_tensor = saved_variables[1]; @@ -212,7 +200,7 @@ class Border : public torch::autograd::Function { int mpi_init = 0; MPI_Initialized(&mpi_init); int world_size = 0; - int cuda_aware = 1; + int cuda_aware = 0; int me = 0; MPI_Comm world; if (mpi_init) { @@ -224,17 +212,9 @@ class Border : public torch::autograd::Function { MPI_Request request; #if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM) if (world_size >= 1) { - int version, subversion; - MPI_Get_version(&version, &subversion); - if (version >= 4) { -#ifdef NO_CUDA_AWARE - cuda_aware = 0; -#else - cuda_aware = MPIX_Query_cuda_support(); +#ifndef NO_CUDA_AWARE + cuda_aware = MPIX_Query_cuda_support(); #endif - } else { - cuda_aware = 0; - } if (cuda_aware == 0) { d_local_g1_tensor = torch::empty_like(grad_output[0]).to(torch::kCPU); d_local_g1_tensor.copy_(grad_output[0]); @@ -329,9 +309,6 @@ class Border : public torch::autograd::Function { recv_g1_tensor.slice(0, 0, nrecv)); } } -#if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM) - gpuDeviceSynchronize(); -#endif #ifdef USE_MPI #if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM) if (cuda_aware == 0) { diff --git a/source/op/tf/descrpt_se_a_mask.cc b/source/op/tf/descrpt_se_a_mask.cc index 28e4a575db..7f8bcd9411 100644 --- a/source/op/tf/descrpt_se_a_mask.cc +++ b/source/op/tf/descrpt_se_a_mask.cc @@ -32,7 +32,7 @@ struct NeighborInfo { int index; NeighborInfo() : type(0), dist(0), index(0) {} NeighborInfo(int tt, FPTYPE dd, int ii) : 
type(tt), dist(dd), index(ii) {} - bool operator<(const NeighborInfo &b) const { + bool operator<(const NeighborInfo& b) const { return (type < b.type || (type == b.type && (dist < b.dist || (dist == b.dist && index < b.index)))); @@ -42,24 +42,24 @@ struct NeighborInfo { template class DescrptSeAMaskOp : public OpKernel { public: - explicit DescrptSeAMaskOp(OpKernelConstruction *context) : OpKernel(context) { + explicit DescrptSeAMaskOp(OpKernelConstruction* context) : OpKernel(context) { // OP_REQUIRES_OK(context); } - void Compute(OpKernelContext *context) override { + void Compute(OpKernelContext* context) override { deepmd::safe_compute( - context, [this](OpKernelContext *context) { this->_Compute(context); }); + context, [this](OpKernelContext* context) { this->_Compute(context); }); } - void _Compute(OpKernelContext *context) { + void _Compute(OpKernelContext* context) { // Grab the input tensor int context_input_index = 0; - const Tensor &coord_tensor = context->input(context_input_index++); - const Tensor &type_tensor = context->input(context_input_index++); - const Tensor &mask_matrix_tensor = context->input(context_input_index++); - const Tensor &box_tensor = context->input(context_input_index++); - const Tensor &natoms_tensor = context->input(context_input_index++); - const Tensor &mesh_tensor = context->input(context_input_index++); + const Tensor& coord_tensor = context->input(context_input_index++); + const Tensor& type_tensor = context->input(context_input_index++); + const Tensor& mask_matrix_tensor = context->input(context_input_index++); + const Tensor& box_tensor = context->input(context_input_index++); + const Tensor& natoms_tensor = context->input(context_input_index++); + const Tensor& mesh_tensor = context->input(context_input_index++); // set size of the sample OP_REQUIRES(context, (coord_tensor.shape().dims() == 2), @@ -109,18 +109,18 @@ class DescrptSeAMaskOp : public OpKernel { nlist_shape.AddDim(static_cast(total_atom_num) * 
total_atom_num); int context_output_index = 0; - Tensor *descrpt_tensor = NULL; + Tensor* descrpt_tensor = NULL; OP_REQUIRES_OK( context, context->allocate_output(context_output_index++, descrpt_shape, &descrpt_tensor)); - Tensor *descrpt_deriv_tensor = NULL; + Tensor* descrpt_deriv_tensor = NULL; OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, descrpt_deriv_shape, &descrpt_deriv_tensor)); - Tensor *rij_tensor = NULL; + Tensor* rij_tensor = NULL; OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, rij_shape, &rij_tensor)); - Tensor *nlist_tensor = NULL; + Tensor* nlist_tensor = NULL; OP_REQUIRES_OK(context, context->allocate_output(context_output_index++, nlist_shape, &nlist_tensor)); @@ -317,9 +317,9 @@ class DescrptSeAMaskOp : public OpKernel { compute_t max_distance = 10000.0; void buildAndSortNeighborList(int i_idx, const std::vector d_coord3, - std::vector &d_type, - std::vector &d_mask, - std::vector &sorted_nlist, + std::vector& d_type, + std::vector& d_mask, + std::vector& sorted_nlist, int total_atom_num) { // sorted_nlist.resize(total_atom_num); std::vector> sel_nei; diff --git a/source/op/tf/dotmul_flt_nvnmd.cc b/source/op/tf/dotmul_flt_nvnmd.cc index 1aca3e8bf8..ecfac60a0a 100644 --- a/source/op/tf/dotmul_flt_nvnmd.cc +++ b/source/op/tf/dotmul_flt_nvnmd.cc @@ -37,15 +37,15 @@ modw = 1: normalize w[hh, : , kk] using namespace tensorflow; template -void split_flt(T x, int64_t &sign, int64_t &expo, int64_t &mant); +void split_flt(T x, int64_t& sign, int64_t& expo, int64_t& mant); // read matmul_flt_nvnmd.cc template // float and double -void find_max_expo(int64_t &max_expo, T *x, int64_t M); +void find_max_expo(int64_t& max_expo, T* x, int64_t M); // read matmul_flt_nvnmd.cc template // float and double -void find_max_expo(int64_t &max_expo, T *x, int64_t N, int64_t M); +void find_max_expo(int64_t& max_expo, T* x, int64_t N, int64_t M); //- register the operator REGISTER_OP("DotmulFltNvnmd") @@ -60,19 +60,19 @@ 
template class DotmulFltNvnmdOp : public OpKernel { public: /// Constructor. - explicit DotmulFltNvnmdOp(OpKernelConstruction *context) + explicit DotmulFltNvnmdOp(OpKernelConstruction* context) : OpKernel(context) {}; /// Compute the descriptor /// param: context - void Compute(OpKernelContext *context) override { + void Compute(OpKernelContext* context) override { // check DCHECK_EQ(2, context->num_inputs()); - const Tensor &X = context->input(0); - const Tensor &W = context->input(1); + const Tensor& X = context->input(0); + const Tensor& W = context->input(1); - const TensorShape &shX = X.shape(); - const TensorShape &shW = W.shape(); + const TensorShape& shX = X.shape(); + const TensorShape& shW = W.shape(); TensorShape shY; DCHECK_EQ(shW.dims(), shX.dims()); @@ -104,7 +104,7 @@ class DotmulFltNvnmdOp : public OpKernel { } // create output - Tensor *Y = NULL; + Tensor* Y = NULL; OP_REQUIRES_OK(context, context->allocate_output(0, shY, &Y)); // compute @@ -131,8 +131,8 @@ class DotmulFltNvnmdOp : public OpKernel { for (ii = 0; ii < H * N; ii++) { // find x max exponnet - find_max_expo(expo_max1, (FPTYPE *)&x[ii * M], M); - find_max_expo(expo_max2, (FPTYPE *)&w[ii * M], M); + find_max_expo(expo_max1, (FPTYPE*)&x[ii * M], M); + find_max_expo(expo_max2, (FPTYPE*)&w[ii * M], M); // s = 0; for (jj = 0; jj < M; jj++) { diff --git a/source/op/tf/matmul_flt_nvnmd.cc b/source/op/tf/matmul_flt_nvnmd.cc index 22ed23c0a3..c2821096c1 100644 --- a/source/op/tf/matmul_flt_nvnmd.cc +++ b/source/op/tf/matmul_flt_nvnmd.cc @@ -37,15 +37,15 @@ modw = 1: normalize w[hh, : , kk] using namespace tensorflow; template -void split_flt(T x, int64_t &sign, int64_t &expo, int64_t &mant); +void split_flt(T x, int64_t& sign, int64_t& expo, int64_t& mant); // read matmul_flt_nvnmd.cc template // float and double -void find_max_expo(int64_t &max_expo, T *x, int64_t M); +void find_max_expo(int64_t& max_expo, T* x, int64_t M); // read matmul_flt_nvnmd.cc template // float and double -void 
find_max_expo(int64_t &max_expo, T *x, int64_t N, int64_t M); +void find_max_expo(int64_t& max_expo, T* x, int64_t N, int64_t M); //- register the operator REGISTER_OP("MatmulFltNvnmd") @@ -62,21 +62,21 @@ template class MatmulFltNvnmdOp : public OpKernel { public: /// Constructor. - explicit MatmulFltNvnmdOp(OpKernelConstruction *context) : OpKernel(context) { + explicit MatmulFltNvnmdOp(OpKernelConstruction* context) : OpKernel(context) { OP_REQUIRES_OK(context, context->GetAttr("normx", &normx)); OP_REQUIRES_OK(context, context->GetAttr("normw", &normw)); }; /// Compute the descriptor /// param: context - void Compute(OpKernelContext *context) override { + void Compute(OpKernelContext* context) override { // check DCHECK_EQ(2, context->num_inputs()); - const Tensor &X = context->input(0); - const Tensor &W = context->input(1); + const Tensor& X = context->input(0); + const Tensor& W = context->input(1); - const TensorShape &shX = X.shape(); - const TensorShape &shW = W.shape(); + const TensorShape& shX = X.shape(); + const TensorShape& shW = W.shape(); TensorShape shY; DCHECK_EQ(shW.dims(), shX.dims()); @@ -103,7 +103,7 @@ class MatmulFltNvnmdOp : public OpKernel { } // create output - Tensor *Y = NULL; + Tensor* Y = NULL; OP_REQUIRES_OK(context, context->allocate_output(0, shY, &Y)); // compute @@ -130,7 +130,7 @@ class MatmulFltNvnmdOp : public OpKernel { for (hh = 0; hh < H; hh++) { // find x max exponnet if ((normx & 0x0f) == 0) { // normalize x[:,:] - find_max_expo(expo_max1, (FPTYPE *)&x[hh * N * M], + find_max_expo(expo_max1, (FPTYPE*)&x[hh * N * M], static_cast(N) * M); for (ii = 0; ii < N; ii++) { expo_max1s[ii] = expo_max1; @@ -138,14 +138,14 @@ class MatmulFltNvnmdOp : public OpKernel { } else { // normalize x[ii,:] for (ii = 0; ii < N; ii++) { - find_max_expo(expo_max1, (FPTYPE *)&x[hh * N * M + ii * M], M); + find_max_expo(expo_max1, (FPTYPE*)&x[hh * N * M + ii * M], M); expo_max1s[ii] = expo_max1; } } // find w max exponnet if ((normw & 0x0f) == 0) 
{ // normalize w[:,:] - find_max_expo(expo_max2, (FPTYPE *)&w[hh * M * K], + find_max_expo(expo_max2, (FPTYPE*)&w[hh * M * K], static_cast(M) * K); for (kk = 0; kk < K; kk++) { expo_max2s[kk] = expo_max2; @@ -153,7 +153,7 @@ class MatmulFltNvnmdOp : public OpKernel { } else { // normalize w[:,kk] for (kk = 0; kk < K; kk++) { - find_max_expo(expo_max2, (FPTYPE *)&w[hh * M * K + kk], M, K); + find_max_expo(expo_max2, (FPTYPE*)&w[hh * M * K + kk], M, K); expo_max2s[kk] = expo_max2; } } diff --git a/source/op/tf/optimizer/parallel.cc b/source/op/tf/optimizer/parallel.cc index f5b7c62b6a..87a53b18ae 100644 --- a/source/op/tf/optimizer/parallel.cc +++ b/source/op/tf/optimizer/parallel.cc @@ -27,7 +27,7 @@ // based on tensorflow/core/grappler/optimizers/remapper.cc struct RemapperContext { - explicit RemapperContext(GrapplerItem *item, Status *status) + explicit RemapperContext(GrapplerItem* item, Status* status) : nodes_to_preserve(item->NodesToPreserve()), graph_view(&item->graph, status) {} @@ -35,11 +35,11 @@ struct RemapperContext { utils::MutableGraphView graph_view; }; -bool IsProdForce(const NodeDef &node) { return node.op() == "ProdForceSeA"; } +bool IsProdForce(const NodeDef& node) { return node.op() == "ProdForceSeA"; } -bool FindProdForce(RemapperContext *ctx, int node_index) { - const auto *node_view = ctx->graph_view.GetNode(node_index); - const auto *node_def = node_view->node(); +bool FindProdForce(RemapperContext* ctx, int node_index) { + const auto* node_view = ctx->graph_view.GetNode(node_index); + const auto* node_def = node_view->node(); return IsProdForce(*node_def); } @@ -55,17 +55,17 @@ TF_INT64 GetNThreads() { return tot; } -Status ParallelProdForce(RemapperContext *ctx, +Status ParallelProdForce(RemapperContext* ctx, int node_index, - std::vector *invalidated_nodes, - std::vector *nodes_to_delete) { + std::vector* invalidated_nodes, + std::vector* nodes_to_delete) { // skip on GPUs if (GetNumAvailableGPUs() > 0) { return Status(); } - const 
NodeDef *ori_node = ctx->graph_view.GetNode(node_index)->node(); - auto &src_attr = ori_node->attr(); + const NodeDef* ori_node = ctx->graph_view.GetNode(node_index)->node(); + auto& src_attr = ori_node->attr(); TF_INT64 tot = GetNThreads(); if (tot <= 1) { return Status(); @@ -75,11 +75,11 @@ Status ParallelProdForce(RemapperContext *ctx, sum_node.set_name(ori_node->name()); sum_node.set_op("AddN"); sum_node.set_device(ori_node->device()); - auto *sum_attr = sum_node.mutable_attr(); + auto* sum_attr = sum_node.mutable_attr(); (*sum_attr)["N"].set_i(tot); (*sum_attr)["T"] = src_attr.at("T"); - utils::Mutation *mutation = ctx->graph_view.GetMutationBuilder(); + utils::Mutation* mutation = ctx->graph_view.GetMutationBuilder(); Status status; for (int ii = 0; ii < tot; ++ii) { @@ -92,7 +92,7 @@ Status ParallelProdForce(RemapperContext *ctx, sub_node.add_input(ori_node->input(jj)); } // set frac - auto *sub_attr = sub_node.mutable_attr(); + auto* sub_attr = sub_node.mutable_attr(); (*sub_attr)["T"] = src_attr.at("T"); (*sub_attr)["n_a_sel"] = src_attr.at("n_a_sel"); (*sub_attr)["n_r_sel"] = src_attr.at("n_r_sel"); @@ -111,9 +111,9 @@ Status ParallelProdForce(RemapperContext *ctx, return Status(); } -Status DPParallel::Optimize(Cluster *cluster, - const GrapplerItem &item, - GraphDef *optimized_graph) { +Status DPParallel::Optimize(Cluster* cluster, + const GrapplerItem& item, + GraphDef* optimized_graph) { GrapplerItem mutable_item = item; Status status; RemapperContext ctx(&mutable_item, &status); @@ -147,7 +147,7 @@ Status DPParallel::Optimize(Cluster *cluster, } // Remove invalidated nodes. 
- utils::Mutation *mutation = ctx.graph_view.GetMutationBuilder(); + utils::Mutation* mutation = ctx.graph_view.GetMutationBuilder(); for (int i = 0; i < num_nodes; ++i) { if (nodes_to_delete[i]) { mutation->RemoveNode(ctx.graph_view.GetNode(i)); diff --git a/source/op/tf/prod_force_se_a_mask.cc b/source/op/tf/prod_force_se_a_mask.cc index a7b08ae664..6c938f88e0 100644 --- a/source/op/tf/prod_force_se_a_mask.cc +++ b/source/op/tf/prod_force_se_a_mask.cc @@ -17,23 +17,23 @@ using CPUDevice = Eigen::ThreadPoolDevice; template class ProdForceSeAMaskOp : public OpKernel { public: - explicit ProdForceSeAMaskOp(OpKernelConstruction *context) + explicit ProdForceSeAMaskOp(OpKernelConstruction* context) : OpKernel(context) { OP_REQUIRES_OK(context, context->GetAttr("total_atom_num", &total_atom_num)); } - void Compute(OpKernelContext *context) override { + void Compute(OpKernelContext* context) override { deepmd::safe_compute( - context, [this](OpKernelContext *context) { this->_Compute(context); }); + context, [this](OpKernelContext* context) { this->_Compute(context); }); } - void _Compute(OpKernelContext *context) { + void _Compute(OpKernelContext* context) { // Grab the input tensor - const Tensor &net_deriv_tensor = context->input(0); - const Tensor &in_deriv_tensor = context->input(1); - const Tensor &mask_tensor = context->input(2); - const Tensor &nlist_tensor = context->input(3); + const Tensor& net_deriv_tensor = context->input(0); + const Tensor& in_deriv_tensor = context->input(1); + const Tensor& mask_tensor = context->input(2); + const Tensor& nlist_tensor = context->input(3); // set size of the sample OP_REQUIRES(context, (net_deriv_tensor.shape().dims() == 2), @@ -67,7 +67,7 @@ class ProdForceSeAMaskOp : public OpKernel { force_shape.AddDim(3 * static_cast(nall)); // std::cout << "forcesahpe " << force_shape.dim_size(0) << " " << // force_shape.dim_size(1) << std::endl; - Tensor *force_tensor = NULL; + Tensor* force_tensor = NULL; OP_REQUIRES_OK(context, 
context->allocate_output(0, force_shape, &force_tensor)); diff --git a/source/op/tf/prod_force_se_a_mask_grad.cc b/source/op/tf/prod_force_se_a_mask_grad.cc index a01919199f..c7ff091857 100644 --- a/source/op/tf/prod_force_se_a_mask_grad.cc +++ b/source/op/tf/prod_force_se_a_mask_grad.cc @@ -16,24 +16,24 @@ using CPUDevice = Eigen::ThreadPoolDevice; template class ProdForceSeAMaskGradOp : public OpKernel { public: - explicit ProdForceSeAMaskGradOp(OpKernelConstruction *context) + explicit ProdForceSeAMaskGradOp(OpKernelConstruction* context) : OpKernel(context) { OP_REQUIRES_OK(context, context->GetAttr("total_atom_num", &total_atom_num)); } - void Compute(OpKernelContext *context) override { + void Compute(OpKernelContext* context) override { deepmd::safe_compute( - context, [this](OpKernelContext *context) { this->_Compute(context); }); + context, [this](OpKernelContext* context) { this->_Compute(context); }); } - void _Compute(OpKernelContext *context) { + void _Compute(OpKernelContext* context) { // Grab the input tensor - const Tensor &grad_tensor = context->input(0); - const Tensor &net_deriv_tensor = context->input(1); - const Tensor &in_deriv_tensor = context->input(2); - const Tensor &mask_tensor = context->input(3); - const Tensor &nlist_tensor = context->input(4); + const Tensor& grad_tensor = context->input(0); + const Tensor& net_deriv_tensor = context->input(1); + const Tensor& in_deriv_tensor = context->input(2); + const Tensor& mask_tensor = context->input(3); + const Tensor& nlist_tensor = context->input(4); // set size of the sample TensorShape grad_shape = grad_tensor.shape(); @@ -82,7 +82,7 @@ class ProdForceSeAMaskGradOp : public OpKernel { grad_net_shape.AddDim(static_cast(nloc) * ndescrpt); // allocate the output tensor - Tensor *grad_net_tensor = NULL; + Tensor* grad_net_tensor = NULL; OP_REQUIRES_OK( context, context->allocate_output(0, grad_net_shape, &grad_net_tensor)); diff --git a/source/tests/array_api_strict/fitting/fitting.py 
b/source/tests/array_api_strict/fitting/fitting.py index 323a49cfe8..c4a5674d2a 100644 --- a/source/tests/array_api_strict/fitting/fitting.py +++ b/source/tests/array_api_strict/fitting/fitting.py @@ -31,6 +31,7 @@ def setattr_for_general_fitting(name: str, value: Any) -> Any: "fparam_inv_std", "aparam_avg", "aparam_inv_std", + "default_fparam_tensor", }: value = to_array_api_strict_array(value) elif name == "emask": diff --git a/source/tests/common/dpmodel/test_padding_atoms.py b/source/tests/common/dpmodel/test_padding_atoms.py new file mode 100644 index 0000000000..d4ea39f598 --- /dev/null +++ b/source/tests/common/dpmodel/test_padding_atoms.py @@ -0,0 +1,103 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import unittest +from copy import ( + deepcopy, +) + +import numpy as np + +from deepmd.dpmodel.descriptor.se_e2_a import ( + DescrptSeA, +) +from deepmd.dpmodel.fitting import ( + PropertyFittingNet, +) +from deepmd.dpmodel.model.property_model import ( + PropertyModel, +) + + +class TestCaseSingleFrameWithoutNlist: + def setUp(self) -> None: + # nf=2, nloc == 3 + self.nloc = 3 + self.nt = 2 + self.coord = np.array( + [ + [ + [0, 0, 0], + [0, 1, 0], + [0, 0, 1], + ], + [ + [1, 0, 1], + [0, 1, 1], + [1, 1, 0], + ], + ], + dtype=np.float64, + ) + self.atype = np.array([[0, 0, 1], [1, 1, 0]], dtype=int).reshape([2, self.nloc]) + self.cell = 2.0 * np.eye(3).reshape([1, 9]) + self.cell = np.array([self.cell, self.cell]).reshape(2, 9) + self.sel = [16, 8] + self.rcut = 2.2 + self.rcut_smth = 0.4 + self.atol = 1e-12 + + +class TestPaddingAtoms(unittest.TestCase, TestCaseSingleFrameWithoutNlist): + def setUp(self): + TestCaseSingleFrameWithoutNlist.setUp(self) + + def test_padding_atoms_consistency(self): + ds = DescrptSeA( + self.rcut, + self.rcut_smth, + self.sel, + ) + ft = PropertyFittingNet( + self.nt, + ds.get_dim_out(), + mixed_types=ds.mixed_types(), + intensive=True, + ) + type_map = ["foo", "bar"] + model = PropertyModel(ds, ft, type_map=type_map) + 
var_name = model.get_var_name() + args = [self.coord, self.atype, self.cell] + result = model.call(*args) + # test intensive + np.testing.assert_allclose( + result[f"{var_name}_redu"], + np.mean(result[f"{var_name}"], axis=1), + atol=self.atol, + ) + # test padding atoms + padding_atoms_list = [1, 5, 10] + for padding_atoms in padding_atoms_list: + coord = deepcopy(self.coord) + atype = deepcopy(self.atype) + atype_padding = np.pad( + atype, + pad_width=((0, 0), (0, padding_atoms)), + mode="constant", + constant_values=-1, + ) + coord_padding = np.pad( + coord, + pad_width=((0, 0), (0, padding_atoms), (0, 0)), + mode="constant", + constant_values=0, + ) + args = [coord_padding, atype_padding, self.cell] + result_padding = model.call(*args) + np.testing.assert_allclose( + result[f"{var_name}_redu"], + result_padding[f"{var_name}_redu"], + atol=self.atol, + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/source/tests/common/test_argument_parser.py b/source/tests/common/test_argument_parser.py index 4e39df8659..4aebb7dafc 100644 --- a/source/tests/common/test_argument_parser.py +++ b/source/tests/common/test_argument_parser.py @@ -322,6 +322,32 @@ def test_parser_test(self) -> None: self.run_test(command="test", mapping=ARGS) + def test_parser_test_train_data(self) -> None: + """Test test subparser with train-data.""" + ARGS = { + "--model": {"type": str, "value": "MODEL.PB"}, + "--train-data": { + "type": (str, type(None)), + "value": "INPUT.JSON", + "dest": "train_json", + }, + } + + self.run_test(command="test", mapping=ARGS) + + def test_parser_test_valid_data(self) -> None: + """Test test subparser with valid-data.""" + ARGS = { + "--model": {"type": str, "value": "MODEL.PB"}, + "--valid-data": { + "type": (str, type(None)), + "value": "INPUT.JSON", + "dest": "valid_json", + }, + } + + self.run_test(command="test", mapping=ARGS) + def test_parser_compress(self) -> None: """Test compress subparser.""" ARGS = { diff --git 
a/source/tests/common/test_examples.py b/source/tests/common/test_examples.py index 09e59bf711..6c9f1e43a2 100644 --- a/source/tests/common/test_examples.py +++ b/source/tests/common/test_examples.py @@ -68,6 +68,7 @@ input_files_multi = ( p_examples / "water_multi_task" / "pytorch_example" / "input_torch.json", p_examples / "water_multi_task" / "pytorch_example" / "input_torch_sharefit.json", + p_examples / "water_multi_task" / "pytorch_example" / "input_torch_with_alias.json", p_examples / "hessian" / "multi_task" / "input.json", ) diff --git a/source/tests/consistent/common.py b/source/tests/consistent/common.py index 7ecb5ea5a5..9a9c585c44 100644 --- a/source/tests/consistent/common.py +++ b/source/tests/consistent/common.py @@ -354,9 +354,6 @@ def test_tf_consistent_with_ref(self) -> None: data1.pop("@version") data2.pop("@version") - if tf_obj.__class__.__name__.startswith("Polar"): - data1["@variables"].pop("bias_atom_e") - np.testing.assert_equal(data1, data2) for rr1, rr2 in zip(ret1, ret2): np.testing.assert_allclose( diff --git a/source/tests/consistent/descriptor/common.py b/source/tests/consistent/descriptor/common.py index 617312145e..8af1c7ea64 100644 --- a/source/tests/consistent/descriptor/common.py +++ b/source/tests/consistent/descriptor/common.py @@ -57,7 +57,15 @@ class DescriptorTest: """Useful utilities for descriptor tests.""" - def build_tf_descriptor(self, obj, natoms, coords, atype, box, suffix): + def build_tf_descriptor( + self, + obj: Any, + natoms: np.ndarray, + coords: np.ndarray, + atype: np.ndarray, + box: np.ndarray, + suffix: str, + ) -> tuple[list[Any], dict[Any, np.ndarray]]: t_coord = tf.placeholder(GLOBAL_TF_FLOAT_PRECISION, [None], name="i_coord") t_type = tf.placeholder(tf.int32, [None], name="i_type") t_natoms = tf.placeholder(tf.int32, natoms.shape, name="i_natoms") @@ -83,7 +91,13 @@ def build_tf_descriptor(self, obj, natoms, coords, atype, box, suffix): } def eval_dp_descriptor( - self, dp_obj: Any, natoms, coords, 
atype, box, mixed_types: bool = False + self, + dp_obj: Any, + natoms: np.ndarray, + coords: np.ndarray, + atype: np.ndarray, + box: np.ndarray, + mixed_types: bool = False, ) -> Any: ext_coords, ext_atype, mapping = extend_coord_with_ghosts( coords.reshape(1, -1, 3), @@ -102,7 +116,13 @@ def eval_dp_descriptor( return dp_obj(ext_coords, ext_atype, nlist=nlist, mapping=mapping) def eval_pt_descriptor( - self, pt_obj: Any, natoms, coords, atype, box, mixed_types: bool = False + self, + pt_obj: Any, + natoms: np.ndarray, + coords: np.ndarray, + atype: np.ndarray, + box: np.ndarray, + mixed_types: bool = False, ) -> Any: ext_coords, ext_atype, mapping = extend_coord_with_ghosts_pt( torch.from_numpy(coords).to(PT_DEVICE).reshape(1, -1, 3), @@ -124,7 +144,13 @@ def eval_pt_descriptor( ] def eval_jax_descriptor( - self, jax_obj: Any, natoms, coords, atype, box, mixed_types: bool = False + self, + jax_obj: Any, + natoms: np.ndarray, + coords: np.ndarray, + atype: np.ndarray, + box: np.ndarray, + mixed_types: bool = False, ) -> Any: ext_coords, ext_atype, mapping = extend_coord_with_ghosts( jnp.array(coords).reshape(1, -1, 3), @@ -146,7 +172,13 @@ def eval_jax_descriptor( ] def eval_pd_descriptor( - self, pd_obj: Any, natoms, coords, atype, box, mixed_types: bool = False + self, + pd_obj: Any, + natoms: np.ndarray, + coords: np.ndarray, + atype: np.ndarray, + box: np.ndarray, + mixed_types: bool = False, ) -> Any: ext_coords, ext_atype, mapping = extend_coord_with_ghosts_pd( paddle.to_tensor(coords).to(PD_DEVICE).reshape([1, -1, 3]), @@ -170,10 +202,10 @@ def eval_pd_descriptor( def eval_array_api_strict_descriptor( self, array_api_strict_obj: Any, - natoms, - coords, - atype, - box, + natoms: np.ndarray, + coords: np.ndarray, + atype: np.ndarray, + box: np.ndarray, mixed_types: bool = False, ) -> Any: ext_coords, ext_atype, mapping = extend_coord_with_ghosts( diff --git a/source/tests/consistent/descriptor/test_dpa1.py b/source/tests/consistent/descriptor/test_dpa1.py 
index db5fe4dae0..d31cf289b9 100644 --- a/source/tests/consistent/descriptor/test_dpa1.py +++ b/source/tests/consistent/descriptor/test_dpa1.py @@ -127,6 +127,7 @@ def data(self) -> dict: "use_tebd_bias": use_tebd_bias, "type_map": ["O", "H"] if use_econf_tebd else None, "seed": 1145141919810, + "trainable": False, } def is_meaningless_zero_attention_layer_tests( diff --git a/source/tests/consistent/descriptor/test_dpa2.py b/source/tests/consistent/descriptor/test_dpa2.py index ef840bf9d7..6864d91f26 100644 --- a/source/tests/consistent/descriptor/test_dpa2.py +++ b/source/tests/consistent/descriptor/test_dpa2.py @@ -181,7 +181,7 @@ def data(self) -> dict: "smooth": smooth, "exclude_types": exclude_types, "env_protection": 0.0, - "trainable": True, + "trainable": False, "use_econf_tebd": use_econf_tebd, "use_tebd_bias": use_tebd_bias, "type_map": ["O", "H"] if use_econf_tebd else None, diff --git a/source/tests/consistent/descriptor/test_dpa3.py b/source/tests/consistent/descriptor/test_dpa3.py index b99117b9e7..367ff29a3b 100644 --- a/source/tests/consistent/descriptor/test_dpa3.py +++ b/source/tests/consistent/descriptor/test_dpa3.py @@ -130,7 +130,7 @@ def data(self) -> dict: "exclude_types": exclude_types, "env_protection": 0.0, "use_loc_mapping": use_loc_mapping, - "trainable": True, + "trainable": False, } @property @@ -171,14 +171,7 @@ def skip_pd(self) -> bool: n_multi_edge_message, precision, ) = self.param - return ( - not INSTALLED_PD - or precision == "bfloat16" - or edge_init_use_dist - or use_exp_switch - or use_dynamic_sel - or use_loc_mapping - ) # not supported yet + return CommonTest.skip_pd @property def skip_dp(self) -> bool: diff --git a/source/tests/consistent/fitting/test_dipole.py b/source/tests/consistent/fitting/test_dipole.py index 396ee2d492..010944d109 100644 --- a/source/tests/consistent/fitting/test_dipole.py +++ b/source/tests/consistent/fitting/test_dipole.py @@ -61,6 +61,7 @@ (True, False), # resnet_dt ("float64", "float32"), # 
precision (True, False), # mixed_types + (None, [0]), # sel_type ) class TestDipole(CommonTest, DipoleFittingTest, unittest.TestCase): @property @@ -69,13 +70,37 @@ def data(self) -> dict: resnet_dt, precision, mixed_types, + sel_type, ) = self.param - return { + data = { "neuron": [5, 5, 5], "resnet_dt": resnet_dt, "precision": precision, + "sel_type": sel_type, "seed": 20240217, } + return data + + def pass_data_to_cls(self, cls, data) -> Any: + """Pass data to the class.""" + if cls not in (self.tf_class,): + sel_type = data.pop("sel_type", None) + if sel_type is not None: + all_types = list(range(self.ntypes)) + exclude_types = [t for t in all_types if t not in sel_type] + data["exclude_types"] = exclude_types + return cls(**data, **self.additional_data) + + @property + def skip_tf(self) -> bool: + ( + resnet_dt, + precision, + mixed_types, + sel_type, + ) = self.param + # mixed_types + sel_type is not supported + return CommonTest.skip_tf or (mixed_types and sel_type is not None) @property def skip_pt(self) -> bool: @@ -83,6 +108,7 @@ def skip_pt(self) -> bool: resnet_dt, precision, mixed_types, + sel_type, ) = self.param return CommonTest.skip_pt @@ -112,6 +138,7 @@ def additional_data(self) -> dict: resnet_dt, precision, mixed_types, + sel_type, ) = self.param return { "ntypes": self.ntypes, @@ -125,6 +152,7 @@ def build_tf(self, obj: Any, suffix: str) -> tuple[list, dict]: resnet_dt, precision, mixed_types, + sel_type, ) = self.param return self.build_tf_fitting( obj, @@ -141,6 +169,7 @@ def eval_pt(self, pt_obj: Any) -> Any: resnet_dt, precision, mixed_types, + sel_type, ) = self.param return ( pt_obj( @@ -159,6 +188,7 @@ def eval_dp(self, dp_obj: Any) -> Any: resnet_dt, precision, mixed_types, + sel_type, ) = self.param return dp_obj( self.inputs, @@ -200,6 +230,7 @@ def rtol(self) -> float: resnet_dt, precision, mixed_types, + sel_type, ) = self.param if precision == "float64": return 1e-10 @@ -215,6 +246,7 @@ def atol(self) -> float: resnet_dt, 
precision, mixed_types, + sel_type, ) = self.param if precision == "float64": return 1e-10 @@ -222,3 +254,39 @@ def atol(self) -> float: return 1e-4 else: raise ValueError(f"Unknown precision: {precision}") + + def test_tf_consistent_with_ref(self) -> None: + """Test whether TF and reference are consistent.""" + # Special handle for sel_types + if self.skip_tf: + self.skipTest("Unsupported backend") + ref_backend = self.get_reference_backend() + if ref_backend == self.RefBackend.TF: + self.skipTest("Reference is self") + ret1, data1 = self.get_reference_ret_serialization(ref_backend) + ret1 = self.extract_ret(ret1, ref_backend) + self.reset_unique_id() + tf_obj = self.tf_class.deserialize(data1, suffix=self.unique_id) + ret2, data2 = self.get_tf_ret_serialization_from_cls(tf_obj) + ret2 = self.extract_ret(ret2, self.RefBackend.TF) + if tf_obj.__class__.__name__.startswith(("Polar", "Dipole", "DOS")): + # tf, pt serialization mismatch + common_keys = set(data1.keys()) & set(data2.keys()) + data1 = {k: data1[k] for k in common_keys} + data2 = {k: data2[k] for k in common_keys} + + # not comparing version + data1.pop("@version") + data2.pop("@version") + + if tf_obj.__class__.__name__.startswith("Polar"): + data1["@variables"].pop("bias_atom_e") + for ii, networks in enumerate(data2["nets"]["networks"]): + if networks is None: + data1["nets"]["networks"][ii] = None + np.testing.assert_equal(data1, data2) + for rr1, rr2 in zip(ret1, ret2): + np.testing.assert_allclose( + rr1.ravel()[: rr2.size], rr2.ravel(), rtol=self.rtol, atol=self.atol + ) + assert rr1.dtype == rr2.dtype, f"{rr1.dtype} != {rr2.dtype}" diff --git a/source/tests/consistent/fitting/test_ener.py b/source/tests/consistent/fitting/test_ener.py index f5a79acabe..ad70bd0bfa 100644 --- a/source/tests/consistent/fitting/test_ener.py +++ b/source/tests/consistent/fitting/test_ener.py @@ -70,7 +70,7 @@ (True, False), # resnet_dt ("float64", "float32", "bfloat16"), # precision (True, False), # mixed_types - (0, 
1), # numb_fparam + ((0, None), (1, None), (1, [1.0])), # (numb_fparam, default_fparam) ((0, False), (1, False), (1, True)), # (numb_aparam, use_aparam_as_mask) ([], [-12345.6, None]), # atom_ener ) @@ -81,7 +81,7 @@ def data(self) -> dict: resnet_dt, precision, mixed_types, - numb_fparam, + (numb_fparam, default_fparam), (numb_aparam, use_aparam_as_mask), atom_ener, ) = self.param @@ -91,6 +91,7 @@ def data(self) -> dict: "precision": precision, "numb_fparam": numb_fparam, "numb_aparam": numb_aparam, + "default_fparam": default_fparam, "seed": 20240217, "atom_ener": atom_ener, "use_aparam_as_mask": use_aparam_as_mask, @@ -102,7 +103,7 @@ def skip_pt(self) -> bool: resnet_dt, precision, mixed_types, - numb_fparam, + (numb_fparam, default_fparam), (numb_aparam, use_aparam_as_mask), atom_ener, ) = self.param @@ -116,7 +117,7 @@ def skip_array_api_strict(self) -> bool: resnet_dt, precision, mixed_types, - numb_fparam, + (numb_fparam, default_fparam), (numb_aparam, use_aparam_as_mask), atom_ener, ) = self.param @@ -129,13 +130,25 @@ def skip_pd(self) -> bool: resnet_dt, precision, mixed_types, - numb_fparam, + (numb_fparam, default_fparam), (numb_aparam, use_aparam_as_mask), atom_ener, ) = self.param # Paddle do not support "bfloat16" in some kernels, # so skip this in CI test - return not INSTALLED_PD or precision == "bfloat16" + return not INSTALLED_PD or precision == "bfloat16" or default_fparam is not None + + @property + def skip_tf(self) -> bool: + ( + resnet_dt, + precision, + mixed_types, + (numb_fparam, default_fparam), + (numb_aparam, use_aparam_as_mask), + atom_ener, + ) = self.param + return not INSTALLED_TF or default_fparam is not None tf_class = EnerFittingTF dp_class = EnerFittingDP @@ -165,7 +178,7 @@ def additional_data(self) -> dict: resnet_dt, precision, mixed_types, - numb_fparam, + (numb_fparam, default_fparam), (numb_aparam, use_aparam_as_mask), atom_ener, ) = self.param @@ -180,7 +193,7 @@ def build_tf(self, obj: Any, suffix: str) -> tuple[list, 
dict]: resnet_dt, precision, mixed_types, - numb_fparam, + (numb_fparam, default_fparam), (numb_aparam, use_aparam_as_mask), atom_ener, ) = self.param @@ -199,7 +212,7 @@ def eval_pt(self, pt_obj: Any) -> Any: resnet_dt, precision, mixed_types, - numb_fparam, + (numb_fparam, default_fparam), (numb_aparam, use_aparam_as_mask), atom_ener, ) = self.param @@ -209,7 +222,7 @@ def eval_pt(self, pt_obj: Any) -> Any: torch.from_numpy(self.atype.reshape(1, -1)).to(device=PT_DEVICE), fparam=( torch.from_numpy(self.fparam).to(device=PT_DEVICE) - if numb_fparam + if (numb_fparam and default_fparam is None) # test default_fparam else None ), aparam=( @@ -228,14 +241,14 @@ def eval_dp(self, dp_obj: Any) -> Any: resnet_dt, precision, mixed_types, - numb_fparam, + (numb_fparam, default_fparam), (numb_aparam, use_aparam_as_mask), atom_ener, ) = self.param return dp_obj( self.inputs, self.atype.reshape(1, -1), - fparam=self.fparam if numb_fparam else None, + fparam=self.fparam if (numb_fparam and default_fparam is None) else None, aparam=self.aparam if numb_aparam else None, )["energy"] @@ -244,7 +257,7 @@ def eval_jax(self, jax_obj: Any) -> Any: resnet_dt, precision, mixed_types, - numb_fparam, + (numb_fparam, default_fparam), (numb_aparam, use_aparam_as_mask), atom_ener, ) = self.param @@ -252,7 +265,9 @@ def eval_jax(self, jax_obj: Any) -> Any: jax_obj( jnp.asarray(self.inputs), jnp.asarray(self.atype.reshape(1, -1)), - fparam=jnp.asarray(self.fparam) if numb_fparam else None, + fparam=jnp.asarray(self.fparam) + if (numb_fparam and default_fparam is None) + else None, aparam=jnp.asarray(self.aparam) if numb_aparam else None, )["energy"] ) @@ -262,7 +277,7 @@ def eval_array_api_strict(self, array_api_strict_obj: Any) -> Any: resnet_dt, precision, mixed_types, - numb_fparam, + (numb_fparam, default_fparam), (numb_aparam, use_aparam_as_mask), atom_ener, ) = self.param @@ -270,7 +285,9 @@ def eval_array_api_strict(self, array_api_strict_obj: Any) -> Any: array_api_strict_obj( 
array_api_strict.asarray(self.inputs), array_api_strict.asarray(self.atype.reshape(1, -1)), - fparam=array_api_strict.asarray(self.fparam) if numb_fparam else None, + fparam=array_api_strict.asarray(self.fparam) + if (numb_fparam and default_fparam is None) + else None, aparam=array_api_strict.asarray(self.aparam) if numb_aparam else None, )["energy"] ) @@ -280,7 +297,7 @@ def eval_pd(self, pd_obj: Any) -> Any: resnet_dt, precision, mixed_types, - numb_fparam, + (numb_fparam, default_fparam), (numb_aparam, use_aparam_as_mask), atom_ener, ) = self.param @@ -317,7 +334,7 @@ def rtol(self) -> float: resnet_dt, precision, mixed_types, - numb_fparam, + (numb_fparam, default_fparam), (numb_aparam, use_aparam_as_mask), atom_ener, ) = self.param @@ -337,7 +354,7 @@ def atol(self) -> float: resnet_dt, precision, mixed_types, - numb_fparam, + (numb_fparam, default_fparam), (numb_aparam, use_aparam_as_mask), atom_ener, ) = self.param diff --git a/source/tests/consistent/model/common.py b/source/tests/consistent/model/common.py index 7cf71000db..778ae519c6 100644 --- a/source/tests/consistent/model/common.py +++ b/source/tests/consistent/model/common.py @@ -77,8 +77,8 @@ def build_tf_model( ] elif ret_key == "polar": ret_list = [ - ret["polar"], ret["global_polar"], + ret["polar"], ] else: raise NotImplementedError diff --git a/source/tests/consistent/model/test_dipole.py b/source/tests/consistent/model/test_dipole.py index bb381129a3..78146a4974 100644 --- a/source/tests/consistent/model/test_dipole.py +++ b/source/tests/consistent/model/test_dipole.py @@ -73,6 +73,7 @@ def data(self) -> dict: pt_class = DipoleModelPT jax_class = DipoleModelJAX args = model_args() + atol = 1e-8 def get_reference_backend(self): """Get the reference backend. 
@@ -89,7 +90,7 @@ def get_reference_backend(self): @property def skip_tf(self): - return True # need to fix tf consistency + return not INSTALLED_TF @property def skip_jax(self) -> bool: diff --git a/source/tests/consistent/model/test_dos.py b/source/tests/consistent/model/test_dos.py index 83e33e499a..ef72e9096b 100644 --- a/source/tests/consistent/model/test_dos.py +++ b/source/tests/consistent/model/test_dos.py @@ -90,7 +90,7 @@ def get_reference_backend(self): @property def skip_tf(self): - return True # need to fix tf consistency + return not INSTALLED_TF @property def skip_jax(self) -> bool: diff --git a/source/tests/consistent/model/test_polar.py b/source/tests/consistent/model/test_polar.py index 5295bc4705..62e84a27c4 100644 --- a/source/tests/consistent/model/test_polar.py +++ b/source/tests/consistent/model/test_polar.py @@ -73,6 +73,7 @@ def data(self) -> dict: pt_class = PolarModelPT jax_class = PolarModelJAX args = model_args() + atol = 1e-8 def get_reference_backend(self): """Get the reference backend. @@ -89,7 +90,7 @@ def get_reference_backend(self): @property def skip_tf(self): - return True # need to fix tf consistency + return not INSTALLED_TF @property def skip_jax(self) -> bool: diff --git a/source/tests/infer/case.py b/source/tests/infer/case.py index 8b8481a194..fc7dd30e9c 100644 --- a/source/tests/infer/case.py +++ b/source/tests/infer/case.py @@ -125,6 +125,13 @@ def __init__(self, data: dict) -> None: else: self.descriptor = None + if "fit_ll" in data: + self.fit_ll = np.array(data["fit_ll"], dtype=np.float64).reshape( + self.nloc, -1 + ) + else: + self.fit_ll = None + class Case: """Test case. 
diff --git a/source/tests/infer/deepdipole_pt.pth b/source/tests/infer/deepdipole_pt.pth new file mode 100644 index 0000000000..4c93a1b864 Binary files /dev/null and b/source/tests/infer/deepdipole_pt.pth differ diff --git a/source/tests/infer/deeppot-testcase.yaml b/source/tests/infer/deeppot-testcase.yaml index 9523b8d1ea..772a06a89b 100644 --- a/source/tests/infer/deeppot-testcase.yaml +++ b/source/tests/infer/deeppot-testcase.yaml @@ -350,6 +350,45 @@ results: 1.391094495316195001e+00, 7.036614101584164338e-01, ] + fit_ll: + [ + -1.930622006643730598e-02, + 7.105172146387829235e-01, + 8.063835335367619539e-01, + -8.414936892447275607e-01, + 1.076881365346436414e+00, + -5.058153291569045251e-01, + -3.104797373867691779e-02, + 7.915138025598530414e-01, + 8.704498369678651537e-01, + -9.394329433114724237e-01, + 1.081177674358831053e+00, + -5.122829163516022799e-01, + 5.307913125575804136e-03, + 7.644783775007328863e-01, + 8.548853566716824171e-01, + -9.264496186379944653e-01, + 1.087178488222722672e+00, + -4.893627623467682874e-01, + -1.098746804357388085e-01, + 8.092546382430507723e-01, + 8.757043853926992361e-01, + -9.036627000544070754e-01, + 1.064706190677472852e+00, + -5.670533963064982030e-01, + -1.270062329805081158e-01, + 8.618261193779762630e-01, + 8.979592934126284787e-01, + -9.939941754957831721e-01, + 1.072078883192923771e+00, + -5.780043831847785363e-01, + -8.617331266742107865e-02, + 8.388158674801169390e-01, + 8.904977456468012864e-01, + -9.751383339999978306e-01, + 1.075378146084344344e+00, + -5.508880199511664300e-01, + ] - coord: [ 12.83, diff --git a/source/tests/infer/test_get_model.py b/source/tests/infer/test_get_model.py new file mode 100644 index 0000000000..4c52dda0a1 --- /dev/null +++ b/source/tests/infer/test_get_model.py @@ -0,0 +1,101 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import unittest + +from deepmd.infer.deep_eval import ( + DeepEval, +) + +from ..consistent.common import ( + parameterized, +) +from .case import ( + 
get_cases, +) + + +@parameterized( + ( + "se_e2_a", + "fparam_aparam", + ), # key + (".pb", ".pth"), # model extension +) +class TestGetModelMethod(unittest.TestCase): + """Test the new get_model method functionality.""" + + @classmethod + def setUpClass(cls) -> None: + key, extension = cls.param + cls.case = get_cases()[key] + cls.model_name = cls.case.get_model(extension) + cls.dp = DeepEval(cls.model_name) + + @classmethod + def tearDownClass(cls) -> None: + cls.dp = None + + def test_get_model_method_exists(self): + """Test that get_model method exists.""" + self.assertTrue( + hasattr(self.dp, "get_model"), "DeepEval should have get_model method" + ) + + def test_get_model_returns_valid_object(self): + """Test that get_model returns a valid model object.""" + model = self.dp.get_model() + self.assertIsNotNone(model, "get_model should return a non-None object") + + def test_get_model_backend_specific(self): + """Test that get_model returns the expected type for each backend.""" + key, extension = self.param + model = self.dp.get_model() + + if extension == ".pth": + # For PyTorch .pth models (TorchScript), should return torch.jit.ScriptModule + import torch + + self.assertIsInstance( + model, + torch.jit.ScriptModule, + "PyTorch .pth model should return TorchScript ScriptModule instance", + ) + # TorchScript modules are also nn.Module instances + self.assertIsInstance( + model, + torch.nn.Module, + "PyTorch .pth model should be a torch.nn.Module instance", + ) + # Check if it has common model methods + self.assertTrue( + hasattr(model, "get_type_map"), + "PyTorch model should have get_type_map method", + ) + self.assertTrue( + hasattr(model, "get_rcut"), + "PyTorch model should have get_rcut method", + ) + elif extension == ".pb": + # For TensorFlow models, should return graph + try: + # Should be a TensorFlow graph or have graph-like properties + self.assertTrue( + hasattr(model, "get_operations") + or str(type(model)).find("Graph") >= 0, + "TensorFlow model 
should be a graph or graph-like object", + ) + except ImportError: + # If TensorFlow not available, skip this assertion + pass + + def test_get_model_consistency(self): + """Test that get_model always returns the same object.""" + model1 = self.dp.get_model() + model2 = self.dp.get_model() + # Should return the same object (not necessarily equal, but same reference) + self.assertIs( + model1, model2, "get_model should return consistent object reference" + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/source/tests/infer/test_models.py b/source/tests/infer/test_models.py index c0543e91cd..7f7b7cc21c 100644 --- a/source/tests/infer/test_models.py +++ b/source/tests/infer/test_models.py @@ -164,6 +164,24 @@ def test_descriptor(self) -> None: expected_descpt = result.descriptor np.testing.assert_almost_equal(descpt.ravel(), expected_descpt.ravel()) + def test_fitting_last_layer(self) -> None: + _, extension = self.param + if extension == ".pb": + self.skipTest("fitting_last_layer not supported for TensorFlow models") + for ii, result in enumerate(self.case.results): + if result.fit_ll is None: + continue + fit_ll = self.dp.eval_fitting_last_layer( + result.coord, result.box, result.atype + ) + expected_fit_ll = result.fit_ll + np.testing.assert_almost_equal(fit_ll.ravel(), expected_fit_ll.ravel()) + fit_ll = self.dp.eval_fitting_last_layer( + result.coord, result.box, result.atype + ) + expected_fit_ll = result.fit_ll + np.testing.assert_almost_equal(fit_ll.ravel(), expected_fit_ll.ravel()) + def test_2frame_atm(self) -> None: for ii, result in enumerate(self.case.results): coords2 = np.concatenate((result.coord, result.coord)) diff --git a/source/tests/jax/test_padding_atoms.py b/source/tests/jax/test_padding_atoms.py new file mode 100644 index 0000000000..42e2ae527c --- /dev/null +++ b/source/tests/jax/test_padding_atoms.py @@ -0,0 +1,127 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import sys +import unittest +from copy import ( + deepcopy, +) 
+ +import numpy as np + +from deepmd.dpmodel.common import ( + to_numpy_array, +) + +if sys.version_info >= (3, 10): + from deepmd.jax.common import ( + to_jax_array, + ) + from deepmd.jax.descriptor.se_e2_a import ( + DescrptSeA, + ) + from deepmd.jax.env import ( + jnp, + ) + from deepmd.jax.fitting.fitting import ( + PropertyFittingNet, + ) + from deepmd.jax.model.property_model import ( + PropertyModel, + ) + + dtype = jnp.float64 + + +@unittest.skipIf( + sys.version_info < (3, 10), + "JAX requires Python 3.10 or later", +) +class TestCaseSingleFrameWithoutNlist: + def setUp(self) -> None: + # nf=2, nloc == 3 + self.nloc = 3 + self.nt = 2 + self.coord = np.array( + [ + [ + [0, 0, 0], + [0, 1, 0], + [0, 0, 1], + ], + [ + [1, 0, 1], + [0, 1, 1], + [1, 1, 0], + ], + ], + dtype=np.float64, + ) + self.atype = np.array([[0, 0, 1], [1, 1, 0]], dtype=int).reshape([2, self.nloc]) + self.cell = 2.0 * np.eye(3).reshape([1, 9]) + self.cell = np.array([self.cell, self.cell]).reshape(2, 9) + self.sel = [16, 8] + self.rcut = 2.2 + self.rcut_smth = 0.4 + self.atol = 1e-12 + + +@unittest.skipIf( + sys.version_info < (3, 10), + "JAX requires Python 3.10 or later", +) +class TestPaddingAtoms(unittest.TestCase, TestCaseSingleFrameWithoutNlist): + def setUp(self): + TestCaseSingleFrameWithoutNlist.setUp(self) + + def test_padding_atoms_consistency(self): + ds = DescrptSeA( + self.rcut, + self.rcut_smth, + self.sel, + ) + ft = PropertyFittingNet( + self.nt, + ds.get_dim_out(), + mixed_types=ds.mixed_types(), + intensive=True, + ) + type_map = ["foo", "bar"] + model = PropertyModel(ds, ft, type_map=type_map) + var_name = model.get_var_name() + args = [to_jax_array(ii) for ii in [self.coord, self.atype, self.cell]] + result = model.call(*args) + # test intensive + np.testing.assert_allclose( + to_numpy_array(result[f"{var_name}_redu"]), + np.mean(to_numpy_array(result[f"{var_name}"]), axis=1), + atol=self.atol, + ) + # test padding atoms + padding_atoms_list = [1, 5, 10] + for 
padding_atoms in padding_atoms_list: + coord = deepcopy(self.coord) + atype = deepcopy(self.atype) + atype_padding = np.pad( + atype, + pad_width=((0, 0), (0, padding_atoms)), + mode="constant", + constant_values=-1, + ) + coord_padding = np.pad( + coord, + pad_width=((0, 0), (0, padding_atoms), (0, 0)), + mode="constant", + constant_values=0, + ) + args = [ + to_jax_array(ii) for ii in [coord_padding, atype_padding, self.cell] + ] + result_padding = model.call(*args) + np.testing.assert_allclose( + to_numpy_array(result[f"{var_name}_redu"]), + to_numpy_array(result_padding[f"{var_name}_redu"]), + atol=self.atol, + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/source/tests/pd/common.py b/source/tests/pd/common.py index d73544c5f1..ec36fd0eb9 100644 --- a/source/tests/pd/common.py +++ b/source/tests/pd/common.py @@ -79,7 +79,12 @@ def eval_model( if spins is not None: assert isinstance(spins, paddle.Tensor), err_msg assert isinstance(atom_types, paddle.Tensor) or isinstance(atom_types, list) - atom_types = paddle.to_tensor(atom_types, dtype=paddle.int32, place=DEVICE) + if isinstance(atom_types, paddle.Tensor): + atom_types = ( + atom_types.clone().detach().to(dtype=paddle.int32, device=DEVICE) + ) + else: + atom_types = paddle.to_tensor(atom_types, dtype=paddle.int32, place=DEVICE) elif isinstance(coords, np.ndarray): if cells is not None: assert isinstance(cells, np.ndarray), err_msg @@ -101,28 +106,57 @@ def eval_model( else: natoms = len(atom_types[0]) - coord_input = paddle.to_tensor( - coords.reshape([-1, natoms, 3]), dtype=GLOBAL_PD_FLOAT_PRECISION, place=DEVICE - ) - spin_input = None - if spins is not None: - spin_input = paddle.to_tensor( - spins.reshape([-1, natoms, 3]), + if isinstance(coords, paddle.Tensor): + coord_input = ( + coords.reshape([-1, natoms, 3]) + .clone() + .detach() + .to(dtype=GLOBAL_PD_FLOAT_PRECISION, device=DEVICE) + ) + else: + coord_input = paddle.to_tensor( + coords.reshape([-1, natoms, 3]), 
dtype=GLOBAL_PD_FLOAT_PRECISION, place=DEVICE, ) + spin_input = None + if spins is not None: + if isinstance(spins, paddle.Tensor): + spin_input = ( + spins.reshape([-1, natoms, 3]) + .clone() + .detach() + .to(dtype=GLOBAL_PD_FLOAT_PRECISION, device=DEVICE) + ) + else: + spin_input = paddle.to_tensor( + spins.reshape([-1, natoms, 3]), + dtype=GLOBAL_PD_FLOAT_PRECISION, + place=DEVICE, + ) has_spin = getattr(model, "has_spin", False) if callable(has_spin): has_spin = has_spin() - type_input = paddle.to_tensor(atom_types, dtype=paddle.int64, place=DEVICE) + if isinstance(atom_types, paddle.Tensor): + type_input = atom_types.clone().detach().to(dtype=paddle.int64, device=DEVICE) + else: + type_input = paddle.to_tensor(atom_types, dtype=paddle.int64, place=DEVICE) box_input = None if cells is None: pbc = False else: pbc = True - box_input = paddle.to_tensor( - cells.reshape([-1, 3, 3]), dtype=GLOBAL_PD_FLOAT_PRECISION, place=DEVICE - ) + if isinstance(cells, paddle.Tensor): + box_input = ( + cells.reshape([-1, 3, 3]) + .clone() + .detach() + .to(dtype=GLOBAL_PD_FLOAT_PRECISION, device=DEVICE) + ) + else: + box_input = paddle.to_tensor( + cells.reshape([-1, 3, 3]), dtype=GLOBAL_PD_FLOAT_PRECISION, place=DEVICE + ) num_iter = int((nframes + infer_batch_size - 1) / infer_batch_size) for ii in range(num_iter): diff --git a/source/tests/pd/model/test_atomic_model_atomic_stat.py b/source/tests/pd/model/test_atomic_model_atomic_stat.py index 93aa7b8905..bfc86edc12 100644 --- a/source/tests/pd/model/test_atomic_model_atomic_stat.py +++ b/source/tests/pd/model/test_atomic_model_atomic_stat.py @@ -5,6 +5,7 @@ Path, ) from typing import ( + NoReturn, Optional, ) @@ -114,10 +115,10 @@ def forward( class TestAtomicModelStat(unittest.TestCase, TestCaseSingleFrameWithNlist): - def tearDown(self): + def tearDown(self) -> None: self.tempdir.cleanup() - def setUp(self): + def setUp(self) -> None: TestCaseSingleFrameWithNlist.setUp(self) self.merged_output_stat = [ { @@ -171,7 +172,7 @@ 
def setUp(self): pass self.stat_file_path = DPPath(h5file, "a") - def test_output_stat(self): + def test_output_stat(self) -> None: nf, nloc, nnei = self.nlist.shape ds = DescrptDPA1( self.rcut, @@ -237,10 +238,12 @@ def cvt_ret(x): expected_ret1["foo"] = ret0["foo"] + foo_bias[at] expected_ret1["bar"] = ret0["bar"] + bar_bias[at] for kk in ["foo", "bar"]: - np.testing.assert_almost_equal(ret1[kk], expected_ret1[kk]) + np.testing.assert_almost_equal( + ret1[kk], expected_ret1[kk], err_msg=f"{kk} not equal" + ) # 3. test bias load from file - def raise_error(): + def raise_error() -> NoReturn: raise RuntimeError md0.compute_or_load_out_stat(raise_error, stat_file_path=self.stat_file_path) @@ -284,10 +287,10 @@ def raise_error(): class TestAtomicModelStatMergeGlobalAtomic( unittest.TestCase, TestCaseSingleFrameWithNlist ): - def tearDown(self): + def tearDown(self) -> None: self.tempdir.cleanup() - def setUp(self): + def setUp(self) -> None: TestCaseSingleFrameWithNlist.setUp(self) self.merged_output_stat = [ { @@ -341,7 +344,7 @@ def setUp(self): pass self.stat_file_path = DPPath(h5file, "a") - def test_output_stat(self): + def test_output_stat(self) -> None: nf, nloc, nnei = self.nlist.shape ds = DescrptDPA1( self.rcut, @@ -401,7 +404,7 @@ def cvt_ret(x): np.testing.assert_almost_equal(ret1[kk], expected_ret1[kk]) # 3. 
test bias load from file - def raise_error(): + def raise_error() -> NoReturn: raise RuntimeError md0.compute_or_load_out_stat(raise_error, stat_file_path=self.stat_file_path) diff --git a/source/tests/pd/model/test_deeppot.py b/source/tests/pd/model/test_deeppot.py new file mode 100644 index 0000000000..24696dea86 --- /dev/null +++ b/source/tests/pd/model/test_deeppot.py @@ -0,0 +1,140 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import json +import os +import unittest +from copy import ( + deepcopy, +) +from pathlib import ( + Path, +) + +import numpy as np +import paddle + +from deepmd.infer.deep_pot import DeepPot as DeepPotUni +from deepmd.pd.entrypoints.main import ( + freeze, + get_trainer, +) +from deepmd.pd.infer.deep_eval import ( + DeepPot, +) + + +class TestDeepPot(unittest.TestCase): + def setUp(self) -> None: + input_json = str(Path(__file__).parent / "water/se_atten.json") + with open(input_json) as f: + self.config = json.load(f) + self.config["training"]["numb_steps"] = 1 + self.config["training"]["save_freq"] = 1 + self.config["training"]["training_data"]["systems"] = [ + str(Path(__file__).parent / "water/data/single") + ] + self.config["training"]["validation_data"]["systems"] = [ + str(Path(__file__).parent / "water/data/single") + ] + self.input_json = "test_dp_test.json" + with open(self.input_json, "w") as fp: + json.dump(self.config, fp, indent=4) + + trainer = get_trainer(deepcopy(self.config)) + trainer.run() + + ori_place = paddle.device.get_device() + paddle.device.set_device("cpu") + input_dict, label_dict, _ = trainer.get_data(is_train=False) + paddle.device.set_device(ori_place) + trainer.wrapper(**input_dict, label=label_dict, cur_lr=1.0) + self.model = "model.pd" + + def tearDown(self) -> None: + for f in os.listdir("."): + if f in ["lcurve.out", self.input_json]: + os.remove(f) + + def test_dp_test(self) -> None: + dp = DeepPot(str(self.model)) + cell = np.array( + [ + 5.122106549439247480e00, + 4.016537340154059388e-01, + 
6.951654033828678081e-01, + 4.016537340154059388e-01, + 6.112136112297989143e00, + 8.178091365465004481e-01, + 6.951654033828678081e-01, + 8.178091365465004481e-01, + 6.159552512682983760e00, + ] + ).reshape(1, 3, 3) + coord = np.array( + [ + 2.978060152121375648e00, + 3.588469695887098077e00, + 2.792459820604495491e00, + 3.895592322591093115e00, + 2.712091020667753760e00, + 1.366836847133650501e00, + 9.955616170888935690e-01, + 4.121324820711413039e00, + 1.817239061889086571e00, + 3.553661462345699906e00, + 5.313046969500791583e00, + 6.635182659098815883e00, + 6.088601018589653080e00, + 6.575011420004332585e00, + 6.825240650611076099e00, + ] + ).reshape(1, -1, 3) + atype = np.array([0, 0, 0, 1, 1]).reshape(1, -1) + + ret = dp.eval(coord, cell, atype, atomic=True) + e, f, v, ae, av = ret[0], ret[1], ret[2], ret[3], ret[4] + self.assertEqual(e.shape, (1, 1)) + self.assertEqual(f.shape, (1, 5, 3)) + self.assertEqual(v.shape, (1, 9)) + self.assertEqual(ae.shape, (1, 5, 1)) + self.assertEqual(av.shape, (1, 5, 9)) + + self.assertEqual(dp.get_type_map(), ["O", "H"]) + self.assertEqual(dp.get_ntypes(), 2) + self.assertEqual(dp.get_dim_fparam(), 0) + self.assertEqual(dp.get_dim_aparam(), 0) + self.assertEqual(dp.deep_eval.model_type, DeepPot) + + def test_uni(self) -> None: + dp = DeepPotUni("model.pd") + self.assertIsInstance(dp, DeepPot) + # its methods has been tested in test_dp_test + + def test_eval_typeebd(self) -> None: + dp = DeepPot(str(self.model)) + eval_typeebd = dp.eval_typeebd() + self.assertEqual( + eval_typeebd.shape, (len(self.config["model"]["type_map"]) + 1, 8) + ) + np.testing.assert_allclose(eval_typeebd[-1], np.zeros_like(eval_typeebd[-1])) + + +@unittest.skip(reason="Freezed model(.json) do not support getting attributes") +class TestDeepPotFrozen(TestDeepPot): + def setUp(self) -> None: + super().setUp() + frozen_model = "frozen_model.json" + freeze( + model=self.model, + output=frozen_model, + head=None, + do_atomic_virial=True, + ) + self.model = 
frozen_model + + # Note: this can not actually disable cuda device to be used + # only can be used to test whether devices are mismatched + @unittest.skipIf(not paddle.device.is_compiled_with_cuda(), "CUDA not available") + @unittest.mock.patch("deepmd.pd.utils.env.DEVICE", paddle.CPUPlace()) + @unittest.mock.patch("deepmd.pd.infer.deep_eval.DEVICE", paddle.CPUPlace()) + def test_dp_test_cpu(self) -> None: + self.test_dp_test() diff --git a/source/tests/pd/model/test_descriptor_dpa1.py b/source/tests/pd/model/test_descriptor_dpa1.py index bfcf4ba6ee..f4cf6a8005 100644 --- a/source/tests/pd/model/test_descriptor_dpa1.py +++ b/source/tests/pd/model/test_descriptor_dpa1.py @@ -368,7 +368,7 @@ def translate_se_atten_and_type_embd_dicts_to_dpa1( source_dict, type_embd_dict, ): - all_keys = list(target_dict.keys()) + all_keys = [key for key in target_dict.keys() if "buffer_" not in key] record = [False for ii in all_keys] for kk, vv in source_dict.items(): tk = "se_atten." + kk diff --git a/source/tests/pd/model/test_descriptor_dpa2.py b/source/tests/pd/model/test_descriptor_dpa2.py index 12017bb840..b8e48580d0 100644 --- a/source/tests/pd/model/test_descriptor_dpa2.py +++ b/source/tests/pd/model/test_descriptor_dpa2.py @@ -190,7 +190,7 @@ def translate_type_embd_dicts_to_dpa2( source_dict, type_embd_dict, ): - all_keys = list(target_dict.keys()) + all_keys = [key for key in target_dict.keys() if "buffer_" not in key] record = [False for ii in all_keys] for kk, vv in source_dict.items(): record[all_keys.index(kk)] = True diff --git a/source/tests/pd/model/test_dynamic_sel.py b/source/tests/pd/model/test_dynamic_sel.py new file mode 100644 index 0000000000..a605d97f85 --- /dev/null +++ b/source/tests/pd/model/test_dynamic_sel.py @@ -0,0 +1,155 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import itertools +import unittest + +import numpy as np +import paddle + +from deepmd.dpmodel.descriptor.dpa3 import ( + RepFlowArgs, +) +from deepmd.pd.model.descriptor import ( + 
DescrptDPA3, +) +from deepmd.pd.utils import ( + env, +) +from deepmd.pd.utils.env import ( + PRECISION_DICT, +) + +from ...seed import ( + GLOBAL_SEED, +) +from .test_env_mat import ( + TestCaseSingleFrameWithNlist, +) + +dtype = env.GLOBAL_PD_FLOAT_PRECISION + + +class TestDescrptDPA3DynamicSel(unittest.TestCase, TestCaseSingleFrameWithNlist): + def setUp(self) -> None: + TestCaseSingleFrameWithNlist.setUp(self) + + def test_consistency( + self, + ) -> None: + rng = np.random.default_rng(100) + nf, nloc, nnei = self.nlist.shape + davg = rng.normal(size=(self.nt, nnei, 4)) + dstd = rng.normal(size=(self.nt, nnei, 4)) + dstd = 0.1 + np.abs(dstd) + + for ( + ua, + rus, + ruri, + acr, + nme, + prec, + ect, + optim, + ) in itertools.product( + [True, False], # update_angle + ["res_residual"], # update_style + ["norm", "const"], # update_residual_init + [0, 1], # a_compress_rate + [1, 2], # n_multi_edge_message + ["float64"], # precision + [False], # use_econf_tebd + [True, False], # optim_update + ): + dtype = PRECISION_DICT[prec] + # rtol, atol = get_tols(prec) + rtol, atol = 1e-5, 1e-7 + if prec == "float64": + atol = 1e-8 # marginal GPU test cases... 
+ + repflow = RepFlowArgs( + n_dim=20, + e_dim=10, + a_dim=10, + nlayers=3, + e_rcut=self.rcut, + e_rcut_smth=self.rcut_smth, + e_sel=nnei, + a_rcut=self.rcut - 0.1, + a_rcut_smth=self.rcut_smth, + a_sel=nnei, + a_compress_rate=acr, + n_multi_edge_message=nme, + axis_neuron=4, + update_angle=ua, + update_style=rus, + update_residual_init=ruri, + optim_update=optim, + smooth_edge_update=True, + sel_reduce_factor=1.0, # test consistent when sel_reduce_factor == 1.0 + ) + + # dpa3 new impl + dd0 = DescrptDPA3( + self.nt, + repflow=repflow, + # kwargs for descriptor + exclude_types=[], + precision=prec, + use_econf_tebd=ect, + type_map=["O", "H"] if ect else None, + seed=GLOBAL_SEED, + ).to(env.DEVICE) + + repflow.use_dynamic_sel = True + + # dpa3 new impl + dd1 = DescrptDPA3( + self.nt, + repflow=repflow, + # kwargs for descriptor + exclude_types=[], + precision=prec, + use_econf_tebd=ect, + type_map=["O", "H"] if ect else None, + seed=GLOBAL_SEED, + ).to(env.DEVICE) + + dd0.repflows.mean = paddle.to_tensor(davg, dtype=dtype).to( + device=env.DEVICE + ) + dd0.repflows.stddev = paddle.to_tensor(dstd, dtype=dtype).to( + device=env.DEVICE + ) + rd0, _, _, _, _ = dd0( + paddle.to_tensor(self.coord_ext, dtype=dtype).to(device=env.DEVICE), + paddle.to_tensor(self.atype_ext, dtype=paddle.int64).to( + device=env.DEVICE + ), + paddle.to_tensor(self.nlist, dtype=paddle.int64).to(device=env.DEVICE), + paddle.to_tensor(self.mapping, dtype=paddle.int64).to( + device=env.DEVICE + ), + ) + # serialization + dd1.repflows.mean = paddle.to_tensor(davg, dtype=dtype).to( + device=env.DEVICE + ) + dd1.repflows.stddev = paddle.to_tensor(dstd, dtype=dtype).to( + device=env.DEVICE + ) + rd1, _, _, _, _ = dd1( + paddle.to_tensor(self.coord_ext, dtype=dtype).to(device=env.DEVICE), + paddle.to_tensor(self.atype_ext, dtype=paddle.int64).to( + device=env.DEVICE + ), + paddle.to_tensor(self.nlist, dtype=paddle.int64).to(device=env.DEVICE), + paddle.to_tensor(self.mapping, dtype=paddle.int64).to( + 
device=env.DEVICE + ), + ) + np.testing.assert_allclose( + rd0.numpy(), + rd1.numpy(), + rtol=rtol, + atol=atol, + ) diff --git a/source/tests/pd/model/test_model.py b/source/tests/pd/model/test_model.py index ce91fd3f21..fa62c28922 100644 --- a/source/tests/pd/model/test_model.py +++ b/source/tests/pd/model/test_model.py @@ -400,7 +400,7 @@ def test_consistency(self) -> None: .detach() .numpy(), ) - self.assertIsNone(model_predict_1.get("atom_virial", None)) + # self.assertIsNone(model_predict_1.get("atom_virial", None)) np.testing.assert_allclose( head_dict["atom_virial"], p_atomic_virial.reshape(head_dict["atom_virial"].shape) diff --git a/source/tests/pd/test_training.py b/source/tests/pd/test_training.py index 8958dcb165..0dc36fa314 100644 --- a/source/tests/pd/test_training.py +++ b/source/tests/pd/test_training.py @@ -150,9 +150,25 @@ def setUp(self) -> None: self.config["model"] = deepcopy(model_se_e2_a) self.config["training"]["numb_steps"] = 1 self.config["training"]["save_freq"] = 1 - # import paddle enable_prim(True) - # assert paddle.framework.core._is_eager_prim_enabled() + + def tearDown(self) -> None: + DPTrainTest.tearDown(self) + + +class TestEnergyModelGradientAccumulation(unittest.TestCase, DPTrainTest): + def setUp(self) -> None: + input_json = str(Path(__file__).parent / "water/se_atten.json") + with open(input_json) as f: + self.config = json.load(f) + data_file = [str(Path(__file__).parent / "water/data/data_0")] + self.config["training"]["training_data"]["systems"] = data_file + self.config["training"]["validation_data"]["systems"] = data_file + self.config["model"] = deepcopy(model_se_e2_a) + self.config["training"]["numb_steps"] = 1 + self.config["training"]["save_freq"] = 1 + self.config["training"]["acc_freq"] = 4 + enable_prim(True) def tearDown(self) -> None: DPTrainTest.tearDown(self) diff --git a/source/tests/pt/common.py b/source/tests/pt/common.py index 8709c8b4f9..2dbfdb84ff 100644 --- a/source/tests/pt/common.py +++ 
b/source/tests/pt/common.py @@ -79,7 +79,12 @@ def eval_model( if spins is not None: assert isinstance(spins, torch.Tensor), err_msg assert isinstance(atom_types, torch.Tensor) or isinstance(atom_types, list) - atom_types = torch.tensor(atom_types, dtype=torch.int32, device=DEVICE) + if isinstance(atom_types, torch.Tensor): + atom_types = ( + atom_types.clone().detach().to(dtype=torch.int32, device=DEVICE) + ) + else: + atom_types = torch.tensor(atom_types, dtype=torch.int32, device=DEVICE) elif isinstance(coords, np.ndarray): if cells is not None: assert isinstance(cells, np.ndarray), err_msg @@ -101,28 +106,59 @@ def eval_model( else: natoms = len(atom_types[0]) - coord_input = torch.tensor( - coords.reshape([-1, natoms, 3]), dtype=GLOBAL_PT_FLOAT_PRECISION, device=DEVICE - ) - spin_input = None - if spins is not None: - spin_input = torch.tensor( - spins.reshape([-1, natoms, 3]), + if isinstance(coords, torch.Tensor): + coord_input = ( + coords.reshape([-1, natoms, 3]) + .clone() + .detach() + .to(dtype=GLOBAL_PT_FLOAT_PRECISION, device=DEVICE) + ) + else: + coord_input = torch.tensor( + coords.reshape([-1, natoms, 3]), dtype=GLOBAL_PT_FLOAT_PRECISION, device=DEVICE, ) + spin_input = None + if spins is not None: + if isinstance(spins, torch.Tensor): + spin_input = ( + spins.reshape([-1, natoms, 3]) + .clone() + .detach() + .to(dtype=GLOBAL_PT_FLOAT_PRECISION, device=DEVICE) + ) + else: + spin_input = torch.tensor( + spins.reshape([-1, natoms, 3]), + dtype=GLOBAL_PT_FLOAT_PRECISION, + device=DEVICE, + ) has_spin = getattr(model, "has_spin", False) if callable(has_spin): has_spin = has_spin() - type_input = torch.tensor(atom_types, dtype=torch.long, device=DEVICE) + if isinstance(atom_types, torch.Tensor): + type_input = atom_types.clone().detach().to(dtype=torch.long, device=DEVICE) + else: + type_input = torch.tensor(atom_types, dtype=torch.long, device=DEVICE) box_input = None if cells is None: pbc = False else: pbc = True - box_input = torch.tensor( - 
cells.reshape([-1, 3, 3]), dtype=GLOBAL_PT_FLOAT_PRECISION, device=DEVICE - ) + if isinstance(cells, torch.Tensor): + box_input = ( + cells.reshape([-1, 3, 3]) + .clone() + .detach() + .to(dtype=GLOBAL_PT_FLOAT_PRECISION, device=DEVICE) + ) + else: + box_input = torch.tensor( + cells.reshape([-1, 3, 3]), + dtype=GLOBAL_PT_FLOAT_PRECISION, + device=DEVICE, + ) num_iter = int((nframes + infer_batch_size - 1) / infer_batch_size) for ii in range(num_iter): diff --git a/source/tests/pt/model/test_linear_atomic_model_stat.py b/source/tests/pt/model/test_linear_atomic_model_stat.py index 90758526b9..0f92f1253d 100644 --- a/source/tests/pt/model/test_linear_atomic_model_stat.py +++ b/source/tests/pt/model/test_linear_atomic_model_stat.py @@ -233,16 +233,11 @@ def test_linear_atomic_model_stat_with_bias(self) -> None: linear_model.compute_or_load_out_stat( self.merged_output_stat, stat_file_path=self.stat_file_path ) - # bias applied to sub atomic models. ener_bias = np.array([1.0, 3.0]).reshape(2, 1) - linear_ret = [] - for idx, md in enumerate(linear_model.models): - ret = md.forward_common_atomic(*args) - ret = to_numpy_array(ret["energy"]) - linear_ret.append(ret_no_bias[idx] + ener_bias[at]) - np.testing.assert_almost_equal((ret_no_bias[idx] + ener_bias[at]), ret) + ret = to_numpy_array(linear_model.forward_common_atomic(*args)["energy"]) + np.testing.assert_almost_equal((ret0 + ener_bias[at]), ret) # linear model not adding bias again ret1 = linear_model.forward_common_atomic(*args) ret1 = to_numpy_array(ret1["energy"]) - np.testing.assert_almost_equal(np.mean(np.stack(linear_ret), axis=0), ret1) + np.testing.assert_almost_equal(ret, ret1) diff --git a/source/tests/pt/model/test_saveload_dpa1.py b/source/tests/pt/model/test_saveload_dpa1.py index d09d156d4e..73d6adc0b1 100644 --- a/source/tests/pt/model/test_saveload_dpa1.py +++ b/source/tests/pt/model/test_saveload_dpa1.py @@ -69,7 +69,6 @@ def setUp(self) -> None: batch_size=None, num_workers=0, # setting to 0 
diverges the behavior of its iterator; should be >=1 drop_last=False, - pin_memory=True, ) def cycle_iterator(iterable): diff --git a/source/tests/pt/model/test_saveload_se_e2_a.py b/source/tests/pt/model/test_saveload_se_e2_a.py index 12cd854664..d9bd7e0c95 100644 --- a/source/tests/pt/model/test_saveload_se_e2_a.py +++ b/source/tests/pt/model/test_saveload_se_e2_a.py @@ -69,7 +69,6 @@ def setUp(self) -> None: batch_size=None, num_workers=0, # setting to 0 diverges the behavior of its iterator; should be >=1 drop_last=False, - pin_memory=True, ) def cycle_iterator(iterable): diff --git a/source/tests/pt/model/water/data/data_0/set.000/aparam.npy b/source/tests/pt/model/water/data/data_0/set.000/aparam.npy new file mode 100644 index 0000000000..3058d9d660 Binary files /dev/null and b/source/tests/pt/model/water/data/data_0/set.000/aparam.npy differ diff --git a/source/tests/pt/model/water/data/data_0/set.000/fparam.npy b/source/tests/pt/model/water/data/data_0/set.000/fparam.npy new file mode 100644 index 0000000000..770b4a5d66 Binary files /dev/null and b/source/tests/pt/model/water/data/data_0/set.000/fparam.npy differ diff --git a/source/tests/pt/model/water/data/single/set.000/aparam.npy b/source/tests/pt/model/water/data/single/set.000/aparam.npy new file mode 100644 index 0000000000..af871fd2ab Binary files /dev/null and b/source/tests/pt/model/water/data/single/set.000/aparam.npy differ diff --git a/source/tests/pt/model/water/data/single/set.000/fparam.npy b/source/tests/pt/model/water/data/single/set.000/fparam.npy new file mode 100644 index 0000000000..0a55fbe92a Binary files /dev/null and b/source/tests/pt/model/water/data/single/set.000/fparam.npy differ diff --git a/source/tests/pt/test_calculator.py b/source/tests/pt/test_calculator.py index c73bbad415..7458117ca3 100644 --- a/source/tests/pt/test_calculator.py +++ b/source/tests/pt/test_calculator.py @@ -64,17 +64,108 @@ def test_calculator(self) -> None: atomic_numbers = [1, 1, 1, 8, 8] idx_perm = 
[1, 0, 4, 3, 2] + # Convert tensors to numpy for ASE compatibility + cell_np = cell.numpy() + coord_np = coord.numpy() + + prec = 1e-10 + low_prec = 1e-4 + + ase_atoms0 = Atoms( + numbers=atomic_numbers, + positions=coord_np, + # positions=[tuple(item) for item in coordinate], + cell=cell_np, + calculator=self.calculator, + pbc=True, + ) + e0, f0 = ase_atoms0.get_potential_energy(), ase_atoms0.get_forces() + s0, v0 = ( + ase_atoms0.get_stress(voigt=True), + -ase_atoms0.get_stress(voigt=False) * ase_atoms0.get_volume(), + ) + + ase_atoms1 = Atoms( + numbers=[atomic_numbers[i] for i in idx_perm], + positions=coord_np[idx_perm, :], + # positions=[tuple(item) for item in coordinate], + cell=cell_np, + calculator=self.calculator, + pbc=True, + ) + e1, f1 = ase_atoms1.get_potential_energy(), ase_atoms1.get_forces() + s1, v1 = ( + ase_atoms1.get_stress(voigt=True), + -ase_atoms1.get_stress(voigt=False) * ase_atoms1.get_volume(), + ) + + assert isinstance(e0, float) + assert f0.shape == (natoms, 3) + assert v0.shape == (3, 3) + np.testing.assert_allclose(e0, e1, rtol=low_prec, atol=prec) + np.testing.assert_allclose(f0[idx_perm, :], f1, rtol=low_prec, atol=prec) + np.testing.assert_allclose(s0, s1, rtol=low_prec, atol=prec) + np.testing.assert_allclose(v0, v1, rtol=low_prec, atol=prec) + + +class TestCalculatorWithFparamAparam(unittest.TestCase): + def setUp(self) -> None: + input_json = str(Path(__file__).parent / "water/se_atten.json") + with open(input_json) as f: + self.config = json.load(f) + self.config["training"]["numb_steps"] = 1 + self.config["model"]["fitting_net"]["numb_fparam"] = 2 + self.config["model"]["fitting_net"]["numb_aparam"] = 1 + self.config["training"]["save_freq"] = 1 + data_file = [str(Path(__file__).parent / "water/data/data_0")] + self.config["training"]["training_data"]["systems"] = data_file + self.config["training"]["validation_data"]["systems"] = [ + str(Path(__file__).parent / "water/data/single") + ] + self.input_json = "test_dp_test.json" 
+ with open(self.input_json, "w") as fp: + json.dump(self.config, fp, indent=4) + + trainer = get_trainer(deepcopy(self.config)) + trainer.run() + + with torch.device("cpu"): + input_dict, label_dict, _ = trainer.get_data(is_train=False) + _, _, more_loss = trainer.wrapper(**input_dict, label=label_dict, cur_lr=1.0) + + self.calculator = DPCalculator("model.pt") + + def test_calculator(self) -> None: + from ase import ( + Atoms, + ) + + natoms = 5 + cell = torch.eye(3, dtype=dtype, device="cpu") * 10 + generator = torch.Generator(device="cpu").manual_seed(GLOBAL_SEED) + coord = torch.rand([natoms, 3], dtype=dtype, device="cpu", generator=generator) + coord = torch.matmul(coord, cell) + fparam = torch.IntTensor([1, 2]).numpy() + aparam = torch.IntTensor([[1], [0], [2], [1], [0]]).numpy() + atomic_numbers = [1, 1, 1, 8, 8] + idx_perm = [1, 0, 4, 3, 2] + + # Convert tensors to numpy for ASE compatibility + cell_np = cell.numpy() + coord_np = coord.numpy() + prec = 1e-10 low_prec = 1e-4 ase_atoms0 = Atoms( numbers=atomic_numbers, - positions=coord, + positions=coord_np, # positions=[tuple(item) for item in coordinate], - cell=cell, + cell=cell_np, calculator=self.calculator, pbc=True, ) + ase_atoms0.info.update({"fparam": fparam, "aparam": aparam}) e0, f0 = ase_atoms0.get_potential_energy(), ase_atoms0.get_forces() s0, v0 = ( ase_atoms0.get_stress(voigt=True), @@ -83,12 +174,13 @@ def test_calculator(self) -> None: ase_atoms1 = Atoms( numbers=[atomic_numbers[i] for i in idx_perm], - positions=coord[idx_perm, :], + positions=coord_np[idx_perm, :], # positions=[tuple(item) for item in coordinate], - cell=cell, + cell=cell_np, calculator=self.calculator, pbc=True, ) + ase_atoms1.info.update({"fparam": fparam, "aparam": aparam[idx_perm, :]}) e1, f1 = ase_atoms1.get_potential_energy(), ase_atoms1.get_forces() s1, v1 = ( ase_atoms1.get_stress(voigt=True), diff --git a/source/tests/pt/test_dp_show.py b/source/tests/pt/test_dp_show.py index d27e5e69d9..d816e3d0b4 100644 --- 
a/source/tests/pt/test_dp_show.py +++ b/source/tests/pt/test_dp_show.py @@ -47,43 +47,53 @@ def setUp(self) -> None: def test_checkpoint(self) -> None: INPUT = "model.pt" - ATTRIBUTES = "type-map descriptor fitting-net size" + ATTRIBUTES = "type-map descriptor fitting-net size observed-type" with redirect_stderr(io.StringIO()) as f: run_dp(f"dp --pt show {INPUT} {ATTRIBUTES}") - results = f.getvalue().split("\n")[:-1] - assert "This is a singletask model" in results[-8] - assert "The type_map is ['O', 'H', 'Au']" in results[-7] + results = [ + res for res in f.getvalue().split("\n")[:-1] if "DEEPMD WARNING" not in res + ] # filter out warnings + assert "This is a singletask model" in results[0] + assert "The type_map is ['O', 'H', 'Au']" in results[1] assert ( "{'type': 'se_e2_a'" and "'sel': [46, 92, 4]" and "'rcut': 4.0" - ) in results[-6] + ) in results[2] assert ( "The fitting_net parameter is {'neuron': [24, 24, 24], 'resnet_dt': True, 'seed': 1}" - in results[-5] + in results[3] ) - assert "Parameter counts:" in results[-4] - assert "Parameters in descriptor: 19,350" in results[-3] - assert "Parameters in fitting-net: 119,091" in results[-2] - assert "Parameters in total: 138,441" in results[-1] + assert "Parameter counts:" in results[4] + assert "Parameters in descriptor: 19,350" in results[5] + assert "Parameters in fitting-net: 119,091" in results[6] + assert "Parameters in total: 138,441" in results[7] + assert "The observed types for this model:" in results[8] + assert "Number of observed types: 2" in results[9] + assert "Observed types: ['H', 'O']" in results[10] def test_frozen_model(self) -> None: INPUT = "frozen_model.pth" - ATTRIBUTES = "type-map descriptor fitting-net size" + ATTRIBUTES = "type-map descriptor fitting-net size observed-type" with redirect_stderr(io.StringIO()) as f: run_dp(f"dp --pt show {INPUT} {ATTRIBUTES}") - results = f.getvalue().split("\n")[:-1] - assert "This is a singletask model" in results[-8] - assert "The type_map is 
['O', 'H', 'Au']" in results[-7] + results = [ + res for res in f.getvalue().split("\n")[:-1] if "DEEPMD WARNING" not in res + ] # filter out warnings + assert "This is a singletask model" in results[0] + assert "The type_map is ['O', 'H', 'Au']" in results[1] assert ( "{'type': 'se_e2_a'" and "'sel': [46, 92, 4]" and "'rcut': 4.0" - ) in results[-6] + ) in results[2] assert ( "The fitting_net parameter is {'neuron': [24, 24, 24], 'resnet_dt': True, 'seed': 1}" - in results[-5] + in results[3] ) - assert "Parameter counts:" in results[-4] - assert "Parameters in descriptor: 19,350" in results[-3] - assert "Parameters in fitting-net: 119,091" in results[-2] - assert "Parameters in total: 138,441" in results[-1] + assert "Parameter counts:" in results[4] + assert "Parameters in descriptor: 19,350" in results[5] + assert "Parameters in fitting-net: 119,091" in results[6] + assert "Parameters in total: 138,441" in results[7] + assert "The observed types for this model:" in results[8] + assert "Number of observed types: 2" in results[9] + assert "Observed types: ['H', 'O']" in results[10] # only covers two elements def test_checkpoint_error(self) -> None: INPUT = "model.pt" @@ -152,62 +162,81 @@ def setUp(self) -> None: def test_checkpoint(self) -> None: INPUT = "model.ckpt.pt" - ATTRIBUTES = "model-branch type-map descriptor fitting-net size" + ATTRIBUTES = "model-branch type-map descriptor fitting-net size observed-type" with redirect_stderr(io.StringIO()) as f: run_dp(f"dp --pt show {INPUT} {ATTRIBUTES}") - results = f.getvalue().split("\n")[:-1] - assert "This is a multitask model" in results[-12] + results = [ + res + for res in f.getvalue().split("\n")[:-1] + if "DEEPMD WARNING" not in res + and "|" not in res + and "+-" not in res + and "Detailed information" not in res + ] # filter out warnings and tables + assert "This is a multitask model" in results[0] assert ( "Available model branches are ['model_1', 'model_2', 'RANDOM'], " "where 'RANDOM' means using a 
randomly initialized fitting net." - in results[-11] + in results[1] ) - assert "The type_map of branch model_1 is ['O', 'H', 'B']" in results[-10] - assert "The type_map of branch model_2 is ['O', 'H', 'B']" in results[-9] + assert "The type_map of branch model_1 is ['O', 'H', 'B']" in results[2] + assert "The type_map of branch model_2 is ['O', 'H', 'B']" in results[3] assert ( "model_1" and "'type': 'se_e2_a'" and "'sel': [46, 92, 4]" and "'rcut_smth': 0.5" - ) in results[-8] + ) in results[4] assert ( "model_2" and "'type': 'se_e2_a'" and "'sel': [46, 92, 4]" and "'rcut_smth': 0.5" - ) in results[-7] + ) in results[5] assert ( "The fitting_net parameter of branch model_1 is {'neuron': [1, 2, 3], 'seed': 678}" - in results[-6] + in results[6] ) assert ( "The fitting_net parameter of branch model_2 is {'neuron': [9, 8, 7], 'seed': 1111}" - in results[-5] + in results[7] ) - assert "Parameter counts for a single branch model:" in results[-4] - assert "Parameters in descriptor: 19,350" in results[-3] - assert "Parameters in fitting-net: 4,860" in results[-2] - assert "Parameters in total: 24,210" in results[-1] + assert "Parameter counts for a single branch model:" in results[8] + assert "Parameters in descriptor: 19,350" in results[9] + assert "Parameters in fitting-net: 4,860" in results[10] + assert "Parameters in total: 24,210" in results[11] + assert "The observed types for each branch:" in results[12] + assert "model_1: Number of observed types: 2" in results[13] + assert "model_1: Observed types: ['H', 'O']" in results[14] + assert "model_2: Number of observed types: 2" in results[15] + assert "model_2: Observed types: ['H', 'O']" in results[16] + assert "TOTAL number of observed types in the model: 2" in results[17] + assert "TOTAL observed types in the model: ['H', 'O']" in results[18] def test_frozen_model(self) -> None: INPUT = "frozen_model.pth" - ATTRIBUTES = "type-map descriptor fitting-net size" + ATTRIBUTES = "type-map descriptor fitting-net size 
observed-type" with redirect_stderr(io.StringIO()) as f: run_dp(f"dp --pt show {INPUT} {ATTRIBUTES}") - results = f.getvalue().split("\n")[:-1] - assert "This is a singletask model" in results[-8] - assert "The type_map is ['O', 'H', 'B']" in results[-7] + results = [ + res for res in f.getvalue().split("\n")[:-1] if "DEEPMD WARNING" not in res + ] # filter out warnings + assert "This is a singletask model" in results[0] + assert "The type_map is ['O', 'H', 'B']" in results[1] assert ( "'type': 'se_e2_a'" and "'sel': [46, 92, 4]" and "'rcut_smth': 0.5" - ) in results[-6] + ) in results[2] assert ( "The fitting_net parameter is {'neuron': [1, 2, 3], 'seed': 678}" - in results[-5] + in results[3] ) - assert "Parameter counts:" in results[-4] - assert "Parameters in descriptor: 19,350" in results[-3] - assert "Parameters in fitting-net: 4,860" in results[-2] - assert "Parameters in total: 24,210" in results[-1] + assert "Parameter counts:" in results[4] + assert "Parameters in descriptor: 19,350" in results[5] + assert "Parameters in fitting-net: 4,860" in results[6] + assert "Parameters in total: 24,210" in results[7] + assert "The observed types for this model:" in results[8] + assert "Number of observed types: 2" in results[9] + assert "Observed types: ['H', 'O']" in results[10] # only covers two elements def tearDown(self) -> None: for f in os.listdir("."): diff --git a/source/tests/pt/test_dp_test.py b/source/tests/pt/test_dp_test.py index c2915c7ee7..1c11541e50 100644 --- a/source/tests/pt/test_dp_test.py +++ b/source/tests/pt/test_dp_test.py @@ -15,12 +15,19 @@ import torch from deepmd.entrypoints.test import test as dp_test +from deepmd.entrypoints.test import test_ener as dp_test_ener +from deepmd.infer.deep_eval import ( + DeepEval, +) from deepmd.pt.entrypoints.main import ( get_trainer, ) from deepmd.pt.utils.utils import ( to_numpy_array, ) +from deepmd.utils.data import ( + DeepmdData, +) from .model.test_permutation import ( model_property, @@ -30,7 
+37,9 @@ class DPTest: - def test_dp_test_1_frame(self) -> None: + def _run_dp_test( + self, use_input_json: bool, numb_test: int = 0, use_train: bool = False + ) -> None: trainer = get_trainer(deepcopy(self.config)) with torch.device("cpu"): input_dict, label_dict, _ = trainer.get_data(is_train=False) @@ -44,12 +53,17 @@ def test_dp_test_1_frame(self) -> None: model = torch.jit.script(trainer.model) tmp_model = tempfile.NamedTemporaryFile(delete=False, suffix=".pth") torch.jit.save(model, tmp_model.name) + val_sys = self.config["training"]["validation_data"]["systems"] + if isinstance(val_sys, list): + val_sys = val_sys[0] dp_test( model=tmp_model.name, - system=self.config["training"]["validation_data"]["systems"][0], + system=None if use_input_json else val_sys, datafile=None, + train_json=self.input_json if use_input_json and use_train else None, + valid_json=self.input_json if use_input_json and not use_train else None, set_prefix="set", - numb_test=0, + numb_test=numb_test, rand_seed=None, shuffle_test=False, detail_file=self.detail_file, @@ -93,6 +107,20 @@ def test_dp_test_1_frame(self) -> None: ).reshape(-1, 3), ) + def test_dp_test_1_frame(self) -> None: + self._run_dp_test(False) + + def test_dp_test_input_json(self) -> None: + self._run_dp_test(True) + + def test_dp_test_input_json_train(self) -> None: + with open(self.input_json) as f: + cfg = json.load(f) + cfg["training"]["validation_data"]["systems"] = ["non-existent"] + with open(self.input_json, "w") as f: + json.dump(cfg, f, indent=4) + self._run_dp_test(True, use_train=True) + def tearDown(self) -> None: for f in os.listdir("."): if f.startswith("model") and f.endswith(".pt"): @@ -140,6 +168,208 @@ def setUp(self) -> None: json.dump(self.config, fp, indent=4) +class TestDPTestSeARglob(unittest.TestCase): + def setUp(self) -> None: + self.detail_file = "test_dp_test_ener_rglob_detail" + input_json = str(Path(__file__).parent / "water/se_atten.json") + with open(input_json) as f: + self.config = 
json.load(f) + self.config["training"]["numb_steps"] = 1 + self.config["training"]["save_freq"] = 1 + data_file = [str(Path(__file__).parent / "water/data/single")] + self.config["training"]["training_data"]["systems"] = data_file + root_dir = str(Path(__file__).parent) + self.config["training"]["validation_data"]["systems"] = root_dir + self.config["training"]["validation_data"]["rglob_patterns"] = [ + "water/data/single" + ] + self.config["model"] = deepcopy(model_se_e2_a) + self.input_json = "test_dp_test_rglob.json" + with open(self.input_json, "w") as fp: + json.dump(self.config, fp, indent=4) + + def test_dp_test_input_json_rglob(self) -> None: + trainer = get_trainer(deepcopy(self.config)) + with torch.device("cpu"): + input_dict, _, _ = trainer.get_data(is_train=False) + input_dict.pop("spin", None) + model = torch.jit.script(trainer.model) + tmp_model = tempfile.NamedTemporaryFile(delete=False, suffix=".pth") + torch.jit.save(model, tmp_model.name) + dp_test( + model=tmp_model.name, + system=None, + datafile=None, + valid_json=self.input_json, + set_prefix="set", + numb_test=1, + rand_seed=None, + shuffle_test=False, + detail_file=self.detail_file, + atomic=False, + ) + os.unlink(tmp_model.name) + self.assertTrue(os.path.exists(self.detail_file + ".e.out")) + + def tearDown(self) -> None: + for f in os.listdir("."): + if f.startswith("model") and f.endswith(".pt"): + os.remove(f) + if f.startswith(self.detail_file): + os.remove(f) + if f in ["lcurve.out", self.input_json]: + os.remove(f) + if f in ["stat_files"]: + shutil.rmtree(f) + + +class TestDPTestSeARglobTrain(unittest.TestCase): + def setUp(self) -> None: + self.detail_file = "test_dp_test_ener_rglob_train_detail" + input_json = str(Path(__file__).parent / "water/se_atten.json") + with open(input_json) as f: + self.config = json.load(f) + self.config["training"]["numb_steps"] = 1 + self.config["training"]["save_freq"] = 1 + root_dir = str(Path(__file__).parent) + 
self.config["training"]["training_data"]["systems"] = root_dir + self.config["training"]["training_data"]["rglob_patterns"] = [ + "water/data/single" + ] + data_file = [str(Path(__file__).parent / "water/data/single")] + self.config["training"]["validation_data"]["systems"] = data_file + self.config["model"] = deepcopy(model_se_e2_a) + self.input_json = "test_dp_test_rglob_train.json" + with open(self.input_json, "w") as fp: + json.dump(self.config, fp, indent=4) + + def test_dp_test_input_json_rglob_train(self) -> None: + trainer = get_trainer(deepcopy(self.config)) + with torch.device("cpu"): + input_dict, _, _ = trainer.get_data(is_train=False) + input_dict.pop("spin", None) + model = torch.jit.script(trainer.model) + tmp_model = tempfile.NamedTemporaryFile(delete=False, suffix=".pth") + torch.jit.save(model, tmp_model.name) + dp_test( + model=tmp_model.name, + system=None, + datafile=None, + train_json=self.input_json, + set_prefix="set", + numb_test=1, + rand_seed=None, + shuffle_test=False, + detail_file=self.detail_file, + atomic=False, + ) + os.unlink(tmp_model.name) + self.assertTrue(os.path.exists(self.detail_file + ".e.out")) + + def tearDown(self) -> None: + for f in os.listdir("."): + if f.startswith("model") and f.endswith(".pt"): + os.remove(f) + if f.startswith(self.detail_file): + os.remove(f) + if f in ["lcurve.out", self.input_json]: + os.remove(f) + if f in ["stat_files"]: + shutil.rmtree(f) + + +class TestDPTestForceWeight(DPTest, unittest.TestCase): + def setUp(self) -> None: + self.detail_file = "test_dp_test_force_weight_detail" + input_json = str(Path(__file__).parent / "water/se_atten.json") + with open(input_json) as f: + self.config = json.load(f) + self.config["training"]["numb_steps"] = 1 + self.config["training"]["save_freq"] = 1 + system_dir = self._prepare_weighted_system() + data_file = [system_dir] + self.config["training"]["training_data"]["systems"] = data_file + self.config["training"]["validation_data"]["systems"] = data_file 
+ self.config["model"] = deepcopy(model_se_e2_a) + self.system_dir = system_dir + self.input_json = "test_dp_test_force_weight.json" + with open(self.input_json, "w") as fp: + json.dump(self.config, fp, indent=4) + + def _prepare_weighted_system(self) -> str: + src = Path(__file__).parent / "water/data/single" + tmp_dir = tempfile.mkdtemp() + shutil.copytree(src, tmp_dir, dirs_exist_ok=True) + set_dir = Path(tmp_dir) / "set.000" + forces = np.load(set_dir / "force.npy") + forces[0, :3] += 1.0 + forces[0, -3:] += 10.0 + np.save(set_dir / "force.npy", forces) + natoms = forces.shape[1] // 3 + atom_pref = np.ones((forces.shape[0], natoms), dtype=forces.dtype) + atom_pref[:, 0] = 2.0 + atom_pref[:, -1] = 0.0 + np.save(set_dir / "atom_pref.npy", atom_pref) + return tmp_dir + + def test_force_weight(self) -> None: + trainer = get_trainer(deepcopy(self.config)) + with torch.device("cpu"): + trainer.get_data(is_train=False) + model = torch.jit.script(trainer.model) + tmp_model = tempfile.NamedTemporaryFile(delete=False, suffix=".pth") + torch.jit.save(model, tmp_model.name) + dp = DeepEval(tmp_model.name) + data = DeepmdData( + self.system_dir, + set_prefix="set", + shuffle_test=False, + type_map=dp.get_type_map(), + sort_atoms=False, + ) + err = dp_test_ener( + dp, + data, + self.system_dir, + numb_test=1, + detail_file=None, + has_atom_ener=False, + ) + test_data = data.get_test() + coord = test_data["coord"].reshape([1, -1]) + box = test_data["box"][:1] + atype = test_data["type"][0] + ret = dp.eval( + coord, + box, + atype, + fparam=None, + aparam=None, + atomic=False, + efield=None, + mixed_type=False, + spin=None, + ) + force_pred = ret[1].reshape([1, -1]) + force_true = test_data["force"][:1] + weight = test_data["atom_pref"][:1] + diff = force_pred - force_true + mae_unweighted = np.sum(np.abs(diff)) / diff.size + rmse_unweighted = np.sqrt(np.sum(diff * diff) / diff.size) + denom = weight.sum() + mae_weighted = np.sum(np.abs(diff) * weight) / denom + rmse_weighted 
= np.sqrt(np.sum(diff * diff * weight) / denom) + np.testing.assert_allclose(err["mae_f"][0], mae_unweighted) + np.testing.assert_allclose(err["rmse_f"][0], rmse_unweighted) + np.testing.assert_allclose(err["mae_fw"][0], mae_weighted) + np.testing.assert_allclose(err["rmse_fw"][0], rmse_weighted) + os.unlink(tmp_model.name) + + def tearDown(self) -> None: + super().tearDown() + shutil.rmtree(self.system_dir) + + class TestDPTestPropertySeA(unittest.TestCase): def setUp(self) -> None: self.detail_file = "test_dp_test_property_detail" diff --git a/source/tests/pt/test_eval_desc.py b/source/tests/pt/test_eval_desc.py new file mode 100644 index 0000000000..ff79a0a376 --- /dev/null +++ b/source/tests/pt/test_eval_desc.py @@ -0,0 +1,101 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import json +import os +import shutil +import tempfile +import unittest +from copy import ( + deepcopy, +) +from pathlib import ( + Path, +) + +import numpy as np +import torch + +from deepmd.entrypoints.eval_desc import ( + eval_desc, +) +from deepmd.pt.entrypoints.main import ( + get_trainer, +) + +from .model.test_permutation import ( + model_se_e2_a, +) + + +class DPEvalDesc: + def test_dp_eval_desc_1_frame(self) -> None: + trainer = get_trainer(deepcopy(self.config)) + with torch.device("cpu"): + input_dict, label_dict, _ = trainer.get_data(is_train=False) + has_spin = getattr(trainer.model, "has_spin", False) + if callable(has_spin): + has_spin = has_spin() + if not has_spin: + input_dict.pop("spin", None) + input_dict["do_atomic_virial"] = True + result = trainer.model(**input_dict) + model = torch.jit.script(trainer.model) + tmp_model = tempfile.NamedTemporaryFile(delete=False, suffix=".pth") + torch.jit.save(model, tmp_model.name) + + # Test eval_desc + eval_desc( + model=tmp_model.name, + system=self.config["training"]["validation_data"]["systems"][0], + datafile=None, + output=self.output_dir, + ) + os.unlink(tmp_model.name) + + # Check that descriptor file was created + 
system_name = os.path.basename( + self.config["training"]["validation_data"]["systems"][0].rstrip("/") + ) + desc_file = os.path.join(self.output_dir, f"{system_name}.npy") + self.assertTrue(os.path.exists(desc_file)) + + # Load and validate descriptor + descriptors = np.load(desc_file) + self.assertIsInstance(descriptors, np.ndarray) + # Descriptors should be 3D: (nframes, natoms, ndesc) + self.assertEqual(len(descriptors.shape), 3) # Should be 3D array + self.assertGreater(descriptors.shape[0], 0) # Should have frames + self.assertGreater(descriptors.shape[1], 0) # Should have atoms + self.assertGreater(descriptors.shape[2], 0) # Should have descriptor dimensions + + def tearDown(self) -> None: + for f in os.listdir("."): + if f.startswith("model") and f.endswith(".pt"): + os.remove(f) + if f in ["lcurve.out", self.input_json]: + os.remove(f) + if f in ["stat_files"]: + shutil.rmtree(f) + # Clean up output directory + if hasattr(self, "output_dir") and os.path.exists(self.output_dir): + shutil.rmtree(self.output_dir) + + +class TestDPEvalDescSeA(DPEvalDesc, unittest.TestCase): + def setUp(self) -> None: + self.output_dir = "test_eval_desc_output" + input_json = str(Path(__file__).parent / "water" / "se_atten.json") + with open(input_json) as f: + self.config = json.load(f) + self.config["training"]["numb_steps"] = 1 + self.config["training"]["save_freq"] = 1 + data_file = [str(Path(__file__).parent / "water" / "data" / "single")] + self.config["training"]["training_data"]["systems"] = data_file + self.config["training"]["validation_data"]["systems"] = data_file + self.config["model"] = deepcopy(model_se_e2_a) + self.input_json = "test_eval_desc.json" + with open(self.input_json, "w") as fp: + json.dump(self.config, fp, indent=4) + + +if __name__ == "__main__": + unittest.main() diff --git a/source/tests/pt/test_padding_atoms.py b/source/tests/pt/test_padding_atoms.py new file mode 100644 index 0000000000..8fb417c6a4 --- /dev/null +++ 
b/source/tests/pt/test_padding_atoms.py @@ -0,0 +1,115 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import unittest +from copy import ( + deepcopy, +) + +import numpy as np + +from deepmd.dpmodel.common import ( + to_numpy_array, +) +from deepmd.pt.model.descriptor import ( + DescrptSeA, +) +from deepmd.pt.model.model import ( + PropertyModel, +) +from deepmd.pt.model.task import ( + PropertyFittingNet, +) +from deepmd.pt.utils.utils import ( + to_torch_tensor, +) + + +class TestCaseSingleFrameWithoutNlist: + def setUp(self) -> None: + # nf=2, nloc == 3 + self.nloc = 3 + self.nt = 2 + self.coord = np.array( + [ + [ + [0, 0, 0], + [0, 1, 0], + [0, 0, 1], + ], + [ + [1, 0, 1], + [0, 1, 1], + [1, 1, 0], + ], + ], + dtype=np.float64, + ) + self.atype = np.array([[0, 0, 1], [1, 1, 0]], dtype=int).reshape([2, self.nloc]) + self.cell = 2.0 * np.eye(3).reshape([1, 9]) + self.cell = np.array([self.cell, self.cell]).reshape(2, 9) + self.sel = [16, 8] + self.rcut = 2.2 + self.rcut_smth = 0.4 + self.atol = 1e-6 + self.rtol = 1e-5 + + +class TestPaddingAtoms(unittest.TestCase, TestCaseSingleFrameWithoutNlist): + def setUp(self): + TestCaseSingleFrameWithoutNlist.setUp(self) + + def test_padding_atoms_consistency(self): + ds = DescrptSeA( + self.rcut, + self.rcut_smth, + self.sel, + ) + ft = PropertyFittingNet( + self.nt, + ds.get_dim_out(), + mixed_types=ds.mixed_types(), + intensive=True, + property_name="abc", + ) + type_map = ["foo", "bar"] + model = PropertyModel(ds, ft, type_map=type_map) + var_name = model.get_var_name() + args = [to_torch_tensor(ii) for ii in [self.coord, self.atype, self.cell]] + result = model(*args) + # test intensive + np.testing.assert_allclose( + to_numpy_array(result[var_name].cpu().detach()), + np.mean(to_numpy_array(result[f"atom_{var_name}"].cpu().detach()), axis=1), + atol=self.atol, + rtol=self.rtol, + ) + # test padding atoms + padding_atoms_list = [1, 5, 10] + for padding_atoms in padding_atoms_list: + coord = deepcopy(self.coord) + 
atype = deepcopy(self.atype) + atype_padding = np.pad( + atype, + pad_width=((0, 0), (0, padding_atoms)), + mode="constant", + constant_values=-1, + ) + coord_padding = np.pad( + coord, + pad_width=((0, 0), (0, padding_atoms), (0, 0)), + mode="constant", + constant_values=0, + ) + args = [ + to_torch_tensor(ii) for ii in [coord_padding, atype_padding, self.cell] + ] + result_padding = model(*args) + np.testing.assert_allclose( + to_numpy_array(result[var_name].cpu().detach()), + to_numpy_array(result_padding[var_name].cpu().detach()), + atol=self.atol, + rtol=self.rtol, + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/source/tests/pt/test_sampler.py b/source/tests/pt/test_sampler.py index ffe459a834..3d7143b350 100644 --- a/source/tests/pt/test_sampler.py +++ b/source/tests/pt/test_sampler.py @@ -62,7 +62,6 @@ def test_sampler_debug_info(self) -> None: batch_size=None, num_workers=0, # setting to 0 diverges the behavior of its iterator; should be >=1 drop_last=False, - pin_memory=True, ) with torch.device("cpu"): batch_data = next(iter(dataloader)) diff --git a/source/tests/pt/test_training.py b/source/tests/pt/test_training.py index 3df95e4b14..da239212b0 100644 --- a/source/tests/pt/test_training.py +++ b/source/tests/pt/test_training.py @@ -15,6 +15,7 @@ from deepmd.pt.entrypoints.main import ( get_trainer, ) +from deepmd.pt.entrypoints.main import train as train_entry from deepmd.pt.utils.finetune import ( get_finetune_rules, ) @@ -30,6 +31,8 @@ class DPTrainTest: + test_zbl_from_standard: bool = False + def test_dp_train(self) -> None: # test training from scratch trainer = get_trainer(deepcopy(self.config)) @@ -95,6 +98,34 @@ def test_dp_train(self) -> None: state_dict_finetuned_random[state_key], ) + if self.test_zbl_from_standard: + # test fine-tuning using zbl from standard model + finetune_model = ( + self.config["training"].get("save_ckpt", "model.ckpt") + ".pt" + ) + self.config_zbl["model"], finetune_links = get_finetune_rules( + 
finetune_model, + self.config_zbl["model"], + ) + trainer_finetune_zbl = get_trainer( + deepcopy(self.config_zbl), + finetune_model=finetune_model, + finetune_links=finetune_links, + ) + state_dict_finetuned_zbl = trainer_finetune_zbl.wrapper.model.state_dict() + for state_key in state_dict_finetuned_zbl: + if "out_bias" not in state_key and "out_std" not in state_key: + original_key = state_key + if ".models.0." in state_key: + original_key = state_key.replace(".models.0.", ".") + if ".models.1." not in state_key: + torch.testing.assert_close( + state_dict_trained[original_key], + state_dict_finetuned_zbl[state_key], + ) + # check running + trainer_finetune_zbl.run() + # check running trainer_finetune.run() trainer_finetune_empty.run() @@ -150,8 +181,29 @@ def setUp(self) -> None: self.config["training"]["numb_steps"] = 1 self.config["training"]["save_freq"] = 1 + def test_yaml_input(self) -> None: + import yaml + + yaml_file = Path("input.yaml") + with open(yaml_file, "w") as fp: + yaml.safe_dump(self.config, fp) + train_entry( + input_file=str(yaml_file), + init_model=None, + restart=None, + finetune=None, + init_frz_model=None, + model_branch="main", + skip_neighbor_stat=True, + output="out.json", + ) + self.assertTrue(Path("out.json").exists()) + def tearDown(self) -> None: DPTrainTest.tearDown(self) + for ff in ["out.json", "input.yaml"]: + if Path(ff).exists(): + os.remove(ff) class TestDOSModelSeA(unittest.TestCase, DPTrainTest): @@ -222,6 +274,18 @@ def setUp(self) -> None: self.config["training"]["numb_steps"] = 1 self.config["training"]["save_freq"] = 1 + self.test_zbl_from_standard = True + + input_json_zbl = str(Path(__file__).parent / "water/zbl.json") + with open(input_json_zbl) as f: + self.config_zbl = json.load(f) + data_file = [str(Path(__file__).parent / "water/data/data_0")] + self.config_zbl["training"]["training_data"]["systems"] = data_file + self.config_zbl["training"]["validation_data"]["systems"] = data_file + self.config_zbl["model"] = 
deepcopy(model_zbl) + self.config_zbl["training"]["numb_steps"] = 1 + self.config_zbl["training"]["save_freq"] = 1 + def tearDown(self) -> None: DPTrainTest.tearDown(self) diff --git a/source/tests/tf/test_change_bias.py b/source/tests/tf/test_change_bias.py new file mode 100644 index 0000000000..4392bbd139 --- /dev/null +++ b/source/tests/tf/test_change_bias.py @@ -0,0 +1,233 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import json +import os +import shutil +import tempfile +import unittest +from pathlib import ( + Path, +) + +from deepmd.tf.entrypoints.change_bias import ( + change_bias, +) +from deepmd.tf.train.run_options import ( + RunOptions, +) +from deepmd.tf.train.trainer import ( + DPTrainer, +) +from deepmd.tf.utils.argcheck import ( + normalize, +) +from deepmd.tf.utils.compat import ( + update_deepmd_input, +) + +from .common import ( + j_loader, + run_dp, + tests_path, +) + + +class TestChangeBias(unittest.TestCase): + def setUp(self): + """Set up test fixtures.""" + self.temp_dir = tempfile.mkdtemp() + self.temp_path = Path(self.temp_dir) + + def tearDown(self): + """Clean up test fixtures.""" + shutil.rmtree(self.temp_dir, ignore_errors=True) + + def test_change_bias_frozen_model_partial_support(self): + """Test that frozen model support has limitations but provides helpful error.""" + fake_pb = self.temp_path / "model.pb" + fake_pb.write_text("fake model content") + + # Without bias_value, should suggest using bias_value or checkpoint + with self.assertRaises(NotImplementedError) as cm: + change_bias( + INPUT=str(fake_pb), + mode="change", + system=".", + ) + + self.assertIn( + "Data-based bias changing for frozen models is not yet implemented", + str(cm.exception), + ) + self.assertIn("bias-value option", str(cm.exception)) + + # With bias_value, should provide implementation guidance + with self.assertRaises(NotImplementedError) as cm: + change_bias( + INPUT=str(fake_pb), + mode="change", + bias_value=[1.0, 2.0], + system=".", + ) + + 
self.assertIn( + "Bias modification for frozen models (.pb) is not yet fully implemented", + str(cm.exception), + ) + self.assertIn("checkpoint_dir", str(cm.exception)) + + def test_change_bias_invalid_model_type(self): + """Test that invalid model types raise RuntimeError.""" + fake_model = self.temp_path / "model.xyz" + fake_model.write_text("fake model content") + + with self.assertRaises(RuntimeError) as cm: + change_bias( + INPUT=str(fake_model), + mode="change", + system=".", + ) + + self.assertIn( + "checkpoint file or frozen model file (.pb)", + str(cm.exception), + ) + + def test_change_bias_no_checkpoint_in_directory(self): + """Test that checkpoint files need proper checkpoint structure.""" + fake_ckpt = self.temp_path / "model.ckpt" + fake_ckpt.write_text("fake checkpoint content") + + # Create a fake data system for the test + fake_data_dir = self.temp_path / "fake_data" + fake_data_dir.mkdir() + fake_set_dir = fake_data_dir / "set.000" + fake_set_dir.mkdir() + + with self.assertRaises(RuntimeError) as cm: + change_bias( + INPUT=str(fake_ckpt), + mode="change", + system=str(fake_data_dir), + ) + + self.assertIn("No valid checkpoint found", str(cm.exception)) + + def test_change_bias_user_defined_requires_real_model(self): + """Test that user-defined bias requires a real model with proper structure.""" + fake_ckpt_dir = self.temp_path / "fake_checkpoint" + fake_ckpt_dir.mkdir() + fake_ckpt = fake_ckpt_dir / "model.ckpt" + fake_ckpt.write_text("fake checkpoint content") + (fake_ckpt_dir / "checkpoint").write_text("fake checkpoint") + # Create a minimal but complete input.json + minimal_config = { + "model": {"type_map": ["H", "O"]}, + "training": {"systems": ["."], "validation_data": {"systems": ["."]}}, + } + + (fake_ckpt_dir / "input.json").write_text(json.dumps(minimal_config)) + + # Should fail because there's no real model structure, but with different error + with self.assertRaises((RuntimeError, FileNotFoundError, Exception)) as cm: + change_bias( 
+ INPUT=str(fake_ckpt), + mode="change", + bias_value=[1.0, 2.0], + system=".", + ) + + # The error should be about model loading, not about NotImplementedError + self.assertNotIn("not yet implemented", str(cm.exception)) + + def test_change_bias_with_real_model(self): + """Test change_bias with a real trained model and verify output.""" + # Create temporary directories for training and output + train_dir = self.temp_path / "train" + train_dir.mkdir() + checkpoint_dir = train_dir / "checkpoint" + output_file = self.temp_path / "output_model.pb" + + # Use existing test data and configuration + data_dir = tests_path / "init_frz_model" / "data" + config_file = tests_path / "init_frz_model" / "input.json" + + # Load and modify configuration for quick training + jdata = j_loader(str(config_file)) + jdata["training"]["training_data"]["systems"] = [str(data_dir)] + jdata["training"]["validation_data"]["systems"] = [str(data_dir)] + jdata["training"]["numb_steps"] = 2 # Minimal training for testing + jdata["training"]["save_freq"] = 1 + jdata["training"]["save_ckpt"] = str(checkpoint_dir / "model.ckpt") + + # Write modified config + input_json_path = train_dir / "input.json" + with open(input_json_path, "w") as f: + json.dump(jdata, f, indent=4) + + # Train the model using run_dp + ret = run_dp(f"dp train {input_json_path}") + self.assertEqual(ret, 0, "DP train failed!") + + # Verify checkpoint was created + self.assertTrue(checkpoint_dir.exists()) + checkpoint_files = list(checkpoint_dir.glob("*")) + self.assertGreater(len(checkpoint_files), 0, "No checkpoint files created") + + # Find the actual checkpoint file + checkpoint_file = checkpoint_dir / "model.ckpt" + + # Create a frozen model from the checkpoint for testing + frozen_model_path = train_dir / "frozen_model.pb" + ret = run_dp(f"dp freeze -c {checkpoint_dir} -o {frozen_model_path}") + self.assertEqual(ret, 0, "DP freeze failed!") + self.assertTrue(frozen_model_path.exists()) + + # Test change_bias function - this 
should provide implementation guidance for frozen models + with self.assertRaises(NotImplementedError) as cm: + change_bias( + INPUT=str(frozen_model_path), + mode="change", + system=str(data_dir), + output=str(output_file), + ) + self.assertIn( + "Data-based bias changing for frozen models is not yet implemented", + str(cm.exception), + ) + + # Now test change_bias on the real checkpoint file (this is the real test) + change_bias( + INPUT=str(checkpoint_file), + mode="change", + system=str(data_dir), + output=str(output_file), + ) + + # Verify that output model file was created + self.assertTrue(output_file.exists()) + self.assertTrue(output_file.stat().st_size > 0, "Output model file is empty") + + # Load original model to verify structure + original_run_opt = RunOptions(init_model=str(checkpoint_dir), log_level=20) + + # Load the configuration again for creating trainers + jdata = update_deepmd_input(jdata, warning=True, dump="input_v2_compat.json") + jdata = normalize(jdata) + + original_trainer = DPTrainer(jdata, run_opt=original_run_opt) + + # Verify original model loads successfully + self.assertIsNotNone(original_trainer.model) + + # Verify the original model has the expected structure + original_type_map = original_trainer.model.get_type_map() + self.assertGreater(len(original_type_map), 0, "Model should have a type_map") + + # Clean up training artifacts + for artifact in ["lcurve.out", "input_v2_compat.json"]: + if os.path.exists(artifact): + os.remove(artifact) + + +if __name__ == "__main__": + unittest.main() diff --git a/source/tests/tf/test_out_bias_std.py b/source/tests/tf/test_out_bias_std.py new file mode 100644 index 0000000000..5d0ca45274 --- /dev/null +++ b/source/tests/tf/test_out_bias_std.py @@ -0,0 +1,176 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later + +import unittest + +import numpy as np + +from deepmd.tf.descriptor.se_a import ( + DescrptSeA, +) +from deepmd.tf.fit.dipole import ( + DipoleFittingSeA, +) +from deepmd.tf.fit.ener import 
( + EnerFitting, +) +from deepmd.tf.model.model import ( + StandardModel, +) + + +class TestOutBiasStd(unittest.TestCase): + """Test out_bias and out_std functionality in TensorFlow backend.""" + + def test_init_out_stat_basic(self): + """Test basic init_out_stat functionality.""" + descriptor = DescrptSeA( + rcut=4.0, rcut_smth=3.5, sel=[10, 20], neuron=[8, 16, 32] + ) + fitting = EnerFitting(ntypes=2, dim_descrpt=32) + model = StandardModel( + descriptor=descriptor, fitting_net=fitting, type_map=["H", "O"] + ) + + # Test initial state + self.assertIsNone(model.out_bias) + self.assertIsNone(model.out_std) + + # Test init_out_stat + model.init_out_stat() + self.assertIsNotNone(model.out_bias) + self.assertIsNotNone(model.out_std) + self.assertEqual(model.out_bias.shape, (1, 2, 1)) # 1 output, 2 types, 1 dim + self.assertEqual(model.out_std.shape, (1, 2, 1)) + + # Check default values + np.testing.assert_array_equal(model.out_bias, np.zeros((1, 2, 1))) + np.testing.assert_array_equal(model.out_std, np.ones((1, 2, 1))) + + def test_different_fitting_dimensions(self): + """Test that different fitting types have correct dimensions.""" + descriptor = DescrptSeA( + rcut=4.0, rcut_smth=3.5, sel=[10, 20], neuron=[8, 16, 32] + ) + + # Test energy fitting (dim_out = 1) + fitting_ener = EnerFitting(ntypes=2, dim_descrpt=32) + model_ener = StandardModel( + descriptor=descriptor, fitting_net=fitting_ener, type_map=["H", "O"] + ) + model_ener.init_out_stat(suffix="_ener") + self.assertEqual(model_ener.out_bias.shape, (1, 2, 1)) + + # Test dipole fitting (dim_out = 3) + fitting_dipole = DipoleFittingSeA(ntypes=2, dim_descrpt=32, embedding_width=32) + model_dipole = StandardModel( + descriptor=descriptor, fitting_net=fitting_dipole, type_map=["H", "O"] + ) + model_dipole.init_out_stat(suffix="_dipole") + self.assertEqual(model_dipole.out_bias.shape, (1, 2, 3)) + + def test_apply_out_stat(self): + """Test that out_bias and out_std are applied during model build.""" + from 
deepmd.tf.env import ( + tf, + ) + + descriptor = DescrptSeA( + rcut=4.0, rcut_smth=3.5, sel=[10, 20], neuron=[8, 16, 32] + ) + fitting = EnerFitting(ntypes=2, dim_descrpt=32) + model = StandardModel( + descriptor=descriptor, fitting_net=fitting, type_map=["H", "O"] + ) + + # Set test bias and std directly + test_bias = np.array([[[1.0], [2.0]]]) # bias for type 0: 1.0, type 1: 2.0 + test_std = np.array([[[0.5], [1.5]]]) # std for type 0: 0.5, type 1: 1.5 + model.out_bias = test_bias + model.out_std = test_std + + # Create mock input data for testing + nloc = 3 + + # Mock coordinates and atom types + coord = tf.placeholder(tf.float64, [None, nloc * 3]) + atype = tf.placeholder(tf.int32, [None, nloc]) + natoms = [ + nloc, + nloc, + 1, + 2, + ] # [local atoms, total atoms, type 0 count, type 1 count] + box = tf.placeholder(tf.float64, [None, 9]) + mesh = tf.placeholder(tf.int32, [None, 6]) + + # Build the model - this should apply bias/std internally + model.build(coord, atype, natoms, box, mesh, input_dict=None) + + # Check that the bias and std variables were created + self.assertTrue(hasattr(model, "t_out_bias")) + self.assertTrue(hasattr(model, "t_out_std")) + + # Test that out_bias and out_std are preserved + np.testing.assert_array_equal(model.out_bias, test_bias) + np.testing.assert_array_equal(model.out_std, test_std) + + def test_apply_out_stat_no_bias(self): + """Test that when no bias is explicitly set, default bias (zeros) is used.""" + descriptor = DescrptSeA( + rcut=4.0, rcut_smth=3.5, sel=[10, 20], neuron=[8, 16, 32] + ) + fitting = EnerFitting(ntypes=2, dim_descrpt=32) + model = StandardModel( + descriptor=descriptor, fitting_net=fitting, type_map=["H", "O"] + ) + + # Initialize the model which should set default bias=0, std=1 + model.init_out_stat() + + # Verify that default bias and std are set correctly + bias = model.out_bias + std = model.out_std + + # Default bias should be zeros + expected_bias = np.zeros([1, 2, 1]) # [1, ntypes, dim_out] + 
expected_std = np.ones([1, 2, 1]) # [1, ntypes, dim_out] + + np.testing.assert_array_equal(bias, expected_bias) + np.testing.assert_array_equal(std, expected_std) + + def test_decoupled_bias_architecture(self): + """Test that out_bias and bias_atom_e are completely decoupled.""" + # Test that setting out_bias does not affect bias_atom_e and vice versa + + descriptor = DescrptSeA( + rcut=4.0, rcut_smth=3.5, sel=[10, 20], neuron=[8, 16, 32] + ) + fitting = EnerFitting(ntypes=2, dim_descrpt=32) + model = StandardModel( + descriptor=descriptor, fitting_net=fitting, type_map=["H", "O"] + ) + + # Initialize with defaults + model.init_out_stat() + + # Set out_bias directly + test_out_bias = np.array([[[1.0], [2.0]]]) + model.out_bias = test_out_bias + + # Verify out_bias is set correctly + retrieved_bias = model.out_bias + np.testing.assert_array_equal(retrieved_bias, test_out_bias) + + # Verify that out_std can be set independently + test_out_std = np.array([[[0.5], [1.5]]]) + model.out_std = test_out_std + retrieved_std = model.out_std + np.testing.assert_array_equal(retrieved_std, test_out_std) + + # Verify shapes are correct for energy models + self.assertEqual(retrieved_bias.shape, (1, 2, 1)) # [1, ntypes, dim_out] + self.assertEqual(retrieved_std.shape, (1, 2, 1)) # [1, ntypes, dim_out] + + +if __name__ == "__main__": + unittest.main() diff --git a/source/tests/universal/dpmodel/fitting/test_fitting.py b/source/tests/universal/dpmodel/fitting/test_fitting.py index 90b0668d20..29c5fcd4da 100644 --- a/source/tests/universal/dpmodel/fitting/test_fitting.py +++ b/source/tests/universal/dpmodel/fitting/test_fitting.py @@ -52,6 +52,7 @@ def FittingParamEnergy( "numb_fparam": numb_param, "numb_aparam": numb_param, "dim_case_embd": numb_param, + "default_fparam": [1.0] * numb_param if numb_param > 0 else None, } return input_dict