diff --git a/.gitignore b/.gitignore
index 6382ecedd2..521f387697 100644
--- a/.gitignore
+++ b/.gitignore
@@ -50,6 +50,9 @@ uv.lock
 buildcxx/
 node_modules/
 *.bib.original
+.claude
+.spec-workflow
+.serena
 # Coverage files
 .coverage
@@ -71,3 +74,11 @@ frozen_model.*
 # Test system directories
 system/
+
+# clangd
+compile_commands.json
+source/.cache
+
+# PyTorch profiler
+*.tfevents.*
+*.pt.trace.json
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 6ec5c0e8a1..c813077783 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,7 +1,7 @@
 # See https://pre-commit.com for more information
 # See https://pre-commit.com/hooks.html for more hooks
 repos:
-  - repo: https://github.com/pre-commit/pre-commit-hooks
+  - repo: https://gh-proxy.com/github.com/pre-commit/pre-commit-hooks
     rev: v6.0.0
     hooks:
       - id: trailing-whitespace
@@ -21,13 +21,13 @@ repos:
       - id: check-symlinks
       - id: check-toml
 # Python
-  - repo: https://github.com/PyCQA/isort
+  - repo: https://gh-proxy.com/github.com/PyCQA/isort
     rev: 7.0.0
     hooks:
       - id: isort
         files: \.py$
         exclude: ^source/3rdparty
-  - repo: https://github.com/astral-sh/ruff-pre-commit
+  - repo: https://gh-proxy.com/github.com/astral-sh/ruff-pre-commit
     # Ruff version.
     rev: v0.14.11
     hooks:
@@ -38,7 +38,7 @@ repos:
       - id: ruff-format
         exclude: ^source/3rdparty
         types_or: [python, pyi, jupyter]
-  - repo: https://github.com/pycqa/flake8
+  - repo: https://gh-proxy.com/github.com/pycqa/flake8
     # flake8 cannot autofix
     rev: "7.3.0"
     hooks:
@@ -47,25 +47,25 @@ repos:
           - torchfix==0.6.0
           - flake8-pyproject==1.2.3
 # numpydoc
-  - repo: https://github.com/Carreau/velin
+  - repo: https://gh-proxy.com/github.com/Carreau/velin
     rev: 0.0.12
     hooks:
       - id: velin
         args: ["--write"]
         exclude: ^source/3rdparty
 # Python inside docs
-  - repo: https://github.com/asottile/blacken-docs
+  - repo: https://gh-proxy.com/github.com/asottile/blacken-docs
     rev: 1.20.0
     hooks:
       - id: blacken-docs
 # C++
-  - repo: https://github.com/pre-commit/mirrors-clang-format
+  - repo: https://gh-proxy.com/github.com/pre-commit/mirrors-clang-format
     rev: v21.1.8
     hooks:
       - id: clang-format
         exclude: ^(source/3rdparty|source/lib/src/gpu/cudart/.+\.inc|.+\.ipynb$|.+\.json$)
 # markdown, yaml, CSS, javascript
-  - repo: https://github.com/pre-commit/mirrors-prettier
+  - repo: https://gh-proxy.com/github.com/pre-commit/mirrors-prettier
     rev: v4.0.0-alpha.8
     hooks:
       - id: prettier
@@ -73,17 +73,17 @@ repos:
         # workflow files cannot be modified by pre-commit.ci
         exclude: ^(source/3rdparty|\.github/workflows|\.clang-format)
 # Shell
-  - repo: https://github.com/scop/pre-commit-shfmt
+  - repo: https://gh-proxy.com/github.com/scop/pre-commit-shfmt
     rev: v3.12.0-2
     hooks:
       - id: shfmt
 # CMake
-  - repo: https://github.com/cheshirekow/cmake-format-precommit
+  - repo: https://gh-proxy.com/github.com/cheshirekow/cmake-format-precommit
     rev: v0.6.13
     hooks:
       - id: cmake-format
       #- id: cmake-lint
-  - repo: https://github.com/njzjz/mirrors-bibtex-tidy
+  - repo: https://gh-proxy.com/github.com/njzjz/mirrors-bibtex-tidy
     rev: v1.14.0
     hooks:
       - id: bibtex-tidy
@@ -103,7 +103,7 @@ repos:
           - --remove-empty-fields
          - --wrap=80
 # license header
-  - repo: https://github.com/Lucas-C/pre-commit-hooks
+  - repo: https://gh-proxy.com/github.com/Lucas-C/pre-commit-hooks
     rev: v1.5.5
     hooks:
       # C++, js
@@ -153,7 +153,7 @@ repos:
       # unclear why PairDeepMD is used instead of PairDeePMD
       exclude: .pre-commit-config.yaml|source/lmp
 # customized pylint rules
-  - repo: https://github.com/pylint-dev/pylint/
+  - repo: https://gh-proxy.com/github.com/pylint-dev/pylint/
     rev: v4.0.4
     hooks:
       - id: pylint
diff --git a/AGENTS.md b/AGENTS.md
index c629a08def..9d268607a4 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -1,191 +1,415 @@
-# DeePMD-kit
+# CLAUDE.md
-DeePMD-kit is a deep learning package for many-body potential energy representation and molecular dynamics. It supports multiple backends (TensorFlow, PyTorch, JAX, Paddle) and integrates with MD packages like LAMMPS, GROMACS, and i-PI.
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
-**Always reference these instructions first and fallback to search or bash commands only when you encounter unexpected information that does not match the info here.**
+## Project Overview
-## Working Effectively
+DeePMD-kit is a deep learning-based molecular dynamics potential modeling package that supports four deep learning backends: TensorFlow, PyTorch, JAX, and Paddle, and integrates with multiple MD packages, including LAMMPS, i-PI, AMBER, CP2K, and GROMACS.
-### Bootstrap and Build Repository
+## Common Development Commands
-- Create virtual environment: `uv venv venv && source venv/bin/activate`
-- Install base dependencies: `uv pip install tensorflow-cpu` (takes ~8 seconds)
-- Install PyTorch: `uv pip install torch --index-url https://download.pytorch.org/whl/cpu` (takes ~5 seconds)
-- Build Python package: `uv pip install -e .[cpu,test]` -- takes 67 seconds. **NEVER CANCEL. Set timeout to 120+ seconds.**
-- Build C++ components: `export TENSORFLOW_ROOT=$(python -c 'import importlib.util,pathlib;print(pathlib.Path(importlib.util.find_spec("tensorflow").origin).parent)')` then `export PYTORCH_ROOT=$(python -c 'import torch;print(torch.__path__[0])')` then `./source/install/build_cc.sh` -- takes 164 seconds. **NEVER CANCEL. Set timeout to 300+ seconds.**
+Use this Python interpreter if needed: /home/outisli/miniforge3/envs/dpmd/bin/python
-### Test Repository
+### Code Check and Format
-- Run single test: `pytest source/tests/tf/test_dp_test.py::TestDPTestEner::test_1frame -v` -- takes 8-13 seconds
-- Run test subset: `pytest source/tests/tf/test_dp_test.py -v` -- takes 15 seconds. **NEVER CANCEL. Set timeout to 60+ seconds.**
-- **Recommended: Use single test cases for validation instead of full test suite** -- full suite has 314 test files and takes 60+ minutes
+```bash
+ruff check .  # Check code style
+ruff format .  # Format code
+isort .  # Sort imports
+```
+
+### Test Commands
+
+```bash
+# Verify installation
+dp --version
+python -c "import deepmd; import deepmd.tf; print('Interfaces working')"
+
+# VITAL: set OMP_NUM_THREADS, DP_INTER_OP_PARALLELISM_THREADS, and DP_INTRA_OP_PARALLELISM_THREADS to 0 before running tests
+
+# Single test (recommended for development)
+pytest source/tests/tf/test_dp_test.py::TestDPTestEner::test_1frame -v
+
+# Specific test suite
+pytest source/tests/tf/test_dp_test.py -v
+
+# Training test
+cd examples/water/se_e2_a
+dp train input.json --skip-neighbor-stat  # TensorFlow
+dp --pt train input_torch.json --skip-neighbor-stat  # PyTorch
+```
+
+### Model Compression (Reference: doc/outisli/compress.md)
+
+#### Compression Principle
+
+- **Tabulation**: Pre-compute and store embedding network outputs
+- **Piecewise Interpolation**: Use quintic Hermite interpolation for continuity
+- **Performance**: Significantly reduces memory usage and improves inference speed
-### Lint and Format Code
+#### Supported Descriptors
-- Install linter: `uv pip install ruff`
-- Run linting: `ruff check .` -- takes <1 second
-- Format code: `ruff format .` -- takes <1 second
-- **Always run `ruff check .` and `ruff format .` before committing changes or the CI will fail.**
+- ✅ SE_A, SE_R, SE_T, SE_Atten
+- ✅ DPA1, DPA2
+- ❌ DPA3 (compression not supported)
+
+## Code Architecture and Core Modules
+
+### 1. Deep Learning Model Layer (deepmd/dpmodel/)
-### Training and Validation
+This is the core model definition layer of DeePMD-kit, containing all mathematical abstractions of models:
-- Test TensorFlow training: `cd examples/water/se_e2_a && dp train input.json --skip-neighbor-stat` -- training proceeds but is slow on CPU
-- Test PyTorch training: `cd examples/water/se_e2_a && dp --pt train input_torch.json --skip-neighbor-stat` -- training proceeds but is slow on CPU
-- **Training examples are for validation only. Real training takes hours/days. Timeout training tests after 60 seconds for validation.**
+- **descriptor/**: Descriptor modules (embedding networks, environment information extraction)
+  - `se_a.py`: Embedded Atom Descriptor
+  - `se_r.py`: Simplified embedding descriptor
+  - `se_a_tpe.py`: Descriptor with type embedding
+  - `hybrid.py`: Hybrid descriptor
+- **fitting/**: Fitting network modules
+  - `ener.py`: Energy fitting network
+  - `dipole.py`: Dipole fitting
+  - `polar.py`: Polarizability fitting
+- **model/**: Model definitions
+  - `model.py`: Base model class
+  - `ener_model.py`: Energy model
+  - `dos_model.py`: Density of states model
-## Validation Scenarios
+### 2. Backend Implementation Layer
-**ALWAYS manually validate any new code through at least one complete scenario:**
+Each backend implements the same interface to ensure consistency:
-### Basic Functionality Validation
+#### TensorFlow Backend (deepmd/tf/)
-1. **CLI Interface**: Run `dp --version` and `dp -h` to verify installation
-2. **Python Interface**: Run `python -c "import deepmd; import deepmd.tf; print('Both interfaces work')"`
-3. **Backend Selection**: Test `dp --tf -h`, `dp --pt -h`, `dp --jax -h`, `dp --pd -h`
+- **entrypoints/**: Command line entry points
+  - `main.py`: Main CLI entry
+  - `train.py`: Training script
+  - `freeze.py`: Model freezing
+  - `test.py`: Model testing
+- **network/**: Network definitions
+  - `network.py`: Main network class
+  - `embedding_net.py`: Embedding network
+  - `fitting_net.py`: Fitting network
+- **model/**: Model implementations
+  - `model.py`: Model definition
+  - `model_stat.py`: Model statistics
+- **infer/**: Inference interface
+  - `deep_eval.py`: Deep evaluation
+  - `deep_pot.py`: Deep potential
-### Training Workflow Validation
+#### PyTorch Backend (deepmd/pt/)
-1. **TensorFlow Training**: `cd examples/water/se_e2_a && timeout 60 dp train input.json --skip-neighbor-stat` -- should start training and show decreasing loss
-2. **PyTorch Training**: `cd examples/water/se_e2_a && timeout 60 dp --pt train input_torch.json --skip-neighbor-stat` -- should start training and show decreasing loss
-3. **Verify training output**: Look for "batch X: trn: rmse" messages showing decreasing error values
+Similar structure to TensorFlow backend but with PyTorch-specific optimizations:
+
+- **model/**: PyTorch model implementations
+  - `model.py`: Base model class
+  - `nn.py`: Neural network modules
+- **utils/**: PyTorch utilities
+  - `env_mat.py`: Environment matrix construction
+  - `region.py`: Periodic boundary condition handling
+- **train/**: Training related
+  - `training.py`: Training loop
+  - `optimizer.py`: Optimizer configuration
-### Test-Based Validation
+### 3. C++ Core Engine (source/)
-1. **Core Tests**: `pytest source/tests/tf/test_dp_test.py::TestDPTestEner::test_1frame -v` -- should pass in ~10 seconds
-2. **Multi-backend**: Test both TensorFlow and PyTorch components work
+Core implementation for high-performance computing:
-## Common Commands and Timing
+#### Core Library (source/lib/)
+
+- **include/**: Header file definitions
+  - `deepmd.hpp`: Main API declarations
+  - `common.hpp`: Common definitions
+  - `neighbor_list.hpp`: Neighbor list algorithm
+- **src/**: Source code implementation
+  - `deepmd.cpp`: Core C++ implementation
+  - `region.cpp`: Region processing
+  - `neighbor_list.cpp`: High-performance neighbor list
+  - `prod_env_mat_a.cpp`: Environment matrix production
+
+#### Operator Implementation (source/op/)
+
+Framework-specific operators for each deep learning framework:
+
+- **tf/**: TensorFlow custom operators
+  - `prod_env_mat_a.cc`: Environment matrix operator
+  - `prod_force_se_a.cc`: Force calculation operator
+  - `tabulate.cc`: Lookup table operator
+- **torch/**: PyTorch C++ extensions
+  - `prod_env_mat_a.cpp`: PyTorch version of environment matrix operator
+
+### 4. Data Processing Layer (deepmd/utils/)
+
+- **data.py**: Data loading and preprocessing
+- `data_system.py`: Data system management
+- `shuffle.py`: Data shuffling
+- `neighbor_stat.py`: Neighbor statistics
+- `type_embed.py`: Type embedding
+- `args.py`: Argument parsing
+- `path.py`: Path handling
+- `compat.py`: Version compatibility handling
+
+### 5. Input/Output Layer (deepmd/infer/)
+
+- **deep_pot.py**: High-level inference interface
+- **deep_dipole.py**: Dipole inference
+- **deep_dos.py**: Density of states inference
+- **deep_wfc.py**: Wave function inference
+
+## Key Data Flow
+
+1. **Training Flow**:
+
+   ```
+   Atomic coordinates → neighbor_list → env_matrix → descriptor → fitting_net → loss
+   ```
+
+2. **Inference Flow**:
+
+   ```
+   Input structure → Descriptor calculation → Fitting network → Energy/Force/Stress
+   ```
+
+3. **Multi-backend Unified Interface**:
+   - Python layer provides unified API through `deepmd.infer`
+   - C++ layer provides unified interface through `source/api_cc/`
+   - Each backend implements the same model specification
+
+### Select Backend
+
+```bash
+# Command line flags
+dp --pt train input.json
+dp --tf train input.json
+
+# Environment variable
+export DP_BACKEND=pytorch
+dp train input.json
+```
-### Repository Structure
-```
-ls -la [repo-root]
-.github/ # GitHub workflows and templates
-CONTRIBUTING.md # Contributing guide
-README.md # Project overview
-deepmd/ # Python package source
-doc/ # Documentation
-examples/ # Training examples and configurations
-pyproject.toml # Python build configuration
-source/ # C++ source code and tests
-```
+## Core Algorithms and Data Structures
+
+### 1. Descriptor Implementation
+
+Descriptors are the core innovation of DeePMD-kit, used to convert local atomic environments into vector representations:
+
+#### Embedded Atom Descriptor (SE_A)
+
+- **Location**: `deepmd/dpmodel/descriptor/se_a.py`
+- **Core functions**:
+  - `build()`: Build descriptor network
+  - `call()`: Calculate descriptor values
+- **Mathematical principle**:
+  - Radial basis function expansion: $g(r) = \sum_{i} \exp[-\gamma (r-r_s)^2]$
+  - Angular basis function: Angular dependency through 1D filters
+
+#### Environment Matrix (Env Mat)
+
+- **C++ implementation**: `source/lib/src/prod_env_mat_a.cpp`
+- **Function**: Efficiently calculate environment matrix between atom pairs
+- **Optimization**: Use parallelization and SIMD instructions for acceleration
+
+### 2. Fitting Network
+
+Maps descriptors to physical quantities:
+
+#### Energy Fitting
+
+- **Location**: `deepmd/dpmodel/fitting/ener.py`
+- **Output**: Atomic energy, system total energy obtained by summation
+- **Force calculation**: Through automatic differentiation or analytical gradient
-### Key Directories and Files
-- `deepmd/` - Main Python package with backend implementations
-- `source/lib/` - Core C++ library
-- `source/op/` - Backend-specific operators (TF, PyTorch, etc.)
-- `source/api_cc/` - C++ API
-- `source/api_c/` - C API
-- `source/tests/` - Test suite (314 test files)
-- `examples/water/se_e2_a/` - Basic water training example
-- `examples/` - Various model examples for different scenarios
+
+#### Fitting Network Structure
+
+```python
+# Typical fitting network architecture
+FittingNet(
+    layers=[embedding_dim, 240, 240, 240, 1],  # Network layer sizes
+    activation_function="tanh",  # Activation function
+    precision="float64",  # Numerical precision
+)
+```
+
+### 3. Training Strategy
-### Common CLI Commands
+#### Loss Function
-- `dp --version` - Show version information
-- `dp -h` - Show help and available commands
-- `dp train input.json` - Train a model (TensorFlow backend)
-- `dp --pt train input.json` - Train with PyTorch backend
-- `dp --jax train input.json` - Train with JAX backend
-- `dp --pd train input.json` - Train with Paddle backend
-- `dp test -m model.pb -s system/` - Test a trained model
-- `dp freeze -o model.pb` - Freeze/save a model
+```python
+# Location: deepmd/loss.py or backend implementations
+Loss = lr_e * energy_loss + lr_f * force_loss + lr_v * virial_loss
+```
-### Build Dependencies and Setup
+#### Data Preprocessing
-- **Python 3.10+** required
-- **Virtual environment** strongly recommended: `uv venv venv && source venv/bin/activate`
-- **Backend dependencies**: TensorFlow, PyTorch, JAX, or Paddle (install before building)
-- **Build tools**: CMake, C++ compiler, scikit-build-core
-- **C++ build requires**: Both TensorFlow and PyTorch installed, set TENSORFLOW_ROOT and PYTORCH_ROOT environment variables
+- **Data shuffling**: `deepmd/utils/shuffle.py`
+- **Batching**: Auto-fill to ensure consistent batch size
+- **Data augmentation**: Increase data diversity through rotation and translation
-### Key Configuration Files
+### 4. Model Saving and Loading
-- `pyproject.toml` - Python build configuration and dependencies
-- `source/CMakeLists.txt` - C++ build configuration
-- `examples/water/se_e2_a/input.json` - Basic TensorFlow training config
-- `examples/water/se_e2_a/input_torch.json` - Basic PyTorch training config
+#### Checkpoint Formats
-## Frequent Patterns and Time Expectations
+- **TensorFlow**: .pb format (frozen graph)
+- **PyTorch**: .pth format
+- **Universal format**: .dp format (framework-agnostic)
-### Installation and Build Times
+#### Model Conversion
-- **Virtual environment setup**: ~5 seconds
-- **TensorFlow CPU install**: ~8 seconds
-- **PyTorch CPU install**: ~5 seconds
-- **Python package build**: ~67 seconds. **NEVER CANCEL.**
-- **C++ components build**: ~164 seconds. **NEVER CANCEL.**
-- **Full fresh setup**: ~3-4 minutes total
+```python
+# TensorFlow to PyTorch conversion
+from deepmd.pt import model as pt_model
+
+pt_model.load_tf_graph(tf_checkpoint_path)
+```
-### Testing Times
+## Common Development Patterns
-- **Single test**: 8-13 seconds
-- **Test file (~5 tests)**: ~15 seconds
-- **Backend-specific test subset**: 15-30 minutes. **Use sparingly.**
-- **Full test suite (314 files)**: 60+ minutes. **Avoid in development - use single tests instead.**
+### 1. Adding New Descriptors
+
+1. Create new descriptor class in `deepmd/dpmodel/descriptor/`
+2. Inherit from `BaseDescriptor` and implement necessary methods
+3. Add corresponding implementations in each backend (tf/pt/jax/pd)
+4. Add unit tests
-### Linting and Formatting
+### 2. Debugging Tips
-- **Ruff check**: <1 second
-- **Ruff format**: <1 second
-- **Pre-commit hooks**: May have network issues, use individual tools
+- Use small systems for quick testing
+- Check energy conservation and symmetry
+- Compare results consistency across different backends
+- Use `dp test --rand-init` to verify model
-### Commit Messages and PR Titles
+## Development Standards
-**All commit messages and PR titles must follow [conventional commit specification](https://www.conventionalcommits.org/):**
+### Naming Conventions
-- **Format**: `type(scope): description`
-- **Common types**: `feat`, `fix`, `docs`, `style`, `refactor`, `test`, `chore`, `ci`
-- **Examples**:
-  - `feat(core): add new descriptor type`
-  - `fix(tf): resolve memory leak in training`
-  - `docs: update installation guide`
-  - `ci: add workflow for testing`
+- Always use correct capitalization: DeePMD-kit, PyTorch, TensorFlow, NumPy, GitHub, LAMMPS
+
+### License Requirements
+
+All source files must include header license:
+`SPDX-License-Identifier: LGPL-3.0-or-later`
-### Training and Model Operations
+## Test Strategy
-- **Training initialization**: 10-30 seconds
-- **Training per batch**: 0.1-1 second (CPU), much faster on GPU
-- **Model freezing**: 5-15 seconds
-- **Model testing**: 10-30 seconds
+### Test Locations
+
+- **source/tests/**: C++ and Python tests
+- **tests/** directories in each submodule
+
+### Test Principles
+
+- During development, only run single or few related tests; full test suite takes 60+ minutes
+- Training tests use `--skip-neighbor-stat` to skip statistics for speed
+- Use `timeout` to limit training test time
-## Backend-Specific Notes
+## Configuration File Structure
-### TensorFlow Backend
+### Typical Training Configuration (input.json)
-- **Default backend** when no flag specified
-- **Configuration**: Use `input.json` format
-- **Training**: `dp train input.json`
-- **Requirements**: `tensorflow` or `tensorflow-cpu` package
+```json
+{
+  "model": {
+    "type_map": ["O", "H"],
+    "descriptor": {
+      "type": "se_a",
+      "sel": [46, 92],
+      "rcut_smth": 5.8,
+      "rcut": 6.0,
+      "neuron": [25, 50, 100],
+      "axis_neuron": 12
+    },
+    "fitting_net": {
+      "type": "ener",
+      "neuron": [240, 240, 240],
+      "resnet_dt": true
+    }
+  },
+  "learning_rate": {
+    "type": "exp",
+    "start_lr": 0.001,
+    "decay_steps": 5000
+  },
+  "loss": {
+    "start_pref_e": 0.02,
+    "start_pref_f": 1000,
+    "start_pref_v": 0.0
+  },
+  "training": {
+    "training_data": {
+      "systems": ["system1/", "system2/"],
+      "batch_size": 8
+    },
+    "numb_steps": 1000000
+  }
+}
+```
-### PyTorch Backend
+## Special Features
-- **Activation**: Use `--pt` flag or `export DP_BACKEND=pytorch`
-- **Configuration**: Use `input_torch.json` format typically
-- **Training**: `dp --pt train input_torch.json`
-- **Requirements**: `torch` package
+### 1. Type Embedding
+
+- Support unified training for multi-element systems
+- Location: `deepmd/utils/type_embed.py`
+- Dynamic type embedding can handle unseen element combinations
+
+### 2. Adaptive Selection (UpdateSel)
+
+- Automatically update neighbor list selection parameters
+- Avoid neighbor loss due to atomic migration
+- Location: `deepmd/utils/update_sel.py`
-### JAX Backend
+### 3. Multi-task Learning
-- **Activation**: Use `--jax` flag
-- **Training**: `dp --jax train input.json`
-- **Requirements**: `jax` and related packages
-- **Note**: Experimental backend, may have limitations
+- Simultaneously fit energy, force, stress, dipole, etc.
+- Loss function can configure weights for each task
+- Support physical constraints and regularization
-### Paddle Backend
+## Model Compression Details (Advanced)
-- **Activation**: Use `--pd` flag
-- **Training**: `dp --pd train input.json`
-- **Requirements**: `paddlepaddle` package
-- **Note**: Less commonly used
+### Compression Data Structure
+
+#### 1. Compression Information (compress_info)
+
+```python
+# Store 6 parameters for each embedding network [6]
+compress_info[embedding_idx] = torch.tensor(
+    [
+        lower[net],  # Lower bound
+        upper[net],  # Upper bound
+        upper[net] * extrapolate,  # Extrapolation upper bound
+        table_stride_1,  # First segment stride
+        table_stride_2,  # Second segment stride
+        check_frequency,  # Overflow check frequency
+    ]
+)
+```
-## Critical Warnings
+#### 2. Compression Data (compress_data)
-- **NEVER CANCEL BUILD OPERATIONS**: Python build takes 67 seconds, C++ build takes 164 seconds
-- **USE SINGLE TESTS FOR VALIDATION**: Run individual tests instead of full test suite for faster feedback
-- **ALWAYS activate virtual environment**: Build and runtime failures occur without proper environment
-- **ALWAYS install backend dependencies first**: TensorFlow/PyTorch required before building C++ components
-- **ALWAYS run linting before commits**: `ruff check . && ruff format .` or CI will fail
-- **ALWAYS test both Python and C++ components**: Some features require both to be built
-- **ALWAYS follow conventional commit format**: All commit messages and PR titles must use conventional commit specification (`type(scope): description`)
+```python
+# Store coefficient table for each embedding network [nspline, 6 * last_layer_size]
+compress_data[embedding_idx] = table_data[net]
+
+# Each 6 consecutive coefficients represent polynomial coefficients
+# [f(x), f'(x), f''(x)/2, c3, c4, c5] × last_layer_size
+```
+
+### Tabulation Implementation
+
+- **Table Builder**: `deepmd/pt/utils/tabulate.py` (PyTorch)
+- **Common Utilities**: `deepmd/utils/tabulate.py`
+- **Supported Activations**: tanh, gelu, relu, relu6, softplus, sigmoid
+
+### Polynomial Interpolation Formula
+
+In the interval [x_i, x_{i+1}], for variable x, the polynomial is:
+
+```
+f(x) = c₀ + c₁t + c₂t² + c₃t³ + c₄t⁴ + c₅t⁵
+```
+
+Where:
+
+- `t = (x - x_i) / h`, h is the step size
+- `c₀ = f(x_i)`
+- `c₁ = f'(x_i) × h`
+- `c₂ = f''(x_i) × h² / 2`
+- `c₃, c₄, c₅` determined by boundary continuity
diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 0000000000..9d268607a4
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,415 @@
+# CLAUDE.md
+
+This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
+
+## Project Overview
+
+DeePMD-kit is a deep learning-based molecular dynamics potential modeling package that supports four deep learning backends: TensorFlow, PyTorch, JAX, and Paddle, and integrates with multiple MD packages, including LAMMPS, i-PI, AMBER, CP2K, and GROMACS.
+
+## Common Development Commands
+
+Use this Python interpreter if needed: /home/outisli/miniforge3/envs/dpmd/bin/python
+
+### Code Check and Format
+
+```bash
+ruff check .  # Check code style
+ruff format .  # Format code
+isort .  # Sort imports
+```
+
+### Test Commands
+
+```bash
+# Verify installation
+dp --version
+python -c "import deepmd; import deepmd.tf; print('Interfaces working')"
+
+# VITAL: set OMP_NUM_THREADS, DP_INTER_OP_PARALLELISM_THREADS, and DP_INTRA_OP_PARALLELISM_THREADS to 0 before running tests
+
+# Single test (recommended for development)
+pytest source/tests/tf/test_dp_test.py::TestDPTestEner::test_1frame -v
+
+# Specific test suite
+pytest source/tests/tf/test_dp_test.py -v
+
+# Training test
+cd examples/water/se_e2_a
+dp train input.json --skip-neighbor-stat  # TensorFlow
+dp --pt train input_torch.json --skip-neighbor-stat  # PyTorch
+```
+
+### Model Compression (Reference: doc/outisli/compress.md)
+
+#### Compression Principle
+
+- **Tabulation**: Pre-compute and store embedding network outputs
+- **Piecewise Interpolation**: Use quintic Hermite interpolation for continuity
+- **Performance**: Significantly reduces memory usage and improves inference speed
+
+#### Supported Descriptors
+
+- ✅ SE_A, SE_R, SE_T, SE_Atten
+- ✅ DPA1, DPA2
+- ❌ DPA3 (compression not supported)
+
+## Code Architecture and Core Modules
+
+### 1. Deep Learning Model Layer (deepmd/dpmodel/)
+
+This is the core model definition layer of DeePMD-kit, containing all mathematical abstractions of models:
+
+- **descriptor/**: Descriptor modules (embedding networks, environment information extraction)
+  - `se_a.py`: Embedded Atom Descriptor
+  - `se_r.py`: Simplified embedding descriptor
+  - `se_a_tpe.py`: Descriptor with type embedding
+  - `hybrid.py`: Hybrid descriptor
+- **fitting/**: Fitting network modules
+  - `ener.py`: Energy fitting network
+  - `dipole.py`: Dipole fitting
+  - `polar.py`: Polarizability fitting
+- **model/**: Model definitions
+  - `model.py`: Base model class
+  - `ener_model.py`: Energy model
+  - `dos_model.py`: Density of states model
+
+### 2. Backend Implementation Layer
+
+Each backend implements the same interface to ensure consistency:
+
+#### TensorFlow Backend (deepmd/tf/)
+
+- **entrypoints/**: Command line entry points
+  - `main.py`: Main CLI entry
+  - `train.py`: Training script
+  - `freeze.py`: Model freezing
+  - `test.py`: Model testing
+- **network/**: Network definitions
+  - `network.py`: Main network class
+  - `embedding_net.py`: Embedding network
+  - `fitting_net.py`: Fitting network
+- **model/**: Model implementations
+  - `model.py`: Model definition
+  - `model_stat.py`: Model statistics
+- **infer/**: Inference interface
+  - `deep_eval.py`: Deep evaluation
+  - `deep_pot.py`: Deep potential
+
+#### PyTorch Backend (deepmd/pt/)
+
+Similar structure to TensorFlow backend but with PyTorch-specific optimizations:
+
+- **model/**: PyTorch model implementations
+  - `model.py`: Base model class
+  - `nn.py`: Neural network modules
+- **utils/**: PyTorch utilities
+  - `env_mat.py`: Environment matrix construction
+  - `region.py`: Periodic boundary condition handling
+- **train/**: Training related
+  - `training.py`: Training loop
+  - `optimizer.py`: Optimizer configuration
+
+### 3. C++ Core Engine (source/)
+
+Core implementation for high-performance computing:
+
+#### Core Library (source/lib/)
+
+- **include/**: Header file definitions
+  - `deepmd.hpp`: Main API declarations
+  - `common.hpp`: Common definitions
+  - `neighbor_list.hpp`: Neighbor list algorithm
+- **src/**: Source code implementation
+  - `deepmd.cpp`: Core C++ implementation
+  - `region.cpp`: Region processing
+  - `neighbor_list.cpp`: High-performance neighbor list
+  - `prod_env_mat_a.cpp`: Environment matrix production
+
+#### Operator Implementation (source/op/)
+
+Framework-specific operators for each deep learning framework:
+
+- **tf/**: TensorFlow custom operators
+  - `prod_env_mat_a.cc`: Environment matrix operator
+  - `prod_force_se_a.cc`: Force calculation operator
+  - `tabulate.cc`: Lookup table operator
+- **torch/**: PyTorch C++ extensions
+  - `prod_env_mat_a.cpp`: PyTorch version of environment matrix operator
+
+### 4. Data Processing Layer (deepmd/utils/)
+
+- **data.py**: Data loading and preprocessing
+- `data_system.py`: Data system management
+- `shuffle.py`: Data shuffling
+- `neighbor_stat.py`: Neighbor statistics
+- `type_embed.py`: Type embedding
+- `args.py`: Argument parsing
+- `path.py`: Path handling
+- `compat.py`: Version compatibility handling
+
+### 5. Input/Output Layer (deepmd/infer/)
+
+- **deep_pot.py**: High-level inference interface
+- **deep_dipole.py**: Dipole inference
+- **deep_dos.py**: Density of states inference
+- **deep_wfc.py**: Wave function inference
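+
+A minimal sketch of the high-level inference interface listed above; it mirrors `debug/inference_debug.py` in this repository, but the model filename and system sizes here are placeholders:
+
+```python
+import numpy as np
+
+from deepmd.infer import DeepPot
+
+dp = DeepPot("model.ckpt.pt")  # a trained model from any supported backend
+coord = np.random.rand(1, 192 * 3)  # (nframes, natoms * 3); random placeholder coordinates
+cell = (np.eye(3) * 12.0).reshape(1, 9)  # (nframes, 9) cell vectors
+atom_types = [0] * 64 + [1] * 128  # per-atom indices into the model's type_map
+e, f, v = dp.eval(coord, cell, atom_types)  # energy, forces, virial
+```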
+
+## Key Data Flow
+
+1. **Training Flow**:
+
+   ```
+   Atomic coordinates → neighbor_list → env_matrix → descriptor → fitting_net → loss
+   ```
+
+2. **Inference Flow**:
+
+   ```
+   Input structure → Descriptor calculation → Fitting network → Energy/Force/Stress
+   ```
+
+3. **Multi-backend Unified Interface**:
+   - Python layer provides unified API through `deepmd.infer`
+   - C++ layer provides unified interface through `source/api_cc/`
+   - Each backend implements the same model specification
+
+### Select Backend
+
+```bash
+# Command line flags
+dp --pt train input.json
+dp --tf train input.json
+
+# Environment variable
+export DP_BACKEND=pytorch
+dp train input.json
+```
+
+## Core Algorithms and Data Structures
+
+### 1. Descriptor Implementation
+
+Descriptors are the core innovation of DeePMD-kit, used to convert local atomic environments into vector representations:
+
+#### Embedded Atom Descriptor (SE_A)
+
+- **Location**: `deepmd/dpmodel/descriptor/se_a.py`
+- **Core functions**:
+  - `build()`: Build descriptor network
+  - `call()`: Calculate descriptor values
+- **Mathematical principle**:
+  - Radial basis function expansion: $g(r) = \sum_{i} \exp[-\gamma (r-r_s)^2]$
+  - Angular basis function: Angular dependency through 1D filters
+
+#### Environment Matrix (Env Mat)
+
+- **C++ implementation**: `source/lib/src/prod_env_mat_a.cpp`
+- **Function**: Efficiently calculate environment matrix between atom pairs
+- **Optimization**: Use parallelization and SIMD instructions for acceleration
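+
+A small NumPy illustration of the radial-basis expansion stated under SE_A above; `gamma` and the `r_s` grid are arbitrary values chosen for the sketch, not the model's actual parameters:
+
+```python
+import numpy as np
+
+r = np.linspace(0.5, 6.0, 100)  # pair distances within the cutoff
+r_s = np.linspace(0.5, 6.0, 16)  # basis centers
+gamma = 4.0
+g = np.exp(-gamma * (r[:, None] - r_s[None, :]) ** 2)  # (100, 16) basis values per distance
+```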
+
+### 2. Fitting Network
+
+Maps descriptors to physical quantities:
+
+#### Energy Fitting
+
+- **Location**: `deepmd/dpmodel/fitting/ener.py`
+- **Output**: Atomic energy, system total energy obtained by summation
+- **Force calculation**: Through automatic differentiation or analytical gradient
+
+#### Fitting Network Structure
+
+```python
+# Typical fitting network architecture
+FittingNet(
+    layers=[embedding_dim, 240, 240, 240, 1],  # Network layer sizes
+    activation_function="tanh",  # Activation function
+    precision="float64",  # Numerical precision
+)
+```
+
+### 3. Training Strategy
+
+#### Loss Function
+
+```python
+# Location: deepmd/loss.py or backend implementations
+Loss = lr_e * energy_loss + lr_f * force_loss + lr_v * virial_loss
+```
+
+#### Data Preprocessing
+
+- **Data shuffling**: `deepmd/utils/shuffle.py`
+- **Batching**: Auto-fill to ensure consistent batch size
+- **Data augmentation**: Increase data diversity through rotation and translation
+
+### 4. Model Saving and Loading
+
+#### Checkpoint Formats
+
+- **TensorFlow**: .pb format (frozen graph)
+- **PyTorch**: .pth format
+- **Universal format**: .dp format (framework-agnostic)
+
+#### Model Conversion
+
+```python
+# TensorFlow to PyTorch conversion
+from deepmd.pt import model as pt_model
+
+pt_model.load_tf_graph(tf_checkpoint_path)
+```
+
+## Common Development Patterns
+
+### 1. Adding New Descriptors
+
+1. Create new descriptor class in `deepmd/dpmodel/descriptor/`
+2. Inherit from `BaseDescriptor` and implement necessary methods
+3. Add corresponding implementations in each backend (tf/pt/jax/pd)
+4. Add unit tests
+
+### 2. Debugging Tips
+
+- Use small systems for quick testing
+- Check energy conservation and symmetry
+- Compare results consistency across different backends
+- Use `dp test --rand-init` to verify model
+
+## Development Standards
+
+### Naming Conventions
+
+- Always use correct capitalization: DeePMD-kit, PyTorch, TensorFlow, NumPy, GitHub, LAMMPS
+
+### License Requirements
+
+All source files must include header license:
+`SPDX-License-Identifier: LGPL-3.0-or-later`
+
+## Test Strategy
+
+### Test Locations
+
+- **source/tests/**: C++ and Python tests
+- **tests/** directories in each submodule
+
+### Test Principles
+
+- During development, only run single or few related tests; full test suite takes 60+ minutes
+- Training tests use `--skip-neighbor-stat` to skip statistics for speed
+- Use `timeout` to limit training test time
+
+## Configuration File Structure
+
+### Typical Training Configuration (input.json)
+
+```json
+{
+  "model": {
+    "type_map": ["O", "H"],
+    "descriptor": {
+      "type": "se_a",
+      "sel": [46, 92],
+      "rcut_smth": 5.8,
+      "rcut": 6.0,
+      "neuron": [25, 50, 100],
+      "axis_neuron": 12
+    },
+    "fitting_net": {
+      "type": "ener",
+      "neuron": [240, 240, 240],
+      "resnet_dt": true
+    }
+  },
+  "learning_rate": {
+    "type": "exp",
+    "start_lr": 0.001,
+    "decay_steps": 5000
+  },
+  "loss": {
+    "start_pref_e": 0.02,
+    "start_pref_f": 1000,
+    "start_pref_v": 0.0
+  },
+  "training": {
+    "training_data": {
+      "systems": ["system1/", "system2/"],
+      "batch_size": 8
+    },
+    "numb_steps": 1000000
+  }
+}
+```
+
+## Special Features
+
+### 1. Type Embedding
+
+- Support unified training for multi-element systems
+- Location: `deepmd/utils/type_embed.py`
+- Dynamic type embedding can handle unseen element combinations
+
+### 2. Adaptive Selection (UpdateSel)
+
+- Automatically update neighbor list selection parameters
+- Avoid neighbor loss due to atomic migration
+- Location: `deepmd/utils/update_sel.py`
+
+### 3. Multi-task Learning
+
+- Simultaneously fit energy, force, stress, dipole, etc.
+- Loss function can configure weights for each task
+- Support physical constraints and regularization
+
+## Model Compression Details (Advanced)
+
+### Compression Data Structure
+
+#### 1. Compression Information (compress_info)
+
+```python
+# Store 6 parameters for each embedding network [6]
+compress_info[embedding_idx] = torch.tensor(
+    [
+        lower[net],  # Lower bound
+        upper[net],  # Upper bound
+        upper[net] * extrapolate,  # Extrapolation upper bound
+        table_stride_1,  # First segment stride
+        table_stride_2,  # Second segment stride
+        check_frequency,  # Overflow check frequency
+    ]
+)
+```
+
+#### 2. Compression Data (compress_data)
+
+```python
+# Store coefficient table for each embedding network [nspline, 6 * last_layer_size]
+compress_data[embedding_idx] = table_data[net]
+
+# Each 6 consecutive coefficients represent polynomial coefficients
+# [f(x), f'(x), f''(x)/2, c3, c4, c5] × last_layer_size
+```
+
+### Tabulation Implementation
+
+- **Table Builder**: `deepmd/pt/utils/tabulate.py` (PyTorch)
+- **Common Utilities**: `deepmd/utils/tabulate.py`
+- **Supported Activations**: tanh, gelu, relu, relu6, softplus, sigmoid
+
+### Polynomial Interpolation Formula
+
+In the interval [x_i, x_{i+1}], for variable x, the polynomial is:
+
+```
+f(x) = c₀ + c₁t + c₂t² + c₃t³ + c₄t⁴ + c₅t⁵
+```
+
+Where:
+
+- `t = (x - x_i) / h`, h is the step size
+- `c₀ = f(x_i)`
+- `c₁ = f'(x_i) × h`
+- `c₂ = f''(x_i) × h² / 2`
+- `c₃, c₄, c₅` determined by boundary continuity
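+
+A sketch of evaluating one tabulated spline segment from the six stored coefficients, following the layout described above (the names and numeric values are illustrative, not the library's API):
+
+```python
+import numpy as np
+
+
+def eval_segment(coeffs: np.ndarray, x: float, x_i: float, h: float) -> float:
+    """Evaluate f(x) on [x_i, x_i + h] from coefficients [c0, ..., c5]."""
+    t = (x - x_i) / h  # normalized position inside the segment
+    return sum(c * t**k for k, c in enumerate(coeffs))
+
+
+coeffs = np.array([1.0, 0.5, -0.1, 0.02, 0.0, 0.0])  # toy coefficient values
+print(eval_segment(coeffs, x=1.25, x_i=1.0, h=0.5))
+```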
diff --git a/README.md b/README.md
index 0444469779..94ee6c788a 100644
--- a/README.md
+++ b/README.md
@@ -108,3 +108,5 @@ See [DeePMD-kit Contributing Guide](CONTRIBUTING.md) to become a contributor!
 [2]: https://journals.aps.org/prl/abstract/10.1103/PhysRevLett.120.143001
 [3]: https://arxiv.org/abs/1805.09003
 [4]: https://aip.scitation.org/doi/full/10.1063/1.5027645
+
+Use this command to generate the JSON schema: `python -c "from deepmd.utils.argcheck import gen_json_schema; import json; json.dump(json.loads(gen_json_schema(multi_task=True)), open('/home/outisli/Research/dpmd/deepmd_json_schema.json', 'w'), indent=2)"`
diff --git a/debug/compress_debug.py b/debug/compress_debug.py
new file mode 100644
index 0000000000..419a1d1e7e
--- /dev/null
+++ b/debug/compress_debug.py
@@ -0,0 +1,113 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Debug script for model compression.
+
+Equivalent to: dp --pt compress -i no.pth -o yes.pth -t input_torch.json
+
+This script can be run directly in VSCode with debugging capabilities.
+"""
+
+import logging
+import os
+import sys
+import time
+from pathlib import (
+    Path,
+)
+
+# Add the deepmd-kit root to Python path
+deepmd_root = Path(__file__).parent.parent
+sys.path.insert(0, str(deepmd_root))
+
+
+def compress_model() -> float:
+    """Compress the model using the same parameters as the CLI command.
+
+    dp --pt compress -i no.pth -o yes.pth -t input_torch.json
+
+    Returns
+    -------
+    float
+        Elapsed time for the compression in seconds.
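+
+    Examples
+    --------
+    Run from the repository root (all paths are resolved inside the function):
+
+    >>> elapsed = compress_model()  # doctest: +SKIP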
+ """ + # Import here to avoid module-level import restriction + from deepmd.pt.entrypoints.compress import ( + enable_compression, + ) + + # Setup logging with timestamp + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) + log = logging.getLogger(__name__) + + # Set working directory to examples/water/se_e3_tebd + work_dir = deepmd_root / "examples" / "water" / "se_e3_tebd" + original_cwd = os.getcwd() + + try: + os.chdir(work_dir) + log.debug(f"Changed to working directory: {work_dir}") + + # Model compression parameters + input_file = "no.pth" + output_file = "yes.pth" + training_script = "input_torch.json" + stride = 0.01 # default value + extrapolate = 5 # default value + check_frequency = -1 # default value (disabled) + + # Check if input files exist + if not os.path.exists(input_file): + raise FileNotFoundError( + f"Input model file '{input_file}' not found in {work_dir}" + ) + + if not os.path.exists(training_script): + raise FileNotFoundError( + f"Training script '{training_script}' not found in {work_dir}" + ) + + log.debug(f"Input model: {input_file}") + log.debug(f"Output model: {output_file}") + log.debug(f"Training script: {training_script}") + log.debug(f"Stride: {stride}") + log.debug(f"Extrapolate: {extrapolate}") + log.debug(f"Check frequency: {check_frequency}") + + log.debug("Starting model compression...") + + # Record time usage + start_time = time.monotonic() + + # Call the compression function + enable_compression( + input_file=input_file, + output=output_file, + stride=stride, + extrapolate=extrapolate, + check_frequency=check_frequency, + training_script=training_script, + ) + + elapsed_time = time.monotonic() - start_time + + # Print results (keep these as info level - these are the main results) + log.info("Model compression completed successfully!") + log.info(f"Compressed model saved to: {output_file}") + log.info(f"Elapsed time: {elapsed_time:.2f} seconds") + + return elapsed_time + + except Exception as e: + log.error(f"Error during compression: {e}") + raise + finally: + # Restore original working directory + os.chdir(original_cwd) + + +if __name__ == "__main__": + compress_model() diff --git a/debug/dptest_debug.py b/debug/dptest_debug.py new file mode 100644 index 0000000000..ec7e5dcea8 --- /dev/null +++ b/debug/dptest_debug.py @@ -0,0 +1,147 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: LGPL-3.0-or-later +"""Debug script for model inference/testing. + +Equivalent to: dp --pt test -m model.ckpt.pt -s data -n 100 -f test_debug.txt + +This script can be run directly in VSCode with debugging capabilities. +""" + +import logging +import os +import sys +import time +from pathlib import ( + Path, +) + +import numpy as np + +# Add the deepmd-kit root to Python path +deepmd_root = Path(__file__).parent.parent +sys.path.insert(0, str(deepmd_root)) + + +def test_model() -> float: + """Test the model using the same parameters as the CLI command. + + dp --pt test -m model.ckpt.pt -s . -n 100 -f test_debug.txt + + Returns + ------- + float + Elapsed time for the testing in seconds. 
+ """ + # Import here to avoid module-level import restriction + from deepmd.entrypoints.test import ( + test, + ) + + # Setup logging with timestamp + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) + log = logging.getLogger(__name__) + + # Set working directory to examples/water/se_e3_tebd + work_dir = deepmd_root / "examples" / "water" / "se_e3_tebd" + original_cwd = os.getcwd() + + try: + os.chdir(work_dir) + log.debug(f"Changed to working directory: {work_dir}") + + # Test parameters + model_file = "no.pth" # Model file to test + system_dir = "../data/data_3" # Directory contains test data + datafile = None # Not using a datafile list + train_json = None # Not using training data for testing + valid_json = None # Not using validation data for testing + numb_test = 100 # Number of test frames (0 means all) + rand_seed = None # No random seed + shuffle_test = False # Don't shuffle test data + detail_file = "test_debug.txt" # Output file for test details + atomic = False # Don't compute per-atom quantities + head = None # No specific task head for multi-task models + + # Check if model file exists + if not os.path.exists(model_file): + raise FileNotFoundError( + f"Model file '{model_file}' not found in {work_dir}" + ) + + # Set environment variable to limit batch size for testing + os.environ["DP_INFER_BATCH_SIZE"] = "1024" + + log.debug(f"Model: {model_file}") + log.debug(f"System directory: {system_dir}") + log.debug(f"Number of test frames: {numb_test}") + log.debug(f"Detail file: {detail_file}") + log.debug(f"Atomic output: {atomic}") + + log.debug("Starting model testing...") + + # Record time usage + start_time = time.monotonic() + # Call the test function + test( + model=model_file, + system=system_dir, + datafile=datafile, + train_json=train_json, + valid_json=valid_json, + numb_test=numb_test, + rand_seed=rand_seed, + shuffle_test=shuffle_test, + detail_file=detail_file, + atomic=atomic, + head=head, + ) + end_time = time.monotonic() + elapsed_time = end_time - start_time + + # Print results (keep these as info level - these are the main results) + log.info("Model testing completed successfully!") + log.info(f"Test results saved to: {detail_file}") + log.info(f"Elapsed time: {elapsed_time:.2f} seconds") + + return elapsed_time + + except Exception as e: + log.error(f"Error during testing: {e}") + raise + finally: + # Restore original working directory + os.chdir(original_cwd) + + +if __name__ == "__main__": + # Run testing 10 times and calculate average timing + num_runs = 10 + times = [] + + print(f"Running model testing {num_runs} times...") # noqa: T201 + print("=" * 50) # noqa: T201 + + for i in range(num_runs): + print(f"\nRun {i + 1}/{num_runs}") # noqa: T201 + print("-" * 20) # noqa: T201 + elapsed_time = test_model() + times.append(elapsed_time) + + # Calculate and display statistics + print("\n" + "=" * 50) # noqa: T201 + print("Timing Summary:") # noqa: T201 + print("=" * 50) # noqa: T201 + + avg_time = sum(times) / len(times) + min_time = min(times) + max_time = max(times) + + print(f"Average time: {avg_time:.2f} seconds") # noqa: T201 + print(f"Min time: {min_time:.2f} seconds") # noqa: T201 + print(f"Max time: {max_time:.2f} seconds") # noqa: T201 + print(f"Std deviation: {np.std(times):.2f} seconds") # noqa: T201 + print(f"All times: {[f'{t:.2f}' for t in times]}") # noqa: T201 diff --git a/debug/inference_debug.py b/debug/inference_debug.py new file mode 100644 index 
diff --git a/debug/inference_debug.py b/debug/inference_debug.py
new file mode 100644
index 0000000000..3593e5e655
--- /dev/null
+++ b/debug/inference_debug.py
@@ -0,0 +1,317 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: LGPL-3.0-or-later
+"""Inference performance profiling script with TensorBoard visualization.
+
+This script focuses on identifying performance hotspots in DeePMD-kit inference
+by breaking down the computation into detailed components and visualizing results.
+"""
+
+import logging
+import os
+import sys
+import time
+from pathlib import (
+    Path,
+)
+from typing import (
+    Any,
+)
+
+import numpy as np
+import torch  # noqa: TID253
+from torch.profiler import record_function  # noqa: TID253
+from torch.utils.tensorboard import SummaryWriter  # noqa: TID253
+
+# Add the deepmd-kit root to Python path
+deepmd_root = Path(__file__).parent.parent
+sys.path.insert(0, str(deepmd_root))
+
+
+def load_single_configuration(data_dir: str, frame_idx: int = 0) -> dict[str, Any]:
+    """Load a single configuration from the dataset.
+
+    Parameters
+    ----------
+    data_dir : str
+        Path to the data directory containing set.000/
+    frame_idx : int, optional
+        Index of the frame to load (default: 0)
+
+    Returns
+    -------
+    dict
+        Dictionary containing coord, box, atom_types, and optional energy/force
+    """
+    set_dir = Path(data_dir) / "set.000"
+
+    # Load data
+    coord = np.load(set_dir / "coord.npy")[frame_idx : frame_idx + 1]  # Keep batch dim
+    box = np.load(set_dir / "box.npy")[frame_idx : frame_idx + 1]  # Keep batch dim
+
+    # Load atom types
+    type_map_file = Path(data_dir) / "type_map.raw"
+    type_file = Path(data_dir) / "type.raw"
+
+    if type_map_file.exists():
+        with open(type_map_file) as f:
+            type_map = [line.strip() for line in f]
+    else:
+        type_map = None
+
+    if type_file.exists():
+        with open(type_file) as f:
+            atom_types = [int(line.strip()) for line in f]
+    else:
+        raise FileNotFoundError(f"Atom type file not found: {type_file}")
+
+    # Optionally load reference data
+    data = {
+        "coord": coord,
+        "box": box,
+        "atom_types": np.array(atom_types),
+        "type_map": type_map,
+    }
+
+    # Load energy and force if available (for comparison)
+    energy_file = set_dir / "energy.npy"
+    force_file = set_dir / "force.npy"
+
+    if energy_file.exists():
+        data["energy"] = np.load(energy_file)[frame_idx : frame_idx + 1]
+    if force_file.exists():
+        data["force"] = np.load(force_file)[frame_idx : frame_idx + 1]
+
+    return data
+
+
+def inference_single_config(
+    model_file: str,
+    enable_profiling: bool = False,
+) -> float:
+    """Perform inference on a single configuration with comprehensive TensorBoard logging.
+
+    Parameters
+    ----------
+    model_file : str
+        Path to the model checkpoint file.
+    enable_profiling : bool, optional
+        Whether to enable PyTorch profiling, by default False
+
+    Returns
+    -------
+    float
+        Elapsed time for the inference in seconds.
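+
+    Examples
+    --------
+    >>> t = inference_single_config("no.pth")  # doctest: +SKIP
+    >>> t = inference_single_config("no.pth", enable_profiling=True)  # doctest: +SKIP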
+ """ + # Import DeepPot for simplified inference + from deepmd.infer import ( + DeepPot, + ) + + # Setup logging with timestamp + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) + log = logging.getLogger(__name__) + + # Setting working directory + work_dir = deepmd_root / "examples" / "water" / "se_e3_tebd" + original_cwd = os.getcwd() + + try: + os.chdir(work_dir) + log.debug(f"Changed to working directory: {work_dir}") + + log_dir = "./profile_logs" + os.makedirs(log_dir, exist_ok=True) + writer = SummaryWriter(log_dir) + + # Test parameters + data_dir = "../data/data_3" # Directory contains test data + frame_idx = 0 # Use first frame + + # Check if model file exists + if not os.path.exists(model_file): + raise FileNotFoundError( + f"Model file '{model_file}' not found in {work_dir}" + ) + + log.debug(f"Loading model: {model_file}") + + # Initialize model using DeepPot interface (outside profiling for cleaner results) + dp = DeepPot(model_file, auto_batch_size=1024) + + log.debug(f"Loading single configuration from: {data_dir}") + + # Load single configuration (outside profiling) + data = load_single_configuration(data_dir, frame_idx) + coord = data["coord"] + box = data["box"] + atom_types = data["atom_types"] + + log.debug("Configuration info:") + log.debug(f" Number of atoms: {len(atom_types)}") + log.debug(f" Coordinate shape: {coord.shape}") + log.debug(f" Box shape: {box.shape}") + log.debug(f" Atom types shape: {atom_types.shape}") + log.debug(f" Unique atom types: {np.unique(atom_types)}") + + if data.get("type_map"): + log.debug(f" Type map: {data['type_map']}") + + log.debug("Starting single configuration inference...") + + # Use profiler if enabled + if enable_profiling: + log.info("PyTorch profiling enabled...") + + with torch.profiler.profile( + schedule=torch.profiler.schedule(wait=3, warmup=3, active=3, repeat=1), + on_trace_ready=torch.profiler.tensorboard_trace_handler( + writer.get_logdir() + ), + record_shapes=True, + profile_memory=True, + with_stack=True, + with_flops=True, + with_modules=True, + ) as prof: + # Warmup and active phases for profiling + for phase in range(9): # 3 wait + 3 warmup + 3 active + if phase == 6: # Start active profiling + log.debug("Starting profiling phase...") + + # Record time usage + start_time = time.monotonic() + + # 3: Use record_function to label the core inference step + with record_function("Inference (DeepPot.eval)"): + # Perform inference using DeepPot.eval() + e, f, v = dp.eval(coord, box, atom_types) + + elapsed_time = time.monotonic() - start_time + + if phase == 6: # End active profiling + log.debug("Ending profiling phase...") + + # Mark profiler step + prof.step() + + # Save profiling summaries to a log file instead of showing on screen + profiling_output_path = "profile_summary.log" + with open(profiling_output_path, "w") as pf: + pf.write("=== PyTorch Profiling Summary ===\n") + pf.write("Top 10 CPU operations by total time:\n") + cpu_summary = prof.key_averages().table( + sort_by="cpu_time_total", row_limit=10 + ) + pf.write(f"{cpu_summary}\n\n") + + pf.write("Top 10 CUDA operations by total time:\n") + cuda_summary = prof.key_averages().table( + sort_by="cuda_time_total", row_limit=10 + ) + pf.write(f"{cuda_summary}\n\n") + + pf.write("Top 10 memory allocations:\n") + memory_summary = prof.key_averages().table( + sort_by="cpu_memory_usage", row_limit=10 + ) + pf.write(f"{memory_summary}\n") + + log.info("Profile logs saved to 
./profile_logs/") + log.info( + "To view detailed results, run: tensorboard --logdir=./profile_logs" + ) + writer.close() + else: + # Regular inference without profiling + # Record time usage + start_time = time.monotonic() + + # Perform inference using DeepPot.eval() + e, f, v = dp.eval(coord, box, atom_types) + + elapsed_time = time.monotonic() - start_time + + # Print results (keep these as info level - these are the main results) + log.info("\n=== Inference Results ===") + predicted_energy = e.reshape(-1)[0] + log.info(f"Predicted energy: {predicted_energy:.6f}") + + if "energy" in data: + reference_energy = data["energy"][0] + energy_diff = abs(predicted_energy - reference_energy) + log.info(f"Reference energy: {reference_energy:.6f}") + log.info(f"Energy difference: {energy_diff:.6f}") + + predicted_force = f + log.info(f"Force norm: {np.linalg.norm(predicted_force):.6f}") + + if "force" in data: + reference_force = data["force"].reshape(predicted_force.shape) + force_diff = np.linalg.norm(predicted_force - reference_force) + log.info(f"Reference force norm: {np.linalg.norm(reference_force):.6f}") + log.info(f"Force RMSE: {force_diff / np.sqrt(predicted_force.size):.6f}") + + predicted_virial = v.reshape(-1) + log.info(f"Predicted virial: {predicted_virial}") + + log.info("Inference completed successfully!") + log.info(f"Elapsed time: {elapsed_time:.6f} seconds") + + return elapsed_time + + except Exception as e: + log.error(f"Error during inference: {e}") + raise + finally: + # Restore original working directory + os.chdir(original_cwd) + + +if __name__ == "__main__": + # Set this to True to enable PyTorch profiling + ENABLE_PROFILING = True + + # Run inference and calculate average timing + # If profiling is enabled, force single run + num_runs = 1 if ENABLE_PROFILING else 10 + times = [] + + model_name = "no" + + print(f"Running inference {num_runs} times...") # noqa: T201 + if ENABLE_PROFILING: + print("PyTorch profiling ENABLED (single run forced)") # noqa: T201 + print("=" * 50) # noqa: T201 + + for i in range(num_runs): + print(f"\nRun {i + 1}/{num_runs}") # noqa: T201 + print("-" * 20) # noqa: T201 + + # Enable profiling if requested (will only run once anyway) + elapsed_time = inference_single_config( + model_file=f"{model_name}.pth", enable_profiling=ENABLE_PROFILING + ) + times.append(elapsed_time) + + # Calculate and display statistics + print("\n" + "=" * 50) # noqa: T201 + print("Timing Summary:") # noqa: T201 + print("=" * 50) # noqa: T201 + + # Drop the first run to avoid cold start bias + if len(times) > 1: + times = times[1:] + + avg_time = sum(times) / len(times) + min_time = min(times) + max_time = max(times) + + print(f"Average time: {avg_time:.6f} seconds") # noqa: T201 + print(f"Min time: {min_time:.6f} seconds") # noqa: T201 + print(f"Max time: {max_time:.6f} seconds") # noqa: T201 + print(f"Std deviation: {np.std(times):.6f} seconds") # noqa: T201 + print(f"All times: {[f'{t:.6f}' for t in times]}") # noqa: T201 diff --git a/debug/train_debug.py b/debug/train_debug.py new file mode 100644 index 0000000000..5c47a762dc --- /dev/null +++ b/debug/train_debug.py @@ -0,0 +1,116 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: LGPL-3.0-or-later +"""Debug script for model training. + +Equivalent to: dp --pt train input_torch.json + +This script can be run directly in VSCode with debugging capabilities. 
+""" + +import logging +import os +import sys +import time +from pathlib import ( + Path, +) + +# Add the deepmd-kit root to Python path +deepmd_root = Path(__file__).parent.parent +sys.path.insert(0, str(deepmd_root)) + + +def train_model() -> float: + """Train the model using the same parameters as the CLI command. + + dp --pt train input_torch.json + + Returns + ------- + float + Elapsed time for the training in seconds. + """ + # Import here to avoid module-level import restriction + from deepmd.pt.entrypoints.main import ( + train, + ) + + # Setup logging with timestamp + logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) + log = logging.getLogger(__name__) + + # Set working directory to examples/water/se_e3_tebd + work_dir = deepmd_root / "examples" / "water" / "se_e3_tebd" + original_cwd = os.getcwd() + + try: + os.chdir(work_dir) + log.debug(f"Changed to working directory: {work_dir}") + + # Training parameters + input_file = "input_torch.json" + init_model = None # Start training from scratch + restart = None # No restart + finetune = None # No finetuning + init_frz_model = None # No frozen model initialization + model_branch = "default" + skip_neighbor_stat = True # Calculate neighbor statistics + use_pretrain_script = False # Don't use pretrain script + force_load = False # Don't force load incompatible models + compile_model = False # Don't compile model (JIT will be used automatically) + output = "out.json" # Output configuration file + + # Check if input file exists + if not os.path.exists(input_file): + raise FileNotFoundError( + f"Training input file '{input_file}' not found in {work_dir}" + ) + + log.debug(f"Input file: {input_file}") + log.debug(f"Output config: {output}") + log.debug(f"Skip neighbor stat: {skip_neighbor_stat}") + log.debug(f"Compile model: {compile_model}") + + log.debug("Starting model training...") + + # Record time usage + start_time = time.monotonic() + + # Call the training function + train( + input_file=input_file, + init_model=init_model, + restart=restart, + finetune=finetune, + init_frz_model=init_frz_model, + model_branch=model_branch, + skip_neighbor_stat=skip_neighbor_stat, + use_pretrain_script=use_pretrain_script, + force_load=force_load, + compile_model=compile_model, + output=output, + ) + + elapsed_time = time.monotonic() - start_time + + # Print results (keep these as info level - these are the main results) + log.info("Model training completed successfully!") + log.info(f"Output configuration saved to: {output}") + log.info(f"Elapsed time: {elapsed_time:.2f} seconds") + + return elapsed_time + + except Exception as e: + log.error(f"Error during training: {e}") + raise + finally: + # Restore original working directory + os.chdir(original_cwd) + + +if __name__ == "__main__": + train_model() diff --git a/debug/train_debug_gradient.py b/debug/train_debug_gradient.py new file mode 100644 index 0000000000..94998452dd --- /dev/null +++ b/debug/train_debug_gradient.py @@ -0,0 +1,191 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: LGPL-3.0-or-later +"""Debug script for locating gradient explosion in SeZM-Net + ZBL training. + +This script uses torch.autograd.set_detect_anomaly and gradient hooks to +pinpoint the exact location of NaN/Inf gradients. 
+""" + +from __future__ import ( + annotations, +) + +import logging +import os +import pdb +import sys +from pathlib import ( + Path, +) + +import torch # noqa: TID253 + +# Add the deepmd-kit root to Python path +deepmd_root = Path(__file__).parent.parent +sys.path.insert(0, str(deepmd_root)) + +# Enable anomaly detection BEFORE importing deepmd modules +torch.autograd.set_detect_anomaly(True) + + +def register_gradient_hooks(model: torch.nn.Module, log: logging.Logger) -> None: + """Register hooks to monitor gradients for all parameters. + + Parameters + ---------- + model : torch.nn.Module + The model to monitor. + log : logging.Logger + Logger for output. + """ + + def make_hook(name: str) -> callable: + def hook(grad: torch.Tensor) -> None: + if grad is None: + return + if torch.isnan(grad).any(): + log.error(f"NaN gradient detected in: {name}") + log.error(f" Gradient shape: {grad.shape}") + log.error(f" Gradient stats: min={grad.min()}, max={grad.max()}") + # Set a breakpoint here for debugging + pdb.set_trace() + elif torch.isinf(grad).any(): + log.error(f"Inf gradient detected in: {name}") + log.error(f" Gradient shape: {grad.shape}") + log.error(f" Gradient stats: min={grad.min()}, max={grad.max()}") + pdb.set_trace() + elif grad.abs().max() > 1e6: + log.warning(f"Large gradient detected in: {name}") + log.warning(f" Gradient max abs: {grad.abs().max()}") + + return hook + + for name, param in model.named_parameters(): + if param.requires_grad: + param.register_hook(make_hook(name)) + + +def register_tensor_hooks(model: torch.nn.Module, log: logging.Logger) -> list: + """Register forward hooks to monitor intermediate tensors. + + Parameters + ---------- + model : torch.nn.Module + The model to monitor. + log : logging.Logger + Logger for output. + + Returns + ------- + list + List of hook handles for cleanup. 
+ """ + handles = [] + + def make_forward_hook(name: str) -> callable: + def hook(module: torch.nn.Module, input: tuple, output: object) -> None: + # Check inputs + for i, inp in enumerate(input): + if isinstance(inp, torch.Tensor): + if torch.isnan(inp).any(): + log.error(f"NaN in input[{i}] of {name}") + pdb.set_trace() + elif torch.isinf(inp).any(): + log.error(f"Inf in input[{i}] of {name}") + pdb.set_trace() + + # Check outputs + if isinstance(output, torch.Tensor): + if torch.isnan(output).any(): + log.error(f"NaN in output of {name}") + log.error(f" Output shape: {output.shape}") + pdb.set_trace() + elif torch.isinf(output).any(): + log.error(f"Inf in output of {name}") + log.error(f" Output shape: {output.shape}") + pdb.set_trace() + elif isinstance(output, tuple): + for j, out in enumerate(output): + if isinstance(out, torch.Tensor): + if torch.isnan(out).any(): + log.error(f"NaN in output[{j}] of {name}") + pdb.set_trace() + elif torch.isinf(out).any(): + log.error(f"Inf in output[{j}] of {name}") + pdb.set_trace() + + return hook + + for name, module in model.named_modules(): + h = module.register_forward_hook(make_forward_hook(name)) + handles.append(h) + + return handles + + +def train_with_debug() -> None: + """Train with gradient debugging enabled.""" + from deepmd.pt.entrypoints.main import ( + train, + ) + from deepmd.pt.train.training import ( + Trainer, + ) + + # Patch Trainer to add hooks + original_init = Trainer.__init__ + + def patched_init(self: Trainer, *args: object, **kwargs: object) -> None: + original_init(self, *args, **kwargs) + log = logging.getLogger("GradientDebug") + log.info("Registering gradient hooks...") + register_gradient_hooks(self.wrapper, log) + # Note: forward hooks can slow down training significantly + # Uncomment if you need to debug forward pass as well: + # register_tensor_hooks(self.wrapper, log) + + Trainer.__init__ = patched_init + + # Setup logging + logging.basicConfig( + level=logging.DEBUG, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + ) + log = logging.getLogger(__name__) + + # Set working directory + work_dir = Path("/home/outisli/Research/dp_train/se_zm/pair/l_2") + original_cwd = os.getcwd() + + try: + os.chdir(work_dir) + log.info(f"Working directory: {work_dir}") + log.info("Anomaly detection enabled - will show traceback on NaN/Inf") + + train( + input_file="input.json", + init_model=None, + restart=None, + finetune=None, + init_frz_model=None, + model_branch="default", + skip_neighbor_stat=True, + use_pretrain_script=False, + force_load=False, + compile_model=False, + output="out.json", + ) + except RuntimeError as e: + if "nan" in str(e).lower() or "inf" in str(e).lower(): + log.error(f"Gradient anomaly detected: {e}") + log.error("The traceback above shows where the NaN/Inf was introduced.") + pdb.post_mortem() + else: + raise + finally: + os.chdir(original_cwd) + + +if __name__ == "__main__": + train_with_debug() diff --git a/deepmd/dpmodel/utils/learning_rate.py b/deepmd/dpmodel/utils/learning_rate.py index 7ea50583e2..c370ad6f58 100644 --- a/deepmd/dpmodel/utils/learning_rate.py +++ b/deepmd/dpmodel/utils/learning_rate.py @@ -29,77 +29,243 @@ def __new__(cls: type, *args: Any, **kwargs: Any) -> Any: return super().__new__(cls) def __init__( - self, start_lr: float, stop_lr: float, stop_steps: int, **kwargs: Any + self, + start_lr: float, + stop_lr: float | None = None, + stop_ratio: float | None = None, + stop_steps: int = 100000, + warmup_steps: int = 0, + 
warmup_ratio: float | None = None, + warmup_start_factor: float = 0.0, + **kwargs: Any, ) -> None: """ - Base class for learning rate schedules. + Base class for learning rate schedules with warmup support. Parameters ---------- - start_lr - The initial learning rate. - stop_lr - The final learning rate. - stop_steps - The total training steps for learning rate scheduler. + start_lr : float + The learning rate at the start of the training (after warmup). + stop_lr : float, optional + The final learning rate at the end of the training. + Mutually exclusive with stop_ratio. + stop_ratio : float, optional + The ratio of stop_lr to start_lr. stop_lr = start_lr * stop_ratio. + Mutually exclusive with stop_lr. + One of stop_lr or stop_ratio must be provided. + stop_steps : int + The total training steps (including warmup). + warmup_steps : int, optional + The number of steps for learning rate warmup. + Mutually exclusive with warmup_ratio. Default is 0 (no warmup). + warmup_ratio : float, optional + The ratio of warmup steps to total training steps. + warmup_steps = int(warmup_ratio * stop_steps). + Mutually exclusive with warmup_steps. + warmup_start_factor : float, optional + The factor of start_lr for the initial warmup learning rate. + The warmup learning rate starts from warmup_start_factor * start_lr. + Default is 0.0. """ + # === Step 1. Compute stop_lr from stop_ratio if needed === + # Mutual exclusion validated in argcheck.py + if stop_ratio is not None: + self.stop_lr = start_lr * stop_ratio + else: + self.stop_lr = stop_lr # type: ignore[assignment] + + # === Step 2. Compute warmup_steps from warmup_ratio if needed === + # Mutual exclusion validated in argcheck.py + if warmup_ratio is not None: + self.warmup_steps = int(warmup_ratio * stop_steps) + else: + self.warmup_steps = warmup_steps + + # === Step 3. Validate step ranges (runtime check) === + if stop_steps <= 0: + raise ValueError("stop_steps must be positive") + if self.warmup_steps < 0: + raise ValueError("warmup_steps must be non-negative") + if self.warmup_steps >= stop_steps: + raise ValueError("warmup_steps must be smaller than stop_steps") + + # === Step 4. Compute warmup_start_lr === + self.warmup_start_lr = warmup_start_factor * start_lr + + # === Step 5. Store core parameters === self.start_lr = start_lr - self.stop_lr = stop_lr self.stop_steps = stop_steps + # Decay phase covers (stop_steps - warmup_steps) steps + self.decay_stop_steps = stop_steps - self.warmup_steps @abstractmethod - def value(self, step: int | Array) -> Array: - """Get the learning rate at the given step.""" - # in optax, step will be a jnp.ndarray passed in JIT mode + def _decay_value(self, step: int | Array) -> Array: + """ + Get the decayed learning rate at the given step (after warmup). + + This method should implement the actual decay logic (exp, cosine, etc.) + without considering warmup. + + Parameters + ---------- + step : int or Array + The step index relative to the end of warmup. + For example, if warmup_steps=100 and total_step=150, this method + will be called with step=50. + + Returns + ------- + Array + The decayed learning rate (absolute value, not factor). + """ pass + def value(self, step: int | Array) -> Array | float: + """ + Get the learning rate at the given step, including warmup. + + Parameters + ---------- + step : int or Array + The absolute step index from the start of training. + + Returns + ------- + Array + The learning rate at the given step. 
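+            A Python float is returned when step is passed as a scalar
+            int or float.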
+ """ + is_scalar = isinstance(step, (int, float)) + if not array_api_compat.is_array_api_obj(step): + step = np.asarray(step) + xp = array_api_compat.array_namespace(step) + + # === Step 1. Handle no-warmup case directly === + if self.warmup_steps == 0: + lr = self._decay_value(xp.astype(step, xp.float64)) + else: + # === Step 2. Warmup phase === + # Linear warmup from warmup_start_lr to start_lr + warmup_progress = xp.astype(step, xp.float64) / self.warmup_steps + warmup_lr = ( + self.warmup_start_lr + + (self.start_lr - self.warmup_start_lr) * warmup_progress + ) + + # === Step 3. Decay phase === + # Call subclass decay logic for steps after warmup + decay_step = xp.maximum( + xp.astype(step, xp.float64) - self.warmup_steps, 0.0 + ) + decay_lr = self._decay_value(decay_step) + + # === Step 4. Select warmup or decay based on step === + lr = xp.where(step < self.warmup_steps, warmup_lr, decay_lr) + + if is_scalar: + return float(lr) + return lr + @BaseLR.register("exp") class LearningRateExp(BaseLR): def __init__( self, start_lr: float, - stop_lr: float, - decay_steps: int, - stop_steps: int, + stop_lr: float | None = None, + stop_ratio: float | None = None, + decay_steps: int = 5000, + stop_steps: int = 100000, decay_rate: float | None = None, + warmup_steps: int = 0, + warmup_ratio: float | None = None, + warmup_start_factor: float = 0.0, **kwargs: Any, ) -> None: """ - Construct an exponential-decayed learning rate. + Construct an exponential-decayed learning rate with optional warmup. Parameters ---------- - start_lr - The learning rate at the start of the training. - stop_lr + start_lr : float + The learning rate at the start of the training (after warmup). + stop_lr : float, optional The desired learning rate at the end of the training. When decay_rate is explicitly set, this value will serve as - the minimum learning rate during training. In other words, - if the learning rate decays below stop_lr, stop_lr will be applied instead. - decay_steps + the minimum learning rate during training. + Mutually exclusive with stop_ratio. + stop_ratio : float, optional + The ratio of stop_lr to start_lr. + Mutually exclusive with stop_lr. + decay_steps : int The learning rate is decaying every this number of training steps. - stop_steps - The total training steps for learning rate scheduler. - decay_rate + Default is 5000. + stop_steps : int + The total training steps (including warmup). + decay_rate : float, optional The decay rate for the learning rate. If provided, the decay rate will be set instead of calculating it through interpolation between start_lr and stop_lr. + warmup_steps : int, optional + The number of steps for learning rate warmup. + Mutually exclusive with warmup_ratio. Default is 0. + warmup_ratio : float, optional + The ratio of warmup steps to total training steps. + Mutually exclusive with warmup_steps. + warmup_start_factor : float, optional + The factor of start_lr for the initial warmup learning rate. + Default is 0.0. + + Raises + ------ + ValueError + If both stop_lr and stop_ratio are provided, or neither is provided. + If both warmup_steps and warmup_ratio are provided. + If decay_steps is larger than the decay phase total steps. 
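+
+        Examples
+        --------
+        A minimal sketch of warmup followed by exponential decay:
+
+        >>> lr = LearningRateExp(
+        ...     start_lr=1e-3,
+        ...     stop_lr=1e-8,
+        ...     stop_steps=10000,
+        ...     warmup_steps=1000,
+        ...     warmup_start_factor=0.1,
+        ... )
+        >>> round(lr.value(0), 6)  # warmup starts at 0.1 * start_lr
+        0.0001
+        >>> round(lr.value(1000), 6)  # warmup ends at start_lr
+        0.001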
""" - super().__init__(start_lr, stop_lr, stop_steps, **kwargs) - default_ds = 100 if stop_steps // 10 > 100 else stop_steps // 100 + 1 + super().__init__( + start_lr=start_lr, + stop_lr=stop_lr, + stop_ratio=stop_ratio, + stop_steps=stop_steps, + warmup_steps=warmup_steps, + warmup_ratio=warmup_ratio, + warmup_start_factor=warmup_start_factor, + **kwargs, + ) + # === Step 5. Compute decay_rate for exp scheduler === + # Use decay_stop_steps (stop_steps - warmup_steps) for decay calculation + decay_total = self.decay_stop_steps self.decay_steps = decay_steps - if self.decay_steps >= stop_steps: - self.decay_steps = default_ds + + if self.decay_steps > decay_total: + raise ValueError( + f"decay_steps ({self.decay_steps}) must not exceed decay phase steps ({decay_total})." + ) + + # Avoid log(0) issues by clamping stop_lr for computation + clamped_stop_lr = max(self.stop_lr, 1e-10) + self.min_lr = self.stop_lr + self.decay_rate = np.exp( - np.log(stop_lr / self.start_lr) / (stop_steps / self.decay_steps) + np.log(clamped_stop_lr / self.start_lr) / (decay_total / self.decay_steps) ).item() if decay_rate is not None: self.decay_rate = decay_rate - self.min_lr = self.stop_lr - def value(self, step: int | Array) -> Array: - """Get the learning rate at the given step.""" + def _decay_value(self, step: int | Array) -> Array: + """ + Get the exponential-decayed learning rate factor at the given step. + + Parameters + ---------- + step : int or Array + The step index relative to the end of warmup. + + Returns + ------- + Array + The decayed learning rate (absolute value). + """ if not array_api_compat.is_array_api_obj(step): step = np.asarray(step) xp = array_api_compat.array_namespace(step) @@ -107,8 +273,7 @@ def value(self, step: int | Array) -> Array: xp.asarray(self.decay_rate, device=array_api_compat.device(step)), xp.astype(step // self.decay_steps, xp.float64), ) - # the original implementation `if step_lr < self.min_lr:` - # will cause a dynamic graph which is unsupported in JAX JIT + # Clip to min_lr for numerical stability in JIT step_lr = xp.clip(step_lr, self.min_lr, None) return step_lr @@ -118,29 +283,74 @@ class LearningRateCosine(BaseLR): def __init__( self, start_lr: float, - stop_lr: float, - stop_steps: int, + stop_lr: float | None = None, + stop_ratio: float | None = None, + stop_steps: int = 100000, + warmup_steps: int = 0, + warmup_ratio: float | None = None, + warmup_start_factor: float = 0.0, **kwargs: Any, ) -> None: """ - Defines a cosine annealing learning rate schedule. - The learning rate starts at `start_lr` and gradually decreases to `stop_lr` - following a cosine curve over the training steps. + Defines a cosine annealing learning rate schedule with optional warmup. + + The learning rate starts at `start_lr` (after warmup) and gradually + decreases to `stop_lr` following a cosine curve over the training steps. Parameters ---------- - start_lr - The initial learning rate at the beginning of training. - stop_lr + start_lr : float + The learning rate at the start of the training (after warmup). + stop_lr : float, optional The final learning rate at the end of training. - stop_steps - The total number of training steps over which the learning rate - will be annealed from start_lr to stop_lr. + Mutually exclusive with stop_ratio. + stop_ratio : float, optional + The ratio of stop_lr to start_lr. + Mutually exclusive with stop_lr. + stop_steps : int + The total training steps (including warmup). + warmup_steps : int, optional + The number of steps for learning rate warmup. 
+ Mutually exclusive with warmup_ratio. Default is 0. + warmup_ratio : float, optional + The ratio of warmup steps to total training steps. + Mutually exclusive with warmup_steps. + warmup_start_factor : float, optional + The factor of start_lr for the initial warmup learning rate. + Default is 0.0. + + Raises + ------ + ValueError + If both stop_lr and stop_ratio are provided, or neither is provided. + If both warmup_steps and warmup_ratio are provided. """ - super().__init__(start_lr, stop_lr, stop_steps, **kwargs) - self.lr_min_factor = stop_lr / start_lr + super().__init__( + start_lr=start_lr, + stop_lr=stop_lr, + stop_ratio=stop_ratio, + stop_steps=stop_steps, + warmup_steps=warmup_steps, + warmup_ratio=warmup_ratio, + warmup_start_factor=warmup_start_factor, + **kwargs, + ) + self.lr_min_factor = self.stop_lr / self.start_lr - def value(self, step: int | Array) -> Array: + def _decay_value(self, step: int | Array) -> Array: + """ + Get the cosine-annealed learning rate at the given step. + + Parameters + ---------- + step : int or Array + The step index relative to the end of warmup. + + Returns + ------- + Array + The annealed learning rate (absolute value). + """ if not array_api_compat.is_array_api_obj(step): step = np.asarray(step) xp = array_api_compat.array_namespace(step) @@ -153,11 +363,12 @@ def value(self, step: int | Array) -> Array: 1 + xp.cos( xp.asarray( - xp.pi * (xp.astype(step, xp.float64) / self.stop_steps), + xp.pi * (xp.astype(step, xp.float64) / self.decay_stop_steps), device=array_api_compat.device(step), ) ) ) ) - step_lr = xp.where(step >= self.stop_steps, min_lr, step_lr) + # Clip to min_lr for steps beyond decay_stop_steps + step_lr = xp.where(step >= self.decay_stop_steps, min_lr, step_lr) return step_lr diff --git a/deepmd/pd/train/training.py b/deepmd/pd/train/training.py index dd0fbdc94b..df38623132 100644 --- a/deepmd/pd/train/training.py +++ b/deepmd/pd/train/training.py @@ -239,7 +239,7 @@ def get_sample(): return get_sample def get_lr(lr_params: dict[str, Any]) -> BaseLR: - lr_params["stop_steps"] = self.num_steps - self.warmup_steps + lr_params["stop_steps"] = self.num_steps lr_schedule = BaseLR(**lr_params) return lr_schedule @@ -387,11 +387,7 @@ def get_lr(lr_params: dict[str, Any]) -> BaseLR: ) # Learning rate - self.warmup_steps = training_params.get("warmup_steps", 0) self.gradient_max_norm = training_params.get("gradient_max_norm", 0.0) - assert self.num_steps - self.warmup_steps > 0 or self.warmup_steps == 0, ( - "Warm up steps must be less than total training steps!" 
- ) if self.multi_task and config.get("learning_rate_dict", None) is not None: self.lr_exp = {} for model_key in self.model_keys: @@ -580,18 +576,13 @@ def single_model_finetune( # TODO add lr warmups for multitask # author: iProzd - def warm_up_linear(step, warmup_steps): - if step < warmup_steps: - return step / warmup_steps - else: - return self.lr_exp.value(step - warmup_steps) / self.lr_exp.start_lr - # TODO add optimizers for multitask # author: iProzd if self.opt_type == "Adam": self.scheduler = paddle.optimizer.lr.LambdaDecay( learning_rate=self.lr_exp.start_lr, - lr_lambda=lambda step: warm_up_linear(step, self.warmup_steps), + lr_lambda=lambda step: self.lr_exp.value(step + self.start_step) + / self.lr_exp.start_lr, ) self.optimizer = paddle.optimizer.Adam( learning_rate=self.scheduler, parameters=self.wrapper.parameters() @@ -755,10 +746,7 @@ def step(_step_id, task_key="Default") -> None: fout1.flush() if self.opt_type == "Adam": cur_lr = self.scheduler.get_lr() - if _step_id < self.warmup_steps: - pref_lr = _lr.start_lr - else: - pref_lr = cur_lr + pref_lr = cur_lr # disable synchronization in forward-backward manually # as derivatives exist in model forward diff --git a/deepmd/pd/utils/utils.py b/deepmd/pd/utils/utils.py index 7224547805..e939f84cb3 100644 --- a/deepmd/pd/utils/utils.py +++ b/deepmd/pd/utils/utils.py @@ -239,7 +239,11 @@ def to_numpy_array( ): if xx is None: return None + if isinstance(xx, (float, int)): + return np.array(xx) assert xx is not None + if isinstance(xx, np.ndarray): + return xx # Create a reverse mapping of PD_PRECISION_DICT reverse_precision_dict = {v: k for k, v in PD_PRECISION_DICT.items()} # Use the reverse mapping to find keys with the desired value @@ -247,8 +251,6 @@ def to_numpy_array( prec = NP_PRECISION_DICT.get(prec, np.float64) if prec is None: raise ValueError(f"unknown precision {xx.dtype}") - if isinstance(xx, np.ndarray): - return xx.astype(prec) if xx.dtype == paddle.bfloat16: xx = xx.astype(paddle.get_default_dtype()) return xx.numpy().astype(prec) diff --git a/deepmd/pt/train/training.py b/deepmd/pt/train/training.py index 0dfbe94b6b..b4aa119cb8 100644 --- a/deepmd/pt/train/training.py +++ b/deepmd/pt/train/training.py @@ -51,6 +51,7 @@ ) from deepmd.pt.utils import ( dp_random, + env, ) from deepmd.pt.utils.dataloader import ( DpLoaderSet, @@ -273,7 +274,7 @@ def get_sample() -> Any: return get_sample def get_lr(lr_params: dict[str, Any]) -> BaseLR: - lr_params["stop_steps"] = self.num_steps - self.warmup_steps + lr_params["stop_steps"] = self.num_steps lr_schedule = BaseLR(**lr_params) return lr_schedule @@ -431,27 +432,7 @@ def get_lr(lr_params: dict[str, Any]) -> BaseLR: ) # Learning rate - warmup_steps = training_params.get("warmup_steps", None) - warmup_ratio = training_params.get("warmup_ratio", None) - if warmup_steps is not None: - self.warmup_steps = warmup_steps - elif warmup_ratio is not None: - if not 0 <= warmup_ratio < 1: - raise ValueError(f"warmup_ratio must be in [0, 1), got {warmup_ratio}") - self.warmup_steps = int(warmup_ratio * self.num_steps) - if self.warmup_steps == 0 and warmup_ratio > 0: - log.warning( - f"warmup_ratio {warmup_ratio} results in 0 warmup steps " - f"due to truncation. Consider using a larger ratio or " - f"specify warmup_steps directly." 
- ) - else: - self.warmup_steps = 0 - self.warmup_start_factor = training_params.get("warmup_start_factor", 0.0) self.gradient_max_norm = training_params.get("gradient_max_norm", 0.0) - assert self.num_steps - self.warmup_steps > 0 or self.warmup_steps == 0, ( - "Warm up steps must be less than total training steps!" - ) if self.multi_task and config.get("learning_rate_dict", None) is not None: self.lr_exp = {} for model_key in self.model_keys: @@ -697,14 +678,6 @@ def single_model_finetune( # TODO add lr warmups for multitask # author: iProzd - def warm_up_linear(step: int, warmup_steps: int) -> float: - if step < warmup_steps: - return self.warmup_start_factor + (1.0 - self.warmup_start_factor) * ( - step / warmup_steps - ) - else: - return self.lr_exp.value(step - warmup_steps) / self.lr_exp.start_lr - # TODO add optimizers for multitask # author: iProzd if self.opt_type in ["Adam", "AdamW"]: @@ -725,7 +698,8 @@ def warm_up_linear(step: int, warmup_steps: int) -> float: self.optimizer.load_state_dict(optimizer_state_dict) self.scheduler = torch.optim.lr_scheduler.LambdaLR( self.optimizer, - lambda step: warm_up_linear(step + self.start_step, self.warmup_steps), + lambda step: self.lr_exp.value(step + self.start_step) + / self.lr_exp.start_lr, ) elif self.opt_type == "LKF": self.optimizer = LKFOptimizer( @@ -748,7 +722,8 @@ def warm_up_linear(step: int, warmup_steps: int) -> float: self.optimizer.load_state_dict(optimizer_state_dict) self.scheduler = torch.optim.lr_scheduler.LambdaLR( self.optimizer, - lambda step: warm_up_linear(step + self.start_step, self.warmup_steps), + lambda step: self.lr_exp.value(step + self.start_step) + / self.lr_exp.start_lr, ) else: raise ValueError(f"Not supported optimizer type '{self.opt_type}'") @@ -822,10 +797,7 @@ def step(_step_id: int, task_key: str = "Default") -> None: fout1.flush() if self.opt_type in ["Adam", "AdamW", "AdaMuon"]: cur_lr = self.scheduler.get_last_lr()[0] - if _step_id < self.warmup_steps: - pref_lr = _lr.start_lr - else: - pref_lr = cur_lr + pref_lr = cur_lr model_pred, loss, more_loss = self.wrapper( **input_dict, cur_lr=pref_lr, label=label_dict, task_key=task_key ) @@ -836,7 +808,7 @@ def step(_step_id: int, task_key: str = "Default") -> None: self.gradient_max_norm, error_if_nonfinite=True, ) - with torch.device("cpu"): + with torch.device(env.DEVICE): self.optimizer.step() self.scheduler.step() elif self.opt_type == "LKF": diff --git a/deepmd/pt/utils/utils.py b/deepmd/pt/utils/utils.py index ab066bdf93..10a4354e8b 100644 --- a/deepmd/pt/utils/utils.py +++ b/deepmd/pt/utils/utils.py @@ -227,10 +227,14 @@ def to_numpy_array(xx: None) -> None: ... 
def to_numpy_array( - xx: torch.Tensor | None, + xx: torch.Tensor | np.ndarray | float | None, ) -> np.ndarray | None: if xx is None: return None + if isinstance(xx, (float, int)): + return np.array(xx) + if isinstance(xx, np.ndarray): + return xx assert xx is not None # Create a reverse mapping of PT_PRECISION_DICT reverse_precision_dict = {v: k for k, v in PT_PRECISION_DICT.items()} diff --git a/deepmd/tf/fit/dipole.py b/deepmd/tf/fit/dipole.py index 961198b8e7..ebeec270e0 100644 --- a/deepmd/tf/fit/dipole.py +++ b/deepmd/tf/fit/dipole.py @@ -388,7 +388,7 @@ def get_loss(self, loss: dict, lr) -> Loss: ---------- loss : dict the loss dict - lr : LearningRateExp + lr : LearningRateSchedule the learning rate Returns diff --git a/deepmd/tf/fit/dos.py b/deepmd/tf/fit/dos.py index 250d803d8f..bec8814d18 100644 --- a/deepmd/tf/fit/dos.py +++ b/deepmd/tf/fit/dos.py @@ -655,7 +655,7 @@ def get_loss(self, loss: dict, lr) -> Loss: ---------- loss : dict the loss dict - lr : LearningRateExp + lr : LearningRateSchedule the learning rate Returns diff --git a/deepmd/tf/fit/ener.py b/deepmd/tf/fit/ener.py index 2b8b1b906e..6a027b2ec2 100644 --- a/deepmd/tf/fit/ener.py +++ b/deepmd/tf/fit/ener.py @@ -856,7 +856,7 @@ def get_loss(self, loss: dict, lr) -> Loss: ---------- loss : dict The loss function parameters. - lr : LearningRateExp + lr : LearningRateSchedule The learning rate. Returns diff --git a/deepmd/tf/fit/fitting.py b/deepmd/tf/fit/fitting.py index b33559f12f..f7e5d959ef 100644 --- a/deepmd/tf/fit/fitting.py +++ b/deepmd/tf/fit/fitting.py @@ -73,7 +73,7 @@ def get_loss(self, loss: dict, lr) -> Loss: ---------- loss : dict the loss dict - lr : LearningRateExp + lr : LearningRateSchedule the learning rate Returns diff --git a/deepmd/tf/fit/polar.py b/deepmd/tf/fit/polar.py index 1e48a5fa59..137695d9b8 100644 --- a/deepmd/tf/fit/polar.py +++ b/deepmd/tf/fit/polar.py @@ -863,7 +863,7 @@ def get_loss(self, loss: dict, lr) -> Loss: ---------- loss : dict the loss dict - lr : LearningRateExp + lr : LearningRateSchedule the learning rate Returns diff --git a/deepmd/tf/train/trainer.py b/deepmd/tf/train/trainer.py index 4af59fd290..f9c67591d3 100644 --- a/deepmd/tf/train/trainer.py +++ b/deepmd/tf/train/trainer.py @@ -4,6 +4,9 @@ import os import shutil import time +from typing import ( + Any, +) import google.protobuf.message import numpy as np @@ -52,7 +55,7 @@ load_graph_def, ) from deepmd.tf.utils.learning_rate import ( - LearningRateExp, + LearningRateSchedule, ) from deepmd.tf.utils.sess import ( run_sess, @@ -100,7 +103,9 @@ def _init_param(self, jdata) -> None: self.model = Model(**model_param) self.fitting = self.model.get_fitting() - def get_lr_and_coef(lr_param): + def get_lr_and_coef( + lr_param: dict[str, Any], + ) -> tuple[LearningRateSchedule, float]: scale_by_worker = lr_param.get("scale_by_worker", "linear") if scale_by_worker == "linear": scale_lr_coef = float(self.run_opt.world_size) @@ -108,13 +113,8 @@ def get_lr_and_coef(lr_param): scale_lr_coef = np.sqrt(self.run_opt.world_size).real else: scale_lr_coef = 1.0 - lr_type = lr_param.get("type", "exp") - if lr_type == "exp": - lr = LearningRateExp( - lr_param["start_lr"], lr_param["stop_lr"], lr_param["decay_steps"] - ) - else: - raise RuntimeError("unknown learning_rate type " + lr_type) + lr_params = {k: v for k, v in lr_param.items() if k != "scale_by_worker"} + lr = LearningRateSchedule(lr_params) return lr, scale_lr_coef # learning rate @@ -427,11 +427,9 @@ def train(self, train_data=None, valid_data=None) -> None: is_first_step = 
True self.cur_batch = cur_batch log.info( - "start training at lr %.2e (== %.2e), decay_step %d, decay_rate %f, final lr will be %.2e", + "start training at lr %.2e (== %.2e), final lr will be %.2e", run_sess(self.sess, self.learning_rate), self.lr.value(cur_batch), - self.lr.decay_steps_, - self.lr.decay_rate_, self.lr.value(stop_batch), ) diff --git a/deepmd/tf/utils/__init__.py b/deepmd/tf/utils/__init__.py index 7d1e7e67d0..b88c13d445 100644 --- a/deepmd/tf/utils/__init__.py +++ b/deepmd/tf/utils/__init__.py @@ -7,7 +7,7 @@ DeepmdDataSystem, ) from .learning_rate import ( - LearningRateExp, + LearningRateSchedule, ) from .pair_tab import ( PairTab, @@ -20,7 +20,7 @@ __all__ = [ "DeepmdData", "DeepmdDataSystem", - "LearningRateExp", + "LearningRateSchedule", "PairTab", "Plugin", "PluginVariant", diff --git a/deepmd/tf/utils/learning_rate.py b/deepmd/tf/utils/learning_rate.py index 64427e185d..9867e453f9 100644 --- a/deepmd/tf/utils/learning_rate.py +++ b/deepmd/tf/utils/learning_rate.py @@ -1,102 +1,128 @@ # SPDX-License-Identifier: LGPL-3.0-or-later +from __future__ import ( + annotations, +) + +from typing import ( + Any, +) import numpy as np +from deepmd.dpmodel.utils.learning_rate import ( + BaseLR, +) from deepmd.tf.env import ( tf, ) -class LearningRateExp: - r"""The exponentially decaying learning rate. +class LearningRateSchedule: + """ + TensorFlow wrapper for BaseLR. + + Parameters + ---------- + params : dict[str, Any] + Learning rate configuration dictionary. + """ - The learning rate at step :math:`t` is given by + def __init__(self, params: dict[str, Any]) -> None: + # === Step 1. Store configuration === + self._params = dict(params) + if "start_lr" not in self._params: + raise ValueError("start_lr must be provided") + self._start_lr = float(self._params["start_lr"]) + self._base_lr: BaseLR | None = None - .. math:: + def start_lr(self) -> float: + """ + Get the starting learning rate. - \alpha(t) = \alpha_0 \lambda ^ { t / \tau } + Returns + ------- + float + The starting learning rate. + """ + return self._start_lr - where :math:`\alpha` is the learning rate, :math:`\alpha_0` is the starting learning rate, - :math:`\lambda` is the decay rate, and :math:`\tau` is the decay steps. + @property + def base_lr(self) -> BaseLR: + """ + Get the built BaseLR instance. - Parameters - ---------- - start_lr - Starting learning rate :math:`\alpha_0` - stop_lr - Stop learning rate :math:`\alpha_1` - decay_steps - Learning rate decay every this number of steps :math:`\tau` - decay_rate - The decay rate :math:`\lambda`. - If `stop_step` is provided in `build`, then it will be determined automatically and overwritten. - """ + Returns + ------- + BaseLR + The built learning rate schedule. - def __init__( - self, - start_lr: float, - stop_lr: float = 5e-8, - decay_steps: int = 5000, - decay_rate: float = 0.95, - ) -> None: - """Constructor.""" - self.cd = {} - self.cd["start_lr"] = start_lr - self.cd["stop_lr"] = stop_lr - self.cd["decay_steps"] = decay_steps - self.cd["decay_rate"] = decay_rate - self.start_lr_ = self.cd["start_lr"] - - def build(self, global_step: tf.Tensor, stop_step: int | None = None) -> tf.Tensor: - """Build the learning rate. + Raises + ------ + RuntimeError + If the schedule has not been built. + """ + if self._base_lr is None: + raise RuntimeError("Learning rate schedule is not built yet.") + return self._base_lr + + def build(self, global_step: tf.Tensor, stop_steps: int) -> tf.Tensor: + """ + Build a TensorFlow learning rate tensor. 
Parameters ---------- - global_step - The tf Tensor providing the global training step - stop_step - The stop step. If provided, the decay_rate will be determined automatically and overwritten. + global_step : tf.Tensor + The global training step tensor. + stop_steps : int + The total training steps. Returns ------- - learning_rate - The learning rate + tf.Tensor + The learning rate tensor. """ - if stop_step is None: - self.decay_steps_ = ( - self.cd["decay_steps"] if self.cd["decay_steps"] is not None else 5000 - ) - self.decay_rate_ = ( - self.cd["decay_rate"] if self.cd["decay_rate"] is not None else 0.95 - ) - else: - self.stop_lr_ = ( - self.cd["stop_lr"] if self.cd["stop_lr"] is not None else 5e-8 - ) - default_ds = 100 if stop_step // 10 > 100 else stop_step // 100 + 1 - self.decay_steps_ = ( - self.cd["decay_steps"] - if self.cd["decay_steps"] is not None - else default_ds - ) - if self.decay_steps_ >= stop_step: - self.decay_steps_ = default_ds - self.decay_rate_ = np.exp( - np.log(self.stop_lr_ / self.start_lr_) / (stop_step / self.decay_steps_) - ) - - return tf.train.exponential_decay( - self.start_lr_, - global_step, - self.decay_steps_, - self.decay_rate_, - staircase=True, + # === Step 1. Instantiate backend-agnostic schedule === + params = dict(self._params) + params["stop_steps"] = stop_steps + # Default to 'exp' type if not specified + if "type" not in params: + params["type"] = "exp" + self._base_lr = BaseLR(**params) + + # === Step 2. Bind a numpy_function for runtime evaluation === + def _lr_value(step: np.ndarray) -> np.ndarray: + return np.asarray(self._base_lr.value(step), dtype=np.float64) + + lr = tf.numpy_function( + _lr_value, [global_step], Tout=tf.float64, name="lr_schedule" ) - - def start_lr(self) -> float: - """Get the start lr.""" - return self.start_lr_ + lr.set_shape(global_step.get_shape()) + return tf.cast(lr, tf.float32) def value(self, step: int) -> float: - """Get the lr at a certain step.""" - return self.start_lr_ * np.power(self.decay_rate_, (step // self.decay_steps_)) + """ + Get the learning rate at the given step. + + Parameters + ---------- + step : int + The step index. + + Returns + ------- + float + The learning rate value. + + Raises + ------ + RuntimeError + If the schedule has not been built. + """ + if self._base_lr is None: + raise RuntimeError("Learning rate schedule is not built yet.") + return float(np.asarray(self._base_lr.value(step))) + + +__all__ = [ + "LearningRateSchedule", +] diff --git a/deepmd/utils/argcheck.py b/deepmd/utils/argcheck.py index 935762cdc7..80da8a0aa8 100644 --- a/deepmd/utils/argcheck.py +++ b/deepmd/utils/argcheck.py @@ -2480,14 +2480,159 @@ def linear_ener_model_args() -> Argument: lr_args_plugin = ArgsPlugin() +def _check_lr_stop_args(data: dict[str, Any]) -> bool: + """ + Check that stop_lr and stop_ratio are mutually exclusive and at least one is provided. + + Parameters + ---------- + data : dict[str, Any] + The learning rate configuration dictionary. + + Returns + ------- + bool + True if validation passes. + + Raises + ------ + ValueError + If both stop_lr and stop_ratio are provided, or neither is provided. + """ + has_stop_lr = "stop_lr" in data and data["stop_lr"] is not None + has_stop_ratio = "stop_ratio" in data and data["stop_ratio"] is not None + + if has_stop_lr and has_stop_ratio: + raise ValueError( + "stop_lr and stop_ratio are mutually exclusive. 
" + f"Got stop_lr={data['stop_lr']}, stop_ratio={data['stop_ratio']}" + ) + if not has_stop_lr and not has_stop_ratio: + raise ValueError( + "Either stop_lr or stop_ratio must be provided. " + "Got stop_lr=None, stop_ratio=None" + ) + return True + + +def _check_warmup_args(data: dict[str, Any]) -> bool: + """ + Check that warmup_steps and warmup_ratio are mutually exclusive. + + Parameters + ---------- + data : dict[str, Any] + The learning rate configuration dictionary. + + Returns + ------- + bool + True if validation passes. + + Raises + ------ + ValueError + If both warmup_steps (non-zero) and warmup_ratio are provided. + """ + # warmup_steps default is 0, so check for non-zero value + has_warmup_steps = "warmup_steps" in data and data["warmup_steps"] != 0 + has_warmup_ratio = "warmup_ratio" in data and data["warmup_ratio"] is not None + + if has_warmup_steps and has_warmup_ratio: + raise ValueError( + "warmup_steps and warmup_ratio are mutually exclusive. " + f"Got warmup_steps={data['warmup_steps']}, warmup_ratio={data['warmup_ratio']}" + ) + return True + + +def _learning_rate_common_args( + doc_stop_lr: str, + extra_args: list[Argument] | None = None, +) -> list[Argument]: + doc_start_lr = "The learning rate at the start of the training (after warmup)." + doc_stop_ratio = ( + "The ratio of stop_lr to start_lr. stop_lr = start_lr * stop_ratio. " + "Mutually exclusive with stop_lr." + ) + doc_warmup_steps = ( + "The number of steps for learning rate warmup. " + "During warmup, the learning rate increases linearly from " + "warmup_start_factor * start_lr to start_lr. " + "Mutually exclusive with warmup_ratio. Default is 0 (no warmup)." + ) + doc_warmup_ratio = ( + "The ratio of warmup steps to total training steps. " + "The actual number of warmup steps is int(warmup_ratio * stop_steps). " + "Mutually exclusive with warmup_steps." + ) + doc_warmup_start_factor = ( + "The factor of start_lr for the initial warmup learning rate. " + "The warmup learning rate starts from warmup_start_factor * start_lr. " + "Default is 0.0, meaning the learning rate starts from zero." + ) + + args = [ + Argument("start_lr", float, optional=False, doc=doc_start_lr), + Argument( + "stop_lr", + float, + optional=True, + default=None, + doc=doc_stop_lr, + ), + Argument( + "stop_ratio", + float, + optional=True, + default=None, + doc=doc_stop_ratio, + ), + ] + if extra_args: + args.extend(extra_args) + args.extend( + [ + Argument( + "warmup_steps", + int, + optional=True, + default=0, + doc=doc_warmup_steps, + ), + Argument( + "warmup_ratio", + float, + optional=True, + default=None, + doc=doc_warmup_ratio, + ), + Argument( + "warmup_start_factor", + float, + optional=True, + default=0.0, + doc=doc_warmup_start_factor, + ), + ] + ) + return args + + @lr_args_plugin.register("exp") def learning_rate_exp() -> list[Argument]: - doc_start_lr = "The learning rate at the start of the training." + """ + Defines an exponential-decayed learning rate schedule with optional warmup. + + The learning rate starts at `start_lr` (after warmup) and decays exponentially + to `stop_lr` over the training steps. + """ doc_stop_lr = ( "The desired learning rate at the end of the training. " - f"When decay_rate {doc_only_pt_supported}is explicitly set, " + "When decay_rate is explicitly set, " "this value will serve as the minimum learning rate during training. " - "In other words, if the learning rate decays below stop_lr, stop_lr will be applied instead." 
+ "In other words, if the learning rate decays below stop_lr, stop_lr will be applied instead. " + "Mutually exclusive with stop_ratio." ) doc_decay_steps = ( "The learning rate is decaying every this number of training steps." @@ -2498,37 +2643,32 @@ def learning_rate_exp() -> list[Argument]: "instead of calculating it through interpolation between start_lr and stop_lr." ) - args = [ - Argument("start_lr", float, optional=True, default=1e-3, doc=doc_start_lr), - Argument("stop_lr", float, optional=True, default=1e-8, doc=doc_stop_lr), + extra_args = [ Argument("decay_steps", int, optional=True, default=5000, doc=doc_decay_steps), Argument( "decay_rate", float, optional=True, default=None, - doc=doc_only_pt_supported + doc_decay_rate, + doc=doc_decay_rate, ), ] - return args + return _learning_rate_common_args(doc_stop_lr, extra_args=extra_args) -@lr_args_plugin.register("cosine", doc=doc_only_pt_supported) +@lr_args_plugin.register("cosine") def learning_rate_cosine() -> list[Argument]: """ - Defines a cosine annealing learning rate schedule. + Defines a cosine annealing learning rate schedule with optional warmup. - The learning rate starts at `start_lr` and gradually decreases to `stop_lr` - following a cosine curve over the training steps. + The learning rate starts at `start_lr` (after warmup) and gradually + decreases to `stop_lr` following a cosine curve over the training steps. """ - doc_start_lr = "The learning rate at the start of the training." - doc_stop_lr = "The desired learning rate at the end of the training. " - - args = [ - Argument("start_lr", float, optional=True, default=1e-3, doc=doc_start_lr), - Argument("stop_lr", float, optional=True, default=1e-5, doc=doc_stop_lr), - ] - return args + doc_stop_lr = ( + "The desired learning rate at the end of training. " + "Mutually exclusive with stop_ratio." + ) + return _learning_rate_common_args(doc_stop_lr) def learning_rate_variant_type_args() -> Variant: @@ -2546,6 +2686,15 @@ def learning_rate_variant_type_args() -> Variant: def learning_rate_args(fold_subdoc: bool = False) -> Argument: doc_scale_by_worker = "When parallel training or batch size scaled, how to alter learning rate. Valid values are `linear`(default), `sqrt` or `none`." doc_lr = "The definition of learning rate" + + def _check_lr_args(data: dict[str, Any]) -> bool: + """Check learning rate argument constraints.""" + # Check stop_lr and stop_ratio + _check_lr_stop_args(data) + # Check warmup_steps and warmup_ratio + _check_warmup_args(data) + return True + return Argument( "learning_rate", dict, @@ -2562,6 +2711,7 @@ def learning_rate_args(fold_subdoc: bool = False) -> Argument: optional=True, doc=doc_lr, fold_subdoc=fold_subdoc, + extra_check=_check_lr_args, ) @@ -3240,22 +3390,6 @@ def training_args( doc_tensorboard = "Enable tensorboard" doc_tensorboard_log_dir = "The log directory of tensorboard outputs" doc_tensorboard_freq = "The frequency of writing tensorboard events." - doc_warmup_steps = ( - "The number of steps for learning rate warmup. During warmup, " - "the learning rate begins at zero and progressively increases linearly to `start_lr`, " - "rather than starting directly from `start_lr`" - ) - doc_warmup_ratio = ( - "The ratio of warmup steps to total training steps. " - "The actual number of warmup steps is calculated as `warmup_ratio * numb_steps`. " - "Valid values are in the range [0.0, 1.0). " - "If `warmup_steps` is set, this option will be ignored." 
- )
-    doc_warmup_start_factor = (
-        "The factor of start learning rate to the target learning rate during warmup. "
-        "The warmup learning rate will linearly increase from `warmup_start_factor * start_lr` to `start_lr`. "
-        "Default is 0.0, meaning the learning rate starts from zero."
-    )
     doc_gradient_max_norm = (
         "Clips the gradient norm to a maximum value. "
         "If the gradient norm exceeds this value, it will be clipped to this limit. "
@@ -3363,25 +3497,6 @@ def training_args(
         Argument(
             "tensorboard_freq", int, optional=True, default=1, doc=doc_tensorboard_freq
         ),
-        Argument(
-            "warmup_steps",
-            int,
-            optional=True,
-            doc=doc_only_pt_supported + doc_warmup_steps,
-        ),
-        Argument(
-            "warmup_ratio",
-            float,
-            optional=True,
-            doc=doc_only_pt_supported + doc_warmup_ratio,
-        ),
-        Argument(
-            "warmup_start_factor",
-            float,
-            optional=True,
-            default=0.0,
-            doc=doc_only_pt_supported + doc_warmup_start_factor,
-        ),
         Argument(
             "gradient_max_norm",
             float,
diff --git a/doc/outisli/DPA3.md b/doc/outisli/DPA3.md
new file mode 100644
index 0000000000..d0747e97b5
--- /dev/null
+++ b/doc/outisli/DPA3.md
@@ -0,0 +1,2630 @@
+# DeePMD Source Code Walkthrough and Technical Notes on the DPA-3 PyTorch Implementation
+
+## Overview
+
+DPA3 is an advanced atomic-environment descriptor implemented in DeePMD-kit on the PyTorch backend. By combining node, edge, and angle information, it builds a more accurate representation of atomic environments.
+
+Please note: this document was generated by AI and has only been roughly checked, so it may contain inaccuracies; it is intended only as a reference and guide for reading the deepmd-kit source code. Code line numbers refer to the author's locally formatted code and therefore deviate somewhat from the source on GitHub.
+
+### Document structure
+
+This document is organized along the actual usage flow and technical architecture of DPA3 and contains the following main parts:
+
+- **Part 1: Quick start** - from CLI usage to basic configuration
+- **Part 2: System architecture** - the overall design of DPA3 and the relationships between its components
+- **Part 3: Implementation details** - core algorithms and implementation details
+- **Part 4: Data processing system** - the data processing architecture of the PyTorch backend
+- **Part 5: Inference and deployment** - model deployment and integration
+
+---
+
+## Part 1: Quick start
+
+### 1.1 CLI entry point and basic usage
+
+#### 1.1.1 Command-line entry flow
+
+When the user runs `dp --pt train input.json`, the program executes the following steps:
+
+1. **Main entry parsing**: `deepmd.main.parse_args()` parses the command-line arguments
+2. **Backend selection**: the PyTorch backend is selected according to the `backend` argument
+3. **Training function call**: `deepmd.pt.entrypoints.main.train()` is invoked
+
+**Key file locations**:
+
+- `deepmd/pt/entrypoints/main.py:237-248` - definition of the train function
+- `deepmd/entrypoints/main.py:41-91` - dispatch logic of the main entry point
+
+#### 1.1.2 Training initialization flow
+
+Inside the `train()` function, the program initializes in the following order:
+
+```python
+# 1. Load and parse the configuration file
+with open(input_file) as fin:
+    config = json.load(fin)
+
+# 2. Multi-task model handling
+multi_task = "model_dict" in config["model"]
+if multi_task:
+    config["model"], shared_links = preprocess_shared_params(config["model"])
+
+# 3. Neighbor statistics computation
+if not skip_neighbor_stat:
+    min_nbor_dist, trainer = update_sel(config, model_branch)
+
+# 4. Trainer creation
+trainer = get_trainer(
+    config,
+    init_model,
+    restart,
+    finetune,
+    force_load,
+    init_frz_model,
+    shared_links=shared_links,
+    finetune_links=finetune_links,
+)
+```
+
+**Key code location**: `deepmd/pt/entrypoints/main.py:322-331`
+
+#### 1.1.3 Model construction flow
+
+After the trainer is initialized, the model is built through the `get_model()` function:
+
+1. **Model resolution**: the descriptor is chosen according to the `descriptor` type in the configuration file
+2. **DPA3 initialization**: when the descriptor type is `"dpa3"`, a `DescrptDPA3` instance is created
+3. **Model assembly**: the descriptor and the fitting network are assembled into a complete model, as sketched below
+
+**Key file locations**:
+
+- `deepmd/pt/train/training.py:91-100` - definition of the Trainer class
+- `deepmd/pt/model/model/model.py` - the BaseModel class and the model factory functions
+
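+Condensing the three steps above into code gives a short, hedged sketch; `get_model` is the factory named in this section, while the concrete parameter values are illustrative (taken from the example in section 1.2) and an energy fitting net is assumed:
+
+```python
+from deepmd.pt.model.model import get_model
+
+model_params = {
+    "type_map": ["O", "H"],
+    "descriptor": {
+        "type": "dpa3",  # resolved via the descriptor registry to DescrptDPA3
+        "repflow": {"e_rcut": 6.0, "e_sel": 200, "a_rcut": 5.0, "a_sel": 60},
+    },
+    "fitting_net": {"neuron": [240, 240, 240]},  # energy fitting assumed
+}
+model = get_model(model_params)  # descriptor + fitting assembled into a model
+```
+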
+### 1.2 Basic configuration example
+
+```json
+{
+  "model": {
+    "descriptor": {
+      "type": "dpa3",
+      "repflow": {
+        "e_rcut": 6.0,
+        "e_sel": 200,
+        "a_rcut": 5.0,
+        "a_sel": 60,
+        "n_dim": 128,
+        "e_dim": 64,
+        "a_dim": 32,
+        "nlayers": 6,
+        "a_compress_rate": 2,
+        "update_angle": true,
+        "update_style": "res_residual"
+      },
+      "concat_output_tebd": true,
+      "precision": "float32"
+    }
+  }
+}
+```
+
+### 1.3 Precision control configuration
+
+DPA3 provides two precision control mechanisms that act on different levels of the computation:
+
+#### 1.3.1 Environment-variable precision control (DP_INTERFACE_PREC)
+
+**Scope**: global interface precision; affects input/output data types
+
+**How to set**:
+
+```bash
+# High-precision mode (default)
+export DP_INTERFACE_PREC=high
+
+# Low-precision mode
+export DP_INTERFACE_PREC=low
+```
+
+**Effect on precision**:
+
+- `high`: `GLOBAL_NP_FLOAT_PRECISION = np.float64`, `GLOBAL_ENER_FLOAT_PRECISION = np.float64`
+- `low`: `GLOBAL_NP_FLOAT_PRECISION = np.float32`, `GLOBAL_ENER_FLOAT_PRECISION = np.float64`
+
+**File location**: `deepmd/env.py:33-48`
+
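+The mapping above reduces to a few lines; the following is a hedged paraphrase of the selection logic (the authoritative version lives in `deepmd/env.py` and may differ in detail):
+
+```python
+import os
+
+import numpy as np
+
+dp_interface_prec = os.environ.get("DP_INTERFACE_PREC", "high").lower()
+if dp_interface_prec == "high":
+    GLOBAL_NP_FLOAT_PRECISION = np.float64
+elif dp_interface_prec == "low":
+    GLOBAL_NP_FLOAT_PRECISION = np.float32
+else:
+    raise RuntimeError(f"Unsupported DP_INTERFACE_PREC value: {dp_interface_prec}")
+
+# Energies stay in double precision in both modes
+GLOBAL_ENER_FLOAT_PRECISION = np.float64
+```
+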
+#### 1.3.2 Model-parameter precision control (precision)
+
+**Scope**: parameter precision of the model components; affects network weights and computation precision
+
+**Where to configure**: the component parameters in input.json
+
+**Allowed values**:
+
+- `"float64"`: double-precision floating point
+- `"float32"`: single-precision floating point
+- `"float16"`: half-precision floating point
+- `"default"`: use the system default precision
+
+**Configuration example**:
+
+```json
+{
+  "model": {
+    "descriptor": {
+      "type": "dpa3",
+      "precision": "float32", // descriptor precision
+      "repflow": {
+        "precision": "float32" // RepFlow component precision
+      }
+    },
+    "fitting_net": {
+      "precision": "float32" // fitting network precision
+    }
+  }
+}
+```
+
+#### 1.3.3 How precision control works
+
+**File location**: `deepmd/pt/model/model/make_model.py:327-337`
+
+During model execution, precision control proceeds as follows:
+
+1. **Input type detection**: `input_type_cast()` detects the precision of the input data
+2. **Global precision conversion**: the input data is converted to `GLOBAL_PT_FLOAT_PRECISION`
+3. **Model computation**: the computation uses the precision specified for each model component
+4. **Output type conversion**: `output_type_cast()` converts the output back to the input precision
+
+**Key code**:
+
+```python
+def input_type_cast(self, coord, box=None, fparam=None, aparam=None):
+    """Cast the input data to global float type."""
+    input_prec = self.reverse_precision_dict[coord.dtype]
+    if input_prec == self.reverse_precision_dict[self.global_pt_float_precision]:
+        return coord, box, fparam, aparam, input_prec
+    else:
+        # convert to the global precision
+        pp = self.global_pt_float_precision
+        return coord.to(pp), box.to(pp) if box is not None else None, ...
+```
+
+#### 1.3.4 Best practices for precision settings
+
+**Memory-sensitive scenarios**:
+
+```bash
+# Low-precision interface + single-precision model
+export DP_INTERFACE_PREC=low
+# Use "precision": "float32" in the model configuration
+```
+
+**High-accuracy scenarios**:
+
+```bash
+# High-precision interface + double-precision model
+export DP_INTERFACE_PREC=high
+# Use "precision": "float64" in the model configuration
+```
+
+**Balancing performance and accuracy**:
+
+```bash
+# The high-precision interface guarantees data precision, while a single-precision model improves compute efficiency
+export DP_INTERFACE_PREC=high
+# Use "precision": "float32" in the model configuration
+```
+
+#### 1.3.5 Notes on precision settings
+
+1. **Compatibility**: `DP_INTERFACE_PREC` affects the interface of the whole DeePMD-kit, whereas the `precision` parameter only affects specific model components
+2. **Performance**: lower precision usually speeds up computation and reduces memory usage
+3. **Numerical stability**: higher precision helps numerical stability, especially early in training
+4. **Energy precision**: energy-related computation always uses `GLOBAL_ENER_FLOAT_PRECISION`, normally float64, so by default model inference still outputs double precision (e.g., when called from LAMMPS)
+
+### 1.4 Quick training and inference
+
+#### 1.4.1 Training commands
+
+```bash
+# Basic training; defaults to the TensorFlow backend
+dp train input.json
+
+# Specify the backend
+dp --pt train input.json
+```
+
+#### 1.4.2 Inference commands
+
+```bash
+# Model testing
+dp test -m dpa3_model.pt -s test_data
+
+# Model freezing
+dp freeze -m dpa3_model.pt -o frozen_model.pth
+```
+
+---
+
+## Part 2: System architecture
+
+### 2.1 Overall architecture design
+
+DPA3 adopts a modular architecture; the complete flow from data input to model output is:
+
+```
+Data input layer
+├── raw coordinates (coord)
+├── atom types (atype)
+├── periodic box (box)
+└── neighbor list (nlist)
+    ↓
+Data processing layer
+├── DeepmdData (data loading)
+├── DpLoaderSet (per-system DataLoaders)
+└── training-level DataLoader (sampling and batching)
+    ↓
+DPA3 descriptor layer
+├── DescrptDPA3 (main descriptor)
+│   ├── TypeEmbedNet (type embedding)
+│   └── DescrptBlockRepflows (RepFlow block)
+│       ├── edge embedding network
+│       ├── angle embedding network
+│       └── list of RepFlow layers
+└── output processing
+    ↓
+Fitting network layer
+├── energy fitting
+├── force fitting
+└── virial fitting
+```
+
+### 2.2 Core component relationships
+
+#### 2.2.1 Class inheritance
+
+```python
+@BaseDescriptor.register("dpa3")
+class DescrptDPA3(BaseDescriptor, torch.nn.Module):
+    """The DPA3 descriptor implementation."""
+
+
+@DescriptorBlock.register("se_repflow")
+class DescrptBlockRepflows(DescriptorBlock):
+    """The RepFlow descriptor block."""
+
+
+class RepFlowLayer(torch.nn.Module):
+    """A single RepFlow layer."""
+```
+
+#### 2.2.2 Component interaction flow
+
+1. **Input processing**: receive the extended coordinates, atom types, and neighbor list
+2. **Type embedding**: compute embedding vectors for the atom types
+3. **RepFlow processing**: node, edge, and angle information is iteratively updated across multiple layers
+4. **Output generation**: produce the final atomic-environment descriptor
+
+### 2.3 Data flow architecture
+
+#### 2.3.1 Two-level DataLoader architecture
+
+```
+raw data (HDF5/.npy files)
+    ↓
+DeepmdData (data-system loading)
+    ↓
+per-system DataLoaders (one DataLoader per system, num_workers=0)
+    ↓
+DpLoaderSet (collection of per-system DataLoaders)
+    ↓
+training-level DataLoader (sampling and batching, num_workers=NUM_WORKERS)
+    ↓
+model input (coord, atype, box, fparam, aparam)
+```
+
+#### 2.3.2 Data transformation flow
+
+1. **Single-frame loading**: `DeepmdDataSetForLoader.__getitem__()` loads a single configuration
+2. **Batch merging**: `collate_batch()` combines multiple frames (see the sketch below)
+3. **Device transfer**: the data is moved to the GPU/CPU
+4. **Input separation**: model inputs are separated from the labels
+
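+A minimal, hedged sketch of steps 2-4; `collate_batch` is the function named above, but the body here is illustrative rather than the actual implementation:
+
+```python
+import numpy as np
+import torch
+
+
+def collate_batch(frames: list[dict]) -> dict[str, torch.Tensor]:
+    # Step 2: stack the per-frame arrays of each key into one batched tensor
+    return {
+        key: torch.stack([torch.as_tensor(np.asarray(f[key])) for f in frames])
+        for key in frames[0]
+    }
+
+
+frames = [{"coord": np.zeros((3, 3)), "energy": np.zeros(1)} for _ in range(4)]
+batch = collate_batch(frames)
+# Step 3: move the batch to the target device ("cuda" on GPU machines)
+batch = {k: v.to("cpu") for k, v in batch.items()}
+# Step 4: split model inputs from labels
+inputs = {k: v for k, v in batch.items() if k in ("coord", "atype", "box")}
+labels = {k: v for k, v in batch.items() if k not in inputs}
+```
+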
+### 2.4 The complete hierarchy of the DeePMD-kit model architecture
+
+#### 2.4.1 Design philosophy of the model architecture
+
+DeePMD-kit adopts a layered, modular design: from the lowest-level per-atom computation up to the top-level complete model, every layer has clearly assigned responsibilities. Understanding this hierarchy is essential for understanding how DeePMD-kit works.
+
+#### 2.4.2 The complete model hierarchy
+
+##### 2.4.2.1 The atomic model level (AtomicModel)
+
+**The most basic computational unit** - responsible for computing atom-level physical quantities:
+
+```text
+# Abstract base class layer
+ABC + PluginVariant + make_plugin_registry("atomic model")
+  ↓
+BaseAtomicModel_ (dynamically generated by make_base_atomic_model())
+  ↓
+BaseAtomicModel (deepmd/dpmodel/atomic_model/base_atomic_model.py:42)
+  ↓
+DPAtomicModel (deepmd/dpmodel/atomic_model/dp_atomic_model.py:29) - registered as "standard"
+  ↓
+Concrete atomic models for physical properties:
+├── DPEnergyAtomicModel (energy model)
+├── DPDipoleAtomicModel (dipole model)
+├── DPPolarAtomicModel (polarizability model)
+├── DPDOSAtomicModel (density-of-states model)
+└── DPPropertyAtomicModel (generic property model)
+```
+
+**Role and usage**:
+
+- **Core computational unit**: contains the descriptor (Descriptor) + fitting network (Fitting)
+- **Atom-level prediction**: predicts per-atom energies and other physical quantities
+- **Not used directly for training**: it is a component invoked by higher-level models
+- **Core of the physics**: all physical computation happens here
+
+##### 2.4.2.2 The complete model level (Model)
+
+**The model actually used for training and inference**:
+
+```text
+# Abstract base class layer
+ABC + PluginVariant + make_plugin_registry("model")
+  ↓
+BaseBaseModel (dynamically generated by make_base_model())
+  ↓
+BaseModel (deepmd/dpmodel/model/base_model.py:175)
+  ↓
+DPModelCommon (provides common methods such as update_sel)
+  ↓
+model classes dynamically generated via make_model(T_AtomicModel)
+  ↓
+Concrete complete models:
+├── EnergyModel (deepmd/pt/model/model/ener_model.py:30) - registered as "ener"
+├── DipoleModel - registered as "dipole"
+├── PolarModel - registered as "polar"
+├── DOSModel - registered as "dos"
+└── PropertyModel - registered as "property"
+```
+
+**Role and usage**:
+
+- **Training and inference interface**: this is the model created by `dp train input.json`
+- **System-level functionality**: wraps the atomic model and adds neighbor-list construction, coordinate transformation, batching, etc.
+- **Gradient computation**: forces and stresses are computed automatically
+- **Output format conversion**: converts atom-level outputs into the standard format
+
+##### 2.4.2.3 The special model level (LinearModel/ZBLModel)
+
+**Linear combinations and special models**:
+
+```text
+BaseAtomicModel
+  ↓
+LinearEnergyAtomicModel (deepmd/dpmodel/atomic_model/linear_atomic_model.py:42) - registered as "linear"
+  ↓
+DPZBLLinearEnergyAtomicModel (linear combination of the DP and ZBL models)
+  ↓
+complete model generated via make_model(DPZBLLinearEnergyAtomicModel)
+  ↓
+DPZBLModel (deepmd/dpmodel/model/dp_zbl_model.py:28) - registered as "zbl"
+```
+
+**Role and usage**:
+
+- **Model combination**: linearly combines several atomic models
+- **Physical correction**: DPZBLModel combines a deep potential with the ZBL potential
+- **Special applications**: handles special physics such as short-range repulsion
+
+#### 2.4.3 Model creation and usage flow
+
+##### 2.4.3.1 Model creation during training
+
+```text
+# 1. User configuration
+"model": {"type": "ener"}  # in input.json
+
+# 2. Training script execution
+dp train input.json
+  ↓
+# 3. Model factory creation (deepmd/pt/entrypoints/main.py:248)
+model = get_model(model_params)  # returns an EnergyModel instance
+  ↓
+# 4. EnergyModel initialization flow
+# 4a. Create a DPEnergyAtomicModel instance (the atom-level computation core)
+# 4b. Wrap it into a complete model via make_model() (adds system-level functionality)
+# 4c. Inherit from DPModelCommon (adds common methods)
+  ↓
+# 5. Calls inside the training loop
+loss = model.forward(coord, atype, box, ...)  # EnergyModel.forward()
+  ↓
+# 6. Internal call chain
+# forward() -> forward_common() -> forward_common_lower() -> forward_atomic()
+```
+
+##### 2.4.3.2 Model loading during inference
+
+```text
+# 1. Model loading
+model = torch.jit.load("frozen_model.pth")  # actually an EnergyModel instance
+  ↓
+# 2. Inference call
+output = model(coord, atype, box)  # EnergyModel.forward()
+  ↓
+# 3. Standard output format
+{"energy": ..., "force": ..., "virial": ...}
+```
+
+#### 2.4.4 Design patterns and architectural advantages
+
+##### 2.4.4.1 Core design patterns
+
+**1. Factory pattern**
+
+- `make_base_atomic_model()`: dynamically generates the atomic-model base class
+- `make_base_model()`: dynamically generates the final model base class
+- `make_model(T_AtomicModel)`: wraps an atomic model into a complete model
+
+**2. Registration mechanism**
+
+- The different model types are registered via `@BaseAtomicModel.register()` and `@BaseModel.register()`
+- Model instances can be created dynamically from string names
+
+**3. Composition pattern**
+
+- **DPAtomicModel**: composed of a descriptor (Descriptor) + fitting network (Fitting)
+- **LinearEnergyAtomicModel**: linear combination of several atomic models
+- **DPZBLLinearEnergyAtomicModel**: a special linear combination of a DP model and the ZBL potential
+
+**4. Multi-backend support**
+Each backend (PyTorch/TensorFlow/JAX/Paddle) has its own implementation that follows the same interface while being optimized for the specific framework.
+
+##### 2.4.4.2 Architectural advantages
+
+**Modularity**:
+
+- Descriptors and fitting networks can be developed and combined independently
+- Predictions of different physical quantities share the same framework
+
+**Extensibility**:
+
+- New physical properties or model types are easy to add
+- Custom descriptors and fitting networks are supported
+
+**Multi-backend support**:
+
+- A single set of interfaces supports several deep learning frameworks
+- High code reuse and maintenance efficiency
+
+**Type safety**:
+
+- The registration mechanism guarantees the correctness of model types
+- Compile-time type checks plus runtime validation
+
+#### 2.4.7 Model compression (enable_compression)
+
+Model compression is an important performance optimization in DeePMD-kit. It accelerates model inference through tabulation and is particularly suited for production deployment.
+
+##### 2.4.7.1 The compression call chain
+
+**Compression entry point** (`deepmd/pt/entrypoints/compress.py:75`):
+
+```python
+model.enable_compression(
+    extrapolate,  # extrapolation scale
+    stride,  # stride 1
+    stride * 10,  # stride 2
+)
+```
+
+**Hierarchy of the compression methods**:
+
+```
+top-level model compression (make_model.py:246-266)
+  ↓
+model.enable_compression()
+  ↓
+self.atomic_model.enable_compression(
+    self.get_min_nbor_dist(),  # obtain the minimum neighbor distance
+    table_extrapolate,
+    table_stride_1,
+    table_stride_2,
+    check_frequency,
+)
+  ↓
+concrete compression implementations in the atomic model and the descriptor
+```
+
+##### 2.4.7.2 Compression parameters
+
+**Key parameters**:
+
+- `table_extrapolate`: the extrapolation scale of the model; controls the extrapolation range of the tables
+- `table_stride_1`: the uniform stride of the first table; affects short-range accuracy
+- `table_stride_2`: the uniform stride of the second table; affects long-range accuracy
+- `check_frequency`: the overflow check frequency, used to monitor numerical stability
+- `get_min_nbor_dist()`: dynamically obtains the minimum neighbor distance in the training data
+
+##### 2.4.7.3 How the compression mechanism works
+
+**Tabulation speedup** (illustrated by the sketch after this list):
+
+1. **Distance discretization**: continuous interatomic distances are discretized into table indices
+2. **Precomputation and storage**: descriptor values over the commonly used distance range are precomputed and stored
+3. **Interpolated lookup**: during inference, interpolated table lookups replace the expensive neural-network evaluation
+4. **Memory for time**: some memory is traded for a significant computational speedup
+
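+The idea behind steps 1-3 can be shown with a generic, hedged interpolation sketch (this is not the actual DeePMD-kit tabulation code; `np.tanh` stands in for the network being tabulated):
+
+```python
+import numpy as np
+
+
+def build_table(fn, r_min: float, r_max: float, stride: float):
+    # Step 2: precompute fn on a uniform grid (one table, one stride)
+    grid = np.arange(r_min, r_max + stride, stride)
+    return grid, fn(grid)
+
+
+def lookup(grid, values, r):
+    # Step 1: discretize the distance into a table index
+    stride = grid[1] - grid[0]
+    idx = np.clip(((r - grid[0]) // stride).astype(int), 0, len(grid) - 2)
+    # Step 3: linear interpolation replaces the network evaluation
+    t = (r - grid[idx]) / stride
+    return (1.0 - t) * values[idx] + t * values[idx + 1]
+
+
+grid, table = build_table(np.tanh, 0.0, 6.0, stride=0.01)
+approx = lookup(grid, table, np.array([1.234, 2.5]))
+```
+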
+**Multi-level table strategy**:
+
+- **High short-range accuracy**: `table_stride_1` controls the high-accuracy short-range table
+- **Moderate long-range accuracy**: `table_stride_2` controls the moderate-accuracy long-range table
+- **Smooth transition**: the two tables transition smoothly into each other, avoiding discontinuities
+
+##### 2.4.7.4 Application scenarios and advantages of compression
+
+**Suitable scenarios**:
+
+- **Production deployment**: MD simulations that invoke model inference at high frequency
+- **Large systems**: huge numbers of atoms with limited compute resources
+- **Real-time simulation**: applications with strict requirements on inference speed
+
+**Performance advantages**:
+
+- **Inference speedup**: speedups from several-fold to tens-of-fold are achievable
+- **Controllable memory**: the table size is flexibly controlled via the stride parameters
+- **Accuracy balance**: an optimal balance between speed and accuracy can be found
+
+##### 2.4.7.5 Usage recommendations for compression
+
+**Parameter tuning strategies**:
+
+```python
+# High-accuracy scenario - smaller strides, higher accuracy
+model.enable_compression(
+    extrapolate=5.0,
+    stride_1=0.005,  # smaller short-range stride
+    stride_2=0.05,  # smaller long-range stride
+)
+
+# High-performance scenario - larger strides, faster inference
+model.enable_compression(
+    extrapolate=3.0,
+    stride_1=0.02,  # larger short-range stride
+    stride_2=0.2,  # larger long-range stride
+)
+```
+
+**Best practices**:
+
+1. **Validate**: after compression, always verify that the model accuracy still meets the requirements
+2. **Tune parameters**: adjust the stride parameters for the concrete application
+3. **Monitor memory**: keep an eye on memory usage after compression
+4. **Benchmark**: quantitatively measure the performance gain brought by compression
+
+#### 2.4.8 Division of roles in practice
+
+**For users**:
+
+- **Only the final models matter**: EnergyModel, DipoleModel, etc.
+- **Simple configuration**: the model type is specified via the JSON configuration file
+- **Unified interface**: all models use the same training and inference interfaces
+
+**For developers**:
+
+- **Clear layering**: every layer has clearly assigned responsibilities
+- **Easy to extend**: new functionality is added at the appropriate layer
+- **Code reuse**: the factory pattern avoids duplicated code
+
+#### 2.4.9 Model architecture summary
+
+The division of roles in 2.4.8 summarizes the architecture from the outside; at its center sits the core base class:
+
+**Core base class definition** (`deepmd/pt/model/atomic_model/dp_atomic_model.py:34`):
+
+```python
+@BaseAtomicModel.register("standard")
+class DPAtomicModel(BaseAtomicModel):
+    """Model give atomic prediction of some physical property.
+
+    Parameters
+    ----------
+    descriptor
+            Descriptor
+    fitting_net
+            Fitting net
+    type_map
+            Mapping atom type to the name (str) of the type.
+    """
+```
+
+#### 2.4.6 The multi-level architecture of the forward methods
+
+DeePMD-kit contains several different forward methods, each with its own purpose and call level. Understanding their division of labor and call relations is essential for understanding the model's execution flow.
+
+##### 2.4.6.1 The forward method hierarchy
+
+**1. User interface layer** - `forward()`
+
+```text
+# deepmd/pt/model/model/ener_model.py:94
+def forward(self, coord, atype, box=None, fparam=None, aparam=None, do_atomic_virial=False)
+```
+
+**Purpose**:
+
+- **The highest-level user interface**, called directly during training and inference
+- Receives the raw coordinates, atom types, and box information
+- Returns physical quantities in the standard format `{"energy": ..., "force": ..., "virial": ...}`
+
+**When to use**:
+
+- Loss computation during training
+- Prediction during inference
+- The interface called by MD engines such as LAMMPS
+
+**2. Coordinate processing layer** - `forward_common()`
+
+```text
+# deepmd/pt/model/model/make_model.py:152
+def forward_common(self, coord, atype, box=None, fparam=None, aparam=None, do_atomic_virial=False)
+```
+
+**Purpose**:
+
+- **Handles coordinate transformation and neighbor-list construction**
+- Converts the raw coordinates into extended coordinates (including ghost atoms)
+- Builds the neighbor list
+- Calls the lower-level `forward_common_lower()`
+
+**Internal workflow**:
+
+```python
+# 1. Coordinate normalization and extension
+extended_coord, extended_atype, mapping = extend_coord_with_ghosts(...)
+# 2. Neighbor-list construction
+nlist = build_neighbor_list(...)
+# 3. Low-level computation
+model_ret = self.forward_common_lower(extended_coord, extended_atype, nlist, ...)
+```
+
+**3. Low-level computation layer** - `forward_common_lower()`
+
+```text
+# deepmd/pt/model/model/make_model.py:278
+def forward_common_lower(self, extended_coord, extended_atype, nlist, mapping=None, ...)
+```
+
+**Purpose**:
+
+- **The actual model computation logic**
+- Receives the already-processed extended coordinates and neighbor list
+- Calls the atomic model to perform the actual computation
+- Handles output format conversion and reduction
+
+**4. External interface layer** - `forward_lower()`
+
+```text
+# deepmd/pt/model/model/ener_model.py:135
+def forward_lower(self, extended_coord, extended_atype, nlist, mapping=None, ...)
+```
+
+**Purpose**:
+
+- **The low-level interface exposed to external programs** (such as the LAMMPS plugin)
+- The external program has already prepared the neighbor list, so DeePMD does not rebuild it
+- Calls `forward_common_lower()` directly
+- Returns results for the extended region (no reduction); see the usage sketch below
+
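+A hedged usage sketch contrasting the two entry points; it assumes a frozen model file exists on disk, and the tensor shapes and box layout are illustrative:
+
+```python
+import torch
+
+model = torch.jit.load("frozen_model.pth")  # actually an EnergyModel instance
+
+coord = torch.rand(1, 3, 3, dtype=torch.float64)  # 1 frame, 3 atoms
+atype = torch.tensor([[0, 1, 1]])  # per-atom types
+box = (10.0 * torch.eye(3, dtype=torch.float64)).reshape(1, 9)
+
+# High-level interface: the model extends the coordinates with ghost atoms
+# and builds the neighbor list internally
+out = model(coord, atype, box)  # {"energy", "force", "virial", ...}
+
+# Low-level interface: the caller (e.g. a LAMMPS plugin) supplies ghost-atom
+# coordinates and a prebuilt neighbor list, so nothing is rebuilt here
+# out = model.forward_lower(extended_coord, extended_atype, nlist, mapping)
+```
+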
原子级计算层** - `forward_atomic()`
+
+```text
+# deepmd/pt/model/atomic_model/dp_atomic_model.py:273
+def forward_atomic(self, extended_coord, extended_atype, nlist, mapping=None, ...)
+```
+
+**用途**:
+
+- **最底层的原子级计算**
+- 描述器(Descriptor)计算原子环境表示
+- 拟合网络(Fitting)预测原子能量/力等
+- 返回原子级的预测结果
+
+##### 2.4.10.2 Forward 方法调用关系链
+
+**训练/推理时的完整调用链:**
+
+```text
+# 用户调用
+model.forward(coord, atype, box)
+    ↓
+# 坐标处理
+model.forward_common(coord, atype, box)
+    ↓
+# 坐标扩展 + 邻居列表构建
+extended_coord, nlist = preprocess(...)
+    ↓
+# 底层计算
+model.forward_common_lower(extended_coord, extended_atype, nlist)
+    ↓
+# 原子模型计算
+atomic_ret = self.atomic_model.forward_atomic(extended_coord, extended_atype, nlist)
+    ↓
+# 输出转换和 reduction
+return transform_output(atomic_ret)
+```
+
+**LAMMPS 等外部程序调用:**
+
+```text
+# 外部程序已经有邻居列表
+model.forward_lower(extended_coord, extended_atype, nlist, mapping)
+    ↓
+# 直接底层计算
+model.forward_common_lower(extended_coord, extended_atype, nlist, mapping)
+    ↓
+# 原子模型计算
+atomic_ret = self.atomic_model.forward_atomic(...)
+```
+
+##### 2.4.10.3 设计多层次 Forward 的原因
+
+**1. 性能优化**
+
+- `forward_lower()`: 外部程序可以复用邻居列表,避免重复计算
+- `forward_common_lower()`: 批处理时可以直接使用预构建的数据
+
+**2. 接口灵活性**
+
+- `forward()`: 简单易用的高级接口
+- `forward_lower()`: 高性能的底层接口
+
+**3. 代码复用**
+
+- `forward_common()`: 坐标处理逻辑可以被多种模型复用
+- `forward_atomic()`: 原子级计算与系统级处理分离
+
+**4. 调试和测试**
+
+- 可以单独测试每个层级的功能
+- 便于定位性能瓶颈
+
+##### 2.4.10.4 实际使用建议
+
+**对于普通用户**:
+
+- **只需关心 `forward()`**: 训练和推理的标准接口
+- **偶尔使用 `forward_lower()`**: 如果你要写 MD 插件或需要高性能推理
+
+**对于开发者**:
+
+- **`forward_common` 系列**: 理解内部实现和优化的关键
+- **`forward_atomic()`**: 自定义原子模型时需要实现的核心方法
+
+**性能优化场景**:
+
+- **外部邻居列表**: 使用 `forward_lower()` 避免重复计算
+- **批处理优化**: 直接调用 `forward_common_lower()` 处理预处理好的数据
+- **调试分析**: 单独调用 `forward_atomic()` 分析原子级计算
+
+#### 2.4.11 具体派生模型
+
+**能量模型** (`deepmd/pt/model/atomic_model/energy_atomic_model.py:13`):
+
+```python
+class DPEnergyAtomicModel(DPAtomicModel):
+    def __init__(self, descriptor, fitting, type_map, **kwargs):
+        if not (
+            isinstance(fitting, EnergyFittingNet)
+            or isinstance(fitting, EnergyFittingNetDirect)
+            or isinstance(fitting, InvarFitting)
+        ):
+            raise TypeError(
+                "fitting must be an instance of EnergyFittingNet, "
+                "EnergyFittingNetDirect or InvarFitting for DPEnergyAtomicModel"
+            )
+        super().__init__(descriptor, fitting, type_map, **kwargs)
+```
+
+**偶极矩模型** (`deepmd/pt/model/atomic_model/dipole_atomic_model.py:14`):
+
+```python
+class DPDipoleAtomicModel(DPAtomicModel):
+    def __init__(self, descriptor, fitting, type_map, **kwargs):
+        if not isinstance(fitting, DipoleFittingNet):
+            raise TypeError(
+                "fitting must be an instance of DipoleFittingNet for DPDipoleAtomicModel"
+            )
+        super().__init__(descriptor, fitting, type_map, **kwargs)
+
+    def apply_out_stat(self, ret: dict[str, torch.Tensor], atype: torch.Tensor):
+        # dipole not applying bias
+        return ret
+```
+
+**极化率模型** (`deepmd/pt/model/atomic_model/polar_atomic_model.py:14`):
+
+```python
+class DPPolarAtomicModel(DPAtomicModel):
+    def __init__(self, descriptor, fitting, type_map, **kwargs):
+        if not isinstance(fitting, PolarFittingNet):
+            raise TypeError(
+                "fitting must be an instance of PolarFittingNet for DPPolarAtomicModel"
+            )
+        super().__init__(descriptor, fitting, type_map, **kwargs)
+```
+
+#### 2.4.12 DPAtomicModel 核心功能
+
+**原子级前向传播** (`dp_atomic_model.py:205-265`):
+
+```python
+def forward_atomic(
+    self,
+    extended_coord,
+    extended_atype,
+    nlist,
+    mapping: Optional[torch.Tensor] = None,
+    fparam: Optional[torch.Tensor] = None,
+    aparam: Optional[torch.Tensor] = None,
+    
comm_dict: Optional[dict[str, torch.Tensor]] = None,
+) -> dict[str, torch.Tensor]:
+    """Return atomic prediction.
+
+    Parameters
+    ----------
+    extended_coord
+        coordinates in extended region
+    extended_atype
+        atomic type in extended region
+    nlist
+        neighbor list. nf x nloc x nsel
+    mapping
+        mapps the extended indices to local indices
+    fparam
+        frame parameter. nf x ndf
+    aparam
+        atomic parameter. nf x nloc x nda
+
+    Returns
+    -------
+    result_dict
+        the result dict, defined by the `FittingOutputDef`.
+    """
+    # 1. 数据类型转换和梯度设置
+    nframes, nloc, nnei = nlist.shape
+    atype = extended_atype[:, :nloc]
+    if self.do_grad_r() or self.do_grad_c():
+        extended_coord.requires_grad_(True)
+
+    # 2. 描述符计算
+    descriptor, rot_mat, g2, h2, sw = self.descriptor(
+        extended_coord, extended_atype, nlist, mapping=mapping, comm_dict=comm_dict
+    )
+
+    # 3. 拟合网络计算
+    fit_ret = self.fitting_net(
+        descriptor, atype, gr=rot_mat, g2=g2, h2=h2, fparam=fparam, aparam=aparam
+    )
+
+    return fit_ret
+```
+
+**模型工厂集成** (`deepmd/pt/model/model/__init__.py`):
+
+```python
+def get_model(model_params):
+    model_type = model_params.get("type", "standard")
+    if model_type == "standard":
+        if "spin" in model_params:
+            return get_spin_model(model_params)
+        elif "use_srtab" in model_params:
+            return get_zbl_model(model_params)
+        else:
+            return get_standard_model(model_params)
+    # ... 其他模型类型
+```
+
+#### 2.4.13 在整体系统中的作用
+
+1. **模型创建**: 通过 `get_model()` 函数根据配置参数创建适当的 DPAtomicModel 实例
+2. **训练集成**: 在 `Trainer` 类中被包装用于训练过程
+3. **推理支持**: 在 `DeepEval` 类中用于模型推理和部署
+4. **多任务支持**: 支持多种物理性质的联合训练和预测
+
+DPAtomicModel 通过统一的接口和灵活的设计,为 DPA3 描述符与各种拟合网络的组合提供了标准化的实现框架。
+
+---
+
+## 第三部分:详细实现
+
+### 3.1 DPA3 核心实现
+
+#### 3.1.1 初始化过程 (`__init__`)
+
+**文件位置**: `deepmd/pt/model/descriptor/dpa3.py:105-171`
+
+```text
+def __init__(self,
+             ntypes: int,
+             repflow: Union[RepFlowArgs, dict],
+             concat_output_tebd: bool = False,
+             activation_function: str = "silu",
+             precision: str = "float64",
+             exclude_types: list[tuple[int, int]] = [],
+             env_protection: float = 0.0,
+             trainable: bool = True,
+             seed: Optional[Union[int, list[int]]] = None,
+             use_econf_tebd: bool = False,
+             use_tebd_bias: bool = False,
+             use_loc_mapping: bool = True,
+             type_map: Optional[list[str]] = None):
+```
+
+**关键组件初始化**:
+
+1. **RepFlow 参数处理**:
+
+   ```python
+   self.repflow_args = init_subclass_params(repflow, RepFlowArgs)
+   ```
+
+2. **类型嵌入网络**:
+
+   ```python
+   self.type_embedding = TypeEmbedNetConsistent(
+       ntypes=ntypes,
+       embedding_dim=tebd_dim,
+       precision=precision,
+       seed=child_seed(seed, 0),
+       use_econf_tebd=use_econf_tebd,
+       type_map=type_map,
+   )
+   ```
+
+3. **RepFlow 块创建**:
+
+   ```python
+   self.repflows = DescrptBlockRepflows(
+       self.repflow_args.e_rcut,
+       self.repflow_args.e_rcut_smth,
+       self.repflow_args.e_sel,
+       self.repflow_args.a_rcut,
+       self.repflow_args.a_rcut_smth,
+       self.repflow_args.a_sel,
+       ntypes=ntypes,
+       n_dim=self.repflow_args.n_dim,
+       e_dim=self.repflow_args.e_dim,
+       a_dim=self.repflow_args.a_dim,
+       # ... 其他参数
+   )
+   ```
+
+#### 3.1.2 前向传播过程 (`forward`)
+
+**文件位置**: `deepmd/pt/model/descriptor/dpa3.py:430-498`
+
+**输入参数**:
+
+- `extended_coord`: 扩展坐标 [nf × (nall × 3)]
+- `extended_atype`: 扩展原子类型 [nf × nall]
+- `nlist`: 邻居列表 [nf × nloc × nnei]
+- `mapping`: 索引映射 (可选)
+- `comm_dict`: 并行通信数据 (可选)
+
+**处理流程**:
+
+```python
+def forward(self, extended_coord, extended_atype, nlist, mapping=None, comm_dict=None):
+    # 1. 
数据类型转换 + extended_coord = extended_coord.to(dtype=self.prec) + nframes, nloc, nnei = nlist.shape + nall = extended_coord.view(nframes, -1).shape[1] // 3 + + # 2. 类型嵌入计算 + if not parallel_mode and self.use_loc_mapping: + node_ebd_ext = self.type_embedding(extended_atype[:, :nloc]) + else: + node_ebd_ext = self.type_embedding(extended_atype) + node_ebd_inp = node_ebd_ext[:, :nloc, :] + + # 3. RepFlow 计算 + node_ebd, edge_ebd, h2, rot_mat, sw = self.repflows( + nlist, + extended_coord, + extended_atype, + node_ebd_ext, + mapping, + comm_dict=comm_dict, + ) + + # 4. 输出拼接处理 + if self.concat_output_tebd: + node_ebd = torch.cat([node_ebd, node_ebd_inp], dim=-1) + + return node_ebd, rot_mat, edge_ebd, h2, sw +``` + +**输出说明**: + +- `node_ebd`: 节点描述符 [nf × nloc × n_dim] +- `rot_mat`: 旋转矩阵 [nf × nloc × e_dim × 3] +- `edge_ebd`: 边嵌入 [nf × nloc × nnei × e_dim] +- `h2`: 对表示 [nf × nloc × nnei × 3] +- `sw`: 平滑开关函数 [nf × nloc × nnei] + +### 3.2 RepFlow 块实现 + +#### 3.2.1 初始化组件 + +**文件位置**: `deepmd/pt/model/descriptor/repflows.py:77-200` + +```text +class DescrptBlockRepflows(DescriptorBlock): + def __init__(self, + n_dim: int = 128, + e_dim: int = 16, + a_dim: int = 64, + nlayers: int = 3, + e_rcut: float = 6.0, + e_rcut_smth: float = 0.5, + e_sel: int = 120, + a_rcut: float = 4.0, + a_rcut_smth: float = 0.5, + a_sel: int = 40, + # ... 其他参数 + ): +``` + +**关键组件**: + +1. **边嵌入网络**: + + ```python + self.edge_embd = MLPLayer( + 1, + e_dim, + activation=activation_function, + precision=precision, + seed=child_seed(seed, 1), + ) + ``` + +2. **角度嵌入网络**: + + ```python + self.angle_embd = MLPLayer( + 1, + a_dim, + activation=activation_function, + precision=precision, + seed=child_seed(seed, 2), + ) + ``` + +3. **RepFlow 层列表**: + ```python + self.layers = torch.nn.ModuleList() + for ii in range(nlayers): + self.layers.append( + RepFlowLayer( + e_rcut, + e_rcut_smth, + e_sel, + a_rcut, + a_rcut_smth, + a_sel, + ntypes, + n_dim, + e_dim, + a_dim, + ..., + ) + ) + ``` + +#### 3.2.2 前向传播流程 + +**文件位置**: `deepmd/pt/model/descriptor/repflows.py:429-647` + +```python +def forward( + self, + nlist, + extended_coord, + extended_atype, + extended_atype_embd=None, + mapping=None, + comm_dict=None, +): + # 1. 环境矩阵计算 + dmatrix, diff, sw = prod_env_mat( + extended_coord, + nlist, + self.e_rcut, + self.e_rcut_smth, + protection=self.env_protection, + ) + + # 2. 边和角度邻居列表处理 + # 生成边邻居列表和角度邻居列表 + + # 3. 嵌入计算 + edge_input = dmatrix.unsqueeze(-1) # [nf, nloc, nnei, 1] + edge_ebd = self.act(self.edge_embd(edge_input)) + + # 4. 角度信息计算 + angle_input = ... # 计算角度信息 + angle_ebd = self.angle_embd(angle_input) + + # 5. RepFlow 层迭代 + for idx, ll in enumerate(self.layers): + node_ebd, edge_ebd, angle_ebd = ll.forward( + node_ebd, edge_ebd, angle_ebd, nlist, extended_coord, extended_atype, ... + ) + + return node_ebd, edge_ebd, h2, rot_mat, sw +``` + +### 3.3 RepFlow 层实现 + +#### 3.3.1 层初始化 + +**文件位置**: `deepmd/pt/model/descriptor/repflow_layer.py:38-200` + +```text +class RepFlowLayer(torch.nn.Module): + def __init__(self, + e_rcut: float, + e_rcut_smth: float, + e_sel: int, + a_rcut: float, + a_rcut_smth: float, + a_sel: int, + ntypes: int, + n_dim: int = 128, + e_dim: int = 16, + a_dim: int = 64, + # ... 其他参数 + ): +``` + +#### 3.3.2 主要功能 + +1. **节点更新**: 基于边和角度信息更新节点表示 +2. **边更新**: 基于节点和角度信息更新边表示 +3. **角度更新**: 基于节点和边信息更新角度表示 +4. 
**残差连接**: 支持多种残差连接策略
+
+### 3.4 关键依赖和支持模块
+
+#### 3.4.1 网络组件
+
+- **MLP 网络**: `deepmd/pt/model/network/mlp.py`
+  - `MLPLayer`: 多层感知机实现
+  - `TypeEmbedNet`: 类型嵌入网络
+  - `TypeEmbedNetConsistent`: 一致性类型嵌入网络
+
+- **网络工具**: `deepmd/pt/model/network/network.py`
+  - 激活函数
+  - 网络初始化工具
+  - 图操作工具函数
+
+#### 3.4.2 工具函数
+
+- **环境矩阵**: `deepmd/pt/model/descriptor/env_mat.py`
+  - `prod_env_mat`: 环境矩阵计算
+  - 距离和角度计算
+
+- **邻居列表**: `deepmd/pt/utils/nlist.py`
+  - 邻居列表生成和处理
+  - 排除掩码处理
+
+- **环境配置**: `deepmd/pt/utils/env.py`
+  - 设备配置
+  - 数据精度设置
+  - 并行计算配置
+
+#### 3.4.3 统计和预处理
+
+- **环境矩阵统计**: `deepmd/pt/utils/env_mat_stat.py`
+  - 邻居统计
+  - 数据预处理
+
+- **排除掩码**: `deepmd/pt/utils/exclude_mask.py`
+  - 原子类型排除处理
+  - 掩码生成
+
+### 3.5 PyTorch 后端能量求和机制
+
+#### 3.5.1 深度势能原理的实现
+
+根据深度势能的基本原理,系统的总能量等于系统中每个原子局部环境能量的总和。这一原理在 PyTorch 后端中通过**分离的两阶段计算**得到精确实现,确保了模型的物理正确性和能量守恒。
+
+**核心公式**:
+
+```
+E_total = Σ E_i
+```
+
+其中 E_i 是第 i 个原子的局部环境能量。
+
+#### 3.5.2 原子级能量计算阶段
+
+**文件位置**: `deepmd/pt/model/task/fitting.py:473-614`
+
+在拟合网络的 `_forward_common` 方法中,每个原子的能量被独立计算:
+
+```text
+def _forward_common(self, descriptor, atype, ...):
+    # descriptor shape: [nf, nloc, nd] - 原子环境描述符
+    xx = descriptor  # 源码中统一以 xx 指代描述符输入
+    nf, nloc, nd = xx.shape
+    results = {}
+
+    # 初始化输出张量
+    outs = torch.zeros((nf, nloc, net_dim_out), dtype=self.prec, device=descriptor.device)
+
+    if self.mixed_types:
+        # 混合类型模式:统一网络处理所有原子类型
+        atom_property = self.filter_layers.networks[0](xx)  # 神经网络计算
+        outs = outs + atom_property + self.bias_atom_e[atype].to(self.prec)
+    else:
+        # 非混合类型模式:每种原子类型使用独立网络
+        for type_i, ll in enumerate(self.filter_layers.networks):
+            mask = (atype == type_i).unsqueeze(-1)
+            mask = torch.tile(mask, (1, 1, net_dim_out))
+            atom_property = ll(xx)  # 特定类型的神经网络计算
+            atom_property = atom_property + self.bias_atom_e[type_i].to(self.prec)
+            atom_property = torch.where(mask, atom_property, 0.0)
+            outs = outs + atom_property
+
+    # 应用排除掩码
+    mask = self.emask(atype).to(torch.bool)
+    outs = torch.where(mask[:, :, None], outs, 0.0)
+
+    # 返回原子级能量,shape: [nf, nloc, net_dim_out]
+    results.update({self.var_name: outs})
+    return results
+```
+
+**关键特征**:
+
+- **原子级输出**: 网络输出为 `[nf, nloc, net_dim_out]`,每个原子都有独立的能量贡献
+- **类型特定处理**: 支持混合类型和非混合类型两种计算模式
+- **局部环境原理**: 每个原子的能量只依赖于其局部环境描述符,符合深度势能的核心思想
+- **类型偏置**: 每种原子类型都有特定的偏置能量 `bias_atom_e`
+
+#### 3.5.3 系统能量求和阶段
+
+**文件位置**: `deepmd/pt/model/model/transform_output.py:153-192`
+
+**重要发现**: 原子级能量到系统能量的转换是在 `fit_output_to_model_output` 函数中完成的,而不是在拟合网络中!
+
+```text
+def fit_output_to_model_output(fit_ret, fit_output_def, coord_ext, ...):
+    redu_prec = env.GLOBAL_PT_ENER_FLOAT_PRECISION
+    model_ret = dict(fit_ret.items())
+
+    for kk, vv in fit_ret.items():
+        vdef = fit_output_def[kk]
+        shap = vdef.shape  # 对于能量,shap = [1]
+        atom_axis = -(len(shap) + 1)  # atom_axis = -2 (原子维度)
+
+        if vdef.reducible:
+            kk_redu = get_reduce_name(kk)  # "energy" -> "energy_redu"
+            if vdef.intensive:
+                # 强度性质:计算平均原子能量
+                model_ret[kk_redu] = torch.mean(vv.to(redu_prec), dim=atom_axis)
+            else:
+                # 广延性质:计算总和
+                model_ret[kk_redu] = torch.sum(vv.to(redu_prec), dim=atom_axis)
+
+        # 力和维里的自动微分计算
+        if vdef.r_differentiable:
+            kk_derv_r, kk_derv_c = get_deriv_name(kk)
+            dr, dc = take_deriv(vv, model_ret[kk_redu], vdef, coord_ext, ...)
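+            # 注:take_deriv 基于自动微分对约简后的输出关于 coord_ext 求导
+            # (力 ~ -dE/dr,维里由坐标导数组合得到),梯度在此处保持可传递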
+            model_ret[kk_derv_r] = dr
+            if vdef.c_differentiable:
+                model_ret[kk_derv_c] = dc
+                model_ret[kk_derv_c + "_redu"] = torch.sum(model_ret[kk_derv_c].to(redu_prec), dim=1)
+
+    return model_ret
+```
+
+**能量求和详解**:
+
+- **输入**: `vv` shape `[nf, nloc, 1]` - 原子级能量
+- **求和操作**: 能量是广延性质(`intensive=False`),因此执行 `torch.sum(vv, dim=-2)`,对原子维度求和
+- **输出**: `energy_redu` shape `[nf, 1]` - 系统能量
+- **物理意义**: 系统能量 = 各原子局部环境能量之和,即 E_total = Σ E_i
+- **求和策略**: 通过 `vdef.intensive` 控制使用求和(广延性质)还是求平均(强度性质)
+
+#### 3.5.4 损失函数中的能量处理
+
+**文件位置**: `deepmd/pt/loss/ener.py:319-329`
+
+在训练过程中,能量损失按原子数量归一化:
+
+```text
+def forward(self, model_pred, label, natoms, ...):
+    # 系统能量预测值
+    energy_pred = model_pred["energy"]  # shape: [nf, 1]
+    energy_label = label["energy"]  # shape: [nf, 1]
+
+    # 计算能量损失
+    l2_ener_loss = torch.mean(torch.square(energy_pred - energy_label))
+
+    # 按原子数量归一化 (per atom loss)
+    atom_norm = 1.0 / natoms
+    loss += atom_norm * (pref_e * l2_ener_loss)
+```
+
+**归一化策略**:
+
+- **原子级归一化**: `atom_norm = 1.0 / natoms` 确保损失是 per atom 的
+- **训练稳定性**: 防止大系统主导训练过程
+- **物理一致性**: 保持能量与原子数量的线性关系
+
+#### 3.5.5 完整的能量计算数据流
+
+```
+原子坐标和类型 [nf × natoms × 3], [nf × natoms]
+    ↓
+DPA3 描述符计算 (dpa3.py:430-498)
+    ↓
+原子环境表示 [nf × natoms × n_dim]
+    ↓
+拟合网络计算 (fitting.py:473-614)
+    ↓
+原子级能量 [nf × natoms × 1] ← 每个原子的局部环境能量
+    ↓
+能量求和变换 (transform_output.py:170-175)
+    ↓
+系统能量 [nf × 1] ← torch.sum(dim=-2) 求和
+    ↓
+损失计算 (ener.py:319-329)
+    ↓
+Per Atom 归一化损失 [scalar]
+```
+
+#### 3.5.6 关键设计特点
+
+**分离式计算架构**:
+
+1. **原子能量计算**: 在 `_forward_common` 中计算每个原子的局部环境能量
+2. **系统能量聚合**: 在 `fit_output_to_model_output` 中将原子能量聚合成系统能量
+3. **自动微分支持**: 力的计算通过自动微分实现,保持梯度传递
+
+**灵活的求和策略**:
+
+- **求总和**: `torch.sum()` 用于能量等广延性质的聚合
+- **求平均**: `torch.mean()` 用于强度性质(`intensive=True`)的聚合
+- **精度控制**: 使用 `redu_prec` 确保数值稳定性
+
+**物理正确性保证**:
+
+- **局部性原理**: 每个原子的能量只依赖于其局部环境
+- **可加性**: 系统能量严格等于原子能量之和
+- **不变性**: 保持旋转和平移不变性
+
+**计算效率优化**:
+
+- **并行计算**: 原子级能量计算可以完全并行化
+- **批处理**: 支持多帧同时处理
+- **内存效率**: 分离的计算阶段减少内存占用
+
+### 3.6 DPA3 描述符输出变量详解
+
+在 DPA3 描述符的 forward 方法中,输出的变量包含了原子环境表示的完整信息。这些变量对于理解描述符的工作原理和调试模型行为非常重要。
+
+#### 3.6.1 输出变量概述
+
+**文件位置**: `deepmd/pt/model/descriptor/dpa3.py:430-498`
+
+DPA3 描述符的 forward 方法返回五个核心变量:
+
+```python
+def forward(self, extended_coord, extended_atype, nlist, mapping=None, comm_dict=None):
+    # ... 计算过程 ...
+    return node_ebd, rot_mat, edge_ebd, h2, sw
+```
+
+#### 3.6.2 变量详细说明
+
+**node_ebd: 节点描述符**
+
+- **形状**: `[nf, nloc, n_dim]`
+- **含义**: 主要的原子环境描述符,包含每个原子的环境信息
+- **作用**: 直接输入拟合网络计算原子级能量
+
+**rot_mat: 旋转矩阵**
+
+- **形状**: `[nf, nloc, e_dim, 3]`
+- **含义**: 旋转矩阵用于坐标变换,保持旋转不变性
+- **作用**:
+  - 将局部坐标转换到全局坐标系
+  - 确保描述符在分子旋转时的不变性
+  - 支持 SE(3) 等变变换
+
+**edge_ebd: 边嵌入**
+
+- **形状**: `[nf, nloc, nnei, e_dim]`
+- **含义**: 原子间边的嵌入表示
+- **作用**: 描述原子间的成键信息和相互作用
+
+**h2: 对(pair)表示**
+
+- **形状**: `[nf, nloc, nnei, 3]`
+- **含义**: 每个邻居的三维相对向量表示(与 3.1.2 中的"对表示"一致)
+- **作用**: 携带方向信息,用于构建角度关系和 3-body 相互作用建模
+
+**sw: 平滑开关函数**
+
+- **形状**: `[nf, nloc, nnei]`
+- **含义**: 用于平滑截止边界的开关函数
+- **作用**: 在 cutoff 半径处平滑过渡到零,避免能量和力的不连续跳跃
+
+#### 3.6.3 变量在模型中的应用
+
+**在拟合网络中的使用** (`deepmd/pt/model/task/fitting.py:473-614`):
+
+```text
+def _forward_common(self, descriptor, atype, ...):
+    # descriptor 是 node_ebd [nf, nloc, nd]
+    nf, nloc, nd = descriptor.shape
+
+    # 计算原子级能量
+    atom_property = self.filter_layers.networks[0](descriptor)
+    # ... 
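+    # 注:这里返回的是原子级输出(未做 reduction);
+    # 系统级能量由 transform_output 中的求和完成,见 3.5.3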
+ return {self.var_name: outs} # outs shape [nf, nloc, net_dim_out] +``` + +#### 3.6.4 输出变量的数据流 + +``` +扩展坐标和原子类型 + ↓ +环境矩阵计算 (prod_env_mat) + ↓ +RepFlow边和角度处理 + ↓ +edge_ebd, h2, sw ← 中间表示 + ↓ +RepFlow层迭代更新 + ↓ +node_ebd, rot_mat ← 最终描述符输出 + ↓ +拟合网络处理 + ↓ +原子级能量和性质预测 +``` + +### 3.7 代码修改和功能增强历史 + +#### 3.7.1 process_systems 函数增强 + +**修改位置**: `deepmd/utils/data_system.py` + +**核心修改**: 增强了`process_systems`函数,支持列表输入的递归搜索功能,每个字符串项都会进行递归子目录查找,同时保持向后兼容性。 + +#### 3.7.2 功能验证 + +- **向后兼容性**: 字符串输入行为保持完全一致 +- **新功能测试**: 列表中的字符串项正确进行递归搜索 +- **错误处理**: 边界条件和异常情况处理正确 + +--- + +## 第四部分:数据处理系统 + +### 4.1 数据处理架构概述 + +DeePMD-kit PyTorch 后端采用了独特的两级 DataLoader 架构,实现了高效的多系统数据管理和训练优化。这种架构专门为处理大规模分子动力学数据而设计,支持多数据源并行加载和智能批处理。 + +**架构优势**: + +- **效率**: 系统级和训练级分离,避免线程爆炸 +- **灵活性**: 支持多种数据源和采样策略 +- **可扩展性**: 天然支持分布式训练和多 GPU +- **稳定性**: 完善的错误处理和数据验证 + +### 4.2 原始数据加载 + +#### 4.2.1 数据文件结构 + +**文件位置**: `deepmd/utils/data.py` - `DeepmdData` 类 + +**数据来源**: + +- **HDF5 文件**: 高效存储大规模分子动力学数据 +- **.npy 文件**: NumPy 数组格式,存储单个属性 +- **系统目录**: 每个训练数据源独立的目录结构 + +**目录结构**: + +``` +system_path/ +├── type_map.raw # 原子类型映射 +├── set.0/ # 第一个数据集 +│ ├── coord.npy # 原子坐标 [nframes × natoms × 3] +│ ├── box.npy # 周期边界条件 [nframes × 9] +│ ├── energy.npy # 系统能量 [nframes] +│ ├── force.npy # 原子力 [nframes × natoms × 3] +│ └── virial.npy # 系统维里 [nframes × 9] +├── set.1/ # 第二个数据集 +└── ... +``` + +#### 4.2.2 数据加载过程 + +**初始化过程** (`data.py:50-122`): + +```python +class DeepmdData: + def __init__( + self, + systems: Union[str, List[str]], + batch_size: int = 1, + test_size: int = 0, + shuffle_test: bool = True, + type_map: Optional[List[str]] = None, + modifier=None, + ): + """ + 初始化数据系统 + + Args: + systems: 系统路径或路径列表 + batch_size: 批处理大小 + test_size: 测试集大小 + shuffle_test: 是否打乱测试集 + type_map: 原子类型映射 + modifier: 数据修改器 + """ + # 1. 系统路径处理 + self.system_dirs = self._get_system_dirs(systems) + + # 2. 类型映射加载 + self.type_map = self._load_type_map() + + # 3. 数据需求定义 + self.data_dict = { + "coord": {"ndof": 3, "atomic": True, "must": True}, + "box": {"ndof": 9, "atomic": False, "must": self.pbc}, + "energy": {"ndof": 1, "atomic": False, "must": False}, + "force": {"ndof": 3, "atomic": True, "must": False}, + # ... 其他属性 + } + + # 4. 数据集加载 + self._load_all_sets() +``` + +**数据集加载** (`data.py:233-280`): + +```python +def _load_set(self, set_path: str): + """加载单个数据集""" + # 1. 扫描数据文件 + data_files = glob.glob(os.path.join(set_path, "*.npy")) + + # 2. 加载必需属性 + coord_data = np.load(os.path.join(set_path, "coord.npy")) + box_data = np.load(os.path.join(set_path, "box.npy")) + + # 3. 加载可选属性 + if os.path.exists(os.path.join(set_path, "energy.npy")): + energy_data = np.load(os.path.join(set_path, "energy.npy")) + + # 4. 数据验证和预处理 + self._validate_data(coord_data, box_data, energy_data) + + return { + "coord": coord_data, + "box": box_data, + "energy": energy_data, + # ... 
其他属性 + } +``` + +#### 4.2.3 数据预处理 + +**数据格式转换** (`data.py:300-315`): + +```python +def reformat_data_torch(self, data_dict: dict) -> dict: + """将数据转换为 PyTorch 格式""" + reformatted = {} + + for key, value in data_dict.items(): + if key in self.data_dict: + info = self.data_dict[key] + if info["atomic"]: + # 原子级属性: [nframes × natoms × ndof] + reformatted[key] = torch.tensor(value, dtype=torch.float32) + else: + # 系统级属性: [nframes × ndof] + reformatted[key] = torch.tensor(value, dtype=torch.float32) + + return reformatted +``` + +### 4.3 系统级 DataLoader 创建 + +#### 4.3.1 DpLoaderSet 架构 + +**文件位置**: `deepmd/pt/utils/dataloader.py` - `DpLoaderSet` 类 + +**系统级 DataLoader 概述**: + +- **目的**: 为每个数据系统创建独立的 DataLoader +- **特点**: 每个 DataLoader 负责处理一个系统的数据加载和批处理 +- **优势**: 避免线程爆炸,提高内存使用效率 + +**初始化过程** (`dataloader.py:76-174`): + +```python +class DpLoaderSet: + def __init__( + self, + systems: List[str], + batch_size: Union[int, str, List[int]], + type_map: List[str], + shuffle: bool = True, + dist: bool = False, + ): + """ + 初始化系统级 DataLoader 集合 + + Args: + systems: 系统路径列表 + batch_size: 批处理大小 (可以是自动、固定值或列表) + type_map: 原子类型映射 + shuffle: 是否打乱数据 + dist: 是否使用分布式训练 + """ + # 1. 系统数据初始化 + self.systems = [] + self.batch_sizes = [] + + for system_path in systems: + # 创建系统数据对象 + system_data = DeepmdData( + system_path, + batch_size=1, # 系统级批处理在 DataLoader 中处理 + type_map=type_map, + ) + + # 转换为 PyTorch 数据集 + torch_dataset = DeepmdDataSetForLoader(system_data) + self.systems.append(torch_dataset) + + # 计算系统级批处理大小 + if isinstance(batch_size, str) and batch_size == "auto": + # 自动批处理: 基于原子数量优化 + system_batch_size = self._calculate_auto_batch_size(system_data) + else: + system_batch_size = batch_size + + self.batch_sizes.append(system_batch_size) + + # 2. 创建系统级 DataLoaders + self.dataloaders = [] + for system, batch_size in zip(self.systems, self.batch_sizes): + system_dataloader = self._create_system_dataloader( + system, batch_size, shuffle, dist + ) + self.dataloaders.append(system_dataloader) +``` + +#### 4.3.2 系统级 DataLoader 创建 + +**创建过程** (`dataloader.py:157-166`): + +```python +def _create_system_dataloader(self, system, batch_size, shuffle, dist): + """创建单个系统级 DataLoader""" + + # 分布式采样器 + if dist and dist.is_available() and dist.is_initialized(): + system_sampler = DistributedSampler( + system, + num_replicas=dist.get_world_size(), + rank=dist.get_rank(), + shuffle=shuffle, + ) + else: + system_sampler = None + + # 创建 DataLoader + system_dataloader = DataLoader( + dataset=system, + batch_size=int(batch_size), + num_workers=0, # 关键: 避免线程爆炸 + sampler=system_sampler, + collate_fn=collate_batch, # 数据批处理函数 + shuffle=(not (dist.is_available() and dist.is_initialized())) and shuffle, + ) + + return system_dataloader +``` + +**为什么 num_workers=0**: + +- **线程管理**: 避免创建过多进程导致系统资源耗尽 +- **内存效率**: 每个系统都有独立的 DataLoader,多进程会导致内存爆炸 +- **稳定性**: 减少进程间通信的复杂性 +- **性能**: 在系统级 DataLoader 中,数据加载相对较快,不需要多进程加速 + +#### 4.3.3 自动批处理计算 + +**自动批处理算法** (`dataloader.py:200-220`): + +```python +def _calculate_auto_batch_size(self, system_data: DeepmdData) -> int: + """基于系统特征计算最优批处理大小""" + + # 1. 获取系统统计信息 + natoms = system_data.get_natoms() + nframes = system_data.get_nframes() + + # 2. 计算内存需求 + memory_per_frame = natoms * 3 * 4 # 坐标内存 (float32) + memory_per_frame += natoms * 4 # 原子类型内存 (int32) + memory_per_frame += 9 * 4 # 盒子内存 (float32) + + # 3. 
基于可用内存计算批处理大小
+    available_memory = self._get_available_memory()
+    safe_memory = available_memory * 0.7  # 70% 安全阈值
+
+    batch_size = int(safe_memory / memory_per_frame)
+    batch_size = max(1, min(batch_size, 32))  # 限制在 1-32 之间
+
+    return batch_size
+```
+
+### 4.4 数据变换管道
+
+#### 4.4.1 数据集类实现
+
+**文件位置**: `deepmd/pt/utils/dataloader.py` - `DeepmdDataSetForLoader` 类
+
+**数据集类功能** (`dataloader.py:18-32`):
+
+```python
+class DeepmdDataSetForLoader(torch.utils.data.Dataset):
+    """将 DeepmdData 转换为 PyTorch Dataset"""
+
+    def __init__(self, dp_data: DeepmdData):
+        self.dp_data = dp_data
+        self.nframes = dp_data.get_nframes()
+
+    def __len__(self):
+        """返回数据集大小"""
+        return self.nframes
+
+    def __getitem__(self, idx: int):
+        """获取单个数据帧"""
+        # 1. 获取原始数据
+        frame_data = self.dp_data.get_item(idx)
+
+        # 2. 添加帧 ID
+        frame_data["fid"] = idx
+
+        # 3. 添加系统 ID (如果有多个系统)
+        if hasattr(self, "sid"):
+            frame_data["sid"] = self.sid
+
+        return frame_data
+```
+
+#### 4.4.2 批处理函数实现
+
+**核心批处理函数** (`dataloader.py:223-238`):
+
+```python
+def collate_batch(batch: List[dict]) -> dict:
+    """
+    将多个数据帧合并为批处理
+
+    Args:
+        batch: 数据帧列表,每个元素是一个字典
+
+    Returns:
+        批处理数据字典
+    """
+    example = batch[0]
+    result = {}
+
+    for key in example.keys():
+        if "find_" in key:
+            # 查找键保持为单值
+            result[key] = batch[0][key]
+        elif key == "fid":
+            # 帧 ID 转换为列表
+            result[key] = [d[key] for d in batch]
+        elif key == "type":
+            # 跳过 type 键(已作为 atype 处理)
+            continue
+        else:
+            # 其他键进行张量批处理
+            result[key] = collate_tensor_fn([torch.as_tensor(d[key]) for d in batch])
+
+    return result
+```
+
+**张量批处理函数** (`dataloader.py:240-250`):
+
+```python
+def collate_tensor_fn(tensors: List[torch.Tensor]) -> torch.Tensor:
+    """将张量列表合并为单个张量"""
+
+    if len(tensors) == 0:
+        return torch.tensor([])
+
+    # 检查张量形状是否一致
+    shapes = [t.shape for t in tensors]
+    if len(set(shapes)) == 1:
+        # 形状一致,直接堆叠
+        return torch.stack(tensors, dim=0)
+    else:
+        # 形状不一致,填充到最大形状
+        max_shape = [max(dims) for dims in zip(*shapes)]
+        padded_tensors = []
+
+        for tensor in tensors:
+            # F.pad 的 pad 参数是扁平序列,且从最后一个维度开始成对给出
+            pad = []
+            for max_dim, curr_dim in zip(reversed(max_shape), reversed(tensor.shape)):
+                pad.extend([0, max_dim - curr_dim])
+            padded_tensor = torch.nn.functional.pad(tensor, pad)
+            padded_tensors.append(padded_tensor)
+
+        return torch.stack(padded_tensors, dim=0)
+```
+
+### 4.5 训练级 DataLoader 数据流
+
+#### 4.5.1 训练级 DataLoader 创建
+
+**文件位置**: `deepmd/pt/train/training.py` - `get_data_loader()` 函数
+
+**训练级 DataLoader 概述**:
+
+- **目的**: 管理训练过程中的数据采样和批处理
+- **特点**: 包装系统级 DataLoader 集合,提供统一的数据接口
+- **优势**: 支持多系统采样、分布式训练和无限循环
+
+**创建过程** (`training.py:177-214`):
+
+```python
+def get_data_loader(_training_data, _validation_data, _training_params):
+    """创建训练和验证数据加载器"""
+
+    def get_dataloader_and_iter(_data, _params):
+        """创建单个数据加载器和迭代器"""
+
+        # 1. 采样器配置
+        _sampler = get_sampler_from_params(_data, _params)
+        if _sampler is None:
+            log.warning("Sampler not specified!")
+
+        # 2. 创建训练级 DataLoader
+        _dataloader = DataLoader(
+            _data,  # DpLoaderSet 实例
+            sampler=_sampler,  # 采样器
+            batch_size=None,  # 单系统批处理
+            num_workers=NUM_WORKERS if dist.is_available() else 0,
+            drop_last=False,  # 不丢弃最后一个不完整批次
+            collate_fn=lambda batch: batch,  # 防止额外转换
+            pin_memory=True,  # 锁页内存优化
+        )
+
+        # 3. 
创建无限循环迭代器 + _data_iter = cycle_iterator(_dataloader) + return _dataloader, _data_iter + + # 创建训练和验证数据加载器 + training_dataloader, training_data_iter = get_dataloader_and_iter( + _training_data, _training_params["training_data"] + ) + + validation_dataloader, validation_data_iter = get_dataloader_and_iter( + _validation_data, _training_params["validation_data"] + ) + + return ( + training_dataloader, + training_data_iter, + validation_dataloader, + validation_data_iter, + ) +``` + +#### 4.5.2 采样器配置 + +**采样器创建** (`training.py:266-277`): + +```python +def get_sampler_from_params(_data, _params): + """基于参数创建采样器""" + + # 1. 获取采样概率 + if "prob_sys_size" in _params and _params["prob_sys_size"]: + # 基于系统大小的采样概率 + prob = _data.get_sys_prob() + elif "prob" in _params: + # 用户定义的采样概率 + prob = _params["prob"] + else: + # 均匀采样 + prob = None + + # 2. 创建采样器 + if prob is not None: + sampler = WeightedRandomSampler( + weights=prob, num_samples=len(prob), replacement=True + ) + else: + sampler = None + + return sampler +``` + +**系统概率计算** (`dataloader.py:300-320`): + +```python +def get_sys_prob(self) -> List[float]: + """计算系统采样概率""" + + # 1. 获取每个系统的帧数 + system_sizes = [len(system) for system in self.systems] + + # 2. 基于帧数计算概率 + total_frames = sum(system_sizes) + prob = [size / total_frames for size in system_sizes] + + return prob +``` + +#### 4.5.3 无限循环迭代器 + +**迭代器实现** (`training.py:150-160`): + +```python +def cycle_iterator(dataloader): + """创建无限循环的数据迭代器""" + + while True: + # 1. 重置迭代器 + data_iter = iter(dataloader) + + # 2. 遍历所有数据 + try: + while True: + batch = next(data_iter) + yield batch + except StopIteration: + # 3. 重新开始循环 + continue +``` + +### 4.6 最终数据提交给模型 + +#### 4.6.1 数据获取和预处理 + +**文件位置**: `deepmd/pt/train/training.py` - `Trainer.get_data()` 方法 + +**数据获取过程** (`training.py:950-990`): + +```python +def get_data(self, is_train=True, task_key="Default"): + """获取训练数据并预处理""" + + # 1. 选择数据迭代器 + if is_train: + iterator = self.training_data_iters[task_key] + else: + iterator = self.validation_data_iters[task_key] + + # 2. 获取下一个批次 + batch_data = next(iterator) + + # 3. 数据类型和设备转换 + for key in batch_data.keys(): + if key not in ["sid", "fid", "box", "find_*"]: + # 移动到目标设备 + batch_data[key] = batch_data[key].to(env.DEVICE, non_blocking=True) + + # 4. 分离输入和标签 + input_dict, label_dict, log_dict = self._separate_inputs_labels(batch_data) + + return input_dict, label_dict, log_dict +``` + +**输入标签分离** (`training.py:1000-1020`): + +```python +def _separate_inputs_labels(self, batch_data: dict) -> tuple: + """分离模型输入和标签""" + + # 1. 定义输入键 + input_keys = ["coord", "atype", "spin", "box", "fparam", "aparam"] + + # 2. 创建输入字典 + input_dict = {} + for key in input_keys: + if key in batch_data: + input_dict[key] = batch_data[key] + + # 3. 创建标签字典 + label_dict = {} + for key, value in batch_data.items(): + if key not in input_keys and key not in ["sid", "fid"]: + label_dict[key] = value + + # 4. 创建日志字典 + log_dict = { + "natoms": batch_data.get("natoms", None), + "find_energy": batch_data.get("find_energy", False), + "find_force": batch_data.get("find_force", False), + } + + return input_dict, label_dict, log_dict +``` + +#### 4.6.2 模型输入提交 + +**模型执行过程** (`training.py:611-704`): + +```python +def step(self, task_key="Default", **kwargs): + """执行单个训练步骤""" + + # 1. 获取数据 + input_dict, label_dict, log_dict = self.get_data(is_train=True, task_key=task_key) + + # 2. 
前向传播 + with torch.cuda.amp.autocast(enabled=self.mixed_precision): + model_pred, loss, more_loss = self.wrapper( + **input_dict, cur_lr=self.get_cur_lr(), label=label_dict, task_key=task_key + ) + + # 3. 反向传播 + self.optimizer.zero_grad() + loss.backward() + + # 4. 梯度裁剪 + if self.grad_clip > 0: + torch.nn.utils.clip_grad_norm_(self.wrapper.parameters(), self.grad_clip) + + # 5. 参数更新 + self.optimizer.step() + + # 6. 记录损失 + self.record_loss(loss, more_loss, log_dict) + + return loss, more_loss +``` + +### 4.7 数据流程优化特性 + +#### 4.7.1 内存优化策略 + +**内存管理**: + +- **锁页内存**: 使用 `pin_memory=True` 提高 GPU 数据传输效率 +- **自动批处理**: 基于系统特征动态调整批处理大小 +- **设备管理**: 智能设备选择和内存分配 + +**NUM_WORKERS 配置** (`env.py:26-31`): + +```python +# 环境变量配置 +NUM_WORKERS = int(os.environ.get("NUM_WORKERS", min(4, ncpus))) + +# 多进程方法检查 +if multiprocessing.get_start_method() != "fork": + log.warning( + "NUM_WORKERS > 0 is not supported with spawn or forkserver start method. Setting NUM_WORKERS to 0." + ) + NUM_WORKERS = 0 +``` + +#### 4.7.2 性能优化特性 + +**分布式训练支持**: + +- **数据并行**: 支持多 GPU 数据并行训练 +- **分布式采样**: `DistributedSampler` 确保数据均匀分布 +- **梯度同步**: 自动梯度同步和参数更新 + +**数据增强**: + +- **随机打乱**: 支持训练数据随机打乱 +- **加权采样**: 基于系统大小的智能采样 +- **多任务支持**: 支持多任务学习的数据管理 + +### 4.8 数据流程监控和调试 + +#### 4.8.1 数据统计信息 + +**数据统计** (`dataloader.py:400-420`): + +```python +def get_data_statistics(self) -> dict: + """获取数据统计信息""" + + stats = { + "num_systems": len(self.systems), + "total_frames": sum(len(sys) for sys in self.systems), + "batch_sizes": self.batch_sizes, + "system_sizes": [len(sys) for sys in self.systems], + "memory_usage": self._estimate_memory_usage(), + } + + return stats +``` + +#### 4.8.2 数据验证和错误处理 + +**数据验证** (`data.py:400-420`): + +```python +def validate_data(self, coord_data, box_data, energy_data=None): + """验证数据完整性""" + + # 1. 检查数据形状 + nframes = coord_data.shape[0] + assert box_data.shape[0] == nframes, "Box data frame count mismatch" + + # 2. 检查原子数量一致性 + natoms = coord_data.shape[1] // 3 + assert natoms > 0, "Invalid atom count" + + # 3. 检查数值范围 + assert torch.isfinite(coord_data).all(), "Invalid coordinate values" + assert torch.isfinite(box_data).all(), "Invalid box values" + + # 4. 
检查能量数据 + if energy_data is not None: + assert energy_data.shape[0] == nframes, "Energy data frame count mismatch" + assert torch.isfinite(energy_data).all(), "Invalid energy values" +``` + +--- + +## 第五部分:推理和部署 + +### 5.1 推理架构概述 + +DPA3 的推理系统采用分层设计,支持多种部署方式和性能优化策略。推理过程的核心是通过 `DeepEval` 类实现的,它提供了统一的接口来加载训练好的 DPA3 模型并进行高效的原子环境计算。 + +**推理架构组件**: + +``` +用户接口层 (CLI / Python API) + ↓ +DeepEval (统一推理接口) + ↓ +ModelWrapper (模型包装器) + ↓ +DPA3 Descriptor (原子环境计算) + ↓ +PyTorch JIT / 原生执行 (计算后端) +``` + +### 5.2 推理入口点和接口 + +#### 5.2.1 Python API 接口 + +**主要推理类**: + +- `DeepEval`: 通用推理接口 (`deepmd/pt/infer/deep_eval.py:75`) +- `Tester`: 测试和推理工具 (`deepmd/pt/infer/inference.py:25`) + +**基本使用方法**: + +```python +from deepmd.pt.infer import DeepEval + +# 加载模型 +evaluator = DeepEval("dpa3_model.pt", output_def) + +# 执行推理 +result = evaluator.eval( + coords=coordinates, # [nframes x natoms x 3] + cells=cell_parameters, # [nframes x 9] (可选) + atom_types=atom_types, # [natoms] 或 [nframes x natoms] + atomic=False, # 是否计算原子级贡献 +) +``` + +#### 5.2.2 CLI 推理命令 + +**测试命令**: + +```bash +dp test -m dpa3_model.pt -s test_data +``` + +**模型冻结**: + +```bash +dp freeze -m dpa3_model.pt -o frozen_model.pth +``` + +### 5.3 模型加载和初始化 + +#### 5.3.1 模型加载过程 + +**文件位置**: `deepmd/pt/infer/deep_eval.py:96-161` + +```python +def __init__( + self, + model_file: str, + output_def: ModelOutputDef, + auto_batch_size: Union[bool, int, AutoBatchSize] = True, + neighbor_list: Optional["ase.neighborlist.NewPrimitiveNeighborList"] = None, + head: Optional[Union[str, int]] = None, + no_jit: bool = False, +): + + # 1. 加载模型检查点 + state_dict = torch.load(model_file, map_location=env.DEVICE, weights_only=True) + + # 2. 处理多任务模型 + if self.multi_task: + # 选择指定的任务头 + model_params = self.input_param["model_dict"][head] + + # 3. 重建模型架构 + model = get_model(self.input_param).to(DEVICE) + + # 4. JIT 编译优化 + if not self.input_param.get("hessian_mode") and not no_jit: + model = torch.jit.script(model) + + # 5. 包装和加载权重 + self.dp = ModelWrapper(model) + self.dp.load_state_dict(state_dict) + self.dp.eval() # 设置为评估模式 +``` + +#### 5.3.2 多任务模型支持 + +对于包含多个任务的 DPA3 模型,推理时需要指定具体的任务头: + +```python +# 多任务模型推理 +evaluator = DeepEval("multi_task_model.pt", output_def, head="task_name") +``` + +### 5.4 推理执行流程 + +#### 5.4.1 主要推理方法 + +**文件位置**: `deepmd/pt/infer/deep_eval.py:394-462` + +**标准推理流程**: + +```python +def _eval_model(self, coords, cells, atom_types, fparam, aparam, request_defs): + # 1. 数据预处理 + coord_input = torch.tensor( + coords.reshape([nframes, natoms, 3]), + dtype=GLOBAL_PT_FLOAT_PRECISION, + device=DEVICE, + ) + type_input = torch.tensor(atom_types, dtype=torch.long, device=DEVICE) + + # 2. 可选参数处理 + box_input = ( + torch.tensor( + cells.reshape([nframes, 3, 3]), + dtype=GLOBAL_PT_FLOAT_PRECISION, + device=DEVICE, + ) + if cells is not None + else None + ) + + # 3. 执行模型推理 + batch_output = model( + coord_input, + type_input, + box=box_input, + do_atomic_virial=do_atomic_virial, + fparam=fparam_input, + aparam=aparam_input, + ) + + # 4. 后处理和返回结果 + return self._process_output(batch_output, request_defs) +``` + +#### 5.4.2 DPA3 在推理中的执行 + +在推理过程中,DPA3 描述符的 `forward` 方法被调用来计算原子环境表示: + +1. **输入数据**: 接收扩展坐标、原子类型和邻居列表 +2. **类型嵌入**: 计算原子类型嵌入向量 +3. **RepFlow 计算**: 通过多层 RepFlow 处理节点、边和角度信息 +4. 
**输出生成**: 生成最终的原子环境描述符 + +### 5.5 性能优化特性 + +#### 5.5.1 自动批处理 + +**实现位置**: `deepmd/pt/infer/deep_eval.py:351-375` + +```python +def _eval_func(self, inner_func: Callable, numb_test: int, natoms: int) -> Callable: + if self.auto_batch_size is not None: + + def eval_func(*args, **kwargs): + return self.auto_batch_size.execute_all( + inner_func, numb_test, natoms, *args, **kwargs + ) + + else: + eval_func = inner_func + return eval_func +``` + +**自动批处理优势**: + +- **内存优化**: 根据可用内存自动调整批处理大小 +- **性能平衡**: 在内存使用和计算效率之间找到最佳平衡 +- **适应性**: 能够根据不同的硬件配置自动调整 + +#### 5.5.2 JIT 编译优化 + +**JIT 编译过程**: + +```python +# 模型加载时自动进行 JIT 编译 +if not self.input_param.get("hessian_mode") and not no_jit: + model = torch.jit.script(model) +``` + +**JIT 优化效果**: + +- **计算图优化**: 将 Python 代码编译为优化的计算图 +- **内存分配优化**: 减少动态内存分配开销 +- **算子融合**: 将多个操作融合为单个高效算子 + +#### 5.5.3 设备优化 + +**多设备支持**: + +- **CPU 推理**: 适用于小规模模型和内存受限环境 +- **GPU 推理**: 大规模并行计算,显著提升推理速度 +- **多 GPU**: 支持模型并行和数据并行 + +**设备选择策略**: + +```python +# 自动选择最佳计算设备 +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +model = model.to(device) +``` + +### 5.6 推理部署选项 + +#### 5.6.1 模型格式 + +**支持的模型格式**: + +1. **.pt 文件**: PyTorch 标准检查点格式 + - 包含完整的模型权重和配置信息 + - 支持多任务模型和元数据 + +2. **.pth 文件**: TorchScript 冻结模型 + - 经过 JIT 编译优化的模型 + - 部署时无需重新编译,加载更快 + +#### 5.6.2 冻结模型生成 + +**文件位置**: `deepmd/pt/entrypoints/main.py:344-358` + +```python +def freeze(model: str, output: str = "frozen_model.pth", head: Optional[str] = None): + # 1. 加载原始模型 + model = inference.Tester(model, head=head).model + + # 2. 设置为评估模式 + model.eval() + + # 3. JIT 脚本编译 + model = torch.jit.script(model) + + # 4. 保存冻结模型 + torch.jit.save(model, output, extra_files={}) +``` + +**冻结模型优势**: + +- **部署简化**: 无需依赖原始模型定义代码 +- **加载速度**: 避免了模型重建的开销 +- **版本兼容**: 提供更好的版本兼容性 + +### 5.7 高级推理功能 + +#### 5.7.1 描述符提取 + +**方法**: `eval_descriptor()` +**位置**: `deepmd/pt/infer/deep_eval.py:633-687` + +```python +def eval_descriptor(self, coords, cells, atom_types, fparam=None, aparam=None): + """提取 DPA3 原子环境描述符""" + # 返回原始的 DPA3 描述符输出 + # 可用于分析和可视化原子环境表示 +``` + +#### 5.7.2 类型嵌入分析 + +**方法**: `eval_typeebd()` +**位置**: `deepmd/pt/infer/deep_eval.py:565-632` + +```python +def eval_typeebd(self): + """评估类型嵌入网络输出""" + # 返回原子类型的嵌入向量 + # 用于分析类型表示的特征空间 +``` + +#### 5.7.3 拟合网络分析 + +**方法**: `eval_fitting_last_layer()` +**位置**: `deepmd/pt/infer/deep_eval.py:688-730` + +```python +def eval_fitting_last_layer(self, coords, cells, atom_types, fparam=None, aparam=None): + """评估拟合网络最后一层的输入""" + # 用于调试和分析拟合过程 +``` + +### 5.8 推理性能监控 + +#### 5.8.1 性能指标 + +**模型大小分析**: + +```python +def get_model_size(self) -> dict: + """获取模型参数统计""" + return { + "descriptor": sum_param_des, # 描述符参数数量 + "fitting-net": sum_param_fit, # 拟合网络参数数量 + "total": sum_param_des + sum_param_fit, # 总参数数量 + } +``` + +#### 5.8.2 内存使用优化 + +**内存管理策略**: + +1. **梯度禁用**: 推理时自动禁用梯度计算 +2. **批处理优化**: 通过自动批处理控制内存使用 +3. **设备内存管理**: 自动管理 GPU 内存分配和释放 + +### 5.9 推理部署最佳实践 + +#### 5.9.1 模型选择建议 + +**小规模系统** (原子数 < 1000): + +- 使用标准的 .pt 格式 +- 启用 JIT 编译优化 +- CPU 推理通常足够 + +**中等规模系统** (原子数 1000-10000): + +- 推荐使用冻结的 .pth 格式 +- 启用 GPU 推理 +- 调整自动批处理参数 + +**大规模系统** (原子数 > 10000): + +- 必须使用 GPU 推理 +- 考虑多 GPU 并行 +- 优化邻居列表计算 + +#### 5.9.2 配置优化 + +**内存优化配置**: + +```python +# 内存敏感环境 +evaluator = DeepEval("model.pt", output_def, auto_batch_size=False) # 禁用自动批处理 + +# 性能优化配置 +evaluator = DeepEval("model.pt", output_def, auto_batch_size=1024) # 设置固定批处理大小 +``` + +#### 5.9.3 错误处理和调试 + +**常见推理问题**: + +1. **内存不足**: 减少批处理大小或使用 CPU +2. **设备不匹配**: 确保模型和数据在同一设备上 +3. 
**版本兼容**: 使用冻结模型避免版本问题 + +--- + +## 总结 + +DPA3 作为 DeePMD-kit 中最先进的原子环境描述符之一,通过结合节点、边和角度信息,提供了更加精确和全面的原子环境表示。其模块化的设计、丰富的配置选项和优秀的性能优化特性,使其能够广泛应用于各种分子动力学模拟任务中。 + +### 技术特点总结 + +**架构优势**: + +- **模块化设计**: 清晰的组件分离,易于扩展和维护 +- **高效数据处理**: 两级 DataLoader 架构,避免线程爆炸 +- **并行计算支持**: 天然支持多 GPU 和分布式训练 +- **性能优化**: JIT 编译、自动批处理、内存优化 + +**核心创新**: + +- **RepFlow 架构**: 结合节点、边、角信息的统一表示 +- **3-body 相互作用**: 显式建模三体相互作用,提高精度 +- **动态更新策略**: 多种残差连接策略,优化信息流动 +- **智能压缩**: 角度消息压缩,减少计算开销 + +### 使用建议 + +**新手用户**: + +- 从基本配置开始,逐步调整参数 +- 使用自动批处理和默认优化选项 +- 关注训练收敛和基本性能指标 + +**高级用户**: + +- 深入调整 RepFlow 参数优化性能 +- 利用分布式训练处理大规模数据 +- 自定义采样策略和损失函数 + +**生产环境**: + +- 使用冻结模型确保部署稳定性 +- 监控推理性能和资源使用 +- 定期验证模型精度和稳定性 + +### 未来发展方向 + +**功能扩展**: + +- 支持更高阶的相互作用 +- 自适应邻居选择策略 +- 注意力机制集成 + +**性能优化**: + +- 混合精度训练完善 +- 模型量化和压缩 +- 硬件特定优化 + +**应用拓展**: + +- 多尺度建模支持 +- 在线学习和增量更新 +- 可解释性增强 + +无论是学术研究还是工业应用,DPA3 都能够为用户提供可靠的深度学习势能解决方案。 diff --git a/doc/outisli/compress.md b/doc/outisli/compress.md new file mode 100644 index 0000000000..187616c8c4 --- /dev/null +++ b/doc/outisli/compress.md @@ -0,0 +1,650 @@ +# DeePMD-kit 压缩功能详细分析 + +## 概述 + +DeePMD-kit 的 compress 功能通过将 embedding networks 进行 tabulation(查表法)来实现模型压缩,显著提升推理速度并减少内存占用。 + +## 核心原理 + +### 基本思想 + +1. **预计算查表**:将 embedding networks 的输出预先计算并存储在表格中 +2. **分段插值**:使用两个不同步长的表格来平衡精度与存储成本: + - 第一段表格:使用精细步长(stride0) + - 第二段表格:使用粗糙步长(stride1 = 10 × stride0) +3. **多项式插值**:基于查表结果进行五次多项式插值 + +## PyTorch 后端实现 + +### 1. 命令行入口 + +#### 主入口 + +- **文件位置**: `deepmd/main.py` +- **命令示例**: `dp --pt compress -i model.pth -o compressed_model.pth` + +#### 参数配置 + +```python +parser_compress.add_argument("-s", "--step", default=0.01, type=float) # stride0 +parser_compress.add_argument("-e", "--extrapolate", default=5, type=int) # 外推倍数 +parser_compress.add_argument("-f", "--frequency", default=-1, type=int) # 溢出检查频率 +parser_compress.add_argument("-t", "--training-script", type=str) # 训练脚本 +``` + +#### 命令分发 + +```text +# deepmd/main.py:1013-1018 +elif args.command in ("compress", "train", "freeze", ...): + deepmd_main = BACKENDS[args.backend]().entry_point_hook +``` + +### 2. PyTorch 后端处理 + +#### 入口函数 + +**文件位置**: `deepmd/pt/entrypoints/main.py:574-582` + +```text +elif FLAGS.command == "compress": + FLAGS.input = str(Path(FLAGS.input).with_suffix(".pth")) + FLAGS.output = str(Path(FLAGS.output).with_suffix(".pth")) + enable_compression( + input_file=FLAGS.input, + output=FLAGS.output, + stride=FLAGS.step, + extrapolate=FLAGS.extrapolate, + check_frequency=FLAGS.frequency, + training_script=FLAGS.training_script, + ) +``` + +#### 核心压缩函数 + +**文件位置**: `deepmd/pt/entrypoints/compress.py:32-84` + +## 详细执行流程 + +### 步骤 1:模型加载 + +```python +def enable_compression( + input_file, + output, + stride=0.01, + extrapolate=5, + check_frequency=-1, + training_script=None, +): + # 1. 加载JIT模型 + saved_model = torch.jit.load(input_file, map_location="cpu") + model_def_script = json.loads(saved_model.model_def_script) + + # 2. 重建模型实例 + model = get_model(model_def_script) + model.load_state_dict(saved_model.state_dict()) +``` + +### 步骤 2:最小邻居距离计算 + +```python +# 3. 
计算最小邻居距离 +if model.get_min_nbor_dist() is None: + # 从训练数据计算 + jdata = j_loader(training_script) + jdata = update_deepmd_input(jdata) + train_data = get_data(jdata["training"]["training_data"], 0, type_map, None) + + update_sel = UpdateSel() + t_min_nbor_dist = update_sel.get_min_nbor_dist(train_data) + model.min_nbor_dist = torch.tensor( + t_min_nbor_dist, dtype=env.GLOBAL_PT_FLOAT_PRECISION + ) +``` + +### 步骤 3:模型压缩启用 + +#### 3.1 模型层压缩 + +**文件位置**: `deepmd/pt/model/model/make_model.py:103-129` + +```python +def enable_compression( + self, + table_extrapolate=5, + table_stride_1=0.01, + table_stride_2=0.1, + check_frequency=-1, +): + """模型层压缩入口""" + self.atomic_model.enable_compression( + self.get_min_nbor_dist(), # 最小邻居距离 + table_extrapolate, + table_stride_1, + table_stride_2, + check_frequency, + ) +``` + +#### 3.2 原子模型压缩 + +**文件位置**: `deepmd/pt/model/atomic_model/dp_atomic_model.py:188-217` + +```python +def enable_compression( + self, + min_nbor_dist, + table_extrapolate=5, + table_stride_1=0.01, + table_stride_2=0.1, + check_frequency=-1, +): + """原子模型层压缩入口""" + self.descriptor.enable_compression( + min_nbor_dist, + table_extrapolate, + table_stride_1, + table_stride_2, + check_frequency, + ) +``` + +### 步骤 4:描述符层压缩实现 + +#### 4.1 SE_A 描述符压缩 + +**文件位置**: `deepmd/pt/model/descriptor/se_a.py:257-302` + +```python +def enable_compression( + self, + min_nbor_dist, + table_extrapolate=5, + table_stride_1=0.01, + table_stride_2=0.1, + check_frequency=-1, +): + # 1. 检查是否已压缩 + if self.compress: + raise ValueError("Compression is already enabled.") + + # 2. 创建查表器 + data = self.serialize() + self.table = DPTabulate( + self, # 描述符对象 + data["neuron"], # 神经网络结构 + data["type_one_side"], # 单侧类型 + data["exclude_types"], # 排除类型对 + ActivationFn(data["activation_function"]), # 激活函数 + ) + + # 3. 存储查表配置 + self.table_config = [ + table_extrapolate, + table_stride_1, + table_stride_2, + check_frequency, + ] + + # 4. 构建查表数据 + self.lower, self.upper = self.table.build( + min_nbor_dist, table_extrapolate, table_stride_1, table_stride_2 + ) + + # 5. 启用嵌入层压缩 + self.sea.enable_compression( + self.table.data, self.table_config, self.lower, self.upper + ) + + # 6. 设置压缩标志 + self.compress = True +``` + +#### 4.2 DescrptSeA 压缩数据设置 + +**文件位置**: `deepmd/pt/model/descriptor/se_a.py:699-733` + +```python +def enable_compression(self, table_data, table_config, lower, upper): + """为每个嵌入网络设置压缩数据""" + for embedding_idx, ll in enumerate(self.filter_layers.networks): + if self.type_one_side: + net = f"filter_-1_net_{embedding_idx}" + else: + ii = embedding_idx // self.ntypes # 中心原子类型 + ti = embedding_idx % self.ntypes # 邻居原子类型 + net = f"filter_{ii}_net_{ti}" + + # 压缩信息:[lower, upper, upper*extrapolate, stride1, stride2, check_freq] + info_ii = torch.as_tensor( + [ + lower[net], + upper[net], + upper[net] * table_config[0], + table_config[1], + table_config[2], + table_config[3], + ], + dtype=self.prec, + device="cpu", + ) + + # 压缩数据:多项式系数表 + tensor_data_ii = table_data[net].to(device=env.DEVICE, dtype=self.prec) + + self.compress_data[embedding_idx] = tensor_data_ii + self.compress_info[embedding_idx] = info_ii + + self.compress = True +``` + +### 步骤 5:查表器实现 + +#### 5.1 查表器类 + +**文件位置**: `deepmd/pt/utils/tabulate.py:52-118` + +```python +class DPTabulate(BaseTabulate): + def __init__( + self, + descrpt, + neuron, + type_one_side=False, + exclude_types=[], + activation_fn=ActivationFn("tanh"), + ): + # 1. 基础初始化 + super().__init__(descrpt, neuron, type_one_side, exclude_types, True) + + # 2. 
描述符类型判断 + self.descrpt_type = self._get_descrpt_type() # "A", "Atten", "T", "R" + + # 3. 获取描述符参数 + self.sel_a = self.descrpt.get_sel() + self.rcut = self.descrpt.get_rcut() + self.rcut_smth = self.descrpt.get_rcut_smth() + + # 4. 激活函数映射 + activation_map = { + "tanh": 1, + "gelu": 2, + "relu": 3, + "relu6": 4, + "softplus": 5, + "sigmoid": 6, + } + self.functype = activation_map[activation_fn.activation] + + # 5. 获取统计参数 + serialized = self.descrpt.serialize() + self.davg = serialized["@variables"]["davg"] # 均值 + self.dstd = serialized["@variables"]["dstd"] # 标准差 + self.embedding_net_nodes = serialized["embeddings"]["networks"] + + # 6. 提取权重和偏置 + self.bias = self._get_bias() + self.matrix = self._get_matrix() +``` + +#### 5.2 查表构建过程 + +**文件位置**: `deepmd/utils/tabulate.py:70-243` + +```python +def build(self, min_nbor_dist, extrapolate, stride0, stride1): + # 1. 计算环境矩阵范围 + lower, upper = self._get_env_mat_range(min_nbor_dist) + + # 2. 根据描述符类型建表 + if self.descrpt_type == "A": # SE_A 描述符 + for ii in range(self.table_size): + if self._should_build_table(ii): + # 构建距离网格 + xx = self._build_distance_grid( + lower, upper, stride0, stride1, extrapolate, ii + ) + + # 查表数据 + self._generate_spline_table( + net, xx, ii, uu, ll, stride0, stride1, extrapolate, nspline + ) + + # 3. 后处理转换 + self._convert_numpy_to_tensor() + self._convert_numpy_float_to_int() + + return self.lower, self.upper +``` + +#### 5.3 环境矩阵范围计算 + +**文件位置**: `deepmd/utils/tabulate.py:445-463` + +```python +def _get_env_mat_range(self, min_nbor_dist): + """计算环境矩阵的范围""" + # 1. 计算切换函数值 + sw = self._spline5_switch(min_nbor_dist, self.rcut_smth, self.rcut) + + # 2. 根据描述符类型计算范围 + if self.descrpt_type in ("Atten", "A"): + # 标准化:(r_ij - davg) / dstd + lower = -self.davg[:, 0] / self.dstd[:, 0] + upper = ((1 / min_nbor_dist) * sw - self.davg[:, 0]) / self.dstd[:, 0] + + # 3. 向下和向上取整 + return np.floor(lower), np.ceil(upper) +``` + +#### 5.4 多项式系数计算 + +**文件位置**: `deepmd/utils/tabulate.py:245-347` + +```python +def _generate_spline_table( + self, net, xx, idx, upper, lower, stride0, stride1, extrapolate, nspline +): + # 1. 通过神经网络前向传播计算数据 + vv, dd, d2 = self._make_data(xx, idx) # 值、一阶导数、二阶导数 + + # 2. 多项式系数表 + self.data[net] = np.zeros([nspline, 6 * self.last_layer_size], dtype=self.data_type) + + # 3. 步长处理 + tt = np.full((nspline, self.last_layer_size), stride1) + tt[: int((upper - lower) / stride0), :] = stride0 + + # 4. 计算多项式高阶系数 + hh = ( + vv[1 : nspline + 1, : self.last_layer_size] + - vv[:nspline, : self.last_layer_size] + ) + + # 系数0:函数值 f(x) + self.data[net][:, ::6] = vv[:nspline, : self.last_layer_size] + + # 系数1:一阶导数 f'(x) + self.data[net][:, 1::6] = dd[:nspline, : self.last_layer_size] + + # 系数2:二阶导数 f''(x)/2 + self.data[net][:, 2::6] = 0.5 * d2[:nspline, : self.last_layer_size] + + # 系数3-5:高阶多项式系数(保证连续性) + self.data[net][:, 3::6] = (1 / (2 * tt**3)) * (20 * hh - ...) + self.data[net][:, 4::6] = (1 / (2 * tt**4)) * (-30 * hh + ...) + self.data[net][:, 5::6] = (1 / (2 * tt**5)) * (12 * hh - ...) +``` + +#### 5.5 神经网络前向传播 + +**文件位置**: `deepmd/pt/utils/tabulate.py:119-250` + +```text +def _make_data(self, xx, idx): + """通过神经网络前向传播查表数据""" + xx = torch.from_numpy(xx).view(-1, 1).to(env.DEVICE) + + # 逐层计算 + for layer in range(self.layer_size): + if layer == 0: + # 第一层:线性变换 + 激活函数 + xbar = torch.matmul(xx, torch.from_numpy(self.matrix[f"layer_{layer + 1}"][idx])) + \ + torch.from_numpy(self.bias[f"layer_{layer + 1}"][idx]) + + # 处理激活函数(含残差连接) + if self.neuron[0] == 1: + yy = self._layer_0(...) + xx # 残差连接 + else: + yy = self._layer_0(...) 
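+            # (注)下面的 unaggregated_dy_dx_s / unaggregated_dy2_dx_s
+            # 逐层解析地计算输出对输入 xx 的一阶/二阶导数,
+            # 供五次多项式系数表使用(见 _generate_spline_table)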
+
+            # 计算一阶和二阶导数
+            dy = unaggregated_dy_dx_s(...)
+            dy2 = unaggregated_dy2_dx_s(...)
+        else:
+            # 后续层...
+
+    return vv.cpu().numpy(), dd.cpu().numpy(), d2.cpu().numpy()
+```
+
+### 步骤 6:模型保存
+
+```python
+# 4. 启用压缩
+model.enable_compression(extrapolate, stride, stride * 10, check_frequency)
+
+# 5. JIT 脚本化保存
+model = torch.jit.script(model)
+torch.jit.save(model, output)
+```
+
+## 支持的描述符类型
+
+### 已支持的描述符
+
+1. **SE_A (Smooth Edition Angular)**
+   - **文件位置**: `deepmd/pt/model/descriptor/se_a.py`
+   - **压缩方式**: 嵌入网络表格化
+   - **特点**: 支持角度信息的描述符
+
+2. **SE_R (Smooth Edition Radial)**
+   - **文件位置**: `deepmd/pt/model/descriptor/se_r.py`
+   - **压缩方式**: 嵌入网络表格化
+   - **特点**: 仅使用径向距离信息的描述符
+
+3. **SE_T (Smooth Edition Three-body)**
+   - **文件位置**: `deepmd/pt/model/descriptor/se_t.py`
+   - **压缩方式**: 嵌入网络表格化
+   - **特点**: 三体相互作用描述符
+
+4. **SE_Atten (Smooth Edition with Attention)**
+   - **文件位置**: `deepmd/pt/model/descriptor/se_atten.py`
+   - **压缩方式**: 嵌入网络表格化
+   - **特点**: 带注意力机制的描述符
+
+5. **DPA1 (Deep Potential Attention 1)**
+   - **文件位置**: `deepmd/pt/model/descriptor/dpa1.py`
+   - **压缩方式**: 嵌入网络表格化
+   - **特点**: 第一代注意力机制描述符
+
+6. **DPA2 (Deep Potential Attention 2)**
+   - **文件位置**: `deepmd/pt/model/descriptor/dpa2.py`
+   - **压缩方式**: 嵌入网络表格化
+   - **特点**: 第二代注意力机制描述符
+
+### 不支持的描述符
+
+1. **DPA3 (Deep Potential Attention 3)**
+   - **文件位置**: `deepmd/pt/model/descriptor/dpa3.py:578-601`
+   - **压缩方式**: 不支持
+   - **原因**: `enable_compression` 直接抛出异常:
+
+     ```python
+     def enable_compression(self, ...):
+         raise NotImplementedError("Compression is unsupported for DPA3.")
+     ```
+
+### 特殊模型类型
+
+1. **Linear Atomic Model**
+   - **文件位置**: `deepmd/pt/model/atomic_model/linear_atomic_model.py:198-228`
+   - **压缩方式**: 多个子模型分别压缩
+
+2. **Pairtab Atomic Model**
+   - **文件位置**: `deepmd/pt/model/atomic_model/pairtab_atomic_model.py:505-514`
+   - **压缩方式**: 不支持查表压缩
+
+## 数据结构详解
+
+### 压缩数据格式
+
+#### 1. 压缩信息 (compress_info)
+
+```python
+# 每个嵌入网络存储 6 个参数 [6]
+compress_info[embedding_idx] = torch.tensor(
+    [
+        lower[net],  # 下界
+        upper[net],  # 上界
+        upper[net] * extrapolate,  # 外推上界
+        table_stride_1,  # 第一段步长
+        table_stride_2,  # 第二段步长
+        check_frequency,  # 溢出检查频率
+    ]
+)
+```
+
+#### 2. 压缩数据 (compress_data)
+
+```python
+# 每个嵌入网络存储系数表 [nspline, 6 * last_layer_size]
+compress_data[embedding_idx] = table_data[net]
+
+# 其中每 6 个连续的系数表示一个多项式的系数
+# [f(x), f'(x), f''(x)/2, c3, c4, c5] × last_layer_size
+```
+
+### 查表数据构建
+
+#### 1. 距离网格生成
+
+```python
+# 第一段:精细数据区间网格
+xx1 = np.arange(lower, upper, stride0)
+
+# 第二段:外推区间网格
+xx2 = np.arange(upper, extrapolate * upper, stride1)
+
+# 合并网格
+xx = np.concatenate([xx1, xx2, [extrapolate * upper]])
+```
+
+#### 2. 神经网络求值
+
+```python
+# 对每个网格点进行神经网络前向传播
+for x_point in xx:
+    output = forward_pass(x_point)  # 网络输出
+    grad1 = compute_gradient(x_point)  # 一阶导数
+    grad2 = compute_hessian(x_point)  # 二阶导数
+```
+
+#### 3. 多项式构造
+
+采用五次 Hermite 插值,满足:
+
+- 函数值连续:f(x_i) = y_i
+- 一阶导数连续:f'(x_i) = y'\_i
+- 二阶导数连续:f''(x_i) = y''\_i
+
+## 性能优化
+
+### 1. 内存管理
+
+- **分段步长**: 常用区间使用精细步长(默认 0.01),外推区使用粗糙步长(默认 0.1),平衡精度与表格尺寸
+- **内存复用**: 压缩后以查表代替嵌入网络,原网络权重不再参与推理,内存占用显著降低
+
+### 2. 计算优化
+
+- **预计算查表**: 压缩后嵌入网络不再需要矩阵运算
+- **向量化查表**: 每个原子类型对应一个优化的查表
+- **分支消除**: 消除类型判断的分支开销
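+
+下面按上文的系数排布([f(x), f'(x), f''(x)/2, c3, c4, c5] × last_layer_size)给出一个查表求值的示意(假设代码:仅演示单一步长区间,省略了精细/粗糙两段与外推区的区分;t 取区间内的原始偏移,与上述系数定义相匹配):
+
+```python
+import numpy as np
+
+
+def eval_spline(table: np.ndarray, x: float, lower: float, stride: float) -> np.ndarray:
+    """在系数表上求值:table 形状 [nspline, 6 * m],返回 m 维输出。"""
+    i = int((x - lower) / stride)  # 样条区间索引
+    t = x - (lower + i * stride)  # 区间内偏移
+    c = table[i].reshape(-1, 6)  # [m, 6]:每个输出通道 6 个系数
+    # f(x) = c0 + c1*t + c2*t^2 + c3*t^3 + c4*t^4 + c5*t^5(Horner 形式求值)
+    return ((((c[:, 5] * t + c[:, 4]) * t + c[:, 3]) * t + c[:, 2]) * t + c[:, 1]) * t + c[:, 0]
+```
+
+### 3. 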
缓存友好 + +- **数据局部性**: 查表数据连续存储,提升 cache 命中率 +- **内存访问**: 内存访问模式优化,减少 cache miss +- **SIMD**: 多项式计算可向量化 + +## 使用示例 + +### 基础压缩命令 + +```bash +# 压缩PyTorch模型 +dp --pt compress -i frozen_model.pth -o compressed_model.pth + +# 自定义参数 +dp --pt compress \ + -i frozen_model.pth \ + -o compressed_model.pth \ + -s 0.005 \ + -e 10 \ + -f 1000 \ + -t input.json +``` + +### 参数说明 + +- `-i, --input`: 输入的冻结模型(.pth) +- `-o, --output`: 输出的压缩模型(.pth) +- `-s, --step`: 第一段步长,影响精度与内存(默认 0.01) +- `-e, --extrapolate`: 外推倍数(默认 5) +- `-f, --frequency`: 溢出检查频率,-1 表示不检查(默认-1) +- `-t, --training-script`: 训练脚本(用于计算最小邻居距离) + +## 局限性分析 + +### 1. 描述符局限 + +- DPA3 描述符不支持压缩 +- Pairtab 模型不支持查表压缩 +- 某些描述符变体可能不完全兼容 + +### 2. 精度权衡 + +- 步长设置过大会影响精度 +- 外推区间精度相对较低 +- 激活函数近似可能带来误差 + +### 3. 内存开销 + +- 压缩后仍需存储多项式查表数据 +- 精度要求高时查表尺寸增大 +- 激活函数导数计算消耗额外内存 + +### 4. 兼容性限制 + +- 压缩后的模型仅适用于 DeePMD-kit 环境 +- JIT 脚本化可能在某些场景下受限 +- LAMMPS 等 MD 引擎需要特定的压缩模型格式 + +## 实现细节 + +### 多项式插值公式 + +在区间 [x_i, x_{i+1}] 内,对于变量 x,多项式为: + +``` +f(x) = c₀ + c₁t + c₂t² + c₃t³ + c₄t⁴ + c₅t⁵ +``` + +其中: + +- `t = (x - x_i) / h`,h 为步长 +- `c₀ = f(x_i)` +- `c₁ = f'(x_i) × h` +- `c₂ = f''(x_i) × h² / 2` +- `c₃, c₄, c₅` 根据边界连续性确定 + +### 切换函数 + +用于平滑处理截断半径的切换函数: + +```text +def spline5_switch(r, r_min, r_max): + if r < r_min: + return 1.0 + elif r < r_max: + u = (r - r_min) / (r_max - r_min) + return u**3 * (-6 * u**2 + 15 * u - 10) + 1 + else: + return 0.0 +``` + +## 总结 + +DeePMD-kit 的 compress 功能通过将神经网络嵌入层用查表法和多项式插值替代,实现了显著的推理加速。PyTorch 后端的实现采用了分层设计,由模型层、原子模型层、描述符层逐级传递压缩请求。查表器构建了精细和粗糙分段的插值表,平衡了精度与性能。该功能对大多数 SE 类和 DPA1/DPA2 描述符提供良好支持,是生产环境中提升 MD 模拟效率的重要工具。 diff --git a/doc/outisli/install.md b/doc/outisli/install.md new file mode 100644 index 0000000000..220d21ba3f --- /dev/null +++ b/doc/outisli/install.md @@ -0,0 +1,307 @@ +鉴于大家可能觉得从源码安装`DeePMD-kit`门槛较高,而极少使用。然而从源码安装的灵活性最高,为进一步推广,并减少可能的坑,笔者在此根据自己的安装流程结合官方文档给出一个适用性较广的安装教程,各位可自行尝试。 + +本教程适用于 Linux(with NVIDIA GPU) 及 Mac(with Apple Silicon) + +Since some users may find installing `DeePMD-kit` from source to be challenging and rarely attempt it, this guide aims to make the process more accessible. Installing from source offers the highest flexibility. To promote this method and reduce potential pitfalls, I have compiled a broadly applicable installation tutorial based on my own experience and the official documentation. You are encouraged to try it out. + +This tutorial is applicable to Linux (with NVIDIA GPU) and Mac (with Apple Silicon). + +> 注: +> +> 1. 安装过程不强制要求`sudo`权限 +> 2. 若在有`sudo`权限的电脑上,可自行安装 CUDA Toolkit 以及 mpi(可选) +> 3. 在 HPC 集群上可通过`source\module`方式加载 CUDA Toolkit 以及 mpi 环境 +> 4. 默认安装在用户 home 目录 Software 目录下,若需要修改路径,请修改教程中涉及路径的命令 +> 5. 本教程需有一定计算机(linux)操作常识,若遇到问题,可以评论沟通或询问 AI + +> Notes: +> +> 1. The installation process does not strictly require `sudo` privileges. +> 2. If you have `sudo` privileges, you may install CUDA Toolkit and MPI (optional) yourself. +> 3. On HPC clusters, you can load CUDA Toolkit and MPI environments using `source` or `module` commands. +> 4. By default, the installation path is set to the user's home directory under the Software folder. If you wish to change the path, please modify the relevant commands in the tutorial. +> 5. This tutorial assumes some basic knowledge of computer (Linux) operations. If you encounter any issues, feel free to comment or ask AI for help. + +# 0. 
Preparation (Optional)
+
+## 0.1 CUDA Toolkit
+
+```shell
+sudo apt update && sudo apt upgrade -y && sudo apt autoremove -y
+# for Ubuntu 24.04 LTS
+wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb
+# for WSL
+wget https://developer.download.nvidia.com/compute/cuda/repos/wsl-ubuntu/x86_64/cuda-keyring_1.1-1_all.deb
+sudo dpkg -i cuda-keyring_1.1-1_all.deb
+sudo apt update && sudo apt upgrade -y && sudo apt autoremove -y
+sudo apt install cuda-toolkit-12-8 -y
+
+#config cuda
+export CUDA_PATH=/usr/local/cuda
+export CUDA_HOME=/usr/local/cuda
+export PATH=$PATH:$CUDA_HOME/bin
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CUDA_HOME/lib64
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CUDA_HOME/extras/CUPTI/lib64
+```
+
+CUDA 12.6 or 12.9 works as well.
+
+## 0.2 Intel® oneAPI Toolkit
+
+```shell
+wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/e6ff8e9c-ee28-47fb-abd7-5c524c983e1c/l_BaseKit_p_2024.2.1.100_offline.sh
+sudo sh ./l_BaseKit_p_2024.2.1.100_offline.sh -a --silent --cli --eula accept
+
+wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/d461a695-6481-426f-a22f-b5644cd1fa8b/l_HPCKit_p_2024.2.1.79_offline.sh
+sudo sh ./l_HPCKit_p_2024.2.1.79_offline.sh -a --silent --cli --eula accept
+
+# load intel oneapi
+source /opt/intel/oneapi/setvars.sh --force > /dev/null
+```
+
+# 1. Install Backend’s Python interface
+
+## 1.1 Use Miniforge (Conda/mamba)
+
+```shell
+# 0. no need for HPC
+sudo apt update && sudo apt upgrade -y && sudo apt autoremove -y
+
+# 1. Preparation
+# 1.1 Get source code
+# or git clone https://github.com/deepmodeling/deepmd-kit.git && cd deepmd-kit && git checkout devel
+# 下述链接为笔者自己的 fork,时不时增加一些小改进,欢迎 star
+git clone git@github.com:OutisLi/deepmd-kit.git && cd deepmd-kit && git checkout outisli
+
+# 1.2 Create virtual environment
+# optional if you installed miniforge: alias mamba="conda"
+# CUDA 13.0 supports gcc-15
+mamba update -n base -c conda-forge conda -y ; mamba update -n base -c conda-forge mamba -y
+mamba deactivate && mamba env remove -n dpmd -y ; rm -rf build ; git clean -xdf ; mamba create -n dpmd gcc=15 gxx=15 cmake python=3.13 -c conda-forge -y && mamba activate dpmd && pip install --upgrade pip && pip install uv
+
+# 1.3 (Optional) install openmpi if you do not have mpi
+conda install openmpi -c conda-forge
+
+# 2.1 Install pytorch
+uv pip install -U torch --index-url https://download.pytorch.org/whl/cu130
+
+# 2.2 (Optional) Install tensorflow
+uv pip install -U tensorflow
+
+# 2.3 (Optional) Install jax
+uv pip install -U "tensorflow[and-cuda]" "jax[cuda13]" jax-ai-stack equinox
+
+# 3. Install deepmd-kit
+export CUDA_VERSION=13.1 CUDA_HOME="/usr/local/cuda" && export CUDAToolkit_ROOT=$CUDA_HOME CUDA_PATH=$CUDA_HOME && export DP_VARIANT="cuda" DP_ENABLE_PYTORCH=1 DP_ENABLE_TENSORFLOW=1 DP_ENABLE_PADDLE=0 DP_ENABLE_NATIVE_OPTIMIZATION=1 && pip install -e .
## 0.2 Intel® oneAPI Toolkit

```shell
wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/e6ff8e9c-ee28-47fb-abd7-5c524c983e1c/l_BaseKit_p_2024.2.1.100_offline.sh
sudo sh ./l_BaseKit_p_2024.2.1.100_offline.sh -a --silent --cli --eula accept

wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/d461a695-6481-426f-a22f-b5644cd1fa8b/l_HPCKit_p_2024.2.1.79_offline.sh
sudo sh ./l_HPCKit_p_2024.2.1.79_offline.sh -a --silent --cli --eula accept

# load intel oneapi
source /opt/intel/oneapi/setvars.sh --force > /dev/null
```

# 1. Install the Backend's Python interface

## 1.1 Use Miniforge (Conda/mamba)

```shell
# 0. not needed on HPC
sudo apt update && sudo apt upgrade -y && sudo apt autoremove -y

# 1. Preparation
# 1.1 Get source code
# or git clone https://github.com/deepmodeling/deepmd-kit.git && cd deepmd-kit && git checkout devel
# The link below is my own fork, which picks up small improvements from time to time; stars are welcome
git clone git@github.com:OutisLi/deepmd-kit.git && cd deepmd-kit && git checkout outisli

# 1.2 Create virtual environment
# optional, if your installation lacks mamba: alias mamba="conda"
# CUDA 13.0 supports gcc-15
mamba update -n base -c conda-forge conda -y ; mamba update -n base -c conda-forge mamba -y
mamba deactivate && mamba env remove -n dpmd -y ; rm -rf build ; git clean -xdf ; mamba create -n dpmd gcc=15 gxx=15 cmake python=3.13 -c conda-forge -y && mamba activate dpmd && pip install --upgrade pip && pip install uv

# 1.3 (Optional) install openmpi if you do not have mpi
conda install openmpi -c conda-forge

# 2.1 Install pytorch
uv pip install -U torch --index-url https://download.pytorch.org/whl/cu130

# 2.2 (Optional) Install tensorflow
uv pip install -U tensorflow

# 2.3 (Optional) Install jax
uv pip install -U "tensorflow[and-cuda]" "jax[cuda13]" jax-ai-stack equinox

# 3. Install deepmd-kit
export CUDA_VERSION=13.1 CUDA_HOME="/usr/local/cuda" && export CUDAToolkit_ROOT=$CUDA_HOME CUDA_PATH=$CUDA_HOME && export DP_VARIANT="cuda" DP_ENABLE_PYTORCH=1 DP_ENABLE_TENSORFLOW=1 DP_ENABLE_PADDLE=0 DP_ENABLE_NATIVE_OPTIMIZATION=1 && pip install -e . -v

# 4.1 Install other useful packages
uv pip install -U dpdata pymatgen freud-analysis seaborn ipykernel nglview "git+https://gitlab.com/1041176461/ase-abacus.git"
# 4.2 For developers
uv pip install -U pytest pre-commit tensorboard torch-tb-profiler tensorboard-plugin-profile
```

### 1.1+ Check GPU Installation

```shell
# pytorch
python -c "import torch; print('PyTorch devices:', [torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())] if torch.cuda.is_available() else 'CPU')"

# tensorflow
python -c "import tensorflow as tf; print('TF devices:', tf.config.list_physical_devices('GPU'))"

# JAX
python -c "import jax; print('JAX devices:', jax.devices())"

# All in One
python -c "import torch, tensorflow as tf, jax; print('PyTorch: ', [torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())] if torch.cuda.is_available() else 'CPU'); print('TF: ', tf.config.list_physical_devices('GPU')); print('JAX: ', jax.devices())"
```

## 1.2 For Mac

```shell
# 1. Preparation
# 1.1 Get source code
# or git clone https://github.com/deepmodeling/deepmd-kit.git
git clone git@github.com:OutisLi/deepmd-kit.git && cd deepmd-kit && git checkout outisli
# 1.2 Create virtual environment
conda update -n base -c conda-forge conda -y ; conda update -n base -c conda-forge mamba -y
conda deactivate && conda env remove -n dpmd -y ; rm -rf build ; git clean -xdf ; mamba create -n dpmd compilers llvm-openmp python=3.13 -c conda-forge -y && mamba activate dpmd && pip install --upgrade pip && pip install uv

# 2. Install pytorch
uv pip install -U torch

# 3. Install deepmd-kit
export DP_ENABLE_PYTORCH=1 DP_ENABLE_PADDLE=0 DP_ENABLE_TENSORFLOW=0 DP_ENABLE_NATIVE_OPTIMIZATION=1 && uv pip install -e . -v

# 4.1 Install other useful packages
uv pip install -U dpdata pymatgen freud-analysis seaborn ipykernel nglview "git+https://gitlab.com/1041176461/ase-abacus.git"
# 4.2 For developers
uv pip install -U pytest pre-commit tensorboard torch-tb-profiler tensorboard-plugin-profile
```

# 2. Install the C++ interface

> If you do not need to use DeePMD-kit with LAMMPS or i-PI, the Python interface installed in the previous section covers everything, and you can safely skip this section.

```shell
# 0. (Optional) for reinstall
export software="$HOME/Software" && rm -rfv $software/deepmd-kit_cpp $software/deepmd-kit/source/build

# 1. Environment variables
export deepmd_source_dir=$(pwd) && mkdir -p ../deepmd-kit_cpp && cd ../deepmd-kit_cpp && export deepmd_root=$(pwd) && cd ../deepmd-kit && cd source && mkdir -p build && cd build
# export deepmd_source_dir="$software/deepmd-kit"
# export deepmd_root="$software/deepmd-kit_cpp"

# 2. CMake (choose one of the following)

# 2.1 Option 1: use pytorch & tensorflow & jax (from python env)
cmake -DCMAKE_INSTALL_PREFIX=$deepmd_root \
    -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
    -DENABLE_NATIVE_OPTIMIZATION=ON \
    -DUSE_CUDA_TOOLKIT=ON \
    -DENABLE_PYTORCH=ON \
    -DUSE_PT_PYTHON_LIBS=ON \
    -DENABLE_TENSORFLOW=ON \
    -DUSE_TF_PYTHON_LIBS=ON \
    -DENABLE_JAX=ON \
    -DCMAKE_PREFIX_PATH=$CONDA_PREFIX ..

# 2.2 Option 2: use pytorch only (from python env)
cmake -DCMAKE_INSTALL_PREFIX=$deepmd_root \
    -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
    -DENABLE_NATIVE_OPTIMIZATION=ON \
    -DUSE_CUDA_TOOLKIT=ON \
    -DENABLE_PYTORCH=ON \
    -DUSE_PT_PYTHON_LIBS=ON \
    -DENABLE_TENSORFLOW=OFF \
    -DUSE_TF_PYTHON_LIBS=OFF \
    -DCMAKE_PREFIX_PATH=$CONDA_PREFIX ..

# 2.3 Option 3: use libtorch (standalone)
wget https://download.pytorch.org/libtorch/cu128/libtorch-cxx11-abi-shared-with-deps-latest.zip
unzip libtorch-cxx11-abi-shared-with-deps-latest.zip
# Note: $software/libtorch is the unzipped dir, CMAKE_INSTALL_PREFIX is set to a local dir
cmake -DCMAKE_INSTALL_PREFIX="../../install" \
    -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
    -DENABLE_NATIVE_OPTIMIZATION=ON \
    -DUSE_CUDA_TOOLKIT=ON \
    -DENABLE_PYTORCH=ON \
    -DUSE_PT_PYTHON_LIBS=OFF \
    -DCMAKE_PREFIX_PATH=$software/libtorch ..

# 3. Install
make -j && make install

# 4. (Optional) Link the compile database (for clangd)
rm $software/deepmd-kit/compile_commands.json ; ln -s "$(pwd)/compile_commands.json" $software/deepmd-kit
```
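A quick sanity check that the C++ interface actually landed under `$deepmd_root`; this is a sketch, and the exact set of installed libraries depends on which backends you enabled:

```shell
# Headers and backend libraries installed by `make install`
ls $deepmd_root/include/deepmd
ls $deepmd_root/lib | grep -i deepmd
```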
# 3. Install LAMMPS's DeePMD-kit module (built-in mode)

_Before following this section, the [DeePMD-kit C++ interface](https://docs.deepmodeling.com/projects/deepmd/en/master/install/install-from-source.html) should have been installed_ (see Section 2 above).

Note on GPU architecture: you must specify your GPU architecture via `-DGPU_ARCH=sm_XX`.

Check yours with `nvidia-smi -q | grep Architecture`, or match your card model against the list below.

> Common values:
> Pascal (GTX 1080, Titan X): sm_61
> Volta (V100): sm_70
> Turing (RTX 20xx, T4): sm_75
> Ampere: sm_80 (A100) or sm_86 (RTX 30xx)
> Lovelace (RTX 40xx): sm_89
> Hopper (H100): sm_90
> Blackwell: sm_100 (B200) or sm_103 (B300) or sm_120 (RTX 50xx, RTX PRO 6000)

```shell
# 0. Environment variables (reuse the C++ install paths)
export software="$HOME/Software" && export deepmd_source_dir="$software/deepmd-kit" && export deepmd_root="$software/deepmd-kit_cpp"
cd "${deepmd_source_dir}/source/build" && make lammps && rm -rf $software/lammps

# 1. Install requirements
# Or conda install
# (jpeg, libpng: dependencies for the dump image command)
# (zlib: dependency for the COMPRESS package, for .gz trajectory output)
# (fftw: dependency for the KSPACE package)
# (voro: dependency for the VORONOI package, for defect analysis)
mamba install jpeg libpng zlib fftw voro -c conda-forge -y

# 2. Download lammps
cd $software && mkdir -p lammps && cd lammps && export version="stable_22Jul2025_update2" && wget "https://gh-proxy.com/github.com/lammps/lammps/archive/${version}.tar.gz" && tar xzf "${version}.tar.gz" && cd "lammps-${version}" && mkdir -p build && cd build
# wget https://github.com/lammps/lammps/archive/stable_22Jul2025_update2.tar.gz
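# (Sketch) Instead of guessing from the table above, you can query the compute
# capability directly; e.g. an output of "8.9" corresponds to sm_89. This
# assumes a driver recent enough to report the compute_cap field.
nvidia-smi --query-gpu=compute_cap --format=csv,noheader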
# 3. Compile
# !!! CHANGE THIS TO MATCH YOUR GPU !!!
# Example: sm_80 for A100, sm_86 for RTX 30xx, sm_89 for RTX 40xx, sm_120 for RTX 50xx
export CUDA_VERSION=13.1 && export CUDA_HOME="/usr/local/cuda-${CUDA_VERSION}" && export CUDA_PATH="$CUDA_HOME" && export LAMMPS_GPU_ARCH="sm_89"
# WM: export LAMMPS_GPU_ARCH="sm_80" && export CUDA_PATH="/lustre/software/cuda/12.6.0"

# 3.1 Option 1: use pytorch & tensorflow & jax
echo "include($deepmd_source_dir/source/lmp/builtin.cmake)" >> ../cmake/CMakeLists.txt && export TORCH_CMAKE_DIR=$(python -c "import torch; print(torch.utils.cmake_prefix_path)") && export TF_LIB_PATH=$(find $CONDA_PREFIX -name "libtensorflow_framework.so.2" | xargs dirname)

# for gcc13
cmake -DCMAKE_INSTALL_PREFIX=$deepmd_root \
    -DCMAKE_BUILD_TYPE=Release \
    -DBUILD_SHARED_LIBS=yes \
    -DLAMMPS_INSTALL_RPATH=ON \
    -DPKG_KSPACE=ON \
    -DPKG_VORONOI=ON \
    -DPKG_PYTHON=ON \
    -DPKG_COMPRESS=ON \
    -DPKG_OPENMP=ON \
    -DPKG_GPU=ON \
    -DGPU_API=cuda \
    -DGPU_ARCH=$LAMMPS_GPU_ARCH \
    -DBIN2C=$CUDA_PATH/bin/bin2c \
    -DCMAKE_PREFIX_PATH="$deepmd_root;$CONDA_PREFIX;$TORCH_CMAKE_DIR" \
    -DCMAKE_EXE_LINKER_FLAGS="-Wl,-rpath-link,$TF_LIB_PATH" ../cmake

# for gcc15/CUDA 13+ (use this if the configuration above fails to link)
cmake -DCMAKE_INSTALL_PREFIX=$deepmd_root \
    -DCMAKE_BUILD_TYPE=Release \
    -DBUILD_SHARED_LIBS=yes \
    -DLAMMPS_INSTALL_RPATH=ON \
    -DPKG_KSPACE=ON \
    -DPKG_VORONOI=ON \
    -DPKG_PYTHON=ON \
    -DPKG_COMPRESS=ON \
    -DPKG_OPENMP=ON \
    -DPKG_GPU=ON \
    -DGPU_API=cuda \
    -DGPU_ARCH=$LAMMPS_GPU_ARCH \
    -DBIN2C=$CUDA_PATH/bin/bin2c \
    -DCMAKE_PREFIX_PATH="$deepmd_root;$CONDA_PREFIX;$TORCH_CMAKE_DIR" \
    -DCMAKE_EXE_LINKER_FLAGS="-Wl,-rpath-link,$TF_LIB_PATH -Wl,-rpath-link,/usr/lib/x86_64-linux-gnu -lm" ../cmake

# 3.2 Option 2: use pytorch only
echo "include($deepmd_source_dir/source/lmp/builtin.cmake)" >> ../cmake/CMakeLists.txt && export TORCH_CMAKE_DIR=$(python -c "import torch; print(torch.utils.cmake_prefix_path)")
cmake -DCMAKE_INSTALL_PREFIX=$deepmd_root \
    -DCMAKE_BUILD_TYPE=Release \
    -DBUILD_SHARED_LIBS=yes \
    -DLAMMPS_INSTALL_RPATH=ON \
    -DPKG_KSPACE=ON \
    -DPKG_VORONOI=ON \
    -DPKG_PYTHON=ON \
    -DPKG_COMPRESS=ON \
    -DPKG_OPENMP=ON \
    -DPKG_GPU=ON \
    -DGPU_API=cuda \
    -DGPU_ARCH=$LAMMPS_GPU_ARCH \
    -DBIN2C=$CUDA_PATH/bin/bin2c \
    -DCMAKE_PREFIX_PATH="$deepmd_root;$CONDA_PREFIX;$TORCH_CMAKE_DIR" ../cmake

make -j && make install

# test
$deepmd_root/bin/lmp -h
```

# 4. DPGEN2

```shell
# alias conda="mamba"
export software="$HOME/Software"
cd $software
git clone git@github.com:OutisLi/dpgen2.git
cd dpgen2 && conda activate dpmd && pip install uv dpdispatcher && uv pip install -e .
-v +``` diff --git a/examples/water/.gitignore b/examples/water/.gitignore index 44cec7d508..e5704038b7 100644 --- a/examples/water/.gitignore +++ b/examples/water/.gitignore @@ -5,6 +5,7 @@ tab.xvg # for training dirs *.out *.pb +*.hdf5 out.json model.ckpt* checkpoint diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index d356d4cba6..6aa8469c4a 100644 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -2,6 +2,9 @@ cmake_minimum_required(VERSION 3.25.2) project(DeePMD) +# generate compile_commands.json +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + option(ENABLE_TENSORFLOW "Enable TensorFlow interface" OFF) option(ENABLE_PYTORCH "Enable PyTorch interface" OFF) option(ENABLE_JAX "Enable JAX interface" OFF) diff --git a/source/lmp/builtin.cmake b/source/lmp/builtin.cmake index e051e5c24a..e92468370b 100644 --- a/source/lmp/builtin.cmake +++ b/source/lmp/builtin.cmake @@ -57,6 +57,7 @@ configure_file("${CMAKE_CURRENT_LIST_DIR}/deepmd_version.h.in" file(GLOB DEEPMD_LMP_SRC ${CMAKE_CURRENT_LIST_DIR}/*.cpp) +find_package(Torch REQUIRED) find_package(DeePMD REQUIRED) target_sources( lammps diff --git a/source/tests/consistent/test_learning_rate.py b/source/tests/consistent/test_learning_rate.py index 5767f3165e..59ad6741af 100644 --- a/source/tests/consistent/test_learning_rate.py +++ b/source/tests/consistent/test_learning_rate.py @@ -42,33 +42,49 @@ "stop_lr": 1e-8, "decay_steps": 1000, "stop_steps": 1000000, + "warmup_steps": 10000, }, { "type": "cosine", "start_lr": 1e-3, "stop_lr": 1e-8, - "decay_steps": 1000, "stop_steps": 1000000, + "warmup_steps": 10000, }, ), ) class TestLearningRateConsistent(unittest.TestCase): + """Test learning rate consistency across different array backends.""" + def setUp(self) -> None: (lr_param,) = self.param self.lr = BaseLR(**lr_param) self.step = 500000 self.ref = self.lr.value(self.step) + self.warmup_step = None + self.warmup_ref = None + if self.lr.warmup_steps > 0: + self.warmup_step = self.lr.warmup_steps // 2 + self.warmup_ref = self.lr.value(self.warmup_step) def compare_test_with_ref(self, step: Array) -> None: test = self.lr.value(step) np.testing.assert_allclose(self.ref, to_numpy_array(test), atol=1e-10) + def compare_test_with_warmup_ref(self, step: Array) -> None: + if self.warmup_ref is None: + self.skipTest("warmup not enabled") + test = self.lr.value(step) + np.testing.assert_allclose(self.warmup_ref, to_numpy_array(test), atol=1e-10) + def compare_numpy_with_ref(self, step: Array) -> None: self.compare_test_with_ref(np.asarray(step)) @unittest.skipUnless(INSTALLED_PT, "PyTorch is not installed") def test_pt_consistent_with_ref(self) -> None: self.compare_test_with_ref(to_torch_tensor(self.step)) + if self.warmup_step is not None: + self.compare_test_with_warmup_ref(to_torch_tensor(self.warmup_step)) @unittest.skipUnless( INSTALLED_ARRAY_API_STRICT, "array_api_strict is not installed" @@ -78,7 +94,11 @@ def test_pt_consistent_with_ref(self) -> None: ) def test_array_api_strict(self) -> None: self.compare_test_with_ref(xp.asarray(self.step)) + if self.warmup_step is not None: + self.compare_test_with_warmup_ref(xp.asarray(self.warmup_step)) @unittest.skipUnless(INSTALLED_JAX, "JAX is not installed") def test_jax_consistent_with_ref(self) -> None: self.compare_test_with_ref(jnp.array(self.step)) + if self.warmup_step is not None: + self.compare_test_with_warmup_ref(jnp.array(self.warmup_step)) diff --git a/source/tests/pd/model/test_model.py b/source/tests/pd/model/test_model.py index e619171e44..848f2dfa47 100644 --- 
a/source/tests/pd/model/test_model.py +++ b/source/tests/pd/model/test_model.py @@ -49,7 +49,7 @@ DeepmdDataSystem, ) from deepmd.tf.utils.learning_rate import ( - LearningRateExp, + LearningRateSchedule, ) from ..test_finetune import ( @@ -226,8 +226,13 @@ def _get_dp_loss(self): ) def _get_dp_lr(self): - return LearningRateExp( - start_lr=self.start_lr, stop_lr=self.stop_lr, decay_steps=self.decay_steps + return LearningRateSchedule( + { + "type": "exp", + "start_lr": self.start_lr, + "stop_lr": self.stop_lr, + "decay_steps": self.decay_steps, + } ) def _get_dp_placeholders(self, dataset): diff --git a/source/tests/pd/test_lr.py b/source/tests/pd/test_lr.py index 9607f982fd..bd4c1a4ea1 100644 --- a/source/tests/pd/test_lr.py +++ b/source/tests/pd/test_lr.py @@ -9,8 +9,8 @@ from deepmd.dpmodel.utils.learning_rate import ( LearningRateExp, ) -from deepmd.tf.utils import ( - learning_rate, +from deepmd.tf.utils.learning_rate import ( + LearningRateSchedule, ) @@ -18,7 +18,8 @@ class TestLearningRate(unittest.TestCase): def setUp(self): self.start_lr = 0.001 self.stop_lr = 3.51e-8 - self.decay_steps = np.arange(400, 601, 100) + # decay_steps must not exceed stop_steps + self.decay_steps = np.arange(400, 501, 100) self.stop_steps = np.arange(500, 1600, 500) def test_consistency(self): @@ -30,8 +31,13 @@ def test_consistency(self): self.decay_rate_pd() def judge_it(self): - base_lr = learning_rate.LearningRateExp( - self.start_lr, self.stop_lr, self.decay_step + base_lr = LearningRateSchedule( + { + "type": "exp", + "start_lr": self.start_lr, + "stop_lr": self.stop_lr, + "decay_steps": self.decay_step, + } ) g = tf.Graph() with g.as_default(): @@ -39,7 +45,10 @@ def judge_it(self): t_lr = base_lr.build(global_step, self.stop_step) my_lr = LearningRateExp( - self.start_lr, self.stop_lr, self.decay_step, self.stop_step + start_lr=self.start_lr, + stop_lr=self.stop_lr, + decay_steps=self.decay_step, + stop_steps=self.stop_step, ) with tf.Session(graph=g) as sess: base_vals = [ @@ -57,28 +66,34 @@ def judge_it(self): def decay_rate_pd(self): my_lr = LearningRateExp( - self.start_lr, self.stop_lr, self.decay_step, self.stop_step + start_lr=self.start_lr, + stop_lr=self.stop_lr, + decay_steps=self.decay_step, + stop_steps=self.stop_step, ) default_ds = 100 if self.stop_step // 10 > 100 else self.stop_step // 100 + 1 - if self.decay_step >= self.stop_step: - self.decay_step = default_ds + # Use local variable to avoid modifying instance state + decay_step_for_rate = self.decay_step + if decay_step_for_rate >= self.stop_step: + decay_step_for_rate = default_ds decay_rate = np.exp( - np.log(self.stop_lr / self.start_lr) / (self.stop_step / self.decay_step) + np.log(self.stop_lr / self.start_lr) + / (self.stop_step / decay_step_for_rate) ) my_lr_decay = LearningRateExp( - self.start_lr, - 1e-10, - self.decay_step, - self.stop_step, + start_lr=self.start_lr, + stop_lr=1e-10, + decay_steps=self.decay_step, + stop_steps=self.stop_step, decay_rate=decay_rate, ) min_lr = 1e-5 my_lr_decay_trunc = LearningRateExp( - self.start_lr, - min_lr, - self.decay_step, - self.stop_step, + start_lr=self.start_lr, + stop_lr=min_lr, + decay_steps=self.decay_step, + stop_steps=self.stop_step, decay_rate=decay_rate, ) my_vals = [ diff --git a/source/tests/pt/model/test_model.py b/source/tests/pt/model/test_model.py index eee0e9beef..501d607cc3 100644 --- a/source/tests/pt/model/test_model.py +++ b/source/tests/pt/model/test_model.py @@ -49,7 +49,7 @@ DeepmdDataSystem, ) from deepmd.tf.utils.learning_rate import ( - 
LearningRateExp, + LearningRateSchedule, ) from ..test_finetune import ( @@ -226,8 +226,13 @@ def _get_dp_loss(self): ) def _get_dp_lr(self): - return LearningRateExp( - start_lr=self.start_lr, stop_lr=self.stop_lr, decay_steps=self.decay_steps + return LearningRateSchedule( + { + "type": "exp", + "start_lr": self.start_lr, + "stop_lr": self.stop_lr, + "decay_steps": self.decay_steps, + } ) def _get_dp_placeholders(self, dataset): diff --git a/source/tests/pt/test_lr.py b/source/tests/pt/test_lr.py index 75f663f041..4e226d54ba 100644 --- a/source/tests/pt/test_lr.py +++ b/source/tests/pt/test_lr.py @@ -10,8 +10,8 @@ LearningRateCosine, LearningRateExp, ) -from deepmd.tf.utils import ( - learning_rate, +from deepmd.tf.utils.learning_rate import ( + LearningRateSchedule, ) @@ -19,7 +19,8 @@ class TestLearningRate(unittest.TestCase): def setUp(self) -> None: self.start_lr = 0.001 self.stop_lr = 3.51e-8 - self.decay_steps = np.arange(400, 601, 100) + # decay_steps must not exceed stop_steps + self.decay_steps = np.arange(400, 501, 100) self.stop_steps = np.arange(500, 1600, 500) def test_consistency(self) -> None: @@ -31,8 +32,13 @@ def test_consistency(self) -> None: self.decay_rate_pt() def judge_it(self) -> None: - base_lr = learning_rate.LearningRateExp( - self.start_lr, self.stop_lr, self.decay_step + base_lr = LearningRateSchedule( + { + "type": "exp", + "start_lr": self.start_lr, + "stop_lr": self.stop_lr, + "decay_steps": self.decay_step, + } ) g = tf.Graph() with g.as_default(): @@ -40,7 +46,10 @@ def judge_it(self) -> None: t_lr = base_lr.build(global_step, self.stop_step) my_lr = LearningRateExp( - self.start_lr, self.stop_lr, self.decay_step, self.stop_step + start_lr=self.start_lr, + stop_lr=self.stop_lr, + decay_steps=self.decay_step, + stop_steps=self.stop_step, ) with tf.Session(graph=g) as sess: base_vals = [ @@ -58,28 +67,34 @@ def judge_it(self) -> None: def decay_rate_pt(self) -> None: my_lr = LearningRateExp( - self.start_lr, self.stop_lr, self.decay_step, self.stop_step + start_lr=self.start_lr, + stop_lr=self.stop_lr, + decay_steps=self.decay_step, + stop_steps=self.stop_step, ) default_ds = 100 if self.stop_step // 10 > 100 else self.stop_step // 100 + 1 - if self.decay_step >= self.stop_step: - self.decay_step = default_ds + # Use local variable to avoid modifying instance state + decay_step_for_rate = self.decay_step + if decay_step_for_rate >= self.stop_step: + decay_step_for_rate = default_ds decay_rate = np.exp( - np.log(self.stop_lr / self.start_lr) / (self.stop_step / self.decay_step) + np.log(self.stop_lr / self.start_lr) + / (self.stop_step / decay_step_for_rate) ) my_lr_decay = LearningRateExp( - self.start_lr, - 1e-10, - self.decay_step, - self.stop_step, + start_lr=self.start_lr, + stop_lr=1e-10, + decay_steps=self.decay_step, + stop_steps=self.stop_step, decay_rate=decay_rate, ) min_lr = 1e-5 my_lr_decay_trunc = LearningRateExp( - self.start_lr, - min_lr, - self.decay_step, - self.stop_step, + start_lr=self.start_lr, + stop_lr=min_lr, + decay_steps=self.decay_step, + stop_steps=self.stop_step, decay_rate=decay_rate, ) my_vals = [ @@ -108,7 +123,11 @@ def test_basic_curve(self) -> None: start_lr = 1.0 stop_lr = 0.1 stop_steps = 10 - lr = LearningRateCosine(start_lr, stop_lr, stop_steps) + lr = LearningRateCosine( + start_lr=start_lr, + stop_lr=stop_lr, + stop_steps=stop_steps, + ) self.assertTrue(np.allclose(lr.value(0), start_lr)) self.assertTrue(np.allclose(lr.value(stop_steps), stop_lr)) diff --git a/source/tests/tf/test_lr.py b/source/tests/tf/test_lr.py 
new file mode 100644 index 0000000000..44e3eb749c --- /dev/null +++ b/source/tests/tf/test_lr.py @@ -0,0 +1,114 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +"""Tests for TensorFlow learning rate schedule wrapper. + +This module tests the TF-specific wrapper logic only. +Core learning rate algorithms are tested in dpmodel tests. +""" + +import unittest + +import numpy as np + +from deepmd.dpmodel.utils.learning_rate import ( + LearningRateExp, +) +from deepmd.tf.env import ( + tf, +) +from deepmd.tf.utils.learning_rate import ( + LearningRateSchedule, +) + + +class TestLearningRateScheduleValidation(unittest.TestCase): + """Test TF wrapper validation and error handling.""" + + def test_missing_start_lr(self) -> None: + """Test that missing start_lr raises ValueError.""" + with self.assertRaises(ValueError) as cm: + LearningRateSchedule({"type": "exp", "stop_lr": 1e-5}) + self.assertIn("start_lr", str(cm.exception)) + + def test_value_before_build(self) -> None: + """Test that calling value() before build() raises RuntimeError.""" + lr_schedule = LearningRateSchedule({"start_lr": 1e-3}) + with self.assertRaises(RuntimeError) as cm: + lr_schedule.value(100) + self.assertIn("not built", str(cm.exception)) + + def test_base_lr_before_build(self) -> None: + """Test that accessing base_lr before build() raises RuntimeError.""" + lr_schedule = LearningRateSchedule({"start_lr": 1e-3}) + with self.assertRaises(RuntimeError) as cm: + _ = lr_schedule.base_lr + self.assertIn("not built", str(cm.exception)) + + +class TestLearningRateScheduleBuild(unittest.TestCase): + """Test TF tensor building and integration.""" + + def test_build_returns_tensor(self) -> None: + """Test that build() returns a float32 TF tensor.""" + lr_schedule = LearningRateSchedule({"start_lr": 1e-3, "stop_lr": 1e-5}) + global_step = tf.constant(0, dtype=tf.int64) + lr_tensor = lr_schedule.build(global_step, stop_steps=10000) + + self.assertIsInstance(lr_tensor, tf.Tensor) + self.assertEqual(lr_tensor.dtype, tf.float32) + + def test_default_type_exp(self) -> None: + """Test that default type is 'exp' when not specified.""" + lr_schedule = LearningRateSchedule({"start_lr": 1e-3, "stop_lr": 1e-5}) + global_step = tf.constant(0, dtype=tf.int64) + lr_schedule.build(global_step, stop_steps=10000) + + self.assertIsInstance(lr_schedule.base_lr, LearningRateExp) + + def test_tensor_value_matches_base_lr(self) -> None: + """Test that TF tensor value matches BaseLR.value().""" + lr_schedule = LearningRateSchedule( + { + "start_lr": 1e-3, + "stop_lr": 1e-5, + "type": "exp", + "decay_steps": 1000, + } + ) + test_step = 5000 + global_step = tf.constant(test_step, dtype=tf.int64) + lr_schedule.build(global_step, stop_steps=10000) + + # Use value() method which works in both graph and eager mode + # This indirectly verifies tensor computation matches BaseLR + tensor_value = lr_schedule.value(test_step) + base_lr_value = lr_schedule.base_lr.value(test_step) + + np.testing.assert_allclose(tensor_value, base_lr_value, rtol=1e-10) + + def test_start_lr_accessor(self) -> None: + """Test start_lr() accessor returns correct value.""" + lr_schedule = LearningRateSchedule({"start_lr": 1e-3}) + self.assertEqual(lr_schedule.start_lr(), 1e-3) + + def test_value_after_build(self) -> None: + """Test value() works correctly after build().""" + lr_schedule = LearningRateSchedule( + { + "start_lr": 1e-3, + "stop_lr": 1e-5, + "type": "exp", + "decay_steps": 1000, + } + ) + global_step = tf.constant(0, dtype=tf.int64) + lr_schedule.build(global_step, 
stop_steps=10000) + + # value() should work after build + lr_value = lr_schedule.value(5000) + expected = lr_schedule.base_lr.value(5000) + + np.testing.assert_allclose(lr_value, expected, rtol=1e-10) + + +if __name__ == "__main__": + unittest.main() diff --git a/source/tests/universal/dpmodel/utils/test_learning_rate.py b/source/tests/universal/dpmodel/utils/test_learning_rate.py new file mode 100644 index 0000000000..408300696a --- /dev/null +++ b/source/tests/universal/dpmodel/utils/test_learning_rate.py @@ -0,0 +1,240 @@ +# SPDX-License-Identifier: LGPL-3.0-or-later +import unittest + +import numpy as np + +from deepmd.dpmodel.common import ( + to_numpy_array, +) +from deepmd.dpmodel.utils.learning_rate import ( + LearningRateCosine, + LearningRateExp, +) + + +class TestLearningRateExpBasic(unittest.TestCase): + """Test basic exponential decay learning rate functionality.""" + + def test_basic_decay(self) -> None: + """Test basic exponential decay without warmup.""" + lr = LearningRateExp( + start_lr=1e-3, + stop_lr=1e-5, + stop_steps=10000, + decay_steps=5000, + ) + np.testing.assert_allclose(lr.value(0), 1e-3, rtol=1e-10) + np.testing.assert_allclose(lr.value(10000), 1e-5, rtol=1e-5) + + def test_stop_ratio(self) -> None: + """Test stop_ratio parameter.""" + lr = LearningRateExp( + start_lr=1e-3, + stop_ratio=0.01, + stop_steps=10000, + decay_steps=5000, + ) + np.testing.assert_allclose(lr.stop_lr, 1e-5, rtol=1e-10) + np.testing.assert_allclose(lr.value(10000), 1e-5, rtol=1e-5) + + def test_decay_rate_override(self) -> None: + """Test explicit decay_rate parameter.""" + lr = LearningRateExp( + start_lr=1e-3, + stop_lr=1e-5, + stop_steps=10000, + decay_steps=1000, + decay_rate=0.9, + ) + self.assertEqual(lr.decay_rate, 0.9) + np.testing.assert_allclose(lr.value(1000), 1e-3 * 0.9, rtol=1e-10) + + +class TestLearningRateCosineBasic(unittest.TestCase): + """Test basic cosine annealing learning rate functionality.""" + + def test_basic_cosine(self) -> None: + """Test basic cosine annealing without warmup.""" + lr = LearningRateCosine( + start_lr=1e-3, + stop_lr=1e-5, + stop_steps=10000, + ) + np.testing.assert_allclose(lr.value(0), 1e-3, rtol=1e-10) + np.testing.assert_allclose(lr.value(10000), 1e-5, rtol=1e-10) + np.testing.assert_allclose(lr.value(5000), (1e-3 + 1e-5) / 2, rtol=1e-5) + + def test_stop_ratio(self) -> None: + """Test stop_ratio parameter.""" + lr = LearningRateCosine( + start_lr=1e-3, + stop_ratio=0.01, + stop_steps=10000, + ) + np.testing.assert_allclose(lr.stop_lr, 1e-5, rtol=1e-10) + + +class TestLearningRateWarmup(unittest.TestCase): + """Test learning rate warmup functionality.""" + + def test_warmup_steps_exp(self) -> None: + """Test warmup with exponential decay.""" + lr = LearningRateExp( + start_lr=1e-3, + stop_lr=1e-5, + stop_steps=10000, + decay_steps=1000, + warmup_steps=1000, + ) + self.assertEqual(lr.decay_stop_steps, 9000) + np.testing.assert_allclose(lr.value(0), 0.0, rtol=1e-10) + np.testing.assert_allclose(lr.value(500), 0.5e-3, rtol=1e-10) + np.testing.assert_allclose(lr.value(1000), 1e-3, rtol=1e-10) + self.assertLess(to_numpy_array(lr.value(2000)), 1e-3) + + def test_warmup_steps_cosine(self) -> None: + """Test warmup with cosine annealing.""" + lr = LearningRateCosine( + start_lr=1e-3, + stop_lr=1e-5, + stop_steps=10000, + warmup_steps=1000, + ) + self.assertEqual(lr.decay_stop_steps, 9000) + np.testing.assert_allclose(lr.value(0), 0.0, rtol=1e-10) + np.testing.assert_allclose(lr.value(1000), 1e-3, rtol=1e-10) + 
np.testing.assert_allclose(lr.value(10000), 1e-5, rtol=1e-10) + + def test_warmup_ratio(self) -> None: + """Test warmup_ratio parameter.""" + lr = LearningRateExp( + start_lr=1e-3, + stop_lr=1e-5, + stop_steps=10000, + decay_steps=1000, + warmup_ratio=0.1, + ) + self.assertEqual(lr.warmup_steps, 1000) + self.assertEqual(lr.decay_stop_steps, 9000) + + def test_warmup_start_factor(self) -> None: + """Test warmup_start_factor parameter.""" + lr = LearningRateExp( + start_lr=1e-3, + stop_lr=1e-5, + stop_steps=10000, + decay_steps=1000, + warmup_steps=1000, + warmup_start_factor=0.1, + ) + np.testing.assert_allclose(lr.value(0), 0.1e-3, rtol=1e-10) + np.testing.assert_allclose(lr.value(1000), 1e-3, rtol=1e-10) + + def test_no_warmup(self) -> None: + """Test that warmup_steps=0 works correctly.""" + lr = LearningRateExp( + start_lr=1e-3, + stop_lr=1e-5, + stop_steps=10000, + decay_steps=5000, + warmup_steps=0, + ) + self.assertEqual(lr.warmup_steps, 0) + self.assertEqual(lr.decay_stop_steps, 10000) + np.testing.assert_allclose(lr.value(0), 1e-3, rtol=1e-10) + + +class TestLearningRateArrayInput(unittest.TestCase): + """Test learning rate with array inputs for JIT compatibility.""" + + def test_array_input_exp(self) -> None: + """Test exponential decay with array input.""" + lr = LearningRateExp( + start_lr=1e-3, + stop_lr=1e-5, + stop_steps=10000, + decay_steps=5000, + warmup_steps=1000, + ) + steps = np.array([0, 500, 1000, 5000, 10000]) + lrs = lr.value(steps) + self.assertEqual(lrs.shape, (5,)) + np.testing.assert_allclose(lrs[0], 0.0, rtol=1e-10) + np.testing.assert_allclose(lrs[2], 1e-3, rtol=1e-10) + + def test_array_input_cosine(self) -> None: + """Test cosine annealing with array input.""" + lr = LearningRateCosine( + start_lr=1e-3, + stop_lr=1e-5, + stop_steps=10000, + warmup_steps=1000, + ) + steps = np.array([0, 1000, 5500, 10000]) + lrs = lr.value(steps) + self.assertEqual(lrs.shape, (4,)) + np.testing.assert_allclose(lrs[0], 0.0, rtol=1e-10) + np.testing.assert_allclose(lrs[1], 1e-3, rtol=1e-10) + np.testing.assert_allclose(lrs[3], 1e-5, rtol=1e-10) + + +class TestLearningRateBeyondStopSteps(unittest.TestCase): + """Test learning rate behavior beyond stop_steps.""" + + def test_exp_beyond_stop_steps(self) -> None: + """Test exponential decay clamps to stop_lr.""" + lr = LearningRateExp( + start_lr=1e-3, + stop_lr=1e-5, + stop_steps=10000, + decay_steps=1000, + ) + np.testing.assert_allclose(lr.value(20000), 1e-5, rtol=1e-10) + + def test_cosine_beyond_stop_steps(self) -> None: + """Test cosine annealing returns stop_lr beyond decay phase.""" + lr = LearningRateCosine( + start_lr=1e-3, + stop_lr=1e-5, + stop_steps=10000, + ) + np.testing.assert_allclose(lr.value(20000), 1e-5, rtol=1e-10) + + +class TestLearningRateValidation(unittest.TestCase): + """Test learning rate parameter validation.""" + + def test_decay_steps_exceeds_decay_total_without_warmup(self) -> None: + """Test that decay_steps > stop_steps raises ValueError.""" + with self.assertRaises(ValueError) as cm: + LearningRateExp( + start_lr=1e-3, + stop_lr=1e-5, + stop_steps=500, + decay_steps=600, + ) + self.assertIn("decay_steps", str(cm.exception)) + self.assertIn("exceed", str(cm.exception)) + + def test_decay_steps_exceeds_decay_total_with_warmup(self) -> None: + """Test that decay_steps > (stop_steps - warmup_steps) raises ValueError.""" + with self.assertRaises(ValueError) as cm: + LearningRateExp( + start_lr=1e-3, + stop_lr=1e-5, + stop_steps=1000, + decay_steps=900, + warmup_steps=200, # decay_total = 800 + ) + 
self.assertIn("decay_steps", str(cm.exception)) + + def test_decay_steps_equals_decay_total_allowed(self) -> None: + """Test that decay_steps == decay_total is allowed (boundary case).""" + # Should not raise + lr = LearningRateExp( + start_lr=1e-3, + stop_lr=1e-5, + stop_steps=500, + decay_steps=500, + ) + self.assertEqual(lr.decay_steps, 500)