diff --git a/.github/workflows/build_test_cmake.yml b/.github/workflows/build_test_cmake.yml index 8938a1ede6..0cfaeee60f 100644 --- a/.github/workflows/build_test_cmake.yml +++ b/.github/workflows/build_test_cmake.yml @@ -17,10 +17,10 @@ jobs: name: "Build with Intel toolchain" - tag: gnu - build_args: "-DENABLE_LIBXC=1 -DENABLE_DEEPKS=1 -DENABLE_LIBRI=1 -DENABLE_PAW=1" + build_args: "-DENABLE_LIBXC=1 -DENABLE_DEEPKS=1 -DENABLE_MLKEDF=1 -DENABLE_LIBRI=1 -DENABLE_PAW=1" name: "Build extra components with GNU toolchain" - tag: intel - build_args: "-DENABLE_LIBXC=1 -DENABLE_DEEPKS=1 -DENABLE_LIBRI=1" + build_args: "-DENABLE_LIBXC=1 -DENABLE_DEEPKS=1 -DENABLE_MLKEDF=1 -DENABLE_LIBRI=1" name: "Build extra components with Intel toolchain" - tag: cuda diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 2f9996e070..3e0b14b744 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -31,7 +31,7 @@ jobs: - name: Configure run: | - cmake -B build -DBUILD_TESTING=ON -DENABLE_DEEPKS=ON -DENABLE_LIBXC=ON -DENABLE_LIBRI=ON -DENABLE_PAW=ON -DENABLE_GOOGLEBENCH=ON -DENABLE_RAPIDJSON=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=1 + cmake -B build -DBUILD_TESTING=ON -DENABLE_DEEPKS=ON -DENABLE_MLKEDF=ON -DENABLE_LIBXC=ON -DENABLE_LIBRI=ON -DENABLE_PAW=ON -DENABLE_GOOGLEBENCH=ON -DENABLE_RAPIDJSON=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=1 - uses: pre-commit/action@v3.0.1 with: diff --git a/CMakeLists.txt b/CMakeLists.txt index 4701f76d75..7d6e74f898 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,6 +12,7 @@ project( option(ENABLE_LCAO "Enable LCAO calculation." ON) option(ENABLE_DEEPKS "Enable DeePKS functionality" OFF) +option(ENABLE_MLKEDF "Enable Machine Learning based KEDF for OFDFT" OFF) option(ENABLE_LIBXC "Enable LibXC functionality" OFF) option(USE_CUDA "Enable support to CUDA for ABACUS." OFF) option(ENABLE_FLOAT_FFTW "Enable support to single precision FFTW library." OFF) @@ -469,10 +470,29 @@ if(ENABLE_DEEPKS) add_compile_definitions(__DEEPKS) endif() +if(ENABLE_MLKEDF) + target_link_libraries(${ABACUS_BIN_NAME} hamilt_mlkedf) + + find_path(libnpy_SOURCE_DIR npy.hpp HINTS ${libnpy_INCLUDE_DIR}) + if(NOT libnpy_SOURCE_DIR) + include(FetchContent) + FetchContent_Declare( + libnpy + GIT_REPOSITORY https://github.com/llohse/libnpy.git + GIT_SHALLOW TRUE + GIT_PROGRESS TRUE) + FetchContent_MakeAvailable(libnpy) + else() + include_directories(${libnpy_INCLUDE_DIR}) + endif() + include_directories(${libnpy_SOURCE_DIR}/include) + add_compile_definitions(__MLKEDF) +endif() + # Torch uses outdated components to detect CUDA arch, causing failure on # latest CUDA kits. Set CMake variable TORCH_CUDA_ARCH_LIST in the form of # "major.minor" if required. -if(ENABLE_DEEPKS OR DEFINED Torch_DIR) +if(ENABLE_DEEPKS OR ENABLE_MLKEDF OR DEFINED Torch_DIR) find_package(Torch REQUIRED) if(NOT Torch_VERSION VERSION_LESS "2.1.0") set_if_higher(CMAKE_CXX_STANDARD 17) diff --git a/docs/CITATIONS.md b/docs/CITATIONS.md index d54627292d..406fc91205 100644 --- a/docs/CITATIONS.md +++ b/docs/CITATIONS.md @@ -33,3 +33,9 @@ The following references are required to be cited when using ABACUS. Specificall Peize Lin, Xinguo Ren, and Lixin He. "Efficient Hybrid Density Functional Calculations for Large Periodic Systems Using Numerical Atomic Orbitals." Journal of Chemical Theory and Computation 2021, 17(1), 222–239. Peize Lin, Xinguo Ren, and Lixin He. "Accuracy of Localized Resolution of the Identity in Periodic Hybrid Functional Calculations with Numerical Atomic Orbitals." 
Journal of Physical Chemistry Letters 2020, 11, 3082-3088.
+
+- **If ML-KEDF is used:**
+
+  Sun, Liang, and Mohan Chen. "Machine learning based nonlocal kinetic energy density functional for simple metals and alloys." Physical Review B 109.11 (2024): 115135.
+
+  Sun, Liang, and Mohan Chen. "Multi-channel machine learning based nonlocal kinetic energy density functional for semiconductors." Electronic Structure 6.4 (2024): 045006.
diff --git a/docs/advanced/input_files/input-main.md b/docs/advanced/input_files/input-main.md
index 7eb1b382a0..b201919adb 100644
--- a/docs/advanced/input_files/input-main.md
+++ b/docs/advanced/input_files/input-main.md
@@ -209,6 +209,36 @@
  - [of\_kernel\_file](#of_kernel_file)
  - [of\_full\_pw](#of_full_pw)
  - [of\_full\_pw\_dim](#of_full_pw_dim)
+  - [ML-KEDF: machine learning based kinetic energy density functional for OFDFT](#ml-kedf-machine-learning-based-kinetic-energy-density-functional-for-ofdft)
+    - [of\_ml\_gene\_data](#of_ml_gene_data)
+    - [of\_ml\_device](#of_ml_device)
+    - [of\_ml\_feg](#of_ml_feg)
+    - [of\_ml\_nkernel](#of_ml_nkernel)
+    - [of\_ml\_kernel](#of_ml_kernel)
+    - [of\_ml\_kernel\_scaling](#of_ml_kernel_scaling)
+    - [of\_ml\_yukawa\_alpha](#of_ml_yukawa_alpha)
+    - [of\_ml\_kernel\_file](#of_ml_kernel_file)
+    - [of\_ml\_gamma](#of_ml_gamma)
+    - [of\_ml\_p](#of_ml_p)
+    - [of\_ml\_q](#of_ml_q)
+    - [of\_ml\_tanhp](#of_ml_tanhp)
+    - [of\_ml\_tanhq](#of_ml_tanhq)
+    - [of\_ml\_gammanl](#of_ml_gammanl)
+    - [of\_ml\_pnl](#of_ml_pnl)
+    - [of\_ml\_qnl](#of_ml_qnl)
+    - [of\_ml\_xi](#of_ml_xi)
+    - [of\_ml\_tanhxi](#of_ml_tanhxi)
+    - [of\_ml\_tanhxi\_nl](#of_ml_tanhxi_nl)
+    - [of\_ml\_tanh\_pnl](#of_ml_tanh_pnl)
+    - [of\_ml\_tanh\_qnl](#of_ml_tanh_qnl)
+    - [of\_ml\_tanhp\_nl](#of_ml_tanhp_nl)
+    - [of\_ml\_tanhq\_nl](#of_ml_tanhq_nl)
+    - [of\_ml\_chi\_p](#of_ml_chi_p)
+    - [of\_ml\_chi\_q](#of_ml_chi_q)
+    - [of\_ml\_chi\_xi](#of_ml_chi_xi)
+    - [of\_ml\_chi\_pnl](#of_ml_chi_pnl)
+    - [of\_ml\_chi\_qnl](#of_ml_chi_qnl)
+    - [of\_ml\_local\_test](#of_ml_local_test)
  - [Electric field and dipole correction](#electric-field-and-dipole-correction)
    - [efield\_flag](#efield_flag)
    - [dip\_cor\_flag](#dip_cor_flag)
@@ -2077,11 +2107,18 @@
 - **Type**: String
 - **Availability**: OFDFT
 - **Description**: The type of KEDF (kinetic energy density functional).
+
+  Analytical functionals:
   - **wt**: Wang-Teter.
   - **tf**: Thomas-Fermi.
   - **vw**: von Weizsäcker.
-  - **tf+**: TF$\rm{\lambda}$vW, the parameter $\rm{\lambda}$ can be set by `of_vw_weight`.
+  - **tf+**: TF $\rm{\lambda}$ vW, the parameter $\rm{\lambda}$ can be set by `of_vw_weight`.
   - **lkt**: Luo-Karasiev-Trickey.
+
+  Machine learning (ML) based functionals:
+  - **ml**: ML-based KEDF allows for greater flexibility, enabling users to set the related ML model parameters themselves. See [ML-KEDF: machine learning based kinetic energy density functional for OFDFT](#ml-kedf-machine-learning-based-kinetic-energy-density-functional-for-ofdft).
+  - **mpn**: ML-based Physically-constrained Non-local (MPN) KEDF. ABACUS automatically configures the necessary parameters, requiring no manual intervention from the user.
+  - **cpn5**: Multi-Channel MPN (CPN) KEDF with 5 channels. Similar to mpn, ABACUS handles all parameter settings automatically.
 - **Default**: wt
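+
+  As an illustration, a minimal OFDFT `INPUT` fragment selecting the MPN KEDF might look like the following sketch (hypothetical, untested values; `of_method` is described below):
+
+  ```
+  calculation   scf
+  esolver_type  ofdft
+  of_kinetic    mpn
+  of_method     tn
+  ```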
### of_method
@@ -2212,6 +2249,221 @@
Warning: this function is not robust enough for the current version. Please try
[back to top](#full-list-of-input-keywords)
+
+## ML-KEDF: machine learning based kinetic energy density functional for OFDFT
+
+### of_ml_gene_data
+
+- **Type**: Boolean
+- **Availability**: OFDFT
+- **Description**: Whether to generate training data.
+- **Default**: False
+
+### of_ml_device
+
+- **Type**: String
+- **Availability**: OFDFT
+- **Description**: Run the neural network on CPU or GPU.
+  - **cpu**: CPU
+  - **gpu**: GPU
+- **Default**: cpu
+
+### of_ml_feg
+
+- **Type**: Integer
+- **Availability**: OFDFT
+- **Description**: The method used to incorporate the Free Electron Gas (FEG) limit, $F_\theta |_{\rm{FEG}} = 1$, where $F_\theta$ is the enhancement factor of the Pauli energy.
+  - **0**: Do not incorporate the FEG limit.
+  - **1**: Incorporate the FEG limit by translation: $F_\theta = F^{\rm{NN}}_\theta - F^{\rm{NN}}_\theta|_{\rm{FEG}} + 1$.
+  - **3**: Incorporate the FEG limit by a nonlinear transformation: $F_\theta = f(F^{\rm{NN}}_\theta - F^{\rm{NN}}_\theta|_{\rm{FEG}} + \ln(e - 1))$, where $f(x) = \ln(1 + e^x)$ is the softplus function, satisfying $f(\ln(e-1)) = 1$.
+- **Default**: 0
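+
+  As a quick check of the constant in method **3**: at the FEG limit the softplus argument reduces to $\ln(e-1)$, and
+
+  $$f(\ln(e-1)) = \ln\left(1 + e^{\ln(e-1)}\right) = \ln(1 + e - 1) = \ln e = 1,$$
+
+  so $F_\theta|_{\rm{FEG}} = 1$ holds exactly; this constant ($\ln(e-1) \approx 0.5413$) appears as `feg3_correct` in the implementation.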
+
+### of_ml_nkernel
+
+- **Type**: Integer
+- **Availability**: OFDFT
+- **Description**: Number of kernel functions.
+- **Default**: 1
+
+### of_ml_kernel
+
+- **Type**: Vector of Integer
+- **Availability**: OFDFT
+- **Description**: Containing nkernel (see [of_ml_nkernel](#of_ml_nkernel)) elements. The $i$-th element specifies the type of the $i$-th kernel function.
+  - **1**: Wang-Teter kernel function.
+  - **2**: Modified Yukawa function: $k_{\rm{F}}^2\frac{\exp{({-\alpha k_{\rm{F}}|\mathbf{r}-\mathbf{r}'|})}}{|\mathbf{r}-\mathbf{r}'|}$, where $\alpha$ is specified by [of_ml_yukawa_alpha](#of_ml_yukawa_alpha).
+  - **3**: Truncated kinetic kernel (TKK); the file containing the TKK is specified by [of_ml_kernel_file](#of_ml_kernel_file).
+- **Default**: 1
+
+### of_ml_kernel_scaling
+
+- **Type**: Vector of Real
+- **Availability**: OFDFT
+- **Description**: Containing nkernel (see [of_ml_nkernel](#of_ml_nkernel)) elements. The $i$-th element specifies the RECIPROCAL of the scaling parameter $\lambda$ of the $i$-th kernel function: $w_i(\mathbf{r}-\mathbf{r}') = \lambda^3 w_i'(\lambda(\mathbf{r}-\mathbf{r}'))$.
+- **Default**: 1.0
+
+### of_ml_yukawa_alpha
+
+- **Type**: Vector of Real
+- **Availability**: OFDFT
+- **Description**: Containing nkernel (see [of_ml_nkernel](#of_ml_nkernel)) elements. The $i$-th element specifies the parameter $\alpha$ of the $i$-th kernel function. ONLY used for the Yukawa kernel function.
+- **Default**: 1.0
+
+### of_ml_kernel_file
+
+- **Type**: Vector of String
+- **Availability**: OFDFT
+- **Description**: Containing nkernel (see [of_ml_nkernel](#of_ml_nkernel)) elements. The $i$-th element specifies the file containing the $i$-th kernel function. ONLY used for the TKK.
+- **Default**: none
+
+### of_ml_gamma
+
+- **Type**: Boolean
+- **Availability**: OFDFT
+- **Description**: Local descriptor: $\gamma(\mathbf{r}) = (\rho(\mathbf{r}) / \rho_0)^{1/3}$.
+- **Default**: False
+
+### of_ml_p
+
+- **Type**: Boolean
+- **Availability**: OFDFT
+- **Description**: Semi-local descriptor: $p(\mathbf{r}) = \frac{|\nabla \rho(\mathbf{r})|^2} {[2 (3 \pi^2)^{1/3} \rho^{4/3}(\mathbf{r})]^2}$.
+- **Default**: False
+
+### of_ml_q
+
+- **Type**: Boolean
+- **Availability**: OFDFT
+- **Description**: Semi-local descriptor: $q(\mathbf{r}) = \frac{\nabla^2 \rho(\mathbf{r})} {[4 (3 \pi^2)^{2/3} \rho^{5/3}(\mathbf{r})]}$.
+- **Default**: False
+
+### of_ml_tanhp
+
+- **Type**: Boolean
+- **Availability**: OFDFT
+- **Description**: Semi-local descriptor: $\tilde{p}(\mathbf{r}) = \tanh(\chi_p p(\mathbf{r}))$.
+- **Default**: False
+
+### of_ml_tanhq
+
+- **Type**: Boolean
+- **Availability**: OFDFT
+- **Description**: Semi-local descriptor: $\tilde{q}(\mathbf{r}) = \tanh(\chi_q q(\mathbf{r}))$.
+- **Default**: False
+
+### of_ml_chi_p
+
+- **Type**: Real
+- **Availability**: OFDFT
+- **Description**: Hyperparameter $\chi_p$: $\tilde{p}(\mathbf{r}) = \tanh(\chi_p p(\mathbf{r}))$.
+- **Default**: 1.0
+
+### of_ml_chi_q
+
+- **Type**: Real
+- **Availability**: OFDFT
+- **Description**: Hyperparameter $\chi_q$: $\tilde{q}(\mathbf{r}) = \tanh(\chi_q q(\mathbf{r}))$.
+- **Default**: 1.0
+
+### of_ml_gammanl
+
+- **Type**: Vector of Integer
+- **Availability**: OFDFT
+- **Description**: Containing nkernel (see [of_ml_nkernel](#of_ml_nkernel)) elements. The $i$-th element controls whether the non-local descriptor defined by the $i$-th kernel function $w_i(\mathbf{r}-\mathbf{r}')$ is used (1) or not (0): $\gamma_{\rm{nl}}(\mathbf{r}) = \int{w_i(\mathbf{r}-\mathbf{r}') \gamma(\mathbf{r}') dr'}$.
+- **Default**: 0
+
+### of_ml_pnl
+
+- **Type**: Vector of Integer
+- **Availability**: OFDFT
+- **Description**: Containing nkernel (see [of_ml_nkernel](#of_ml_nkernel)) elements. The $i$-th element controls whether the non-local descriptor defined by the $i$-th kernel function $w_i(\mathbf{r}-\mathbf{r}')$ is used (1) or not (0): $p_{\rm{nl}}(\mathbf{r}) = \int{w_i(\mathbf{r}-\mathbf{r}') p(\mathbf{r}') dr'}$.
+- **Default**: 0
+
+### of_ml_qnl
+
+- **Type**: Vector of Integer
+- **Availability**: OFDFT
+- **Description**: Containing nkernel (see [of_ml_nkernel](#of_ml_nkernel)) elements. The $i$-th element controls whether the non-local descriptor defined by the $i$-th kernel function $w_i(\mathbf{r}-\mathbf{r}')$ is used (1) or not (0): $q_{\rm{nl}}(\mathbf{r}) = \int{w_i(\mathbf{r}-\mathbf{r}') q(\mathbf{r}') dr'}$.
+- **Default**: 0
+
+### of_ml_xi
+
+- **Type**: Vector of Integer
+- **Availability**: OFDFT
+- **Description**: Containing nkernel (see [of_ml_nkernel](#of_ml_nkernel)) elements. The $i$-th element controls whether the non-local descriptor defined by the $i$-th kernel function $w_i(\mathbf{r}-\mathbf{r}')$ is used (1) or not (0): $\xi(\mathbf{r}) = \frac{\int{w_i(\mathbf{r}-\mathbf{r}') \rho^{1/3}(\mathbf{r}') dr'}}{\rho^{1/3}(\mathbf{r})}$.
+- **Default**: 0
+
+### of_ml_tanhxi
+
+- **Type**: Vector of Integer
+- **Availability**: OFDFT
+- **Description**: Containing nkernel (see [of_ml_nkernel](#of_ml_nkernel)) elements. The $i$-th element controls whether the non-local descriptor defined by the $i$-th kernel function $w_i(\mathbf{r}-\mathbf{r}')$ is used (1) or not (0): $\tilde{\xi}(\mathbf{r}) = \tanh(\chi_{\xi} \xi(\mathbf{r}))$.
+- **Default**: 0
+
+### of_ml_tanhxi_nl
+
+- **Type**: Vector of Integer
+- **Availability**: OFDFT
+- **Description**: Containing nkernel (see [of_ml_nkernel](#of_ml_nkernel)) elements. The $i$-th element controls whether the non-local descriptor defined by the $i$-th kernel function $w_i(\mathbf{r}-\mathbf{r}')$ is used (1) or not (0): $\tilde{\xi}_{\rm{nl}}(\mathbf{r}) = \int{w_i(\mathbf{r}-\mathbf{r}') \tilde{\xi}(\mathbf{r}') dr'}$.
+- **Default**: 0
+
+### of_ml_tanh_pnl
+
+- **Type**: Vector of Integer
+- **Availability**: OFDFT
+- **Description**: Containing nkernel (see [of_ml_nkernel](#of_ml_nkernel)) elements. The $i$-th element controls whether the non-local descriptor defined by the $i$-th kernel function $w_i(\mathbf{r}-\mathbf{r}')$ is used (1) or not (0): $\tilde{p_{\rm{nl}}}(\mathbf{r}) = \tanh{(\chi_{p_{\rm{nl}}} p_{\rm{nl}}(\mathbf{r}))}$.
+- **Default**: 0
+
+### of_ml_tanh_qnl
+
+- **Type**: Vector of Integer
+- **Availability**: OFDFT
+- **Description**: Containing nkernel (see [of_ml_nkernel](#of_ml_nkernel)) elements. The $i$-th element controls whether the non-local descriptor defined by the $i$-th kernel function $w_i(\mathbf{r}-\mathbf{r}')$ is used (1) or not (0): $\tilde{q_{\rm{nl}}}(\mathbf{r}) = \tanh{(\chi_{q_{\rm{nl}}} q_{\rm{nl}}(\mathbf{r}))}$.
+- **Default**: 0
+
+### of_ml_tanhp_nl
+
+- **Type**: Vector of Integer
+- **Availability**: OFDFT
+- **Description**: Containing nkernel (see [of_ml_nkernel](#of_ml_nkernel)) elements. The $i$-th element controls whether the non-local descriptor defined by the $i$-th kernel function $w_i(\mathbf{r}-\mathbf{r}')$ is used (1) or not (0): $\tilde{p}_{\rm{nl}}(\mathbf{r}) = \int{w_i(\mathbf{r}-\mathbf{r}') \tilde{p}(\mathbf{r}') dr'}$.
+- **Default**: 0
+
+### of_ml_tanhq_nl
+
+- **Type**: Vector of Integer
+- **Availability**: OFDFT
+- **Description**: Containing nkernel (see [of_ml_nkernel](#of_ml_nkernel)) elements. The $i$-th element controls whether the non-local descriptor defined by the $i$-th kernel function $w_i(\mathbf{r}-\mathbf{r}')$ is used (1) or not (0): $\tilde{q}_{\rm{nl}}(\mathbf{r}) = \int{w_i(\mathbf{r}-\mathbf{r}') \tilde{q}(\mathbf{r}') dr'}$.
+- **Default**: 0
+
+### of_ml_chi_xi
+
+- **Type**: Vector of Real
+- **Availability**: OFDFT
+- **Description**: Containing nkernel (see [of_ml_nkernel](#of_ml_nkernel)) elements. The $i$-th element specifies the hyperparameter $\chi_\xi$ of the non-local descriptor defined by the $i$-th kernel function $w_i(\mathbf{r}-\mathbf{r}')$: $\tilde{\xi}(\mathbf{r}) = \tanh(\chi_{\xi} \xi(\mathbf{r}))$.
+- **Default**: 1.0
+
+### of_ml_chi_pnl
+
+- **Type**: Vector of Real
+- **Availability**: OFDFT
+- **Description**: Containing nkernel (see [of_ml_nkernel](#of_ml_nkernel)) elements. The $i$-th element specifies the hyperparameter $\chi_{p_{\rm{nl}}}$ of the non-local descriptor defined by the $i$-th kernel function $w_i(\mathbf{r}-\mathbf{r}')$: $\tilde{p_{\rm{nl}}}(\mathbf{r}) = \tanh{(\chi_{p_{\rm{nl}}} p_{\rm{nl}}(\mathbf{r}))}$.
+- **Default**: 1.0
+
+### of_ml_chi_qnl
+
+- **Type**: Vector of Real
+- **Availability**: OFDFT
+- **Description**: Containing nkernel (see [of_ml_nkernel](#of_ml_nkernel)) elements. The $i$-th element specifies the hyperparameter $\chi_{q_{\rm{nl}}}$ of the non-local descriptor defined by the $i$-th kernel function $w_i(\mathbf{r}-\mathbf{r}')$: $\tilde{q_{\rm{nl}}}(\mathbf{r}) = \tanh{(\chi_{q_{\rm{nl}}} q_{\rm{nl}}(\mathbf{r}))}$.
+- **Default**: 1.0
+
+### of_ml_local_test
+
+- **Type**: Boolean
+- **Availability**: OFDFT
+- **Description**: FOR TESTING. Read in the density, then output the enhancement factor $F_\theta$ and the Pauli potential.
+- **Default**: False
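+
+  To generate training data from a converged Kohn-Sham calculation, a hypothetical minimal `INPUT` sketch (untested; the descriptor and kernel keywords above still apply) could be:
+
+  ```
+  calculation      scf
+  esolver_type     ksdft
+  basis_type       pw
+  of_ml_gene_data  1
+  ```
+
+  The data are written at the end of the run; with `of_ml_gene_data 1` the full set of plane waves is used for the density grid (see [of_full_pw](#of_full_pw)).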
+
+[back to top](#full-list-of-input-keywords)
+
 ## Electric field and dipole correction

 These variables are relevant to electric field and dipole correction
diff --git a/docs/advanced/install.md b/docs/advanced/install.md
index 4fa6b906c4..88714b2d7d 100644
--- a/docs/advanced/install.md
+++ b/docs/advanced/install.md
@@ -31,6 +31,21 @@ cmake -B build -DENABLE_DEEPKS=1 -DTorch_DIR=~/libtorch/share/cmake/Torch/ -Dlib
 > CMake will try to download Libnpy if it cannot be found locally.
+
+## Build with ML-KEDF
+
+If the machine learning based kinetic energy density functional (ML-KEDF) is required for OFDFT calculations, the following prerequisites and steps are needed:
+
+- C++ compiler, supporting **C++14** (GCC >= 5 is sufficient)
+- CMake >= 3.18
+- [LibTorch](https://pytorch.org/) with cxx11 ABI supporting CPU or GPU
+- [Libnpy](https://github.com/llohse/libnpy/)
+
+```bash
+cmake -B build -DENABLE_MLKEDF=1 -DTorch_DIR=~/libtorch/share/cmake/Torch/ -Dlibnpy_INCLUDE_DIR=~/libnpy/include
+```
+
+> CMake will try to download Libnpy if it cannot be found locally.
+
 ## Build with DeePMD-kit
 > Note: This part is only required if you want to load a trained DeeP Potential and run molecular dynamics with that. To train the DeeP Potential with DP-GEN, no extra prerequisite is needed and please refer to [this page](http://abacus.deepmodeling.com/en/latest/advanced/interface/dpgen.html) for ABACUS interface with DP-GEN.
@@ -141,7 +156,7 @@ CXX = mpiicpc
 # icpc:   compile intel sequential version
 # make: ELPA_DIR, ELPA_INCLUDE_DIR, CEREAL_DIR must also be set.
 # make pw: nothing needs to be set except LIBXC_DIR
-# 
+#
 # mpicxx: compile gnu parallel version
 # g++:    compile gnu sequential version
 # make: FFTW_DIR, OPENBLAS_LIB_DIR, SCALAPACK_LIB_DIR, ELPA_DIR, ELPA_INCLUDE_DIR, CEREAL_DIR must also be set.
@@ -150,6 +165,10 @@ CXX = mpiicpc
 # GPU = OFF  #We do not support GPU yet
 # OFF: do not use GPU
 # CUDA: use CUDA
+OPENMP = OFF
+# By default, OpenMP acceleration is disabled.
+# Change OPENMP to ON to enable it.
+
 #======================================================================
@@ -167,7 +186,7 @@ CEREAL_DIR = /usr/local/include/cereal
 ##------------------- FOR GNU COMPILER ------------------------------
 ## FFTW_DIR should contain lib/libfftw3.a.
-## OPENBLAS_LIB_DIR should contain libopenblas.a. 
+## OPENBLAS_LIB_DIR should contain libopenblas.a.
 ## SCALAPACK_LIB_DIR should contain libscalapack.a
 ## All three above will only be used when CXX=mpicxx or g++
 ## ELPA_DIR should contain an include folder and lib/libelpa.a
@@ -185,14 +204,18 @@ CEREAL_DIR = /usr/local/include/cereal
 ##------------------- OPTIONAL LIBS ---------------------------------
-## To use DEEPKS: set LIBTORCH_DIR and LIBNPY_DIR
+## To use DEEPKS: set ENABLE_DEEPKS = ON, and set LIBTORCH_DIR and LIBNPY_DIR
+## To use MLKEDF: set ENABLE_MLKEDF = ON, and set LIBTORCH_DIR and LIBNPY_DIR
 ## To use LIBXC: set LIBXC_DIR which contains include and lib/libxc.a (>5.1.7)
-## To use DeePMD: set DeePMD_DIR and TensorFlow_DIR
+## To use DeePMD: set DeePMD_DIR, LIBTORCH_DIR and TensorFlow_DIR
 ## To use LibRI: set LIBRI_DIR and LIBCOMM_DIR
+## To use PEXSI: set PEXSI_DIR, DSUPERLU_DIR and PARMETIS_DIR
 ##---------------------------------------------------------------------
 # LIBTORCH_DIR = /usr/local
 # LIBNPY_DIR = /usr/local
+ENABLE_DEEPKS = OFF
+ENABLE_MLKEDF = OFF
 # LIBXC_DIR = /public/soft/libxc
@@ -202,6 +225,10 @@ CEREAL_DIR = /usr/local/include/cereal
 # LIBRI_DIR = /public/software/LibRI
 # LIBCOMM_DIR = /public/software/LibComm
+# PEXSI_DIR = /public/software/pexsi
+# DSUPERLU_DIR = /public/software/superlu_dist
+# PARMETIS_DIR = /public/software/parmetis
+
 ##---------------------------------------------------------------------
 # NP = 14 # It is not supported. use make -j14 or make -j to parallelly compile
 # DEBUG = OFF
@@ -247,20 +274,6 @@ CEREAL_DIR=/public/soft/cereal
 ABACUS now supports the full version and the pw version. Use `make` or `make abacus` to compile the full version which supports LCAO calculations. Use `make pw` to compile the pw version which only supports pw calculations. For the pw version, `make pw CXX=mpiicpc`, you do not need to provide any libs. For `make pw CXX=mpicxx`, you need to provide `FFTW_DIR` and `OPENBLAS_LIB_DIR`.
-Besides, libxc and deepks are optional libs to compile abacus.
-They will be used when `LIBXC_DIR` is defined like
-
-```makefile
-LIBXC_DIR = /public/soft/libxc
-```
-
-or `LIBTORCH_DIR` and `LIBNPY_DIR` like
-
-```makefile
-LIBTORCH_DIR = /usr/local
-LIBNPY_DIR = /usr/local
-```
-
 After modifying the `Makefile.vars` file, execute `make` or `make -j12` to build the program.

 After the compilation finishes without error messages (except perhaps for some warnings), an executable program `ABACUS.mpi` will be created in directory `bin/`.
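With the new switches, a `Makefile.vars` fragment for an ML-KEDF-enabled build might look like the sketch below (the directories are placeholders for your local LibTorch and Libnpy installations):

```makefile
ENABLE_MLKEDF = ON
LIBTORCH_DIR = /usr/local
LIBNPY_DIR = /usr/local
```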
@@ -279,10 +292,20 @@
 directly.

 ### Add DeePKS Support

-To compile ABACUS with DEEPKS, you need to define `LIBTORCH_DIR` and `LIBNPY_DIR` in the file `Makefile.vars` or use
+To compile ABACUS with DEEPKS, you need to set `ENABLE_DEEPKS = ON`, and define `LIBTORCH_DIR` and `LIBNPY_DIR` in the file `Makefile.vars` or use
+
+```makefile
+make ENABLE_DEEPKS=ON LIBTORCH_DIR=/opt/libtorch/ LIBNPY_DIR=/opt/libnpy/
+```
+
+directly.
+
+### Add ML-KEDF Support
+
+To compile ABACUS with ML-KEDF, you need to set `ENABLE_MLKEDF = ON`, and define `LIBTORCH_DIR` and `LIBNPY_DIR` in the file `Makefile.vars` or use

 ```makefile
-make LIBTORCH_DIR=/opt/libtorch/ LIBNPY_DIR=/opt/libnpy/
+make ENABLE_MLKEDF=ON LIBTORCH_DIR=/opt/libtorch/ LIBNPY_DIR=/opt/libnpy/
 ```

 directly.
diff --git a/source/Makefile b/source/Makefile
index ee71b45363..81a541ec25 100644
--- a/source/Makefile
+++ b/source/Makefile
@@ -137,7 +137,12 @@ endif
 ifdef LIBNPY_DIR
     CNPY_INCLUDE_DIR = -I${LIBNPY_DIR}/include
-    HONG += -D__DEEPKS
+    ifeq ($(ENABLE_DEEPKS), ON)
+        HONG += -D__DEEPKS
+    endif
+    ifeq ($(ENABLE_MLKEDF), ON)
+        HONG += -D__MLKEDF
+    endif
     INCLUDES += $(CNPY_INCLUDE_DIR)
 endif
diff --git a/source/Makefile.vars b/source/Makefile.vars
index c542d3811b..db15ed30d3 100644
--- a/source/Makefile.vars
+++ b/source/Makefile.vars
@@ -55,7 +55,8 @@ CEREAL_DIR = /usr/local/include/cereal
 ##------------------- OPTIONAL LIBS ---------------------------------
-## To use DEEPKS: set LIBTORCH_DIR and LIBNPY_DIR
+## To use DEEPKS: set ENABLE_DEEPKS = ON, and define LIBTORCH_DIR and LIBNPY_DIR
+## To use MLKEDF: set ENABLE_MLKEDF = ON, and define LIBTORCH_DIR and LIBNPY_DIR
 ## To use LIBXC: set LIBXC_DIR which contains include and lib/libxc.a (>5.1.7)
 ## To use DeePMD: set DeePMD_DIR, LIBTORCH_DIR and TensorFlow_DIR
 ## To use LibRI: set LIBRI_DIR and LIBCOMM_DIR
@@ -64,6 +65,8 @@ CEREAL_DIR = /usr/local/include/cereal
 # LIBTORCH_DIR = /usr/local
 # LIBNPY_DIR = /usr/local
+ENABLE_DEEPKS ?= OFF
+ENABLE_MLKEDF ?= OFF
 # LIBXC_DIR = /public/soft/libxc
diff --git a/source/driver_run.cpp b/source/driver_run.cpp
index 8084106fe6..1019887df5 100644
--- a/source/driver_run.cpp
+++ b/source/driver_run.cpp
@@ -7,7 +7,6 @@
 #include "module_io/print_info.h"
 #include "module_io/winput.h"
 #include "module_md/run_md.h"
-#include "module_parameter/parameter.h"

 /**
  * @brief This is the driver function which defines the workflow of ABACUS
diff --git a/source/module_elecstate/elecstate.h b/source/module_elecstate/elecstate.h
index 99e77f0353..401988d455 100644
--- a/source/module_elecstate/elecstate.h
+++ b/source/module_elecstate/elecstate.h
@@ -155,6 +155,7 @@ class ElecState
     }

     double get_dftu_energy();
+    double get_local_pp_energy();

 #ifdef __DEEPKS
     double get_deepks_E_delta();
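The `E_LocalPP` quantity introduced below is the electron-ion interaction energy from the local pseudopotential, evaluated on the real-space grid. In the notation of the code that follows, it is essentially

$$E_{\rm{LocalPP}} = \sum_s \int V_{\rm{loc}}(\mathbf{r})\,\rho_s(\mathbf{r})\,d\mathbf{r} \approx \sum_s \sum_i V_{\rm{loc}}(\mathbf{r}_i)\,\rho_s(\mathbf{r}_i)\,\frac{\Omega}{N_{xyz}},$$

which `get_local_pp_energy()` computes as a BLAS dot product over the `nrxx` local grid points, followed by an MPI reduction.

diff --git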
a/source/module_elecstate/elecstate_energy.cpp b/source/module_elecstate/elecstate_energy.cpp index 21537dab5d..bbf88217a9 100644 --- a/source/module_elecstate/elecstate_energy.cpp +++ b/source/module_elecstate/elecstate_energy.cpp @@ -311,6 +311,8 @@ void ElecState::cal_energies(const int type) this->f_en.edftu = get_dftu_energy(); } + this->f_en.e_local_pp = get_local_pp_energy(); + #ifdef __DEEPKS // energy from deepks if (PARAM.inp.deepks_scf) diff --git a/source/module_elecstate/elecstate_energy_terms.cpp b/source/module_elecstate/elecstate_energy_terms.cpp index 535e9c83a3..a0551a7bdd 100644 --- a/source/module_elecstate/elecstate_energy_terms.cpp +++ b/source/module_elecstate/elecstate_energy_terms.cpp @@ -39,6 +39,18 @@ double ElecState::get_dftu_energy() return GlobalC::dftu.get_energy(); } +double ElecState::get_local_pp_energy() +{ + double local_pseudopot_energy = 0.; // electron-ion interaction energy from local pseudopotential + for (int is = 0; is < PARAM.inp.nspin; ++is) + { + local_pseudopot_energy += BlasConnector::dot(this->charge->rhopw->nrxx, this->pot->get_fixed_v(), 1, this->charge->rho[is], 1) + * this->charge->rhopw->omega / this->charge->rhopw->nxyz; + } + Parallel_Reduce::reduce_all(local_pseudopot_energy); + return local_pseudopot_energy; +} + #ifdef __DEEPKS double ElecState::get_deepks_E_delta() { diff --git a/source/module_elecstate/elecstate_print.cpp b/source/module_elecstate/elecstate_print.cpp index fdca966ea6..0951379811 100644 --- a/source/module_elecstate/elecstate_print.cpp +++ b/source/module_elecstate/elecstate_print.cpp @@ -341,6 +341,8 @@ void ElecState::print_etot(const Magnetism& magnet, energies_Ry.push_back(this->f_en.demet); titles.push_back("E_descf"); energies_Ry.push_back(this->f_en.descf); + titles.push_back("E_LocalPP"); + energies_Ry.push_back(this->f_en.e_local_pp); std::string vdw_method = PARAM.inp.vdw_method; if (vdw_method == "d2") // Peize Lin add 2014-04, update 2021-03-09 { diff --git a/source/module_elecstate/fp_energy.h b/source/module_elecstate/fp_energy.h index 9c922a3276..adbb3fe3bd 100644 --- a/source/module_elecstate/fp_energy.h +++ b/source/module_elecstate/fp_energy.h @@ -45,7 +45,7 @@ struct fenergy double escon = 0.0; ///< spin constraint energy double ekinetic = 0.0; /// kinetic energy, used in OFDFT - double eion_elec = 0.0; /// ion-electron interaction energy, used in OFDFT + double e_local_pp = 0.0; /// ion-electron interaction energy contributed by local pp, used in OFDFT double calculate_etot(); double calculate_harris(); diff --git a/source/module_elecstate/test/elecstate_energy_test.cpp b/source/module_elecstate/test/elecstate_energy_test.cpp index 3d145ea187..b8078c1962 100644 --- a/source/module_elecstate/test/elecstate_energy_test.cpp +++ b/source/module_elecstate/test/elecstate_energy_test.cpp @@ -47,6 +47,10 @@ double ElecState::get_dftu_energy() return 0.6; } #endif +double ElecState::get_local_pp_energy() +{ + return 0.7; +} } // namespace elecstate #include "module_cell/klist.h" diff --git a/source/module_esolver/esolver_fp.cpp b/source/module_esolver/esolver_fp.cpp index 8dbf5ee254..80ad1ad349 100644 --- a/source/module_esolver/esolver_fp.cpp +++ b/source/module_esolver/esolver_fp.cpp @@ -67,7 +67,7 @@ void ESolver_FP::before_all_runners(UnitCell& ucell, const Input_para& inp) #ifdef __MPI this->pw_rho->initmpi(GlobalV::NPROC_IN_POOL, GlobalV::RANK_IN_POOL, POOL_WORLD); #endif - if (this->classname == "ESolver_OF") + if (this->classname == "ESolver_OF" || PARAM.inp.of_ml_gene_data == 1) { 
this->pw_rho->setfullpw(inp.of_full_pw, inp.of_full_pw_dim);
    }
diff --git a/source/module_esolver/esolver_ks.cpp b/source/module_esolver/esolver_ks.cpp
index 3f97814ca2..8de7f56a05 100644
--- a/source/module_esolver/esolver_ks.cpp
+++ b/source/module_esolver/esolver_ks.cpp
@@ -670,6 +670,7 @@ void ESolver_KS<T, Device>::iter_finish(UnitCell& ucell, const int istep, int& i
     {
         dkin = p_chgmix->get_dkin(pelec->charge, PARAM.inp.nelec);
     }
+
     this->pelec->print_etot(ucell.magnet, this->conv_esolver, iter, drho, dkin, duration, PARAM.inp.printe, diag_ethr);

     // Json, need to be moved to somewhere else
diff --git a/source/module_esolver/esolver_ks_pw.cpp b/source/module_esolver/esolver_ks_pw.cpp
index 6dd0ac3513..f3d8a13f99 100644
--- a/source/module_esolver/esolver_ks_pw.cpp
+++ b/source/module_esolver/esolver_ks_pw.cpp
@@ -46,6 +46,9 @@
 #ifdef USE_PAW
 #include "module_cell/module_paw/paw_cell.h"
 #endif
+#ifdef __MLKEDF
+#include "module_hamilt_pw/hamilt_ofdft/ml_data.h"
+#endif
 #include
 #include
@@ -956,6 +959,20 @@ void ESolver_KS_PW<T, Device>::after_all_runners(UnitCell& ucell)
                               PARAM.inp.cond_nonlocal,
                               this->pelec->wg);
     }
+
+#ifdef __MLKEDF
+    // generate training data for ML-KEDF
+    if (PARAM.inp.of_ml_gene_data == 1)
+    {
+        this->pelec->pot->update_from_charge(this->pelec->charge, &ucell);
+
+        ML_data ml_data;
+        ml_data.set_para(this->pelec->charge->nrxx, PARAM.inp.nelec, PARAM.inp.of_tf_weight, PARAM.inp.of_vw_weight,
+            PARAM.inp.of_ml_chi_p, PARAM.inp.of_ml_chi_q, PARAM.inp.of_ml_chi_xi, PARAM.inp.of_ml_chi_pnl, PARAM.inp.of_ml_chi_qnl,
+            PARAM.inp.of_ml_nkernel, PARAM.inp.of_ml_kernel, PARAM.inp.of_ml_kernel_scaling, PARAM.inp.of_ml_yukawa_alpha, PARAM.inp.of_ml_kernel_file, ucell.omega, this->pw_rho);
+        ml_data.generateTrainData_KS(this->kspw_psi, this->pelec, this->pw_wfc, this->pw_rho, ucell, this->pelec->pot->get_effective_v(0));
+    }
+#endif
 }

template class ESolver_KS_PW<std::complex<float>, base_device::DEVICE_CPU>;
diff --git a/source/module_esolver/esolver_of.cpp b/source/module_esolver/esolver_of.cpp
index eb86e8f6ef..fcf326543e 100644
--- a/source/module_esolver/esolver_of.cpp
+++ b/source/module_esolver/esolver_of.cpp
@@ -159,6 +159,11 @@ void ESolver_OF::runner(UnitCell& ucell, const int istep)
     this->before_opt(istep, ucell);
     this->iter_ = 0;

+#ifdef __MLKEDF
+    // for ML KEDF test
+    if (PARAM.inp.of_ml_local_test) this->ml_->localTest(pelec->charge->rho, this->pw_rho);
+#endif
+
     while (true)
     {
         // once we get a new rho and phi, update potential
@@ -494,6 +499,44 @@ void ESolver_OF::after_opt(const int istep, UnitCell& ucell)
         this->kinetic_energy_density(this->pelec->charge->rho, this->pphi_, this->pelec->charge->kin_r);
     }

+    for (int ir = 0; ir < this->pw_rho->nrxx; ++ir)
+    {
+        this->pelec->charge->rho_save[0][ir] = this->pelec->charge->rho[0][ir];
+    }
+
+#ifdef __MLKEDF
+    // Check the positivity of Pauli energy
+    if (this->of_kinetic_ == "ml")
+    {
+        this->tf_->get_energy(this->pelec->charge->rho);
+        std::cout << "ML Term = " << this->ml_->ml_energy << " Ry, TF Term = " << this->tf_->tf_energy << " Ry."
<< std::endl;
+        if (this->ml_->ml_energy >= this->tf_->tf_energy)
+        {
+            std::cout << "WARNING: ML >= TF" << std::endl;
+        }
+    }
+
+    // Generate data if needed
+    if (PARAM.inp.of_ml_gene_data)
+    {
+        this->pelec->pot->update_from_charge(pelec->charge, &ucell); // Hartree + XC + external
+        this->kinetic_potential(pelec->charge->rho, this->pphi_, this->pelec->pot->get_effective_v()); // (kinetic + Hartree + XC + external) * 2 * phi
+
+        const double* vr_eff = this->pelec->pot->get_effective_v(0);
+        for (int ir = 0; ir < this->pw_rho->nrxx; ++ir)
+        {
+            this->pdEdphi_[0][ir] = vr_eff[ir];
+        }
+        this->pelec->eferm.get_ef(0) = this->cal_mu(this->pphi_[0], this->pdEdphi_[0], this->nelec_[0]);
+
+        // === temporary ===
+        // assert(GlobalV::of_kinetic == "wt" || GlobalV::of_kinetic == "ml");
+        // =================
+        std::cout << "Generating Training data..." << std::endl;
+        std::cout << "mu = " << this->pelec->eferm.get_efval(0) << std::endl;
+        this->ml_->generateTrainData(pelec->charge->rho, *(this->wt_), *(this->tf_), this->pw_rho, vr_eff);
+    }
+#endif

     // 2) call after_scf() of ESolver_FP
     ESolver_FP::after_scf(ucell, istep);
@@ -532,7 +575,7 @@ double ESolver_OF::cal_energy()
     }
     Parallel_Reduce::reduce_all(pseudopot_energy);
     this->pelec->f_en.ekinetic = kinetic_energy;
-    this->pelec->f_en.eion_elec = pseudopot_energy;
+    this->pelec->f_en.e_local_pp = pseudopot_energy;
     this->pelec->f_en.etot += kinetic_energy + pseudopot_energy;
     return this->pelec->f_en.etot;
 }
diff --git a/source/module_esolver/esolver_of.h b/source/module_esolver/esolver_of.h
index 081a077606..8c69a477c5 100644
--- a/source/module_esolver/esolver_of.h
+++ b/source/module_esolver/esolver_of.h
@@ -9,6 +9,7 @@
 #include "module_hamilt_pw/hamilt_ofdft/kedf_tf.h"
 #include "module_hamilt_pw/hamilt_ofdft/kedf_vw.h"
 #include "module_hamilt_pw/hamilt_ofdft/kedf_wt.h"
+#include "module_hamilt_pw/hamilt_ofdft/kedf_ml.h"
 #include "module_psi/psi.h"

 namespace ModuleESolver
@@ -38,6 +39,9 @@ class ESolver_OF : public ESolver_FP
     KEDF_vW* vw_ = nullptr;
     KEDF_WT* wt_ = nullptr;
     KEDF_LKT* lkt_ = nullptr;
+#ifdef __MLKEDF
+    KEDF_ML* ml_ = nullptr;
+#endif

     // ----------------- the optimization methods ------------------
     ModuleBase::Opt_CG* opt_cg_ = nullptr;
diff --git a/source/module_esolver/esolver_of_interface.cpp b/source/module_esolver/esolver_of_interface.cpp
index a92eb0290a..057b764118 100644
--- a/source/module_esolver/esolver_of_interface.cpp
+++ b/source/module_esolver/esolver_of_interface.cpp
@@ -13,7 +13,10 @@ namespace ModuleESolver
 void ESolver_OF::init_kedf(const Input_para& inp)
 {
     //! Thomas-Fermi (TF) KEDF, TF+ KEDF, and Wang-Teter (WT) KEDF
-    if (this->of_kinetic_ == "tf" || this->of_kinetic_ == "tf+" || this->of_kinetic_ == "wt")
+    if (this->of_kinetic_ == "tf"
+        || this->of_kinetic_ == "tf+"
+        || this->of_kinetic_ == "wt"
+        || this->of_kinetic_ == "ml")
     {
         if (this->tf_ == nullptr)
         {
@@ -24,7 +27,7 @@ void ESolver_OF::init_kedf(const Input_para& inp)
     //!
vW, TF+, WT, and LKT KEDFs if (this->of_kinetic_ == "vw" || this->of_kinetic_ == "tf+" || this->of_kinetic_ == "wt" - || this->of_kinetic_ == "lkt") + || this->of_kinetic_ == "lkt" || this->of_kinetic_ == "ml") { if (this->vw_ == nullptr) { @@ -62,6 +65,19 @@ void ESolver_OF::init_kedf(const Input_para& inp) } this->lkt_->set_para(this->dV_, inp.of_lkt_a); } +#ifdef __MLKEDF + if (this->of_kinetic_ == "ml") + { + if (this->ml_ == nullptr) + this->ml_ = new KEDF_ML(); + this->ml_->set_para(this->pw_rho->nrxx, this->dV_, this->nelec_[0], inp.of_tf_weight, inp.of_vw_weight, + inp.of_ml_chi_p, inp.of_ml_chi_q, inp.of_ml_chi_xi, inp.of_ml_chi_pnl, inp.of_ml_chi_qnl, + inp.of_ml_nkernel, inp.of_ml_kernel, inp.of_ml_kernel_scaling, + inp.of_ml_yukawa_alpha, inp.of_ml_kernel_file, inp.of_ml_gamma, inp.of_ml_p, inp.of_ml_q, inp.of_ml_tanhp, inp.of_ml_tanhq, + inp.of_ml_gammanl, inp.of_ml_pnl, inp.of_ml_qnl, inp.of_ml_xi, inp.of_ml_tanhxi, + inp.of_ml_tanhxi_nl, inp.of_ml_tanh_pnl, inp.of_ml_tanh_qnl, inp.of_ml_tanhp_nl, inp.of_ml_tanhq_nl, inp.of_ml_device, this->pw_rho); + } +#endif } /** @@ -86,6 +102,13 @@ void ESolver_OF::kinetic_potential(double** prho, double** pphi, ModuleBase::mat { this->lkt_->lkt_potential(prho, this->pw_rho, rpot); } +#ifdef __MLKEDF + if (this->of_kinetic_ == "ml") + { + this->ml_->ml_potential(prho, this->pw_rho, rpot); + this->tf_->get_energy(prho); // temp + } +#endif // Before call vw_potential, change rpot to rpot * 2 * pphi for (int is = 0; is < PARAM.inp.nspin; ++is) @@ -97,7 +120,7 @@ void ESolver_OF::kinetic_potential(double** prho, double** pphi, ModuleBase::mat } if (this->of_kinetic_ == "vw" || this->of_kinetic_ == "tf+" || this->of_kinetic_ == "wt" - || this->of_kinetic_ == "lkt") + || this->of_kinetic_ == "lkt" || this->of_kinetic_ == "ml") { this->vw_->vw_potential(pphi, this->pw_rho, rpot); } @@ -119,7 +142,7 @@ double ESolver_OF::kinetic_energy() } if (this->of_kinetic_ == "vw" || this->of_kinetic_ == "tf+" || this->of_kinetic_ == "wt" - || this->of_kinetic_ == "lkt") + || this->of_kinetic_ == "lkt" || this->of_kinetic_ == "ml") { kinetic_energy += this->vw_->vw_energy; } @@ -133,6 +156,17 @@ double ESolver_OF::kinetic_energy() { kinetic_energy += this->lkt_->lkt_energy; } +#ifdef __MLKEDF + if (this->of_kinetic_ == "ml") + { + kinetic_energy += this->ml_->ml_energy; + if (this->ml_->ml_energy >= this->tf_->tf_energy) + { + std::cout << "WARNING: ML >= TF" << std::endl; + std::cout << "ML Term = " << this->ml_->ml_energy << " Ry, TF Term = " << this->tf_->tf_energy << " Ry." << std::endl; + } + } +#endif return kinetic_energy; } @@ -211,6 +245,10 @@ void ESolver_OF::kinetic_stress(ModuleBase::matrix& kinetic_stress_) this->lkt_->get_stress(pelec->charge->rho, this->pw_rho); kinetic_stress_ += this->lkt_->stress; } + if (this->of_kinetic_ == "ml") + { + std::cout << "Sorry, the stress of MPN KEDF is not yet supported." 
<< std::endl;
+    }
 }

 /**
diff --git a/source/module_esolver/esolver_of_tool.cpp b/source/module_esolver/esolver_of_tool.cpp
index e430347215..5ac09580e4 100644
--- a/source/module_esolver/esolver_of_tool.cpp
+++ b/source/module_esolver/esolver_of_tool.cpp
@@ -431,8 +431,8 @@ void ESolver_OF::print_info()
     std::vector<std::string> titles;
     std::vector<double> energies_Ry;
     std::vector<double> energies_eV;
-    if (PARAM.inp.printe > 0
-        && ((this->iter_ + 1) % PARAM.inp.printe == 0 || this->conv_esolver || this->iter_ == PARAM.inp.scf_nmax))
+    if ((PARAM.inp.printe > 0
+        && ((this->iter_ + 1) % PARAM.inp.printe == 0 || this->conv_esolver || this->iter_ == PARAM.inp.scf_nmax)) || PARAM.inp.init_chg == "file")
     {
         titles.push_back("E_Total");
         energies_Ry.push_back(this->pelec->f_en.etot);
@@ -442,8 +442,8 @@
         energies_Ry.push_back(this->pelec->f_en.hartree_energy);
         titles.push_back("E_xc");
         energies_Ry.push_back(this->pelec->f_en.etxc - this->pelec->f_en.etxcc);
-        titles.push_back("E_IonElec");
-        energies_Ry.push_back(this->pelec->f_en.eion_elec);
+        titles.push_back("E_LocalPP");
+        energies_Ry.push_back(this->pelec->f_en.e_local_pp);
         titles.push_back("E_Ewald");
         energies_Ry.push_back(this->pelec->f_en.ewald_energy);
         if (this->of_kinetic_ == "tf" || this->of_kinetic_ == "tf+" || this->of_kinetic_ == "wt")
@@ -452,7 +452,7 @@
             energies_Ry.push_back(this->tf_->tf_energy);
         }
         if (this->of_kinetic_ == "vw" || this->of_kinetic_ == "tf+" || this->of_kinetic_ == "wt"
-            || this->of_kinetic_ == "lkt")
+            || this->of_kinetic_ == "lkt" || this->of_kinetic_ == "ml")
         {
             titles.push_back("vW KEDF");
             energies_Ry.push_back(this->vw_->vw_energy);
         }
@@ -467,6 +467,13 @@
             titles.push_back("LKT KEDF");
             energies_Ry.push_back(this->lkt_->lkt_energy);
         }
+#ifdef __MLKEDF
+        if (this->of_kinetic_ == "ml")
+        {
+            titles.push_back("MPN KEDF");
+            energies_Ry.push_back(this->ml_->ml_energy);
+        }
+#endif
         std::string vdw_method = PARAM.inp.vdw_method;
         if (vdw_method == "d2") // Peize Lin add 2014-04, update 2021-03-09
         {
diff --git a/source/module_hamilt_pw/hamilt_ofdft/CMakeLists.txt b/source/module_hamilt_pw/hamilt_ofdft/CMakeLists.txt
index 260dbb9ba8..e095ffa12b 100644
--- a/source/module_hamilt_pw/hamilt_ofdft/CMakeLists.txt
+++ b/source/module_hamilt_pw/hamilt_ofdft/CMakeLists.txt
@@ -15,3 +15,24 @@ add_library(
 if(ENABLE_COVERAGE)
   add_coverage(hamilt_ofdft)
 endif()
+
+if(ENABLE_MLKEDF)
+  list(APPEND hamilt_mlkedf_srcs
+      kedf_ml.cpp
+      kedf_ml_pot.cpp
+      kedf_ml_label.cpp
+      ml_data.cpp
+      ml_data_descriptor.cpp
+      ml_tools/nn_of.cpp
+  )
+
+  add_library(
+      hamilt_mlkedf
+      OBJECT
+      ${hamilt_mlkedf_srcs}
+  )
+
+  if(ENABLE_COVERAGE)
+    add_coverage(hamilt_mlkedf)
+  endif()
+endif()
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_ofdft/kedf_lkt.cpp b/source/module_hamilt_pw/hamilt_ofdft/kedf_lkt.cpp
index 06f253c55c..733c10bc4c 100644
--- a/source/module_hamilt_pw/hamilt_ofdft/kedf_lkt.cpp
+++ b/source/module_hamilt_pw/hamilt_ofdft/kedf_lkt.cpp
@@ -35,7 +35,7 @@ double KEDF_LKT::get_energy(const double* const* prho, ModulePW::PW_Basis* pw_rh
     this->get_as(prho[0], nabla_rho, pw_rho->nrxx, as);
     for (int ir = 0; ir < pw_rho->nrxx; ++ir)
     {
-        energy += pow(prho[0][ir], 5. / 3.) / std::cosh(as[ir]);
+        energy += std::pow(prho[0][ir], 5. / 3.)
/ std::cosh(as[ir]);
     }
     energy *= this->dV_ * this->c_tf_;
 }
@@ -77,7 +77,7 @@ double KEDF_LKT::get_energy_density(const double* const* prho, int is, int ir, M
     this->nabla(prho[is], pw_rho, nabla_rho);
     this->get_as(prho[is], nabla_rho, pw_rho->nrxx, as);

-    energy_den = this->c_tf_ * pow(prho[is][ir], 5. / 3.) / std::cosh(as[ir]);
+    energy_den = this->c_tf_ * std::pow(prho[is][ir], 5. / 3.) / std::cosh(as[ir]);
     delete[] as;
     for (int i = 0; i < 3; ++i)
     {
diff --git a/source/module_hamilt_pw/hamilt_ofdft/kedf_ml.cpp b/source/module_hamilt_pw/hamilt_ofdft/kedf_ml.cpp
new file mode 100644
index 0000000000..976a2bf920
--- /dev/null
+++ b/source/module_hamilt_pw/hamilt_ofdft/kedf_ml.cpp
@@ -0,0 +1,501 @@
+#ifdef __MLKEDF
+
+#include "kedf_ml.h"
+
+#include "module_base/parallel_reduce.h"
+#include "module_base/global_function.h"
+#include "module_hamilt_pw/hamilt_pwdft/global.h"
+
+void KEDF_ML::set_para(
+    const int nx,
+    const double dV,
+    const double nelec,
+    const double tf_weight,
+    const double vw_weight,
+    const double chi_p,
+    const double chi_q,
+    const std::vector<double> &chi_xi,
+    const std::vector<double> &chi_pnl,
+    const std::vector<double> &chi_qnl,
+    const int &nkernel,
+    const std::vector<int> &kernel_type,
+    const std::vector<double> &kernel_scaling,
+    const std::vector<double> &yukawa_alpha,
+    const std::vector<std::string> &kernel_file,
+    const bool &of_ml_gamma,
+    const bool &of_ml_p,
+    const bool &of_ml_q,
+    const bool &of_ml_tanhp,
+    const bool &of_ml_tanhq,
+    const std::vector<int> &of_ml_gammanl,
+    const std::vector<int> &of_ml_pnl,
+    const std::vector<int> &of_ml_qnl,
+    const std::vector<int> &of_ml_xi,
+    const std::vector<int> &of_ml_tanhxi,
+    const std::vector<int> &of_ml_tanhxi_nl,
+    const std::vector<int> &of_ml_tanh_pnl,
+    const std::vector<int> &of_ml_tanh_qnl,
+    const std::vector<int> &of_ml_tanhp_nl,
+    const std::vector<int> &of_ml_tanhq_nl,
+    const std::string device_inpt,
+    ModulePW::PW_Basis *pw_rho
+)
+{
+    torch::set_default_dtype(caffe2::TypeMeta::fromScalarType(torch::kDouble));
+    auto output = torch::get_default_dtype();
+    std::cout << "Default type: " << output << std::endl;
+
+    this->set_device(device_inpt);
+
+    this->nx = nx;
+    this->nx_tot = nx;
+    this->dV = dV;
+    this->nkernel = nkernel;
+
+    this->init_data(
+        nkernel,
+        of_ml_gamma,
+        of_ml_p,
+        of_ml_q,
+        of_ml_tanhp,
+        of_ml_tanhq,
+        of_ml_gammanl,
+        of_ml_pnl,
+        of_ml_qnl,
+        of_ml_xi,
+        of_ml_tanhxi,
+        of_ml_tanhxi_nl,
+        of_ml_tanh_pnl,
+        of_ml_tanh_qnl,
+        of_ml_tanhp_nl,
+        of_ml_tanhq_nl);
+
+    std::cout << "ninput = " << ninput << std::endl;
+
+    if (PARAM.inp.of_kinetic == "ml")
+    {
+        int nnode = 100;
+        int nlayer = 3;
+        this->nn = std::make_shared<NN_OFImpl>(this->nx, 0, this->ninput, nnode, nlayer, this->device);
+        torch::load(this->nn, "net.pt", this->device_type);
+        std::cout << "load net done" << std::endl;
+        if (PARAM.inp.of_ml_feg != 0)
+        {
+            torch::Tensor feg_inpt = torch::zeros(this->ninput, this->device_type);
+            for (int i = 0; i < this->ninput; ++i)
+            {
+                if (this->descriptor_type[i] == "gamma") feg_inpt[i] = 1.;
+            }
+
+            if (PARAM.inp.of_ml_feg == 1)
+                this->feg_net_F = torch::softplus(this->nn->forward(feg_inpt)).to(this->device_CPU).contiguous().data_ptr<double>()[0];
+            else
+            {
+                this->feg_net_F = this->nn->forward(feg_inpt).to(this->device_CPU).contiguous().data_ptr<double>()[0];
+            }
+
+            std::cout << "feg_net_F = " << this->feg_net_F << std::endl;
+        }
+    }
+
+    if (PARAM.inp.of_kinetic == "ml" || PARAM.inp.of_ml_gene_data == 1)
+    {
+        this->ml_data = new ML_data;
+
+        this->chi_p = chi_p;
+        this->chi_q = chi_q;
+        this->chi_xi = chi_xi;
+        this->chi_pnl = chi_pnl;
+        this->chi_qnl = chi_qnl;
this->ml_data->set_para(nx, nelec, tf_weight, vw_weight, chi_p, chi_q,
+            chi_xi, chi_pnl, chi_qnl, nkernel, kernel_type, kernel_scaling, yukawa_alpha, kernel_file, this->dV * pw_rho->nxyz, pw_rho);
+    }
+}
+
+/**
+ * @brief Get the energy of ML KEDF
+ * \f[ E_{ML} = c_{TF} * \int{F(\rho) \rho^{5/3} dr} \f]
+ *
+ * @param prho charge density
+ * @param pw_rho PW_Basis
+ * @return the energy of ML KEDF
+ */
+double KEDF_ML::get_energy(const double * const * prho, ModulePW::PW_Basis *pw_rho)
+{
+    this->updateInput(prho, pw_rho);
+
+    this->NN_forward(prho, pw_rho, false);
+
+    torch::Tensor enhancement_cpu_tensor = this->nn->F.to(this->device_CPU).contiguous();
+    this->enhancement_cpu_ptr = enhancement_cpu_tensor.data_ptr<double>();
+
+    double energy = 0.;
+    for (int ir = 0; ir < this->nx; ++ir)
+    {
+        energy += enhancement_cpu_ptr[ir] * std::pow(prho[0][ir], 5./3.);
+    }
+    std::cout << "energy = " << energy << std::endl;
+    energy *= this->dV * this->cTF;
+    this->ml_energy = energy;
+    Parallel_Reduce::reduce_all(this->ml_energy);
+    return this->ml_energy;
+}
+
+/**
+ * @brief Get the potential of ML KEDF, and add it into rpotential
+ *
+ * @param prho charge density
+ * @param pw_rho PW_Basis
+ * @param rpotential rpotential => rpotential + V_{ML}
+ */
+void KEDF_ML::ml_potential(const double * const * prho, ModulePW::PW_Basis *pw_rho, ModuleBase::matrix &rpotential)
+{
+    this->updateInput(prho, pw_rho);
+
+    this->NN_forward(prho, pw_rho, true);
+
+    torch::Tensor enhancement_cpu_tensor = this->nn->F.to(this->device_CPU).contiguous();
+    this->enhancement_cpu_ptr = enhancement_cpu_tensor.data_ptr<double>();
+    torch::Tensor gradient_cpu_tensor = this->nn->inputs.grad().to(this->device_CPU).contiguous();
+    this->gradient_cpu_ptr = gradient_cpu_tensor.data_ptr<double>();
+
+    this->get_potential_(prho, pw_rho, rpotential);
+
+    // get energy
+    ModuleBase::timer::tick("KEDF_ML", "Pauli Energy");
+    double energy = 0.;
+    for (int ir = 0; ir < this->nx; ++ir)
+    {
+        energy += enhancement_cpu_ptr[ir] * std::pow(prho[0][ir], 5./3.);
+    }
+    energy *= this->dV * this->cTF;
+    this->ml_energy = energy;
+    Parallel_Reduce::reduce_all(this->ml_energy);
+    ModuleBase::timer::tick("KEDF_ML", "Pauli Energy");
+}
+
+/**
+ * @brief Generate training data for ML KEDF
+ *
+ * @param prho charge density
+ * @param wt KEDF_WT
+ * @param tf KEDF_TF
+ * @param pw_rho PW_Basis
+ * @param veff effective potential
+ */
+void KEDF_ML::generateTrainData(const double * const *prho, KEDF_WT &wt, KEDF_TF &tf, ModulePW::PW_Basis *pw_rho, const double *veff)
+{
+    this->ml_data->generateTrainData_WT(prho, wt, tf, pw_rho, veff);
+    if (PARAM.inp.of_kinetic == "ml")
+    {
+        this->updateInput(prho, pw_rho);
+
+        this->NN_forward(prho, pw_rho, true);
+
+        torch::Tensor enhancement_cpu_tensor = this->nn->F.to(this->device_CPU).contiguous();
+        this->enhancement_cpu_ptr = enhancement_cpu_tensor.data_ptr<double>();
+        torch::Tensor gradient_cpu_tensor = this->nn->inputs.grad().to(this->device_CPU).contiguous();
+        this->gradient_cpu_ptr = gradient_cpu_tensor.data_ptr<double>();
+
+        torch::Tensor enhancement = this->nn->F.reshape({this->nx});
+        ModuleBase::matrix potential(1, this->nx);
+
+        this->get_potential_(prho, pw_rho, potential);
+
+        std::cout << "Dumping the enhancement factor and potential..." << std::endl;
+        this->dumpTensor(enhancement, "enhancement.npy");
+        this->dumpMatrix(potential, "potential.npy");
+    }
+}
+
+/**
+ * @brief For test
+ *
+ * @param prho charge density
+ * @param pw_rho PW_Basis
+ */
+void KEDF_ML::localTest(const double * const *pprho, ModulePW::PW_Basis *pw_rho)
+{
+    // for test =====================
+    std::vector<long unsigned int>
cshape = {(long unsigned) this->nx};
+    bool fortran_order = false;
+
+    std::vector<double> temp_prho(this->nx);
+    this->ml_data->loadVector("dir_of_input_rho", temp_prho);
+    double ** prho = new double *[1];
+    prho[0] = new double[this->nx];
+    for (int ir = 0; ir < this->nx; ++ir) prho[0][ir] = temp_prho[ir];
+    for (int ir = 0; ir < this->nx; ++ir)
+    {
+        if (prho[0][ir] == 0.)
+        {
+            std::cout << "WARNING: rho = 0" << std::endl;
+        }
+    }
+    std::cout << "Load rho done" << std::endl;
+    // ==============================
+
+    this->updateInput(prho, pw_rho);
+    std::cout << "update done" << std::endl;
+
+    this->NN_forward(prho, pw_rho, true);
+
+    torch::Tensor enhancement_cpu_tensor = this->nn->F.to(this->device_CPU).contiguous();
+    this->enhancement_cpu_ptr = enhancement_cpu_tensor.data_ptr<double>();
+    torch::Tensor gradient_cpu_tensor = this->nn->inputs.grad().to(this->device_CPU).contiguous();
+    this->gradient_cpu_ptr = gradient_cpu_tensor.data_ptr<double>();
+
+    std::cout << "enhancement done" << std::endl;
+
+    torch::Tensor enhancement = this->nn->F.reshape({this->nx});
+    ModuleBase::matrix potential(1, this->nx);
+
+    this->get_potential_(prho, pw_rho, potential);
+    std::cout << "potential done" << std::endl;
+
+    this->dumpTensor(enhancement, "enhancement-abacus.npy");
+    this->dumpMatrix(potential, "potential-abacus.npy");
+    exit(0);
+}
+
+/**
+ * @brief Set the device for ML KEDF
+ *
+ * @param device_inpt "cpu" or "gpu"
+ */
+void KEDF_ML::set_device(std::string device_inpt)
+{
+    if (device_inpt == "cpu")
+    {
+        std::cout << "---------- Running NN on CPU ----------" << std::endl;
+        this->device_type = torch::kCPU;
+    }
+    else if (device_inpt == "gpu")
+    {
+        if (torch::cuda::cudnn_is_available())
+        {
+            std::cout << "---------- Running NN on GPU ----------" << std::endl;
+            this->device_type = torch::kCUDA;
+        }
+        else
+        {
+            std::cout << "------ Warning: GPU is unavailable ------" << std::endl;
+            std::cout << "---------- Running NN on CPU ----------" << std::endl;
+            this->device_type = torch::kCPU;
+        }
+    }
+    this->device = torch::Device(this->device_type);
+}
+
+/**
+ * @brief Interface to Neural Network forward
+ *
+ * @param prho charge density
+ * @param pw_rho PW_Basis
+ * @param cal_grad whether to calculate the gradient
+ */
+void KEDF_ML::NN_forward(const double * const * prho, ModulePW::PW_Basis *pw_rho, bool cal_grad)
+{
+    ModuleBase::timer::tick("KEDF_ML", "Forward");
+
+    this->nn->zero_grad();
+    this->nn->inputs.requires_grad_(false);
+    this->nn->set_data(this, this->descriptor_type, this->kernel_index, this->nn->inputs);
+    this->nn->inputs.requires_grad_(true);
+
+    this->nn->F = this->nn->forward(this->nn->inputs);
+    if (this->nn->inputs.grad().numel())
+    {
+        this->nn->inputs.grad().zero_(); // In the first step, inputs.grad() returns an undefined Tensor, so that numel() = 0.
+    }
+
+    if (PARAM.inp.of_ml_feg != 3)
+    {
+        this->nn->F = torch::softplus(this->nn->F);
+    }
+    if (PARAM.inp.of_ml_feg == 1)
+    {
+        this->nn->F = this->nn->F - this->feg_net_F + 1.;
+    }
+    else if (PARAM.inp.of_ml_feg == 3)
+    {
+        this->nn->F = torch::softplus(this->nn->F - this->feg_net_F + this->feg3_correct);
+    }
+    ModuleBase::timer::tick("KEDF_ML", "Forward");
+
+    if (cal_grad)
+    {
+        ModuleBase::timer::tick("KEDF_ML", "Backward");
+        this->nn->F.backward(torch::ones({this->nx, 1}, this->device_type));
+        ModuleBase::timer::tick("KEDF_ML", "Backward");
+    }
+}
+
+/**
+ * @brief Dump the torch::Tensor into .npy file
+ *
+ * @param data torch::Tensor
+ * @param filename file name
+ */
+void KEDF_ML::dumpTensor(const torch::Tensor &data, std::string filename)
+{
+    std::cout << "Dumping " << filename << std::endl;
+    torch::Tensor data_cpu = data.to(this->device_CPU).contiguous();
+    std::vector<double> v(data_cpu.data_ptr<double>(), data_cpu.data_ptr<double>() + data_cpu.numel());
+    // for (int ir = 0; ir < this->nx; ++ir) assert(v[ir] == data[ir].item());
+    this->ml_data->dumpVector(filename, v);
+}
+
+/**
+ * @brief Dump the matrix into .npy file
+ *
+ * @param data matrix
+ * @param filename file name
+ */
+void KEDF_ML::dumpMatrix(const ModuleBase::matrix &data, std::string filename)
+{
+    std::cout << "Dumping " << filename << std::endl;
+    std::vector<double> v(data.c, data.c + this->nx);
+    // for (int ir = 0; ir < this->nx; ++ir) assert(v[ir] == data[ir].item());
+    this->ml_data->dumpVector(filename, v);
+}
+
+/**
+ * @brief Update the descriptors for ML KEDF
+ *
+ * @param prho charge density
+ * @param pw_rho PW_Basis
+ */
+void KEDF_ML::updateInput(const double * const * prho, ModulePW::PW_Basis *pw_rho)
+{
+    ModuleBase::timer::tick("KEDF_ML", "updateInput");
+    // std::cout << "updata_input" << std::endl;
+    if (this->gene_data_label["gamma"][0])
+    {
+        this->ml_data->getGamma(prho, this->gamma);
+    }
+    if (this->gene_data_label["p"][0])
+    {
+        this->ml_data->getNablaRho(prho, pw_rho, this->nablaRho);
+        this->ml_data->getP(prho, pw_rho, this->nablaRho, this->p);
+    }
+    if (this->gene_data_label["q"][0])
+    {
+        this->ml_data->getQ(prho, pw_rho, this->q);
+    }
+    if (this->gene_data_label["tanhp"][0])
+    {
+        this->ml_data->getTanhP(this->p, this->tanhp);
+    }
+    if (this->gene_data_label["tanhq"][0])
+    {
+        this->ml_data->getTanhQ(this->q, this->tanhq);
+    }
+
+    for (int ik = 0; ik < nkernel; ++ik)
+    {
+        if (this->gene_data_label["gammanl"][ik]){
+            this->ml_data->getGammanl(ik, this->gamma, pw_rho, this->gammanl[ik]);
+        }
+        if (this->gene_data_label["pnl"][ik]){
+            this->ml_data->getPnl(ik, this->p, pw_rho, this->pnl[ik]);
+        }
+        if (this->gene_data_label["qnl"][ik]){
+            this->ml_data->getQnl(ik, this->q, pw_rho, this->qnl[ik]);
+        }
+        if (this->gene_data_label["xi"][ik]){
+            this->ml_data->getXi(this->gamma, this->gammanl[ik], this->xi[ik]);
+        }
+        if (this->gene_data_label["tanhxi"][ik]){
+            this->ml_data->getTanhXi(ik, this->gamma, this->gammanl[ik], this->tanhxi[ik]);
+        }
+        if (this->gene_data_label["tanhxi_nl"][ik]){
+            this->ml_data->getTanhXi_nl(ik, this->tanhxi[ik], pw_rho, this->tanhxi_nl[ik]);
+        }
+        if (this->gene_data_label["tanh_pnl"][ik]){
+            this->ml_data->getTanh_Pnl(ik, this->pnl[ik], this->tanh_pnl[ik]);
+        }
+        if (this->gene_data_label["tanh_qnl"][ik]){
+            this->ml_data->getTanh_Qnl(ik, this->qnl[ik], this->tanh_qnl[ik]);
+        }
+        if (this->gene_data_label["tanhp_nl"][ik]){
+            this->ml_data->getTanhP_nl(ik, this->tanhp, pw_rho, this->tanhp_nl[ik]);
+        }
+        if (this->gene_data_label["tanhq_nl"][ik]){
+            this->ml_data->getTanhQ_nl(ik,
this->tanhq, pw_rho, this->tanhq_nl[ik]);
+        }
+    }
+    ModuleBase::timer::tick("KEDF_ML", "updateInput");
+}
+
+/**
+ * @brief Return the descriptors for ML KEDF
+ *
+ * @param parameter "gamma", "p", "q", "tanhp", "tanhq", "gammanl", "pnl", "qnl", "xi", "tanhxi", "tanhxi_nl", "tanh_pnl", "tanh_qnl", "tanhp_nl", "tanhq_nl"
+ * @param ikernel kernel index
+ */
+torch::Tensor KEDF_ML::get_data(std::string parameter, const int ikernel)
+{
+    if (parameter == "gamma")
+    {
+        return torch::tensor(this->gamma, this->device_type);
+    }
+    if (parameter == "p")
+    {
+        return torch::tensor(this->p, this->device_type);
+    }
+    if (parameter == "q")
+    {
+        return torch::tensor(this->q, this->device_type);
+    }
+    if (parameter == "tanhp")
+    {
+        return torch::tensor(this->tanhp, this->device_type);
+    }
+    if (parameter == "tanhq")
+    {
+        return torch::tensor(this->tanhq, this->device_type);
+    }
+    if (parameter == "gammanl")
+    {
+        return torch::tensor(this->gammanl[ikernel], this->device_type);
+    }
+    if (parameter == "pnl")
+    {
+        return torch::tensor(this->pnl[ikernel], this->device_type);
+    }
+    if (parameter == "qnl")
+    {
+        return torch::tensor(this->qnl[ikernel], this->device_type);
+    }
+    if (parameter == "xi")
+    {
+        return torch::tensor(this->xi[ikernel], this->device_type);
+    }
+    if (parameter == "tanhxi")
+    {
+        return torch::tensor(this->tanhxi[ikernel], this->device_type);
+    }
+    if (parameter == "tanhxi_nl")
+    {
+        return torch::tensor(this->tanhxi_nl[ikernel], this->device_type);
+    }
+    if (parameter == "tanh_pnl")
+    {
+        return torch::tensor(this->tanh_pnl[ikernel], this->device_type);
+    }
+    if (parameter == "tanh_qnl")
+    {
+        return torch::tensor(this->tanh_qnl[ikernel], this->device_type);
+    }
+    if (parameter == "tanhp_nl")
+    {
+        return torch::tensor(this->tanhp_nl[ikernel], this->device_type);
+    }
+    if (parameter == "tanhq_nl")
+    {
+        return torch::tensor(this->tanhq_nl[ikernel], this->device_type);
+    }
+    return torch::zeros({});
+}
+#endif
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_ofdft/kedf_ml.h b/source/module_hamilt_pw/hamilt_ofdft/kedf_ml.h
new file mode 100644
index 0000000000..3825a85bd9
--- /dev/null
+++ b/source/module_hamilt_pw/hamilt_ofdft/kedf_ml.h
@@ -0,0 +1,197 @@
+#ifndef KEDF_ML_H
+#define KEDF_ML_H
+
+#ifdef __MLKEDF
+
+#include "ml_data.h"
+
+#include <torch/torch.h>
+#include "module_hamilt_pw/hamilt_ofdft/kedf_wt.h"
+#include "module_hamilt_pw/hamilt_ofdft/kedf_tf.h"
+#include "./ml_tools/nn_of.h"
+
+class KEDF_ML
+{
+public:
+    KEDF_ML()
+    {
+        // this->stress.create(3,3);
+    }
+    ~KEDF_ML()
+    {
+        delete this->ml_data;
+    }
+
+    void set_para(
+        const int nx,
+        const double dV,
+        const double nelec,
+        const double tf_weight,
+        const double vw_weight,
+        const double chi_p,
+        const double chi_q,
+        const std::vector<double> &chi_xi,
+        const std::vector<double> &chi_pnl,
+        const std::vector<double> &chi_qnl,
+        const int &nkernel,
+        const std::vector<int> &kernel_type,
+        const std::vector<double> &kernel_scaling,
+        const std::vector<double> &yukawa_alpha,
+        const std::vector<std::string> &kernel_file,
+        const bool &of_ml_gamma,
+        const bool &of_ml_p,
+        const bool &of_ml_q,
+        const bool &of_ml_tanhp,
+        const bool &of_ml_tanhq,
+        const std::vector<int> &of_ml_gammanl,
+        const std::vector<int> &of_ml_pnl,
+        const std::vector<int> &of_ml_qnl,
+        const std::vector<int> &of_ml_xi,
+        const std::vector<int> &of_ml_tanhxi,
+        const std::vector<int> &of_ml_tanhxi_nl,
+        const std::vector<int> &of_ml_tanh_pnl,
+        const std::vector<int> &of_ml_tanh_qnl,
+        const std::vector<int> &of_ml_tanhp_nl,
+        const std::vector<int> &of_ml_tanhq_nl,
+        const std::string device_inpt,
+        ModulePW::PW_Basis *pw_rho);
+
+    void
+    void set_device(std::string device_inpt);
+
+    double get_energy(const double * const * prho, ModulePW::PW_Basis *pw_rho);
+    // double get_energy_density(const double * const *prho, int is, int ir, ModulePW::PW_Basis *pw_rho);
+    void ml_potential(const double * const * prho, ModulePW::PW_Basis *pw_rho, ModuleBase::matrix &rpotential);
+    // void get_stress(double cellVol, const double * const * prho, ModulePW::PW_Basis *pw_rho, double vw_weight);
+    // double diffLinhard(double eta, double vw_weight);
+
+    // output all parameters
+    void generateTrainData(const double * const *prho, KEDF_WT &wt, KEDF_TF &tf, ModulePW::PW_Basis *pw_rho, const double *veff);
+    void localTest(const double * const *prho, ModulePW::PW_Basis *pw_rho);
+
+    // interface to NN
+    void NN_forward(const double * const * prho, ModulePW::PW_Basis *pw_rho, bool cal_grad);
+
+    void get_potential_(const double * const * prho, ModulePW::PW_Basis *pw_rho, ModuleBase::matrix &rpotential);
+
+    // potentials
+    double potGammaTerm(int ir);
+    double potPTerm1(int ir);
+    double potQTerm1(int ir);
+    double potXiTerm1(int ir);
+    double potTanhxiTerm1(int ir);
+    double potTanhpTerm1(int ir);
+    double potTanhqTerm1(int ir);
+    void potGammanlTerm(const double * const *prho, ModulePW::PW_Basis *pw_rho, std::vector<double> &rGammanlTerm);
+    void potXinlTerm(const double * const *prho, ModulePW::PW_Basis *pw_rho, std::vector<double> &rXinlTerm);
+    void potTanhxinlTerm(const double * const *prho, ModulePW::PW_Basis *pw_rho, std::vector<double> &rTanhxinlTerm);
+    void potTanhxi_nlTerm(const double * const *prho, ModulePW::PW_Basis *pw_rho, std::vector<double> &rTanhxi_nlTerm); // 2023-03-20 for tanhxi_nl
+    void potPPnlTerm(const double * const *prho, ModulePW::PW_Basis *pw_rho, std::vector<double> &rPPnlTerm);
+    void potQQnlTerm(const double * const *prho, ModulePW::PW_Basis *pw_rho, std::vector<double> &rQQnlTerm);
+    void potTanhpTanh_pnlTerm(const double * const *prho, ModulePW::PW_Basis *pw_rho, std::vector<double> &rTanhpTanh_pnlTerm);
+    void potTanhqTanh_qnlTerm(const double * const *prho, ModulePW::PW_Basis *pw_rho, std::vector<double> &rTanhqTanh_qnlTerm);
+    void potTanhpTanhp_nlTerm(const double * const *prho, ModulePW::PW_Basis *pw_rho, std::vector<double> &rTanhpTanhp_nlTerm);
+    void potTanhqTanhq_nlTerm(const double * const *prho, ModulePW::PW_Basis *pw_rho, std::vector<double> &rTanhqTanhq_nlTerm);
+    // tools
+    void dumpTensor(const torch::Tensor &data, std::string filename);
+    void dumpMatrix(const ModuleBase::matrix &data, std::string filename);
+    void updateInput(const double * const * prho, ModulePW::PW_Basis *pw_rho);
+
+    ML_data *ml_data = nullptr;
+
+    int nx = 0;     // number of grid points
+    int nx_tot = 0; // equal to nx (called by NN)
+    double dV = 0.;
+    // double weightml = 1.;
+    const double cTF = 3.0/10.0 * std::pow(3*std::pow(M_PI, 2.0), 2.0/3.0) * 2; // 3/10*(3*pi^2)^{2/3}, multiplied by 2 to convert from Hartree to Ry, so cTF is in Ry*Bohr^(-2)
+    const double pqcoef = 1.0 / (4.0 * std::pow(3*std::pow(M_PI, 2.0), 2.0/3.0)); // coefficient of p and q
+    double ml_energy = 0.;
+    // ModuleBase::matrix stress;
+    double feg_net_F = 0.;
+    double feg3_correct = 0.541324854612918; // ln(e - 1)
+
+    // Descriptors and hyperparameters
+    int ninput = 0; // number of descriptors
+    std::vector<double> gamma = {};
+    std::vector<double> p = {};
+    std::vector<double> q = {};
+    std::vector<std::vector<double>> gammanl = {};
+    std::vector<std::vector<double>> pnl = {};
+    std::vector<std::vector<double>> qnl = {};
+    std::vector<std::vector<double>> nablaRho = {};
+    // new parameters 2023-02-13
+    std::vector<double> chi_xi = {1.0};
+    double chi_p = 1.;
+    double chi_q = 1.;
+    std::vector<std::vector<double>> xi = {}; // we assume ONLY ONE of them is used.
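+    // For reference, the semi-local descriptors above are defined (see
+    // ml_data_descriptor.cpp) as
+    //     gamma = (rho / rho0)^(1/3),
+    //     p     = |\nabla rho|^2 * pqcoef / rho^(8/3),
+    //     q     = (\nabla^2 rho) * pqcoef / rho^(5/3),
+    // with pqcoef = 1 / [4 (3 pi^2)^(2/3)]; the *nl variants are the
+    // corresponding kernel convolutions, and xi = gammanl / gamma.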
+    std::vector<std::vector<double>> tanhxi = {};
+    std::vector<std::vector<double>> tanhxi_nl = {}; // 2023-03-20
+    std::vector<double> tanhp = {};
+    std::vector<double> tanhq = {};
+    // plan 1
+    std::vector<double> chi_pnl = {1.0};
+    std::vector<double> chi_qnl = {1.0};
+    std::vector<std::vector<double>> tanh_pnl = {};
+    std::vector<std::vector<double>> tanh_qnl = {};
+    // plan 2
+    std::vector<std::vector<double>> tanhp_nl = {};
+    std::vector<std::vector<double>> tanhq_nl = {};
+    // GPU
+    torch::DeviceType device_type = torch::kCPU;
+    torch::Device device = torch::Device(torch::kCPU);
+    torch::Device device_CPU = torch::Device(torch::kCPU);
+
+    // Neural network
+    std::shared_ptr<NN_OFImpl> nn;
+    double* enhancement_cpu_ptr = nullptr;
+    double* gradient_cpu_ptr = nullptr;
+
+    int nkernel = 1; // number of kernels
+
+    // maps
+    void init_data(
+        const int &nkernel,
+        const bool &of_ml_gamma,
+        const bool &of_ml_p,
+        const bool &of_ml_q,
+        const bool &of_ml_tanhp,
+        const bool &of_ml_tanhq,
+        const std::vector<int> &of_ml_gammanl,
+        const std::vector<int> &of_ml_pnl,
+        const std::vector<int> &of_ml_qnl,
+        const std::vector<int> &of_ml_xi,
+        const std::vector<int> &of_ml_tanhxi,
+        const std::vector<int> &of_ml_tanhxi_nl,
+        const std::vector<int> &of_ml_tanh_pnl,
+        const std::vector<int> &of_ml_tanh_qnl,
+        const std::vector<int> &of_ml_tanhp_nl,
+        const std::vector<int> &of_ml_tanhq_nl
+    );
+
+    // Whether to use the corresponding descriptors
+    bool ml_gamma = false;
+    bool ml_p = false;
+    bool ml_q = false;
+    bool ml_tanhp = false;
+    bool ml_tanhq = false;
+    bool ml_gammanl = false;
+    bool ml_pnl = false;
+    bool ml_qnl = false;
+    bool ml_xi = false;
+    bool ml_tanhxi = false;
+    bool ml_tanhxi_nl = false;
+    bool ml_tanh_pnl = false;
+    bool ml_tanh_qnl = false;
+    bool ml_tanhp_nl = false;
+    bool ml_tanhq_nl = false;
+
+    std::vector<std::string> descriptor_type = {};                  // the descriptors used
+    std::vector<int> kernel_index = {};                             // the index of the kernel used by each descriptor
+    std::map<std::string, std::vector<int>> descriptor2kernel = {}; // the map from descriptor to kernel index
+    std::map<std::string, std::vector<int>> descriptor2index = {};  // the map from descriptor to input index
+    std::map<std::string, std::vector<int>> gene_data_label = {};   // the map from descriptor to "generate it" label
+
+    torch::Tensor get_data(std::string parameter, const int ikernel); // get the descriptor data for the ikernel-th kernel
+};
+
+#endif
+#endif
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_ofdft/kedf_ml_label.cpp b/source/module_hamilt_pw/hamilt_ofdft/kedf_ml_label.cpp
new file mode 100644
index 0000000000..2d6ba25c99
--- /dev/null
+++ b/source/module_hamilt_pw/hamilt_ofdft/kedf_ml_label.cpp
@@ -0,0 +1,313 @@
+#ifdef __MLKEDF
+
+#include "kedf_ml.h"
+
+/**
+ * @brief Initialize the data for the ML KEDF, and generate the mapping between descriptors and kernels
+ *
+ * @param nkernel number of kernels
+ * @param of_ml_gamma whether to use the gamma descriptor
+ * @param of_ml_p whether to use the p descriptor
+ * @param of_ml_q whether to use the q descriptor
+ * @param of_ml_tanhp whether to use the tanhp descriptor
+ * @param of_ml_tanhq whether to use the tanhq descriptor
+ * @param of_ml_gammanl whether to use the gammanl descriptor (per kernel)
+ * @param of_ml_pnl whether to use the pnl descriptor (per kernel)
+ * @param of_ml_qnl whether to use the qnl descriptor (per kernel)
+ * @param of_ml_xi whether to use the xi descriptor (per kernel)
+ * @param of_ml_tanhxi whether to use the tanhxi descriptor (per kernel)
+ * @param of_ml_tanhxi_nl whether to use the tanhxi_nl descriptor (per kernel)
+ * @param of_ml_tanh_pnl whether to use the tanh_pnl descriptor (per kernel)
+ * @param of_ml_tanh_qnl whether to use the tanh_qnl descriptor (per kernel)
+ * @param of_ml_tanhp_nl whether to use the tanhp_nl descriptor (per kernel)
+ * @param of_ml_tanhq_nl whether to use the tanhq_nl descriptor (per kernel)
+ */
+void KEDF_ML::init_data(
+    const int &nkernel,
+    const bool &of_ml_gamma,
+    const bool &of_ml_p,
+    const bool &of_ml_q,
+    const bool &of_ml_tanhp,
+    const bool &of_ml_tanhq,
+    const std::vector<int> &of_ml_gammanl,
+    const std::vector<int> &of_ml_pnl,
+    const std::vector<int> &of_ml_qnl,
+    const std::vector<int> &of_ml_xi,
+    const std::vector<int> &of_ml_tanhxi,
+    const std::vector<int> &of_ml_tanhxi_nl,
+    const std::vector<int> &of_ml_tanh_pnl,
+    const std::vector<int> &of_ml_tanh_qnl,
+    const std::vector<int> &of_ml_tanhp_nl,
+    const std::vector<int> &of_ml_tanhq_nl
+)
+{
+    this->ninput = 0;
+
+    // --------- semi-local descriptors ---------
+    if (of_ml_gamma){
+        this->descriptor_type.push_back("gamma");
+        this->kernel_index.push_back(-1);
+        this->ninput++;
+    }
+    if (of_ml_p){
+        this->descriptor_type.push_back("p");
+        this->kernel_index.push_back(-1);
+        this->ninput++;
+    }
+    if (of_ml_q){
+        this->descriptor_type.push_back("q");
+        this->kernel_index.push_back(-1);
+        this->ninput++;
+    }
+    // --------- non-local descriptors ---------
+    for (int ik = 0; ik < nkernel; ++ik)
+    {
+        if (of_ml_gammanl[ik]){
+            this->descriptor_type.push_back("gammanl");
+            this->kernel_index.push_back(ik);
+            this->ninput++;
+        }
+        if (of_ml_pnl[ik]){
+            this->descriptor_type.push_back("pnl");
+            this->kernel_index.push_back(ik);
+            this->ninput++;
+        }
+        if (of_ml_qnl[ik]){
+            this->descriptor_type.push_back("qnl");
+            this->kernel_index.push_back(ik);
+            this->ninput++;
+        }
+        if (of_ml_xi[ik]){
+            this->descriptor_type.push_back("xi");
+            this->kernel_index.push_back(ik);
+            this->ninput++;
+        }
+        if (of_ml_tanhxi[ik]){
+            this->descriptor_type.push_back("tanhxi");
+            this->kernel_index.push_back(ik);
+            this->ninput++;
+        }
+        if (of_ml_tanhxi_nl[ik]){
+            this->descriptor_type.push_back("tanhxi_nl");
+            this->kernel_index.push_back(ik);
+            this->ninput++;
+        }
+    }
+    // --------- semi-local descriptors ---------
+    if (of_ml_tanhp){
+        this->descriptor_type.push_back("tanhp");
+        this->kernel_index.push_back(-1);
+        this->ninput++;
+    }
+    if (of_ml_tanhq){
+        this->descriptor_type.push_back("tanhq");
+        this->kernel_index.push_back(-1);
+        this->ninput++;
+    }
+    // --------- non-local descriptors ---------
+    for (int ik = 0; ik < nkernel; ++ik)
+    {
+        if (of_ml_tanh_pnl[ik]){
+            this->descriptor_type.push_back("tanh_pnl");
+            this->kernel_index.push_back(ik);
+            this->ninput++;
+        }
+        if (of_ml_tanh_qnl[ik]){
+            this->descriptor_type.push_back("tanh_qnl");
+            this->kernel_index.push_back(ik);
+            this->ninput++;
+        }
+        if (of_ml_tanhp_nl[ik]){
+            this->descriptor_type.push_back("tanhp_nl");
+            this->kernel_index.push_back(ik);
+            this->ninput++;
+        }
+        if (of_ml_tanhq_nl[ik]){
+            this->descriptor_type.push_back("tanhq_nl");
+            this->kernel_index.push_back(ik);
+            this->ninput++;
+        }
+    }
+
+    this->descriptor2kernel = {{"gamma", {}},
+                               {"p", {}},
+                               {"q", {}},
+                               {"tanhp", {}},
+                               {"tanhq", {}},
+                               {"gammanl", {}},
+                               {"pnl", {}},
+                               {"qnl", {}},
+                               {"xi", {}},
+                               {"tanhxi", {}},
+                               {"tanhxi_nl", {}},
+                               {"tanh_pnl", {}},
+                               {"tanh_qnl", {}},
+                               {"tanhp_nl", {}},
+                               {"tanhq_nl", {}}};
+    this->descriptor2index = this->descriptor2kernel;
+
+    for (int i = 0; i < this->ninput; ++i)
+    {
+        this->descriptor2kernel[descriptor_type[i]].push_back(kernel_index[i]);
+        this->descriptor2index[descriptor_type[i]].push_back(i);
+    }
+    // print the descriptor -> (input index, kernel index) mapping;
+    // std::map has no operator<<, so stream the entries explicitly
+    for (const auto &entry : this->descriptor2index)
+    {
+        if (entry.second.empty()) { continue; }
+        std::cout << "descriptor " << entry.first << ": index =";
+        for (const int idx : entry.second) { std::cout << " " << idx; }
+        std::cout << ", kernel =";
+        for (const int ker : this->descriptor2kernel[entry.first]) { std::cout << " " << ker; }
+        std::cout << std::endl;
+    }
+
+    this->ml_gamma = this->descriptor2index["gamma"].size() > 0;
+    this->ml_p = this->descriptor2index["p"].size() > 0;
+    this->ml_q = this->descriptor2index["q"].size() > 0;
+    this->ml_tanhp = this->descriptor2index["tanhp"].size() > 0;
+    this->ml_tanhq = this->descriptor2index["tanhq"].size() > 0;
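+    // NOTE: the ml_* flags mark descriptors that enter the network input,
+    // while gene_data_label (filled below) also marks intermediate quantities
+    // that must be generated, e.g. gamma is required whenever any gammanl or
+    // xi channel is requested.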
this->ml_gammanl = this->descriptor2index["gammanl"].size() > 0; + this->ml_pnl = this->descriptor2index["pnl"].size() > 0; + this->ml_qnl = this->descriptor2index["qnl"].size() > 0; + this->ml_xi = this->descriptor2index["xi"].size() > 0; + this->ml_tanhxi = this->descriptor2index["tanhxi"].size() > 0; + this->ml_tanhxi_nl = this->descriptor2index["tanhxi_nl"].size() > 0; + this->ml_tanh_pnl = this->descriptor2index["tanh_pnl"].size() > 0; + this->ml_tanh_qnl = this->descriptor2index["tanh_qnl"].size() > 0; + this->ml_tanhp_nl = this->descriptor2index["tanhp_nl"].size() > 0; + this->ml_tanhq_nl = this->descriptor2index["tanhq_nl"].size() > 0; + + bool gene_gammanl_tot = false; + bool gene_pnl_tot = false; + bool gene_qnl_tot = false; + bool gene_tanh_pnl_tot = false; + bool gene_tanh_qnl_tot = false; + bool gene_tanhp_nl_tot = false; + bool gene_tanhq_nl_tot = false; + + this->gene_data_label = {{"gamma", {}}, + {"p", {}}, + {"q", {}}, + {"tanhp", {}}, + {"tanhq", {}}, + {"gammanl", {}}, + {"pnl", {}}, + {"qnl", {}}, + {"xi", {}}, + {"tanhxi", {}}, + {"tanhxi_nl", {}}, + {"tanh_pnl", {}}, + {"tanh_qnl", {}}, + {"tanhp_nl", {}}, + {"tanhq_nl", {}}}; + + for (std::string descriptor : {"gamma", "p", "q", "tanhp", "tanhq"}) + { + this->gene_data_label[descriptor].push_back(0); + } + for (std::string descriptor : {"gammanl", "pnl", "qnl", "xi", "tanhxi", "tanhxi_nl", + "tanh_pnl", "tanh_qnl", "tanhp_nl", "tanhq_nl"}) + { + for (int ik = 0; ik < nkernel; ++ik) + { + this->gene_data_label[descriptor].push_back(0); + } + } + + for (int ik = 0; ik < nkernel; ++ik) + { + this->gene_data_label["pnl"][ik] = of_ml_pnl[ik] || of_ml_tanh_pnl[ik]; + this->gene_data_label["qnl"][ik] = of_ml_qnl[ik] || of_ml_tanh_qnl[ik]; + this->gene_data_label["tanhxi_nl"][ik] = of_ml_tanhxi_nl[ik]; + this->gene_data_label["tanhxi"][ik] = of_ml_tanhxi[ik] || of_ml_tanhxi_nl[ik]; + this->gene_data_label["xi"][ik] = of_ml_xi[ik] || this->gene_data_label["tanhxi"][ik]; + this->gene_data_label["gammanl"][ik] = of_ml_gammanl[ik] || this->gene_data_label["xi"][ik]; + this->gene_data_label["tanh_pnl"][ik] = of_ml_tanh_pnl[ik]; + this->gene_data_label["tanh_qnl"][ik] = of_ml_tanh_qnl[ik]; + this->gene_data_label["tanhp_nl"][ik] = of_ml_tanhp_nl[ik]; + this->gene_data_label["tanhq_nl"][ik] = of_ml_tanhq_nl[ik]; + // this->gene_data_label["pnl"][ik] = of_ml_pnl[ik] || of_ml_tanh_pnl[ik]; + + gene_gammanl_tot = gene_gammanl_tot || this->gene_data_label["gammanl"][ik]; + gene_pnl_tot = gene_pnl_tot || this->gene_data_label["pnl"][ik]; + gene_qnl_tot = gene_qnl_tot || this->gene_data_label["qnl"][ik]; + gene_tanh_pnl_tot = gene_tanh_pnl_tot || this->gene_data_label["tanh_pnl"][ik]; + gene_tanh_qnl_tot = gene_tanh_qnl_tot || this->gene_data_label["tanh_qnl"][ik]; + gene_tanhp_nl_tot = gene_tanhp_nl_tot || this->gene_data_label["tanhp_nl"][ik]; + gene_tanhq_nl_tot = gene_tanhq_nl_tot || this->gene_data_label["tanhq_nl"][ik]; + + // std::cout << "gene_gammanl " << this->gene_data_label["gammanl"][ik] << std::endl; + // std::cout << "gene_pnl " << this->gene_data_label["pnl"][ik] << std::endl; + // std::cout << "gene_qnl " << this->gene_data_label["qnl"][ik] << std::endl; + // std::cout << "gene_tanhxi_nl " << this->gene_data_label["tanhxi_nl"][ik] << std::endl; + // std::cout << "gene_tanhxi " << this->gene_data_label["tanhxi"][ik] << std::endl; + // std::cout << "gene_xi " << this->gene_data_label["xi"][ik] << std::endl; + // std::cout << "gene_tanh_pnl " << this->gene_data_label["tanh_pnl"][ik] << std::endl; + // std::cout << 
"gene_tanh_qnl " << this->gene_data_label["tanh_qnl"][ik] << std::endl; + // std::cout << "gene_tanhp_nl " << this->gene_data_label["tanhp_nl"][ik] << std::endl; + // std::cout << "gene_tanhq_nl " << this->gene_data_label["tanhq_nl"][ik] << std::endl; + } + this->gene_data_label["gamma"][0] = of_ml_gamma || gene_gammanl_tot; + this->gene_data_label["tanhp"][0] = of_ml_tanhp || gene_tanhp_nl_tot || gene_tanh_pnl_tot; + this->gene_data_label["tanhq"][0] = of_ml_tanhq || gene_tanhq_nl_tot || gene_tanh_qnl_tot; + this->gene_data_label["p"][0] = of_ml_p || this->gene_data_label["tanhp"][0] || gene_pnl_tot; + this->gene_data_label["q"][0] = of_ml_q || this->gene_data_label["tanhq"][0] || gene_qnl_tot; + + + if (this->gene_data_label["gamma"][0]){ + this->gamma = std::vector(this->nx, 0.); + } + if (this->gene_data_label["p"][0]){ + this->nablaRho = std::vector >(3, std::vector(this->nx, 0.)); + this->p = std::vector(this->nx, 0.); + } + if (this->gene_data_label["q"][0]){ + this->q = std::vector(this->nx, 0.); + } + if (this->gene_data_label["tanhp"][0]){ + this->tanhp = std::vector(this->nx, 0.); + } + if (this->gene_data_label["tanhq"][0]){ + this->tanhq = std::vector(this->nx, 0.); + } + + for (int ik = 0; ik < nkernel; ++ik) + { + this->gammanl.push_back({}); + this->pnl.push_back({}); + this->qnl.push_back({}); + this->xi.push_back({}); + this->tanhxi.push_back({}); + this->tanhxi_nl.push_back({}); + this->tanh_pnl.push_back({}); + this->tanh_qnl.push_back({}); + this->tanhp_nl.push_back({}); + this->tanhq_nl.push_back({}); + + if (this->gene_data_label["gammanl"][ik]){ + this->gammanl[ik] = std::vector(this->nx, 0.); + } + if (this->gene_data_label["pnl"][ik]){ + this->pnl[ik] = std::vector(this->nx, 0.); + } + if (this->gene_data_label["qnl"][ik]){ + this->qnl[ik] = std::vector(this->nx, 0.); + } + if (this->gene_data_label["xi"][ik]){ + this->xi[ik] = std::vector(this->nx, 0.); + } + if (this->gene_data_label["tanhxi"][ik]){ + this->tanhxi[ik] = std::vector(this->nx, 0.); + } + if (this->gene_data_label["tanhxi_nl"][ik]){ + this->tanhxi_nl[ik] = std::vector(this->nx, 0.); + } + if (this->gene_data_label["tanh_pnl"][ik]){ + this->tanh_pnl[ik] = std::vector(this->nx, 0.); + } + if (this->gene_data_label["tanh_qnl"][ik]){ + this->tanh_qnl[ik] = std::vector(this->nx, 0.); + } + if (this->gene_data_label["tanhp_nl"][ik]){ + this->tanhp_nl[ik] = std::vector(this->nx, 0.); + } + if (this->gene_data_label["tanhq_nl"][ik]){ + this->tanhq_nl[ik] = std::vector(this->nx, 0.); + } + } +} +#endif \ No newline at end of file diff --git a/source/module_hamilt_pw/hamilt_ofdft/kedf_ml_pot.cpp b/source/module_hamilt_pw/hamilt_ofdft/kedf_ml_pot.cpp new file mode 100644 index 0000000000..1efbcaf368 --- /dev/null +++ b/source/module_hamilt_pw/hamilt_ofdft/kedf_ml_pot.cpp @@ -0,0 +1,534 @@ +#ifdef __MLKEDF + +#include "kedf_ml.h" + +#include "module_base/parallel_reduce.h" +#include "module_base/global_function.h" + +/** + * @brief Calculate the Pauli potential of ML KEDF + * + * @param prho charge density + * @param pw_rho PW_Basis + * @param rpotential rpotential => rpotential + V_{ML} + */ +void KEDF_ML::get_potential_(const double * const * prho, ModulePW::PW_Basis *pw_rho, ModuleBase::matrix &rpotential) +{ + // get potential + ModuleBase::timer::tick("KEDF_ML", "Pauli Potential"); + + std::vector pauli_potential(this->nx, 0.); + + if (this->ml_gammanl){ + this->potGammanlTerm(prho, pw_rho, pauli_potential); + } + if (this->ml_xi){ + this->potXinlTerm(prho, pw_rho, pauli_potential); + } + if 
(this->ml_tanhxi){ + this->potTanhxinlTerm(prho, pw_rho, pauli_potential); + } + if (this->ml_tanhxi_nl){ + this->potTanhxi_nlTerm(prho, pw_rho, pauli_potential); + } + if (this->ml_p || this->ml_pnl){ + this->potPPnlTerm(prho, pw_rho, pauli_potential); + } + if (this->ml_q || this->ml_qnl){ + this->potQQnlTerm(prho, pw_rho, pauli_potential); + } + if (this->ml_tanh_pnl){ + this->potTanhpTanh_pnlTerm(prho, pw_rho, pauli_potential); + } + if (this->ml_tanh_qnl){ + this->potTanhqTanh_qnlTerm(prho, pw_rho, pauli_potential); + } + if ((this->ml_tanhp || this->ml_tanhp_nl) && !this->ml_tanh_pnl){ + this->potTanhpTanhp_nlTerm(prho, pw_rho, pauli_potential); + } + if ((this->ml_tanhq || this->ml_tanhq_nl) && !this->ml_tanh_qnl){ + this->potTanhqTanhq_nlTerm(prho, pw_rho, pauli_potential); + } + + for (int ir = 0; ir < this->nx; ++ir) + { + pauli_potential[ir] += this->cTF * std::pow(prho[0][ir], 5./3.) / prho[0][ir] * + (5./3. * this->enhancement_cpu_ptr[ir] + this->potGammaTerm(ir) + this->potPTerm1(ir) + this->potQTerm1(ir) + + this->potXiTerm1(ir) + this->potTanhxiTerm1(ir) + this->potTanhpTerm1(ir) + this->potTanhqTerm1(ir)); + rpotential(0, ir) += pauli_potential[ir]; + } + ModuleBase::timer::tick("KEDF_ML", "Pauli Potential"); +} + + +double KEDF_ML::potGammaTerm(int ir) +{ + return (this->ml_gamma) ? 1./3. * gamma[ir] * this->gradient_cpu_ptr[ir * this->ninput + this->descriptor2index["gamma"][0]] : 0.; +} + +double KEDF_ML::potPTerm1(int ir) +{ + return (this->ml_p) ? - 8./3. * p[ir] * this->gradient_cpu_ptr[ir * this->ninput + this->descriptor2index["p"][0]] : 0.; +} + +double KEDF_ML::potQTerm1(int ir) +{ + return (this->ml_q) ? - 5./3. * q[ir] * this->gradient_cpu_ptr[ir * this->ninput + this->descriptor2index["q"][0]] : 0.; +} + +double KEDF_ML::potXiTerm1(int ir) +{ + double result = 0.; + for (int ik = 0; ik < this->descriptor2kernel["xi"].size(); ++ik) + { + int d2k = this->descriptor2kernel["xi"][ik]; + int d2i = this->descriptor2index["xi"][ik]; + result += -1./3. * xi[d2k][ir] * this->gradient_cpu_ptr[ir * this->ninput + d2i]; + } + return result; +} + +double KEDF_ML::potTanhxiTerm1(int ir) +{ + double result = 0.; + for (int ik = 0; ik < this->descriptor2kernel["tanhxi"].size(); ++ik) + { + int d2k = this->descriptor2kernel["tanhxi"][ik]; + int d2i = this->descriptor2index["tanhxi"][ik]; + result += -1./3. * xi[d2k][ir] * this->ml_data->dtanh(this->tanhxi[d2k][ir], this->chi_xi[d2k]) + * this->gradient_cpu_ptr[ir * this->ninput + d2i]; + } + return result; +} + +double KEDF_ML::potTanhpTerm1(int ir) +{ + return (this->ml_tanhp) ? - 8./3. * p[ir] * this->ml_data->dtanh(this->tanhp[ir], this->chi_p) + * this->gradient_cpu_ptr[ir * this->ninput + this->descriptor2index["tanhp"][0]] : 0.; +} + +double KEDF_ML::potTanhqTerm1(int ir) +{ + return (this->ml_tanhq) ? - 5./3. * q[ir] * this->ml_data->dtanh(this->tanhq[ir], this->chi_q) + * this->gradient_cpu_ptr[ir * this->ninput + this->descriptor2index["tanhq"][0]] : 0.; +} + +void KEDF_ML::potGammanlTerm(const double * const *prho, ModulePW::PW_Basis *pw_rho, std::vector &rGammanlTerm) +{ + double *dFdgammanl = new double[this->nx]; + for (int ik = 0; ik < this->descriptor2kernel["gammanl"].size(); ++ik) + { + int d2k = this->descriptor2kernel["gammanl"][ik]; + int d2i = this->descriptor2index["gammanl"][ik]; + for (int ir = 0; ir < this->nx; ++ir) + { + dFdgammanl[ir] = this->cTF * std::pow(prho[0][ir], 5./3.) 
* this->gradient_cpu_ptr[ir * this->ninput + d2i]; + } + this->ml_data->multiKernel(d2k, dFdgammanl, pw_rho, dFdgammanl); + for (int ir = 0; ir < this->nx; ++ir) + { + rGammanlTerm[ir] += 1./3. * this->gamma[ir] / prho[0][ir] * dFdgammanl[ir]; + } + } + delete[] dFdgammanl; +} + +void KEDF_ML::potXinlTerm(const double * const *prho, ModulePW::PW_Basis *pw_rho, std::vector &rXinlTerm) +{ + double *dFdxi = new double[this->nx]; + for (int ik = 0; ik < this->descriptor2kernel["xi"].size(); ++ik) + { + int d2k = this->descriptor2kernel["xi"][ik]; + int d2i = this->descriptor2index["xi"][ik]; + for (int ir = 0; ir < this->nx; ++ir) + { + dFdxi[ir] = this->cTF * std::pow(prho[0][ir], 4./3.) * this->gradient_cpu_ptr[ir * this->ninput + d2i]; + } + this->ml_data->multiKernel(d2k, dFdxi, pw_rho, dFdxi); + for (int ir = 0; ir < this->nx; ++ir) + { + rXinlTerm[ir] += 1./3. * std::pow(prho[0][ir], -2./3.) * dFdxi[ir]; + } + } + delete[] dFdxi; +} + +void KEDF_ML::potTanhxinlTerm(const double * const *prho, ModulePW::PW_Basis *pw_rho, std::vector &rTanhxinlTerm) +{ + double *dFdtanhxi = new double[this->nx]; + for (int ik = 0; ik < this->descriptor2kernel["tanhxi"].size(); ++ik) + { + int d2k = this->descriptor2kernel["tanhxi"][ik]; + int d2i = this->descriptor2index["tanhxi"][ik]; + for (int ir = 0; ir < this->nx; ++ir) + { + dFdtanhxi[ir] = this->cTF * std::pow(prho[0][ir], 4./3.) * this->ml_data->dtanh(this->tanhxi[d2k][ir], this->chi_xi[d2k]) + * this->gradient_cpu_ptr[ir * this->ninput + d2i]; + } + this->ml_data->multiKernel(d2k, dFdtanhxi, pw_rho, dFdtanhxi); + for (int ir = 0; ir < this->nx; ++ir) + { + rTanhxinlTerm[ir] += 1./3. * std::pow(prho[0][ir], -2./3.) * dFdtanhxi[ir]; + } + } + delete[] dFdtanhxi; +} + +void KEDF_ML::potTanhxi_nlTerm(const double * const *prho, ModulePW::PW_Basis *pw_rho, std::vector &rTanhxi_nlTerm) +{ + double *dFdtanhxi_nl = new double[this->nx]; + double *dFdtanhxi_nl_nl = new double[this->nx]; + std::vector result(this->nx, 0.); + for (int ik = 0; ik < this->descriptor2kernel["tanhxi_nl"].size(); ++ik) + { + int d2k = this->descriptor2kernel["tanhxi_nl"][ik]; + int d2i = this->descriptor2index["tanhxi_nl"][ik]; + for (int ir = 0; ir < this->nx; ++ir) + { + dFdtanhxi_nl[ir] = this->cTF * std::pow(prho[0][ir], 5./3.) * this->gradient_cpu_ptr[ir * this->ninput + d2i]; + } + this->ml_data->multiKernel(d2k, dFdtanhxi_nl, pw_rho, dFdtanhxi_nl); + for (int ir = 0; ir < this->nx; ++ir) + { + dFdtanhxi_nl[ir] *= this->ml_data->dtanh(this->tanhxi[d2k][ir], this->chi_xi[d2k]) / std::pow(prho[0][ir], 1./3.); + } + this->ml_data->multiKernel(d2k, dFdtanhxi_nl, pw_rho, dFdtanhxi_nl_nl); + for (int ir = 0; ir < this->nx; ++ir) + { + result[ir] += dFdtanhxi_nl_nl[ir] - dFdtanhxi_nl[ir] * this->xi[d2k][ir]; + } + } + for (int ir = 0; ir < this->nx; ++ir) + { + rTanhxi_nlTerm[ir] += result[ir] * 1./3. * std::pow(prho[0][ir], -2./3.); + } + delete[] dFdtanhxi_nl; + delete[] dFdtanhxi_nl_nl; +} + +// get contribution of p and pnl +void KEDF_ML::potPPnlTerm(const double * const *prho, ModulePW::PW_Basis *pw_rho, std::vector &rPPnlTerm) +{ + double *dFdpnl = new double[this->nx]; + std::vector dFdpnl_tot(this->nx, 0.); + std::vector result(this->nx, 0.); + for (int ik = 0; ik < this->descriptor2index["pnl"].size(); ++ik) + { + int d2k = this->descriptor2kernel["pnl"][ik]; + int d2i = this->descriptor2index["pnl"][ik]; + + for (int ir = 0; ir < this->nx; ++ir) + { + dFdpnl[ir] = this->cTF * std::pow(prho[0][ir], 5./3.) 
* this->gradient_cpu_ptr[ir * this->ninput + d2i]; + } + this->ml_data->multiKernel(d2k, dFdpnl, pw_rho, dFdpnl); + for (int ir = 0; ir < this->nx; ++ir) + { + dFdpnl_tot[ir] += dFdpnl[ir]; + } + } + delete[] dFdpnl; + + double ** tempP = new double*[3]; + for (int i = 0; i < 3; ++i) + { + tempP[i] = new double[this->nx]; + for (int ir = 0; ir < this->nx; ++ir) + { + tempP[i][ir] = (this->ml_p)? - 3./20. * this->gradient_cpu_ptr[ir * this->ninput + this->descriptor2index["p"][0]] * nablaRho[i][ir] / prho[0][ir] * /*Ha2Ry*/ 2. : 0.; + if (this->ml_pnl) + { + tempP[i][ir] += - this->pqcoef * 2. * this->nablaRho[i][ir] / std::pow(prho[0][ir], 8./3.) * dFdpnl_tot[ir]; + } + } + } + this->ml_data->divergence(tempP, pw_rho, result.data()); + + if (this->ml_pnl) + { + for (int ir = 0; ir < this->nx; ++ir) + { + result[ir] += -8./3. * this->p[ir]/prho[0][ir] * dFdpnl_tot[ir]; + } + } + + for (int ir = 0; ir < this->nx; ++ir) + { + rPPnlTerm[ir] += result[ir]; + } + + for (int i = 0; i < 3; ++i) + { + delete[] tempP[i]; + } + delete[] tempP; +} + +void KEDF_ML::potQQnlTerm(const double * const *prho, ModulePW::PW_Basis *pw_rho, std::vector &rQQnlTerm) +{ + double *dFdqnl = new double[this->nx]; + std::vector dFdqnl_tot(this->nx, 0.); + std::vector result(this->nx, 0.); + for (int ik = 0; ik < this->descriptor2index["qnl"].size(); ++ik) + { + int d2k = this->descriptor2kernel["qnl"][ik]; + int d2i = this->descriptor2index["qnl"][ik]; + + for (int ir = 0; ir < this->nx; ++ir) + { + dFdqnl[ir] = this->cTF * std::pow(prho[0][ir], 5./3.) * this->gradient_cpu_ptr[ir * this->ninput + d2i]; + } + this->ml_data->multiKernel(d2k, dFdqnl, pw_rho, dFdqnl); + for (int ir = 0; ir < this->nx; ++ir) + { + dFdqnl_tot[ir] += dFdqnl[ir]; + } + } + delete[] dFdqnl; + + double * tempQ = new double[this->nx]; + for (int ir = 0; ir < this->nx; ++ir) + { + tempQ[ir] = (this->ml_q)? 3./40. * this->gradient_cpu_ptr[ir * this->ninput + this->descriptor2index["q"][0]] * /*Ha2Ry*/ 2. : 0.; + if (this->ml_qnl) + { + tempQ[ir] += this->pqcoef / std::pow(prho[0][ir], 5./3.) * dFdqnl_tot[ir]; + } + } + this->ml_data->Laplacian(tempQ, pw_rho, result.data()); + + if (this->ml_qnl) + { + for (int ir = 0; ir < this->nx; ++ir) + { + result[ir] += -5./3. * this->q[ir] / prho[0][ir] * dFdqnl_tot[ir]; + } + } + + for (int ir = 0; ir < this->nx; ++ir) + { + rQQnlTerm[ir] += result[ir]; + } + delete[] tempQ; +} + +void KEDF_ML::potTanhpTanh_pnlTerm(const double * const *prho, ModulePW::PW_Basis *pw_rho, std::vector &rTanhpTanh_pnlTerm) +{ + // Note we assume that tanhp_nl and tanh_pnl will NOT be used together. + double *dFdpnl = new double[this->nx]; + std::vector dFdpnl_tot(this->nx, 0.); + std::vector result(this->nx, 0.); + for (int ik = 0; ik < this->descriptor2kernel["tanh_pnl"].size(); ++ik) + { + int d2k = this->descriptor2kernel["tanh_pnl"][ik]; + int d2i = this->descriptor2index["tanh_pnl"][ik]; + + for (int ir = 0; ir < this->nx; ++ir) + { + dFdpnl[ir] = this->cTF * std::pow(prho[0][ir], 5./3.) * this->ml_data->dtanh(this->tanh_pnl[d2k][ir], this->chi_pnl[d2k]) + * this->gradient_cpu_ptr[ir * this->ninput + d2i]; + } + this->ml_data->multiKernel(d2k, dFdpnl, pw_rho, dFdpnl); + for (int ir = 0; ir < this->nx; ++ir) + { + dFdpnl_tot[ir] += dFdpnl[ir]; + } + } + delete[] dFdpnl; + + double ** tempP = new double*[3]; + for (int i = 0; i < 3; ++i) + { + tempP[i] = new double[this->nx]; + for (int ir = 0; ir < this->nx; ++ir) + { + tempP[i][ir] = (this->ml_tanhp)? - 3./20. 
* this->ml_data->dtanh(this->tanhp[ir], this->chi_p) + * this->gradient_cpu_ptr[ir * this->ninput + this->descriptor2index["tanhp"][0]] * nablaRho[i][ir] / prho[0][ir] * /*Ha2Ry*/ 2. : 0.; + if (this->ml_tanh_pnl) + { + tempP[i][ir] += - this->pqcoef * 2. * this->nablaRho[i][ir] / std::pow(prho[0][ir], 8./3.) * dFdpnl_tot[ir]; + } + } + } + this->ml_data->divergence(tempP, pw_rho, result.data()); + + if (this->ml_tanh_pnl) + { + for (int ir = 0; ir < this->nx; ++ir) + { + result[ir] += -8./3. * this->p[ir]/prho[0][ir] * dFdpnl_tot[ir]; + } + } + + for (int ir = 0; ir < this->nx; ++ir) + { + rTanhpTanh_pnlTerm[ir] += result[ir]; + } + for (int i = 0; i < 3; ++i) + { + delete[] tempP[i]; + } + delete[] tempP; +} + +void KEDF_ML::potTanhqTanh_qnlTerm(const double * const *prho, ModulePW::PW_Basis *pw_rho, std::vector &rTanhqTanh_qnlTerm) +{ + // Note we assume that tanhq_nl and tanh_qnl will NOT be used together. + double *dFdqnl = new double[this->nx]; + std::vector dFdqnl_tot(this->nx, 0.); + std::vector result(this->nx, 0.); + for (int ik = 0; ik < this->descriptor2kernel["tanh_qnl"].size(); ++ik) + { + int d2k = this->descriptor2kernel["tanh_qnl"][ik]; + int d2i = this->descriptor2index["tanh_qnl"][ik]; + + for (int ir = 0; ir < this->nx; ++ir) + { + dFdqnl[ir] = this->cTF * std::pow(prho[0][ir], 5./3.) * this->ml_data->dtanh(this->tanh_qnl[d2k][ir], this->chi_qnl[d2k]) + * this->gradient_cpu_ptr[ir * this->ninput + d2i]; + } + this->ml_data->multiKernel(d2k, dFdqnl, pw_rho, dFdqnl); + for (int ir = 0; ir < this->nx; ++ir) + { + dFdqnl_tot[ir] += dFdqnl[ir]; + } + } + delete[] dFdqnl; + + double * tempQ = new double[this->nx]; + for (int ir = 0; ir < this->nx; ++ir) + { + tempQ[ir] = (this->ml_tanhq)? 3./40. * this->ml_data->dtanh(this->tanhq[ir], this->chi_q) + * this->gradient_cpu_ptr[ir * this->ninput + this->descriptor2index["tanhq"][0]] * /*Ha2Ry*/ 2. : 0.; + if (this->ml_tanh_qnl) + { + tempQ[ir] += this->pqcoef / std::pow(prho[0][ir], 5./3.) * dFdqnl_tot[ir]; + } + } + this->ml_data->Laplacian(tempQ, pw_rho, result.data()); + + if (this->ml_tanh_qnl) + { + for (int ir = 0; ir < this->nx; ++ir) + { + result[ir] += -5./3. * this->q[ir] / prho[0][ir] * dFdqnl_tot[ir]; + } + } + + for (int ir = 0; ir < this->nx; ++ir) + { + rTanhqTanh_qnlTerm[ir] += result[ir]; + } + delete[] tempQ; +} + +// Note we assume that tanhp_nl and tanh_pnl will NOT be used together. +void KEDF_ML::potTanhpTanhp_nlTerm(const double * const *prho, ModulePW::PW_Basis *pw_rho, std::vector &rTanhpTanhp_nlTerm) +{ + double *dFdpnl = new double[this->nx]; + std::vector dFdpnl_tot(this->nx, 0.); + std::vector result(this->nx, 0.); + for (int ik = 0; ik < this->descriptor2kernel["tanhp_nl"].size(); ++ik) + { + int d2k = this->descriptor2kernel["tanhp_nl"][ik]; + int d2i = this->descriptor2index["tanhp_nl"][ik]; + + for (int ir = 0; ir < this->nx; ++ir) + { + dFdpnl[ir] = this->cTF * std::pow(prho[0][ir], 5./3.) + * this->gradient_cpu_ptr[ir * this->ninput + d2i]; + } + this->ml_data->multiKernel(d2k, dFdpnl, pw_rho, dFdpnl); + for (int ir = 0; ir < this->nx; ++ir) + { + dFdpnl_tot[ir] += this->ml_data->dtanh(this->tanhp[ir], this->chi_p) * dFdpnl[ir]; + } + } + delete[] dFdpnl; + + double ** tempP = new double*[3]; + for (int i = 0; i < 3; ++i) + { + tempP[i] = new double[this->nx]; + for (int ir = 0; ir < this->nx; ++ir) + { + tempP[i][ir] = (this->ml_tanhp)? - 3./20. 
* this->ml_data->dtanh(this->tanhp[ir], this->chi_p) + * this->gradient_cpu_ptr[ir * this->ninput + this->descriptor2index["tanhp"][0]] * nablaRho[i][ir] / prho[0][ir] * /*Ha2Ry*/ 2. : 0.; + if (this->ml_tanhp_nl) + { + tempP[i][ir] += - this->pqcoef * 2. * this->nablaRho[i][ir] / std::pow(prho[0][ir], 8./3.) * dFdpnl_tot[ir]; + } + } + } + this->ml_data->divergence(tempP, pw_rho, result.data()); + + if (this->ml_tanhp_nl) + { + for (int ir = 0; ir < this->nx; ++ir) + { + result[ir] += -8./3. * this->p[ir]/prho[0][ir] * dFdpnl_tot[ir]; + } + } + + for (int ir = 0; ir < this->nx; ++ir) + { + rTanhpTanhp_nlTerm[ir] += result[ir]; + } + for (int i = 0; i < 3; ++i) + { + delete[] tempP[i]; + } + delete[] tempP; +} + +void KEDF_ML::potTanhqTanhq_nlTerm(const double * const *prho, ModulePW::PW_Basis *pw_rho, std::vector &rTanhqTanhq_nlTerm) +{ + double *dFdqnl = new double[this->nx]; + std::vector dFdqnl_tot(this->nx, 0.); + std::vector result(this->nx, 0.); + for (int ik = 0; ik < this->descriptor2kernel["tanhq_nl"].size(); ++ik) + { + int d2k = this->descriptor2kernel["tanhq_nl"][ik]; + int d2i = this->descriptor2index["tanhq_nl"][ik]; + + for (int ir = 0; ir < this->nx; ++ir) + { + dFdqnl[ir] = this->cTF * std::pow(prho[0][ir], 5./3.) + * this->gradient_cpu_ptr[ir * this->ninput + d2i]; + } + this->ml_data->multiKernel(d2k, dFdqnl, pw_rho, dFdqnl); + for (int ir = 0; ir < this->nx; ++ir) + { + dFdqnl_tot[ir] += dFdqnl[ir]; + } + } + delete[] dFdqnl; + + double * tempQ = new double[this->nx]; + for (int ir = 0; ir < this->nx; ++ir) + { + tempQ[ir] = (this->ml_tanhq)? 3./40. * this->ml_data->dtanh(this->tanhq[ir], this->chi_q) + * this->gradient_cpu_ptr[ir * this->ninput + this->descriptor2index["tanhq"][0]] * /*Ha2Ry*/ 2. : 0.; + if (this->ml_tanhq_nl) + { + dFdqnl_tot[ir] *= this->ml_data->dtanh(this->tanhq[ir], this->chi_q); + tempQ[ir] += this->pqcoef / std::pow(prho[0][ir], 5./3.) * dFdqnl_tot[ir]; + } + } + this->ml_data->Laplacian(tempQ, pw_rho, result.data()); + + if (this->ml_tanhq_nl) + { + for (int ir = 0; ir < this->nx; ++ir) + { + result[ir] += -5./3. 
* this->q[ir] / prho[0][ir] * dFdqnl_tot[ir]; + } + } + + for (int ir = 0; ir < this->nx; ++ir) + { + rTanhqTanhq_nlTerm[ir] += result[ir]; + } + delete[] tempQ; +} +#endif \ No newline at end of file diff --git a/source/module_hamilt_pw/hamilt_ofdft/ml_data.cpp b/source/module_hamilt_pw/hamilt_ofdft/ml_data.cpp new file mode 100644 index 0000000000..e13c8581db --- /dev/null +++ b/source/module_hamilt_pw/hamilt_ofdft/ml_data.cpp @@ -0,0 +1,507 @@ +#ifdef __MLKEDF + +#include "ml_data.h" + +#include "npy.hpp" +#include "module_elecstate/module_charge/symmetry_rho.h" + +void ML_data::set_para( + const int &nx, + const double &nelec, + const double &tf_weight, + const double &vw_weight, + const double &chi_p, + const double &chi_q, + const std::vector &chi_xi, + const std::vector &chi_pnl, + const std::vector &chi_qnl, + const int &nkernel, + const std::vector &kernel_type, + const std::vector &kernel_scaling, + const std::vector &yukawa_alpha, + const std::vector &kernel_file, + const double &omega, + ModulePW::PW_Basis *pw_rho +) +{ + this->nx = nx; + this->nkernel = nkernel; + this->chi_p = chi_p; + this->chi_q = chi_q; + this->chi_xi = chi_xi; + this->chi_pnl = chi_pnl; + this->chi_qnl = chi_qnl; + + this->kernel_type = kernel_type; + this->kernel_scaling = kernel_scaling; + this->yukawa_alpha = yukawa_alpha; + this->kernel_file = kernel_file; + std::cout << "nkernel = " << nkernel << std::endl; + + if (PARAM.inp.of_wt_rho0 != 0) + { + this->rho0 = PARAM.inp.of_wt_rho0; + } + else + { + this->rho0 = 1./omega * nelec; + } + + this->kF = std::pow(3. * std::pow(ModuleBase::PI, 2) * this->rho0, 1./3.); + this->tkF = 2. * this->kF; + + this->kernel = new double*[this->nkernel]; + for (int ik = 0; ik < this->nkernel; ++ik) + { + // delete[] this->kernel[ik]; + this->kernel[ik] = new double[pw_rho->npw]; + if (this->kernel_type[ik] == 3 || this->kernel_type[ik] == 4) // 3 for TKK Al, and 4 for TKK Si + { + this->read_kernel(this->kernel_file[ik], this->kernel_scaling[ik], pw_rho, this->kernel[ik]); + } + else + { + double eta = 0.; + for (int ip = 0; ip < pw_rho->npw; ++ip) + { + eta = sqrt(pw_rho->gg[ip]) * pw_rho->tpiba / this->tkF * this->kernel_scaling[ik]; + if (this->kernel_type[ik] == 1) + { + this->kernel[ik][ip] = this->MLkernel(eta, tf_weight, vw_weight); + } + else if (this->kernel_type[ik] == 2) + { + this->kernel[ik][ip] = this->MLkernel_yukawa(eta, this->yukawa_alpha[ik]); + } + } + } + } +} + +void ML_data::generateTrainData_WT( + const double * const *prho, + KEDF_WT &wt, + KEDF_TF &tf, + ModulePW::PW_Basis *pw_rho, + const double* veff +) +{ + std::vector> nablaRho(3, std::vector(this->nx, 0.)); + + this->generate_descriptor(prho, pw_rho, nablaRho); + + std::vector container(this->nx); + const long unsigned cshape[] = {(long unsigned) this->nx}; // shape of container and containernl + + // enhancement factor of Pauli potential + if (PARAM.inp.of_kinetic == "wt") + { + this->getF_WT(wt, tf, prho, pw_rho, container); + npy::SaveArrayAsNumpy("enhancement.npy", false, 1, cshape, container); + + // Pauli potential + this->getPauli_WT(wt, tf, prho, pw_rho, container); + npy::SaveArrayAsNumpy("pauli.npy", false, 1, cshape, container); + } + + for (int ir = 0; ir < this->nx; ++ir) + { + container[ir] = veff[ir]; + } + npy::SaveArrayAsNumpy("veff.npy", false, 1, cshape, container); +} + +void ML_data::generateTrainData_KS( + psi::Psi> *psi, + elecstate::ElecState *pelec, + ModulePW::PW_Basis_K *pw_psi, + ModulePW::PW_Basis *pw_rho, + UnitCell& ucell, + const double* veff +) +{ + 
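+    // Outline of this routine: (1) generate and dump all descriptors from the
+    // converged KS density; (2) build the Pauli enhancement factor and Pauli
+    // potential from the KS orbitals (getF_KS1/getF_KS2); (3) symmetrize both
+    // fields through Symmetry_rho using a temporary Charge object; (4) save
+    // them as enhancement*.npy / pauli*.npy together with veff.npy.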
std::vector> nablaRho(3, std::vector(this->nx, 0.)); + + this->generate_descriptor(pelec->charge->rho, pw_rho, nablaRho); + + std::vector container(this->nx); + std::vector containernl(this->nx); + + const long unsigned cshape[] = {(long unsigned) this->nx}; // shape of container and containernl + // enhancement factor of Pauli energy, and Pauli potential + this->getF_KS1(psi, pelec, pw_psi, pw_rho, ucell, nablaRho, container, containernl); + + Symmetry_rho srho; + + Charge* ptempRho = new Charge(); + ptempRho->nspin = PARAM.inp.nspin; + ptempRho->nrxx = this->nx; + ptempRho->rho_core = pelec->charge->rho_core; + ptempRho->rho = new double*[1]; + ptempRho->rho[0] = new double[this->nx]; + ptempRho->rhog = new std::complex*[1]; + ptempRho->rhog[0] = new std::complex[pw_rho->npw]; + + for (int ir = 0; ir < this->nx; ++ir){ + ptempRho->rho[0][ir] = container[ir]; + } + srho.begin(0, *ptempRho, pw_rho, ucell.symm); + for (int ir = 0; ir < this->nx; ++ir){ + container[ir] = ptempRho->rho[0][ir]; + } + + for (int ir = 0; ir < this->nx; ++ir){ + ptempRho->rho[0][ir] = containernl[ir]; + } + srho.begin(0, *ptempRho, pw_rho, ucell.symm); + for (int ir = 0; ir < this->nx; ++ir){ + containernl[ir] = ptempRho->rho[0][ir]; + } + + npy::SaveArrayAsNumpy("enhancement.npy", false, 1, cshape, container); + npy::SaveArrayAsNumpy("pauli.npy", false, 1, cshape, containernl); + + // enhancement factor of Pauli energy, and Pauli potential + this->getF_KS2(psi, pelec, pw_psi, pw_rho, ucell, container, containernl); + + for (int ir = 0; ir < this->nx; ++ir){ + ptempRho->rho[0][ir] = container[ir]; + } + srho.begin(0, *ptempRho, pw_rho, ucell.symm); + for (int ir = 0; ir < this->nx; ++ir){ + container[ir] = ptempRho->rho[0][ir]; + } + + for (int ir = 0; ir < this->nx; ++ir){ + ptempRho->rho[0][ir] = containernl[ir]; + } + srho.begin(0, *ptempRho, pw_rho, ucell.symm); + for (int ir = 0; ir < this->nx; ++ir){ + containernl[ir] = ptempRho->rho[0][ir]; + } + + npy::SaveArrayAsNumpy("enhancement2.npy", false, 1, cshape, container); + npy::SaveArrayAsNumpy("pauli2.npy", false, 1, cshape, containernl); + + for (int ir = 0; ir < this->nx; ++ir) + { + container[ir] = veff[ir]; + } + npy::SaveArrayAsNumpy("veff.npy", false, 1, cshape, container); + + delete ptempRho; +} + +void ML_data::generate_descriptor( + const double * const *prho, + ModulePW::PW_Basis *pw_rho, + std::vector> &nablaRho +) +{ + // container which will contain gamma, p, q in turn + std::vector container(this->nx); + std::vector new_container(this->nx); + // container contains gammanl, pnl, qnl in turn + std::vector containernl(this->nx); + std::vector new_containernl(this->nx); + + const long unsigned cshape[] = {(long unsigned) this->nx}; // shape of container and containernl + + // rho + std::vector rho(this->nx); + for (int ir = 0; ir < this->nx; ++ir){ + rho[ir] = prho[0][ir]; + } + npy::SaveArrayAsNumpy("rho.npy", false, 1, cshape, rho); + + // gamma + this->getGamma(prho, container); + npy::SaveArrayAsNumpy("gamma.npy", false, 1, cshape, container); + + for (int ik = 0; ik < this->nkernel; ++ik) + { + int ktype = this->kernel_type[ik]; + double kscaling = this->kernel_scaling[ik]; + + // gamma_nl + this->getGammanl(ik, container, pw_rho, containernl); + npy::SaveArrayAsNumpy(this->file_name("gammanl", ktype, kscaling), false, 1, cshape, containernl); + + // xi = gamma_nl/gamma + this->getXi(container, containernl, new_container); + npy::SaveArrayAsNumpy(this->file_name("xi", ktype, kscaling), false, 1, cshape, new_container); + + // tanhxi = 
tanh(xi) + this->getTanhXi(ik, container, containernl, new_container); + npy::SaveArrayAsNumpy(this->file_name("tanhxi", ktype, kscaling), false, 1, cshape, new_container); + + // (tanhxi)_nl + this->getTanhXi_nl(ik, new_container, pw_rho, new_containernl); + npy::SaveArrayAsNumpy(this->file_name("tanhxi_nl", ktype, kscaling), false, 1, cshape, new_containernl); + } + + // nabla rho + this->getNablaRho(prho, pw_rho, nablaRho); + npy::SaveArrayAsNumpy("nablaRhox.npy", false, 1, cshape, nablaRho[0]); + npy::SaveArrayAsNumpy("nablaRhoy.npy", false, 1, cshape, nablaRho[1]); + npy::SaveArrayAsNumpy("nablaRhoz.npy", false, 1, cshape, nablaRho[2]); + + // p + this->getP(prho, pw_rho, nablaRho, container); + npy::SaveArrayAsNumpy("p.npy", false, 1, cshape, container); + + for (int ik = 0; ik < this->nkernel; ++ik) + { + int ktype = this->kernel_type[ik]; + double kscaling = this->kernel_scaling[ik]; + + // p_nl + this->getPnl(ik, container, pw_rho, containernl); + npy::SaveArrayAsNumpy(this->file_name("pnl", ktype, kscaling), false, 1, cshape, containernl); + + // tanh(p_nl) + this->getTanh_Pnl(ik, containernl, new_containernl); + npy::SaveArrayAsNumpy(this->file_name("tanh_pnl", ktype, kscaling), false, 1, cshape, new_containernl); + + // tanh(p) + this->getTanhP(container, new_container); + npy::SaveArrayAsNumpy("tanhp.npy", false, 1, cshape, new_container); + + // tanh(p)_nl + this->getTanhP_nl(ik, new_container, pw_rho, new_containernl); + npy::SaveArrayAsNumpy(this->file_name("tanhp_nl", ktype, kscaling), false, 1, cshape, new_containernl); + } + + // q + this->getQ(prho, pw_rho, container); + npy::SaveArrayAsNumpy("q.npy", false, 1, cshape, container); + + for (int ik = 0; ik < this->nkernel; ++ik) + { + int ktype = this->kernel_type[ik]; + double kscaling = this->kernel_scaling[ik]; + + // q_nl + this->getQnl(ik, container, pw_rho, containernl); + npy::SaveArrayAsNumpy(this->file_name("qnl", ktype, kscaling), false, 1, cshape, containernl); + + // tanh(q_nl) + this->getTanh_Qnl(ik, containernl, new_containernl); + npy::SaveArrayAsNumpy(this->file_name("tanh_qnl", ktype, kscaling), false, 1, cshape, new_containernl); + + // tanh(q) + this->getTanhQ(container, new_container); + npy::SaveArrayAsNumpy("tanhq.npy", false, 1, cshape, new_container); + + // tanh(q)_nl + this->getTanhQ_nl(ik, new_container, pw_rho, new_containernl); + npy::SaveArrayAsNumpy(this->file_name("tanhq_nl", ktype, kscaling), false, 1, cshape, new_containernl); + } +} + +double ML_data::MLkernel(double eta, double tf_weight, double vw_weight) +{ + if (eta < 0.) + { + return 0.; + } + // limit for small eta + else if (eta < 1e-10) + { + return 1. - tf_weight + eta * eta * (1./3. - 3. * vw_weight); + } + // around the singularity + else if (std::abs(eta - 1.) < 1e-10) + { + return 2. - tf_weight - 3. * vw_weight + 20. * (eta - 1); + } + // Taylor expansion for high eta + else if (eta > 3.65) + { + double eta2 = eta * eta; + double invEta2 = 1. / eta2; + double LindG = 3. * (1. 
- vw_weight) * eta2 + -tf_weight-0.6 + + invEta2 * (-0.13714285714285712 + + invEta2 * (-6.39999999999999875E-2 + + invEta2 * (-3.77825602968460128E-2 + + invEta2 * (-2.51824061652633074E-2 + + invEta2 * (-1.80879839616166146E-2 + + invEta2 * (-1.36715733124818332E-2 + + invEta2 * (-1.07236045520990083E-2 + + invEta2 * (-8.65192783339199453E-3 + + invEta2 * (-7.1372762502456763E-3 + + invEta2 * (-5.9945117538835746E-3 + + invEta2 * (-5.10997527675418131E-3 + + invEta2 * (-4.41060829979912465E-3 + + invEta2 * (-3.84763737842981233E-3 + + invEta2 * (-3.38745061493813488E-3 + + invEta2 * (-3.00624946457977689E-3))))))))))))))); + return LindG; + } + else + { + return 1. / (0.5 + 0.25 * (1. - eta * eta) / eta * std::log((1 + eta)/std::abs(1 - eta))) + - 3. * vw_weight * eta * eta - tf_weight; + } +} + +double ML_data::MLkernel_yukawa(double eta, double alpha) +{ + return (eta == 0 && alpha == 0) ? 0. : M_PI / (eta * eta + alpha * alpha / 4.); +} + +// Read kernel from file +void ML_data::read_kernel(const std::string &fileName, const double& scaling, ModulePW::PW_Basis *pw_rho, double* kernel_) +{ + std::ifstream ifs(fileName.c_str(), std::ios::in); + + GlobalV::ofs_running << "Read WT kernel from " << fileName << std::endl; + if (!ifs) ModuleBase::WARNING_QUIT("ml_data.cpp", "The kernel file not found"); + + int kineType = 0; + double kF_in = 0.; + double rho0_in = 0.; + int nq_in = 0; + double maxEta_in = 0.; + + ifs >> kineType; + ifs >> kF_in; + ifs >> rho0_in; + ifs >> nq_in; + + double *eta_in = new double[nq_in]; + double *w0_in = new double[nq_in]; + + for (int iq = 0; iq < nq_in; ++iq) + { + ifs >> eta_in[iq] >> w0_in[iq]; + } + + maxEta_in = eta_in[nq_in-1]; + + double eta = 0.; + double maxEta = 0.; + int ind1 = 0; + int ind2 = 0; + int ind_mid = 0; + double fac1 = 0.; + double fac2 = 0.; + for (int ig = 0; ig < pw_rho->npw; ++ig) + { + eta = sqrt(pw_rho->gg[ig]) * pw_rho->tpiba / this->tkF; + eta = eta * scaling; + maxEta = std::max(eta, maxEta); + + if (eta <= eta_in[0]) + kernel_[ig] = w0_in[0]; + else if (eta > maxEta_in) + kernel_[ig] = w0_in[nq_in-1]; + else + { + ind1 = 1; + ind2 = nq_in; + while (ind1 < ind2 - 1) + { + ind_mid = (ind1 + ind2)/2; + if (eta > eta_in[ind_mid]) + { + ind1 = ind_mid; + } + else + { + ind2 = ind_mid; + } + } + fac1 = (eta_in[ind2] - eta)/(eta_in[ind2] - eta_in[ind1]); + fac2 = (eta - eta_in[ind1])/(eta_in[ind2] - eta_in[ind1]); + kernel_[ig] = fac1 * w0_in[ind1] + fac2 * w0_in[ind2]; + } + } + + if (maxEta > maxEta_in) ModuleBase::WARNING("kedf_wt.cpp", "Please increase the maximal eta value in KEDF kernel file"); + + delete[] eta_in; + delete[] w0_in; + ModuleBase::GlobalFunc::DONE(GlobalV::ofs_running, "FILL WT KERNEL"); +} + +void ML_data::multiKernel(const int ikernel, double *pinput, ModulePW::PW_Basis *pw_rho, double *routput) +{ + std::complex *recipOutput = new std::complex[pw_rho->npw]; + + pw_rho->real2recip(pinput, recipOutput); + for (int ip = 0; ip < pw_rho->npw; ++ip) + { + recipOutput[ip] *= this->kernel[ikernel][ip]; + } + pw_rho->recip2real(recipOutput, routput); + + delete[] recipOutput; +} + +void ML_data::Laplacian(double * pinput, ModulePW::PW_Basis *pw_rho, double * routput) +{ + std::complex *recipContainer = new std::complex[pw_rho->npw]; + + pw_rho->real2recip(pinput, recipContainer); + for (int ip = 0; ip < pw_rho->npw; ++ip) + { + recipContainer[ip] *= - pw_rho->gg[ip] * pw_rho->tpiba2; + } + pw_rho->recip2real(recipContainer, routput); + + delete[] recipContainer; +} + +void ML_data::divergence(double ** pinput, 
ModulePW::PW_Basis *pw_rho, double * routput)
+{
+    std::complex<double> *recipContainer = new std::complex<double>[pw_rho->npw];
+    std::complex<double> img(0.0, 1.0);
+    ModuleBase::GlobalFunc::ZEROS(routput, this->nx);
+    for (int i = 0; i < 3; ++i)
+    {
+        pw_rho->real2recip(pinput[i], recipContainer);
+        for (int ip = 0; ip < pw_rho->npw; ++ip)
+        {
+            recipContainer[ip] = img * pw_rho->gcar[ip][i] * pw_rho->tpiba * recipContainer[ip];
+        }
+        pw_rho->recip2real(recipContainer, routput, true);
+    }
+
+    delete[] recipContainer;
+}
+
+void ML_data::loadVector(std::string filename, std::vector<double> &data)
+{
+    std::vector<long unsigned> cshape = {(long unsigned) this->nx};
+    bool fortran_order = false;
+    npy::LoadArrayFromNumpy(filename, cshape, fortran_order, data);
+}
+
+void ML_data::dumpVector(std::string filename, const std::vector<double> &data)
+{
+    const long unsigned cshape[] = {(long unsigned) this->nx}; // shape
+    npy::SaveArrayAsNumpy(filename, false, 1, cshape, data);
+}
+
+void ML_data::tanh(std::vector<double> &pinput, std::vector<double> &routput, double chi)
+{
+    for (int i = 0; i < this->nx; ++i)
+    {
+        routput[i] = std::tanh(pinput[i] * chi);
+    }
+}
+
+double ML_data::dtanh(double tanhx, double chi)
+{
+    return (1. - tanhx * tanhx) * chi;
+}
+
+std::string ML_data::file_name(std::string parameter, const int kernel_type, const double kernel_scaling)
+{
+    std::stringstream ss;
+    ss << "./" << parameter << "_" << kernel_type << "_" << kernel_scaling << ".npy";
+    return ss.str();
+}
+#endif
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_ofdft/ml_data.h b/source/module_hamilt_pw/hamilt_ofdft/ml_data.h
new file mode 100644
index 0000000000..8f71f98db9
--- /dev/null
+++ b/source/module_hamilt_pw/hamilt_ofdft/ml_data.h
@@ -0,0 +1,155 @@
+#ifndef ML_DATA_H
+#define ML_DATA_H
+
+#ifdef __MLKEDF
+
+#include <vector>
+#include "kedf_wt.h"
+#include "kedf_tf.h"
+#include "module_elecstate/elecstate_pw.h"
+#include "module_base/global_function.h"
+#include "module_hamilt_pw/hamilt_pwdft/global.h"
+#include "module_parameter/parameter.h"
+
+class ML_data{
+public:
+    ~ML_data()
+    {
+        if (this->kernel != nullptr)
+        {
+            for (int ik = 0; ik < this->nkernel; ++ik)
+            {
+                delete[] this->kernel[ik];
+            }
+            delete[] this->kernel;
+        }
+    }
+
+    void set_para(
+        const int &nx,
+        const double &nelec,
+        const double &tf_weight,
+        const double &vw_weight,
+        const double &chi_p,
+        const double &chi_q,
+        const std::vector<double> &chi_xi,
+        const std::vector<double> &chi_pnl,
+        const std::vector<double> &chi_qnl,
+        const int &nkernel,
+        const std::vector<int> &kernel_type,
+        const std::vector<double> &kernel_scaling,
+        const std::vector<double> &yukawa_alpha,
+        const std::vector<std::string> &kernel_file,
+        const double &omega,
+        ModulePW::PW_Basis *pw_rho);
+    // output all parameters
+    void generateTrainData_WT(
+        const double * const *prho,
+        KEDF_WT &wt,
+        KEDF_TF &tf,
+        ModulePW::PW_Basis *pw_rho,
+        const double *veff
+    );
+    void generateTrainData_KS(
+        psi::Psi<std::complex<double>> *psi,
+        elecstate::ElecState *pelec,
+        ModulePW::PW_Basis_K *pw_psi,
+        ModulePW::PW_Basis *pw_rho,
+        UnitCell& ucell,
+        const double *veff
+    );
+    void generateTrainData_KS(
+        psi::Psi<std::complex<float>> *psi,
+        elecstate::ElecState *pelec,
+        ModulePW::PW_Basis_K *pw_psi,
+        ModulePW::PW_Basis *pw_rho,
+        UnitCell& ucell,
+        const double *veff
+    ){} // a mock function for the single-precision build
+    void generate_descriptor(
+        const double * const *prho,
+        ModulePW::PW_Basis *pw_rho,
+        std::vector<std::vector<double>> &nablaRho
+    );
+    // get input parameters (descriptors)
+    void getGamma(const double * const *prho, std::vector<double> &rgamma);
+    void getP(const double * const *prho, ModulePW::PW_Basis *pw_rho, std::vector<std::vector<double>> &pnablaRho, std::vector<double> &rp);
+    void getQ(const double * const *prho, ModulePW::PW_Basis *pw_rho, std::vector<double> &rq);
+    void getGammanl(const int ikernel, std::vector<double> &pgamma, ModulePW::PW_Basis *pw_rho, std::vector<double> &rgammanl);
+    void getPnl(const int ikernel, std::vector<double> &pp, ModulePW::PW_Basis *pw_rho, std::vector<double> &rpnl);
+    void getQnl(const int ikernel, std::vector<double> &pq, ModulePW::PW_Basis *pw_rho, std::vector<double> &rqnl);
+    // new parameters 2023-02-03
+    void getXi(std::vector<double> &pgamma, std::vector<double> &pgammanl, std::vector<double> &rxi);
+    void getTanhXi(const int ikernel, std::vector<double> &pgamma, std::vector<double> &pgammanl, std::vector<double> &rtanhxi);
+    void getTanhP(std::vector<double> &pp, std::vector<double> &rtanhp);
+    void getTanhQ(std::vector<double> &pq, std::vector<double> &rtanhq);
+    void getTanh_Pnl(const int ikernel, std::vector<double> &ppnl, std::vector<double> &rtanh_pnl);
+    void getTanh_Qnl(const int ikernel, std::vector<double> &pqnl, std::vector<double> &rtanh_qnl);
+    void getTanhP_nl(const int ikernel, std::vector<double> &ptanhp, ModulePW::PW_Basis *pw_rho, std::vector<double> &rtanhp_nl);
+    void getTanhQ_nl(const int ikernel, std::vector<double> &ptanhq, ModulePW::PW_Basis *pw_rho, std::vector<double> &rtanhq_nl);
+    // 2023-03-20
+    void getTanhXi_nl(const int ikernel, std::vector<double> &ptanhxi, ModulePW::PW_Basis *pw_rho, std::vector<double> &rtanhxi_nl);
+    // get target
+    void getF_WT(KEDF_WT &wt, KEDF_TF &tf, const double * const *prho, ModulePW::PW_Basis *pw_rho, std::vector<double> &rF);
+    void getPauli_WT(KEDF_WT &wt, KEDF_TF &tf, const double * const *prho, ModulePW::PW_Basis *pw_rho, std::vector<double> &rpauli);
+
+    void getF_KS1(
+        psi::Psi<std::complex<double>> *psi,
+        elecstate::ElecState *pelec,
+        ModulePW::PW_Basis_K *pw_psi,
+        ModulePW::PW_Basis *pw_rho,
+        UnitCell& ucell,
+        const std::vector<std::vector<double>> &nablaRho,
+        std::vector<double> &rF,
+        std::vector<double> &rpauli
+    );
+    void getF_KS2(
+        psi::Psi<std::complex<double>> *psi,
+        elecstate::ElecState *pelec,
+        ModulePW::PW_Basis_K *pw_psi,
+        ModulePW::PW_Basis *pw_rho,
+        UnitCell& ucell,
+        std::vector<double> &rF,
+        std::vector<double> &rpauli
+    );
+    // get intermediate variables of V_Pauli
+    void getNablaRho(const double * const *prho, ModulePW::PW_Basis *pw_rho, std::vector<std::vector<double>> &rnablaRho);
+
+    // tools
+    double MLkernel(double eta, double tf_weight, double vw_weight);
+    double MLkernel_yukawa(double eta, double alpha);
+    void read_kernel(const std::string &fileName, const double& scaling, ModulePW::PW_Basis *pw_rho, double* kernel_);
+    void multiKernel(const int ikernel, double *pinput, ModulePW::PW_Basis *pw_rho, double *routput);
+    void Laplacian(double * pinput, ModulePW::PW_Basis *pw_rho, double * routput);
+    void divergence(double ** pinput, ModulePW::PW_Basis *pw_rho, double * routput);
+    // void dumpTensor(const torch::Tensor &data, std::string filename);
+    void loadVector(std::string filename, std::vector<double> &data);
+    void dumpVector(std::string filename, const std::vector<double> &data);
+
+    void tanh(std::vector<double> &pinput, std::vector<double> &routput, double chi=1.);
+    double dtanh(double tanhx, double chi=1.);
+
+    // new parameters 2023-02-13
+    std::vector<double> chi_xi = {1.0};
+    double chi_p = 1.;
+    double chi_q = 1.;
+    std::vector<double> chi_pnl = {1.0};
+    std::vector<double> chi_qnl = {1.0};
+
+    int nx = 0;
+    double dV = 0.;
+    double rho0 = 0.; // average rho
+    double kF = 0.;   // Fermi wave vector, kF = (3 pi^2 rho0)^(1/3)
+    double tkF = 0.;  // 2 * kF
+    // double weightml = 1.;
+    const double cTF = 3.0/10.0 * std::pow(3*std::pow(M_PI, 2.0), 2.0/3.0) * 2; // 3/10*(3*pi^2)^{2/3}, multiplied by 2 to convert from Hartree to Ry, so cTF is in Ry*Bohr^(-2)
+    const double pqcoef = 1.0 / (4.0 * std::pow(3*std::pow(M_PI, 2.0), 2.0/3.0)); // coefficient of p and q
+
+    int nkernel = 1;
+    std::vector<int> kernel_type = {1};
+    std::vector<double> kernel_scaling = {1.0};
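+    // kernel_type convention (see ML_data::set_para): 1 = WT-style Lindhard
+    // response kernel, 2 = Yukawa kernel, 3/4 = tabulated TKK kernels (Al/Si)
+    // read from kernel_file; kernel_scaling rescales eta = |G| / (2 kF).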
std::vector yukawa_alpha = {1.0}; + std::vector kernel_file = {"none"}; + double **kernel = nullptr; + + std::string file_name(std::string parameter, const int kernel_type, const double kernel_scaling); +}; + +#endif +#endif \ No newline at end of file diff --git a/source/module_hamilt_pw/hamilt_ofdft/ml_data_descriptor.cpp b/source/module_hamilt_pw/hamilt_ofdft/ml_data_descriptor.cpp new file mode 100644 index 0000000000..85135d789f --- /dev/null +++ b/source/module_hamilt_pw/hamilt_ofdft/ml_data_descriptor.cpp @@ -0,0 +1,380 @@ +#ifdef __MLKEDF + +#include "ml_data.h" + +void ML_data::getGamma(const double * const *prho, std::vector &rgamma) +{ + for(int ir = 0; ir < this->nx; ++ir) + { + rgamma[ir] = std::pow(prho[0][ir]/this->rho0, 1./3.); + } +} + +void ML_data::getP(const double * const *prho, ModulePW::PW_Basis *pw_rho, std::vector> &pnablaRho, std::vector &rp) +{ + for(int ir = 0; ir < this->nx; ++ir) + { + rp[ir] = 0.; + for (int j = 0; j < 3; ++j) + { + rp[ir] += std::pow(pnablaRho[j][ir], 2); + } + rp[ir] *= this->pqcoef / std::pow(prho[0][ir], 8.0/3.0); + } +} + +void ML_data::getQ(const double * const *prho, ModulePW::PW_Basis *pw_rho, std::vector &rq) +{ + // get Laplacian rho + std::complex *recipRho = new std::complex[pw_rho->npw]; + pw_rho->real2recip(prho[0], recipRho); + for (int ip = 0; ip < pw_rho->npw; ++ip) + { + recipRho[ip] *= - pw_rho->gg[ip] * pw_rho->tpiba2; + } + pw_rho->recip2real(recipRho, rq.data()); + + for (int ir = 0; ir < this->nx; ++ir) + { + rq[ir] *= this->pqcoef / std::pow(prho[0][ir], 5.0/3.0); + } + + delete[] recipRho; +} + +void ML_data::getGammanl(const int ikernel, std::vector &pgamma, ModulePW::PW_Basis *pw_rho, std::vector &rgammanl) +{ + this->multiKernel(ikernel, pgamma.data(), pw_rho, rgammanl.data()); +} + +void ML_data::getPnl(const int ikernel, std::vector &pp, ModulePW::PW_Basis *pw_rho, std::vector &rpnl) +{ + this->multiKernel(ikernel, pp.data(), pw_rho, rpnl.data()); +} + +void ML_data::getQnl(const int ikernel, std::vector &pq, ModulePW::PW_Basis *pw_rho, std::vector &rqnl) +{ + this->multiKernel(ikernel, pq.data(), pw_rho, rqnl.data()); +} + +// xi = gammanl/gamma +void ML_data::getXi(std::vector &pgamma, std::vector &pgammanl, std::vector &rxi) +{ + for (int ir = 0; ir < this->nx; ++ir) + { + if (pgamma[ir] == 0) + { + std::cout << "WARNING: gamma=0" << std::endl; + rxi[ir] = 0.; + } + else + { + rxi[ir] = pgammanl[ir]/pgamma[ir]; + } + } +} + +// tanhxi = tanh(gammanl/gamma) +void ML_data::getTanhXi(const int ikernel, std::vector &pgamma, std::vector &pgammanl, std::vector &rtanhxi) +{ + for (int ir = 0; ir < this->nx; ++ir) + { + if (pgamma[ir] == 0) + { + std::cout << "WARNING: gamma=0" << std::endl; + rtanhxi[ir] = 0.; + } + else + { + rtanhxi[ir] = std::tanh(pgammanl[ir]/pgamma[ir] * this->chi_xi[ikernel]); + } + } +} + +// tanh(p) +void ML_data::getTanhP(std::vector &pp, std::vector &rtanhp) +{ + this->tanh(pp, rtanhp, this->chi_p); +} + +// tanh(q) +void ML_data::getTanhQ(std::vector &pq, std::vector &rtanhq) +{ + this->tanh(pq, rtanhq, this->chi_q); +} + +// tanh(pnl) +void ML_data::getTanh_Pnl(const int ikernel, std::vector &ppnl, std::vector &rtanh_pnl) +{ + this->tanh(ppnl, rtanh_pnl, this->chi_pnl[ikernel]); +} + +// tanh(qnl) +void ML_data::getTanh_Qnl(const int ikernel, std::vector &pqnl, std::vector &rtanh_qnl) +{ + this->tanh(pqnl, rtanh_qnl, this->chi_qnl[ikernel]); +} + +// tanh(p)_nl +void ML_data::getTanhP_nl(const int ikernel, std::vector &ptanhp, ModulePW::PW_Basis *pw_rho, std::vector &rtanhp_nl) +{ + 
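+    // convolve tanh(p) with the ikernel-th kernel in reciprocal space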
+// tanh(p)_nl
+void ML_data::getTanhP_nl(const int ikernel, std::vector<double> &ptanhp, ModulePW::PW_Basis *pw_rho, std::vector<double> &rtanhp_nl)
+{
+    this->multiKernel(ikernel, ptanhp.data(), pw_rho, rtanhp_nl.data());
+}
+
+// tanh(q)_nl
+void ML_data::getTanhQ_nl(const int ikernel, std::vector<double> &ptanhq, ModulePW::PW_Basis *pw_rho, std::vector<double> &rtanhq_nl)
+{
+    this->multiKernel(ikernel, ptanhq.data(), pw_rho, rtanhq_nl.data());
+}
+
+// (tanhxi)_nl
+void ML_data::getTanhXi_nl(const int ikernel, std::vector<double> &ptanhxi, ModulePW::PW_Basis *pw_rho, std::vector<double> &rtanhxi_nl)
+{
+    this->multiKernel(ikernel, ptanhxi.data(), pw_rho, rtanhxi_nl.data());
+}
+
+void ML_data::getPauli_WT(KEDF_WT &wt, KEDF_TF &tf, const double * const *prho, ModulePW::PW_Basis *pw_rho, std::vector<double> &rpauli)
+{
+    ModuleBase::matrix potential(1, this->nx, true);
+
+    tf.tf_potential(prho, potential);
+    wt.wt_potential(prho, pw_rho, potential);
+
+    for (int ir = 0; ir < this->nx; ++ir){
+        rpauli[ir] = potential(0, ir);
+    }
+}
+
+void ML_data::getF_WT(KEDF_WT &wt, KEDF_TF &tf, const double * const *prho, ModulePW::PW_Basis *pw_rho, std::vector<double> &rF)
+{
+    double wtden = 0.;
+    double tfden = 0.;
+    for (int ir = 0; ir < this->nx; ++ir)
+    {
+        wtden = wt.get_energy_density(prho, 0, ir, pw_rho);
+        tfden = tf.get_energy_density(prho, 0, ir);
+        rF[ir] = 1. + wtden/tfden;
+        // if (wtden < 0) std::cout << wtden/tfden << std::endl;
+    }
+}
+
+void ML_data::getF_KS1(
+    psi::Psi<std::complex<double>> *psi,
+    elecstate::ElecState *pelec,
+    ModulePW::PW_Basis_K *pw_psi,
+    ModulePW::PW_Basis *pw_rho,
+    UnitCell& ucell,
+    const std::vector<std::vector<double>> &nablaRho,
+    std::vector<double> &rF,
+    std::vector<double> &rpauli
+)
+{
+    double *pauliED = new double[this->nx]; // Pauli Energy Density
+    ModuleBase::GlobalFunc::ZEROS(pauliED, this->nx);
+
+    double *pauliPot = new double[this->nx];
+    ModuleBase::GlobalFunc::ZEROS(pauliPot, this->nx);
+
+    std::complex<double> *wfcr = new std::complex<double>[this->nx];
+    ModuleBase::GlobalFunc::ZEROS(wfcr, this->nx);
+
+    double epsilonM = pelec->ekb(0,0);
+    assert(PARAM.inp.nspin == 1);
+
+    base_device::DEVICE_CPU* ctx;
+
+    // calculate positive definite kinetic energy density
+    for (int ik = 0; ik < psi->get_nk(); ++ik)
+    {
+        psi->fix_k(ik);
+        int ikk = psi->get_current_k();
+        assert(ikk == ik);
+        int npw = psi->get_current_nbas();
+        int nbands = psi->get_nbands();
+        for (int ibnd = 0; ibnd < nbands; ++ibnd)
+        {
+            if (pelec->wg(ik, ibnd) < ModuleBase::threshold_wg) {
+                continue;
+            }
+
+            pw_psi->recip_to_real(ctx, &psi->operator()(ibnd,0), wfcr, ik);
+            const double w1 = pelec->wg(ik, ibnd) / ucell.omega;
+
+            // output one wf, to check KS equation
+            if (ik == 0 && ibnd == 0)
+            {
+                std::vector<double> wf_real = std::vector<double>(this->nx);
+                std::vector<double> wf_imag = std::vector<double>(this->nx);
+                for (int ir = 0; ir < this->nx; ++ir)
+                {
+                    wf_real[ir] = wfcr[ir].real();
+                    wf_imag[ir] = wfcr[ir].imag();
+                }
+                const long unsigned cshape[] = {(long unsigned) this->nx}; // shape of container and containernl
+            }
+
+            if (w1 != 0.0)
+            {
+                // Find the energy of HOMO
+                if (pelec->ekb(ik,ibnd) > epsilonM)
+                {
+                    epsilonM = pelec->ekb(ik,ibnd);
+                }
+                // The last term of Pauli potential
+                for (int ir = 0; ir < pelec->charge->nrxx; ir++)
+                {
+                    pauliPot[ir] -= w1 * pelec->ekb(ik,ibnd) * norm(wfcr[ir]);
+                }
+            }
+
+            for (int j = 0; j < 3; ++j)
+            {
+                ModuleBase::GlobalFunc::ZEROS(wfcr, pelec->charge->nrxx);
+                for (int ig = 0; ig < npw; ig++)
+                {
+                    double fact = pw_psi->getgpluskcar(ik, ig)[j] * ucell.tpiba;
+                    wfcr[ig] = psi->operator()(ibnd, ig) * std::complex<double>(0.0, fact);
+                }
+
+                pw_psi->recip2real(wfcr, wfcr, ik);
+
+                for (int ir = 0; ir < this->nx; ++ir)
+                {
+                    pauliED[ir] += w1 * norm(wfcr[ir]); // the Hartree expression is w1/2 * norm(wfcr[ir]); the 1/2 is dropped to convert Ha to Ry
+                }
+            }
+        }
+    }
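+    // pauliED now holds the total positive definite kinetic energy density
+    // sum_nk w_nk |grad psi_nk|^2 (in Ry); the von Weizsaecker part is
+    // subtracted below to obtain the Pauli energy density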
+
+    std::cout << "(1) epsilon max = " << epsilonM << std::endl;
+    // calculate the positive definite vW energy density
+    for (int j = 0; j < 3; ++j)
+    {
+        for (int ir = 0; ir < this->nx; ++ir)
+        {
+            pauliED[ir] -= nablaRho[j][ir] * nablaRho[j][ir] / (8. * pelec->charge->rho[0][ir]) * 2.; // convert Ha to Ry.
+        }
+    }
+
+    for (int ir = 0; ir < this->nx; ++ir)
+    {
+        rF[ir] = pauliED[ir] / (this->cTF * std::pow(pelec->charge->rho[0][ir], 5./3.));
+        rpauli[ir] = (pauliED[ir] + pauliPot[ir])/pelec->charge->rho[0][ir] + epsilonM;
+    }
+
+    // release work arrays
+    delete[] pauliED;
+    delete[] pauliPot;
+    delete[] wfcr;
+}
+
+void ML_data::getF_KS2(
+    psi::Psi<std::complex<double>> *psi,
+    elecstate::ElecState *pelec,
+    ModulePW::PW_Basis_K *pw_psi,
+    ModulePW::PW_Basis *pw_rho,
+    UnitCell& ucell,
+    std::vector<double> &rF,
+    std::vector<double> &rpauli
+)
+{
+    double *pauliED = new double[this->nx]; // Pauli Energy Density
+    ModuleBase::GlobalFunc::ZEROS(pauliED, this->nx);
+
+    double *pauliPot = new double[this->nx];
+    ModuleBase::GlobalFunc::ZEROS(pauliPot, this->nx);
+
+    std::complex<double> *wfcr = new std::complex<double>[this->nx];
+    ModuleBase::GlobalFunc::ZEROS(wfcr, this->nx);
+
+    std::complex<double> *wfcg = nullptr;
+    std::complex<double> *LapWfcr = new std::complex<double>[this->nx];
+    ModuleBase::GlobalFunc::ZEROS(LapWfcr, this->nx);
+
+    double epsilonM = pelec->ekb(0,0);
+    assert(PARAM.inp.nspin == 1);
+
+    base_device::DEVICE_CPU* ctx;
+
+    // calculate kinetic energy density
+    for (int ik = 0; ik < psi->get_nk(); ++ik)
+    {
+        psi->fix_k(ik);
+        int ikk = psi->get_current_k();
+        assert(ikk == ik);
+        int npw = psi->get_current_nbas();
+        int nbands = psi->get_nbands();
+        delete[] wfcg;
+        wfcg = new std::complex<double>[npw];
+        for (int ibnd = 0; ibnd < nbands; ++ibnd)
+        {
+            if (pelec->wg(ik, ibnd) < ModuleBase::threshold_wg) {
+                continue;
+            }
+
+            pw_psi->recip_to_real(ctx, &psi->operator()(ibnd,0), wfcr, ik);
+            const double w1 = pelec->wg(ik, ibnd) / ucell.omega;
+
+            if (pelec->ekb(ik,ibnd) > epsilonM)
+            {
+                epsilonM = pelec->ekb(ik,ibnd);
+            }
+            for (int ir = 0; ir < pelec->charge->nrxx; ir++)
+            {
+                pauliPot[ir] -= w1 * pelec->ekb(ik,ibnd) * norm(wfcr[ir]);
+            }
+
+            ModuleBase::GlobalFunc::ZEROS(wfcg, npw);
+            for (int ig = 0; ig < npw; ig++)
+            {
+                double fact = pw_psi->getgk2(ik, ig) * ucell.tpiba2;
+                wfcg[ig] = - psi->operator()(ibnd, ig) * fact;
+            }
+
+            pw_psi->recip2real(wfcg, LapWfcr, ik);
+
+            for (int ir = 0; ir < this->nx; ++ir)
+            {
+                pauliED[ir] += - w1 * (conj(wfcr[ir]) * LapWfcr[ir]).real(); // the Hartree expression is -w1/2 * Re[psi* Lap(psi)]; the 1/2 is dropped to convert Ha to Ry
+            }
+        }
+    }
+
+    std::cout << "(2) epsilon max = " << epsilonM << std::endl;
+    // subtract the vW energy density (Laplacian form)
+    double *phi = new double[this->nx];
+    double *LapPhi = new double[this->nx];
+    for (int ir = 0; ir < this->nx; ++ir)
+    {
+        phi[ir] = sqrt(pelec->charge->rho[0][ir]);
+    }
+    this->Laplacian(phi, pw_rho, LapPhi);
+
+    for (int ir = 0; ir < this->nx; ++ir)
+    {
+        pauliED[ir] -= - phi[ir] * LapPhi[ir]; // tau_vW = -phi * Lap(phi) in Ry; the 1/2 of the Hartree form is absorbed by the Ha -> Ry factor of 2
+    }
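+    // pauliED = tau - tau_vW is now the Pauli kinetic energy density; the
+    // enhancement factor F = tau_Pauli / tau_TF (with tau_TF = cTF * rho^{5/3})
+    // and the Pauli potential are assembled below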
+
+    for (int ir = 0; ir < this->nx; ++ir)
+    {
+        rF[ir] = pauliED[ir] / (this->cTF * std::pow(pelec->charge->rho[0][ir], 5./3.));
+        rpauli[ir] = (pauliED[ir] + pauliPot[ir])/pelec->charge->rho[0][ir] + epsilonM;
+    }
+
+    // release work arrays
+    delete[] pauliED;
+    delete[] pauliPot;
+    delete[] wfcr;
+    delete[] wfcg;
+    delete[] LapWfcr;
+    delete[] phi;
+    delete[] LapPhi;
+}
+
+void ML_data::getNablaRho(const double * const *prho, ModulePW::PW_Basis *pw_rho, std::vector<std::vector<double>> &rnablaRho)
+{
+    std::complex<double> *recipRho = new std::complex<double>[pw_rho->npw];
+    std::complex<double> *recipNablaRho = new std::complex<double>[pw_rho->npw];
+    pw_rho->real2recip(prho[0], recipRho);
+
+    std::complex<double> img(0.0, 1.0);
+    for (int j = 0; j < 3; ++j)
+    {
+        for (int ip = 0; ip < pw_rho->npw; ++ip)
+        {
+            recipNablaRho[ip] = img * pw_rho->gcar[ip][j] * recipRho[ip] * pw_rho->tpiba;
+        }
+        pw_rho->recip2real(recipNablaRho, rnablaRho[j].data());
+    }
+
+    delete[] recipRho;
+    delete[] recipNablaRho;
+}
+#endif
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_ofdft/ml_tools/CMakeLists.txt b/source/module_hamilt_pw/hamilt_ofdft/ml_tools/CMakeLists.txt
new file mode 100644
index 0000000000..96f0617216
--- /dev/null
+++ b/source/module_hamilt_pw/hamilt_ofdft/ml_tools/CMakeLists.txt
@@ -0,0 +1,28 @@
+cmake_minimum_required(VERSION 3.0 FATAL_ERROR)
+project(nnof)
+
+#list(APPEND CMAKE_PREFIX_PATH /path/to/Torch)
+#list(APPEND libnpy_INCLUDE_DIR /path/to/libnpy)
+
+find_package(Torch REQUIRED)
+include_directories(${TORCH_INCLUDE_DIRS})
+
+find_path(libnpy_SOURCE_DIR npy.hpp HINTS ${libnpy_INCLUDE_DIR})
+if(NOT libnpy_SOURCE_DIR)
+  include(FetchContent)
+  FetchContent_Declare(
+    libnpy
+    GIT_REPOSITORY https://github.com/llohse/libnpy.git
+    GIT_SHALLOW TRUE
+    GIT_PROGRESS TRUE)
+  FetchContent_MakeAvailable(libnpy)
+else()
+  include_directories(${libnpy_INCLUDE_DIR})
+endif()
+include_directories(${libnpy_SOURCE_DIR}/include)
+
+add_executable(nnof main.cpp data.cpp nn_of.cpp grid.cpp input.cpp kernel.cpp pauli_potential.cpp train_kedf.cpp)
+target_link_libraries(nnof "${TORCH_LIBRARIES}")
+set_property(TARGET nnof PROPERTY CXX_STANDARD 14)
+
+#cmake -B build -Dlibnpy_INCLUDE_DIR=/path/to/libnpy/ -DTorch_DIR=/path/to/Torch/
diff --git a/source/module_hamilt_pw/hamilt_ofdft/ml_tools/data.cpp b/source/module_hamilt_pw/hamilt_ofdft/ml_tools/data.cpp
new file mode 100644
index 0000000000..cd6585bf25
--- /dev/null
+++ b/source/module_hamilt_pw/hamilt_ofdft/ml_tools/data.cpp
@@ -0,0 +1,357 @@
+#include "./data.h"
+#include "npy.hpp"
+
+Data::~Data()
+{
+    delete[] load_gammanl;
+    delete[] load_pnl;
+    delete[] load_qnl;
+    delete[] load_xi;
+    delete[] load_tanhxi;
+    delete[] load_tanhxi_nl;
+    delete[] load_tanh_pnl;
+    delete[] load_tanh_qnl;
+    delete[] load_tanhp_nl;
+    delete[] load_tanhq_nl;
+}
+
+void Data::load_data(Input &input, const int ndata, std::string *dir, const torch::Device device)
+{
+    if (ndata <= 0) {
+        return;
+    }
+    this->init_label(input);
+    this->init_data(input.nkernel, ndata, input.fftdim, device);
+    this->load_data_(input, ndata, input.fftdim, dir);
+    std::cout << "enhancement mean: " << this->enhancement_mean << std::endl;
+    std::cout << "exponent: " << input.exponent << std::endl;
+    std::cout << "tau mean: " << this->tau_mean << std::endl;
+    std::cout << "pauli potential mean: " << this->pauli_mean << std::endl;
+    std::cout << "Load data done" << std::endl;
+}
+
+torch::Tensor Data::get_data(std::string parameter, const int ikernel){
+    if (parameter == "gamma"){
+        return this->gamma.reshape({this->nx_tot});
+    }
+    if (parameter == "p"){
+        return this->p.reshape({this->nx_tot});
+    }
+    if (parameter == "q"){
+        return this->q.reshape({this->nx_tot});
+    }
"tanhp"){ + return this->tanhp.reshape({this->nx_tot}); + } + if (parameter == "tanhq"){ + return this->tanhq.reshape({this->nx_tot}); + } + if (parameter == "gammanl"){ + return this->gammanl[ikernel].reshape({this->nx_tot}); + } + if (parameter == "pnl"){ + return this->pnl[ikernel].reshape({this->nx_tot}); + } + if (parameter == "qnl"){ + return this->qnl[ikernel].reshape({this->nx_tot}); + } + if (parameter == "xi"){ + return this->xi[ikernel].reshape({this->nx_tot}); + } + if (parameter == "tanhxi"){ + return this->tanhxi[ikernel].reshape({this->nx_tot}); + } + if (parameter == "tanhxi_nl"){ + return this->tanhxi_nl[ikernel].reshape({this->nx_tot}); + } + if (parameter == "tanh_pnl"){ + return this->tanh_pnl[ikernel].reshape({this->nx_tot}); + } + if (parameter == "tanh_qnl"){ + return this->tanh_qnl[ikernel].reshape({this->nx_tot}); + } + if (parameter == "tanhp_nl"){ + return this->tanhp_nl[ikernel].reshape({this->nx_tot}); + } + if (parameter == "tanhq_nl"){ + return this->tanhq_nl[ikernel].reshape({this->nx_tot}); + } + return torch::zeros({}); +} + +void Data::init_label(Input &input) +{ + // Input::print("init_label"); + this->load_gammanl = new bool[input.nkernel]; + this->load_pnl = new bool[input.nkernel]; + this->load_qnl = new bool[input.nkernel]; + this->load_xi = new bool[input.nkernel]; + this->load_tanhxi = new bool[input.nkernel]; + this->load_tanhxi_nl = new bool[input.nkernel]; + this->load_tanh_pnl = new bool[input.nkernel]; + this->load_tanh_qnl = new bool[input.nkernel]; + this->load_tanhp_nl = new bool[input.nkernel]; + this->load_tanhq_nl = new bool[input.nkernel]; + + bool load_gammanl_tot = false; + bool load_pnl_tot = false; + bool load_qnl_tot = false; + // bool load_xi_tot = false; + // bool load_tanhxi_tot = false; + // bool load_tanhxi_nl_tot = false; + bool load_tanh_pnl_tot = false; + bool load_tanh_qnl_tot = false; + bool load_tanhp_nl_tot = false; + bool load_tanhq_nl_tot = false; + + for (int ik = 0; ik < input.nkernel; ++ik) + { + this->load_gammanl[ik] = input.ml_gammanl[ik]; + this->load_pnl[ik] = input.ml_pnl[ik]; + this->load_qnl[ik] = input.ml_qnl[ik]; + this->load_tanhxi_nl[ik] = input.ml_tanhxi_nl[ik]; + this->load_tanhxi[ik] = input.ml_tanhxi[ik] || input.ml_tanhxi_nl[ik]; + this->load_xi[ik] = input.ml_xi[ik] || this->load_tanhxi[ik]; + this->load_tanh_pnl[ik] = input.ml_tanh_pnl[ik]; + this->load_tanh_qnl[ik] = input.ml_tanh_qnl[ik]; + this->load_tanhp_nl[ik] = input.ml_tanhp_nl[ik]; + this->load_tanhq_nl[ik] = input.ml_tanhq_nl[ik]; + // this->load_pnl[ik] = input.ml_pnl[ik] || input.ml_tanh_pnl[ik]; + + load_gammanl_tot = load_gammanl_tot || this->load_gammanl[ik]; + load_pnl_tot = load_pnl_tot || this->load_pnl[ik]; + load_qnl_tot = load_qnl_tot || this->load_qnl[ik]; + // load_xi_tot = load_xi_tot || this->load_xi[ik]; + // load_tanhxi_tot = load_tanhxi_tot || this->load_tanhxi[ik]; + // load_tanhxi_nl_tot = load_tanhxi_nl_tot || this->load_tanhxi_nl[ik]; + load_tanh_pnl_tot = load_tanh_pnl_tot || this->load_tanh_pnl[ik]; + load_tanh_qnl_tot = load_tanh_qnl_tot || this->load_tanh_qnl[ik]; + load_tanhp_nl_tot = load_tanhp_nl_tot || this->load_tanhp_nl[ik]; + load_tanhq_nl_tot = load_tanhq_nl_tot || this->load_tanhq_nl[ik]; + + // std::cout << "load_gammanl " << this->load_gammanl[ik] << std::endl; + // std::cout << "load_pnl " << this->load_pnl[ik] << std::endl; + // std::cout << "load_qnl " << this->load_qnl[ik] << std::endl; + // std::cout << "load_tanhxi_nl " << this->load_tanhxi_nl[ik] << std::endl; + // std::cout << "load_tanhxi " 
+        // std::cout << "load_xi " << this->load_xi[ik] << std::endl;
+        // std::cout << "load_tanh_pnl " << this->load_tanh_pnl[ik] << std::endl;
+        // std::cout << "load_tanh_qnl " << this->load_tanh_qnl[ik] << std::endl;
+        // std::cout << "load_tanhp_nl " << this->load_tanhp_nl[ik] << std::endl;
+        // std::cout << "load_tanhq_nl " << this->load_tanhq_nl[ik] << std::endl;
+    }
+    this->load_gamma = input.ml_gamma || load_gammanl_tot;
+    this->load_tanhp = input.ml_tanhp || load_tanhp_nl_tot || load_tanh_pnl_tot;
+    this->load_tanhq = input.ml_tanhq || load_tanhq_nl_tot || load_tanh_qnl_tot;
+    this->load_p = input.ml_p || this->load_tanhp || load_pnl_tot;
+    this->load_q = input.ml_q || this->load_tanhq || load_qnl_tot;
+    // Input::print("init_label done");
+}
+
+void Data::init_data(const int nkernel, const int ndata, const int fftdim, const torch::Device device)
+{
+    // Input::print("init_data");
+    this->nx = std::pow(fftdim, 3);
+    this->nx_tot = this->nx * ndata;
+
+    this->rho = torch::zeros({ndata, fftdim, fftdim, fftdim}).to(device);
+    if (this->load_p){
+        this->nablaRho = torch::zeros({ndata, 3, fftdim, fftdim, fftdim}).to(device);
+    }
+
+    this->enhancement = torch::zeros({ndata, fftdim, fftdim, fftdim}).to(device);
+    this->enhancement_mean = torch::zeros(ndata).to(device);
+    this->tau_mean = torch::zeros(ndata).to(device);
+    this->pauli = torch::zeros({ndata, fftdim, fftdim, fftdim}).to(device);
+    this->pauli_mean = torch::zeros(ndata).to(device);
+
+    if (this->load_gamma){
+        this->gamma = torch::zeros({ndata, fftdim, fftdim, fftdim}).to(device);
+    }
+    if (this->load_p){
+        this->p = torch::zeros({ndata, fftdim, fftdim, fftdim}).to(device);
+    }
+    if (this->load_q){
+        this->q = torch::zeros({ndata, fftdim, fftdim, fftdim}).to(device);
+    }
+    if (this->load_tanhp){
+        this->tanhp = torch::zeros({ndata, fftdim, fftdim, fftdim}).to(device);
+    }
+    if (this->load_tanhq){
+        this->tanhq = torch::zeros({ndata, fftdim, fftdim, fftdim}).to(device);
+    }
+
+    for (int ik = 0; ik < nkernel; ++ik)
+    {
+        this->gammanl.push_back(torch::zeros({}).to(device));
+        this->pnl.push_back(torch::zeros({}).to(device));
+        this->qnl.push_back(torch::zeros({}).to(device));
+        this->xi.push_back(torch::zeros({}).to(device));
+        this->tanhxi.push_back(torch::zeros({}).to(device));
+        this->tanhxi_nl.push_back(torch::zeros({}).to(device));
+        this->tanh_pnl.push_back(torch::zeros({}).to(device));
+        this->tanh_qnl.push_back(torch::zeros({}).to(device));
+        this->tanhp_nl.push_back(torch::zeros({}).to(device));
+        this->tanhq_nl.push_back(torch::zeros({}).to(device));
+
+        if (this->load_gammanl[ik]){
+            this->gammanl[ik] = torch::zeros({ndata, fftdim, fftdim, fftdim}).to(device);
+        }
+        if (this->load_pnl[ik]){
+            this->pnl[ik] = torch::zeros({ndata, fftdim, fftdim, fftdim}).to(device);
+        }
+        if (this->load_qnl[ik]){
+            this->qnl[ik] = torch::zeros({ndata, fftdim, fftdim, fftdim}).to(device);
+        }
+        if (this->load_xi[ik]){
+            this->xi[ik] = torch::zeros({ndata, fftdim, fftdim, fftdim}).to(device);
+        }
+        if (this->load_tanhxi[ik]){
+            this->tanhxi[ik] = torch::zeros({ndata, fftdim, fftdim, fftdim}).to(device);
+        }
+        if (this->load_tanhxi_nl[ik]){
+            this->tanhxi_nl[ik] = torch::zeros({ndata, fftdim, fftdim, fftdim}).to(device);
+        }
+        if (this->load_tanh_pnl[ik]){
+            this->tanh_pnl[ik] = torch::zeros({ndata, fftdim, fftdim, fftdim}).to(device);
+        }
+        if (this->load_tanh_qnl[ik]){
+            this->tanh_qnl[ik] = torch::zeros({ndata, fftdim, fftdim, fftdim}).to(device);
+        }
+        if (this->load_tanhp_nl[ik]){
+            this->tanhp_nl[ik] = torch::zeros({ndata, fftdim, fftdim, fftdim}).to(device);
+        }
+        if (this->load_tanhq_nl[ik]){
+            this->tanhq_nl[ik] = torch::zeros({ndata, fftdim, fftdim, fftdim}).to(device);
+        }
+    }
+
+    // Input::print("init_data done");
+}
+
+void Data::load_data_(
+    Input &input,
+    const int ndata,
+    const int fftdim,
+    std::string *dir
+)
+{
+    // Input::print("load_data");
+    if (ndata <= 0){
+        return;
+    }
+
+    std::vector<unsigned long> cshape = {(long unsigned) nx};
+    std::vector<double> container(nx);
+    bool fortran_order = false;
+
+    for (int idata = 0; idata < ndata; ++idata)
+    {
+        this->loadTensor(dir[idata] + "/rho.npy", cshape, fortran_order, container, idata, fftdim, rho);
+        if (this->load_gamma){
+            this->loadTensor(dir[idata] + "/gamma.npy", cshape, fortran_order, container, idata, fftdim, gamma);
+        }
+        if (this->load_p){
+            this->loadTensor(dir[idata] + "/p.npy", cshape, fortran_order, container, idata, fftdim, p);
+            npy::LoadArrayFromNumpy(dir[idata] + "/nablaRhox.npy", cshape, fortran_order, container);
+            nablaRho[idata][0] = torch::tensor(container).reshape({fftdim, fftdim, fftdim});
+            npy::LoadArrayFromNumpy(dir[idata] + "/nablaRhoy.npy", cshape, fortran_order, container);
+            nablaRho[idata][1] = torch::tensor(container).reshape({fftdim, fftdim, fftdim});
+            npy::LoadArrayFromNumpy(dir[idata] + "/nablaRhoz.npy", cshape, fortran_order, container);
+            nablaRho[idata][2] = torch::tensor(container).reshape({fftdim, fftdim, fftdim});
+        }
+        if (this->load_q){
+            this->loadTensor(dir[idata] + "/q.npy", cshape, fortran_order, container, idata, fftdim, q);
+        }
+        if (this->load_tanhp){
+            this->loadTensor(dir[idata] + "/tanhp.npy", cshape, fortran_order, container, idata, fftdim, tanhp);
+        }
+        if (this->load_tanhq){
+            this->loadTensor(dir[idata] + "/tanhq.npy", cshape, fortran_order, container, idata, fftdim, tanhq);
+        }
+
+        for (int ik = 0; ik < input.nkernel; ++ik)
+        {
+            int ktype = input.kernel_type[ik];
+            double kscaling = input.kernel_scaling[ik];
+
+            if (this->load_gammanl[ik]){
+                this->loadTensor(dir[idata] + this->file_name("gammanl", ktype, kscaling), cshape, fortran_order, container, idata, fftdim, gammanl[ik]);
+            }
+            if (this->load_pnl[ik]){
+                this->loadTensor(dir[idata] + this->file_name("pnl", ktype, kscaling), cshape, fortran_order, container, idata, fftdim, pnl[ik]);
+            }
+            if (this->load_qnl[ik]){
+                this->loadTensor(dir[idata] + this->file_name("qnl", ktype, kscaling), cshape, fortran_order, container, idata, fftdim, qnl[ik]);
+            }
+            if (this->load_xi[ik]){
+                this->loadTensor(dir[idata] + this->file_name("xi", ktype, kscaling), cshape, fortran_order, container, idata, fftdim, xi[ik]);
+            }
+            if (this->load_tanhxi[ik]){
+                this->loadTensor(dir[idata] + this->file_name("tanhxi", ktype, kscaling), cshape, fortran_order, container, idata, fftdim, tanhxi[ik]);
+            }
+            if (this->load_tanhxi_nl[ik]){
+                this->loadTensor(dir[idata] + this->file_name("tanhxi_nl", ktype, kscaling), cshape, fortran_order, container, idata, fftdim, tanhxi_nl[ik]);
+            }
+            if (this->load_tanh_pnl[ik]){
+                this->loadTensor(dir[idata] + this->file_name("tanh_pnl", ktype, kscaling), cshape, fortran_order, container, idata, fftdim, tanh_pnl[ik]);
+            }
+            if (this->load_tanh_qnl[ik]){
+                this->loadTensor(dir[idata] + this->file_name("tanh_qnl", ktype, kscaling), cshape, fortran_order, container, idata, fftdim, tanh_qnl[ik]);
+            }
+            if (this->load_tanhp_nl[ik]){
+                this->loadTensor(dir[idata] + this->file_name("tanhp_nl", ktype, kscaling), cshape, fortran_order, container, idata, fftdim, tanhp_nl[ik]);
+            }
+            if (this->load_tanhq_nl[ik]){
this->loadTensor(dir[idata] + this->file_name("tanhq_nl", ktype, kscaling), cshape, fortran_order, container, idata, fftdim, tanhq_nl[ik]); + } + } + + this->loadTensor(dir[idata] + "/enhancement.npy", cshape, fortran_order, container, idata, fftdim, enhancement); + enhancement_mean[idata] = torch::mean(enhancement[idata]); + tau_mean[idata] = torch::mean(torch::pow(rho[idata], input.exponent/3.) * enhancement[idata]); + + if (input.loss == "potential" || input.loss == "both" || input.loss == "both_new") + { + this->loadTensor(dir[idata] + "/pauli.npy", cshape, fortran_order, container, idata, fftdim, pauli); + pauli_mean[idata] = torch::mean(pauli[idata]); + } + } + enhancement.resize_({this->nx_tot, 1}); + pauli.resize_({nx_tot, 1}); + + this->tau_tf = this->cTF * torch::pow(this->rho, 5./3.); + // Input::print("load_data done"); +} + +void Data::loadTensor( + std::string file, + std::vector cshape, + bool fortran_order, + std::vector &container, + const int index, + const int fftdim, + torch::Tensor &data +) +{ + npy::LoadArrayFromNumpy(file, cshape, fortran_order, container); + data[index] = torch::tensor(container).reshape({fftdim, fftdim, fftdim}); +} + +void Data::dumpTensor(const torch::Tensor &data, std::string filename, int nx) +{ + std::vector v(nx); + for (int ir = 0; ir < nx; ++ir){ + v[ir] = data[ir].item(); + } + // std::vector v(data.data_ptr(), data.data_ptr() + data.numel()); // this works, but only supports float tensor + const long unsigned cshape[] = {(long unsigned) nx}; // shape + npy::SaveArrayAsNumpy(filename, false, 1, cshape, v); + std::cout << "Dumping " << filename << " done" << std::endl; +} + +std::string Data::file_name(std::string parameter, const int kernel_type, const double kernel_scaling) +{ + std::stringstream ss; + ss << "/" << parameter << "_" << kernel_type << "_" << kernel_scaling << ".npy"; + return ss.str(); +} diff --git a/source/module_hamilt_pw/hamilt_ofdft/ml_tools/data.h b/source/module_hamilt_pw/hamilt_ofdft/ml_tools/data.h new file mode 100644 index 0000000000..e6bb6f2a6d --- /dev/null +++ b/source/module_hamilt_pw/hamilt_ofdft/ml_tools/data.h @@ -0,0 +1,84 @@ +#ifndef DATA_H +#define DATA_H + +#include "./input.h" + +#include + +class Data +{ + public: + // --------- load the data from .npy files ------ + ~Data(); + + int nx = 0; + int nx_tot = 0; + + // =========== data =========== + torch::Tensor rho; + torch::Tensor nablaRho; + torch::Tensor tau_tf; + // semi-local descriptors + torch::Tensor gamma; + torch::Tensor p; + torch::Tensor q; + torch::Tensor tanhp; + torch::Tensor tanhq; + // non-local descriptors + std::vector gammanl = {}; + std::vector pnl = {}; + std::vector qnl = {}; + std::vector xi = {}; + std::vector tanhxi = {}; + std::vector tanhxi_nl = {}; + std::vector tanh_pnl = {}; + std::vector tanh_qnl = {}; + std::vector tanhp_nl = {}; + std::vector tanhq_nl = {}; + // target + torch::Tensor enhancement; + torch::Tensor pauli; + torch::Tensor enhancement_mean; + torch::Tensor tau_mean; // mean Pauli energy + torch::Tensor pauli_mean; + + // =========== label =========== + bool load_gamma = false; + bool load_p = false; + bool load_q = false; + bool load_tanhp = false; + bool load_tanhq = false; + bool* load_gammanl = nullptr; + bool* load_pnl = nullptr; + bool* load_qnl = nullptr; + bool* load_xi = nullptr; + bool* load_tanhxi = nullptr; + bool* load_tanhxi_nl = nullptr; + bool* load_tanh_pnl = nullptr; + bool* load_tanh_qnl = nullptr; + bool* load_tanhp_nl = nullptr; + bool* load_tanhq_nl = nullptr; + + void load_data(Input 
+    void load_data(Input &input, const int ndata, std::string *dir, const torch::Device device);
+    torch::Tensor get_data(std::string parameter, const int ikernel);
+
+  private:
+    void init_label(Input &input);
+    void init_data(const int nkernel, const int ndata, const int fftdim, const torch::Device device);
+    void load_data_(Input &input, const int ndata, const int fftdim, std::string *dir);
+
+    const double cTF = 3.0/10.0 * std::pow(3*std::pow(M_PI, 2.0), 2.0/3.0) * 2; // 3/10*(3*pi^2)^{2/3}, multiplied by 2 to convert from Hartree to Ry, finally in Ry*Bohr^(-2)
+
+  public:
+    void loadTensor(std::string file,
+                    std::vector<unsigned long> cshape,
+                    bool fortran_order,
+                    std::vector<double> &container,
+                    const int index,
+                    const int fftdim,
+                    torch::Tensor &data);
+    // -------- dump Tensor into .npy files ---------
+    void dumpTensor(const torch::Tensor &data, std::string filename, int nx);
+    std::string file_name(std::string parameter, const int kernel_type, const double kernel_scaling);
+};
+#endif
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_ofdft/ml_tools/grid.cpp b/source/module_hamilt_pw/hamilt_ofdft/ml_tools/grid.cpp
new file mode 100644
index 0000000000..d36094de7e
--- /dev/null
+++ b/source/module_hamilt_pw/hamilt_ofdft/ml_tools/grid.cpp
@@ -0,0 +1,95 @@
+#include "./grid.h"
+
+void Grid::initGrid(const int fftdim,
+                    const int ndata,
+                    const std::string *cell,
+                    const double *a,
+                    const torch::Device device,
+                    double *volume)
+{
+    this->initGrid_(fftdim, ndata, cell, a, device, volume, this->fft_grid, this->fft_gg);
+    std::cout << "Init grid done" << std::endl;
+}
+
+void Grid::initGrid_(const int fftdim,
+                     const int ndata,
+                     const std::string *cell,
+                     const double *a,
+                     const torch::Device device,
+                     double *volume,
+                     std::vector<std::vector<torch::Tensor>> &grid,
+                     std::vector<torch::Tensor> &gg)
+{
+    this->fft_grid = std::vector<std::vector<torch::Tensor>>(ndata);
+    this->fft_gg = std::vector<torch::Tensor>(ndata);
+    for (int i = 0; i < ndata; ++i)
+    {
+        this->fft_grid[i] = std::vector<torch::Tensor>(3);
+    }
+
+    for (int it = 0; it < ndata; ++it)
+    {
+        if (cell[it] == "sc"){
+            this->initScRecipGrid(fftdim, a[it], it, device, volume, grid, gg);
+        }
+        else if (cell[it] == "fcc"){
+            this->initFccRecipGrid(fftdim, a[it], it, device, volume, grid, gg);
+        }
+        else if (cell[it] == "bcc"){
+            this->initBccRecipGrid(fftdim, a[it], it, device, volume, grid, gg);
+        }
+    }
+}
+
+void Grid::initScRecipGrid(const int fftdim,
+                           const double a,
+                           const int index,
+                           const torch::Device device,
+                           double *volume,
+                           std::vector<std::vector<torch::Tensor>> &grid,
+                           std::vector<torch::Tensor> &gg)
+{
+    volume[index] = std::pow(a, 3);
+    torch::Tensor fre = torch::fft::fftfreq(fftdim, a / fftdim).to(device) * 2. * M_PI;
+    grid[index] = torch::meshgrid({fre, fre, fre});
+    gg[index] = grid[index][0] * grid[index][0] + grid[index][1] * grid[index][1] + grid[index][2] * grid[index][2];
+}
+
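+// For fcc and bcc cells the cubic FFT frequencies are combined along the
+// primitive reciprocal-lattice directions, so fft_grid/fft_gg hold the G
+// vectors of the primitive cell (volume a^3/4 for fcc, a^3/2 for bcc).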
+void Grid::initFccRecipGrid(const int fftdim,
+                            const double a,
+                            const int index,
+                            const torch::Device device,
+                            double *volume,
+                            std::vector<std::vector<torch::Tensor>> &grid,
+                            std::vector<torch::Tensor> &gg)
+{
+    // std::cout << "init grid" << std::endl;
+    volume[index] = std::pow(a, 3) / 4.;
+    double coef = 1. / sqrt(2.);
+    // std::cout << "fftfreq" << std::endl;
+    torch::Tensor fre = torch::fft::fftfreq(fftdim, a * coef / fftdim).to(device) * 2. * M_PI;
+    auto originalGrid = torch::meshgrid({fre, fre, fre});
+    grid[index][0] = coef * (-originalGrid[0] + originalGrid[1] + originalGrid[2]);
+    grid[index][1] = coef * (originalGrid[0] - originalGrid[1] + originalGrid[2]);
+    grid[index][2] = coef * (originalGrid[0] + originalGrid[1] - originalGrid[2]);
+    // std::cout << "gg" << std::endl;
+    gg[index] = grid[index][0] * grid[index][0] + grid[index][1] * grid[index][1] + grid[index][2] * grid[index][2];
+}
+
+void Grid::initBccRecipGrid(const int fftdim,
+                            const double a,
+                            const int index,
+                            const torch::Device device,
+                            double *volume,
+                            std::vector<std::vector<torch::Tensor>> &grid,
+                            std::vector<torch::Tensor> &gg)
+{
+    volume[index] = std::pow(a, 3) / 2.;
+    double coef = sqrt(3.) / 2.;
+    torch::Tensor fre = torch::fft::fftfreq(fftdim, a * coef / fftdim).to(device) * 2. * M_PI;
+    auto originalGrid = torch::meshgrid({fre, fre, fre});
+    grid[index][0] = coef * (originalGrid[1] + originalGrid[2]);
+    grid[index][1] = coef * (originalGrid[0] + originalGrid[2]);
+    grid[index][2] = coef * (originalGrid[0] + originalGrid[1]);
+    gg[index] = grid[index][0] * grid[index][0] + grid[index][1] * grid[index][1] + grid[index][2] * grid[index][2];
+}
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_ofdft/ml_tools/grid.h b/source/module_hamilt_pw/hamilt_ofdft/ml_tools/grid.h
new file mode 100644
index 0000000000..9d615a0a5f
--- /dev/null
+++ b/source/module_hamilt_pw/hamilt_ofdft/ml_tools/grid.h
@@ -0,0 +1,51 @@
+#ifndef GRID_H
+#define GRID_H
+
+#include <torch/torch.h>
+
+class Grid
+{
+  public:
+    void initGrid(const int fftdim,
+                  const int ndata,
+                  const std::string *cell,
+                  const double *a,
+                  const torch::Device device,
+                  double *volume);
+
+    // fft grid
+    std::vector<std::vector<torch::Tensor>> fft_grid; // ntrain*3*fftdim*fftdim*fftdim
+    std::vector<torch::Tensor> fft_gg;
+
+  private:
+    void initGrid_(const int fftdim,
+                   const int ndata,
+                   const std::string *cell,
+                   const double *a,
+                   const torch::Device device,
+                   double *volume,
+                   std::vector<std::vector<torch::Tensor>> &grid,
+                   std::vector<torch::Tensor> &gg);
+    void initScRecipGrid(const int fftdim,
+                         const double a,
+                         const int index,
+                         const torch::Device device,
+                         double *volume,
+                         std::vector<std::vector<torch::Tensor>> &grid,
+                         std::vector<torch::Tensor> &gg);
+    void initFccRecipGrid(const int fftdim,
+                          const double a,
+                          const int index,
+                          const torch::Device device,
+                          double *volume,
+                          std::vector<std::vector<torch::Tensor>> &grid,
+                          std::vector<torch::Tensor> &gg);
+    void initBccRecipGrid(const int fftdim,
+                          const double a,
+                          const int index,
+                          const torch::Device device,
+                          double *volume,
+                          std::vector<std::vector<torch::Tensor>> &grid,
+                          std::vector<torch::Tensor> &gg);
+};
+#endif
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_ofdft/ml_tools/input.cpp b/source/module_hamilt_pw/hamilt_ofdft/ml_tools/input.cpp
new file mode 100644
index 0000000000..aadb1ecc02
--- /dev/null
+++ b/source/module_hamilt_pw/hamilt_ofdft/ml_tools/input.cpp
@@ -0,0 +1,283 @@
+#include "./input.h"
+
+void Input::readInput()
+{
+    std::ifstream ifs("nnINPUT", std::ios::in);
+    if (!ifs)
+    {
+        std::cout << " Can't find the nnINPUT file." << std::endl;
+        exit(0);
+    }
+
+    char word[80];
+    int ierr = 0;
+
+    ifs.rdstate();
+    while (ifs.good())
+    {
+        ifs >> word;
+        if (ifs.eof())
+            break;
+
+        if (strcmp("fftdim", word) == 0)
+        {
+            this->read_value(ifs, this->fftdim);
+        }
+        else if (strcmp("nbatch", word) == 0)
+        {
+            this->read_value(ifs, this->nbatch);
+        }
+        else if (strcmp("ntrain", word) == 0)
+        {
+            this->read_value(ifs, this->ntrain);
+            this->train_dir = new std::string[this->ntrain];
+            this->train_cell = new std::string[this->ntrain];
+            this->train_a = new double[this->ntrain];
+        }
+        else if (strcmp("nvalidation", word) == 0)
+        {
+            this->read_value(ifs, this->nvalidation);
+            if (this->nvalidation > 0)
+            {
+                this->validation_dir = new std::string[this->nvalidation];
+                this->validation_cell = new std::string[this->nvalidation];
+                this->validation_a = new double[this->nvalidation];
+            }
+        }
+        else if (strcmp("train_dir", word) == 0)
+        {
+            this->read_values(ifs, this->ntrain, this->train_dir);
+        }
+        else if (strcmp("train_cell", word) == 0)
+        {
+            this->read_values(ifs, this->ntrain, this->train_cell);
+        }
+        else if (strcmp("train_a", word) == 0)
+        {
+            this->read_values(ifs, this->ntrain, this->train_a);
+        }
+        else if (strcmp("validation_dir", word) == 0)
+        {
+            this->read_values(ifs, this->nvalidation, this->validation_dir);
+        }
+        else if (strcmp("validation_cell", word) == 0 && this->nvalidation > 0)
+        {
+            this->read_values(ifs, this->nvalidation, this->validation_cell);
+        }
+        else if (strcmp("validation_a", word) == 0 && this->nvalidation > 0)
+        {
+            this->read_values(ifs, this->nvalidation, this->validation_a);
+        }
+        else if (strcmp("loss", word) == 0)
+        {
+            this->read_value(ifs, this->loss);
+        }
+        else if (strcmp("exponent", word) == 0)
+        {
+            this->read_value(ifs, this->exponent);
+        }
+        else if (strcmp("nepoch", word) == 0)
+        {
+            this->read_value(ifs, this->nepoch);
+        }
+        else if (strcmp("lr_start", word) == 0)
+        {
+            this->read_value(ifs, this->lr_start);
+        }
+        else if (strcmp("lr_end", word) == 0)
+        {
+            this->read_value(ifs, this->lr_end);
+        }
+        else if (strcmp("lr_fre", word) == 0)
+        {
+            this->read_value(ifs, this->lr_fre);
+        }
+        else if (strcmp("dump_fre", word) == 0)
+        {
+            this->read_value(ifs, this->dump_fre);
+        }
+        else if (strcmp("print_fre", word) == 0)
+        {
+            this->read_value(ifs, this->print_fre);
+        }
+        else if (strcmp("gamma", word) == 0)
+        {
+            this->read_value(ifs, this->ml_gamma);
+        }
+        else if (strcmp("p", word) == 0)
+        {
+            this->read_value(ifs, this->ml_p);
+        }
+        else if (strcmp("q", word) == 0)
+        {
+            this->read_value(ifs, this->ml_q);
+        }
+        else if (strcmp("gammanl", word) == 0)
+        {
+            this->read_values(ifs, this->nkernel, this->ml_gammanl);
+        }
+        else if (strcmp("pnl", word) == 0)
+        {
+            this->read_values(ifs, this->nkernel, this->ml_pnl);
+        }
+        else if (strcmp("qnl", word) == 0)
+        {
+            this->read_values(ifs, this->nkernel, this->ml_qnl);
+        }
+        else if (strcmp("xi", word) == 0)
+        {
+            this->read_values(ifs, this->nkernel, this->ml_xi);
+        }
+        else if (strcmp("tanhxi", word) == 0)
+        {
+            this->read_values(ifs, this->nkernel, this->ml_tanhxi);
+        }
+        else if (strcmp("tanhxi_nl", word) == 0)
+        {
+            this->read_values(ifs, this->nkernel, this->ml_tanhxi_nl);
+        }
+        else if (strcmp("tanhp", word) == 0)
+        {
+            this->read_value(ifs, this->ml_tanhp);
+        }
+        else if (strcmp("tanhq", word) == 0)
+        {
+            this->read_value(ifs, this->ml_tanhq);
+        }
+        else if (strcmp("tanh_pnl", word) == 0)
+        {
+            this->read_values(ifs, this->nkernel, this->ml_tanh_pnl);
+        }
+        else if (strcmp("tanh_qnl", word) == 0)
+        {
+            this->read_values(ifs, this->nkernel, this->ml_tanh_qnl);
+        }
+        else if (strcmp("tanhp_nl", word) == 0)
+        {
+            this->read_values(ifs, this->nkernel, this->ml_tanhp_nl);
+        }
+        else if (strcmp("tanhq_nl", word) == 0)
+        {
+            this->read_values(ifs, this->nkernel, this->ml_tanhq_nl);
+        }
+        else if (strcmp("chi_xi", word) == 0)
+        {
+            this->read_values(ifs, this->nkernel, this->chi_xi);
+        }
+        else if (strcmp("chi_p", word) == 0)
+        {
+            this->read_value(ifs, this->chi_p);
+        }
+        else if (strcmp("chi_q", word) == 0)
+        {
+            this->read_value(ifs, this->chi_q);
+        }
+        else if (strcmp("chi_pnl", word) == 0)
+        {
+            this->read_values(ifs, this->nkernel, this->chi_pnl);
+        }
+        else if (strcmp("chi_qnl", word) == 0)
+        {
+            this->read_values(ifs, this->nkernel, this->chi_qnl);
+        }
+        else if (strcmp("feg_limit", word) == 0)
+        {
+            this->read_value(ifs, this->feg_limit);
+        }
+        else if (strcmp("change_step", word) == 0)
+        {
+            this->read_value(ifs, this->change_step);
+        }
+        else if (strcmp("coef_e", word) == 0)
+        {
+            this->read_value(ifs, this->coef_e);
+        }
+        else if (strcmp("coef_p", word) == 0)
+        {
+            this->read_value(ifs, this->coef_p);
+        }
+        else if (strcmp("coef_feg_e", word) == 0)
+        {
+            this->read_value(ifs, this->coef_feg_e);
+        }
+        else if (strcmp("coef_feg_p", word) == 0)
+        {
+            this->read_value(ifs, this->coef_feg_p);
+        }
+        else if (strcmp("check_pot", word) == 0)
+        {
+            this->read_value(ifs, this->check_pot);
+        }
+        else if (strcmp("nnode", word) == 0)
+        {
+            this->read_value(ifs, this->nnode);
+        }
+        else if (strcmp("nlayer", word) == 0)
+        {
+            this->read_value(ifs, this->nlayer);
+        }
+        else if (strcmp("nkernel", word) == 0)
+        {
+            this->read_value(ifs, this->nkernel);
+            this->ml_gammanl = new bool[this->nkernel];
+            this->ml_pnl = new bool[this->nkernel];
+            this->ml_qnl = new bool[this->nkernel];
+            this->ml_xi = new bool[this->nkernel];
+            this->ml_tanhxi = new bool[this->nkernel];
+            this->ml_tanhxi_nl = new bool[this->nkernel];
+            this->ml_tanh_pnl = new bool[this->nkernel];
+            this->ml_tanh_qnl = new bool[this->nkernel];
+            this->ml_tanhp_nl = new bool[this->nkernel];
+            this->ml_tanhq_nl = new bool[this->nkernel];
+            this->chi_xi = new double[this->nkernel];
+            this->chi_pnl = new double[this->nkernel];
+            this->chi_qnl = new double[this->nkernel];
+            this->kernel_type = new int[this->nkernel];
+            this->kernel_scaling = new double[this->nkernel];
+            this->yukawa_alpha = new double[this->nkernel];
+            this->kernel_file = new std::string[this->nkernel];
+            for (int ik = 0; ik < this->nkernel; ++ik)
+            {
+                this->ml_gammanl[ik] = 0;
+                this->ml_pnl[ik] = 0;
+                this->ml_qnl[ik] = 0;
+                this->ml_xi[ik] = 0;
+                this->ml_tanhxi[ik] = 0;
+                this->ml_tanhxi_nl[ik] = 0;
+                this->ml_tanh_pnl[ik] = 0;
+                this->ml_tanh_qnl[ik] = 0;
+                this->ml_tanhp_nl[ik] = 0;
+                this->ml_tanhq_nl[ik] = 0;
+                this->chi_xi[ik] = 1.;
+                this->chi_pnl[ik] = 1.;
+                this->chi_qnl[ik] = 1.;
+                this->kernel_type[ik] = 1;
+                this->kernel_scaling[ik] = 1.;
+                this->yukawa_alpha[ik] = 1.;
+                this->kernel_file[ik] = "none";
+            }
+        }
+        else if (strcmp("kernel_type", word) == 0)
+        {
+            this->read_values(ifs, this->nkernel, this->kernel_type);
+        }
+        else if (strcmp("yukawa_alpha", word) == 0)
+        {
+            this->read_values(ifs, this->nkernel, this->yukawa_alpha);
+        }
+        else if (strcmp("kernel_scaling", word) == 0)
+        {
+            this->read_values(ifs, this->nkernel, this->kernel_scaling);
+        }
+        else if (strcmp("kernel_file", word) == 0)
+        {
+            this->read_values(ifs, this->nkernel, this->kernel_file);
+        }
+        else if (strcmp("device_type", word) == 0)
+        {
+            this->read_value(ifs, this->device_type);
+        }
+    }
+
+    std::cout << "Read nnINPUT done" << std::endl;
+}
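+
+// Example of a minimal nnINPUT file (the values are illustrative only; note
+// that "nkernel", "ntrain", and "nvalidation" must appear before the keywords
+// whose arrays they allocate):
+//   fftdim       27
+//   nkernel      1
+//   kernel_type  1
+//   ntrain       2
+//   train_dir    ./data-1 ./data-2
+//   train_cell   fcc bcc
+//   train_a      7.65 6.06
+//   gamma        1
+//   gammanl      1
+//   loss         both
+//   nepoch       1000
+//   lr_start     0.01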
diff --git a/source/module_hamilt_pw/hamilt_ofdft/ml_tools/input.h b/source/module_hamilt_pw/hamilt_ofdft/ml_tools/input.h
new file mode 100644
index 0000000000..715b4c6064
--- /dev/null
+++ b/source/module_hamilt_pw/hamilt_ofdft/ml_tools/input.h
@@ -0,0 +1,134 @@
+#ifndef INPUT_H
+#define INPUT_H
+
+#include <cstring>
+#include <fstream>
+#include <iostream>
+
+class Input
+{
+    // ---------- read in the settings from nnINPUT --------
+  public:
+    Input(){};
+    ~Input()
+    {
+        delete[] this->train_dir;
+        delete[] this->train_cell;
+        delete[] this->train_a;
+        delete[] this->validation_dir;
+        delete[] this->validation_cell;
+        delete[] this->validation_a;
+
+        delete[] this->ml_gammanl;
+        delete[] this->ml_pnl;
+        delete[] this->ml_qnl;
+        delete[] this->ml_xi;
+        delete[] this->ml_tanhxi;
+        delete[] this->ml_tanhxi_nl;
+        delete[] this->ml_tanh_pnl;
+        delete[] this->ml_tanh_qnl;
+        delete[] this->ml_tanhp_nl;
+        delete[] this->ml_tanhq_nl;
+        delete[] this->chi_xi;
+        delete[] this->chi_pnl;
+        delete[] this->chi_qnl;
+        delete[] this->kernel_type;
+        delete[] this->kernel_scaling;
+        delete[] this->yukawa_alpha;
+        delete[] this->kernel_file;
+    };
+
+    void readInput();
+
+    template <class T> static void read_value(std::ifstream &ifs, T &var)
+    {
+        ifs >> var;
+        ifs.ignore(150, '\n');
+        return;
+    }
+
+    template <class T> static void read_values(std::ifstream &ifs, const int length, T *var)
+    {
+        for (int i = 0; i < length; ++i)
+        {
+            ifs >> var[i];
+        }
+        ifs.ignore(150, '\n');
+        return;
+    }
+
+    // training
+    int fftdim = 0;
+    int nbatch = 0;
+    int ntrain = 1;
+    int nvalidation = 0;
+    std::string *train_dir = nullptr;
+    std::string *train_cell = nullptr;
+    double *train_a = nullptr;
+    std::string *validation_dir = nullptr;
+    std::string *validation_cell = nullptr;
+    double *validation_a = nullptr;
+    std::string loss = "both";
+    int nepoch = 1000;
+    double lr_start = 0.01; // learning rate 2023-02-24
+    double lr_end = 1e-4;
+    int lr_fre = 5000;
+    double exponent = 5.; // exponent of weight rho^{exponent/3.}
+
+    // output
+    int dump_fre = 1;
+    int print_fre = 1;
+
+    // descriptors
+    // semi-local descriptors
+    bool ml_gamma = false;
+    bool ml_p = false;
+    bool ml_q = false;
+    bool ml_tanhp = false;
+    bool ml_tanhq = false;
+    double chi_p = 1.;
+    double chi_q = 1.;
+    // non-local descriptors
+    bool* ml_gammanl = nullptr;
+    bool* ml_pnl = nullptr;
+    bool* ml_qnl = nullptr;
+    bool* ml_xi = nullptr;
+    bool* ml_tanhxi = nullptr;
+    bool* ml_tanhxi_nl = nullptr;
+    bool* ml_tanh_pnl = nullptr;
+    bool* ml_tanh_qnl = nullptr;
+    bool* ml_tanhp_nl = nullptr;
+    bool* ml_tanhq_nl = nullptr;
+    double* chi_xi = nullptr;
+    double* chi_pnl = nullptr;
+    double* chi_qnl = nullptr;
+
+    int feg_limit = 0;   // Free Electron Gas
+    int change_step = 0; // when feg_limit=3, change the output of net after change_step
+
+    // coefficients in loss function
+    double coef_e = 1.;
+    double coef_p = 1.;
+    double coef_feg_e = 1.;
+    double coef_feg_p = 1.;
+
+    // size of nn
+    int nnode = 10;
+    int nlayer = 3;
+
+    // kernel
+    int nkernel = 1;
+    int* kernel_type = nullptr;
+    double* kernel_scaling = nullptr;
+    double* yukawa_alpha = nullptr;
+    std::string* kernel_file = nullptr;
+
+    // GPU
+    std::string device_type = "gpu";
+    bool check_pot = false;
+
+    static void print(std::string message)
+    {
+        std::cout << message << std::endl;
+    }
+};
+#endif
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_ofdft/ml_tools/kernel.cpp b/source/module_hamilt_pw/hamilt_ofdft/ml_tools/kernel.cpp
new file mode 100644
index 0000000000..c0d23d6a1f
--- /dev/null
+++ b/source/module_hamilt_pw/hamilt_ofdft/ml_tools/kernel.cpp
@@ -0,0 +1,215 @@
+#include "./kernel.h"
+
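+// Fill the kinetic-energy kernel w(eta) on the reciprocal grid of every
+// structure, where eta = |g| / (2 kF) and kF = (3 pi^2 rho0)^{1/3} is set by
+// the average density rho0; kernel_type 1 is the Wang-Teter (inverse-Lindhard)
+// form, 2 the Yukawa form, and 3/4 denote tabulated kernels read from
+// kernel_file.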
+void Kernel::fill_kernel(const int fftdim,
+                         const int ndata,
+                         const torch::Tensor &rho,
+                         const double *volume,
+                         const std::string *cell,
+                         const torch::Device device,
+                         const std::vector<torch::Tensor> &fft_gg)
+{
+    double rho0 = 0.;
+    double tkF = 0.;
+    double eta = 0.;
+    this->kernel = std::vector<torch::Tensor>(ndata);
+
+    // read in the kernel
+    if (this->kernel_type == 3 || this->kernel_type == 4) // 3 for TKK Al, and 4 for TKK Si
+    {
+        this->read_kernel(fftdim, ndata, rho, volume, cell, device, fft_gg);
+    }
+    else
+    {
+        for (int id = 0; id < ndata; ++id)
+        {
+            rho0 = torch::sum(rho[id]).item<double>() / std::pow(fftdim, 3);
+            std::cout << "There are " << rho0 * volume[id] << " electrons in the " << cell[id] << " structure." << std::endl;
+            tkF = 2. * std::pow(3. * std::pow(M_PI, 2) * rho0, 1. / 3.);
+            this->kernel[id] = torch::zeros({fftdim, fftdim, fftdim}).to(device);
+            for (int ix = 0; ix < fftdim; ++ix)
+            {
+                for (int iy = 0; iy < fftdim; ++iy)
+                {
+                    for (int iz = 0; iz < fftdim; ++iz)
+                    {
+                        eta = sqrt(fft_gg[id][ix][iy][iz].item<double>()) / tkF;
+                        eta = eta * this->scaling;
+                        if (this->kernel_type == 1)
+                        {
+                            this->kernel[id][ix][iy][iz] = this->wt_kernel(eta);
+                            // this->kernel[id][ix][iy][iz] = std::pow(1. / this->scaling, 3) * this->wt_kernel(eta);
+                            // if (ix == 26 && iy == 0 && iz == 26)
+                            // {
+                            //     std::cout << "kernel1 " << this->kernel[id][ix][iy][iz] << std::endl;
+                            //     std::cout << "eta " << eta << std::endl;
+                            //     std::cout << "tkF " << tkF << std::endl;
+                            // }
+                        }
+                        else if (this->kernel_type == 2)
+                        {
+                            this->kernel[id][ix][iy][iz] = this->yukawa_kernel(eta, this->yukawa_alpha);
+                        }
+                    }
+                }
+            }
+        }
+    }
+    std::cout << "Fill kernel done" << std::endl;
+}
+
+// Wang-Teter kernel: 1/F_Lind(eta) - 3*vw_weight*eta^2 - tf_weight, with the
+// Lindhard function F_Lind(eta) = 1/2 + (1 - eta^2)/(4*eta) * ln|(1+eta)/(1-eta)|;
+// the branches below are series expansions of the same expression
+double Kernel::wt_kernel(double eta, double tf_weight, double vw_weight)
+{
+    if (eta < 0.)
+    {
+        return 0.;
+    }
+    // limit for small eta
+    else if (eta < 1e-10)
+    {
+        return 1. - tf_weight + eta * eta * (1./3. - 3. * vw_weight);
+    }
+    // around the singularity
+    else if (std::abs(eta - 1.) < 1e-10)
+    {
+        return 2. - tf_weight - 3. * vw_weight + 20. * (eta - 1);
+    }
+    // Taylor expansion for high eta
+    else if (eta > 3.65)
+    {
+        double eta2 = eta * eta;
+        double invEta2 = 1. / eta2;
+        double LindG = 3. * (1. - vw_weight) * eta2
+                       - tf_weight - 0.6
+                       + invEta2 * (-0.13714285714285712
+                       + invEta2 * (-6.39999999999999875E-2
+                       + invEta2 * (-3.77825602968460128E-2
+                       + invEta2 * (-2.51824061652633074E-2
+                       + invEta2 * (-1.80879839616166146E-2
+                       + invEta2 * (-1.36715733124818332E-2
+                       + invEta2 * (-1.07236045520990083E-2
+                       + invEta2 * (-8.65192783339199453E-3
+                       + invEta2 * (-7.1372762502456763E-3
+                       + invEta2 * (-5.9945117538835746E-3
+                       + invEta2 * (-5.10997527675418131E-3
+                       + invEta2 * (-4.41060829979912465E-3
+                       + invEta2 * (-3.84763737842981233E-3
+                       + invEta2 * (-3.38745061493813488E-3
+                       + invEta2 * (-3.00624946457977689E-3)))))))))))))));
+        return LindG;
+    }
+    else
+    {
+        return 1. / (0.5 + 0.25 * (1. - eta * eta) / eta * std::log((1 + eta)/std::abs(1 - eta)))
+               - 3. * vw_weight * eta * eta - tf_weight;
+    }
+}
+
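+// Yukawa-type (screened Coulomb) kernel pi / (eta^2 + alpha^2 / 4); the
+// eta = alpha = 0 case is mapped to zero to avoid the singular point.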
+double Kernel::yukawa_kernel(double eta, double alpha)
+{
+    return (eta == 0 && alpha == 0) ? 0. : M_PI / (eta * eta + alpha * alpha / 4.);
+}
+
+// Read kernel from file
+void Kernel::read_kernel(const int fftdim,
+                         const int ndata,
+                         const torch::Tensor &rho,
+                         const double *volume,
+                         const std::string *cell,
+                         const torch::Device device,
+                         const std::vector<torch::Tensor> &fft_gg)
+{
+    std::ifstream ifs(kernel_file.c_str(), std::ios::in);
+
+    if (!ifs)
+    {
+        std::cout << " Can't find the kernel file " << kernel_file << std::endl;
+        exit(0);
+    }
+
+    std::cout << "Read WT kernel from " << kernel_file << std::endl;
+
+    int kineType = 0;
+    double kF_in = 0.;
+    double rho0_in = 0.;
+    int nq_in = 0;
+    double maxEta_in = 0.;
+
+    ifs >> kineType;
+    ifs >> kF_in;
+    ifs >> rho0_in;
+    ifs >> nq_in;
+
+    double *eta_in = new double[nq_in];
+    double *w0_in = new double[nq_in];
+
+    for (int iq = 0; iq < nq_in; ++iq)
+    {
+        ifs >> eta_in[iq] >> w0_in[iq];
+    }
+
+    maxEta_in = eta_in[nq_in-1];
+
+    double rho0 = 0.;
+    double tkF = 0.;
+    for (int id = 0; id < ndata; ++id)
+    {
+        rho0 = torch::sum(rho[id]).item<double>() / std::pow(fftdim, 3);
+        std::cout << "There are " << rho0 * volume[id] << " electrons in the " << cell[id] << " structure." << std::endl;
+        tkF = 2. * std::pow(3. * std::pow(M_PI, 2) * rho0, 1. / 3.);
+        this->kernel[id] = torch::zeros({fftdim, fftdim, fftdim}).to(device);
+
+        double eta = 0.;
+        double maxEta = 0.;
+        int ind1 = 0;
+        int ind2 = 0;
+        int ind_mid = 0;
+        double fac1 = 0.;
+        double fac2 = 0.;
+
+        for (int ix = 0; ix < fftdim; ++ix)
+        {
+            for (int iy = 0; iy < fftdim; ++iy)
+            {
+                for (int iz = 0; iz < fftdim; ++iz)
+                {
+                    eta = sqrt(fft_gg[id][ix][iy][iz].item<double>()) / tkF;
+                    eta = eta * this->scaling;
+                    maxEta = std::max(eta, maxEta);
+
+                    if (eta <= eta_in[0]) {
+                        this->kernel[id][ix][iy][iz] = w0_in[0];
+                    } else if (eta > maxEta_in) {
+                        this->kernel[id][ix][iy][iz] = w0_in[nq_in-1];
+                    } else {
+                        // binary search for the bracketing interval, then linear interpolation
+                        ind1 = 1;
+                        ind2 = nq_in;
+                        while (ind1 < ind2 - 1)
+                        {
+                            ind_mid = (ind1 + ind2)/2;
+                            if (eta > eta_in[ind_mid])
+                            {
+                                ind1 = ind_mid;
+                            }
+                            else
+                            {
+                                ind2 = ind_mid;
+                            }
+                        }
+                        fac1 = (eta_in[ind2] - eta)/(eta_in[ind2] - eta_in[ind1]);
+                        fac2 = (eta - eta_in[ind1])/(eta_in[ind2] - eta_in[ind1]);
+                        this->kernel[id][ix][iy][iz] = fac1 * w0_in[ind1] + fac2 * w0_in[ind2];
+                        // this->kernel[id][ix][iy][iz] *= std::pow(1. / this->scaling, 3);
+                    }
+                }
+            }
+        }
+        if (maxEta > maxEta_in) {
+            std::cout << "Please increase the maximal eta value in the KEDF kernel file" << std::endl;
+        }
+    }
+
+    delete[] eta_in;
+    delete[] w0_in;
+}
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_ofdft/ml_tools/kernel.h b/source/module_hamilt_pw/hamilt_ofdft/ml_tools/kernel.h
new file mode 100644
index 0000000000..7199f2a6ef
--- /dev/null
+++ b/source/module_hamilt_pw/hamilt_ofdft/ml_tools/kernel.h
@@ -0,0 +1,43 @@
+#ifndef KERNEL_H
+#define KERNEL_H
+
+#include <torch/torch.h>
+
+class Kernel
+{
+    // ------------ fill the kernel in reciprocal space ----------
+  public:
+    Kernel(){};
+
+    void set_para(const int kernel_type_in, const double scaling_in, const double yukawa_alpha_in, const std::string &kernel_file_in)
+    {
+        this->kernel_type = kernel_type_in;
+        this->scaling = scaling_in;
+        this->yukawa_alpha = yukawa_alpha_in;
+        this->kernel_file = kernel_file_in;
+    }
+
+    int kernel_type = 0; // 1: WT, 2: Yukawa, 3/4: tabulated kernel read from kernel_file
+    double scaling = 0.;
+    double yukawa_alpha = 0.;
+    std::string kernel_file = "none";
+    std::vector<torch::Tensor> kernel;
+
+    void fill_kernel(const int fftdim,
+                     const int ndata,
+                     const torch::Tensor &rho,
+                     const double *volume,
+                     const std::string *cell,
+                     const torch::Device device,
+                     const std::vector<torch::Tensor> &fft_gg);
+    double wt_kernel(double eta, double tf_weight = 1., double vw_weight = 1.);
+    double yukawa_kernel(double eta, double alpha);
+    void read_kernel(const int fftdim,
+                     const int ndata,
+                     const torch::Tensor &rho,
+                     const double *volume,
+                     const std::string *cell,
+                     const torch::Device device,
+                     const std::vector<torch::Tensor> &fft_gg);
+};
+#endif
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_ofdft/ml_tools/main.cpp b/source/module_hamilt_pw/hamilt_ofdft/ml_tools/main.cpp
new file mode 100644
index 0000000000..bfd6c4ff24
--- /dev/null
+++ b/source/module_hamilt_pw/hamilt_ofdft/ml_tools/main.cpp
@@ -0,0 +1,20 @@
+#include "./train_kedf.h"
+
+int main()
+{
+    torch::set_default_dtype(caffe2::TypeMeta::fromScalarType(torch::kDouble));
+    auto output = torch::get_default_dtype();
+    std::cout << "Default type: " << output << std::endl;
+
+    Train_KEDF train;
+    train.input.readInput();
+    if (train.input.check_pot)
+    {
+        train.potTest();
+    }
+    else
+    {
+        train.init();
+        train.train();
+    }
+}
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_ofdft/ml_tools/nn_of.cpp b/source/module_hamilt_pw/hamilt_ofdft/ml_tools/nn_of.cpp
new file mode 100644
index 0000000000..5aa8106958
--- /dev/null
+++ b/source/module_hamilt_pw/hamilt_ofdft/ml_tools/nn_of.cpp
@@ -0,0 +1,35 @@
+#include "nn_of.h"
+
+NN_OFImpl::NN_OFImpl(int nrxx, int nrxx_vali, int ninpt, int nnode, int nlayer, torch::Device device)
+{
+    this->nrxx = nrxx;
+    this->nrxx_vali = nrxx_vali;
+    this->ninpt = ninpt;
+    this->nnode = nnode;
+    std::cout << "nnode = " << this->nnode << std::endl;
+    this->nlayer = nlayer;
+    std::cout << "nlayer = " << this->nlayer << std::endl;
+    this->nfc = nlayer + 1;
+
+    this->inputs = torch::zeros({this->nrxx, this->ninpt}).to(device);
+    this->F = torch::zeros({this->nrxx, 1}).to(device);
+    if (nrxx_vali > 0) {
+        this->input_vali = torch::zeros({nrxx_vali, this->ninpt}).to(device);
+    }
+
+    fc1 = register_module("fc1", torch::nn::Linear(ninpt, nnode));
+    fc2 = register_module("fc2", torch::nn::Linear(nnode, nnode));
+    fc3 = register_module("fc3", torch::nn::Linear(nnode, nnode));
+    fc4 = register_module("fc4", torch::nn::Linear(nnode, 1));
+
+    this->to(device);
+}
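+// Fully connected network mapping the ninpt descriptors at each grid point to
+// the enhancement factor F: three tanh-activated hidden layers of nnode
+// neurons each, followed by a linear output layer (the nlayer/nfc fields are
+// stored, but the layer stack itself is fixed here).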
+
+torch::Tensor NN_OFImpl::forward(torch::Tensor inpt)
+{
+    inpt = torch::tanh(fc1->forward(inpt)); // convert data into (-1,1)
+    inpt = torch::tanh(fc2->forward(inpt));
+    inpt = torch::tanh(fc3->forward(inpt));
+    inpt = fc4->forward(inpt); // for feg = 3
+
+    return inpt;
+}
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_ofdft/ml_tools/nn_of.h b/source/module_hamilt_pw/hamilt_ofdft/ml_tools/nn_of.h
new file mode 100644
index 0000000000..6623ddcfc8
--- /dev/null
+++ b/source/module_hamilt_pw/hamilt_ofdft/ml_tools/nn_of.h
@@ -0,0 +1,57 @@
+#ifndef NN_OF_H
+#define NN_OF_H
+
+#include <torch/torch.h>
+
+struct NN_OFImpl:torch::nn::Module{
+    // three hidden layers and one output layer
+    NN_OFImpl(
+        int nrxx,
+        int nrxx_vali,
+        int ninpt,
+        int nnode,
+        int nlayer,
+        torch::Device device
+    );
+    ~NN_OFImpl()
+    {
+        // delete[] this->fcs;
+    };
+
+    template <class T>
+    void set_data(
+        T *data,
+        const std::vector<std::string> &descriptor_type,
+        const std::vector<int> &kernel_index,
+        torch::Tensor &nn_input
+    )
+    {
+        if (data->nx_tot <= 0) return;
+        for (int i = 0; i < descriptor_type.size(); ++i)
+        {
+            nn_input.index({"...", i}) = data->get_data(descriptor_type[i], kernel_index[i]);
+        }
+    }
+
+    torch::Tensor forward(torch::Tensor inpt);
+
+    // torch::nn::Linear fc1{nullptr}, fc2{nullptr}, fc3{nullptr}, fc4{nullptr}, fc5{nullptr};
+    // torch::nn::Linear fcs[5] = {fc1, fc2, fc3, fc4, fc5};
+
+    torch::nn::Linear fc1{nullptr}, fc2{nullptr}, fc3{nullptr}, fc4{nullptr};
+
+    torch::Tensor inputs;
+    torch::Tensor input_vali;
+    torch::Tensor F; // enhancement factor, output of NN
+
+    int nrxx = 10;
+    int nrxx_vali = 0;
+    int ninpt = 6;
+    int nnode = 10;
+    int nlayer = 3;
+    int nfc = 4;
+};
+TORCH_MODULE(NN_OF);
+
+#endif
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_ofdft/ml_tools/pauli_potential.cpp b/source/module_hamilt_pw/hamilt_ofdft/ml_tools/pauli_potential.cpp
new file mode 100644
index 0000000000..544beef8d9
--- /dev/null
+++ b/source/module_hamilt_pw/hamilt_ofdft/ml_tools/pauli_potential.cpp
@@ -0,0 +1,623 @@
+#include "./pauli_potential.h"
+
+void PauliPotential::init(const Input &input,
+                          const int ninput,
+                          const std::vector<std::string> &descriptor_type,
+                          const std::vector<int> &kernel_index)
+{
+    this->fftdim = input.fftdim;
+    std::cout << "descriptor_type " << descriptor_type << std::endl;
+    std::cout << "kernel_index " << kernel_index << std::endl;
+
+    // this->descriptor_type = descriptor_type;
+    // this->kernel_index = kernel_index;
+
+    this->chi_xi = input.chi_xi;
+    this->chi_p = input.chi_p;
+    this->chi_q = input.chi_q;
+    this->chi_pnl = input.chi_pnl;
+    this->chi_qnl = input.chi_qnl;
+
+    this->descriptor2kernel = {{"gamma", {}},
+                               {"p", {}},
+                               {"q", {}},
+                               {"tanhp", {}},
+                               {"tanhq", {}},
+                               {"gammanl", {}},
+                               {"pnl", {}},
+                               {"qnl", {}},
+                               {"xi", {}},
+                               {"tanhxi", {}},
+                               {"tanhxi_nl", {}},
+                               {"tanh_pnl", {}},
+                               {"tanh_qnl", {}},
+                               {"tanhp_nl", {}},
+                               {"tanhq_nl", {}}};
+    this->descriptor2index = this->descriptor2kernel;
+
+    for (int i = 0; i < ninput; ++i)
+    {
+        this->descriptor2kernel[descriptor_type[i]].push_back(kernel_index[i]);
+        std::cout << "this->descriptor2kernel[descriptor_type[i]] " << this->descriptor2kernel[descriptor_type[i]]
+                  << std::endl;
+        this->descriptor2index[descriptor_type[i]].push_back(i);
+        std::cout << "this->descriptor2index[descriptor_type[i]] " << this->descriptor2index[descriptor_type[i]]
+                  << std::endl;
+    }
+    std::cout << "descriptor2index " << descriptor2index << std::endl;
+    std::cout << "descriptor2kernel " << descriptor2kernel << std::endl;
+
+    this->ml_gamma = this->descriptor2index["gamma"].size() > 0;
+    this->ml_p = this->descriptor2index["p"].size() > 0;
+    this->ml_q = this->descriptor2index["q"].size() > 0;
+    this->ml_tanhp = this->descriptor2index["tanhp"].size() > 0;
+    this->ml_tanhq = this->descriptor2index["tanhq"].size() > 0;
+    this->ml_gammanl = this->descriptor2index["gammanl"].size() > 0;
+    this->ml_pnl = this->descriptor2index["pnl"].size() > 0;
+    this->ml_qnl = this->descriptor2index["qnl"].size() > 0;
+    this->ml_xi = this->descriptor2index["xi"].size() > 0;
+    this->ml_tanhxi = this->descriptor2index["tanhxi"].size() > 0;
+    this->ml_tanhxi_nl = this->descriptor2index["tanhxi_nl"].size() > 0;
+    this->ml_tanh_pnl = this->descriptor2index["tanh_pnl"].size() > 0;
+    this->ml_tanh_qnl = this->descriptor2index["tanh_qnl"].size() > 0;
+    this->ml_tanhp_nl = this->descriptor2index["tanhp_nl"].size() > 0;
+    this->ml_tanhq_nl = this->descriptor2index["tanhq_nl"].size() > 0;
+}
+
+torch::Tensor PauliPotential::get_potential(const int istru,
+                                            const Data &data,
+                                            const torch::Tensor &F,
+                                            const torch::Tensor &gradient,
+                                            const Kernel *kernels,
+                                            const Grid &grid)
+{
+    // Input::print("get potential begin");
+    this->istru = istru;
+    torch::Tensor potential = 5. / 3. * F;
+
+    // semi-local potential terms
+    if (this->ml_gamma) {
+        potential += this->potGammaTerm(data.gamma[istru], gradient);
+    }
+    if (this->ml_p) {
+        potential += this->potPTerm1(data.p[istru], gradient);
+    }
+    if (this->ml_q) {
+        potential += this->potQTerm1(data.q[istru], gradient);
+    }
+    if (this->ml_xi) {
+        potential += this->potXiTerm1(data.rho[istru], data.xi, gradient);
+    }
+    if (this->ml_tanhxi) {
+        potential += this->potTanhxiTerm1(data.rho[istru], data.xi, data.tanhxi, gradient);
+    }
+    if (this->ml_tanhp) {
+        potential += this->potTanhpTerm1(data.p[istru], data.tanhp[istru], gradient);
+    }
+    if (this->ml_tanhq) {
+        potential += this->potTanhqTerm1(data.q[istru], data.tanhq[istru], gradient);
+    }
+    potential *= data.tau_tf[istru] / data.rho[istru];
+
+    // non-local potential terms
+    if (this->ml_gammanl) {
+        potential += this->potGammanlTerm(data.rho[istru], data.gamma[istru], kernels, data.tau_tf[istru], gradient);
+    }
+    if (this->ml_p || this->ml_pnl) {
+        potential += this->potPPnlTerm(data.rho[istru],
+                                       data.nablaRho[istru],
+                                       data.p[istru],
+                                       kernels,
+                                       data.tau_tf[istru],
+                                       gradient,
+                                       grid.fft_grid[istru]);
+    }
+    if (this->ml_q || this->ml_qnl) {
+        potential += this->potQQnlTerm(data.rho[istru],
+                                       data.q[istru],
+                                       kernels,
+                                       data.tau_tf[istru],
+                                       gradient,
+                                       grid.fft_gg[istru]);
+    }
+    if (this->ml_xi) {
+        potential += this->potXinlTerm(data.rho[istru], kernels, data.tau_tf[istru], gradient);
+    }
+    if (this->ml_tanhxi) {
+        potential += this->potTanhxinlTerm(data.rho[istru], data.tanhxi, kernels, data.tau_tf[istru], gradient);
+    }
+    if (this->ml_tanhxi_nl) {
+        potential
+            += this->potTanhxi_nlTerm(data.rho[istru], data.xi, data.tanhxi, kernels, data.tau_tf[istru], gradient);
+    }
+    if ((this->ml_tanhp || this->ml_tanhp_nl) && !this->ml_tanh_pnl) {
+        potential += this->potTanhpTanhp_nlTerm(data.rho[istru],
+                                                data.nablaRho[istru],
+                                                data.p[istru],
+                                                data.tanhp[istru],
+                                                kernels,
+                                                data.tau_tf[istru],
+                                                gradient,
+                                                grid.fft_grid[istru]);
+    }
+    if ((this->ml_tanhq || this->ml_tanhq_nl) && !this->ml_tanh_qnl) {
+        potential += this->potTanhqTanhq_nlTerm(data.rho[istru],
+                                                data.q[istru],
+                                                data.tanhq[istru],
+                                                kernels,
+                                                data.tau_tf[istru],
+                                                gradient,
+                                                grid.fft_gg[istru]);
+    }
+    if (this->ml_tanh_pnl) {
+        potential += this->potTanhpTanh_pnlTerm(data.rho[istru],
+                                                data.nablaRho[istru],
+                                                data.p[istru],
+                                                data.tanhp[istru],
data.tanh_pnl, + kernels, + data.tau_tf[istru], + gradient, + grid.fft_grid[istru]); + } + if (this->ml_tanh_qnl) { + potential += this->potTanhqTanh_qnlTerm(data.rho[istru], + data.q[istru], + data.tanhq[istru], + data.tanh_qnl, + kernels, + data.tau_tf[istru], + gradient, + grid.fft_gg[istru]); + } + + // Input::print("get potential done"); + return potential; +} + +torch::Tensor PauliPotential::potGammaTerm(const torch::Tensor &gamma, const torch::Tensor &gradient) +{ + // std::cout << "potGammaTerm" << std::endl; + return 1. / 3. * gamma + * gradient.index({"...", this->descriptor2index["gamma"][0]}) + .reshape({this->fftdim, this->fftdim, this->fftdim}); +} + +torch::Tensor PauliPotential::potPTerm1(const torch::Tensor &p, const torch::Tensor &gradient) +{ + // std::cout << "potPTerm1" << std::endl; + return -8. / 3. * p + * gradient.index({"...", this->descriptor2index["p"][0]}) + .reshape({this->fftdim, this->fftdim, this->fftdim}); +} + +torch::Tensor PauliPotential::potQTerm1(const torch::Tensor &q, const torch::Tensor &gradient) +{ + // std::cout << "potQTerm1" << std::endl; + return -5. / 3. * q + * gradient.index({"...", this->descriptor2index["q"][0]}) + .reshape({this->fftdim, this->fftdim, this->fftdim}); +} + +torch::Tensor PauliPotential::potGammanlTerm(const torch::Tensor &rho, + const torch::Tensor &gamma, + const Kernel *kernels, + // const torch::Tensor &kernel, + const torch::Tensor &tauTF, + const torch::Tensor &gradient) +{ + // std::cout << "potGmmamnlTerm" << std::endl; + torch::Tensor result = torch::zeros_like(gamma); + for (int ik = 0; ik < this->descriptor2kernel["gammanl"].size(); ++ik) + { + int d2k = this->descriptor2kernel["gammanl"][ik]; + int d2i = this->descriptor2index["gammanl"][ik]; + result += 1. / 3. * gamma / rho + * torch::real( + torch::fft::ifftn(torch::fft::fftn(gradient.index({"...", d2i}) + .reshape({this->fftdim, this->fftdim, this->fftdim}) + * tauTF) + * kernels[d2k].kernel[istru])); + } + return result; +} + +torch::Tensor PauliPotential::potPPnlTerm(const torch::Tensor &rho, + const torch::Tensor &nablaRho, + const torch::Tensor &p, + const Kernel *kernels, + // const torch::Tensor &kernel, + const torch::Tensor &tauTF, + const torch::Tensor &gradient, + const std::vector &grid) +{ + // std::cout << "potPPnlTerm" << std::endl; + torch::Tensor dFdpnl_nl = torch::zeros_like(p); + for (int ik = 0; ik < this->descriptor2index["pnl"].size(); ++ik) + { + int d2k = this->descriptor2kernel["pnl"][ik]; + int d2i = this->descriptor2index["pnl"][ik]; + dFdpnl_nl + += torch::real(torch::fft::ifftn(torch::fft::fftn(gradient.index({"...", d2i}) + .reshape({this->fftdim, this->fftdim, this->fftdim}) + * tauTF) + * kernels[d2k].kernel[istru])); + } + + torch::Tensor temp = torch::zeros_like(nablaRho); + for (int i = 0; i < 3; ++i) + { + temp[i] = (this->ml_p) ? -3. / 20. + * gradient.index({"...", this->descriptor2index["p"][0]}) + .reshape({this->fftdim, this->fftdim, this->fftdim}) + * nablaRho[i] / rho * /*Ha to Ry*/ 2. + : torch::zeros_like(nablaRho[i]); + if (this->ml_pnl) { + temp[i] += -this->pqcoef * 2. * nablaRho[i] / torch::pow(rho, 8. / 3.) * dFdpnl_nl; + } + } + // std::cout << torch::slice(temp[0][0][0], 0, 0, 10); + torch::Tensor result = this->divergence(temp, grid); + + if (this->ml_pnl) { + result += -8. / 3. 
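// Non-local contributions evaluate the kernel convolution (w * f)(r) = \int w(r-r') f(r') dr'
// spectrally, as torch::fft::ifftn(torch::fft::fftn(f) * kernels[ik].kernel[istru]), where
// kernel[istru] holds the kernel tabulated in G space on the grid of structure istru.
// A minimal self-contained sketch of the same pattern (hypothetical 64^3 grid):
//   torch::Tensor f     = torch::rand({64, 64, 64});
//   torch::Tensor w_hat = torch::rand({64, 64, 64});          // kernel already in G space
//   torch::Tensor conv  = torch::real(torch::fft::ifftn(torch::fft::fftn(f) * w_hat));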
* p / rho * dFdpnl_nl; + } + // std::cout << torch::slice(result[0][0], 0, 20) << std::endl; + + // std::cout << "potPPnlTerm done" << std::endl; + return result; +} + +torch::Tensor PauliPotential::potQQnlTerm(const torch::Tensor &rho, + const torch::Tensor &q, + const Kernel *kernels, + // const torch::Tensor &kernel, + const torch::Tensor &tauTF, + const torch::Tensor &gradient, + const torch::Tensor &gg) +{ + // std::cout << "potQQnlTerm" << std::endl; + torch::Tensor dFdqnl_nl = torch::zeros_like(q); + for (int ik = 0; ik < this->descriptor2index["qnl"].size(); ++ik) + { + int d2k = this->descriptor2kernel["qnl"][ik]; + int d2i = this->descriptor2index["qnl"][ik]; + dFdqnl_nl + += torch::real(torch::fft::ifftn(torch::fft::fftn(gradient.index({"...", d2i}) + .reshape({this->fftdim, this->fftdim, this->fftdim}) + * tauTF) + * kernels[d2k].kernel[istru])); + } + + torch::Tensor temp = (this->ml_q) ? 3. / 40. + * gradient.index({"...", this->descriptor2index["q"][0]}) + .reshape({this->fftdim, this->fftdim, this->fftdim}) + * /*Ha2Ry*/ 2. + : torch::zeros_like(q); + if (this->ml_qnl) { + temp += this->pqcoef / torch::pow(rho, 5. / 3.) * dFdqnl_nl; + } + torch::Tensor result = this->Laplacian(temp, gg); + + if (this->ml_qnl) { + result += -5. / 3. * q / rho * dFdqnl_nl; + } + + // std::cout << "potQQnlTerm done" << std::endl; + return result; +} + +torch::Tensor PauliPotential::potXiTerm1(const torch::Tensor &rho, const std::vector<torch::Tensor> &xi, const torch::Tensor &gradient) +{ + torch::Tensor result = torch::zeros_like(rho); + for (int ik = 0; ik < this->descriptor2kernel["xi"].size(); ++ik) + { + int d2k = this->descriptor2kernel["xi"][ik]; + int d2i = this->descriptor2index["xi"][ik]; + result += -1. / 3. * xi[d2k][istru] + * gradient.index({"...", d2i}) + .reshape({this->fftdim, this->fftdim, this->fftdim}); + } + return result; +} + +torch::Tensor PauliPotential::potTanhxiTerm1(const torch::Tensor &rho, + const std::vector<torch::Tensor> &xi, + const std::vector<torch::Tensor> &tanhxi, + const torch::Tensor &gradient) +{ + torch::Tensor result = torch::zeros_like(rho); + for (int ik = 0; ik < this->descriptor2kernel["tanhxi"].size(); ++ik) + { + int d2k = this->descriptor2kernel["tanhxi"][ik]; + int d2i = this->descriptor2index["tanhxi"][ik]; + result += -1. / 3. * xi[d2k][istru] * this->dtanh(tanhxi[d2k][istru], this->chi_xi[d2k]) + * gradient.index({"...", d2i}).reshape({this->fftdim, this->fftdim, this->fftdim}); + } + return result; +} + +torch::Tensor PauliPotential::potTanhpTerm1(const torch::Tensor &p, + const torch::Tensor &tanhp, + const torch::Tensor &gradient) +{ + return -8. / 3. * p * this->dtanh(tanhp, this->chi_p) + * gradient.index({"...", this->descriptor2index["tanhp"][0]}) + .reshape({this->fftdim, this->fftdim, this->fftdim}); +} + +torch::Tensor PauliPotential::potTanhqTerm1(const torch::Tensor &q, + const torch::Tensor &tanhq, + const torch::Tensor &gradient) +{ + return -5. / 3. * q * this->dtanh(tanhq, this->chi_q) + * gradient.index({"...", this->descriptor2index["tanhq"][0]}) + .reshape({this->fftdim, this->fftdim, this->fftdim}); +} + +torch::Tensor PauliPotential::potXinlTerm(const torch::Tensor &rho, + const Kernel *kernels, + // const torch::Tensor &kernel, + const torch::Tensor &tauTF, + const torch::Tensor &gradient) +{ + torch::Tensor result = torch::zeros_like(rho); + for (int ik = 0; ik < this->descriptor2kernel["xi"].size(); ++ik) + { + int d2k = this->descriptor2kernel["xi"][ik]; + int d2i = this->descriptor2index["xi"][ik]; + result += 1. / 3. * torch::pow(rho, -2. / 3.)
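// For the non-local descriptor xi = (w * rho^{1/3}) / rho^{1/3}, the derivative splits
// into a local part, -(1/3) xi / rho (potXiTerm1() above), and this non-local part:
// convolve dF/dxi * tau_TF * rho^{-1/3} with the kernel, then scale by (1/3) rho^{-2/3}.
// The tanh-wrapped variants below insert dtanh() factors through the chain rule.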
+ * torch::real( + torch::fft::ifftn(torch::fft::fftn(gradient.index({"...", d2i}) + .reshape({this->fftdim, this->fftdim, this->fftdim}) + * tauTF * torch::pow(rho, -1. / 3.)) + * kernels[d2k].kernel[istru])); + } + return result; +} + +torch::Tensor PauliPotential::potTanhxinlTerm(const torch::Tensor &rho, + const std::vector &tanhxi, + const Kernel *kernels, + // const torch::Tensor &kernel, + const torch::Tensor &tauTF, + const torch::Tensor &gradient) +{ + torch::Tensor result = torch::zeros_like(rho); + + for (int ik = 0; ik < this->descriptor2kernel["tanhxi"].size(); ++ik) + { + int d2k = this->descriptor2kernel["tanhxi"][ik]; + int d2i = this->descriptor2index["tanhxi"][ik]; + result += 1. / 3. * torch::pow(rho, -2. / 3.) + * torch::real(torch::fft::ifftn( + torch::fft::fftn(gradient.index({"...", d2i}).reshape({this->fftdim, this->fftdim, this->fftdim}) + * this->dtanh(tanhxi[d2k][istru], this->chi_xi[d2k]) * tauTF + * torch::pow(rho, -1. / 3.)) + * kernels[d2k].kernel[istru])); + } + return result; +} + +torch::Tensor PauliPotential::potTanhxi_nlTerm(const torch::Tensor &rho, + const std::vector &xi, + const std::vector &tanhxi, + const Kernel *kernels, + // const torch::Tensor &kernel, + const torch::Tensor &tauTF, + const torch::Tensor &gradient) +{ + torch::Tensor result = torch::zeros_like(rho); + for (int ik = 0; ik < this->descriptor2kernel["tanhxi_nl"].size(); ++ik) + { + int d2k = this->descriptor2kernel["tanhxi_nl"][ik]; + int d2i = this->descriptor2index["tanhxi_nl"][ik]; + torch::Tensor dFdxi + = torch::real(torch::fft::ifftn( + torch::fft::fftn(tauTF + * gradient.index({"...", d2i}).reshape({this->fftdim, this->fftdim, this->fftdim})) + * kernels[d2k].kernel[istru])) + * this->dtanh(tanhxi[d2k][istru], this->chi_xi[d2k]) * torch::pow(rho, -1. / 3.); + // std::cout << "tanhxi\n" << torch::slice(tanhxi[d2k][istru][0][0], 0, 0, 10) << std::endl; + // std::cout << "gradient\n" << torch::slice(gradient, 0, 0, 10) << std::endl; + // std::cout << "kernel\n" << torch::slice(kernels[d2k].kernel[istru][0][0], 0, 0, 10) << std::endl; + result += 1. / 3. * torch::pow(rho, -2. / 3.) + * (-xi[d2k][istru] * dFdxi + + torch::real(torch::fft::ifftn(torch::fft::fftn(dFdxi) * kernels[d2k].kernel[istru]))); + } + return result; +} + +torch::Tensor PauliPotential::potTanhpTanh_pnlTerm(const torch::Tensor &rho, + const torch::Tensor &nablaRho, + const torch::Tensor &p, + const torch::Tensor &tanhp, + const std::vector &tanh_pnl, + const Kernel *kernels, + // const torch::Tensor &kernel, + const torch::Tensor &tauTF, + const torch::Tensor &gradient, + const std::vector &grid) +{ + torch::Tensor dFdpnl_nl = torch::zeros_like(tanhp); + for (int ik = 0; ik < this->descriptor2kernel["tanh_pnl"].size(); ++ik) + { + int d2k = this->descriptor2kernel["tanh_pnl"][ik]; + int d2i = this->descriptor2index["tanh_pnl"][ik]; + dFdpnl_nl += torch::real(torch::fft::ifftn( + torch::fft::fftn(gradient.index({"...", d2i}) + .reshape({this->fftdim, this->fftdim, this->fftdim}) + * this->dtanh(tanh_pnl[d2k][istru], this->chi_pnl[d2k]) + * tauTF) + * kernels[d2k].kernel[istru])); + } + + torch::Tensor temp = torch::zeros_like(nablaRho); + for (int i = 0; i < 3; ++i) + { + temp[i] = (this->ml_tanhp) ? -3. / 20. + * gradient.index({"...", this->descriptor2index["tanhp"][0]}) + .reshape({this->fftdim, this->fftdim, this->fftdim}) + * this->dtanh(tanhp, this->chi_p) * nablaRho[i] / rho * /*Ha to Ry*/ 2. + : torch::zeros_like(nablaRho[i]); + if (this->ml_tanh_pnl) { + temp[i] += -this->pqcoef * 2. 
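// Chain rule for the tanh descriptors: with tanhx = tanh(chi * x),
// d(tanhx)/dx = chi * (1 - tanh^2(chi * x)), which is exactly what dtanh(tanhx, chi)
// at the bottom of this file returns. The isolated factors of 2 marked /*Ha to Ry*/
// convert Hartree-based derivatives to the Rydberg units used elsewhere in ABACUS.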
* nablaRho[i] / torch::pow(rho, 8. / 3.) * dFdpnl_nl; + } + } + torch::Tensor result = this->divergence(temp, grid); + + if (this->ml_tanh_pnl) { + result += -8. / 3. * p / rho * dFdpnl_nl; + } + + return result; +} + +torch::Tensor PauliPotential::potTanhqTanh_qnlTerm(const torch::Tensor &rho, + const torch::Tensor &q, + const torch::Tensor &tanhq, + const std::vector &tanh_qnl, + const Kernel *kernels, + // const torch::Tensor &kernel, + const torch::Tensor &tauTF, + const torch::Tensor &gradient, + const torch::Tensor &gg) +{ + torch::Tensor dFdqnl_nl = torch::zeros_like(tanhq); + for (int ik = 0; ik < this->descriptor2kernel["tanh_qnl"].size(); ++ik) + { + int d2k = this->descriptor2kernel["tanh_qnl"][ik]; + int d2i = this->descriptor2index["tanh_qnl"][ik]; + dFdqnl_nl += torch::real(torch::fft::ifftn( + torch::fft::fftn(gradient.index({"...", d2i}) + .reshape({this->fftdim, this->fftdim, this->fftdim}) + * this->dtanh(tanh_qnl[d2k][istru], this->chi_qnl[d2k]) + * tauTF) + * kernels[d2k].kernel[istru])); + } + + torch::Tensor temp = (this->ml_tanhq) ? 3. / 40. + * gradient.index({"...", this->descriptor2index["tanhq"][0]}) + .reshape({this->fftdim, this->fftdim, this->fftdim}) + * this->dtanh(tanhq, this->chi_q) * /*Ha2Ry*/ 2. + : torch::zeros_like(q); + if (this->ml_tanh_qnl) { + temp += this->pqcoef / torch::pow(rho, 5. / 3.) * dFdqnl_nl; + } + torch::Tensor result = this->Laplacian(temp, gg); + + if (this->ml_tanh_qnl) { + result += -5. / 3. * q / rho * dFdqnl_nl; + } + + return result; +} + +torch::Tensor PauliPotential::potTanhpTanhp_nlTerm(const torch::Tensor &rho, + const torch::Tensor &nablaRho, + const torch::Tensor &p, + const torch::Tensor &tanhp, + const Kernel *kernels, + // const torch::Tensor &kernel, + const torch::Tensor &tauTF, + const torch::Tensor &gradient, + const std::vector &grid) +{ + torch::Tensor dFdpnl_nl = torch::zeros_like(tanhp); + for (int ik = 0; ik < this->descriptor2kernel["tanhp_nl"].size(); ++ik) + { + int d2k = this->descriptor2kernel["tanhp_nl"][ik]; + int d2i = this->descriptor2index["tanhp_nl"][ik]; + dFdpnl_nl += torch::real(torch::fft::ifftn( + torch::fft::fftn(gradient.index({"...", d2i}) + .reshape({this->fftdim, this->fftdim, this->fftdim}) + * tauTF) + * kernels[d2k].kernel[istru])) + * this->dtanh(tanhp, this->chi_p); + } + + torch::Tensor temp = torch::zeros_like(nablaRho); + for (int i = 0; i < 3; ++i) + { + temp[i] = (this->ml_tanhp) ? -3. / 20. + * gradient.index({"...", this->descriptor2index["tanhp"][0]}) + .reshape({this->fftdim, this->fftdim, this->fftdim}) + * this->dtanh(tanhp, this->chi_p) * nablaRho[i] / rho * /*Ha to Ry*/ 2. + : torch::zeros_like(nablaRho[i]); + if (this->ml_tanhp_nl) { + temp[i] += -this->pqcoef * 2. * nablaRho[i] / torch::pow(rho, 8. / 3.) * dFdpnl_nl; + } + } + torch::Tensor result = this->divergence(temp, grid); + + if (this->ml_tanhp_nl) { + result += -8. / 3. 
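// Terms that depend on nabla rho enter the potential as -div(de/d(nabla rho)): the
// vector field is accumulated in temp[0..2] and contracted in reciprocal space by
// divergence(). For a real field f, ifftn(fftn(f) * iG) = i * ifftn(fftn(f) * G), so
// taking -imag(ifftn(fftn(f) * G)) recovers the real-space derivative without ever
// materializing the imaginary unit (compare the commented-out variant kept inside
// divergence() below).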
* p / rho * dFdpnl_nl; + } + + return result; +} + +torch::Tensor PauliPotential::potTanhqTanhq_nlTerm(const torch::Tensor &rho, + const torch::Tensor &q, + const torch::Tensor &tanhq, + const Kernel *kernels, + // const torch::Tensor &kernel, + const torch::Tensor &tauTF, + const torch::Tensor &gradient, + const torch::Tensor &gg) +{ + torch::Tensor dFdqnl_nl = torch::zeros_like(tanhq); + for (int ik = 0; ik < this->descriptor2kernel["tanhq_nl"].size(); ++ik) + { + int d2k = this->descriptor2kernel["tanhq_nl"][ik]; + int d2i = this->descriptor2index["tanhq_nl"][ik]; + dFdqnl_nl += torch::real(torch::fft::ifftn( + torch::fft::fftn(gradient.index({"...", d2i}) + .reshape({this->fftdim, this->fftdim, this->fftdim}) + * tauTF) + * kernels[d2k].kernel[istru])) + * this->dtanh(tanhq, this->chi_q); + } + + torch::Tensor temp = (this->ml_tanhq) ? 3. / 40. + * gradient.index({"...", this->descriptor2index["tanhq"][0]}) + .reshape({this->fftdim, this->fftdim, this->fftdim}) + * this->dtanh(tanhq, this->chi_q) * /*Ha2Ry*/ 2. + : torch::zeros_like(q); + if (this->ml_tanhq_nl) { + temp += this->pqcoef / torch::pow(rho, 5. / 3.) * dFdqnl_nl; + } + torch::Tensor result = this->Laplacian(temp, gg); + + if (this->ml_tanhq_nl) { + result += -5. / 3. * q / rho * dFdqnl_nl; + } + + return result; +} + +torch::Tensor PauliPotential::divergence(const torch::Tensor &input, const std::vector &grid) +{ + torch::Tensor result = torch::zeros_like(input[0]); + // torch::Tensor img = torch::tensor({1.0j}); + // for (int i = 0; i < 3; ++i) + // { + // result += torch::real(torch::fft::ifftn(torch::fft::fftn(input[i]) * grid[i] * img)); + // } + for (int i = 0; i < 3; ++i) + { + result -= torch::imag(torch::fft::ifftn(torch::fft::fftn(input[i]) * grid[i])); + } + return result; +} + +torch::Tensor PauliPotential::Laplacian(const torch::Tensor &input, const torch::Tensor &gg) +{ + return torch::real(torch::fft::ifftn(torch::fft::fftn(input) * -gg)); +} + +torch::Tensor PauliPotential::dtanh(const torch::Tensor &tanhx, const double chi) +{ + return (torch::ones_like(tanhx) - tanhx * tanhx) * chi; + // return (1. 
- tanhx * tanhx) * chi; +} \ No newline at end of file diff --git a/source/module_hamilt_pw/hamilt_ofdft/ml_tools/pauli_potential.h b/source/module_hamilt_pw/hamilt_ofdft/ml_tools/pauli_potential.h new file mode 100644 index 0000000000..d5bd9b9c06 --- /dev/null +++ b/source/module_hamilt_pw/hamilt_ofdft/ml_tools/pauli_potential.h @@ -0,0 +1,203 @@ +#ifndef PAULI_POTENTIAL_H +#define PAULI_POTENTIAL_H + +#include +#include "./input.h" +#include "./data.h" +#include "./kernel.h" +#include "./grid.h" + +class PauliPotential{ + +public: + void init(const Input &input, const int ninput, const std::vector &descriptor_type, const std::vector &kernel_index); + + int fftdim = 0; + int istru = 0; + + std::map> descriptor2kernel = {}; + std::map> descriptor2index = {}; + + // semi-local descriptors + bool ml_gamma = false; + bool ml_p = false; + bool ml_q = false; + bool ml_tanhp = false; + bool ml_tanhq = false; + // non-local descriptors + bool ml_gammanl = false; + bool ml_pnl = false; + bool ml_qnl = false; + bool ml_xi = false; + bool ml_tanhxi = false; + bool ml_tanhxi_nl = false; + bool ml_tanh_pnl = false; + bool ml_tanh_qnl = false; + bool ml_tanhp_nl = false; + bool ml_tanhq_nl = false; + + double chi_p = 1.; + double chi_q = 1.; + double* chi_xi = nullptr; + double* chi_pnl = nullptr; + double* chi_qnl = nullptr; + + torch::Tensor get_potential( + const int istru, + const Data &data, + const torch::Tensor &F, + const torch::Tensor &gradient, + const Kernel *kernels, + const Grid &grid + ); +private: + + torch::Tensor potGammaTerm( + const torch::Tensor &gamma, + const torch::Tensor &gradient + ); + torch::Tensor potPTerm1( + const torch::Tensor &p, + const torch::Tensor &gradient + ); + torch::Tensor potQTerm1( + const torch::Tensor &q, + const torch::Tensor &gradient + ); + torch::Tensor potGammanlTerm( + const torch::Tensor &rho, + const torch::Tensor &gamma, + const Kernel *kernels, + // const torch::Tensor &kernel, + const torch::Tensor &tauTF, + const torch::Tensor &gradient + ); + torch::Tensor potPPnlTerm( + const torch::Tensor &rho, + const torch::Tensor &nablaRho, + const torch::Tensor &p, + const Kernel *kernels, + // const torch::Tensor &kernel, + const torch::Tensor &tauTF, + const torch::Tensor &gradient, + const std::vector &grid + ); + torch::Tensor potQQnlTerm( + const torch::Tensor &rho, + const torch::Tensor &q, + const Kernel *kernels, + // const torch::Tensor &kernel, + const torch::Tensor &tauTF, + const torch::Tensor &gradient, + const torch::Tensor &gg + ); + + + torch::Tensor potXiTerm1( + const torch::Tensor &rho, + const std::vector &xi, + const torch::Tensor &gradient + ); + torch::Tensor potTanhxiTerm1( + const torch::Tensor &rho, + const std::vector &xi, + const std::vector &tanhxi, + const torch::Tensor &gradient + ); + torch::Tensor potTanhpTerm1( + const torch::Tensor &p, + const torch::Tensor &tanhp, + const torch::Tensor &gradient + ); + torch::Tensor potTanhqTerm1( + const torch::Tensor &q, + const torch::Tensor &tanhq, + const torch::Tensor &gradient + ); + torch::Tensor potXinlTerm( + const torch::Tensor &rho, + const Kernel *kernels, + // const torch::Tensor &kernel, + const torch::Tensor &tauTF, + const torch::Tensor &gradient + ); + torch::Tensor potTanhxinlTerm( + const torch::Tensor &rho, + const std::vector &tanhxi, + const Kernel *kernels, + // const torch::Tensor &kernel, + const torch::Tensor &tauTF, + const torch::Tensor &gradient + ); + torch::Tensor potTanhxi_nlTerm( + const torch::Tensor &rho, + const std::vector &xi, + const std::vector 
<torch::Tensor> &tanhxi, + const Kernel *kernels, + // const torch::Tensor &kernel, + const torch::Tensor &tauTF, + const torch::Tensor &gradient + ); + torch::Tensor potTanhpTanh_pnlTerm( + const torch::Tensor &rho, + const torch::Tensor &nablaRho, + const torch::Tensor &p, + const torch::Tensor &tanhp, + const std::vector<torch::Tensor> &tanh_pnl, + const Kernel *kernels, + // const torch::Tensor &kernel, + const torch::Tensor &tauTF, + const torch::Tensor &gradient, + const std::vector<torch::Tensor> &grid + ); + torch::Tensor potTanhqTanh_qnlTerm( + const torch::Tensor &rho, + const torch::Tensor &q, + const torch::Tensor &tanhq, + const std::vector<torch::Tensor> &tanh_qnl, + const Kernel *kernels, + // const torch::Tensor &kernel, + const torch::Tensor &tauTF, + const torch::Tensor &gradient, + const torch::Tensor &gg + ); + torch::Tensor potTanhpTanhp_nlTerm( + const torch::Tensor &rho, + const torch::Tensor &nablaRho, + const torch::Tensor &p, + const torch::Tensor &tanhp, + const Kernel *kernels, + // const torch::Tensor &kernel, + const torch::Tensor &tauTF, + const torch::Tensor &gradient, + const std::vector<torch::Tensor> &grid + ); + torch::Tensor potTanhqTanhq_nlTerm( + const torch::Tensor &rho, + const torch::Tensor &q, + const torch::Tensor &tanhq, + const Kernel *kernels, + // const torch::Tensor &kernel, + const torch::Tensor &tauTF, + const torch::Tensor &gradient, + const torch::Tensor &gg + ); + + // Tools for getting potential + torch::Tensor divergence( + const torch::Tensor &input, + const std::vector<torch::Tensor> &grid + ); + torch::Tensor Laplacian( + const torch::Tensor &input, + const torch::Tensor &gg + ); + torch::Tensor dtanh( + const torch::Tensor &tanhx, + const double chi + ); + + const double cTF = 3.0/10.0 * std::pow(3*std::pow(M_PI, 2.0), 2.0/3.0) * 2; // 3/10*(3*pi^2)^{2/3}, multiply by 2 to convert unit from Hartree to Ry, finally in Ry*Bohr^(-2) + const double pqcoef = 1.0 / (4.0 * std::pow(3*std::pow(M_PI, 2.0), 2.0/3.0)); // coefficient of p and q +}; +#endif \ No newline at end of file diff --git a/source/module_hamilt_pw/hamilt_ofdft/ml_tools/train_kedf.cpp b/source/module_hamilt_pw/hamilt_ofdft/ml_tools/train_kedf.cpp new file mode 100644 index 0000000000..6613829be7 --- /dev/null +++ b/source/module_hamilt_pw/hamilt_ofdft/ml_tools/train_kedf.cpp @@ -0,0 +1,512 @@ +#include "./train_kedf.h" +#include <chrono> +#include <iomanip> +#include <sstream> + +Train_KEDF::~Train_KEDF() +{ + delete[] this->train_volume; + delete[] this->vali_volume; + delete[] this->kernel_train; + delete[] this->kernel_vali; +} + +void Train_KEDF::setUpFFT() +{ + this->train_volume = new double[this->input.ntrain]; + this->grid_train.initGrid( + this->input.fftdim, + this->input.ntrain, + this->input.train_cell, + this->input.train_a, + this->device, + this->train_volume + ); + this->kernel_train = new Kernel[this->input.nkernel]; + for (int ik = 0; ik < this->input.nkernel; ++ik) + { + this->kernel_train[ik].set_para(this->input.kernel_type[ik], this->input.kernel_scaling[ik], this->input.yukawa_alpha[ik], this->input.kernel_file[ik]); + this->kernel_train[ik].fill_kernel( + this->input.fftdim, + this->input.ntrain, + this->data_train.rho, + this->train_volume, + this->input.train_cell, + this->device, + this->grid_train.fft_gg + ); + } + if (this->input.nvalidation > 0){ + this->vali_volume = new double[this->input.nvalidation]; + this->grid_vali.initGrid( + this->input.fftdim, + this->input.nvalidation, + this->input.validation_cell, + this->input.validation_a, + this->device, + this->vali_volume + ); + this->kernel_vali = new Kernel[this->input.nkernel]; + for (int ik = 0; ik <
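// Each kernel is tabulated once per structure: fill_kernel() evaluates the kernel in
// G space on that structure's reciprocal grid, with kernel_type selecting the analytic
// form (1: Wang-Teter, 2: Yukawa with yukawa_alpha, 3: TKK read from kernel_file) and
// kernel_scaling rescaling the kernel argument (see the of_ml_kernel_scaling
// annotation later in this patch).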
this->input.nkernel; ++ik) + { + this->kernel_vali[ik].set_para(this->input.kernel_type[ik], this->input.kernel_scaling[ik], this->input.yukawa_alpha[ik], this->input.kernel_file[ik]); + this->kernel_vali[ik].fill_kernel( + this->input.fftdim, + this->input.nvalidation, + this->data_vali.rho, + this->vali_volume, + this->input.validation_cell, + this->device, + this->grid_vali.fft_gg + ); + } + } + + // this->dumpTensor(this->fft_kernel_train[0].reshape({this->data_train.nx}), "kernel_fcc.npy", this->data_train.nx); + // this->dumpTensor(this->fft_kernel_vali[0].reshape({this->data_train.nx}), "kernel_bcc.npy", this->data_train.nx); +} + +void Train_KEDF::set_device() +{ + if (this->input.device_type == "cpu") + { + std::cout << "---------- Running on CPU ----------" << std::endl; + this->device = torch::Device(torch::kCPU); + } + else if (this->input.device_type == "gpu") + { + if (torch::cuda::cudnn_is_available()) + { + std::cout << "---------- Running on GPU ----------" << std::endl; + this->device = torch::Device(torch::kCUDA); + } + else + { + std::cout << "---- Warning: GPU is unavailable -----" << std::endl; + std::cout << "---------- Running on CPU ----------" << std::endl; + this->device = torch::Device(torch::kCPU); + } + } +} + +void Train_KEDF::init_input_index() +{ + this->ninput = 0; + + // --------- semi-local descriptors --------- + if (this->input.ml_gamma){ + this->descriptor_type.push_back("gamma"); + this->kernel_index.push_back(-1); + ninput++; + } + if (this->input.ml_p){ + this->descriptor_type.push_back("p"); + this->kernel_index.push_back(-1); + ninput++; + } + if (this->input.ml_q){ + this->descriptor_type.push_back("q"); + this->kernel_index.push_back(-1); + ninput++; + } + // --------- non-local descriptors --------- + for (int ik = 0; ik < this->input.nkernel; ++ik) + { + if (this->input.ml_gammanl[ik]){ + this->descriptor_type.push_back("gammanl"); + this->kernel_index.push_back(ik); + this->ninput++; + } + if (this->input.ml_pnl[ik]){ + this->descriptor_type.push_back("pnl"); + this->kernel_index.push_back(ik); + this->ninput++; + } + if (this->input.ml_qnl[ik]){ + this->descriptor_type.push_back("qnl"); + this->kernel_index.push_back(ik); + this->ninput++; + } + if (this->input.ml_xi[ik]){ + this->descriptor_type.push_back("xi"); + this->kernel_index.push_back(ik); + this->ninput++; + } + if (this->input.ml_tanhxi[ik]){ + this->descriptor_type.push_back("tanhxi"); + this->kernel_index.push_back(ik); + this->ninput++; + } + if (this->input.ml_tanhxi_nl[ik]){ + this->descriptor_type.push_back("tanhxi_nl"); + this->kernel_index.push_back(ik); + this->ninput++; + } + } + // --------- semi-local descriptors --------- + if (this->input.ml_tanhp){ + this->descriptor_type.push_back("tanhp"); + this->kernel_index.push_back(-1); + ninput++; + } + if (this->input.ml_tanhq){ + this->descriptor_type.push_back("tanhq"); + this->kernel_index.push_back(-1); + ninput++; + } + // --------- non-local descriptors --------- + for (int ik = 0; ik < this->input.nkernel; ++ik) + { + if (this->input.ml_tanh_pnl[ik]){ + this->descriptor_type.push_back("tanh_pnl"); + this->kernel_index.push_back(ik); + this->ninput++; + } + if (this->input.ml_tanh_qnl[ik]){ + this->descriptor_type.push_back("tanh_qnl"); + this->kernel_index.push_back(ik); + this->ninput++; + } + if (this->input.ml_tanhp_nl[ik]){ + this->descriptor_type.push_back("tanhp_nl"); + this->kernel_index.push_back(ik); + this->ninput++; + } + if (this->input.ml_tanhq_nl[ik]){ + this->descriptor_type.push_back("tanhq_nl");
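// Layout produced by these loops for the CPN5 preset (nkernel = 5; tanhxi, tanhxi_nl
// and tanhp_nl on every kernel plus the semi-local tanhp): ninput = 16, with the NN
// input columns ordered
//   [tanhxi(k0), tanhxi_nl(k0), tanhxi(k1), tanhxi_nl(k1), ..., tanhxi(k4),
//    tanhxi_nl(k4), tanhp, tanhp_nl(k0), ..., tanhp_nl(k4)].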
this->kernel_index.push_back(ik); + this->ninput++; + } + } + + std::cout << "ninput = " << ninput << std::endl; + + if (this->input.feg_limit != 0) + { + this->feg_inpt = torch::zeros(ninput).to(device); + for (int i = 0; i < this->ninput; ++i) + { + if (this->descriptor_type[i] == "gamma") this->feg_inpt[i] = 1.; + } + + this->feg_predict = torch::zeros(1).to(device); + this->feg_dFdgamma = torch::zeros(1).to(device); + } + std::cout << "feg_limit = " << this->input.feg_limit << std::endl; +} + +void Train_KEDF::init() +{ + this->set_device(); + this->init_input_index(); + this->data_train.load_data(this->input, this->input.ntrain, this->input.train_dir, this->device); + this->data_vali.load_data(this->input, this->input.nvalidation, this->input.validation_dir, this->device); + // Input::print("LOAD DATA done"); + + this->potential.init(this->input, this->ninput, this->descriptor_type, this->kernel_index); + // Input::print("init potential done"); + if (this->input.loss == "potential" || this->input.loss == "both" || this->input.loss == "both_new") this->setUpFFT(); + // Input::print("init fft done"); + + this->nn = std::make_shared(this->data_train.nx_tot, this->data_vali.nx_tot, this->ninput, this->input.nnode, this->input.nlayer, this->device); + // Input::print("init_nn done"); + this->nn->set_data(&(this->data_train), this->descriptor_type, this->kernel_index, this->nn->inputs); + this->nn->set_data(&(this->data_vali), this->descriptor_type, this->kernel_index, this->nn->input_vali); +} + +torch::Tensor Train_KEDF::lossFunction(torch::Tensor enhancement, torch::Tensor target, torch::Tensor coef) +{ + return torch::sum(torch::pow(enhancement - target, 2))/this->data_train.nx/coef/coef; +} + +torch::Tensor Train_KEDF::lossFunction_new(torch::Tensor enhancement, torch::Tensor target, torch::Tensor weight, torch::Tensor coef) +{ + return torch::sum(torch::pow(weight * (enhancement - target), 2.))/this->data_train.nx/coef/coef; +} + + +void Train_KEDF::train() +{ + // time + double tot = 0.; + double totF = 0.; + double totFback = 0.; + double totLoss = 0.; + double totLback = 0.; + double totP = 0.; + double totStep = 0.; + std::chrono::_V2::system_clock::time_point start, startF, startFB, startL, startLB, startP, startStep, end, endF, endFB, endL, endLB, endP, endStep; + + start = std::chrono::high_resolution_clock::now(); + + std::cout << "========== Train_KEDF begin ==========" << std::endl; + // torch::Tensor target = (this->input.loss=="energy") ? 
this->data_train.enhancement : this->data_train.pauli; + if (this->input.loss == "potential" || this->input.loss == "both" || this->input.loss == "both_new") + { + this->data_train.pauli.resize_({this->input.ntrain, this->input.fftdim, this->input.fftdim, this->input.fftdim}); + } + + torch::optim::SGD optimizer(this->nn->parameters(), this->input.lr_start); + double update_coef = this->input.nepoch/std::log(this->input.lr_end/this->input.lr_start); // used to reduce the learning rate + + std::cout << "Epoch\tLoss\tValidation\tLoss_pot\tLoss_E\tLoss_FEG_pot\tLoss_FEG_E\n"; + double lossTrain = 0.; + double lossPot = 0.; + double lossE = 0.; + double lossFEG_pot = 0.; + double lossFEG_E = 0.; + double lossVali = 0.; + double maxLoss = 100.; + + // bool increase_coef_feg_e = false; + torch::Tensor weight = torch::pow(this->data_train.rho, this->input.exponent/3.); + for (size_t epoch = 1; epoch <= this->input.nepoch; ++epoch) + { + for (int batch_index = 0; batch_index < this->input.ntrain; ++batch_index) + { + startF = std::chrono::high_resolution_clock::now(); + + optimizer.zero_grad(); + if (this->input.loss == "energy") + { + torch::Tensor inpt = torch::slice(this->nn->inputs, 0, batch_index*this->data_train.nx, (batch_index + 1)*this->data_train.nx); + inpt.requires_grad_(true); + torch::Tensor prediction = this->nn->forward(inpt); + startL = std::chrono::high_resolution_clock::now(); + torch::Tensor loss = this->lossFunction(prediction, torch::slice(this->data_train.enhancement, 0, batch_index*this->data_train.nx, (batch_index + 1)*this->data_train.nx), this->data_train.enhancement_mean[batch_index]) * this->input.coef_e; + lossTrain = loss.item(); + endL = std::chrono::high_resolution_clock::now(); + totLoss += double(std::chrono::duration_cast(endL - startL).count()) * std::chrono::microseconds::period::num / std::chrono::microseconds::period::den; + + startLB = std::chrono::high_resolution_clock::now(); + loss.backward(); + endLB = std::chrono::high_resolution_clock::now(); + totLback += double(std::chrono::duration_cast(endLB - startLB).count()) * std::chrono::microseconds::period::num / std::chrono::microseconds::period::den; + } + else if (this->input.loss == "potential" || this->input.loss == "both" || this->input.loss == "both_new") + { + torch::Tensor inpt = torch::slice(this->nn->inputs, 0, batch_index*this->data_train.nx, (batch_index + 1)*this->data_train.nx); + inpt.requires_grad_(true); + torch::Tensor prediction = this->nn->forward(inpt); + + if (this->input.feg_limit != 3) + { + prediction = torch::softplus(prediction); + } + if (this->input.feg_limit != 0) + { + // if (this->feg_inpt.grad().numel()) this->feg_inpt.grad().zero_(); + this->feg_predict = this->nn->forward(this->feg_inpt); + // if (this->input.ml_gamma) this->feg_dFdgamma = torch::autograd::grad({this->feg_predict}, {this->feg_inpt}, + // {torch::ones_like(this->feg_predict)}, true, true)[0][this->nn_input_index["gamma"]]; + if (this->input.feg_limit == 1) prediction = prediction - torch::softplus(this->feg_predict) + 1.; + if (this->input.feg_limit == 3 && epoch < this->input.change_step) prediction = torch::softplus(prediction); + if (this->input.feg_limit == 3 && epoch >= this->input.change_step) prediction = torch::softplus(prediction - this->feg_predict + this->feg3_correct); + } + endF = std::chrono::high_resolution_clock::now(); + totF += double(std::chrono::duration_cast(endF - startF).count()) * std::chrono::microseconds::period::num / std::chrono::microseconds::period::den; + + startFB = 
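// Free-electron-gas (FEG) limit: for feg_limit == 3 (after change_step) the output is
//   F = softplus(NN(x) - NN(x_FEG) + feg3_correct),  feg3_correct = ln(e - 1),
// so on the FEG input the argument reduces to ln(e - 1) and
// softplus(ln(e - 1)) = ln(1 + (e - 1)) = 1: the enhancement factor is pinned to the
// Thomas-Fermi value F = 1 for the uniform gas. feg_limit == 1 enforces the same limit
// additively, F - softplus(F_FEG) + 1.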
std::chrono::high_resolution_clock::now(); + torch::Tensor gradient = torch::autograd::grad({prediction}, {inpt}, + {torch::ones_like(prediction)}, true, true)[0]; + endFB = std::chrono::high_resolution_clock::now(); + totFback += double(std::chrono::duration_cast(endFB - startFB).count()) * std::chrono::microseconds::period::num / std::chrono::microseconds::period::den; + + startL = std::chrono::high_resolution_clock::now(); + torch::Tensor pot = this->potential.get_potential(batch_index, this->data_train, prediction.reshape({this->input.fftdim, this->input.fftdim, this->input.fftdim}), gradient, this->kernel_train, this->grid_train); + torch::Tensor loss = this->lossFunction(pot, this->data_train.pauli[batch_index], this->data_train.pauli_mean[batch_index]) + * this->input.coef_p; + lossPot = loss.item(); + if (this->input.loss == "both") + { + loss = loss + this->input.coef_e * this->lossFunction(prediction, torch::slice(this->data_train.enhancement, 0, batch_index*this->data_train.nx, (batch_index + 1)*this->data_train.nx), this->data_train.enhancement_mean[batch_index]); + lossE = loss.item() - lossPot; + } + if (this->input.loss == "both_new") + { + loss = loss + this->input.coef_e * this->lossFunction_new(prediction, torch::slice(this->data_train.enhancement, 0, batch_index*this->data_train.nx, (batch_index + 1)*this->data_train.nx), weight[batch_index].reshape({this->data_train.nx, 1}), this->data_train.tau_mean[batch_index]); + lossE = loss.item() - lossPot; + } + if (this->input.feg_limit != 0) + { + if (this->input.feg_limit == 1 || this->input.feg_limit == 2) + { + loss = loss + torch::pow(this->feg_predict - 1., 2) * this->input.coef_feg_e; + lossFEG_E = loss.item() - (lossPot + lossE + lossFEG_pot); + // if (lossFEG_E/lossE < 1e-3 && increase_coef_feg_e == false) + // { + // this->input.coef_feg_e *= 2.; + // increase_coef_feg_e = true; + // std::cout << "---------ICREASE COEF FEG E--------" << std::endl; + // } + } + if (this->input.feg_limit == 3) + { + loss = loss + torch::pow(this->feg_predict - this->feg3_correct, 2) * this->input.coef_feg_e; + lossFEG_E = loss.item() - (lossPot + lossE + lossFEG_pot); + } + } + + lossTrain = loss.item(); + endL = std::chrono::high_resolution_clock::now(); + totLoss += double(std::chrono::duration_cast(endL - startL).count()) * std::chrono::microseconds::period::num / std::chrono::microseconds::period::den; + + startLB = std::chrono::high_resolution_clock::now(); + loss.backward(); + endLB = std::chrono::high_resolution_clock::now(); + totLback += double(std::chrono::duration_cast(endLB - startLB).count()) * std::chrono::microseconds::period::num / std::chrono::microseconds::period::den; + // this->dumpTensor(pot.reshape({this->data_train.nx}), "pot_fcc.npy", this->data_train.nx); + // this->dumpTensor(torch::slice(prediction, 0, batch_index*this->data_train.nx, (batch_index + 1)*this->data_train.nx).reshape({this->data_train.nx}), "F_fcc.npy", this->data_train.nx); + } + + startP = std::chrono::high_resolution_clock::now(); + if (epoch % this->input.print_fre == 0) { + if (this->input.nvalidation > 0) + { + torch::Tensor valid_pre = this->nn->forward(this->nn->input_vali); + if (this->input.feg_limit == 3) + { + valid_pre = torch::softplus(valid_pre - this->feg_predict + this->feg3_correct); + } + lossVali = this->lossFunction(valid_pre, this->data_vali.enhancement, this->data_vali.enhancement_mean).item(); + } + std::cout << std::setiosflags(std::ios::scientific) << std::setprecision(3) << epoch + << std::setw(12) << lossTrain + << 
std::setw(12) << lossVali + << std::setw(12) << lossPot + << std::setw(12) << lossE + << std::setw(12) << lossFEG_pot + << std::setw(12) << lossFEG_E + << std::endl; + } + + if (lossTrain > maxLoss){ + std::cout << "ERROR: too large loss: " << lossTrain << std::endl; + exit(0); + } + endP = std::chrono::high_resolution_clock::now(); + totP += double(std::chrono::duration_cast(endP - startP).count()) * std::chrono::microseconds::period::num / std::chrono::microseconds::period::den; + + startStep = std::chrono::high_resolution_clock::now(); + optimizer.step(); + endStep = std::chrono::high_resolution_clock::now(); + totStep += double(std::chrono::duration_cast(endStep - startStep).count()) * std::chrono::microseconds::period::num / std::chrono::microseconds::period::den; + } + if (epoch % this->input.dump_fre == 0) + { + std::stringstream file; + file << "model/net" << epoch << ".pt"; + torch::save(this->nn, file.str()); + } + // Reduce the learning_rate + if (epoch % this->input.lr_fre == 0) + { + for (auto &group : optimizer.param_groups()) + { + if(group.has_options()) + { + auto &options = static_cast(group.options()); + options.lr(this->input.lr_start * std::exp(epoch/update_coef)); + } + } + } + } + end = std::chrono::high_resolution_clock::now(); + + tot = double(std::chrono::duration_cast(end - start).count()) * std::chrono::microseconds::period::num / std::chrono::microseconds::period::den; + + std::cout << "=========== Done ============" << std::endl; + std::cout << std::setprecision(2) << std::setiosflags(std::ios::fixed) << "Item\t\t\tTime (s)\tPercentage" << std::endl; + std::cout.unsetf(std::ios::scientific); + std::cout << "Total\t\t\t" << tot << "\t\t" << tot/tot * 100. << " %" << std::endl; + std::cout << "Forward\t\t\t" << totF << "\t\t" << totF/tot * 100. << " %" << std::endl; + std::cout << "Enhancement back\t" << totFback << "\t\t" << totFback/tot * 100. << " %" << std::endl; + std::cout << "Loss function\t\t" << totLoss << "\t\t" << totLoss/tot * 100. << " %" << std::endl; + std::cout << "Loss backward\t\t" << totLback << "\t\t" << totLback/tot * 100. << " %" << std::endl; + std::cout << "Print\t\t\t" << totP << "\t\t" << totP/tot * 100. << " %" << std::endl; + std::cout << "Step\t\t\t" << totStep << "\t\t" << totStep/tot * 100. 
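// Learning-rate schedule: with update_coef = nepoch / ln(lr_end / lr_start),
//   lr(epoch) = lr_start * exp(epoch / update_coef) = lr_start * (lr_end / lr_start)^(epoch / nepoch),
// a geometric interpolation from lr_start down to exactly lr_end at epoch = nepoch
// (lr_end < lr_start makes update_coef negative, hence the decay).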
<< " %" << std::endl; +} + +void Train_KEDF::potTest() +{ + this->set_device(); + this->init_input_index(); + this->data_train.load_data(this->input, this->input.ntrain, this->input.train_dir, this->device); + this->data_vali.load_data(this->input, this->input.nvalidation, this->input.validation_dir, this->device); + Input::print("LOAD DATA done"); + + this->potential.init(this->input, this->ninput, this->descriptor_type, this->kernel_index); + Input::print("init potential done"); + if (this->input.loss == "potential" || this->input.loss == "both" || this->input.loss == "both_new") this->setUpFFT(); + Input::print("init fft done"); + + std::chrono::_V2::system_clock::time_point start, end; + this->nn = std::make_shared(this->data_train.nx_tot, this->data_vali.nx_tot, this->ninput, this->input.nnode, this->input.nlayer, this->device); + torch::DeviceType device_type = torch::kCPU; + torch::load(this->nn, "net.pt", device_type); + // this->nn->to(this->device); + Input::print("init_nn done"); + this->nn->set_data(&(this->data_train), this->descriptor_type, this->kernel_index, this->nn->inputs); + this->nn->set_data(&(this->data_vali), this->descriptor_type, this->kernel_index, this->nn->input_vali); + + this->nn->inputs.requires_grad_(true); + + if (this->input.loss == "potential" || this->input.loss == "both" || this->input.loss == "both_new") this->data_train.pauli.resize_({this->input.ntrain, this->input.fftdim, this->input.fftdim, this->input.fftdim}); + + for (int batch_index = 0; batch_index < this->input.ntrain; ++batch_index) + { + for (int ii = 0; ii < 1; ++ii) + { + torch::Tensor inpts = torch::slice(this->nn->inputs, 0, ii*this->data_train.nx, (ii + 1)*this->data_train.nx); + inpts.requires_grad_(true); + torch::Tensor prediction = this->nn->forward(inpts); + if (this->input.feg_limit != 3) + { + prediction = torch::softplus(prediction); + } + if (this->input.feg_limit != 0) + { + // if (this->input.ml_gamma) if (this->feg_inpt[this->nn_input_index["gamma"]].grad().numel()) this->feg_inpt[this->nn_input_index["gamma"]].grad().zero_(); + if (this->feg_inpt.grad().numel()) this->feg_inpt.grad().zero_(); + this->feg_predict = this->nn->forward(this->feg_inpt); + // if (this->input.ml_gamma) this->feg_dFdgamma = torch::autograd::grad({this->feg_predict}, {this->feg_inpt[this->nn_input_index["gamma"]]}, + // {torch::ones(1)}, true, true)[0]; + // if (this->input.ml_gamma) this->feg_dFdgamma = torch::autograd::grad({this->feg_predict}, {this->feg_inpt}, + // {torch::ones_like(this->feg_predict)}, true, true)[0][this->nn_input_index["gamma"]]; + if (this->input.feg_limit == 1) prediction = prediction - torch::softplus(this->feg_predict) + 1.; + if (this->input.feg_limit == 3) prediction = torch::softplus(prediction - this->feg_predict + this->feg3_correct); + } + + start = std::chrono::high_resolution_clock::now(); + torch::Tensor gradient = torch::autograd::grad({prediction}, {inpts}, + {torch::ones_like(prediction)}, true, true)[0]; + end = std::chrono::high_resolution_clock::now(); + std::cout << "spend " << double(std::chrono::duration_cast(end - start).count()) * std::chrono::microseconds::period::num / std::chrono::microseconds::period::den << " s" << std::endl; + + std::cout << "begin potential" << std::endl; + torch::Tensor weight = torch::pow(this->data_train.rho, this->input.exponent/3.); + + torch::Tensor pot = this->potential.get_potential(ii, this->data_train, torch::slice(prediction, 0, ii*this->data_train.nx, (ii + 1)*this->data_train.nx).reshape({this->input.fftdim, 
this->input.fftdim, this->input.fftdim}), gradient, this->kernel_train, this->grid_train); + std::cout << "after potential" << std::endl; + + torch::Tensor loss = this->lossFunction(pot, this->data_train.pauli[ii], this->data_train.pauli_mean[ii]) * this->input.coef_p; + if (this->input.loss == "both") + { + loss = loss + this->input.coef_e * this->lossFunction(prediction, torch::slice(this->data_train.enhancement, 0, ii*this->data_train.nx, (ii + 1)*this->data_train.nx), this->data_train.enhancement_mean[ii]); + } + if (this->input.loss == "both_new") + { + loss = loss + this->input.coef_e * this->lossFunction_new(prediction, torch::slice(this->data_train.enhancement, 0, ii*this->data_train.nx, (ii + 1)*this->data_train.nx), weight[ii].reshape({this->data_train.nx, 1}), this->data_train.tau_mean[ii]); + } + std::cout << "after loss" << std::endl; + // loss = loss + this->input.coef_e * this->lossFunction(prediction, torch::slice(this->data_train.enhancement, 0, ii*this->data_train.nx, (ii + 1)*this->data_train.nx)); + double lossTrain = loss.item<double>(); + std::cout << "loss = " << lossTrain << std::endl; + this->data_train.dumpTensor(pot.reshape({this->data_train.nx}), "potential-nnof.npy", this->data_train.nx); + this->data_train.dumpTensor(torch::slice(prediction, 0, ii*this->data_train.nx, (ii + 1)*this->data_train.nx).reshape({this->data_train.nx}), "enhancement-nnof.npy", this->data_train.nx); + // this->dumpTensor(torch::slice(this->nn->F, 0, ii*this->data_train.nx, (ii + 1)*this->data_train.nx).reshape({this->data_train.nx}), "F_fcc.npy", this->data_train.nx); + } + // std::cout << std::setiosflags(std::ios::scientific) << std::setprecision(12) << this->nn->parameters() << std::endl; + } +} + diff --git a/source/module_hamilt_pw/hamilt_ofdft/ml_tools/train_kedf.h b/source/module_hamilt_pw/hamilt_ofdft/ml_tools/train_kedf.h new file mode 100644 index 0000000000..f744de00df --- /dev/null +++ b/source/module_hamilt_pw/hamilt_ofdft/ml_tools/train_kedf.h @@ -0,0 +1,89 @@ +#ifndef TRAIN_KEDF_H +#define TRAIN_KEDF_H + +#include "./data.h" +#include "./grid.h" +#include "./input.h" +#include "./kernel.h" +#include "./nn_of.h" +#include "./pauli_potential.h" + +#include <torch/torch.h> + +class Train_KEDF +{ + public: + Train_KEDF(){}; + ~Train_KEDF(); + + std::shared_ptr<NN_OFImpl> nn; + Input input; + Grid grid_train; + Grid grid_vali; + Kernel *kernel_train = nullptr; + Kernel *kernel_vali = nullptr; + PauliPotential potential; + //----------- training set ----------- + Data data_train; + double *train_volume = nullptr; + //---------validation set ------------ + Data data_vali; + double *vali_volume = nullptr; + // ------------------------------------ + + torch::Device device = torch::Device(torch::kCUDA); + int ninput = 0; + std::vector<std::string> descriptor_type = {}; + std::vector<int> kernel_index = {}; + + // -------- free electron gas --------- + torch::Tensor feg_inpt; + torch::Tensor feg_predict; + torch::Tensor feg_dFdgamma; + + // ----------- constants --------------- + double feg3_correct = 0.541324854612918; // ln(e - 1) + const double cTF + = 3.0 / 10.0 * std::pow(3 * std::pow(M_PI, 2.0), 2.0 / 3.0) + * 2; // 3/10*(3*pi^2)^{2/3}, multiply by 2 to convert unit from Hartree to Ry, finally in Ry*Bohr^(-2) + const double pqcoef = 1.0 / (4.0 * std::pow(3 * std::pow(M_PI, 2.0), 2.0 / 3.0)); // coefficient of p and q + + void train(); + void potTest(); + void setUpFFT(); + void set_device(); + void init_input_index(); + void init(); + + torch::Tensor lossFunction(torch::Tensor enhancement, torch::Tensor target, torch::Tensor
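// Loss conventions used above: lossFunction() is a mean-squared error normalized by
// coef^2, i.e. sum((F - F_target)^2) / nx / coef^2, with coef taken per structure
// (enhancement_mean, pauli_mean or tau_mean) so structures of different magnitude
// contribute comparably; lossFunction_new() additionally weights the residual by
// rho^(exponent/3) before squaring.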
coef = torch::ones(1)); + torch::Tensor lossFunction_new(torch::Tensor enhancement, + torch::Tensor target, + torch::Tensor weight, + torch::Tensor coef = torch::ones(1)); +}; + +// class OF_data : public torch::data::Dataset +// { +// private: +// torch::Tensor input; +// torch::Tensor target; + +// public: +// explicit OF_data(torch::Tensor &input, torch::Tensor &target) +// { +// this->input = input.clone(); +// this->target = target.clone(); +// } + +// torch::data::Example<> get(size_t index) override +// { +// return {this->input[index], this->target[index]}; +// } + +// torch::optional size() const override +// { +// return this->input.size(0); +// } +// }; + +#endif \ No newline at end of file diff --git a/source/module_io/read_input_item_ofdft.cpp b/source/module_io/read_input_item_ofdft.cpp index 9d4b90268e..dd29330a92 100644 --- a/source/module_io/read_input_item_ofdft.cpp +++ b/source/module_io/read_input_item_ofdft.cpp @@ -10,6 +10,84 @@ void ReadInput::item_ofdft() { Input_Item item("of_kinetic"); item.annotation = "kinetic energy functional, such as tf, vw, wt"; + item.check_value = [](const Input_Item& item, const Parameter& para) { +#ifndef __MLKEDF + if (para.input.of_kinetic == "ml" || para.input.of_kinetic == "mpn" || para.input.of_kinetic == "cpn5") + { + ModuleBase::WARNING_QUIT("ReadInput", "ML KEDF is not supported."); + } +#endif + if (para.input.of_kinetic != "tf" && para.input.of_kinetic != "vw" && para.input.of_kinetic != "wt" + && para.input.of_kinetic != "lkt" && para.input.of_kinetic != "tf+" + && para.input.of_kinetic != "ml" && para.input.of_kinetic != "mpn" && para.input.of_kinetic != "cpn5") + { + ModuleBase::WARNING_QUIT("ReadInput", "of_kinetic must be tf, vw, tf+, wt, lkt, ml, mpn, or cpn5"); + } + }; + item.reset_value = [](const Input_Item& item, Parameter& para) { + // Set the default parameters for MPN or CPN5 KEDF + if (para.input.of_kinetic == "mpn") + { + para.input.of_kinetic = "ml"; + + para.input.of_ml_feg = 3; + para.input.of_ml_nkernel = 1; + para.input.of_ml_kernel = {1}; + para.input.of_ml_kernel_scaling = {1.0}; + para.input.of_ml_yukawa_alpha = {1.0}; + para.input.of_ml_gamma = false; + para.input.of_ml_p = false; + para.input.of_ml_q = false; + para.input.of_ml_tanhp = true; + para.input.of_ml_tanhq = false; + para.input.of_ml_chi_p = 0.2; + para.input.of_ml_chi_q = 0.1; + para.input.of_ml_gammanl = {0}; + para.input.of_ml_pnl = {0}; + para.input.of_ml_qnl = {0}; + para.input.of_ml_xi = {0}; + para.input.of_ml_tanhxi = {1}; + para.input.of_ml_tanhxi_nl = {1}; + para.input.of_ml_tanh_pnl = {0}; + para.input.of_ml_tanh_qnl = {0}; + para.input.of_ml_tanhp_nl = {1}; + para.input.of_ml_tanhq_nl = {0}; + para.input.of_ml_chi_xi = {1.0}; + para.input.of_ml_chi_pnl = {0.2}; + para.input.of_ml_chi_qnl = {0.1}; + } + + if (para.input.of_kinetic == "cpn5") + { + para.input.of_kinetic = "ml"; + + para.input.of_ml_feg = 3; + para.input.of_ml_nkernel = 5; + para.input.of_ml_kernel = {1, 1, 1, 1, 1}; + para.input.of_ml_kernel_scaling = {2.0, 1.5, 1.0, 0.75, 0.5}; + para.input.of_ml_yukawa_alpha = {1.0, 1.0, 1.0, 1.0, 1.0}; + para.input.of_ml_gamma = false; + para.input.of_ml_p = false; + para.input.of_ml_q = false; + para.input.of_ml_tanhp = true; + para.input.of_ml_tanhq = false; + para.input.of_ml_chi_p = 0.2; + para.input.of_ml_chi_q = 0.1; + para.input.of_ml_gammanl = {0, 0, 0, 0, 0}; + para.input.of_ml_pnl = {0, 0, 0, 0, 0}; + para.input.of_ml_qnl = {0, 0, 0, 0, 0}; + para.input.of_ml_xi = {0, 0, 0, 0, 0}; + para.input.of_ml_tanhxi = {1, 1, 1, 
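// Net effect of these presets: "of_kinetic mpn" and "of_kinetic cpn5" are pure aliases
// that are rewritten to of_kinetic = "ml" plus the fixed of_ml_* configuration around
// this block, so a hypothetical INPUT containing just
//   of_kinetic cpn5
//   of_ml_device cpu
// is equivalent to spelling out of_ml_feg 3, of_ml_nkernel 5, the five-channel
// tanhxi / tanhxi_nl / tanhp_nl switches and the chi_* values listed here.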
1, 1}; + para.input.of_ml_tanhxi_nl = {1, 1, 1, 1, 1}; + para.input.of_ml_tanh_pnl = {0, 0, 0, 0, 0}; + para.input.of_ml_tanh_qnl = {0, 0, 0, 0, 0}; + para.input.of_ml_tanhp_nl = {1, 1, 1, 1, 1}; + para.input.of_ml_tanhq_nl = {0, 0, 0, 0, 0}; + para.input.of_ml_chi_xi = {0.6, 0.8, 1.0, 1.5, 3.0}; + para.input.of_ml_chi_pnl = {0.2, 0.2, 0.2, 0.2, 0.2}; + para.input.of_ml_chi_qnl = {0.1, 0.1, 0.1, 0.1, 0.1}; + } + }; read_sync_string(input.of_kinetic); this->add_item(item); } @@ -131,5 +209,257 @@ void ReadInput::item_ofdft() read_sync_string(input.of_kernel_file); this->add_item(item); } + { + Input_Item item("of_ml_gene_data"); + item.annotation = "Generate training data or not"; + read_sync_bool(input.of_ml_gene_data); + this->add_item(item); + } + { + Input_Item item("of_ml_device"); + item.annotation = "Run NN on GPU or CPU"; + read_sync_string(input.of_ml_device); + this->add_item(item); + } + { + Input_Item item("of_ml_feg"); + item.annotation = "The Free Electron Gas limit: 0: no, 3: yes"; + read_sync_int(input.of_ml_feg); + this->add_item(item); + } + { + Input_Item item("of_ml_nkernel"); + item.annotation = "Number of kernels"; + item.reset_value = [](const Input_Item& item, Parameter& para) { + if (para.input.of_ml_nkernel > 0) + { + reset_vector(para.input.of_ml_gammanl, para.input.of_ml_nkernel, 0); + reset_vector(para.input.of_ml_pnl, para.input.of_ml_nkernel, 0); + reset_vector(para.input.of_ml_qnl, para.input.of_ml_nkernel, 0); + reset_vector(para.input.of_ml_xi, para.input.of_ml_nkernel, 0); + reset_vector(para.input.of_ml_tanhxi, para.input.of_ml_nkernel, 0); + reset_vector(para.input.of_ml_tanhxi_nl, para.input.of_ml_nkernel, 0); + reset_vector(para.input.of_ml_tanh_pnl, para.input.of_ml_nkernel, 0); + reset_vector(para.input.of_ml_tanh_qnl, para.input.of_ml_nkernel, 0); + reset_vector(para.input.of_ml_tanhp_nl, para.input.of_ml_nkernel, 0); + reset_vector(para.input.of_ml_tanhq_nl, para.input.of_ml_nkernel, 0); + reset_vector(para.input.of_ml_chi_xi, para.input.of_ml_nkernel, 1.0); + reset_vector(para.input.of_ml_chi_pnl, para.input.of_ml_nkernel, 1.0); + reset_vector(para.input.of_ml_chi_qnl, para.input.of_ml_nkernel, 1.0); + reset_vector(para.input.of_ml_kernel, para.input.of_ml_nkernel, 1); + reset_vector(para.input.of_ml_kernel_scaling, para.input.of_ml_nkernel, 1.0); + reset_vector(para.input.of_ml_yukawa_alpha, para.input.of_ml_nkernel, 1.0); + std::string none = "none"; + reset_vector(para.input.of_ml_kernel_file, para.input.of_ml_nkernel, none); + } + }; + read_sync_int(input.of_ml_nkernel); + this->add_item(item); + } + { + Input_Item item("of_ml_kernel"); + item.annotation = "Type of kernel, 1 for wt, 2 for yukawa, and 3 for TKK"; + item.read_value = [](const Input_Item& item, Parameter& para) { + parse_expression(item.str_values, para.input.of_ml_kernel); + }; + sync_intvec(input.of_ml_kernel, para.input.of_ml_kernel.size(), 0); + this->add_item(item); + } + { + Input_Item item("of_ml_kernel_scaling"); + item.annotation = "Scaling parameter of kernel, w(r-r') = scaling^3 * w(scaling (r-r'))"; + item.read_value = [](const Input_Item& item, Parameter& para) { + parse_expression(item.str_values, para.input.of_ml_kernel_scaling); + }; + sync_doublevec(input.of_ml_kernel_scaling, para.input.of_ml_kernel_scaling.size(), 0); + this->add_item(item); + } + { + Input_Item item("of_ml_yukawa_alpha"); + item.annotation = "Parameter alpha of yukawa kernel"; + item.read_value = [](const Input_Item& item, Parameter& para) { + parse_expression(item.str_values, 
para.input.of_ml_yukawa_alpha); + }; + sync_doublevec(input.of_ml_yukawa_alpha, para.input.of_ml_yukawa_alpha.size(), 0); + this->add_item(item); + } + { + Input_Item item("of_ml_kernel_file"); + item.annotation = "The file of TKK"; + item.read_value = [](const Input_Item& item, Parameter& para) { + size_t count = item.get_size(); + for (int i = 0; i < count; i++) + { + para.input.of_ml_kernel_file.push_back(item.str_values[i]); + } + }; + sync_stringvec(input.of_ml_kernel_file, para.input.of_ml_kernel_file.size(), ""); + this->add_item(item); + } + { + Input_Item item("of_ml_gamma"); + item.annotation = "Descriptor: gamma = (rho / rho0)^(1/3)"; + read_sync_bool(input.of_ml_gamma); + this->add_item(item); + } + { + Input_Item item("of_ml_p"); + item.annotation = "Descriptor: p = |nabla rho|^2 / [2 (3 pi^2)^(1/3) rho^(4/3)]^2"; + read_sync_bool(input.of_ml_p); + this->add_item(item); + } + { + Input_Item item("of_ml_q"); + item.annotation = "Descriptor: q = nabla^2 rho / [4 (3 pi^2)^(2/3) rho^(5/3)]"; + read_sync_bool(input.of_ml_q); + this->add_item(item); + } + { + Input_Item item("of_ml_tanhp"); + item.annotation = "Descriptor: tanhp = tanh(chi_p * p)"; + read_sync_bool(input.of_ml_tanhp); + this->add_item(item); + } + { + Input_Item item("of_ml_tanhq"); + item.annotation = "Descriptor: tanhq = tanh(chi_q * q)"; + read_sync_bool(input.of_ml_tanhq); + this->add_item(item); + } + { + Input_Item item("of_ml_chi_p"); + item.annotation = "Hyperparameter: tanhp = tanh(chi_p * p)"; + read_sync_double(input.of_ml_chi_p); + this->add_item(item); + } + { + Input_Item item("of_ml_chi_q"); + item.annotation = "Hyperparameter: tanhq = tanh(chi_q * q)"; + read_sync_double(input.of_ml_chi_q); + this->add_item(item); + } + { + Input_Item item("of_ml_gammanl"); + item.annotation = "Descriptor: gammanl = int{gamma(r') * w(r-r') dr'}"; + item.read_value = [](const Input_Item& item, Parameter& para) { + parse_expression(item.str_values, para.input.of_ml_gammanl); + }; + sync_intvec(input.of_ml_gammanl, para.input.of_ml_gammanl.size(), 0); + this->add_item(item); + } + { + Input_Item item("of_ml_pnl"); + item.annotation = "Descriptor: pnl = int{p(r') * w(r-r') dr'}"; + item.read_value = [](const Input_Item& item, Parameter& para) { + parse_expression(item.str_values, para.input.of_ml_pnl); + }; + sync_intvec(input.of_ml_pnl, para.input.of_ml_pnl.size(), 0); + this->add_item(item); + } + { + Input_Item item("of_ml_qnl"); + item.annotation = "Descriptor: qnl = int{q(r') * w(r-r') dr'}"; + item.read_value = [](const Input_Item& item, Parameter& para) { + parse_expression(item.str_values, para.input.of_ml_qnl); + }; + sync_intvec(input.of_ml_qnl, para.input.of_ml_qnl.size(), 0); + this->add_item(item); + } + { + Input_Item item("of_ml_xi"); + item.annotation = "Descriptor: xi = int{rho(r')^(1/3) * w(r-r') dr'} / rho^(1/3)"; + item.read_value = [](const Input_Item& item, Parameter& para) { + parse_expression(item.str_values, para.input.of_ml_xi); + }; + sync_intvec(input.of_ml_xi, para.input.of_ml_xi.size(), 0); + this->add_item(item); + } + { + Input_Item item("of_ml_tanhxi"); + item.annotation = "Descriptor: tanhxi = tanh(chi_xi * xi)"; + item.read_value = [](const Input_Item& item, Parameter& para) { + parse_expression(item.str_values, para.input.of_ml_tanhxi); + }; + sync_intvec(input.of_ml_tanhxi, para.input.of_ml_tanhxi.size(), 0); + this->add_item(item); + } + { + Input_Item item("of_ml_tanhxi_nl"); + item.annotation = "Descriptor: tanhxi_nl = int{tanhxi(r') * w(r-r') dr'}"; + item.read_value = [](const 
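// Per-kernel vectors such as this one are padded by reset_vector() (read_input_tool.h,
// later in this patch): user-supplied entries are kept and missing ones take the
// default, since std::vector::resize(n, v) only appends v for newly created elements.
// E.g., a hypothetical input with of_ml_nkernel 3 and of_ml_chi_xi 0.5 ends up with
// of_ml_chi_xi = {0.5, 1.0, 1.0}.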
Input_Item& item, Parameter& para) { + parse_expression(item.str_values, para.input.of_ml_tanhxi_nl); + }; + sync_intvec(input.of_ml_tanhxi_nl, para.input.of_ml_tanhxi_nl.size(), 0); + this->add_item(item); + } + { + Input_Item item("of_ml_tanh_pnl"); + item.annotation = "Descriptor: tanh_pnl = tanh(chi_pnl * pnl)"; + item.read_value = [](const Input_Item& item, Parameter& para) { + parse_expression(item.str_values, para.input.of_ml_tanh_pnl); + }; + sync_intvec(input.of_ml_tanh_pnl, para.input.of_ml_tanh_pnl.size(), 0); + this->add_item(item); + } + { + Input_Item item("of_ml_tanh_qnl"); + item.annotation = "Descriptor: tanh_qnl = tanh(chi_qnl * qnl)"; + item.read_value = [](const Input_Item& item, Parameter& para) { + parse_expression(item.str_values, para.input.of_ml_tanh_qnl); + }; + sync_intvec(input.of_ml_tanh_qnl, para.input.of_ml_tanh_qnl.size(), 0); + this->add_item(item); + } + { + Input_Item item("of_ml_tanhp_nl"); + item.annotation = "Descriptor: tanhp_nl = int{tanhp(r') * w(r-r') dr'}"; + item.read_value = [](const Input_Item& item, Parameter& para) { + parse_expression(item.str_values, para.input.of_ml_tanhp_nl); + }; + sync_intvec(input.of_ml_tanhp_nl, para.input.of_ml_tanhp_nl.size(), 0); + this->add_item(item); + } + { + Input_Item item("of_ml_tanhq_nl"); + item.annotation = "Descriptor: tanhq_nl = int{tanhq(r') * w(r-r') dr'}"; + item.read_value = [](const Input_Item& item, Parameter& para) { + parse_expression(item.str_values, para.input.of_ml_tanhq_nl); + }; + sync_intvec(input.of_ml_tanhq_nl, para.input.of_ml_tanhq_nl.size(), 0); + this->add_item(item); + } + { + Input_Item item("of_ml_chi_xi"); + item.annotation = "Hyperparameter: tanhpxi = tanh(chi_xi * xi)"; + item.read_value = [](const Input_Item& item, Parameter& para) { + parse_expression(item.str_values, para.input.of_ml_chi_xi); + }; + sync_doublevec(input.of_ml_chi_xi, para.input.of_ml_chi_xi.size(), 0); + this->add_item(item); + } + { + Input_Item item("of_ml_chi_pnl"); + item.annotation = "Hyperparameter: tanh_pnl = tanh(chi_pnl * pnl)"; + item.read_value = [](const Input_Item& item, Parameter& para) { + parse_expression(item.str_values, para.input.of_ml_chi_pnl); + }; + sync_doublevec(input.of_ml_chi_pnl, para.input.of_ml_chi_pnl.size(), 0); + this->add_item(item); + } + { + Input_Item item("of_ml_chi_qnl"); + item.annotation = "Hyperparameter: tanh_qnl = tanh(chi_qnl * qnl)"; + item.read_value = [](const Input_Item& item, Parameter& para) { + parse_expression(item.str_values, para.input.of_ml_chi_qnl); + }; + sync_doublevec(input.of_ml_chi_qnl, para.input.of_ml_chi_qnl.size(), 0); + this->add_item(item); + } + { + Input_Item item("of_ml_local_test"); + item.annotation = "Test: read in the density, and output the F and Pauli potential"; + read_sync_bool(input.of_ml_local_test); + this->add_item(item); + } } } // namespace ModuleIO \ No newline at end of file diff --git a/source/module_io/read_input_tool.h b/source/module_io/read_input_tool.h index 665d045ce8..649b0044e5 100644 --- a/source/module_io/read_input_tool.h +++ b/source/module_io/read_input_tool.h @@ -203,4 +203,13 @@ void parse_expression(const std::vector& expressions, std::vector +void reset_vector(std::vector& vec, int size, T default_value) +{ + if (vec.size() != size) + { + vec.resize(size, default_value); + } } \ No newline at end of file diff --git a/source/module_parameter/input_parameter.h b/source/module_parameter/input_parameter.h index 7d50b3db1e..3e2a1a3528 100644 --- a/source/module_parameter/input_parameter.h +++ 
diff --git a/source/module_parameter/input_parameter.h b/source/module_parameter/input_parameter.h
index 7d50b3db1e..3e2a1a3528 100644
--- a/source/module_parameter/input_parameter.h
+++ b/source/module_parameter/input_parameter.h
@@ -208,6 +208,42 @@ struct Input_para
                                                  ///< formula. Only usable for WT KEDF.
     std::string of_kernel_file = "WTkernel.txt"; ///< The name of WT kernel file.
 
+    // ML KEDF, sunliang added on 2022-11-07
+    bool of_ml_gene_data = false; ///< Generate training data or not
+    // device
+    std::string of_ml_device = "cpu"; ///< Run NN on GPU or CPU
+    int of_ml_feg = 0; ///< The Free Electron Gas limit: 0: no, 3: yes
+    // kernel
+    int of_ml_nkernel = 1; ///< Number of kernels
+    std::vector<int> of_ml_kernel = {1}; ///< Type of kernel, 1 for wt, 2 for yukawa, and 3 for TKK
+    std::vector<double> of_ml_kernel_scaling = {1.0}; ///< Scaling parameter of kernel, w(r-r') = lambda^3 * w(lambda (r-r')), lambda = 1/scaling
+    std::vector<double> of_ml_yukawa_alpha = {1.0}; ///< Parameter alpha of yukawa kernel
+    std::vector<std::string> of_ml_kernel_file = {"none"}; ///< The file of TKK
+    // semi-local descriptors
+    bool of_ml_gamma = false; ///< Descriptor: gamma = (rho / rho0)^(1/3)
+    bool of_ml_p = false; ///< Descriptor: p = |nabla rho|^2 / [2 (3 pi^2)^(1/3) rho^(4/3)]^2
+    bool of_ml_q = false; ///< Descriptor: q = nabla^2 rho / [4 (3 pi^2)^(2/3) rho^(5/3)]
+    bool of_ml_tanhp = false; ///< Descriptor: tanhp = tanh(chi_p * p)
+    bool of_ml_tanhq = false; ///< Descriptor: tanhq = tanh(chi_q * q)
+    double of_ml_chi_p = 1.0; ///< Hyperparameter: tanhp = tanh(chi_p * p)
+    double of_ml_chi_q = 1.0; ///< Hyperparameter: tanhq = tanh(chi_q * q)
+    // non-local descriptors
+    // of_ml_gammanl should be a vector of bool, but here we use a vector of int for convenience
+    std::vector<int> of_ml_gammanl = {0}; ///< Descriptor: gammanl = int{gamma(r') * w(r-r') dr'}
+    std::vector<int> of_ml_pnl = {0}; ///< Descriptor: pnl = int{p(r') * w(r-r') dr'}
+    std::vector<int> of_ml_qnl = {0}; ///< Descriptor: qnl = int{q(r') * w(r-r') dr'}
+    std::vector<int> of_ml_xi = {0}; ///< Descriptor: xi = int{rho(r')^(1/3) * w(r-r') dr'} / rho^(1/3)
+    std::vector<int> of_ml_tanhxi = {0}; ///< Descriptor: tanhxi = tanh(chi_xi * xi)
+    std::vector<int> of_ml_tanhxi_nl = {0}; ///< Descriptor: tanhxi_nl = int{tanhxi(r') * w(r-r') dr'}
+    std::vector<int> of_ml_tanh_pnl = {0}; ///< Descriptor: tanh_pnl = tanh(chi_pnl * pnl)
+    std::vector<int> of_ml_tanh_qnl = {0}; ///< Descriptor: tanh_qnl = tanh(chi_qnl * qnl)
+    std::vector<int> of_ml_tanhp_nl = {0}; ///< Descriptor: tanhp_nl = int{tanhp(r') * w(r-r') dr'}
+    std::vector<int> of_ml_tanhq_nl = {0}; ///< Descriptor: tanhq_nl = int{tanhq(r') * w(r-r') dr'}
+    std::vector<double> of_ml_chi_xi = {1.0}; ///< Hyperparameter: tanhxi = tanh(chi_xi * xi)
+    std::vector<double> of_ml_chi_pnl = {1.0}; ///< Hyperparameter: tanh_pnl = tanh(chi_pnl * pnl)
+    std::vector<double> of_ml_chi_qnl = {1.0}; ///< Hyperparameter: tanh_qnl = tanh(chi_qnl * qnl)
+    bool of_ml_local_test = false; ///< Test: read in the density, and output the F and Pauli potential
+
     // ============== #Parameters (7.stochastic DFT) ===========================
     int method_sto = 2; ///< different methods for sdft, 1: slow, less memory 2:
                         ///< fast, more memory
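For reference, the descriptor definitions embedded in the Doxygen comments above can be written out explicitly. The following is a direct transcription of those comments into LaTeX; only the notation is new (rho_0 denotes the reference density, w a non-local kernel, and the chi_* inputs the hyperparameters):

```latex
% Semi-local descriptors
\[
\gamma = \Bigl(\frac{\rho}{\rho_0}\Bigr)^{1/3}, \qquad
p = \frac{|\nabla\rho|^{2}}{\bigl[\,2\,(3\pi^{2})^{1/3}\rho^{4/3}\bigr]^{2}}, \qquad
q = \frac{\nabla^{2}\rho}{4\,(3\pi^{2})^{2/3}\rho^{5/3}}
\]

% Non-local descriptors, built by convolution with the kernel w
\[
\gamma^{\mathrm{nl}}(\mathbf{r}) = \int \gamma(\mathbf{r}')\, w(\mathbf{r}-\mathbf{r}')\, \mathrm{d}\mathbf{r}', \qquad
\xi(\mathbf{r}) = \frac{\int \rho^{1/3}(\mathbf{r}')\, w(\mathbf{r}-\mathbf{r}')\, \mathrm{d}\mathbf{r}'}{\rho^{1/3}(\mathbf{r})}
\]

% tanh-regularized variants
\[
\mathrm{tanhp} = \tanh(\chi_{p}\, p), \qquad
\mathrm{tanhxi} = \tanh(\chi_{\xi}\, \xi)
\]

% Kernel rescaling controlled by of_ml_kernel_scaling
\[
w_{\lambda}(\mathbf{r}-\mathbf{r}') = \lambda^{3}\, w\bigl(\lambda(\mathbf{r}-\mathbf{r}')\bigr), \qquad
\lambda = 1/\mathrm{scaling}
\]
```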
diff --git a/tests/integrate/902_OF_KE_CPN5/INPUT b/tests/integrate/902_OF_KE_CPN5/INPUT
new file mode 100644
index 0000000000..432af30174
--- /dev/null
+++ b/tests/integrate/902_OF_KE_CPN5/INPUT
@@ -0,0 +1,40 @@
+INPUT_PARAMETERS
+#Parameters (1.General)
+suffix              autotest
+calculation         scf
+esolver_type        ofdft
+ntype               1
+symmetry            1
+out_chg             1
+pseudo_dir          ../../PP_ORB/
+pseudo_rcut         16
+
+#Parameters (2.Iteration)
+ecutwfc             20
+scf_nmax            200
+
+#OFDFT
+of_kinetic          cpn5
+of_method           tn
+of_conv             energy
+of_tole             2e-6
+of_full_pw          1
+of_full_pw_dim      0
+
+#Parameters (3.Basis)
+basis_type          pw
+of_ml_device        cpu
+# of_ml_feg            3
+# of_ml_nkernel        5
+# of_ml_chi_xi         0.6 0.8 1.0 1.5 3.0
+# of_ml_chi_p          0.2
+# of_ml_chi_q          0.1
+# of_ml_chi_pnl        0.2 0.2 0.2 0.2 0.2
+# of_ml_chi_qnl        0.1 0.1 0.1 0.1 0.1
+# of_ml_kernel         1 1 1 1 1
+# of_ml_yukawa_alpha   1 1 1 1 1
+# of_ml_kernel_scaling 2.0 1.5 1.0 0.75 0.5
+# of_ml_tanhxi         1 1 1 1 1
+# of_ml_tanhxi_nl      1 1 1 1 1
+# of_ml_tanhp          1
+# of_ml_tanhp_nl       1 1 1 1 1
diff --git a/tests/integrate/902_OF_KE_CPN5/KPT b/tests/integrate/902_OF_KE_CPN5/KPT
new file mode 100644
index 0000000000..c289c0158a
--- /dev/null
+++ b/tests/integrate/902_OF_KE_CPN5/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Gamma
+1 1 1 0 0 0
diff --git a/tests/integrate/902_OF_KE_CPN5/STRU b/tests/integrate/902_OF_KE_CPN5/STRU
new file mode 100644
index 0000000000..2e022904e7
--- /dev/null
+++ b/tests/integrate/902_OF_KE_CPN5/STRU
@@ -0,0 +1,19 @@
+ATOMIC_SPECIES
+Si 3 si.lda.lps blps
+
+LATTICE_CONSTANT
+10.334452882595281 // add lattice constant
+
+LATTICE_VECTORS
+0.000000000000 0.500000000000 0.500000000000
+0.500000000000 0.000000000000 0.500000000000
+0.500000000000 0.500000000000 0.000000000000
+
+ATOMIC_POSITIONS
+Direct
+
+Si
+0.0
+2
+    0.000000000000 0.000000000000 0.000000000000 1 1 1
+    0.250000000000 0.250000000000 0.250000000000 1 1 1
diff --git a/tests/integrate/902_OF_KE_CPN5/jd b/tests/integrate/902_OF_KE_CPN5/jd
new file mode 100644
index 0000000000..f24ecd6117
--- /dev/null
+++ b/tests/integrate/902_OF_KE_CPN5/jd
@@ -0,0 +1 @@
+Test the energy of the CPN5 kinetic energy functional (of_kinetic = cpn5) in OFDFT, symmetry=on
diff --git a/tests/integrate/902_OF_KE_CPN5/net.pt b/tests/integrate/902_OF_KE_CPN5/net.pt
new file mode 100644
index 0000000000..fa0158e40a
Binary files /dev/null and b/tests/integrate/902_OF_KE_CPN5/net.pt differ
diff --git a/tests/integrate/902_OF_KE_CPN5/result.ref b/tests/integrate/902_OF_KE_CPN5/result.ref
new file mode 100644
index 0000000000..777b03954d
--- /dev/null
+++ b/tests/integrate/902_OF_KE_CPN5/result.ref
@@ -0,0 +1,6 @@
+etotref -218.1572355140357
+etotperatomref -109.0786177570
+pointgroupref T_d
+spacegroupref O_h
+nksibzref 1
+totaltimeref 28.35
diff --git a/tests/integrate/902_OF_KE_MPN/INPUT b/tests/integrate/902_OF_KE_MPN/INPUT
new file mode 100644
index 0000000000..aca1a6227d
--- /dev/null
+++ b/tests/integrate/902_OF_KE_MPN/INPUT
@@ -0,0 +1,34 @@
+INPUT_PARAMETERS
+#Parameters (1.General)
+suffix              autotest
+calculation         scf
+esolver_type        ofdft
+
+symmetry            1
+pseudo_dir          ../../PP_ORB/
+pseudo_rcut         16
+
+#Parameters (2.Iteration)
+ecutwfc             20
+scf_nmax            100
+dft_functional      XC_GGA_X_PBE+XC_GGA_C_PBE
+
+
+#Parameters (3.Basis)
+basis_type          pw
+
+#OFDFT
+of_kinetic          mpn
+of_ml_device        cpu
+
+#of_ml_chi_xi       1
+#of_ml_chi_p        0.2
+#of_ml_chi_q        0.1
+#of_ml_chi_pnl      0.2
+#of_ml_chi_qnl      0.1
+#
+#of_ml_tanhxi       1
+#of_ml_tanhxi_nl    1
+#of_ml_tanhp        1
+#of_ml_tanhp_nl     1
+#of_ml_feg          3
\ No newline at end of file
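Note on the preset: the commented-out lines in the MPN INPUT above appear to record the settings that `of_kinetic mpn` applies automatically. If so, an explicit run with `of_kinetic ml` and those values uncommented should be equivalent. The sketch below is assembled purely from those comments and has not been verified against the implementation:

```
of_kinetic        ml
of_ml_feg         3
of_ml_tanhxi      1
of_ml_tanhxi_nl   1
of_ml_tanhp       1
of_ml_tanhp_nl    1
of_ml_chi_xi      1
of_ml_chi_p       0.2
of_ml_chi_q       0.1
of_ml_chi_pnl     0.2
of_ml_chi_qnl     0.1
```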
diff --git a/tests/integrate/902_OF_KE_MPN/KPT b/tests/integrate/902_OF_KE_MPN/KPT
new file mode 100644
index 0000000000..c289c0158a
--- /dev/null
+++ b/tests/integrate/902_OF_KE_MPN/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Gamma
+1 1 1 0 0 0
diff --git a/tests/integrate/902_OF_KE_MPN/STRU b/tests/integrate/902_OF_KE_MPN/STRU
new file mode 100644
index 0000000000..18e837d78e
--- /dev/null
+++ b/tests/integrate/902_OF_KE_MPN/STRU
@@ -0,0 +1,18 @@
+ATOMIC_SPECIES
+Al 13 al.gga.psp blps
+
+LATTICE_CONSTANT
+7.6513590200098225 // add lattice constant
+
+LATTICE_VECTORS
+0.000000000000 0.500000000000 0.500000000000
+0.500000000000 0.000000000000 0.500000000000
+0.500000000000 0.500000000000 0.000000000000
+
+ATOMIC_POSITIONS
+Direct
+
+Al
+0.0
+1
+    0.000000000000 0.000000000000 0.000000000000 1 1 1
diff --git a/tests/integrate/902_OF_KE_MPN/jd b/tests/integrate/902_OF_KE_MPN/jd
new file mode 100644
index 0000000000..59599453f4
--- /dev/null
+++ b/tests/integrate/902_OF_KE_MPN/jd
@@ -0,0 +1 @@
+Test the energy of the MPN kinetic energy functional (of_kinetic = mpn) in OFDFT, symmetry=on
diff --git a/tests/integrate/902_OF_KE_MPN/net.pt b/tests/integrate/902_OF_KE_MPN/net.pt
new file mode 100644
index 0000000000..a3e52b9126
Binary files /dev/null and b/tests/integrate/902_OF_KE_MPN/net.pt differ
diff --git a/tests/integrate/902_OF_KE_MPN/result.ref b/tests/integrate/902_OF_KE_MPN/result.ref
new file mode 100644
index 0000000000..d59b152a4b
--- /dev/null
+++ b/tests/integrate/902_OF_KE_MPN/result.ref
@@ -0,0 +1,6 @@
+etotref -57.16291065525348
+etotperatomref -57.1629106553
+pointgroupref O_h
+spacegroupref O_h
+nksibzref 1
+totaltimeref 1.07
diff --git a/tests/integrate/CASES_CPU.txt b/tests/integrate/CASES_CPU.txt
index 134be1fbae..cce742247b 100644
--- a/tests/integrate/CASES_CPU.txt
+++ b/tests/integrate/CASES_CPU.txt
@@ -342,6 +342,8 @@
 902_OF_KE_vW
 902_OF_KE_WT
 902_OF_KE_LKT
+902_OF_KE_MPN
+902_OF_KE_CPN5
 903_OF_TF_weight
 903_OF_vW_weight
 903_OF_WT_alphabeta